Diffstat (limited to 'kernel')
 kernel/acct.c             |    6
 kernel/audit.c            |   14
 kernel/auditfilter.c      |    2
 kernel/auditsc.c          |   11
 kernel/capability.c       |    2
 kernel/compat.c           |   33
 kernel/cpu.c              |  138
 kernel/cpuset.c           |  113
 kernel/exit.c             |   39
 kernel/fork.c             |   20
 kernel/futex.c            |   10
 kernel/hrtimer.c          |   20
 kernel/irq/chip.c         |    6
 kernel/irq/handle.c       |    2
 kernel/kexec.c            |    8
 kernel/kfifo.c            |   28
 kernel/kmod.c             |   12
 kernel/lockdep.c          |   26
 kernel/module.c           |   32
 kernel/panic.c            |   12
 kernel/params.c           |   15
 kernel/pid.c              |   12
 kernel/posix-cpu-timers.c |  101
 kernel/posix-timers.c     |   21
 kernel/power/Kconfig      |   22
 kernel/power/Makefile     |    2
 kernel/power/disk.c       |   11
 kernel/power/main.c       |   40
 kernel/power/power.h      |   59
 kernel/power/smp.c        |   62
 kernel/power/snapshot.c   | 1155
 kernel/power/swap.c       |  270
 kernel/power/swsusp.c     |   14
 kernel/power/user.c       |   17
 kernel/printk.c           |    3
 kernel/profile.c          |   16
 kernel/ptrace.c           |   55
 kernel/rcutorture.c       |    8
 kernel/relay.c            |   38
 kernel/resource.c         |   32
 kernel/rtmutex.c          |   51
 kernel/sched.c            |  128
 kernel/signal.c           |   11
 kernel/softirq.c          |    4
 kernel/softlockup.c       |    3
 kernel/spinlock.c         |   16
 kernel/stop_machine.c     |    3
 kernel/sys.c              |   32
 kernel/sysctl.c           |  154
 kernel/taskstats.c        |    2
 kernel/timer.c            |   53
 kernel/unwind.c           |   39
 52 files changed, 2002 insertions(+), 981 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index 2a7c933651c7..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -483,10 +483,14 @@ static void do_acct_process(struct file *file)
 	ac.ac_ppid = current->parent->tgid;
 #endif
 
-	read_lock(&tasklist_lock);	/* pin current->signal */
+	mutex_lock(&tty_mutex);
+	/* FIXME: Whoever is responsible for current->signal locking needs
+	   to use the same locking all over the kernel and document it */
+	read_lock(&tasklist_lock);
 	ac.ac_tty = current->signal->tty ?
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
+	mutex_unlock(&tty_mutex);
 
 	spin_lock_irq(&current->sighand->siglock);
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
diff --git a/kernel/audit.c b/kernel/audit.c
index 963fd15c9621..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old = audit_pid;
 			if (sid) {
-				if ((err = selinux_ctxid_to_string(
+				if ((err = selinux_sid_to_string(
 						sid, &ctx, &len)))
 					return err;
 				else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				"user pid=%d uid=%u auid=%u",
 				pid, uid, loginuid);
 			if (sid) {
-				if (selinux_ctxid_to_string(
+				if (selinux_sid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
 						" ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
 		if (err)
 			return err;
 		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a44879b0c72f..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
-		if (selinux_ctxid_to_string(sid, &ctx, &len))
+		if (selinux_sid_to_string(sid, &ctx, &len))
 			audit_log_format(ab, " ssid=%u", sid);
 		else
 			audit_log_format(ab, " subj=%s", ctx);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1bd8827a0102..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			   logged upon error */
 		if (f->se_rule) {
 			if (need_sid) {
-				selinux_task_ctxid(tsk, &sid);
+				selinux_get_task_sid(tsk, &sid);
 				need_sid = 0;
 			}
 			result = selinux_audit_rule_match(sid, f->type,
@@ -817,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_format(ab, " success=%s exit=%ld",
 			(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 			context->return_code);
+
+	mutex_lock(&tty_mutex);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
@@ -838,6 +840,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
 		  context->egid, context->sgid, context->fsgid, tty);
+
+	mutex_unlock(&tty_mutex);
+
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
 		audit_log_format(ab, " key=");
@@ -898,7 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			if (axi->osid != 0) {
 				char *ctx = NULL;
 				u32 len;
-				if (selinux_ctxid_to_string(
+				if (selinux_sid_to_string(
 						axi->osid, &ctx, &len)) {
 					audit_log_format(ab, " osid=%u",
 							axi->osid);
@@ -1005,7 +1010,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		if (n->osid != 0) {
 			char *ctx = NULL;
 			u32 len;
-			if (selinux_ctxid_to_string(
+			if (selinux_sid_to_string(
 				n->osid, &ctx, &len)) {
 				audit_log_format(ab, " osid=%u", n->osid);
 				call_panic = 2;
diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a97..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
 	int found = 0;
 
 	do_each_thread(g, target) {
-		if (target == current || target->pid == 1)
+		if (target == current || is_init(target))
 			continue;
 		found = 1;
 		if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/compat.c b/kernel/compat.c
index 126dee9530aa..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
 #include <linux/security.h>
 #include <linux/timex.h>
 #include <linux/migrate.h>
+#include <linux/posix-timers.h>
 
 #include <asm/uaccess.h>
 
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
 	return err;
 }
 
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec tu;
+	struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+	restart->arg1 = (unsigned long) &tu;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = clock_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&tu, rmtp))
+		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
+	return err;
+}
+
 long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 			    struct compat_timespec __user *rqtp,
 			    struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 	long err;
 	mm_segment_t oldfs;
 	struct timespec in, out;
+	struct restart_block *restart;
 
 	if (get_compat_timespec(&in, rqtp))
 		return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 			      (struct timespec __user *) &in,
 			      (struct timespec __user *) &out);
 	set_fs(oldfs);
+
 	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
 	    put_compat_timespec(&out, rmtp))
 		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart = &current_thread_info()->restart_block;
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
 	return err;
 }
 
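
The compat_clock_nanosleep_restart() helper re-runs clock_nanosleep_restart() under KERNEL_DS and converts the unslept time back to a compat_timespec, so an interrupted 32-bit sleep restarts with correct bookkeeping. A minimal user-space sketch (not part of the patch; the harness is invented for illustration) of the contract being preserved, namely that an interrupted clock_nanosleep() reports the remaining time through rmtp:

#include <stdio.h>
#include <time.h>
#include <errno.h>

int main(void)
{
	struct timespec req = { .tv_sec = 2, .tv_nsec = 0 }, rem = { 0, 0 };
	/* clock_nanosleep() returns the error number directly, not -1/errno */
	int err = clock_nanosleep(CLOCK_MONOTONIC, 0, &req, &rem);

	if (err == EINTR)	/* interrupted: rem holds the unslept time */
		printf("remaining: %ld.%09lds\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	return 0;
}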
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f230f9ae01c2..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -108,30 +113,25 @@ static int take_cpu_down(void *unused)
 	return 0;
 }
 
-int cpu_down(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_down(unsigned int cpu)
 {
 	int err;
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (num_online_cpus() == 1) {
-		err = -EBUSY;
-		goto out;
-	}
+	if (num_online_cpus() == 1)
+		return -EBUSY;
 
-	if (!cpu_online(cpu)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!cpu_online(cpu))
+		return -EINVAL;
 
 	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
 				__FUNCTION__, cpu);
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
@@ -179,22 +179,32 @@ out_thread:
 	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
-out:
+	return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_down(cpu);
+
 	mutex_unlock(&cpu_add_remove_lock);
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-int __devinit cpu_up(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int __devinit _cpu_up(unsigned int cpu)
 {
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (cpu_online(cpu) || !cpu_present(cpu)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (cpu_online(cpu) || !cpu_present(cpu))
+		return -EINVAL;
 
 	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
@@ -219,7 +229,95 @@ out_notify:
 	if (ret != 0)
 		blocking_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
+
+	return ret;
+}
+
+int __devinit cpu_up(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_up(cpu);
+
+	mutex_unlock(&cpu_add_remove_lock);
+	return err;
+}
+
+#ifdef CONFIG_SUSPEND_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+	int cpu, first_cpu, error;
+
+	mutex_lock(&cpu_add_remove_lock);
+	first_cpu = first_cpu(cpu_present_map);
+	if (!cpu_online(first_cpu)) {
+		error = _cpu_up(first_cpu);
+		if (error) {
+			printk(KERN_ERR "Could not bring CPU%d up.\n",
+				first_cpu);
+			goto out;
+		}
+	}
+	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
+	if (error) {
+		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
+		goto out;
+	}
+	/* We take down all of the non-boot CPUs in one shot to avoid races
+	 * with the userspace trying to use the CPU hotplug at the same time
+	 */
+	cpus_clear(frozen_cpus);
+	printk("Disabling non-boot CPUs ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		error = _cpu_down(cpu);
+		if (!error) {
+			cpu_set(cpu, frozen_cpus);
+			printk("CPU%d is down\n", cpu);
+		} else {
+			printk(KERN_ERR "Error taking CPU%d down: %d\n",
+				cpu, error);
+			break;
+		}
+	}
+	if (!error) {
+		BUG_ON(num_online_cpus() > 1);
+		/* Make sure the CPUs won't be enabled by someone else */
+		cpu_hotplug_disabled = 1;
+	} else {
+		printk(KERN_ERR "Non-boot CPUs are not disabled");
+	}
 out:
 	mutex_unlock(&cpu_add_remove_lock);
-	return ret;
+	return error;
+}
+
+void enable_nonboot_cpus(void)
+{
+	int cpu, error;
+
+	/* Allow everyone to use the CPU hotplug again */
+	mutex_lock(&cpu_add_remove_lock);
+	cpu_hotplug_disabled = 0;
+	mutex_unlock(&cpu_add_remove_lock);
+
+	printk("Enabling non-boot CPUs ...\n");
+	for_each_cpu_mask(cpu, frozen_cpus) {
+		error = cpu_up(cpu);
+		if (!error) {
+			printk("CPU%d is up\n", cpu);
+			continue;
+		}
+		printk(KERN_WARNING "Error taking CPU%d up: %d\n",
+			cpu, error);
+	}
+	cpus_clear(frozen_cpus);
 }
+#endif
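
disable_nonboot_cpus()/enable_nonboot_cpus() bracket the suspend path and use cpu_hotplug_disabled to lock out concurrent cpu_up()/cpu_down() callers. A self-contained sketch of that -EBUSY gate (cpu_down_sketch and the stdio harness are invented for illustration; the real code holds cpu_add_remove_lock around the check):

#include <stdio.h>

#define EBUSY 16

static int cpu_hotplug_disabled;	/* manipulated under cpu_add_remove_lock */

static int cpu_down_sketch(unsigned int cpu)
{
	if (cpu_hotplug_disabled)	/* set by disable_nonboot_cpus() */
		return -EBUSY;
	printf("CPU%u is down\n", cpu);
	return 0;
}

int main(void)
{
	cpu_down_sketch(1);		/* succeeds */
	cpu_hotplug_disabled = 1;	/* suspend in progress */
	if (cpu_down_sketch(1) == -EBUSY)
		printf("hotplug refused during suspend\n");
	return 0;
}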
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc5..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty.  Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks.  So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
@@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -913,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -1222,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
@@ -2037,33 +2045,104 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period.  This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This handles CPU hotplug (cpuhp) events.  If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets.  If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes.  Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes.  It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
  */
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int cpuset_handle_cpuhp(struct notifier_block *nb,
-				unsigned long phase, void *cpu)
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map.  Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here.  We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
 {
 	mutex_lock(&manage_mutex);
 	mutex_lock(&callback_mutex);
 
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
 
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
 
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+	common_cpu_mem_hotplug_unplug();
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2245,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2316,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
diff --git a/kernel/exit.c b/kernel/exit.c
index d891883420f7..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p == ignored_task
 				|| p->exit_state
-				|| p->real_parent->pid == 1)
+				|| is_init(p->real_parent))
 			continue;
 		if (process_group(p->real_parent) != pgrp
 			    && p->real_parent->signal->session == p->signal->session) {
@@ -249,17 +249,6 @@ static int has_stopped_jobs(int pgrp)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p->state != TASK_STOPPED)
 			continue;
-
-		/* If p is stopped by a debugger on a signal that won't
-		   stop it, then don't count p as stopped.  This isn't
-		   perfect but it's a good approximation.  */
-		if (unlikely (p->ptrace)
-		    && p->exit_code != SIGSTOP
-		    && p->exit_code != SIGTSTP
-		    && p->exit_code != SIGTTOU
-		    && p->exit_code != SIGTTIN)
-			continue;
-
 		retval = 1;
 		break;
 	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
@@ -292,9 +281,7 @@ static void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL ||
-			current->policy == SCHED_BATCH)
-				&& (task_nice(current) < 0))
+	if (!has_rt_policy(current) && (task_nice(current) < 0))
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
@@ -487,6 +474,18 @@ void fastcall put_files_struct(struct files_struct *files)
 
 EXPORT_SYMBOL(put_files_struct);
 
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+	struct files_struct *old;
+
+	old = tsk->files;
+	task_lock(tsk);
+	tsk->files = files;
+	task_unlock(tsk);
+	put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
 static inline void __exit_files(struct task_struct *tsk)
 {
 	struct files_struct * files = tsk->files;
@@ -954,15 +953,15 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
-	BUG_ON(tsk->flags & PF_DEAD);
-	tsk->flags |= PF_DEAD;
+	/* causes final put_task_struct in finish_task_switch(). */
+	tsk->state = TASK_DEAD;
 
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
-	for (;;) ;
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
@@ -971,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
-	
+
 	do_exit(code);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index f9b014e3e700..1c999f3e0b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -45,6 +45,7 @@
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
+#include <linux/random.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -175,10 +176,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->thread_info = ti;
 	setup_thread_stack(tsk, orig);
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+	tsk->stack_canary = get_random_int();
+#endif
+
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
 	tsk->btrace_seq = 0;
+#endif
 	tsk->splice_pipe = NULL;
 	return tsk;
 }
@@ -1056,7 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	p->hardirqs_enabled = 1;
+#else
 	p->hardirqs_enabled = 0;
+#endif
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
@@ -1139,7 +1150,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-
 	p->parent_exec_id = p->self_exec_id;
 
 	/* ok, now we should be set up.. */
@@ -1162,6 +1172,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
+	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+	p->ioprio = current->ioprio;
+
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
@@ -1221,11 +1234,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
-	/*
-	 * inherit ioprio
-	 */
-	p->ioprio = current->ioprio;
-
 	if (likely(p->pid)) {
 		add_parent(p);
 		if (unlikely(p->ptrace & PT_PTRACED))
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838cff..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_pid(pid);
 	if (!p)
 		goto out_unlock;
@@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 	}
 	get_task_struct(p);
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return p;
 }
@@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 		struct task_struct *p;
 
 		ret = -ESRCH;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		p = find_task_by_pid(pid);
 		if (!p)
 			goto err_unlock;
@@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 				!capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	}
 
 	if (put_user(sizeof(*head), len_ptr))
@@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 	return put_user(head, head_ptr);
 
 err_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 21c38a7e666b..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	return t->task == NULL;
 }
 
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
 	struct timespec __user *rmtp;
@@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
-	t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+	hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+	t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
 
 	if (do_nanosleep(&t, HRTIMER_ABS))
 		return 0;
 
-	rmtp = (struct timespec __user *) restart->arg2;
+	rmtp = (struct timespec __user *) restart->arg1;
 	if (rmtp) {
 		time = ktime_sub(t.timer.expires, t.timer.base->get_time());
 		if (time.tv64 <= 0)
@@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 			return -EFAULT;
 	}
 
-	restart->fn = nanosleep_restart;
+	restart->fn = hrtimer_nanosleep_restart;
 
 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
@@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	}
 
 	restart = &current_thread_info()->restart_block;
-	restart->fn = nanosleep_restart;
-	restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg1 = t.timer.expires.tv64 >> 32;
-	restart->arg2 = (unsigned long) rmtp;
-	restart->arg3 = (unsigned long) t.timer.base->index;
+	restart->fn = hrtimer_nanosleep_restart;
+	restart->arg0 = (unsigned long) t.timer.base->index;
+	restart->arg1 = (unsigned long) rmtp;
+	restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg3 = t.timer.expires.tv64 >> 32;
 
 	return -ERESTART_RESTARTBLOCK;
 }
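
After this reshuffle the restart_block args carry: arg0 = clock base index, arg1 = the rmtp pointer, arg2/arg3 = low/high 32 bits of the absolute expiry. This is the layout compat_clock_nanosleep_restart() in kernel/compat.c relies on when it swaps arg1. A stand-alone sketch of the 64-bit split, which only round-trips when the halves are recombined in this order:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint64_t expires = 0x0123456789abcdefULL;
	unsigned long arg2 = expires & 0xFFFFFFFF;	/* low 32 bits */
	unsigned long arg3 = expires >> 32;		/* high 32 bits */
	uint64_t again = ((uint64_t)arg3 << 32) | (uint64_t)arg2;

	assert(again == expires);	/* lossless even if long is 32-bit */
	return 0;
}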
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ac1f850d4937..736cb0bd498f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	spin_lock_irqsave(&desc->lock, flags);
 	irq_chip_set_defaults(chip);
 	desc->chip = chip;
-	/*
-	 * For compatibility only:
-	 */
-	desc->chip = chip;
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return 0;
@@ -146,7 +142,7 @@ static void default_disable(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 
 	if (!(desc->status & IRQ_DELAYED_DISABLE))
-		irq_desc[irq].chip->mask(irq);
+		desc->chip->mask(irq);
 }
 
 /*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48a53f68af96..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 	return retval;
 }
 
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
  * @irq: the interrupt number
@@ -253,6 +254,7 @@ out:
 
 	return 1;
 }
+#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337e..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
-	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+	if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
 		return 1;
 	return 0;
 }
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 		image = xchg(dest_image, image);
 
 out:
-	xchg(&kexec_lock, 0); /* Release the mutex */
+	locked = xchg(&kexec_lock, 0); /* Release the mutex */
+	BUG_ON(!locked);
 	kimage_free(image);
 
 	return result;
@@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs)
 			machine_crash_shutdown(&fixed_regs);
 			machine_kexec(kexec_crash_image);
 		}
-		xchg(&kexec_lock, 0);
+		locked = xchg(&kexec_lock, 0);
+		BUG_ON(!locked);
 	}
 }
 
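
kexec_lock is a plain int used as a trylock via xchg(); the new BUG_ON(!locked) asserts that the unlocking path really did hold the lock. A user-space sketch of the same exchange-based protocol (names invented; a GCC __atomic builtin stands in for the kernel's xchg()):

#include <assert.h>

static int kexec_lock_sketch;	/* 0 = free, 1 = held */

static int try_lock(void)
{
	/* __atomic_exchange_n returns the previous value, like xchg() */
	return __atomic_exchange_n(&kexec_lock_sketch, 1, __ATOMIC_ACQUIRE) == 0;
}

static void unlock(void)
{
	int locked = __atomic_exchange_n(&kexec_lock_sketch, 0, __ATOMIC_RELEASE);
	assert(locked);		/* mirrors BUG_ON(!locked) above */
}

int main(void)
{
	assert(try_lock());	/* acquired */
	assert(!try_lock());	/* contended attempt fails without blocking */
	unlock();
	return 0;
}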
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 
 	len = min(len, fifo->size - fifo->in + fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->out index -before- we
+	 * start putting bytes into the kfifo.
+	 */
+
+	smp_mb();
+
 	/* first put the data starting from fifo->in to buffer end */
 	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
 	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 	/* then put the rest (if any) at the beginning of the buffer */
 	memcpy(fifo->buffer, buffer + l, len - l);
 
+	/*
+	 * Ensure that we add the bytes to the kfifo -before-
+	 * we update the fifo->in index.
+	 */
+
+	smp_wmb();
+
 	fifo->in += len;
 
 	return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 
 	len = min(len, fifo->in - fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->in index -before- we
+	 * start removing bytes from the kfifo.
+	 */
+
+	smp_rmb();
+
 	/* first get the data from fifo->out until the end of the buffer */
 	l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
 	memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 	/* then get the rest (if any) from the beginning of the buffer */
 	memcpy(buffer + l, fifo->buffer, len - l);
 
+	/*
+	 * Ensure that we remove the bytes from the kfifo -before-
+	 * we update the fifo->out index.
+	 */
+
+	smp_mb();
+
 	fifo->out += len;
 
 	return len;
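
These barriers make the lockless single-producer/single-consumer use of __kfifo_put()/__kfifo_get() safe: the producer must publish the bytes before advancing fifo->in, and the consumer must read fifo->in before reading the bytes (mirror-image for fifo->out). A hedged restatement with C11 atomics, where release/acquire pairs play the role of the smp_wmb()/smp_rmb()/smp_mb() calls above (the ring type and harness are invented for illustration):

#include <stdatomic.h>

#define RING_SIZE 16	/* power of two, as kfifo requires */

struct ring {
	unsigned char buf[RING_SIZE];
	atomic_uint in;		/* advanced only by the producer */
	atomic_uint out;	/* advanced only by the consumer */
};

static int ring_put(struct ring *r, unsigned char c)
{
	unsigned int in = atomic_load_explicit(&r->in, memory_order_relaxed);
	unsigned int out = atomic_load_explicit(&r->out, memory_order_acquire);

	if (in - out == RING_SIZE)
		return 0;			/* full */
	r->buf[in & (RING_SIZE - 1)] = c;	/* write the data first... */
	/* ...then publish the index (release pairs with the smp_wmb()) */
	atomic_store_explicit(&r->in, in + 1, memory_order_release);
	return 1;
}

static int ring_get(struct ring *r, unsigned char *c)
{
	unsigned int out = atomic_load_explicit(&r->out, memory_order_relaxed);
	unsigned int in = atomic_load_explicit(&r->in, memory_order_acquire);

	if (in == out)
		return 0;			/* empty */
	*c = r->buf[out & (RING_SIZE - 1)];	/* read data after the index */
	atomic_store_explicit(&r->out, out + 1, memory_order_release);
	return 1;
}

int main(void)
{
	static struct ring r;
	unsigned char c;

	ring_put(&r, 'x');
	return (ring_get(&r, &c) && c == 'x') ? 0 : 1;
}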
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c470c57fb57..842f8015d7fd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -176,6 +176,8 @@ static int wait_for_helper(void *data)
 	if (pid < 0) {
 		sub_info->retval = pid;
 	} else {
+		int ret;
+
 		/*
 		 * Normally it is bogus to call wait4() from in-kernel because
 		 * wait4() wants to write the exit code to a userspace address.
@@ -185,7 +187,15 @@ static int wait_for_helper(void *data)
 		 *
 		 * Thus the __user pointer cast is valid here.
 		 */
-		sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+		sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+		/*
+		 * If ret is 0, either ____call_usermodehelper failed and the
+		 * real error code is already in sub_info->retval or
+		 * sub_info->retval is 0 anyway, so don't mess with it then.
+		 */
+		if (ret)
+			sub_info->retval = ret;
 	}
 
 	complete(sub_info->complete);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9bad17884513..e596525669ed 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -36,6 +36,7 @@
 #include <linux/stacktrace.h>
 #include <linux/debug_locks.h>
 #include <linux/irqflags.h>
+#include <linux/utsname.h>
 
 #include <asm/sections.h>
 
@@ -121,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
  * unique.
  */
 #define iterate_chain_key(key1, key2) \
-	(((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
-	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
 void lockdep_off(void)
@@ -224,7 +225,14 @@ static int save_trace(struct stack_trace *trace)
 	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
 	trace->entries = stack_trace + nr_stack_trace_entries;
 
-	save_stack_trace(trace, NULL, 0, 3);
+	trace->skip = 3;
+	trace->all_contexts = 0;
+
+	/* Make sure to not recurse in case the unwinder needs to take
+	   locks. */
+	lockdep_off();
+	save_stack_trace(trace, NULL);
+	lockdep_on();
 
 	trace->max_entries = trace->nr_entries;
 
@@ -508,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
 	return 0;
 }
 
+static void print_kernel_version(void)
+{
+	printk("%s %.*s\n", system_utsname.release,
+		(int)strcspn(system_utsname.version, " "),
+		system_utsname.version);
+}
+
 /*
  * When a circular dependency is detected, print the
  * header first:
@@ -524,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 
 	printk("\n=======================================================\n");
 	printk(  "[ INFO: possible circular locking dependency detected ]\n");
+	print_kernel_version();
 	printk(  "-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -705,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("\n======================================================\n");
 	printk(  "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
+	print_kernel_version();
 	printk(  "------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, curr->pid,
@@ -786,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 
 	printk("\n=============================================\n");
 	printk(  "[ INFO: possible recursive locking detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -1368,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 
 	printk("\n=========================================================\n");
 	printk(  "[ INFO: possible irq lock inversion dependency detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, curr->pid);
@@ -1462,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 
 	printk("\n=================================\n");
 	printk(  "[ INFO: inconsistent lock state ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
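
The iterate_chain_key() fix doubles the rotate distance, so each class key folded into the 64-bit chain key is spread across the full word instead of only half of it. A stand-alone sketch of the mix (the MAX_LOCKDEP_KEYS_BITS value is assumed here for illustration):

#include <stdint.h>
#include <stdio.h>

#define MAX_LOCKDEP_KEYS_BITS 13	/* assumed, per lockdep.h of the era */

static uint64_t iterate_chain_key(uint64_t key1, uint64_t key2)
{
	/* rotate key1 left by MAX_LOCKDEP_KEYS_BITS, then xor in key2 */
	return ((key1 << MAX_LOCKDEP_KEYS_BITS) ^
		(key1 >> (64 - MAX_LOCKDEP_KEYS_BITS)) ^
		key2);
}

int main(void)
{
	uint64_t chain = 0;

	/* folding two lock-class keys into a chain hash, as lockdep does */
	chain = iterate_chain_key(chain, 0x1111);
	chain = iterate_chain_key(chain, 0x2222);
	printf("chain key: %#llx\n", (unsigned long long)chain);
	return 0;
}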
diff --git a/kernel/module.c b/kernel/module.c
index 2a19cd47c046..05625d5dc758 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -933,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 	return sprintf(buf, "0x%lx\n", sattr->address);
 }
 
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].name);
+	kfree(sect_attrs);
+}
+
 static void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *secstrings, Elf_Shdr *sechdrs)
 {
@@ -949,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
 			sizeof(sect_attrs->grp.attrs[0]));
 	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
-	if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (sect_attrs == NULL)
 		return;
 
 	/* Setup section attributes. */
 	sect_attrs->grp.name = "sections";
 	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
 
+	sect_attrs->nsections = 0;
 	sattr = &sect_attrs->attrs[0];
 	gattr = &sect_attrs->grp.attrs[0];
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
 		sattr->address = sechdrs[i].sh_addr;
-		strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
-			MODULE_SECT_NAME_LEN);
+		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+					GFP_KERNEL);
+		if (sattr->name == NULL)
+			goto out;
+		sect_attrs->nsections++;
 		sattr->mattr.show = module_sect_show;
 		sattr->mattr.store = NULL;
 		sattr->mattr.attr.name = sattr->name;
@@ -979,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	mod->sect_attrs = sect_attrs;
 	return;
   out:
-	kfree(sect_attrs);
+	free_sect_attrs(sect_attrs);
 }
 
 static void remove_sect_attrs(struct module *mod)
@@ -989,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
 			   &mod->sect_attrs->grp);
 		/* We are positive that no one is using any sect attrs
 		 * at this point.  Deallocate immediately. */
-		kfree(mod->sect_attrs);
+		free_sect_attrs(mod->sect_attrs);
 		mod->sect_attrs = NULL;
 	}
 }
 
-
 #else
+
 static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *sectstrings, Elf_Shdr *sechdrs)
 {
@@ -1054,6 +1068,12 @@ static int mod_sysfs_setup(struct module *mod,
 {
 	int err;
 
+	if (!module_subsys.kset.subsys) {
+		printk(KERN_ERR "%s: module_subsys not initialized\n",
+		       mod->name);
+		err = -EINVAL;
+		goto out;
+	}
 	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
 	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
diff --git a/kernel/panic.c b/kernel/panic.c
index 8010b9b17aca..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -270,3 +270,15 @@ void oops_exit(void)
 {
 	do_oops_enter_exit();
 }
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+void __stack_chk_fail(void)
+{
+	panic("stack-protector: Kernel stack is corrupted");
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+#endif
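
The compiler emits the call: with -fstack-protector, gcc plants a canary in each protected frame and calls __stack_chk_fail() when the epilogue check fails; panic.c only has to supply that hook. A hedged sketch of what the instrumentation conceptually expands to (the stack layout is illustrative, and the names carry a _sketch suffix to avoid clashing with the real runtime symbols):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long guard_sketch = 0xdeadbeefUL;	/* canary value */

static void stack_chk_fail_sketch(void)
{
	fprintf(stderr, "stack-protector: stack is corrupted\n");
	abort();	/* the kernel's hook calls panic() instead */
}

static void protected_function_as_compiled(const char *arg)
{
	unsigned long canary = guard_sketch;	/* prologue: stash canary */
	char buf[8];

	strcpy(buf, arg);			/* potential overflow */

	if (canary != guard_sketch)		/* epilogue: verify */
		stack_chk_fail_sketch();	/* never returns */
}

int main(void)
{
	protected_function_as_compiled("ok");
	return 0;
}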
diff --git a/kernel/params.c b/kernel/params.c
index 91aea7aa532e..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
 					     unsigned int name_skip)
 {
 	struct module_kobject *mk;
+	int ret;
 
 	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
 	BUG_ON(!mk);
@@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
 	kobject_set_name(&mk->kobj, name);
-	kobject_register(&mk->kobj);
+	ret = kobject_register(&mk->kobj);
+	BUG_ON(ret < 0);
 
 	/* no need to keep the kobject if no parameter is exported */
 	if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -684,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL);
  */
 static int __init param_sysfs_init(void)
 {
-	subsystem_register(&module_subsys);
+	int ret;
+
+	ret = subsystem_register(&module_subsys);
+	if (ret < 0) {
+		printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+			__FILE__, __LINE__, ret);
+		return ret;
+	}
 
 	param_sysfs_builtin();
 
 	return 0;
 }
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
 
 EXPORT_SYMBOL(param_set_byte);
 EXPORT_SYMBOL(param_get_byte);
diff --git a/kernel/pid.c b/kernel/pid.c
index 93e212f20671..8387e8c68193 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -223,9 +223,6 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
223 struct pid_link *link; 223 struct pid_link *link;
224 struct pid *pid; 224 struct pid *pid;
225 225
226 WARN_ON(!task->pid); /* to be removed soon */
227 WARN_ON(!nr); /* to be removed soon */
228
229 link = &task->pids[type]; 226 link = &task->pids[type];
230 link->pid = pid = find_pid(nr); 227 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 228 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
@@ -252,6 +249,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
252 free_pid(pid); 249 free_pid(pid);
253} 250}
254 251
252/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
253void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
254 enum pid_type type)
255{
256 new->pids[type].pid = old->pids[type].pid;
257 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
258 old->pids[type].pid = NULL;
259}
260
255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) 261struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
256{ 262{
257 struct task_struct *result = NULL; 263 struct task_struct *result = NULL;
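
transfer_pid() replaces an attach_pid(new) + detach_pid(old) pair with a single in-place node swap via hlist_replace_rcu(), so the hash chain is never observed without an entry. A minimal userspace model of the replace-in-place idea, using a plain doubly linked list instead of the kernel's RCU-protected hlist (all names are illustrative):

#include <stdio.h>

struct node {
        struct node *prev, *next;
        const char *owner;
};

/* Splice 'new' into the position currently held by 'old'. */
static void list_replace(struct node *old, struct node *new)
{
        new->prev = old->prev;
        new->next = old->next;
        if (new->prev)
                new->prev->next = new;
        if (new->next)
                new->next->prev = new;
        old->prev = old->next = NULL;   /* old node is no longer linked */
}

int main(void)
{
        struct node head = { .owner = "head" };
        struct node old  = { .owner = "old task" };
        struct node new  = { .owner = "new task" };

        head.next = &old;
        old.prev = &head;

        list_replace(&old, &new);
        printf("head now points at: %s\n", head.next->owner);
        return 0;
}
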
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..479b16b44f79 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1393 } 1393 }
1394} 1394}
1395 1395
1396static long posix_cpu_clock_nanosleep_restart(struct restart_block *); 1396static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1397 1397 struct timespec *rqtp, struct itimerspec *it)
1398int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1399 struct timespec *rqtp, struct timespec __user *rmtp)
1400{ 1398{
1401 struct restart_block *restart_block =
1402 &current_thread_info()->restart_block;
1403 struct k_itimer timer; 1399 struct k_itimer timer;
1404 int error; 1400 int error;
1405 1401
1406 /* 1402 /*
1407 * Diagnose required errors first.
1408 */
1409 if (CPUCLOCK_PERTHREAD(which_clock) &&
1410 (CPUCLOCK_PID(which_clock) == 0 ||
1411 CPUCLOCK_PID(which_clock) == current->pid))
1412 return -EINVAL;
1413
1414 /*
1415 * Set up a temporary timer and then wait for it to go off. 1403 * Set up a temporary timer and then wait for it to go off.
1416 */ 1404 */
1417 memset(&timer, 0, sizeof timer); 1405 memset(&timer, 0, sizeof timer);
@@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1422 timer.it_process = current; 1410 timer.it_process = current;
1423 if (!error) { 1411 if (!error) {
1424 static struct itimerspec zero_it; 1412 static struct itimerspec zero_it;
1425 struct itimerspec it = { .it_value = *rqtp, 1413
1426 .it_interval = {} }; 1414 memset(it, 0, sizeof *it);
1415 it->it_value = *rqtp;
1427 1416
1428 spin_lock_irq(&timer.it_lock); 1417 spin_lock_irq(&timer.it_lock);
1429 error = posix_cpu_timer_set(&timer, flags, &it, NULL); 1418 error = posix_cpu_timer_set(&timer, flags, it, NULL);
1430 if (error) { 1419 if (error) {
1431 spin_unlock_irq(&timer.it_lock); 1420 spin_unlock_irq(&timer.it_lock);
1432 return error; 1421 return error;
@@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1454 * We were interrupted by a signal. 1443 * We were interrupted by a signal.
1455 */ 1444 */
1456 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1445 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1457 posix_cpu_timer_set(&timer, 0, &zero_it, &it); 1446 posix_cpu_timer_set(&timer, 0, &zero_it, it);
1458 spin_unlock_irq(&timer.it_lock); 1447 spin_unlock_irq(&timer.it_lock);
1459 1448
1460 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { 1449 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1461 /* 1450 /*
1462 * It actually did fire already. 1451 * It actually did fire already.
1463 */ 1452 */
1464 return 0; 1453 return 0;
1465 } 1454 }
1466 1455
1456 error = -ERESTART_RESTARTBLOCK;
1457 }
1458
1459 return error;
1460}
1461
1462int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1463 struct timespec *rqtp, struct timespec __user *rmtp)
1464{
1465 struct restart_block *restart_block =
1466 &current_thread_info()->restart_block;
1467 struct itimerspec it;
1468 int error;
1469
1470 /*
1471 * Diagnose required errors first.
1472 */
1473 if (CPUCLOCK_PERTHREAD(which_clock) &&
1474 (CPUCLOCK_PID(which_clock) == 0 ||
1475 CPUCLOCK_PID(which_clock) == current->pid))
1476 return -EINVAL;
1477
1478 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1479
1480 if (error == -ERESTART_RESTARTBLOCK) {
1481
1482 if (flags & TIMER_ABSTIME)
1483 return -ERESTARTNOHAND;
1467 /* 1484 /*
1468 * Report back to the user the time still remaining. 1485 * Report back to the user the time still remaining.
1469 */ 1486 */
1470 if (rmtp != NULL && !(flags & TIMER_ABSTIME) && 1487 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1471 copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1472 return -EFAULT; 1488 return -EFAULT;
1473 1489
1474 restart_block->fn = posix_cpu_clock_nanosleep_restart; 1490 restart_block->fn = posix_cpu_nsleep_restart;
1475 /* Caller already set restart_block->arg1 */
1476 restart_block->arg0 = which_clock; 1491 restart_block->arg0 = which_clock;
1477 restart_block->arg1 = (unsigned long) rmtp; 1492 restart_block->arg1 = (unsigned long) rmtp;
1478 restart_block->arg2 = rqtp->tv_sec; 1493 restart_block->arg2 = rqtp->tv_sec;
1479 restart_block->arg3 = rqtp->tv_nsec; 1494 restart_block->arg3 = rqtp->tv_nsec;
1480
1481 error = -ERESTART_RESTARTBLOCK;
1482 } 1495 }
1483
1484 return error; 1496 return error;
1485} 1497}
1486 1498
1487static long 1499long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1488posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
1489{ 1500{
1490 clockid_t which_clock = restart_block->arg0; 1501 clockid_t which_clock = restart_block->arg0;
1491 struct timespec __user *rmtp; 1502 struct timespec __user *rmtp;
1492 struct timespec t; 1503 struct timespec t;
1504 struct itimerspec it;
1505 int error;
1493 1506
1494 rmtp = (struct timespec __user *) restart_block->arg1; 1507 rmtp = (struct timespec __user *) restart_block->arg1;
1495 t.tv_sec = restart_block->arg2; 1508 t.tv_sec = restart_block->arg2;
1496 t.tv_nsec = restart_block->arg3; 1509 t.tv_nsec = restart_block->arg3;
1497 1510
1498 restart_block->fn = do_no_restart_syscall; 1511 restart_block->fn = do_no_restart_syscall;
1499 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); 1512 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1513
1514 if (error == -ERESTART_RESTARTBLOCK) {
1515 /*
1516 * Report back to the user the time still remaining.
1517 */
1518 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1519 return -EFAULT;
1520
1521 restart_block->fn = posix_cpu_nsleep_restart;
1522 restart_block->arg0 = which_clock;
1523 restart_block->arg1 = (unsigned long) rmtp;
1524 restart_block->arg2 = t.tv_sec;
1525 restart_block->arg3 = t.tv_nsec;
1526 }
1527 return error;
1528
1500} 1529}
1501 1530
1502 1531
@@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1524{ 1553{
1525 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); 1554 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1526} 1555}
1556static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1557{
1558 return -EINVAL;
1559}
1527static int thread_cpu_clock_getres(const clockid_t which_clock, 1560static int thread_cpu_clock_getres(const clockid_t which_clock,
1528 struct timespec *tp) 1561 struct timespec *tp)
1529{ 1562{
@@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
1544{ 1577{
1545 return -EINVAL; 1578 return -EINVAL;
1546} 1579}
1580static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
1581{
1582 return -EINVAL;
1583}
1547 1584
1548static __init int init_posix_cpu_timers(void) 1585static __init int init_posix_cpu_timers(void)
1549{ 1586{
@@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void)
1553 .clock_set = do_posix_clock_nosettime, 1590 .clock_set = do_posix_clock_nosettime,
1554 .timer_create = process_cpu_timer_create, 1591 .timer_create = process_cpu_timer_create,
1555 .nsleep = process_cpu_nsleep, 1592 .nsleep = process_cpu_nsleep,
1593 .nsleep_restart = process_cpu_nsleep_restart,
1556 }; 1594 };
1557 struct k_clock thread = { 1595 struct k_clock thread = {
1558 .clock_getres = thread_cpu_clock_getres, 1596 .clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void)
1560 .clock_set = do_posix_clock_nosettime, 1598 .clock_set = do_posix_clock_nosettime,
1561 .timer_create = thread_cpu_timer_create, 1599 .timer_create = thread_cpu_timer_create,
1562 .nsleep = thread_cpu_nsleep, 1600 .nsleep = thread_cpu_nsleep,
1601 .nsleep_restart = thread_cpu_nsleep_restart,
1563 }; 1602 };
1564 1603
1565 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1604 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
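
The restructuring above separates the sleep itself (do_cpu_nanosleep) from the restart bookkeeping, and a restarted sleep always resumes against an absolute deadline so no time is lost across signals. A hedged userspace sketch of that restart pattern using the POSIX clock_nanosleep() interface; struct restart_info is a made-up stand-in for the kernel's restart_block:

#include <errno.h>
#include <stdio.h>
#include <time.h>

struct restart_info {
        long (*fn)(struct restart_info *);
        struct timespec deadline;       /* absolute wake-up time */
};

static long sleep_until(struct restart_info *ri)
{
        /* TIMER_ABSTIME makes the restart idempotent: re-sleeping to
         * the same absolute deadline cannot oversleep. */
        return clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                               &ri->deadline, NULL);
}

int main(void)
{
        struct restart_info ri;

        clock_gettime(CLOCK_MONOTONIC, &ri.deadline);
        ri.deadline.tv_sec += 1;        /* 1s relative -> absolute */
        ri.fn = sleep_until;

        while (ri.fn(&ri) == EINTR)     /* the restart-block loop */
                ;
        puts("deadline reached");
        return 0;
}
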
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..e5ebcc1ec3a0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
973 return CLOCK_DISPATCH(which_clock, nsleep, 973 return CLOCK_DISPATCH(which_clock, nsleep,
974 (which_clock, flags, &t, rmtp)); 974 (which_clock, flags, &t, rmtp));
975} 975}
976
977/*
978 * nanosleep_restart for monotonic and realtime clocks
979 */
980static int common_nsleep_restart(struct restart_block *restart_block)
981{
982 return hrtimer_nanosleep_restart(restart_block);
983}
984
985/*
986 * This will restart clock_nanosleep. This is required only by
987 * compat_clock_nanosleep_restart for now.
988 */
989long
990clock_nanosleep_restart(struct restart_block *restart_block)
991{
992 clockid_t which_clock = restart_block->arg0;
993
994 return CLOCK_DISPATCH(which_clock, nsleep_restart,
995 (restart_block));
996}
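
clock_nanosleep_restart() is a thin dispatcher: the clock id saved in the restart block selects that clock's nsleep_restart hook, which is exactly the slot the posix-cpu-timers hunks above populate. A toy model of such a dispatch table, with names loosely modeled on struct k_clock:

#include <errno.h>
#include <stdio.h>

struct restart_blk { int clock_id; };

struct clock_ops {
        long (*nsleep_restart)(struct restart_blk *);
};

static long monotonic_restart(struct restart_blk *rb)
{
        printf("restarting sleep on clock %d\n", rb->clock_id);
        return 0;
}

static long no_restart(struct restart_blk *rb)
{
        (void)rb;
        return -EINVAL;                 /* this clock cannot be restarted */
}

static struct clock_ops clock_table[] = {
        { .nsleep_restart = monotonic_restart },
        { .nsleep_restart = no_restart },
};

static long clock_nanosleep_restart_sketch(struct restart_blk *rb)
{
        return clock_table[rb->clock_id].nsleep_restart(rb);
}

int main(void)
{
        struct restart_blk rb = { .clock_id = 0 };
        return (int)clock_nanosleep_restart_sketch(&rb);
}
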
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 619ecabf7c58..825068ca3479 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,17 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config DISABLE_CONSOLE_SUSPEND
40 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
41 depends on PM && PM_DEBUG
42 default n
43 ---help---
44 This option turns off the console suspend mechanism that prevents
45 debug messages from reaching the console during the suspend/resume
46 operations. This may be helpful when debugging device drivers'
47 suspend/resume routines, but may itself lead to problems, for example
48 if netconsole is used.
49
39config PM_TRACE 50config PM_TRACE
40 bool "Suspend/resume event tracing" 51 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL 52 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
@@ -53,6 +64,17 @@ config PM_TRACE
53 CAUTION: this option will cause your machine's real-time clock to be 64 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume. 65 set to an invalid time after a resume.
55 66
67config PM_SYSFS_DEPRECATED
68 bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
69 depends on PM && SYSFS
70 default n
71 help
72 The driver model started out with a sysfs file intended to provide
73 a userspace hook for device power management. This feature has never
74 worked very well, except for limited testing purposes, and so it will
75 be removed. It's not clear that a generic mechanism could really
76 handle the wide variability of device power states; any replacements
77 are likely to be bus or driver specific.
56 78
57config SOFTWARE_SUSPEND 79config SOFTWARE_SUSPEND
58 bool "Software Suspend" 80 bool "Software Suspend"
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e13e74067845..d72234942798 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/cpu.h>
21 22
22#include "power.h" 23#include "power.h"
23 24
@@ -72,7 +73,10 @@ static int prepare_processes(void)
72 int error; 73 int error;
73 74
74 pm_prepare_console(); 75 pm_prepare_console();
75 disable_nonboot_cpus(); 76
77 error = disable_nonboot_cpus();
78 if (error)
79 goto enable_cpus;
76 80
77 if (freeze_processes()) { 81 if (freeze_processes()) {
78 error = -EBUSY; 82 error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
84 return 0; 88 return 0;
85thaw: 89thaw:
86 thaw_processes(); 90 thaw_processes();
91enable_cpus:
87 enable_nonboot_cpus(); 92 enable_nonboot_cpus();
88 pm_restore_console(); 93 pm_restore_console();
89 return error; 94 return error;
@@ -98,7 +103,7 @@ static void unprepare_processes(void)
98} 103}
99 104
100/** 105/**
101 * pm_suspend_disk - The granpappy of power management. 106 * pm_suspend_disk - The granpappy of hibernation power management.
102 * 107 *
103 * If we're going through the firmware, then get it over with quickly. 108 * If we're going through the firmware, then get it over with quickly.
104 * 109 *
@@ -207,7 +212,7 @@ static int software_resume(void)
207 212
208 pr_debug("PM: Preparing devices for restore.\n"); 213 pr_debug("PM: Preparing devices for restore.\n");
209 214
210 if ((error = device_suspend(PMSG_FREEZE))) { 215 if ((error = device_suspend(PMSG_PRETHAW))) {
211 printk("Some devices failed to suspend\n"); 216 printk("Some devices failed to suspend\n");
212 swsusp_free(); 217 swsusp_free();
213 goto Thaw; 218 goto Thaw;
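
Because disable_nonboot_cpus() can now fail, prepare_processes() gains a second unwind label, and each completed step is undone in reverse order on the error path. A generic sketch of this goto-unwind idiom (the step names are placeholders):

#include <stdio.h>

static int step(const char *name, int fail)
{
        printf("%s\n", name);
        return fail ? -1 : 0;
}

static int prepare(void)
{
        int error;

        error = step("disable nonboot cpus", 0);
        if (error)
                goto enable_cpus;               /* nothing else to undo */

        error = step("freeze processes", 1);    /* simulate a failure */
        if (error)
                goto thaw;

        return 0;
thaw:
        step("thaw processes", 0);
enable_cpus:
        step("enable nonboot cpus", 0);
        return error;
}

int main(void)
{
        return prepare() ? 1 : 0;
}
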
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c776794..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18#include <linux/console.h> 18#include <linux/console.h>
19#include <linux/cpu.h>
20#include <linux/resume-trace.h>
19 21
20#include "power.h" 22#include "power.h"
21 23
@@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops)
51 53
52static int suspend_prepare(suspend_state_t state) 54static int suspend_prepare(suspend_state_t state)
53{ 55{
54 int error = 0; 56 int error;
55 unsigned int free_pages; 57 unsigned int free_pages;
56 58
57 if (!pm_ops || !pm_ops->enter) 59 if (!pm_ops || !pm_ops->enter)
@@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state)
59 61
60 pm_prepare_console(); 62 pm_prepare_console();
61 63
62 disable_nonboot_cpus(); 64 error = disable_nonboot_cpus();
63 65 if (error)
64 if (num_online_cpus() != 1) {
65 error = -EPERM;
66 goto Enable_cpu; 66 goto Enable_cpu;
67 }
68 67
69 if (freeze_processes()) { 68 if (freeze_processes()) {
70 error = -EAGAIN; 69 error = -EAGAIN;
@@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
283 282
284power_attr(state); 283power_attr(state);
285 284
285#ifdef CONFIG_PM_TRACE
286int pm_trace_enabled;
287
288static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
289{
290 return sprintf(buf, "%d\n", pm_trace_enabled);
291}
292
293static ssize_t
294pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
295{
296 int val;
297
298 if (sscanf(buf, "%d", &val) == 1) {
299 pm_trace_enabled = !!val;
300 return n;
301 }
302 return -EINVAL;
303}
304
305power_attr(pm_trace);
306
307static struct attribute * g[] = {
308 &state_attr.attr,
309 &pm_trace_attr.attr,
310 NULL,
311};
312#else
286static struct attribute * g[] = { 313static struct attribute * g[] = {
287 &state_attr.attr, 314 &state_attr.attr,
288 NULL, 315 NULL,
289}; 316};
317#endif /* CONFIG_PM_TRACE */
290 318
291static struct attribute_group attr_group = { 319static struct attribute_group attr_group = {
292 .attrs = g, 320 .attrs = g,
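
pm_trace_store() above normalizes any nonzero write to 1 with the !!val idiom and returns the byte count on success, -EINVAL otherwise. A userspace model of the same parse-and-clamp step (store_flag is a hypothetical stand-in, not the kernel function):

#include <errno.h>
#include <stdio.h>

static int trace_enabled;

/* Parse an integer, clamp it to 0/1, report the bytes consumed. */
static long store_flag(const char *buf, long n)
{
        int val;

        if (sscanf(buf, "%d", &val) == 1) {
                trace_enabled = !!val;  /* any nonzero value becomes 1 */
                return n;
        }
        return -EINVAL;
}

int main(void)
{
        store_flag("7\n", 2);
        printf("trace_enabled = %d\n", trace_enabled);
        return 0;
}
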
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 57a792982fb9..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
38/* References to section boundaries */ 38/* References to section boundaries */
39extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
40 40
41extern struct pbe *pagedir_nosave;
42
43/* Preferred image size in bytes (default 500 MB) */ 41/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 42extern unsigned long image_size;
45extern int in_suspend; 43extern int in_suspend;
@@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void);
50 48
51extern unsigned int count_data_pages(void); 49extern unsigned int count_data_pages(void);
52 50
51/**
52 * Auxiliary structure used for reading the snapshot image data and
53 * metadata from and writing them to the list of page backup entries
54 * (PBEs) which is the main data structure of swsusp.
55 *
56 * Using struct snapshot_handle we can transfer the image, including its
57 * metadata, as a continuous sequence of bytes with the help of
58 * snapshot_read_next() and snapshot_write_next().
59 *
60 * The code that writes the image to a storage or transfers it to
61 * the user land is required to use snapshot_read_next() for this
62 * purpose and it should not make any assumptions regarding the internal
63 * structure of the image. Similarly, the code that reads the image from
64 * a storage or transfers it from the user land is required to use
65 * snapshot_write_next().
66 *
67 * This may allow us to change the internal structure of the image
68 * in the future with considerably less effort.
69 */
70
53struct snapshot_handle { 71struct snapshot_handle {
54 loff_t offset; 72 loff_t offset; /* number of the last byte ready for reading
55 unsigned int page; 73 * or writing in the sequence
56 unsigned int page_offset; 74 */
57 unsigned int prev; 75 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
58 struct pbe *pbe, *last_pbe; 76 * next operation will refer to (ie. current)
59 void *buffer; 77 */
60 unsigned int buf_offset; 78 unsigned int cur_offset; /* offset with respect to the current
79 * block (for the next operation)
80 */
81 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
82 * was the current one previously
83 */
84 void *buffer; /* address of the block to read from
85 * or write to
86 */
87 unsigned int buf_offset; /* location to read from or write to,
88 * given as a displacement from 'buffer'
89 */
90 int sync_read; /* Set to one to notify the caller of
91 * snapshot_write_next() that it may
92 * need to call wait_on_bio_chain()
93 */
61}; 94};
62 95
96/* This macro returns the address from/to which the caller of
97 * snapshot_read_next()/snapshot_write_next() is allowed to
98 * read/write data after the function returns
99 */
63#define data_of(handle) ((handle).buffer + (handle).buf_offset) 100#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64 101
102extern unsigned int snapshot_additional_pages(struct zone *zone);
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle); 105extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
68 107
69#define SNAPSHOT_IOC_MAGIC '3' 108#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
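
Callers of snapshot_read_next()/snapshot_write_next() are meant to treat the image as an opaque byte stream while the handle tracks the current block and offset. A userspace sketch of that position arithmetic, assuming a 4096-byte page; the field names mirror struct snapshot_handle, but the kernel version also swaps the data buffer per block:

#include <stdio.h>

#define PAGE_SIZE 4096

struct handle {
        long offset;                    /* bytes handed out so far */
        unsigned int cur;               /* current PAGE_SIZE block */
        unsigned int cur_offset;        /* offset inside that block */
};

/* Clamp 'count' to the current block and advance the position,
 * mimicking the arithmetic snapshot_read_next() uses further down. */
static long read_next(struct handle *h, long count)
{
        if (h->cur_offset + count >= PAGE_SIZE) {
                count = PAGE_SIZE - h->cur_offset;
                h->cur_offset = 0;
                h->cur++;
        } else {
                h->cur_offset += count;
        }
        h->offset += count;
        return count;
}

int main(void)
{
        struct handle h = { 0 };
        long total = 0;

        while (h.cur < 2)               /* drain two "pages" */
                total += read_next(&h, 1000);
        printf("consumed %ld bytes in chunks of <= %d\n", total, PAGE_SIZE);
        return 0;
}
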
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
1/*
2 * drivers/power/smp.c - Functions for stopping other CPUs.
3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#undef DEBUG
11
12#include <linux/smp_lock.h>
13#include <linux/interrupt.h>
14#include <linux/suspend.h>
15#include <linux/module.h>
16#include <linux/cpu.h>
17#include <asm/atomic.h>
18#include <asm/tlbflush.h>
19
20/* This is protected by pm_sem semaphore */
21static cpumask_t frozen_cpus;
22
23void disable_nonboot_cpus(void)
24{
25 int cpu, error;
26
27 error = 0;
28 cpus_clear(frozen_cpus);
29 printk("Freezing cpus ...\n");
30 for_each_online_cpu(cpu) {
31 if (cpu == 0)
32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
40 }
41 BUG_ON(raw_smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
44}
45
46void enable_nonboot_cpus(void)
47{
48 int cpu, error;
49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = cpu_up(cpu);
53 if (!error) {
54 printk("CPU%d is up\n", cpu);
55 continue;
56 }
57 printk("Error taking cpu %d up: %d\n", cpu, error);
58 panic("Not enough cpus");
59 }
60 cpus_clear(frozen_cpus);
61}
62
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 75d4886e648e..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,10 +34,12 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37struct pbe *pagedir_nosave; 37/* List of PBEs used for creating and restoring the suspend image */
38struct pbe *restore_pblist;
39
38static unsigned int nr_copy_pages; 40static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages; 41static unsigned int nr_meta_pages;
40static unsigned long *buffer; 42static void *buffer;
41 43
42#ifdef CONFIG_HIGHMEM 44#ifdef CONFIG_HIGHMEM
43unsigned int count_highmem_pages(void) 45unsigned int count_highmem_pages(void)
@@ -156,240 +158,637 @@ static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;} 158static inline int restore_highmem(void) {return 0;}
157#endif 159#endif
158 160
159static int pfn_is_nosave(unsigned long pfn) 161/**
162 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages
164 * used before suspend.
165 *
166 * The unsafe pages are marked with the PG_nosave_free flag
167 * and we count them using unsafe_pages
168 */
169
170#define PG_ANY 0
171#define PG_SAFE 1
172#define PG_UNSAFE_CLEAR 1
173#define PG_UNSAFE_KEEP 0
174
175static unsigned int allocated_unsafe_pages;
176
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
160{ 178{
161 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; 179 void *res;
162 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; 180
163 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); 181 res = (void *)get_zeroed_page(gfp_mask);
182 if (safe_needed)
183 while (res && PageNosaveFree(virt_to_page(res))) {
184 /* The page is unsafe, mark it for swsusp_free() */
185 SetPageNosave(virt_to_page(res));
186 allocated_unsafe_pages++;
187 res = (void *)get_zeroed_page(gfp_mask);
188 }
189 if (res) {
190 SetPageNosave(virt_to_page(res));
191 SetPageNosaveFree(virt_to_page(res));
192 }
193 return res;
194}
195
196unsigned long get_safe_page(gfp_t gfp_mask)
197{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
164} 199}
165 200
166/** 201/**
167 * saveable - Determine whether a page should be cloned or not. 202 * free_image_page - free page represented by @addr, allocated with
168 * @pfn: The page 203 * alloc_image_page (page flags set by it must be cleared)
169 *
170 * We save a page if it's Reserved, and not in the range of pages
171 * statically defined as 'unsaveable', or if it isn't reserved, and
172 * isn't part of a free chunk of pages.
173 */ 204 */
174 205
175static int saveable(struct zone *zone, unsigned long *zone_pfn) 206static inline void free_image_page(void *addr, int clear_nosave_free)
176{ 207{
177 unsigned long pfn = *zone_pfn + zone->zone_start_pfn; 208 ClearPageNosave(virt_to_page(addr));
178 struct page *page; 209 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr));
211 free_page((unsigned long)addr);
212}
179 213
180 if (!pfn_valid(pfn)) 214/* struct linked_page is used to build chains of pages */
181 return 0;
182 215
183 page = pfn_to_page(pfn); 216#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
184 BUG_ON(PageReserved(page) && PageNosave(page));
185 if (PageNosave(page))
186 return 0;
187 if (PageReserved(page) && pfn_is_nosave(pfn))
188 return 0;
189 if (PageNosaveFree(page))
190 return 0;
191 217
192 return 1; 218struct linked_page {
193} 219 struct linked_page *next;
220 char data[LINKED_PAGE_DATA_SIZE];
221} __attribute__((packed));
194 222
195unsigned int count_data_pages(void) 223static inline void
224free_list_of_pages(struct linked_page *list, int clear_page_nosave)
196{ 225{
197 struct zone *zone; 226 while (list) {
198 unsigned long zone_pfn; 227 struct linked_page *lp = list->next;
199 unsigned int n = 0;
200 228
201 for_each_zone (zone) { 229 free_image_page(list, clear_page_nosave);
202 if (is_highmem(zone)) 230 list = lp;
203 continue;
204 mark_free_pages(zone);
205 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
206 n += saveable(zone, &zone_pfn);
207 } 231 }
208 return n;
209} 232}
210 233
211static void copy_data_pages(struct pbe *pblist) 234/**
235 * struct chain_allocator is used for allocating small objects out of
236 * a linked list of pages called 'the chain'.
237 *
238 * The chain grows each time when there is no room for a new object in
239 * the current page. The allocated objects cannot be freed individually.
240 * It is only possible to free them all at once, by freeing the entire
241 * chain.
242 *
243 * NOTE: The chain allocator may be inefficient if the allocated objects
244 * are not much smaller than PAGE_SIZE.
245 */
246
247struct chain_allocator {
248 struct linked_page *chain; /* the chain */
249 unsigned int used_space; /* total size of objects allocated out
250 * of the current page
251 */
252 gfp_t gfp_mask; /* mask for allocating pages */
253 int safe_needed; /* if set, only "safe" pages are allocated */
254};
255
256static void
257chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
212{ 258{
213 struct zone *zone; 259 ca->chain = NULL;
214 unsigned long zone_pfn; 260 ca->used_space = LINKED_PAGE_DATA_SIZE;
215 struct pbe *pbe, *p; 261 ca->gfp_mask = gfp_mask;
262 ca->safe_needed = safe_needed;
263}
216 264
217 pbe = pblist; 265static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
218 for_each_zone (zone) { 266{
219 if (is_highmem(zone)) 267 void *ret;
220 continue; 268
221 mark_free_pages(zone); 269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
222 /* This is necessary for swsusp_free() */ 270 struct linked_page *lp;
223 for_each_pb_page (p, pblist) 271
224 SetPageNosaveFree(virt_to_page(p)); 272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
225 for_each_pbe (p, pblist) 273 if (!lp)
226 SetPageNosaveFree(virt_to_page(p->address)); 274 return NULL;
227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 275
228 if (saveable(zone, &zone_pfn)) { 276 lp->next = ca->chain;
229 struct page *page; 277 ca->chain = lp;
230 long *src, *dst; 278 ca->used_space = 0;
231 int n;
232
233 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
234 BUG_ON(!pbe);
235 pbe->orig_address = (unsigned long)page_address(page);
236 /* copy_page and memcpy are not usable for copying task structs. */
237 dst = (long *)pbe->address;
238 src = (long *)pbe->orig_address;
239 for (n = PAGE_SIZE / sizeof(long); n; n--)
240 *dst++ = *src++;
241 pbe = pbe->next;
242 }
243 }
244 } 279 }
245 BUG_ON(pbe); 280 ret = ca->chain->data + ca->used_space;
281 ca->used_space += size;
282 return ret;
246} 283}
247 284
285static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
286{
287 free_list_of_pages(ca->chain, clear_page_nosave);
288 memset(ca, 0, sizeof(struct chain_allocator));
289}
248 290
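
A userspace model of the chain allocator introduced above: a bump allocator over a singly linked chain of fixed-size pages, in which objects can only be freed all at once by releasing the whole chain. Sizes and names are illustrative:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_DATA 4088                  /* "PAGE_SIZE" minus the link */

struct linked_page {
        struct linked_page *next;
        char data[PAGE_DATA];
};

struct chain_allocator {
        struct linked_page *chain;
        size_t used;                    /* bytes used in current page */
};

static void *chain_alloc(struct chain_allocator *ca, size_t size)
{
        if (PAGE_DATA - ca->used < size) {
                struct linked_page *lp = calloc(1, sizeof(*lp));

                if (!lp)
                        return NULL;
                lp->next = ca->chain;   /* grow the chain */
                ca->chain = lp;
                ca->used = 0;
        }
        void *ret = ca->chain->data + ca->used;
        ca->used += size;
        return ret;
}

static void chain_free_all(struct chain_allocator *ca)
{
        while (ca->chain) {
                struct linked_page *next = ca->chain->next;

                free(ca->chain);
                ca->chain = next;
        }
        ca->used = 0;
}

int main(void)
{
        /* used = PAGE_DATA forces the first page, as chain_init() does */
        struct chain_allocator ca = { .chain = NULL, .used = PAGE_DATA };
        int *a = chain_alloc(&ca, sizeof(int));
        int *b = chain_alloc(&ca, sizeof(int));

        *a = 1; *b = 2;
        printf("%d %d\n", *a, *b);
        chain_free_all(&ca);
        return 0;
}
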
249/** 291/**
250 * free_pagedir - free pages allocated with alloc_pagedir() 292 * Data types related to memory bitmaps.
293 *
294 * A memory bitmap is a structure consisting of many linked lists of
295 * objects. The main list's elements are of type struct zone_bitmap
296 * and each of them corresponds to one zone. For each zone bitmap
297 * object there is a list of objects of type struct bm_block that
298 * represent the blocks of bit chunks in which information is
299 * stored.
300 *
301 * struct memory_bitmap contains a pointer to the main list of zone
302 * bitmap objects, a struct bm_position used for browsing the bitmap,
303 * and a pointer to the list of pages used for allocating all of the
304 * zone bitmap objects and bitmap block objects.
305 *
306 * NOTE: It has to be possible to lay out the bitmap in memory
307 * using only allocations of order 0. Additionally, the bitmap is
308 * designed to work with an arbitrary number of zones (this is over the
309 * top for now, but let's avoid making unnecessary assumptions ;-).
310 *
311 * struct zone_bitmap contains a pointer to a list of bitmap block
312 * objects and a pointer to the bitmap block object that has been
313 * most recently used for setting bits. Additionally, it contains the
314 * pfns that correspond to the start and end of the represented zone.
315 *
316 * struct bm_block contains a pointer to the memory page in which
317 * information is stored (in the form of a block of bit chunks
318 * of type unsigned long each). It also contains the pfns that
319 * correspond to the start and end of the represented memory area and
320 * the number of bit chunks in the block.
321 *
322 * NOTE: Memory bitmaps are used for two types of operations only:
323 * "set a bit" and "find the next bit set". Moreover, the searching
324 * is always carried out after all of the "set a bit" operations
325 * on a given bitmap.
251 */ 326 */
252 327
253static void free_pagedir(struct pbe *pblist, int clear_nosave_free) 328#define BM_END_OF_MAP (~0UL)
329
330#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
331#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
332#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
333
334struct bm_block {
335 struct bm_block *next; /* next element of the list */
336 unsigned long start_pfn; /* pfn represented by the first bit */
337 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
338 unsigned int size; /* number of bit chunks */
339 unsigned long *data; /* chunks of bits representing pages */
340};
341
342struct zone_bitmap {
343 struct zone_bitmap *next; /* next element of the list */
344 unsigned long start_pfn; /* minimal pfn in this zone */
345 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
346 struct bm_block *bm_blocks; /* list of bitmap blocks */
347 struct bm_block *cur_block; /* recently used bitmap block */
348};
349
350/* struct bm_position is used for browsing memory bitmaps */
351
352struct bm_position {
353 struct zone_bitmap *zone_bm;
354 struct bm_block *block;
355 int chunk;
356 int bit;
357};
358
359struct memory_bitmap {
360 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
361 struct linked_page *p_list; /* list of pages used to store zone
362 * bitmap objects and bitmap block
363 * objects
364 */
365 struct bm_position cur; /* most recently used bit position */
366};
367
368/* Functions that operate on memory bitmaps */
369
370static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
254{ 371{
255 struct pbe *pbe; 372 bm->cur.chunk = 0;
373 bm->cur.bit = -1;
374}
256 375
257 while (pblist) { 376static void memory_bm_position_reset(struct memory_bitmap *bm)
258 pbe = (pblist + PB_PAGE_SKIP)->next; 377{
259 ClearPageNosave(virt_to_page(pblist)); 378 struct zone_bitmap *zone_bm;
260 if (clear_nosave_free) 379
261 ClearPageNosaveFree(virt_to_page(pblist)); 380 zone_bm = bm->zone_bm_list;
262 free_page((unsigned long)pblist); 381 bm->cur.zone_bm = zone_bm;
263 pblist = pbe; 382 bm->cur.block = zone_bm->bm_blocks;
264 } 383 memory_bm_reset_chunk(bm);
265} 384}
266 385
386static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
387
267/** 388/**
268 * fill_pb_page - Create a list of PBEs on a given memory page 389 * create_bm_block_list - create a list of block bitmap objects
269 */ 390 */
270 391
271static inline void fill_pb_page(struct pbe *pbpage) 392static inline struct bm_block *
393create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
272{ 394{
273 struct pbe *p; 395 struct bm_block *bblist = NULL;
396
397 while (nr_blocks-- > 0) {
398 struct bm_block *bb;
274 399
275 p = pbpage; 400 bb = chain_alloc(ca, sizeof(struct bm_block));
276 pbpage += PB_PAGE_SKIP; 401 if (!bb)
277 do 402 return NULL;
278 p->next = p + 1; 403
279 while (++p < pbpage); 404 bb->next = bblist;
405 bblist = bb;
406 }
407 return bblist;
280} 408}
281 409
282/** 410/**
283 * create_pbe_list - Create a list of PBEs on top of a given chain 411 * create_zone_bm_list - create a list of zone bitmap objects
284 * of memory pages allocated with alloc_pagedir()
285 */ 412 */
286 413
287static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 414static inline struct zone_bitmap *
415create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
288{ 416{
289 struct pbe *pbpage, *p; 417 struct zone_bitmap *zbmlist = NULL;
290 unsigned int num = PBES_PER_PAGE;
291 418
292 for_each_pb_page (pbpage, pblist) { 419 while (nr_zones-- > 0) {
293 if (num >= nr_pages) 420 struct zone_bitmap *zbm;
294 break; 421
422 zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
423 if (!zbm)
424 return NULL;
425
426 zbm->next = zbmlist;
427 zbmlist = zbm;
428 }
429 return zbmlist;
430}
431
432/**
433 * memory_bm_create - allocate memory for a memory bitmap
434 */
435
436static int
437memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
438{
439 struct chain_allocator ca;
440 struct zone *zone;
441 struct zone_bitmap *zone_bm;
442 struct bm_block *bb;
443 unsigned int nr;
444
445 chain_init(&ca, gfp_mask, safe_needed);
295 446
296 fill_pb_page(pbpage); 447 /* Compute the number of zones */
297 num += PBES_PER_PAGE; 448 nr = 0;
449 for_each_zone (zone)
450 if (populated_zone(zone) && !is_highmem(zone))
451 nr++;
452
453 /* Allocate the list of zones bitmap objects */
454 zone_bm = create_zone_bm_list(nr, &ca);
455 bm->zone_bm_list = zone_bm;
456 if (!zone_bm) {
457 chain_free(&ca, PG_UNSAFE_CLEAR);
458 return -ENOMEM;
298 } 459 }
299 if (pbpage) { 460
300 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) 461 /* Initialize the zone bitmap objects */
301 p->next = p + 1; 462 for_each_zone (zone) {
302 p->next = NULL; 463 unsigned long pfn;
464
465 if (!populated_zone(zone) || is_highmem(zone))
466 continue;
467
468 zone_bm->start_pfn = zone->zone_start_pfn;
469 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
470 /* Allocate the list of bitmap block objects */
471 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
472 bb = create_bm_block_list(nr, &ca);
473 zone_bm->bm_blocks = bb;
474 zone_bm->cur_block = bb;
475 if (!bb)
476 goto Free;
477
478 nr = zone->spanned_pages;
479 pfn = zone->zone_start_pfn;
480 /* Initialize the bitmap block objects */
481 while (bb) {
482 unsigned long *ptr;
483
484 ptr = alloc_image_page(gfp_mask, safe_needed);
485 bb->data = ptr;
486 if (!ptr)
487 goto Free;
488
489 bb->start_pfn = pfn;
490 if (nr >= BM_BITS_PER_BLOCK) {
491 pfn += BM_BITS_PER_BLOCK;
492 bb->size = BM_CHUNKS_PER_BLOCK;
493 nr -= BM_BITS_PER_BLOCK;
494 } else {
495 /* This is executed only once in the loop */
496 pfn += nr;
497 bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
498 }
499 bb->end_pfn = pfn;
500 bb = bb->next;
501 }
502 zone_bm = zone_bm->next;
303 } 503 }
504 bm->p_list = ca.chain;
505 memory_bm_position_reset(bm);
506 return 0;
507
508Free:
509 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM;
304} 512}
305 513
306static unsigned int unsafe_pages; 514/**
515 * memory_bm_free - free memory occupied by the memory bitmap @bm
516 */
517
518static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
519{
520 struct zone_bitmap *zone_bm;
521
522 /* Free the list of bit blocks for each zone_bitmap object */
523 zone_bm = bm->zone_bm_list;
524 while (zone_bm) {
525 struct bm_block *bb;
526
527 bb = zone_bm->bm_blocks;
528 while (bb) {
529 if (bb->data)
530 free_image_page(bb->data, clear_nosave_free);
531 bb = bb->next;
532 }
533 zone_bm = zone_bm->next;
534 }
535 free_list_of_pages(bm->p_list, clear_nosave_free);
536 bm->zone_bm_list = NULL;
537}
307 538
308/** 539/**
309 * @safe_needed - on resume, for storing the PBE list and the image, 540 * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
310 * we can only use memory pages that do not conflict with the pages 541 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
311 * used before suspend. 542 * of @bm->cur_zone_bm are updated.
312 * 543 *
313 * The unsafe pages are marked with the PG_nosave_free flag 544 * If the bit cannot be set, the function returns -EINVAL .
314 * and we count them using unsafe_pages
315 */ 545 */
316 546
317static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 547static int
548memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
318{ 549{
319 void *res; 550 struct zone_bitmap *zone_bm;
320 551 struct bm_block *bb;
321 res = (void *)get_zeroed_page(gfp_mask); 552
322 if (safe_needed) 553 /* Check if the pfn is from the current zone */
323 while (res && PageNosaveFree(virt_to_page(res))) { 554 zone_bm = bm->cur.zone_bm;
324 /* The page is unsafe, mark it for swsusp_free() */ 555 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
325 SetPageNosave(virt_to_page(res)); 556 zone_bm = bm->zone_bm_list;
326 unsafe_pages++; 557 /* We don't assume that the zones are sorted by pfns */
327 res = (void *)get_zeroed_page(gfp_mask); 558 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
559 zone_bm = zone_bm->next;
560 if (unlikely(!zone_bm))
561 return -EINVAL;
328 } 562 }
329 if (res) { 563 bm->cur.zone_bm = zone_bm;
330 SetPageNosave(virt_to_page(res));
331 SetPageNosaveFree(virt_to_page(res));
332 } 564 }
333 return res; 565 /* Check if the pfn corresponds to the current bitmap block */
566 bb = zone_bm->cur_block;
567 if (pfn < bb->start_pfn)
568 bb = zone_bm->bm_blocks;
569
570 while (pfn >= bb->end_pfn) {
571 bb = bb->next;
572 if (unlikely(!bb))
573 return -EINVAL;
574 }
575 zone_bm->cur_block = bb;
576 pfn -= bb->start_pfn;
577 set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
578 return 0;
334} 579}
335 580
336unsigned long get_safe_page(gfp_t gfp_mask) 581/* Two auxiliary functions for memory_bm_next_pfn */
582
583/* Find the first set bit in the given chunk, if there is one */
584
585static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
337{ 586{
338 return (unsigned long)alloc_image_page(gfp_mask, 1); 587 bit++;
588 while (bit < BM_BITS_PER_CHUNK) {
589 if (test_bit(bit, chunk_p))
590 return bit;
591
592 bit++;
593 }
594 return -1;
595}
596
597/* Find a chunk containing some bits set in given block of bits */
598
599static inline int next_chunk_in_block(int n, struct bm_block *bb)
600{
601 n++;
602 while (n < bb->size) {
603 if (bb->data[n])
604 return n;
605
606 n++;
607 }
608 return -1;
339} 609}
340 610
341/** 611/**
342 * alloc_pagedir - Allocate the page directory. 612 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
343 * 613 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
344 * First, determine exactly how many pages we need and 614 * returned.
345 * allocate them.
346 * 615 *
347 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE 616 * It is required to run memory_bm_position_reset() before the first call to
348 * struct pbe elements (pbes) and the last element in the page points 617 * this function.
349 * to the next page. 618 */
619
620static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
621{
622 struct zone_bitmap *zone_bm;
623 struct bm_block *bb;
624 int chunk;
625 int bit;
626
627 do {
628 bb = bm->cur.block;
629 do {
630 chunk = bm->cur.chunk;
631 bit = bm->cur.bit;
632 do {
633 bit = next_bit_in_chunk(bit, bb->data + chunk);
634 if (bit >= 0)
635 goto Return_pfn;
636
637 chunk = next_chunk_in_block(chunk, bb);
638 bit = -1;
639 } while (chunk >= 0);
640 bb = bb->next;
641 bm->cur.block = bb;
642 memory_bm_reset_chunk(bm);
643 } while (bb);
644 zone_bm = bm->cur.zone_bm->next;
645 if (zone_bm) {
646 bm->cur.zone_bm = zone_bm;
647 bm->cur.block = zone_bm->bm_blocks;
648 memory_bm_reset_chunk(bm);
649 }
650 } while (zone_bm);
651 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP;
653
654Return_pfn:
655 bm->cur.chunk = chunk;
656 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
658}
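
Stripped of the zone and block lists, memory_bm_next_pfn() is a resumable next-set-bit scan: skip empty word-sized chunks, probe bits within a chunk, and remember the position between calls. A flat, single-bitmap model of that walk:

#include <stdio.h>

#define NCHUNKS 4
#define BITS_PER_CHUNK (8 * sizeof(unsigned long))
#define END_OF_MAP (~0UL)

struct pos { int chunk; int bit; };

static unsigned long next_set_bit(const unsigned long *map, struct pos *p)
{
        for (int c = p->chunk; c < NCHUNKS; c++, p->bit = -1) {
                if (!map[c])
                        continue;       /* skip empty chunks quickly */
                for (int b = p->bit + 1; b < (int)BITS_PER_CHUNK; b++)
                        if (map[c] & (1UL << b)) {
                                p->chunk = c;   /* save the position */
                                p->bit = b;
                                return c * BITS_PER_CHUNK + b;
                        }
        }
        return END_OF_MAP;
}

int main(void)
{
        unsigned long map[NCHUNKS] = { 0x5, 0, 1UL << 3, 0 };
        struct pos p = { .chunk = 0, .bit = -1 };
        unsigned long n;

        while ((n = next_set_bit(map, &p)) != END_OF_MAP)
                printf("bit %lu set\n", n);
        return 0;
}
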
659
660/**
661 * snapshot_additional_pages - estimate the number of additional pages
662 * needed for setting up the suspend image data structures for the given
663 * zone (usually the returned value is greater than the exact number)
664 */
665
666unsigned int snapshot_additional_pages(struct zone *zone)
667{
668 unsigned int res;
669
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res;
673}
674
675/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */
678
679static inline int pfn_is_nosave(unsigned long pfn)
680{
681 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
682 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
683 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
684}
685
686/**
687 * saveable - Determine whether a page should be cloned or not.
688 * @pfn: The page
350 * 689 *
351 * On each page we set up a list of struct_pbe elements. 690 * We save a page if it isn't Nosave, and is not in the range of pages
691 * statically defined as 'unsaveable', and it
692 * isn't a part of a free chunk of pages.
352 */ 693 */
353 694
354static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, 695static struct page *saveable_page(unsigned long pfn)
355 int safe_needed)
356{ 696{
357 unsigned int num; 697 struct page *page;
358 struct pbe *pblist, *pbe; 698
699 if (!pfn_valid(pfn))
700 return NULL;
359 701
360 if (!nr_pages) 702 page = pfn_to_page(pfn);
703
704 if (PageNosave(page))
705 return NULL;
706 if (PageReserved(page) && pfn_is_nosave(pfn))
361 return NULL; 707 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710
711 return page;
712}
713
714unsigned int count_data_pages(void)
715{
716 struct zone *zone;
717 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0;
362 719
363 pblist = alloc_image_page(gfp_mask, safe_needed); 720 for_each_zone (zone) {
364 /* FIXME: rewrite this ugly loop */ 721 if (is_highmem(zone))
365 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 722 continue;
366 pbe = pbe->next, num += PBES_PER_PAGE) { 723 mark_free_pages(zone);
367 pbe += PB_PAGE_SKIP; 724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 pbe->next = alloc_image_page(gfp_mask, safe_needed); 725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn);
369 } 727 }
370 if (!pbe) { /* get_zeroed_page() failed */ 728 return n;
371 free_pagedir(pblist, 1); 729}
372 pblist = NULL; 730
373 } else 731static inline void copy_data_page(long *dst, long *src)
374 create_pbe_list(pblist, nr_pages); 732{
375 return pblist; 733 int n;
734
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++;
738}
739
740static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{
743 struct zone *zone;
744 unsigned long pfn;
745
746 for_each_zone (zone) {
747 unsigned long max_zone_pfn;
748
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn))
756 memory_bm_set_bit(orig_bm, pfn);
757 }
758 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm);
760 do {
761 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) {
763 struct page *page;
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP);
376} 772}
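
Note how copy_data_pages() now pairs the two bitmaps positionally: the n-th set bit of orig_bm names the n-th source page and the n-th set bit of copy_bm its destination. A toy illustration with precomputed pfn sequences standing in for memory_bm_next_pfn():

#include <stdio.h>

#define END (~0UL)

/* Stand-ins for walking two pre-filled bitmaps in lockstep. */
static unsigned long orig_pfns[] = { 3, 7, 9, END };
static unsigned long copy_pfns[] = { 100, 101, 102, END };

int main(void)
{
        for (int i = 0; orig_pfns[i] != END; i++)
                printf("copy page %lu -> page %lu\n",
                       orig_pfns[i], copy_pfns[i]);
        return 0;
}
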
377 773
378/** 774/**
379 * Free pages we allocated for suspend. Suspend pages are alocated 775 * swsusp_free - free pages allocated for the suspend.
380 * before atomic copy, so we need to free them after resume. 776 *
777 * Suspend pages are alocated before the atomic copy is made, so we
778 * need to release them after the resume.
381 */ 779 */
382 780
383void swsusp_free(void) 781void swsusp_free(void)
384{ 782{
385 struct zone *zone; 783 struct zone *zone;
386 unsigned long zone_pfn; 784 unsigned long pfn, max_zone_pfn;
387 785
388 for_each_zone(zone) { 786 for_each_zone(zone) {
389 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
390 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
391 struct page *page; 789 if (pfn_valid(pfn)) {
392 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 790 struct page *page = pfn_to_page(pfn);
791
393 if (PageNosave(page) && PageNosaveFree(page)) { 792 if (PageNosave(page) && PageNosaveFree(page)) {
394 ClearPageNosave(page); 793 ClearPageNosave(page);
395 ClearPageNosaveFree(page); 794 ClearPageNosaveFree(page);
@@ -399,7 +798,7 @@ void swsusp_free(void)
399 } 798 }
400 nr_copy_pages = 0; 799 nr_copy_pages = 0;
401 nr_meta_pages = 0; 800 nr_meta_pages = 0;
402 pagedir_nosave = NULL; 801 restore_pblist = NULL;
403 buffer = NULL; 802 buffer = NULL;
404} 803}
405 804
@@ -414,46 +813,57 @@ void swsusp_free(void)
414static int enough_free_mem(unsigned int nr_pages) 813static int enough_free_mem(unsigned int nr_pages)
415{ 814{
416 struct zone *zone; 815 struct zone *zone;
417 unsigned int n = 0; 816 unsigned int free = 0, meta = 0;
418 817
419 for_each_zone (zone) 818 for_each_zone (zone)
420 if (!is_highmem(zone)) 819 if (!is_highmem(zone)) {
421 n += zone->free_pages; 820 free += zone->free_pages;
422 pr_debug("swsusp: available memory: %u pages\n", n); 821 meta += snapshot_additional_pages(zone);
423 return n > (nr_pages + PAGES_FOR_IO + 822 }
424 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
425}
426 823
427static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
428{ 825 nr_pages, PAGES_FOR_IO, meta, free);
429 struct pbe *p;
430 826
431 for_each_pbe (p, pblist) { 827 return free > nr_pages + PAGES_FOR_IO + meta;
432 p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
433 if (!p->address)
434 return -ENOMEM;
435 }
436 return 0;
437} 828}
438 829
439static struct pbe *swsusp_alloc(unsigned int nr_pages) 830static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages)
440{ 833{
441 struct pbe *pblist; 834 int error;
442 835
443 if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { 836 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
444 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 837 if (error)
445 return NULL; 838 goto Free;
446 }
447 839
448 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 840 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
449 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 841 if (error)
450 swsusp_free(); 842 goto Free;
451 return NULL; 843
844 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
846 if (!page)
847 goto Free;
848
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page));
452 } 852 }
853 return 0;
453 854
454 return pblist; 855Free:
856 swsusp_free();
857 return -ENOMEM;
455} 858}
456 859
860/* Memory bitmap used for marking saveable pages */
861static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies
863 * of saveable pages
864 */
865static struct memory_bitmap copy_bm;
866
457asmlinkage int swsusp_save(void) 867asmlinkage int swsusp_save(void)
458{ 868{
459 unsigned int nr_pages; 869 unsigned int nr_pages;
@@ -464,25 +874,19 @@ asmlinkage int swsusp_save(void)
464 nr_pages = count_data_pages(); 874 nr_pages = count_data_pages();
465 printk("swsusp: Need to copy %u pages\n", nr_pages); 875 printk("swsusp: Need to copy %u pages\n", nr_pages);
466 876
467 pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
468 nr_pages,
469 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
470 PAGES_FOR_IO, nr_free_pages());
471
472 if (!enough_free_mem(nr_pages)) { 877 if (!enough_free_mem(nr_pages)) {
473 printk(KERN_ERR "swsusp: Not enough free memory\n"); 878 printk(KERN_ERR "swsusp: Not enough free memory\n");
474 return -ENOMEM; 879 return -ENOMEM;
475 } 880 }
476 881
477 pagedir_nosave = swsusp_alloc(nr_pages); 882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
478 if (!pagedir_nosave)
479 return -ENOMEM; 883 return -ENOMEM;
480 884
481 /* During allocating of suspend pagedir, new cold pages may appear. 885 /* During allocating of suspend pagedir, new cold pages may appear.
482 * Kill them. 886 * Kill them.
483 */ 887 */
484 drain_local_pages(); 888 drain_local_pages();
485 copy_data_pages(pagedir_nosave); 889 copy_data_pages(&copy_bm, &orig_bm);
486 890
487 /* 891 /*
488 * End of critical section. From now on, we can write to memory, 892 * End of critical section. From now on, we can write to memory,
@@ -511,22 +915,20 @@ static void init_header(struct swsusp_info *info)
511} 915}
512 916
513/** 917/**
514 * pack_orig_addresses - the .orig_address fields of the PBEs from the 918 * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
515 * list starting at @pbe are stored in the array @buf[] (1 page) 919 * are stored in the array @buf[] (1 page at a time)
516 */ 920 */
517 921
518static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) 922static inline void
923pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
519{ 924{
520 int j; 925 int j;
521 926
522 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 927 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
523 buf[j] = pbe->orig_address; 928 buf[j] = memory_bm_next_pfn(bm);
524 pbe = pbe->next; 929 if (unlikely(buf[j] == BM_END_OF_MAP))
930 break;
525 } 931 }
526 if (!pbe)
527 for (; j < PAGE_SIZE / sizeof(long); j++)
528 buf[j] = 0;
529 return pbe;
530} 932}
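
pack_pfns() relies on the caller having zeroed the buffer (snapshot_read_next() does the memset) and writes pfns until the bitmap runs out, so a BM_END_OF_MAP entry terminates the page early; unpack_orig_pfns() later stops at the same marker. A userspace round trip under those assumptions:

#include <stdio.h>

#define SLOTS 8                         /* PAGE_SIZE / sizeof(long) */
#define END_OF_MAP (~0UL)

static unsigned long source[] = { 5, 6, 42, END_OF_MAP };

static void pack(unsigned long *buf, int *idx)
{
        for (int j = 0; j < SLOTS; j++) {
                buf[j] = source[*idx];
                if (buf[j] == END_OF_MAP)
                        break;          /* rest of buf stays zeroed */
                (*idx)++;
        }
}

static void unpack(const unsigned long *buf)
{
        for (int j = 0; j < SLOTS; j++) {
                if (buf[j] == END_OF_MAP)
                        break;
                printf("restore pfn %lu\n", buf[j]);
        }
}

int main(void)
{
        unsigned long buf[SLOTS] = { 0 };
        int idx = 0;

        pack(buf, &idx);
        unpack(buf);
        return 0;
}
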
531 933
532/** 934/**
@@ -553,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
553 955
554int snapshot_read_next(struct snapshot_handle *handle, size_t count) 956int snapshot_read_next(struct snapshot_handle *handle, size_t count)
555{ 957{
556 if (handle->page > nr_meta_pages + nr_copy_pages) 958 if (handle->cur > nr_meta_pages + nr_copy_pages)
557 return 0; 959 return 0;
960
558 if (!buffer) { 961 if (!buffer) {
559 /* This makes the buffer be freed by swsusp_free() */ 962 /* This makes the buffer be freed by swsusp_free() */
560 buffer = alloc_image_page(GFP_ATOMIC, 0); 963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
561 if (!buffer) 964 if (!buffer)
562 return -ENOMEM; 965 return -ENOMEM;
563 } 966 }
564 if (!handle->offset) { 967 if (!handle->offset) {
565 init_header((struct swsusp_info *)buffer); 968 init_header((struct swsusp_info *)buffer);
566 handle->buffer = buffer; 969 handle->buffer = buffer;
567 handle->pbe = pagedir_nosave; 970 memory_bm_position_reset(&orig_bm);
971 memory_bm_position_reset(&copy_bm);
568 } 972 }
569 if (handle->prev < handle->page) { 973 if (handle->prev < handle->cur) {
570 if (handle->page <= nr_meta_pages) { 974 if (handle->cur <= nr_meta_pages) {
571 handle->pbe = pack_orig_addresses(buffer, handle->pbe); 975 memset(buffer, 0, PAGE_SIZE);
572 if (!handle->pbe) 976 pack_pfns(buffer, &orig_bm);
573 handle->pbe = pagedir_nosave;
574 } else { 977 } else {
575 handle->buffer = (void *)handle->pbe->address; 978 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
576 handle->pbe = handle->pbe->next; 979
980 handle->buffer = page_address(pfn_to_page(pfn));
577 } 981 }
578 handle->prev = handle->page; 982 handle->prev = handle->cur;
579 } 983 }
580 handle->buf_offset = handle->page_offset; 984 handle->buf_offset = handle->cur_offset;
581 if (handle->page_offset + count >= PAGE_SIZE) { 985 if (handle->cur_offset + count >= PAGE_SIZE) {
582 count = PAGE_SIZE - handle->page_offset; 986 count = PAGE_SIZE - handle->cur_offset;
583 handle->page_offset = 0; 987 handle->cur_offset = 0;
584 handle->page++; 988 handle->cur++;
585 } else { 989 } else {
586 handle->page_offset += count; 990 handle->cur_offset += count;
587 } 991 }
588 handle->offset += count; 992 handle->offset += count;
589 return count; 993 return count;
@@ -595,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
595 * had been used before suspend 999 * had been used before suspend
596 */ 1000 */
597 1001
598static int mark_unsafe_pages(struct pbe *pblist) 1002static int mark_unsafe_pages(struct memory_bitmap *bm)
599{ 1003{
600 struct zone *zone; 1004 struct zone *zone;
601 unsigned long zone_pfn; 1005 unsigned long pfn, max_zone_pfn;
602 struct pbe *p;
603
604 if (!pblist) /* a sanity check */
605 return -EINVAL;
606 1006
607 /* Clear page flags */ 1007 /* Clear page flags */
608 for_each_zone (zone) { 1008 for_each_zone (zone) {
609 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
610 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) 1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
611 ClearPageNosaveFree(pfn_to_page(zone_pfn + 1011 if (pfn_valid(pfn))
612 zone->zone_start_pfn)); 1012 ClearPageNosaveFree(pfn_to_page(pfn));
613 } 1013 }
614 1014
615 /* Mark orig addresses */ 1015 /* Mark pages that correspond to the "original" pfns as "unsafe" */
616 for_each_pbe (p, pblist) { 1016 memory_bm_position_reset(bm);
617 if (virt_addr_valid(p->orig_address)) 1017 do {
618 SetPageNosaveFree(virt_to_page(p->orig_address)); 1018 pfn = memory_bm_next_pfn(bm);
619 else 1019 if (likely(pfn != BM_END_OF_MAP)) {
620 return -EFAULT; 1020 if (likely(pfn_valid(pfn)))
621 } 1021 SetPageNosaveFree(pfn_to_page(pfn));
1022 else
1023 return -EFAULT;
1024 }
1025 } while (pfn != BM_END_OF_MAP);
622 1026
623 unsafe_pages = 0; 1027 allocated_unsafe_pages = 0;
624 1028
625 return 0; 1029 return 0;
626} 1030}
627 1031
628static void copy_page_backup_list(struct pbe *dst, struct pbe *src) 1032static void
1033duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
629{ 1034{
630 /* We assume both lists contain the same number of elements */ 1035 unsigned long pfn;
631 while (src) { 1036
632 dst->orig_address = src->orig_address; 1037 memory_bm_position_reset(src);
633 dst = dst->next; 1038 pfn = memory_bm_next_pfn(src);
634 src = src->next; 1039 while (pfn != BM_END_OF_MAP) {
1040 memory_bm_set_bit(dst, pfn);
1041 pfn = memory_bm_next_pfn(src);
635 } 1042 }
636} 1043}
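
duplicate_memory_bitmap() above doubles as the canonical iteration idiom for the new bitmaps: reset the position, then pull pfns until BM_END_OF_MAP comes back. A hypothetical helper (not part of the patch) that counts the populated bits would follow the same pattern:

/* hypothetical helper, shown only to illustrate the iteration idiom */
static unsigned int memory_bm_count_bits(struct memory_bitmap *bm)
{
	unsigned long pfn;
	unsigned int n = 0;

	memory_bm_position_reset(bm);
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm))
		n++;

	return n;
}
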
637 1044
638static int check_header(struct swsusp_info *info) 1045static inline int check_header(struct swsusp_info *info)
639{ 1046{
640 char *reason = NULL; 1047 char *reason = NULL;
641 1048
@@ -662,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
662 * load header - check the image header and copy data from it 1069 * load header - check the image header and copy data from it
663 */ 1070 */
664 1071
665static int load_header(struct snapshot_handle *handle, 1072static int
666 struct swsusp_info *info) 1073load_header(struct swsusp_info *info)
667{ 1074{
668 int error; 1075 int error;
669 struct pbe *pblist;
670 1076
1077 restore_pblist = NULL;
671 error = check_header(info); 1078 error = check_header(info);
672 if (!error) { 1079 if (!error) {
673 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
674 if (!pblist)
675 return -ENOMEM;
676 pagedir_nosave = pblist;
677 handle->pbe = pblist;
678 nr_copy_pages = info->image_pages; 1080 nr_copy_pages = info->image_pages;
679 nr_meta_pages = info->pages - info->image_pages - 1; 1081 nr_meta_pages = info->pages - info->image_pages - 1;
680 } 1082 }
@@ -682,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
682} 1084}
683 1085
684/** 1086/**
685 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to 1087 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
686 * the PBEs in the list starting at @pbe 1088 * the corresponding bit in the memory bitmap @bm
687 */ 1089 */
688 1090
689static inline struct pbe *unpack_orig_addresses(unsigned long *buf, 1091static inline void
690 struct pbe *pbe) 1092unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
691{ 1093{
692 int j; 1094 int j;
693 1095
694 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 1096 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
695 pbe->orig_address = buf[j]; 1097 if (unlikely(buf[j] == BM_END_OF_MAP))
696 pbe = pbe->next; 1098 break;
1099
1100 memory_bm_set_bit(bm, buf[j]);
697 } 1101 }
698 return pbe;
699} 1102}
700 1103
701/** 1104/**
702 * prepare_image - use metadata contained in the PBE list 1105 * prepare_image - use the memory bitmap @bm to mark the pages that will
703 * pointed to by pagedir_nosave to mark the pages that will 1106 * be overwritten in the process of restoring the system memory state
704 * be overwritten in the process of restoring the system 1107 * from the suspend image ("unsafe" pages) and allocate memory for the
705 * memory state from the image ("unsafe" pages) and allocate 1108 * image.
706 * memory for the image
707 * 1109 *
708 * The idea is to allocate the PBE list first and then 1110 * The idea is to allocate a new memory bitmap first and then allocate
709 * allocate as many pages as it's needed for the image data, 1111 * as many pages as needed for the image data, but not to assign these
710 * but not to assign these pages to the PBEs initially. 1112 * pages to specific tasks initially. Instead, we just mark them as
711 * Instead, we just mark them as allocated and create a list 1113 * allocated and create a list of "safe" pages that will be used later.
712 * of "safe" which will be used later
713 */ 1114 */
714 1115
715struct safe_page { 1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
716 struct safe_page *next;
717 char padding[PAGE_SIZE - sizeof(void *)];
718};
719 1117
720static struct safe_page *safe_pages; 1118static struct linked_page *safe_pages_list;
721 1119
722static int prepare_image(struct snapshot_handle *handle) 1120static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
723{ 1122{
724 int error = 0; 1123 unsigned int nr_pages;
725 unsigned int nr_pages = nr_copy_pages; 1124 struct linked_page *sp_list, *lp;
726 struct pbe *p, *pblist = NULL; 1125 int error;
727 1126
728 p = pagedir_nosave; 1127 error = mark_unsafe_pages(bm);
729 error = mark_unsafe_pages(p); 1128 if (error)
730 if (!error) { 1129 goto Free;
731 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); 1130
732 if (pblist) 1131 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
733 copy_page_backup_list(pblist, p); 1132 if (error)
734 free_pagedir(p, 0); 1133 goto Free;
735 if (!pblist) 1134
1135 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP);
1137 /* Reserve some safe pages for potential later use.
1138 *
1139 * NOTE: This way we make sure there will be enough safe pages for the
1140 * chain_alloc() in get_buffer(). It is a bit wasteful, but
1141 * nr_copy_pages cannot be greater than 50% of the memory anyway.
1142 */
1143 sp_list = NULL;
 1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) {
736 error = -ENOMEM; 1150 error = -ENOMEM;
1151 goto Free;
1152 }
1153 lp->next = sp_list;
1154 sp_list = lp;
1155 nr_pages--;
737 } 1156 }
738 safe_pages = NULL; 1157 /* Preallocate memory for the image */
739 if (!error && nr_pages > unsafe_pages) { 1158 safe_pages_list = NULL;
740 nr_pages -= unsafe_pages; 1159 nr_pages = nr_copy_pages - allocated_unsafe_pages;
741 while (nr_pages--) { 1160 while (nr_pages > 0) {
742 struct safe_page *ptr; 1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
743 1162 if (!lp) {
744 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); 1163 error = -ENOMEM;
745 if (!ptr) { 1164 goto Free;
746 error = -ENOMEM; 1165 }
747 break; 1166 if (!PageNosaveFree(virt_to_page(lp))) {
748 } 1167 /* The page is "safe", add it to the list */
749 if (!PageNosaveFree(virt_to_page(ptr))) { 1168 lp->next = safe_pages_list;
750 /* The page is "safe", add it to the list */ 1169 safe_pages_list = lp;
751 ptr->next = safe_pages;
752 safe_pages = ptr;
753 }
754 /* Mark the page as allocated */
755 SetPageNosave(virt_to_page(ptr));
756 SetPageNosaveFree(virt_to_page(ptr));
757 } 1170 }
1171 /* Mark the page as allocated */
1172 SetPageNosave(virt_to_page(lp));
1173 SetPageNosaveFree(virt_to_page(lp));
1174 nr_pages--;
758 } 1175 }
759 if (!error) { 1176 /* Free the reserved safe pages so that chain_alloc() can use them */
760 pagedir_nosave = pblist; 1177 while (sp_list) {
761 } else { 1178 lp = sp_list->next;
762 handle->pbe = NULL; 1179 free_image_page(sp_list, PG_UNSAFE_CLEAR);
763 swsusp_free(); 1180 sp_list = lp;
764 } 1181 }
1182 return 0;
1183
1184Free:
1185 swsusp_free();
765 return error; 1186 return error;
766} 1187}
767 1188
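
To put numbers on the reservation in prepare_image(): assume 4 KiB pages and a 32-bit build where struct pbe is three words, so LINKED_PAGE_DATA_SIZE = PAGE_SIZE - sizeof(void *) = 4092 bytes and PBES_PER_LINKED_PAGE = 4092 / 12 = 341 (illustrative figures, not values fixed by the patch). For nr_copy_pages = 25000 with allocated_unsafe_pages = 1000, the first loop reserves DIV_ROUND_UP(24000, 341) = 71 safe pages for the future PBE chain, the second preallocates the 24000 frames for the image data, and the reserved pages are then freed again so that chain_alloc() can pick them up on demand.
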
768static void *get_buffer(struct snapshot_handle *handle) 1189/**
1190 * get_buffer - compute the address that snapshot_write_next() should
1191 * set for its caller to write to.
1192 */
1193
1194static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
769{ 1195{
770 struct pbe *pbe = handle->pbe, *last = handle->last_pbe; 1196 struct pbe *pbe;
771 struct page *page = virt_to_page(pbe->orig_address); 1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
772 1198
773 if (PageNosave(page) && PageNosaveFree(page)) { 1199 if (PageNosave(page) && PageNosaveFree(page))
774 /* 1200 /* We have allocated the "original" page frame and we can
775 * We have allocated the "original" page frame and we can 1201 * use it directly to store the loaded page.
776 * use it directly to store the read page
777 */ 1202 */
778 pbe->address = 0; 1203 return page_address(page);
779 if (last && last->next) 1204
780 last->next = NULL; 1205 /* The "original" page frame has not been allocated and we have to
781 return (void *)pbe->orig_address; 1206 * use a "safe" page frame to store the loaded page.
782 }
783 /*
784 * The "original" page frame has not been allocated and we have to
785 * use a "safe" page frame to store the read page
786 */ 1207 */
787 pbe->address = (unsigned long)safe_pages; 1208 pbe = chain_alloc(ca, sizeof(struct pbe));
788 safe_pages = safe_pages->next; 1209 if (!pbe) {
789 if (last) 1210 swsusp_free();
790 last->next = pbe; 1211 return NULL;
791 handle->last_pbe = pbe; 1212 }
1213 pbe->orig_address = (unsigned long)page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list;
1215 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist;
1217 restore_pblist = pbe;
792 return (void *)pbe->address; 1218 return (void *)pbe->address;
793} 1219}
794 1220
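
Each PBE that get_buffer() chains onto restore_pblist records one pending relocation: a loaded page parked in a "safe" frame plus the original address it must end up at. After the image is fully loaded, the architecture-specific resume code walks the list and performs the copies; in C terms the walk is roughly (a sketch, the real thing is usually done in assembler):

	struct pbe *pbe;

	for (pbe = restore_pblist; pbe; pbe = pbe->next)
		copy_page((void *)pbe->orig_address,	/* destination frame */
			  (void *)pbe->address);	/* "safe" staging copy */
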
@@ -816,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle)
816 1242
817int snapshot_write_next(struct snapshot_handle *handle, size_t count) 1243int snapshot_write_next(struct snapshot_handle *handle, size_t count)
818{ 1244{
1245 static struct chain_allocator ca;
819 int error = 0; 1246 int error = 0;
820 1247
821 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) 1248 /* Check if we have already loaded the entire image */
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
822 return 0; 1250 return 0;
1251
823 if (!buffer) { 1252 if (!buffer) {
824 /* This makes the buffer be freed by swsusp_free() */ 1253 /* This makes the buffer be freed by swsusp_free() */
825 buffer = alloc_image_page(GFP_ATOMIC, 0); 1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
826 if (!buffer) 1255 if (!buffer)
827 return -ENOMEM; 1256 return -ENOMEM;
828 } 1257 }
829 if (!handle->offset) 1258 if (!handle->offset)
830 handle->buffer = buffer; 1259 handle->buffer = buffer;
831 if (handle->prev < handle->page) { 1260 handle->sync_read = 1;
832 if (!handle->prev) { 1261 if (handle->prev < handle->cur) {
833 error = load_header(handle, (struct swsusp_info *)buffer); 1262 if (handle->prev == 0) {
1263 error = load_header(buffer);
834 if (error) 1264 if (error)
835 return error; 1265 return error;
1266
1267 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
1268 if (error)
1269 return error;
1270
836 } else if (handle->prev <= nr_meta_pages) { 1271 } else if (handle->prev <= nr_meta_pages) {
837 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 1272 unpack_orig_pfns(buffer, &copy_bm);
838 if (!handle->pbe) { 1273 if (handle->prev == nr_meta_pages) {
839 error = prepare_image(handle); 1274 error = prepare_image(&orig_bm, &copy_bm);
840 if (error) 1275 if (error)
841 return error; 1276 return error;
842 handle->pbe = pagedir_nosave; 1277
843 handle->last_pbe = NULL; 1278 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
844 handle->buffer = get_buffer(handle); 1279 memory_bm_position_reset(&orig_bm);
1280 restore_pblist = NULL;
1281 handle->buffer = get_buffer(&orig_bm, &ca);
1282 handle->sync_read = 0;
1283 if (!handle->buffer)
1284 return -ENOMEM;
845 } 1285 }
846 } else { 1286 } else {
847 handle->pbe = handle->pbe->next; 1287 handle->buffer = get_buffer(&orig_bm, &ca);
848 handle->buffer = get_buffer(handle); 1288 handle->sync_read = 0;
849 } 1289 }
850 handle->prev = handle->page; 1290 handle->prev = handle->cur;
851 } 1291 }
852 handle->buf_offset = handle->page_offset; 1292 handle->buf_offset = handle->cur_offset;
853 if (handle->page_offset + count >= PAGE_SIZE) { 1293 if (handle->cur_offset + count >= PAGE_SIZE) {
854 count = PAGE_SIZE - handle->page_offset; 1294 count = PAGE_SIZE - handle->cur_offset;
855 handle->page_offset = 0; 1295 handle->cur_offset = 0;
856 handle->page++; 1296 handle->cur++;
857 } else { 1297 } else {
858 handle->page_offset += count; 1298 handle->cur_offset += count;
859 } 1299 }
860 handle->offset += count; 1300 handle->offset += count;
861 return count; 1301 return count;
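
Both snapshot_read_next() and snapshot_write_next() share the same handle bookkeeping: cur counts whole image pages, cur_offset tracks the position inside the current page, and offset is the running byte total, so callers may pass arbitrary counts and the handle slices them at page boundaries. The page-at-a-time consumer used by the swap code reduces to the loop below, where read_one_page() is a hypothetical data source standing in for swap_read_page():

	int ret, error = 0;

	while ((ret = snapshot_write_next(&snapshot, PAGE_SIZE)) > 0) {
		error = read_one_page(data_of(snapshot));
		if (error)
			break;
	}
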
@@ -863,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
863 1303
864int snapshot_image_loaded(struct snapshot_handle *handle) 1304int snapshot_image_loaded(struct snapshot_handle *handle)
865{ 1305{
866 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || 1306 return !(!nr_copy_pages ||
867 handle->page <= nr_meta_pages + nr_copy_pages); 1307 handle->cur <= nr_meta_pages + nr_copy_pages);
1308}
1309
1310void snapshot_free_unused_memory(struct snapshot_handle *handle)
1311{
1312 /* Free only if we have loaded the image entirely */
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
868} 1315}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1dd146bd64d..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/bio.h> 24#include <linux/bio.h>
25#include <linux/blkdev.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/swapops.h> 27#include <linux/swapops.h>
27#include <linux/pm.h> 28#include <linux/pm.h>
@@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start)
49{ 50{
50 int error; 51 int error;
51 52
52 rw_swap_page_sync(READ, 53 rw_swap_page_sync(READ, swp_entry(root_swap, 0),
53 swp_entry(root_swap, 0), 54 virt_to_page((unsigned long)&swsusp_header), NULL);
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, 60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
61 swp_entry(root_swap, 0), 61 virt_to_page((unsigned long)&swsusp_header),
62 virt_to_page((unsigned long) 62 NULL);
63 &swsusp_header));
64 } else { 63 } else {
65 pr_debug("swsusp: Partition is not swap space.\n"); 64 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV; 65 error = -ENODEV;
@@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
88 * write_page - Write one page to given swap location. 87 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing. 88 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to. 89 * @offset: Offset of the swap page we're writing to.
90 * @bio_chain: Link the next write BIO here
91 */ 91 */
92 92
93static int write_page(void *buf, unsigned long offset) 93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
94{ 94{
95 swp_entry_t entry; 95 swp_entry_t entry;
96 int error = -ENOSPC; 96 int error = -ENOSPC;
97 97
98 if (offset) { 98 if (offset) {
99 struct page *page = virt_to_page(buf);
100
101 if (bio_chain) {
102 /*
103 * Whether or not we successfully allocated a copy page,
104 * we take a ref on the page here. It gets undone in
105 * wait_on_bio_chain().
106 */
107 struct page *page_copy;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 }
99 entry = swp_entry(root_swap, offset); 119 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); 120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
101 } 121 }
102 return error; 122 return error;
103} 123}
@@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
146 handle->bitmap = NULL; 166 handle->bitmap = NULL;
147} 167}
148 168
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
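
To make show_speed()'s arithmetic concrete: writing 25000 pages of 4 KiB gives k = 25000 * 4 = 100000 kbytes; if that takes 20 seconds, elapsed_centisecs64 is 2000, kps = (100000 * 100) / 2000 = 5000, and the message reads "Wrote 100000 kbytes in 20.00 seconds (5.00 MB/s)". The floor of one centisecond only guards against dividing by zero; transfers shorter than 10 ms simply report an inflated rate.
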
188
149static int get_swap_writer(struct swap_map_handle *handle) 189static int get_swap_writer(struct swap_map_handle *handle)
150{ 190{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle)
165 return 0; 205 return 0;
166} 206}
167 207
168static int swap_write_page(struct swap_map_handle *handle, void *buf) 208static int wait_on_bio_chain(struct bio **bio_chain)
169{ 209{
170 int error; 210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain)
238{
239 int error = 0;
171 unsigned long offset; 240 unsigned long offset;
172 241
173 if (!handle->cur) 242 if (!handle->cur)
174 return -EINVAL; 243 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap); 244 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset); 245 error = write_page(buf, offset, bio_chain);
177 if (error) 246 if (error)
178 return error; 247 return error;
179 handle->cur->entries[handle->k++] = offset; 248 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) { 249 if (handle->k >= MAP_PAGE_ENTRIES) {
250 error = wait_on_bio_chain(bio_chain);
251 if (error)
252 goto out;
181 offset = alloc_swap_page(root_swap, handle->bitmap); 253 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset) 254 if (!offset)
183 return -ENOSPC; 255 return -ENOSPC;
184 handle->cur->next_swap = offset; 256 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap); 257 error = write_page(handle->cur, handle->cur_swap, NULL);
186 if (error) 258 if (error)
187 return error; 259 goto out;
188 memset(handle->cur, 0, PAGE_SIZE); 260 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset; 261 handle->cur_swap = offset;
190 handle->k = 0; 262 handle->k = 0;
191 } 263 }
192 return 0; 264out:
265 return error;
193} 266}
194 267
195static int flush_swap_writer(struct swap_map_handle *handle) 268static int flush_swap_writer(struct swap_map_handle *handle)
196{ 269{
197 if (handle->cur && handle->cur_swap) 270 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap); 271 return write_page(handle->cur, handle->cur_swap, NULL);
199 else 272 else
200 return -EINVAL; 273 return -EINVAL;
201} 274}
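
For reference, the on-disk index that swap_write_page() fills is a singly linked list of map pages: each one holds MAP_PAGE_ENTRIES swap offsets plus the offset of the next map page. The declaration lives in kernel/power/power.h and, at this point in the series, should look roughly like this (a sketch inferred from the surrounding code, not text from this patch):

#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)

struct swap_map_page {
	unsigned long	entries[MAP_PAGE_ENTRIES];
	unsigned long	next_swap;
};
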
@@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle)
206 279
207static int save_image(struct swap_map_handle *handle, 280static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot, 281 struct snapshot_handle *snapshot,
209 unsigned int nr_pages) 282 unsigned int nr_to_write)
210{ 283{
211 unsigned int m; 284 unsigned int m;
212 int ret; 285 int ret;
213 int error = 0; 286 int error = 0;
287 int nr_pages;
288 int err2;
289 struct bio *bio;
290 struct timeval start;
291 struct timeval stop;
214 292
215 printk("Saving image data pages (%u pages) ... ", nr_pages); 293 printk("Saving image data pages (%u pages) ... ", nr_to_write);
216 m = nr_pages / 100; 294 m = nr_to_write / 100;
217 if (!m) 295 if (!m)
218 m = 1; 296 m = 1;
219 nr_pages = 0; 297 nr_pages = 0;
298 bio = NULL;
299 do_gettimeofday(&start);
220 do { 300 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE); 301 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) { 302 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot)); 303 error = swap_write_page(handle, data_of(*snapshot),
304 &bio);
224 if (error) 305 if (error)
225 break; 306 break;
226 if (!(nr_pages % m)) 307 if (!(nr_pages % m))
@@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle,
228 nr_pages++; 309 nr_pages++;
229 } 310 }
230 } while (ret > 0); 311 } while (ret > 0);
312 err2 = wait_on_bio_chain(&bio);
313 do_gettimeofday(&stop);
314 if (!error)
315 error = err2;
231 if (!error) 316 if (!error)
232 printk("\b\b\b\bdone\n"); 317 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote");
233 return error; 319 return error;
234} 320}
235 321
@@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
245 unsigned int free_swap = count_swap_pages(root_swap, 1); 331 unsigned int free_swap = count_swap_pages(root_swap, 1);
246 332
247 pr_debug("swsusp: free swap pages: %u\n", free_swap); 333 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO + 334 return free_swap > nr_pages + PAGES_FOR_IO;
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250} 335}
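
The term dropped from enough_swap() is the swap needed for the old on-disk PBE metadata, ceil(nr_pages / PBES_PER_PAGE) pages. The pfns now travel in the meta pages already counted in the image header, so the estimate simplifies to nr_pages + PAGES_FOR_IO; for a 25000-page image with PBES_PER_PAGE = 341 (an illustrative 32-bit figure) that is 74 pages of PBE metadata that no longer need to be reserved up front.
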
251 336
252/** 337/**
@@ -266,7 +351,8 @@ int swsusp_write(void)
266 int error; 351 int error;
267 352
268 if ((error = swsusp_swap_check())) { 353 if ((error = swsusp_swap_check())) {
269 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); 354 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n");
270 return error; 356 return error;
271 } 357 }
272 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 358 memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -281,7 +367,7 @@ int swsusp_write(void)
281 error = get_swap_writer(&handle); 367 error = get_swap_writer(&handle);
282 if (!error) { 368 if (!error) {
283 unsigned long start = handle.cur_swap; 369 unsigned long start = handle.cur_swap;
284 error = swap_write_page(&handle, header); 370 error = swap_write_page(&handle, header, NULL);
285 if (!error) 371 if (!error)
286 error = save_image(&handle, &snapshot, 372 error = save_image(&handle, &snapshot,
287 header->pages - 1); 373 header->pages - 1);
@@ -298,27 +384,6 @@ int swsusp_write(void)
298 return error; 384 return error;
299} 385}
300 386
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
304 * but, it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
315 printk(KERN_ERR "I/O error reading swsusp image.\n");
316 return -EIO;
317 }
318 atomic_set(&io_done, 0);
319 return 0;
320}
321
322static struct block_device *resume_bdev; 387static struct block_device *resume_bdev;
323 388
324/** 389/**
@@ -326,15 +391,15 @@ static struct block_device *resume_bdev;
326 * @rw: READ or WRITE. 391 * @rw: READ or WRITE.
327 * @off physical offset of page. 392 * @off physical offset of page.
328 * @page: page we're reading or writing. 393 * @page: page we're reading or writing.
 394 * @bio_chain: list of pending bios (for async reading)
329 * 395 *
330 * Straight from the textbook - allocate and initialize the bio. 396 * Straight from the textbook - allocate and initialize the bio.
331 * If we're writing, make sure the page is marked as dirty. 397 * If we're reading, make sure the page is marked as dirty.
332 * Then submit it and wait. 398 * Then submit it and, if @bio_chain == NULL, wait.
333 */ 399 */
334 400static int submit(int rw, pgoff_t page_off, struct page *page,
335static int submit(int rw, pgoff_t page_off, void *page) 401 struct bio **bio_chain)
336{ 402{
337 int error = 0;
338 struct bio *bio; 403 struct bio *bio;
339 404
340 bio = bio_alloc(GFP_ATOMIC, 1); 405 bio = bio_alloc(GFP_ATOMIC, 1);
@@ -342,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
342 return -ENOMEM; 407 return -ENOMEM;
343 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
344 bio->bi_bdev = resume_bdev; 409 bio->bi_bdev = resume_bdev;
345 bio->bi_end_io = end_io; 410 bio->bi_end_io = end_swap_bio_read;
346 411
347 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { 412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
348 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); 413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
349 error = -EFAULT; 414 bio_put(bio);
350 goto Done; 415 return -EFAULT;
351 } 416 }
352 417
353 atomic_set(&io_done, 1); 418 lock_page(page);
354 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 419 bio_get(bio);
355 while (atomic_read(&io_done)) 420
356 yield(); 421 if (bio_chain == NULL) {
357 if (rw == READ) 422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
358 bio_set_pages_dirty(bio); 423 wait_on_page_locked(page);
359 Done: 424 if (rw == READ)
360 bio_put(bio); 425 bio_set_pages_dirty(bio);
361 return error; 426 bio_put(bio);
427 } else {
428 get_page(page);
429 bio->bi_private = *bio_chain;
430 *bio_chain = bio;
431 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
432 }
433 return 0;
362} 434}
363 435
364static int bio_read_page(pgoff_t page_off, void *page) 436static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
365{ 437{
366 return submit(READ, page_off, page); 438 return submit(READ, page_off, virt_to_page(addr), bio_chain);
367} 439}
368 440
369static int bio_write_page(pgoff_t page_off, void *page) 441static int bio_write_page(pgoff_t page_off, void *addr)
370{ 442{
371 return submit(WRITE, page_off, page); 443 return submit(WRITE, page_off, virt_to_page(addr), NULL);
372} 444}
373 445
374/** 446/**
@@ -393,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
393 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 465 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
394 if (!handle->cur) 466 if (!handle->cur)
395 return -ENOMEM; 467 return -ENOMEM;
396 error = bio_read_page(swp_offset(start), handle->cur); 468 error = bio_read_page(swp_offset(start), handle->cur, NULL);
397 if (error) { 469 if (error) {
398 release_swap_reader(handle); 470 release_swap_reader(handle);
399 return error; 471 return error;
@@ -402,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
402 return 0; 474 return 0;
403} 475}
404 476
405static int swap_read_page(struct swap_map_handle *handle, void *buf) 477static int swap_read_page(struct swap_map_handle *handle, void *buf,
478 struct bio **bio_chain)
406{ 479{
407 unsigned long offset; 480 unsigned long offset;
408 int error; 481 int error;
@@ -412,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
412 offset = handle->cur->entries[handle->k]; 485 offset = handle->cur->entries[handle->k];
413 if (!offset) 486 if (!offset)
414 return -EFAULT; 487 return -EFAULT;
415 error = bio_read_page(offset, buf); 488 error = bio_read_page(offset, buf, bio_chain);
416 if (error) 489 if (error)
417 return error; 490 return error;
418 if (++handle->k >= MAP_PAGE_ENTRIES) { 491 if (++handle->k >= MAP_PAGE_ENTRIES) {
492 error = wait_on_bio_chain(bio_chain);
419 handle->k = 0; 493 handle->k = 0;
420 offset = handle->cur->next_swap; 494 offset = handle->cur->next_swap;
421 if (!offset) 495 if (!offset)
422 release_swap_reader(handle); 496 release_swap_reader(handle);
423 else 497 else if (!error)
424 error = bio_read_page(offset, handle->cur); 498 error = bio_read_page(offset, handle->cur, NULL);
425 } 499 }
426 return error; 500 return error;
427} 501}
@@ -434,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
434 508
435static int load_image(struct swap_map_handle *handle, 509static int load_image(struct swap_map_handle *handle,
436 struct snapshot_handle *snapshot, 510 struct snapshot_handle *snapshot,
437 unsigned int nr_pages) 511 unsigned int nr_to_read)
438{ 512{
439 unsigned int m; 513 unsigned int m;
440 int ret;
441 int error = 0; 514 int error = 0;
515 struct timeval start;
516 struct timeval stop;
517 struct bio *bio;
518 int err2;
519 unsigned nr_pages;
442 520
443 printk("Loading image data pages (%u pages) ... ", nr_pages); 521 printk("Loading image data pages (%u pages) ... ", nr_to_read);
444 m = nr_pages / 100; 522 m = nr_to_read / 100;
445 if (!m) 523 if (!m)
446 m = 1; 524 m = 1;
447 nr_pages = 0; 525 nr_pages = 0;
448 do { 526 bio = NULL;
449 ret = snapshot_write_next(snapshot, PAGE_SIZE); 527 do_gettimeofday(&start);
450 if (ret > 0) { 528 for ( ; ; ) {
451 error = swap_read_page(handle, data_of(*snapshot)); 529 error = snapshot_write_next(snapshot, PAGE_SIZE);
452 if (error) 530 if (error <= 0)
453 break; 531 break;
454 if (!(nr_pages % m)) 532 error = swap_read_page(handle, data_of(*snapshot), &bio);
455 printk("\b\b\b\b%3d%%", nr_pages / m); 533 if (error)
456 nr_pages++; 534 break;
457 } 535 if (snapshot->sync_read)
458 } while (ret > 0); 536 error = wait_on_bio_chain(&bio);
537 if (error)
538 break;
539 if (!(nr_pages % m))
540 printk("\b\b\b\b%3d%%", nr_pages / m);
541 nr_pages++;
542 }
543 err2 = wait_on_bio_chain(&bio);
544 do_gettimeofday(&stop);
545 if (!error)
546 error = err2;
459 if (!error) { 547 if (!error) {
460 printk("\b\b\b\bdone\n"); 548 printk("\b\b\b\bdone\n");
549 snapshot_free_unused_memory(snapshot);
461 if (!snapshot_image_loaded(snapshot)) 550 if (!snapshot_image_loaded(snapshot))
462 error = -ENODATA; 551 error = -ENODATA;
463 } 552 }
553 show_speed(&start, &stop, nr_to_read, "Read");
464 return error; 554 return error;
465} 555}
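
The sync_read flag is the handshake that makes this loop safe: snapshot_write_next() leaves it set whenever the page just handed out is consumed internally on the very next call (the header feeding load_header() and the pfn pages feeding unpack_orig_pfns()), forcing load_image() to drain the BIO chain first. Plain data pages are only touched at resume time, so their reads may stay in flight until the next map-page boundary or the final wait_on_bio_chain().
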
466 556
@@ -483,7 +573,7 @@ int swsusp_read(void)
483 header = (struct swsusp_info *)data_of(snapshot); 573 header = (struct swsusp_info *)data_of(snapshot);
484 error = get_swap_reader(&handle, swsusp_header.image); 574 error = get_swap_reader(&handle, swsusp_header.image);
485 if (!error) 575 if (!error)
486 error = swap_read_page(&handle, header); 576 error = swap_read_page(&handle, header, NULL);
487 if (!error) 577 if (!error)
488 error = load_image(&handle, &snapshot, header->pages - 1); 578 error = load_image(&handle, &snapshot, header->pages - 1);
489 release_swap_reader(&handle); 579 release_swap_reader(&handle);
@@ -509,7 +599,7 @@ int swsusp_check(void)
509 if (!IS_ERR(resume_bdev)) { 599 if (!IS_ERR(resume_bdev)) {
510 set_blocksize(resume_bdev, PAGE_SIZE); 600 set_blocksize(resume_bdev, PAGE_SIZE);
511 memset(&swsusp_header, 0, sizeof(swsusp_header)); 601 memset(&swsusp_header, 0, sizeof(swsusp_header));
512 if ((error = bio_read_page(0, &swsusp_header))) 602 if ((error = bio_read_page(0, &swsusp_header, NULL)))
513 return error; 603 return error;
514 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 604 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
515 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 605 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 17f669c83012..0b66659dc516 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -193,14 +193,13 @@ int swsusp_shrink_memory(void)
193 printk("Shrinking memory... "); 193 printk("Shrinking memory... ");
194 do { 194 do {
195 size = 2 * count_highmem_pages(); 195 size = 2 * count_highmem_pages();
196 size += size / 50 + count_data_pages(); 196 size += size / 50 + count_data_pages() + PAGES_FOR_IO;
197 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
198 PAGES_FOR_IO;
199 tmp = size; 197 tmp = size;
200 for_each_zone (zone) 198 for_each_zone (zone)
201 if (!is_highmem(zone) && populated_zone(zone)) { 199 if (!is_highmem(zone) && populated_zone(zone)) {
202 tmp -= zone->free_pages; 200 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 201 tmp += zone->lowmem_reserve[ZONE_NORMAL];
202 tmp += snapshot_additional_pages(zone);
204 } 203 }
205 if (tmp > 0) { 204 if (tmp > 0) {
206 tmp = __shrink_memory(tmp); 205 tmp = __shrink_memory(tmp);
@@ -248,6 +247,9 @@ int swsusp_suspend(void)
248 restore_processor_state(); 247 restore_processor_state();
249Restore_highmem: 248Restore_highmem:
250 restore_highmem(); 249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup.
252 */
251 device_power_up(); 253 device_power_up();
252Enable_irqs: 254Enable_irqs:
253 local_irq_enable(); 255 local_irq_enable();
@@ -257,8 +259,12 @@ Enable_irqs:
257int swsusp_resume(void) 259int swsusp_resume(void)
258{ 260{
259 int error; 261 int error;
262
260 local_irq_disable(); 263 local_irq_disable();
261 if (device_power_down(PMSG_FREEZE)) 264 /* NOTE: device_power_down() is just a suspend() with irqs off;
265 * it has no special "power things down" semantics
266 */
267 if (device_power_down(PMSG_PRETHAW))
262 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 268 printk(KERN_ERR "Some devices failed to power down, very bad\n");
263 /* We'll ignore saved state, but this gets preempt count (etc) right */ 269 /* We'll ignore saved state, but this gets preempt count (etc) right */
264 save_processor_state(); 270 save_processor_state();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..72825c853cd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/cpu.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
139 if (data->frozen) 140 if (data->frozen)
140 break; 141 break;
141 down(&pm_sem); 142 down(&pm_sem);
142 disable_nonboot_cpus(); 143 error = disable_nonboot_cpus();
143 if (freeze_processes()) { 144 if (!error) {
144 thaw_processes(); 145 error = freeze_processes();
145 enable_nonboot_cpus(); 146 if (error) {
146 error = -EBUSY; 147 thaw_processes();
148 error = -EBUSY;
149 }
147 } 150 }
151 enable_nonboot_cpus();
148 up(&pm_sem); 152 up(&pm_sem);
149 if (!error) 153 if (!error)
150 data->frozen = 1; 154 data->frozen = 1;
@@ -189,9 +193,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
189 error = -EPERM; 193 error = -EPERM;
190 break; 194 break;
191 } 195 }
196 snapshot_free_unused_memory(&data->handle);
192 down(&pm_sem); 197 down(&pm_sem);
193 pm_prepare_console(); 198 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE); 199 error = device_suspend(PMSG_PRETHAW);
195 if (!error) { 200 if (!error) {
196 error = swsusp_resume(); 201 error = swsusp_resume();
197 device_resume(); 202 device_resume();
diff --git a/kernel/printk.c b/kernel/printk.c
index 1149365e989e..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
721 return 0; 721 return 0;
722} 722}
723 723
724#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
724/** 725/**
725 * suspend_console - suspend the console subsystem 726 * suspend_console - suspend the console subsystem
726 * 727 *
@@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
728 */ 729 */
729void suspend_console(void) 730void suspend_console(void)
730{ 731{
732 printk("Suspending console(s)\n");
731 acquire_console_sem(); 733 acquire_console_sem();
732 console_suspended = 1; 734 console_suspended = 1;
733} 735}
@@ -737,6 +739,7 @@ void resume_console(void)
737 console_suspended = 0; 739 console_suspended = 0;
738 release_console_sem(); 740 release_console_sem();
739} 741}
742#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
740 743
741/** 744/**
742 * acquire_console_sem - lock the console system for exclusive use. 745 * acquire_console_sem - lock the console system for exclusive use.
diff --git a/kernel/profile.c b/kernel/profile.c
index d5bd75e7501c..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
309 node = cpu_to_node(cpu); 309 node = cpu_to_node(cpu);
310 per_cpu(cpu_profile_flip, cpu) = 0; 310 per_cpu(cpu_profile_flip, cpu) = 0;
311 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 311 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
312 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 312 page = alloc_pages_node(node,
313 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
314 0);
313 if (!page) 315 if (!page)
314 return NOTIFY_BAD; 316 return NOTIFY_BAD;
315 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 317 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
316 } 318 }
317 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 319 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
318 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 320 page = alloc_pages_node(node,
321 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
322 0);
319 if (!page) 323 if (!page)
320 goto out_free; 324 goto out_free;
321 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 325 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -491,12 +495,16 @@ static int __init create_hash_tables(void)
491 int node = cpu_to_node(cpu); 495 int node = cpu_to_node(cpu);
492 struct page *page; 496 struct page *page;
493 497
494 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 498 page = alloc_pages_node(node,
499 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
500 0);
495 if (!page) 501 if (!page)
496 goto out_cleanup; 502 goto out_cleanup;
497 per_cpu(cpu_profile_hits, cpu)[1] 503 per_cpu(cpu_profile_hits, cpu)[1]
498 = (struct profile_hit *)page_address(page); 504 = (struct profile_hit *)page_address(page);
499 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 505 page = alloc_pages_node(node,
506 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
507 0);
500 if (!page) 508 if (!page)
501 goto out_cleanup; 509 goto out_cleanup;
502 per_cpu(cpu_profile_hits, cpu)[0] 510 per_cpu(cpu_profile_hits, cpu)[0]
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9a111f70145c..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
241 return 0; 241 return 0;
242} 242}
243 243
244/*
245 * Access another process' address space.
246 * Source/target buffer must be kernel space,
247 * Do not walk the page table directly, use get_user_pages
248 */
249
250int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
251{
252 struct mm_struct *mm;
253 struct vm_area_struct *vma;
254 struct page *page;
255 void *old_buf = buf;
256
257 mm = get_task_mm(tsk);
258 if (!mm)
259 return 0;
260
261 down_read(&mm->mmap_sem);
262	/* ignore errors, just check how much was successfully transferred */
263 while (len) {
264 int bytes, ret, offset;
265 void *maddr;
266
267 ret = get_user_pages(tsk, mm, addr, 1,
268 write, 1, &page, &vma);
269 if (ret <= 0)
270 break;
271
272 bytes = len;
273 offset = addr & (PAGE_SIZE-1);
274 if (bytes > PAGE_SIZE-offset)
275 bytes = PAGE_SIZE-offset;
276
277 maddr = kmap(page);
278 if (write) {
279 copy_to_user_page(vma, page, addr,
280 maddr + offset, buf, bytes);
281 set_page_dirty_lock(page);
282 } else {
283 copy_from_user_page(vma, page, addr,
284 buf, maddr + offset, bytes);
285 }
286 kunmap(page);
287 page_cache_release(page);
288 len -= bytes;
289 buf += bytes;
290 addr += bytes;
291 }
292 up_read(&mm->mmap_sem);
293 mmput(mm);
294
295 return buf - old_buf;
296}
297
298int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 244int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
299{ 245{
300 int copied = 0; 246 int copied = 0;
@@ -494,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
494 child = find_task_by_pid(pid); 440 child = find_task_by_pid(pid);
495 if (child) 441 if (child)
496 get_task_struct(child); 442 get_task_struct(child);
443
497 read_unlock(&tasklist_lock); 444 read_unlock(&tasklist_lock);
498 if (!child) 445 if (!child)
499 return ERR_PTR(-ESRCH); 446 return ERR_PTR(-ESRCH);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4d1c3d247127..4f2c4272d59c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,13 +192,13 @@ static struct rcu_torture_ops *cur_ops = NULL;
192 * Definitions for rcu torture testing. 192 * Definitions for rcu torture testing.
193 */ 193 */
194 194
195static int rcu_torture_read_lock(void) 195static int rcu_torture_read_lock(void) __acquires(RCU)
196{ 196{
197 rcu_read_lock(); 197 rcu_read_lock();
198 return 0; 198 return 0;
199} 199}
200 200
201static void rcu_torture_read_unlock(int idx) 201static void rcu_torture_read_unlock(int idx) __releases(RCU)
202{ 202{
203 rcu_read_unlock(); 203 rcu_read_unlock();
204} 204}
@@ -250,13 +250,13 @@ static struct rcu_torture_ops rcu_ops = {
250 * Definitions for rcu_bh torture testing. 250 * Definitions for rcu_bh torture testing.
251 */ 251 */
252 252
253static int rcu_bh_torture_read_lock(void) 253static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
254{ 254{
255 rcu_read_lock_bh(); 255 rcu_read_lock_bh();
256 return 0; 256 return 0;
257} 257}
258 258
259static void rcu_bh_torture_read_unlock(int idx) 259static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
260{ 260{
261 rcu_read_unlock_bh(); 261 rcu_read_unlock_bh();
262} 262}
diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..1d63ecddfa70 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
95 * @buf: the buffer struct 95 * @buf: the buffer struct
96 * @size: total size of the buffer 96 * @size: total size of the buffer
97 * 97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The 98 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
99 * passed in size will get page aligned, if it isn't already. 99 * passed in size will get page aligned, if it isn't already.
100 */ 100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) 101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
132 132
133/** 133/**
134 * relay_create_buf - allocate and initialize a channel buffer 134 * relay_create_buf - allocate and initialize a channel buffer
135 * @alloc_size: size of the buffer to allocate 135 * @chan: the relay channel
136 * @n_subbufs: number of sub-buffers in the channel
137 * 136 *
138 * Returns channel buffer if successful, NULL otherwise 137 * Returns channel buffer if successful, %NULL otherwise.
139 */ 138 */
140struct rchan_buf *relay_create_buf(struct rchan *chan) 139struct rchan_buf *relay_create_buf(struct rchan *chan)
141{ 140{
@@ -163,6 +162,7 @@ free_buf:
163 162
164/** 163/**
165 * relay_destroy_channel - free the channel struct 164 * relay_destroy_channel - free the channel struct
165 * @kref: target kernel reference that contains the relay channel
166 * 166 *
167 * Should only be called from kref_put(). 167 * Should only be called from kref_put().
168 */ 168 */
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
194 194
195/** 195/**
196 * relay_remove_buf - remove a channel buffer 196 * relay_remove_buf - remove a channel buffer
197 * @kref: target kernel reference that contains the relay buffer
197 * 198 *
198 * Removes the file from the filesystem, which also frees the 199 * Removes the file from the filesystem, which also frees the
199 * rchan_buf_struct and the channel buffer. Should only be called from 200 * rchan_buf_struct and the channel buffer. Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
374} 375}
375EXPORT_SYMBOL_GPL(relay_reset); 376EXPORT_SYMBOL_GPL(relay_reset);
376 377
377/** 378/*
378 * relay_open_buf - create a new relay channel buffer 379 * relay_open_buf - create a new relay channel buffer
379 * 380 *
380 * Internal - used by relay_open(). 381 * Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
448/** 449/**
449 * relay_open - create a new relay channel 450 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create 451 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory 452 * @parent: dentry of parent directory, %NULL for root directory
452 * @subbuf_size: size of sub-buffers 453 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers 454 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions 455 * @cb: client callback functions
455 * 456 *
456 * Returns channel pointer if successful, NULL otherwise. 457 * Returns channel pointer if successful, %NULL otherwise.
457 * 458 *
458 * Creates a channel buffer for each cpu using the sizes and 459 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files 460 * attributes specified. The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
585 * subbufs_consumed should be the number of sub-buffers newly consumed, 586 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed. 587 * not the total consumed.
587 * 588 *
588 * NOTE: kernel clients don't need to call this function if the channel 589 * NOTE: Kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'. 590 * mode is 'overwrite'.
590 */ 591 */
591void relay_subbufs_consumed(struct rchan *chan, 592void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
641 * relay_flush - close the channel 642 * relay_flush - close the channel
642 * @chan: the channel 643 * @chan: the channel
643 * 644 *
644 * Flushes all channel buffers i.e. forces buffer switch. 645 * Flushes all channel buffers, i.e. forces buffer switch.
645 */ 646 */
646void relay_flush(struct rchan *chan) 647void relay_flush(struct rchan *chan)
647{ 648{
@@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
669 */ 670 */
670static int relay_file_open(struct inode *inode, struct file *filp) 671static int relay_file_open(struct inode *inode, struct file *filp)
671{ 672{
672 struct rchan_buf *buf = inode->u.generic_ip; 673 struct rchan_buf *buf = inode->i_private;
673 kref_get(&buf->kref); 674 kref_get(&buf->kref);
674 filp->private_data = buf; 675 filp->private_data = buf;
675 676
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
729 return 0; 730 return 0;
730} 731}
731 732
732/** 733/*
733 * relay_file_read_consume - update the consumed count for the buffer 734 * relay_file_read_consume - update the consumed count for the buffer
734 */ 735 */
735static void relay_file_read_consume(struct rchan_buf *buf, 736static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
756 } 757 }
757} 758}
758 759
759/** 760/*
760 * relay_file_read_avail - boolean, are there unconsumed bytes available? 761 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */ 762 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) 763static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
793 794
794/** 795/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer 796 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
797 * @read_pos: file read position
798 * @buf: relay channel buffer
796 */ 799 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos, 800static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf) 801 struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
818 821
819/** 822/**
820 * relay_file_read_start_pos - find the first available byte to read 823 * relay_file_read_start_pos - find the first available byte to read
824 * @read_pos: file read position
825 * @buf: relay channel buffer
821 * 826 *
822 * If the read_pos is in the middle of padding, return the 827 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise 828 * position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
844 849
845/** 850/**
846 * relay_file_read_end_pos - return the new read position 851 * relay_file_read_end_pos - return the new read position
852 * @read_pos: file read position
853 * @buf: relay channel buffer
854 * @count: number of bytes to be read
847 */ 855 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf, 856static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos, 857 size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
865 return end_pos; 873 return end_pos;
866} 874}
867 875
868/** 876/*
869 * subbuf_read_actor - read up to one subbuf's worth of data 877 * subbuf_read_actor - read up to one subbuf's worth of data
870 */ 878 */
871static int subbuf_read_actor(size_t read_start, 879static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
890 return ret; 898 return ret;
891} 899}
892 900
893/** 901/*
894 * subbuf_send_actor - send up to one subbuf's worth of data 902 * subbuf_send_actor - send up to one subbuf's worth of data
895 */ 903 */
896static int subbuf_send_actor(size_t read_start, 904static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
933 read_descriptor_t *desc, 941 read_descriptor_t *desc,
934 read_actor_t actor); 942 read_actor_t actor);
935 943
936/** 944/*
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 945 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */ 946 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp, 947static inline ssize_t relay_file_read_subbufs(struct file *filp,
diff --git a/kernel/resource.c b/kernel/resource.c
index 46286434af80..9db38a1a7520 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -344,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource);
344 * 344 *
345 * Returns 0 on success, -EBUSY if the resource can't be inserted. 345 * Returns 0 on success, -EBUSY if the resource can't be inserted.
346 * 346 *
347 * This function is equivalent of request_resource when no conflict 347 * This function is equivalent to request_resource when no conflict
348 * happens. If a conflict happens, and the conflicting resources 348 * happens. If a conflict happens, and the conflicting resources
349 * entirely fit within the range of the new resource, then the new 349 * entirely fit within the range of the new resource, then the new
350 * resource is inserted and the conflicting resources become childs of 350 * resource is inserted and the conflicting resources become children of
351 * the new resource. Otherwise the new resource becomes the child of 351 * the new resource.
352 * the conflicting resource
353 */ 352 */
354int insert_resource(struct resource *parent, struct resource *new) 353int insert_resource(struct resource *parent, struct resource *new)
355{ 354{
@@ -357,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new)
357 struct resource *first, *next; 356 struct resource *first, *next;
358 357
359 write_lock(&resource_lock); 358 write_lock(&resource_lock);
360 begin:
361 result = 0;
362 first = __request_resource(parent, new);
363 if (!first)
364 goto out;
365 359
366 result = -EBUSY; 360 for (;; parent = first) {
367 if (first == parent) 361 result = 0;
368 goto out; 362 first = __request_resource(parent, new);
363 if (!first)
364 goto out;
369 365
370 /* Resource fully contained by the clashing resource? Recurse into it */ 366 result = -EBUSY;
371 if (first->start <= new->start && first->end >= new->end) { 367 if (first == parent)
372 parent = first; 368 goto out;
373 goto begin; 369
370 if ((first->start > new->start) || (first->end < new->end))
371 break;
372 if ((first->start == new->start) && (first->end == new->end))
373 break;
374 } 374 }
375 375
376 for (next = first; ; next = next->sibling) { 376 for (next = first; ; next = next->sibling) {
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3e13a1e5856f..4ab17da46fd8 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
251 251
252 /* Grab the next task */ 252 /* Grab the next task */
253 task = rt_mutex_owner(lock); 253 task = rt_mutex_owner(lock);
254 get_task_struct(task);
254 spin_lock_irqsave(&task->pi_lock, flags); 255 spin_lock_irqsave(&task->pi_lock, flags);
255 256
256 if (waiter == rt_mutex_top_waiter(lock)) { 257 if (waiter == rt_mutex_top_waiter(lock)) {
@@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
269 __rt_mutex_adjust_prio(task); 270 __rt_mutex_adjust_prio(task);
270 } 271 }
271 272
272 get_task_struct(task);
273 spin_unlock_irqrestore(&task->pi_lock, flags); 273 spin_unlock_irqrestore(&task->pi_lock, flags);
274 274
275 top_waiter = rt_mutex_top_waiter(lock); 275 top_waiter = rt_mutex_top_waiter(lock);
@@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
409 struct task_struct *owner = rt_mutex_owner(lock); 409 struct task_struct *owner = rt_mutex_owner(lock);
410 struct rt_mutex_waiter *top_waiter = waiter; 410 struct rt_mutex_waiter *top_waiter = waiter;
411 unsigned long flags; 411 unsigned long flags;
412 int boost = 0, res; 412 int chain_walk = 0, res;
413 413
414 spin_lock_irqsave(&current->pi_lock, flags); 414 spin_lock_irqsave(&current->pi_lock, flags);
415 __rt_mutex_adjust_prio(current); 415 __rt_mutex_adjust_prio(current);
@@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
433 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 433 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
434 434
435 __rt_mutex_adjust_prio(owner); 435 __rt_mutex_adjust_prio(owner);
436 if (owner->pi_blocked_on) { 436 if (owner->pi_blocked_on)
437 boost = 1; 437 chain_walk = 1;
438 /* gets dropped in rt_mutex_adjust_prio_chain()! */
439 get_task_struct(owner);
440 }
441 spin_unlock_irqrestore(&owner->pi_lock, flags);
442 }
443 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
444 spin_lock_irqsave(&owner->pi_lock, flags);
445 if (owner->pi_blocked_on) {
446 boost = 1;
447 /* gets dropped in rt_mutex_adjust_prio_chain()! */
448 get_task_struct(owner);
449 }
450 spin_unlock_irqrestore(&owner->pi_lock, flags); 438 spin_unlock_irqrestore(&owner->pi_lock, flags);
451 } 439 }
452 if (!boost) 440 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
441 chain_walk = 1;
442
443 if (!chain_walk)
453 return 0; 444 return 0;
454 445
446 /*
447 * The owner can't disappear while holding a lock,
448 * so the owner struct is protected by wait_lock.
449 * Gets dropped in rt_mutex_adjust_prio_chain()!
450 */
451 get_task_struct(owner);
452
455 spin_unlock(&lock->wait_lock); 453 spin_unlock(&lock->wait_lock);
456 454
457 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 455 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
@@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock,
532 int first = (waiter == rt_mutex_top_waiter(lock)); 530 int first = (waiter == rt_mutex_top_waiter(lock));
533 struct task_struct *owner = rt_mutex_owner(lock); 531 struct task_struct *owner = rt_mutex_owner(lock);
534 unsigned long flags; 532 unsigned long flags;
535 int boost = 0; 533 int chain_walk = 0;
536 534
537 spin_lock_irqsave(&current->pi_lock, flags); 535 spin_lock_irqsave(&current->pi_lock, flags);
538 plist_del(&waiter->list_entry, &lock->wait_list); 536 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock,
554 } 552 }
555 __rt_mutex_adjust_prio(owner); 553 __rt_mutex_adjust_prio(owner);
556 554
557 if (owner->pi_blocked_on) { 555 if (owner->pi_blocked_on)
558 boost = 1; 556 chain_walk = 1;
559 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 557
560 get_task_struct(owner);
561 }
562 spin_unlock_irqrestore(&owner->pi_lock, flags); 558 spin_unlock_irqrestore(&owner->pi_lock, flags);
563 } 559 }
564 560
565 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 561 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
566 562
567 if (!boost) 563 if (!chain_walk)
568 return; 564 return;
569 565
566 /* gets dropped in rt_mutex_adjust_prio_chain()! */
567 get_task_struct(owner);
568
570 spin_unlock(&lock->wait_lock); 569 spin_unlock(&lock->wait_lock);
571 570
572 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 571 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
@@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 return; 591 return;
593 } 592 }
594 593
595 /* gets dropped in rt_mutex_adjust_prio_chain()! */
596 get_task_struct(task);
597 spin_unlock_irqrestore(&task->pi_lock, flags); 594 spin_unlock_irqrestore(&task->pi_lock, flags);
598 595
596 /* gets dropped in rt_mutex_adjust_prio_chain()! */
597 get_task_struct(task);
599 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
600} 599}
601 600
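All of the rtmutex.c hunks make one ordering change: the `get_task_struct()` that keeps the owner alive across `rt_mutex_adjust_prio_chain()` is now taken after `pi_lock` is dropped but while `wait_lock` is still held, which is safe because a lock owner cannot exit while it still holds the lock. As a hedged userspace analogy of the underlying idiom (pin with a reference while a lock still protects the pointer, then unlock), using pthreads and C11 atomics rather than any kernel API:

/* Userspace analogy, illustrative only: after get_pinned() returns, the
 * refcount, not table_lock, keeps the object alive across the unlocked
 * region, just as the owner reference does across the chain walk. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refs;
	int data;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table_slot;		/* protected by table_lock */

static struct obj *get_pinned(void)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = table_slot;
	if (o)
		atomic_fetch_add(&o->refs, 1);	/* pin before unlock */
	pthread_mutex_unlock(&table_lock);
	return o;	/* usable without table_lock from here on */
}

static void put_obj(struct obj *o)
{
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		free(o);			/* last reference */
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refs, 1);
	o->data = 42;
	table_slot = o;

	struct obj *pinned = get_pinned();
	printf("%d\n", pinned->data);
	put_obj(pinned);
	put_obj(o);		/* drop the table's original reference */
	return 0;
}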
diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee1238..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
238 /* For active balancing */ 238 /* For active balancing */
239 int active_balance; 239 int active_balance;
240 int push_cpu; 240 int push_cpu;
241 int cpu; /* cpu of this runqueue */
241 242
242 struct task_struct *migration_thread; 243 struct task_struct *migration_thread;
243 struct list_head migration_queue; 244 struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
267 268
268static DEFINE_PER_CPU(struct rq, runqueues); 269static DEFINE_PER_CPU(struct rq, runqueues);
269 270
271static inline int cpu_of(struct rq *rq)
272{
273#ifdef CONFIG_SMP
274 return rq->cpu;
275#else
276 return 0;
277#endif
278}
279
270/* 280/*
271 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 281 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
272 * See detach_destroy_domains: synchronize_sched for details. 282 * See detach_destroy_domains: synchronize_sched for details.
@@ -1745,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1745 __releases(rq->lock) 1755 __releases(rq->lock)
1746{ 1756{
1747 struct mm_struct *mm = rq->prev_mm; 1757 struct mm_struct *mm = rq->prev_mm;
1748 unsigned long prev_task_flags; 1758 long prev_state;
1749 1759
1750 rq->prev_mm = NULL; 1760 rq->prev_mm = NULL;
1751 1761
1752 /* 1762 /*
1753 * A task struct has one reference for the use as "current". 1763 * A task struct has one reference for the use as "current".
1754 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1764 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1755 * calls schedule one last time. The schedule call will never return, 1765 * schedule one last time. The schedule call will never return, and
1756 * and the scheduled task must drop that reference. 1766 * the scheduled task must drop that reference.
1757 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1767 * The test for TASK_DEAD must occur while the runqueue locks are
1758 * still held, otherwise prev could be scheduled on another cpu, die 1768 * still held, otherwise prev could be scheduled on another cpu, die
1759 * there before we look at prev->state, and then the reference would 1769 * there before we look at prev->state, and then the reference would
1760 * be dropped twice. 1770 * be dropped twice.
1761 * Manfred Spraul <manfred@colorfullife.com> 1771 * Manfred Spraul <manfred@colorfullife.com>
1762 */ 1772 */
1763 prev_task_flags = prev->flags; 1773 prev_state = prev->state;
1764 finish_arch_switch(prev); 1774 finish_arch_switch(prev);
1765 finish_lock_switch(rq, prev); 1775 finish_lock_switch(rq, prev);
1766 if (mm) 1776 if (mm)
1767 mmdrop(mm); 1777 mmdrop(mm);
1768 if (unlikely(prev_task_flags & PF_DEAD)) { 1778 if (unlikely(prev_state == TASK_DEAD)) {
1769 /* 1779 /*
1770 * Remove function-return probe instances associated with this 1780 * Remove function-return probe instances associated with this
1771 * task and put them back on the free list. 1781 * task and put them back on the free list.
@@ -2211,7 +2221,8 @@ out:
2211 */ 2221 */
2212static struct sched_group * 2222static struct sched_group *
2213find_busiest_group(struct sched_domain *sd, int this_cpu, 2223find_busiest_group(struct sched_domain *sd, int this_cpu,
2214 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 2224 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2225 cpumask_t *cpus)
2215{ 2226{
2216 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2227 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2217 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2228 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2248 sum_weighted_load = sum_nr_running = avg_load = 0; 2259 sum_weighted_load = sum_nr_running = avg_load = 0;
2249 2260
2250 for_each_cpu_mask(i, group->cpumask) { 2261 for_each_cpu_mask(i, group->cpumask) {
2251 struct rq *rq = cpu_rq(i); 2262 struct rq *rq;
2263
2264 if (!cpu_isset(i, *cpus))
2265 continue;
2266
2267 rq = cpu_rq(i);
2252 2268
2253 if (*sd_idle && !idle_cpu(i)) 2269 if (*sd_idle && !idle_cpu(i))
2254 *sd_idle = 0; 2270 *sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
2466 */ 2482 */
2467static struct rq * 2483static struct rq *
2468find_busiest_queue(struct sched_group *group, enum idle_type idle, 2484find_busiest_queue(struct sched_group *group, enum idle_type idle,
2469 unsigned long imbalance) 2485 unsigned long imbalance, cpumask_t *cpus)
2470{ 2486{
2471 struct rq *busiest = NULL, *rq; 2487 struct rq *busiest = NULL, *rq;
2472 unsigned long max_load = 0; 2488 unsigned long max_load = 0;
2473 int i; 2489 int i;
2474 2490
2475 for_each_cpu_mask(i, group->cpumask) { 2491 for_each_cpu_mask(i, group->cpumask) {
2492
2493 if (!cpu_isset(i, *cpus))
2494 continue;
2495
2476 rq = cpu_rq(i); 2496 rq = cpu_rq(i);
2477 2497
2478 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2498 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2511 struct sched_group *group; 2531 struct sched_group *group;
2512 unsigned long imbalance; 2532 unsigned long imbalance;
2513 struct rq *busiest; 2533 struct rq *busiest;
2534 cpumask_t cpus = CPU_MASK_ALL;
2514 2535
2515 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2536 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2516 !sched_smt_power_savings) 2537 !sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2518 2539
2519 schedstat_inc(sd, lb_cnt[idle]); 2540 schedstat_inc(sd, lb_cnt[idle]);
2520 2541
2521 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2542redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus);
2522 if (!group) { 2545 if (!group) {
2523 schedstat_inc(sd, lb_nobusyg[idle]); 2546 schedstat_inc(sd, lb_nobusyg[idle]);
2524 goto out_balanced; 2547 goto out_balanced;
2525 } 2548 }
2526 2549
2527 busiest = find_busiest_queue(group, idle, imbalance); 2550 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2528 if (!busiest) { 2551 if (!busiest) {
2529 schedstat_inc(sd, lb_nobusyq[idle]); 2552 schedstat_inc(sd, lb_nobusyq[idle]);
2530 goto out_balanced; 2553 goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2549 double_rq_unlock(this_rq, busiest); 2572 double_rq_unlock(this_rq, busiest);
2550 2573
2551 /* All tasks on this runqueue were pinned by CPU affinity */ 2574 /* All tasks on this runqueue were pinned by CPU affinity */
2552 if (unlikely(all_pinned)) 2575 if (unlikely(all_pinned)) {
2576 cpu_clear(cpu_of(busiest), cpus);
2577 if (!cpus_empty(cpus))
2578 goto redo;
2553 goto out_balanced; 2579 goto out_balanced;
2580 }
2554 } 2581 }
2555 2582
2556 if (!nr_moved) { 2583 if (!nr_moved) {
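The `redo:` path added above (and mirrored in `load_balance_newidle()` below) shrinks a candidate mask and retries instead of giving up when every task on the busiest queue is pinned. A standalone sketch of that retry shape with a plain bitmask standing in for `cpumask_t`; the busiest/pinned rules are toy assumptions:

/* Standalone sketch of the retry loop: keep a candidate mask, drop a
 * CPU that turned out to be unusable, and retry until a donor is found
 * or the mask is empty. Toy data, not the kernel API. */
#include <stdio.h>

#define NCPUS 4

static int busiest_in(unsigned mask)
{
	/* toy rule: CPU 2 is busiest whenever it is still a candidate */
	if (mask & (1u << 2))
		return 2;
	return mask ? __builtin_ctz(mask) : -1;	/* GCC/Clang builtin */
}

static int all_tasks_pinned(int cpu)
{
	return cpu == 2;	/* toy rule: CPU 2's tasks cannot migrate */
}

int main(void)
{
	unsigned cpus = (1u << NCPUS) - 1;	/* CPU_MASK_ALL analogue */
	int cpu;

redo:
	cpu = busiest_in(cpus);
	if (cpu < 0) {
		puts("balanced: no candidates left");
		return 0;
	}
	if (all_tasks_pinned(cpu)) {
		cpus &= ~(1u << cpu);		/* cpu_clear() analogue */
		if (cpus)
			goto redo;		/* same shape as the patch */
		puts("balanced: every queue was pinned");
		return 0;
	}
	printf("pull from CPU %d\n", cpu);
	return 0;
}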
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2639 unsigned long imbalance; 2666 unsigned long imbalance;
2640 int nr_moved = 0; 2667 int nr_moved = 0;
2641 int sd_idle = 0; 2668 int sd_idle = 0;
2669 cpumask_t cpus = CPU_MASK_ALL;
2642 2670
2643 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) 2671 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2644 sd_idle = 1; 2672 sd_idle = 1;
2645 2673
2646 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2674 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2647 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2675redo:
2676 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2677 &sd_idle, &cpus);
2648 if (!group) { 2678 if (!group) {
2649 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2679 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2650 goto out_balanced; 2680 goto out_balanced;
2651 } 2681 }
2652 2682
2653 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); 2683 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2684 &cpus);
2654 if (!busiest) { 2685 if (!busiest) {
2655 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2686 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2656 goto out_balanced; 2687 goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2668 minus_1_or_zero(busiest->nr_running), 2699 minus_1_or_zero(busiest->nr_running),
2669 imbalance, sd, NEWLY_IDLE, NULL); 2700 imbalance, sd, NEWLY_IDLE, NULL);
2670 spin_unlock(&busiest->lock); 2701 spin_unlock(&busiest->lock);
2702
2703 if (!nr_moved) {
2704 cpu_clear(cpu_of(busiest), cpus);
2705 if (!cpus_empty(cpus))
2706 goto redo;
2707 }
2671 } 2708 }
2672 2709
2673 if (!nr_moved) { 2710 if (!nr_moved) {
@@ -3311,9 +3348,6 @@ need_resched_nonpreemptible:
3311 3348
3312 spin_lock_irq(&rq->lock); 3349 spin_lock_irq(&rq->lock);
3313 3350
3314 if (unlikely(prev->flags & PF_DEAD))
3315 prev->state = EXIT_DEAD;
3316
3317 switch_count = &prev->nivcsw; 3351 switch_count = &prev->nivcsw;
3318 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3352 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3319 switch_count = &prev->nvcsw; 3353 switch_count = &prev->nvcsw;
@@ -4043,6 +4077,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4043 * @p: the task in question. 4077 * @p: the task in question.
4044 * @policy: new policy. 4078 * @policy: new policy.
4045 * @param: structure containing the new RT priority. 4079 * @param: structure containing the new RT priority.
4080 *
 4081 * NOTE: the task may already be dead
4046 */ 4082 */
4047int sched_setscheduler(struct task_struct *p, int policy, 4083int sched_setscheduler(struct task_struct *p, int policy,
4048 struct sched_param *param) 4084 struct sched_param *param)
@@ -4070,28 +4106,32 @@ recheck:
4070 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4106 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4071 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4107 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4072 return -EINVAL; 4108 return -EINVAL;
4073 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 4109 if (is_rt_policy(policy) != (param->sched_priority != 0))
4074 != (param->sched_priority == 0))
4075 return -EINVAL; 4110 return -EINVAL;
4076 4111
4077 /* 4112 /*
4078 * Allow unprivileged RT tasks to decrease priority: 4113 * Allow unprivileged RT tasks to decrease priority:
4079 */ 4114 */
4080 if (!capable(CAP_SYS_NICE)) { 4115 if (!capable(CAP_SYS_NICE)) {
4081 /* 4116 if (is_rt_policy(policy)) {
4082 * can't change policy, except between SCHED_NORMAL 4117 unsigned long rlim_rtprio;
4083 * and SCHED_BATCH: 4118 unsigned long flags;
4084 */ 4119
4085 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 4120 if (!lock_task_sighand(p, &flags))
4086 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 4121 return -ESRCH;
4087 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 4122 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4088 return -EPERM; 4123 unlock_task_sighand(p, &flags);
4089 /* can't increase priority */ 4124
4090 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 4125 /* can't set/change the rt policy */
4091 param->sched_priority > p->rt_priority && 4126 if (policy != p->policy && !rlim_rtprio)
4092 param->sched_priority > 4127 return -EPERM;
4093 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 4128
4094 return -EPERM; 4129 /* can't increase priority */
4130 if (param->sched_priority > p->rt_priority &&
4131 param->sched_priority > rlim_rtprio)
4132 return -EPERM;
4133 }
4134
4095 /* can't change other user's priorities */ 4135 /* can't change other user's priorities */
4096 if ((current->euid != p->euid) && 4136 if ((current->euid != p->euid) &&
4097 (current->euid != p->uid)) 4137 (current->euid != p->uid))
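The rewritten permission check snapshots `RLIMIT_RTPRIO` under the target's sighand lock and compares the requested priority against it. The same ceiling is visible from userspace via getrlimit(2); a small probe, assuming a Linux system where `RLIMIT_RTPRIO` is defined:

/* Query the RLIMIT_RTPRIO ceiling that the rewritten check enforces
 * for unprivileged sched_setscheduler() callers. */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_RTPRIO, &rl))
		return perror("getrlimit"), 1;

	/* rlim_cur == 0 means: may not acquire or raise an RT priority */
	printf("RLIMIT_RTPRIO soft=%llu hard=%llu\n",
	       (unsigned long long)rl.rlim_cur,
	       (unsigned long long)rl.rlim_max);
	return 0;
}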
@@ -4156,14 +4196,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4156 return -EINVAL; 4196 return -EINVAL;
4157 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4197 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4158 return -EFAULT; 4198 return -EFAULT;
4159 read_lock_irq(&tasklist_lock); 4199
4200 rcu_read_lock();
4201 retval = -ESRCH;
4160 p = find_process_by_pid(pid); 4202 p = find_process_by_pid(pid);
4161 if (!p) { 4203 if (p != NULL)
4162 read_unlock_irq(&tasklist_lock); 4204 retval = sched_setscheduler(p, policy, &lparam);
4163 return -ESRCH; 4205 rcu_read_unlock();
4164 }
4165 retval = sched_setscheduler(p, policy, &lparam);
4166 read_unlock_irq(&tasklist_lock);
4167 4206
4168 return retval; 4207 return retval;
4169} 4208}
@@ -5114,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5114 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5153 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5115 5154
5116 /* Cannot have done final schedule yet: would have vanished. */ 5155 /* Cannot have done final schedule yet: would have vanished. */
5117 BUG_ON(p->flags & PF_DEAD); 5156 BUG_ON(p->state == TASK_DEAD);
5118 5157
5119 get_task_struct(p); 5158 get_task_struct(p);
5120 5159
@@ -5235,9 +5274,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
5235int __init migration_init(void) 5274int __init migration_init(void)
5236{ 5275{
5237 void *cpu = (void *)(long)smp_processor_id(); 5276 void *cpu = (void *)(long)smp_processor_id();
5277 int err;
5238 5278
5239 /* Start one for the boot CPU: */ 5279 /* Start one for the boot CPU: */
5240 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5280 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5281 BUG_ON(err == NOTIFY_BAD);
5241 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5282 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5242 register_cpu_notifier(&migration_notifier); 5283 register_cpu_notifier(&migration_notifier);
5243 5284
@@ -6747,6 +6788,7 @@ void __init sched_init(void)
6747 rq->cpu_load[j] = 0; 6788 rq->cpu_load[j] = 0;
6748 rq->active_balance = 0; 6789 rq->active_balance = 0;
6749 rq->push_cpu = 0; 6790 rq->push_cpu = 0;
6791 rq->cpu = i;
6750 rq->migration_thread = NULL; 6792 rq->migration_thread = NULL;
6751 INIT_LIST_HEAD(&rq->migration_queue); 6793 INIT_LIST_HEAD(&rq->migration_queue);
6752#endif 6794#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index bfdb5686fa3e..fb5da6d19f14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
417static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 417static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
418 siginfo_t *info) 418 siginfo_t *info)
419{ 419{
420 int sig = 0; 420 int sig = next_signal(pending, mask);
421 421
422 sig = next_signal(pending, mask);
423 if (sig) { 422 if (sig) {
424 if (current->notifier) { 423 if (current->notifier) {
425 if (sigismember(current->notifier_mask, sig)) { 424 if (sigismember(current->notifier_mask, sig)) {
@@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
432 431
433 if (!collect_signal(sig, pending, info)) 432 if (!collect_signal(sig, pending, info))
434 sig = 0; 433 sig = 0;
435
436 } 434 }
437 recalc_sigpending();
438 435
439 return sig; 436 return sig;
440} 437}
@@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
451 if (!signr) 448 if (!signr)
452 signr = __dequeue_signal(&tsk->signal->shared_pending, 449 signr = __dequeue_signal(&tsk->signal->shared_pending,
453 mask, info); 450 mask, info);
451 recalc_sigpending_tsk(tsk);
454 if (signr && unlikely(sig_kernel_stop(signr))) { 452 if (signr && unlikely(sig_kernel_stop(signr))) {
455 /* 453 /*
456 * Set a marker that we have dequeued a stop signal. Our 454 * Set a marker that we have dequeued a stop signal. Our
@@ -2577,6 +2575,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2577} 2575}
2578#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2576#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
2579 2577
2578__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
2579{
2580 return NULL;
2581}
2582
2580void __init signals_init(void) 2583void __init signals_init(void)
2581{ 2584{
2582 sigqueue_cachep = 2585 sigqueue_cachep =
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3789ca98197c..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
612__init int spawn_ksoftirqd(void) 612__init int spawn_ksoftirqd(void)
613{ 613{
614 void *cpu = (void *)(long)smp_processor_id(); 614 void *cpu = (void *)(long)smp_processor_id();
615 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 615 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
616
617 BUG_ON(err == NOTIFY_BAD);
616 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 618 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
617 register_cpu_notifier(&cpu_nfb); 619 register_cpu_notifier(&cpu_nfb);
618 return 0; 620 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 03e6a2b0b787..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
149__init void spawn_softlockup_task(void) 149__init void spawn_softlockup_task(void)
150{ 150{
151 void *cpu = (void *)(long)smp_processor_id(); 151 void *cpu = (void *)(long)smp_processor_id();
152 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
152 153
153 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 154 BUG_ON(err == NOTIFY_BAD);
154 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 155 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
155 register_cpu_notifier(&cpu_nfb); 156 register_cpu_notifier(&cpu_nfb);
156 157
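This hunk and the softirq.c one above, like `migration_init()` and `init_timers()` elsewhere in the patch, stop discarding the `CPU_UP_PREPARE` result: a refusal at boot becomes a hard `BUG_ON(err == NOTIFY_BAD)`. A userspace rendition of the shape, with a stubbed callback instead of the real notifier chain (the NOTIFY_* values are copied from include/linux/notifier.h):

/* Userspace rendition of the repeated pattern: run the bring-up
 * callback for the boot CPU and treat a refusal as fatal instead of
 * ignoring it. Stubbed callback; the real code uses notifier chains. */
#include <assert.h>
#include <stdio.h>

#define NOTIFY_OK	0x0001
#define NOTIFY_BAD	0x8002	/* NOTIFY_STOP_MASK | 0x0002 */

enum { CPU_UP_PREPARE, CPU_ONLINE };

static int cpu_callback(unsigned long action, void *cpu)
{
	/* stub: pretend per-CPU setup (e.g. kthread creation) succeeded */
	return NOTIFY_OK;
}

int main(void)
{
	void *cpu = (void *)0L;		/* boot CPU */
	int err = cpu_callback(CPU_UP_PREPARE, cpu);

	assert(err != NOTIFY_BAD);	/* the added BUG_ON() analogue */
	cpu_callback(CPU_ONLINE, cpu);
	puts("boot CPU callbacks ran");
	return 0;
}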
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index fb524b009eef..d48143eafbfd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -7,6 +7,11 @@
7 * 7 *
8 * This file contains the spinlock/rwlock implementations for the 8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) 9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
10 *
11 * Note that some architectures have special knowledge about the
12 * stack frames of these functions in their profile_pc. If you
13 * change anything significant here that could change the stack
 14 * frame, contact the architecture maintainers.
10 */ 15 */
11 16
12#include <linux/linkage.h> 17#include <linux/linkage.h>
@@ -16,17 +21,6 @@
16#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
17#include <linux/module.h> 22#include <linux/module.h>
18 23
19/*
20 * Generic declaration of the raw read_trylock() function,
21 * architectures are supposed to optimize this:
22 */
23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
24{
25 __raw_read_lock(lock);
26 return 1;
27}
28EXPORT_SYMBOL(generic__raw_read_trylock);
29
30int __lockfunc _spin_trylock(spinlock_t *lock) 24int __lockfunc _spin_trylock(spinlock_t *lock)
31{ 25{
32 preempt_disable(); 26 preempt_disable();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51cacd111dbd..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version.
3 */
1#include <linux/stop_machine.h> 4#include <linux/stop_machine.h>
2#include <linux/kthread.h> 5#include <linux/kthread.h>
3#include <linux/sched.h> 6#include <linux/sched.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index e236f98f7ec5..b88806c66244 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
28#include <linux/tty.h> 28#include <linux/tty.h>
29#include <linux/signal.h> 29#include <linux/signal.h>
30#include <linux/cn_proc.h> 30#include <linux/cn_proc.h>
31#include <linux/getcpu.h>
31 32
32#include <linux/compat.h> 33#include <linux/compat.h>
33#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -611,7 +612,6 @@ void kernel_restart(char *cmd)
611 } else { 612 } else {
612 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 613 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
613 } 614 }
614 printk(".\n");
615 machine_restart(cmd); 615 machine_restart(cmd);
616} 616}
617EXPORT_SYMBOL_GPL(kernel_restart); 617EXPORT_SYMBOL_GPL(kernel_restart);
@@ -2062,3 +2062,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2062 } 2062 }
2063 return error; 2063 return error;
2064} 2064}
2065
2066asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2067 struct getcpu_cache __user *cache)
2068{
2069 int err = 0;
2070 int cpu = raw_smp_processor_id();
2071 if (cpup)
2072 err |= put_user(cpu, cpup);
2073 if (nodep)
2074 err |= put_user(cpu_to_node(cpu), nodep);
2075 if (cache) {
2076 /*
2077 * The cache is not needed for this implementation,
2078 * but make sure user programs pass something
2079 * valid. vsyscall implementations can instead make
2080 * good use of the cache. Only use t0 and t1 because
2081 * these are available in both 32bit and 64bit ABI (no
2082 * need for a compat_getcpu). 32bit has enough
 2083 * padding.
2084 */
2085 unsigned long t0, t1;
2086 get_user(t0, &cache->blob[0]);
2087 get_user(t1, &cache->blob[1]);
2088 t0++;
2089 t1++;
2090 put_user(t0, &cache->blob[0]);
2091 put_user(t1, &cache->blob[1]);
2092 }
2093 return err ? -EFAULT : 0;
2094}
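The new sys_getcpu() returns the current CPU and NUMA node and only touches the cache argument to make sure callers pass a valid pointer. A hedged usage sketch, assuming a kernel exposing `__NR_getcpu` and no glibc wrapper yet, so the call goes through syscall(2); the result is advisory, since the task may migrate right after the call returns:

/* Exercise sys_getcpu() raw via syscall(2); SYS_getcpu must exist in
 * the installed kernel headers. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned cpu, node;

	/* NULL cache: the kernel only validates it, so skip it here */
	if (syscall(SYS_getcpu, &cpu, &node, NULL))
		return perror("getcpu"), 1;

	printf("running on cpu %u, node %u\n", cpu, node);
	return 0;
}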
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc37138..c57c4532e296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
52extern int proc_nr_files(ctl_table *table, int write, struct file *filp, 52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
53 void __user *buffer, size_t *lenp, loff_t *ppos); 53 void __user *buffer, size_t *lenp, loff_t *ppos);
54 54
55#ifdef CONFIG_X86
56#include <asm/nmi.h>
57#endif
58
55#if defined(CONFIG_SYSCTL) 59#if defined(CONFIG_SYSCTL)
56 60
57/* External variables not in a header file. */ 61/* External variables not in a header file. */
@@ -74,12 +78,6 @@ extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 78extern int percpu_pagelist_fraction;
75extern int compat_log; 79extern int compat_log;
76 80
77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
78int unknown_nmi_panic;
79extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
80 void __user *, size_t *, loff_t *);
81#endif
82
83/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
84static int maxolduid = 65535; 82static int maxolduid = 65535;
85static int minolduid; 83static int minolduid;
@@ -136,8 +134,11 @@ extern int no_unaligned_warning;
136extern int max_lock_depth; 134extern int max_lock_depth;
137#endif 135#endif
138 136
139static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 137#ifdef CONFIG_SYSCTL_SYSCALL
140 ctl_table *, void **); 138static int parse_table(int __user *, int, void __user *, size_t __user *,
139 void __user *, size_t, ctl_table *, void **);
140#endif
141
141static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 142static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
142 void __user *buffer, size_t *lenp, loff_t *ppos); 143 void __user *buffer, size_t *lenp, loff_t *ppos);
143 144
@@ -164,7 +165,7 @@ int sysctl_legacy_va_layout;
164 165
165/* /proc declarations: */ 166/* /proc declarations: */
166 167
167#ifdef CONFIG_PROC_FS 168#ifdef CONFIG_PROC_SYSCTL
168 169
169static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); 170static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
170static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -628,11 +629,27 @@ static ctl_table kern_table[] = {
628 .data = &unknown_nmi_panic, 629 .data = &unknown_nmi_panic,
629 .maxlen = sizeof (int), 630 .maxlen = sizeof (int),
630 .mode = 0644, 631 .mode = 0644,
631 .proc_handler = &proc_unknown_nmi_panic, 632 .proc_handler = &proc_dointvec,
633 },
634 {
635 .ctl_name = KERN_NMI_WATCHDOG,
636 .procname = "nmi_watchdog",
637 .data = &nmi_watchdog_enabled,
638 .maxlen = sizeof (int),
639 .mode = 0644,
640 .proc_handler = &proc_nmi_enabled,
632 }, 641 },
633#endif 642#endif
634#if defined(CONFIG_X86) 643#if defined(CONFIG_X86)
635 { 644 {
645 .ctl_name = KERN_PANIC_ON_NMI,
646 .procname = "panic_on_unrecovered_nmi",
647 .data = &panic_on_unrecovered_nmi,
648 .maxlen = sizeof(int),
649 .mode = 0644,
650 .proc_handler = &proc_dointvec,
651 },
652 {
636 .ctl_name = KERN_BOOTLOADER_TYPE, 653 .ctl_name = KERN_BOOTLOADER_TYPE,
637 .procname = "bootloader_type", 654 .procname = "bootloader_type",
638 .data = &bootloader_type, 655 .data = &bootloader_type,
@@ -943,6 +960,17 @@ static ctl_table vm_table[] = {
943 .extra1 = &zero, 960 .extra1 = &zero,
944 .extra2 = &one_hundred, 961 .extra2 = &one_hundred,
945 }, 962 },
963 {
964 .ctl_name = VM_MIN_SLAB,
965 .procname = "min_slab_ratio",
966 .data = &sysctl_min_slab_ratio,
967 .maxlen = sizeof(sysctl_min_slab_ratio),
968 .mode = 0644,
969 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
970 .strategy = &sysctl_intvec,
971 .extra1 = &zero,
972 .extra2 = &one_hundred,
973 },
946#endif 974#endif
947#ifdef CONFIG_X86_32 975#ifdef CONFIG_X86_32
948 { 976 {
@@ -1138,12 +1166,13 @@ static void start_unregistering(struct ctl_table_header *p)
1138 1166
1139void __init sysctl_init(void) 1167void __init sysctl_init(void)
1140{ 1168{
1141#ifdef CONFIG_PROC_FS 1169#ifdef CONFIG_PROC_SYSCTL
1142 register_proc_table(root_table, proc_sys_root, &root_table_header); 1170 register_proc_table(root_table, proc_sys_root, &root_table_header);
1143 init_irq_proc(); 1171 init_irq_proc();
1144#endif 1172#endif
1145} 1173}
1146 1174
1175#ifdef CONFIG_SYSCTL_SYSCALL
1147int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1176int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1148 void __user *newval, size_t newlen) 1177 void __user *newval, size_t newlen)
1149{ 1178{
@@ -1197,6 +1226,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1197 unlock_kernel(); 1226 unlock_kernel();
1198 return error; 1227 return error;
1199} 1228}
1229#endif /* CONFIG_SYSCTL_SYSCALL */
1200 1230
1201/* 1231/*
1202 * ctl_perm does NOT grant the superuser all rights automatically, because 1232 * ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1223,6 +1253,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1223 return test_perm(table->mode, op); 1253 return test_perm(table->mode, op);
1224} 1254}
1225 1255
1256#ifdef CONFIG_SYSCTL_SYSCALL
1226static int parse_table(int __user *name, int nlen, 1257static int parse_table(int __user *name, int nlen,
1227 void __user *oldval, size_t __user *oldlenp, 1258 void __user *oldval, size_t __user *oldlenp,
1228 void __user *newval, size_t newlen, 1259 void __user *newval, size_t newlen,
@@ -1312,6 +1343,7 @@ int do_sysctl_strategy (ctl_table *table,
1312 } 1343 }
1313 return 0; 1344 return 0;
1314} 1345}
1346#endif /* CONFIG_SYSCTL_SYSCALL */
1315 1347
1316/** 1348/**
1317 * register_sysctl_table - register a sysctl hierarchy 1349 * register_sysctl_table - register a sysctl hierarchy
@@ -1399,7 +1431,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1399 else 1431 else
1400 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1432 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1401 spin_unlock(&sysctl_lock); 1433 spin_unlock(&sysctl_lock);
1402#ifdef CONFIG_PROC_FS 1434#ifdef CONFIG_PROC_SYSCTL
1403 register_proc_table(table, proc_sys_root, tmp); 1435 register_proc_table(table, proc_sys_root, tmp);
1404#endif 1436#endif
1405 return tmp; 1437 return tmp;
@@ -1417,18 +1449,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1417 might_sleep(); 1449 might_sleep();
1418 spin_lock(&sysctl_lock); 1450 spin_lock(&sysctl_lock);
1419 start_unregistering(header); 1451 start_unregistering(header);
1420#ifdef CONFIG_PROC_FS 1452#ifdef CONFIG_PROC_SYSCTL
1421 unregister_proc_table(header->ctl_table, proc_sys_root); 1453 unregister_proc_table(header->ctl_table, proc_sys_root);
1422#endif 1454#endif
1423 spin_unlock(&sysctl_lock); 1455 spin_unlock(&sysctl_lock);
1424 kfree(header); 1456 kfree(header);
1425} 1457}
1426 1458
1459#else /* !CONFIG_SYSCTL */
1460struct ctl_table_header * register_sysctl_table(ctl_table * table,
1461 int insert_at_head)
1462{
1463 return NULL;
1464}
1465
1466void unregister_sysctl_table(struct ctl_table_header * table)
1467{
1468}
1469
1470#endif /* CONFIG_SYSCTL */
1471
1427/* 1472/*
1428 * /proc/sys support 1473 * /proc/sys support
1429 */ 1474 */
1430 1475
1431#ifdef CONFIG_PROC_FS 1476#ifdef CONFIG_PROC_SYSCTL
1432 1477
1433/* Scan the sysctl entries in table and add them all into /proc */ 1478/* Scan the sysctl entries in table and add them all into /proc */
1434static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) 1479static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -1867,7 +1912,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1867 return -EPERM; 1912 return -EPERM;
1868 } 1913 }
1869 1914
1870 op = (current->pid == 1) ? OP_SET : OP_AND; 1915 op = is_init(current) ? OP_SET : OP_AND;
1871 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1916 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1872 do_proc_dointvec_bset_conv,&op); 1917 do_proc_dointvec_bset_conv,&op);
1873} 1918}
@@ -2290,6 +2335,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2290#endif /* CONFIG_PROC_FS */ 2335#endif /* CONFIG_PROC_FS */
2291 2336
2292 2337
2338#ifdef CONFIG_SYSCTL_SYSCALL
2293/* 2339/*
2294 * General sysctl support routines 2340 * General sysctl support routines
2295 */ 2341 */
@@ -2432,11 +2478,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2432 return 1; 2478 return 1;
2433} 2479}
2434 2480
2435#else /* CONFIG_SYSCTL */ 2481#else /* CONFIG_SYSCTL_SYSCALL */
2436 2482
2437 2483
2438asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2484asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2439{ 2485{
2486 static int msg_count;
2487
2488 if (msg_count < 5) {
2489 msg_count++;
2490 printk(KERN_INFO
2491 "warning: process `%s' used the removed sysctl "
2492 "system call\n", current->comm);
2493 }
2440 return -ENOSYS; 2494 return -ENOSYS;
2441} 2495}
2442 2496
@@ -2468,73 +2522,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2468 return -ENOSYS; 2522 return -ENOSYS;
2469} 2523}
2470 2524
2471int proc_dostring(ctl_table *table, int write, struct file *filp, 2525#endif /* CONFIG_SYSCTL_SYSCALL */
2472 void __user *buffer, size_t *lenp, loff_t *ppos)
2473{
2474 return -ENOSYS;
2475}
2476
2477int proc_dointvec(ctl_table *table, int write, struct file *filp,
2478 void __user *buffer, size_t *lenp, loff_t *ppos)
2479{
2480 return -ENOSYS;
2481}
2482
2483int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
2484 void __user *buffer, size_t *lenp, loff_t *ppos)
2485{
2486 return -ENOSYS;
2487}
2488
2489int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
2490 void __user *buffer, size_t *lenp, loff_t *ppos)
2491{
2492 return -ENOSYS;
2493}
2494
2495int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2496 void __user *buffer, size_t *lenp, loff_t *ppos)
2497{
2498 return -ENOSYS;
2499}
2500
2501int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2502 void __user *buffer, size_t *lenp, loff_t *ppos)
2503{
2504 return -ENOSYS;
2505}
2506
2507int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2508 void __user *buffer, size_t *lenp, loff_t *ppos)
2509{
2510 return -ENOSYS;
2511}
2512
2513int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2514 void __user *buffer, size_t *lenp, loff_t *ppos)
2515{
2516 return -ENOSYS;
2517}
2518
2519int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2520 struct file *filp,
2521 void __user *buffer,
2522 size_t *lenp, loff_t *ppos)
2523{
2524 return -ENOSYS;
2525}
2526
2527struct ctl_table_header * register_sysctl_table(ctl_table * table,
2528 int insert_at_head)
2529{
2530 return NULL;
2531}
2532
2533void unregister_sysctl_table(struct ctl_table_header * table)
2534{
2535}
2536
2537#endif /* CONFIG_SYSCTL */
2538 2526
2539/* 2527/*
2540 * No sense putting this after each symbol definition, twice, 2528 * No sense putting this after each symbol definition, twice,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e78187657330..2ed4040d0dc5 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
75 /* 75 /*
76 * If new attributes are added, please revisit this allocation 76 * If new attributes are added, please revisit this allocation
77 */ 77 */
78 skb = nlmsg_new(size); 78 skb = nlmsg_new(size, GFP_KERNEL);
79 if (!skb) 79 if (!skb)
80 return -ENOMEM; 80 return -ENOMEM;
81 81
diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd6267c2d..4f55622b0d38 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
136 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
137} 137}
138 138
139/*** 139/**
140 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
141 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
142 * 142 *
@@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer,
175 */ 175 */
176static tvec_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
177 unsigned long *flags) 177 unsigned long *flags)
178 __acquires(timer->base->lock)
178{ 179{
179 tvec_base_t *base; 180 tvec_base_t *base;
180 181
@@ -235,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
235 236
236EXPORT_SYMBOL(__mod_timer); 237EXPORT_SYMBOL(__mod_timer);
237 238
238/*** 239/**
239 * add_timer_on - start a timer on a particular CPU 240 * add_timer_on - start a timer on a particular CPU
240 * @timer: the timer to be added 241 * @timer: the timer to be added
241 * @cpu: the CPU to start it on 242 * @cpu: the CPU to start it on
@@ -255,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
255} 256}
256 257
257 258
258/*** 259/**
259 * mod_timer - modify a timer's timeout 260 * mod_timer - modify a timer's timeout
260 * @timer: the timer to be modified 261 * @timer: the timer to be modified
262 * @expires: new timeout in jiffies
261 * 263 *
262 * mod_timer is a more efficient way to update the expire field of an 264 * mod_timer is a more efficient way to update the expire field of an
263 * active timer (if the timer is inactive it will be activated) 265 * active timer (if the timer is inactive it will be activated)
@@ -291,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
291 293
292EXPORT_SYMBOL(mod_timer); 294EXPORT_SYMBOL(mod_timer);
293 295
294/*** 296/**
 295 * del_timer - deactivate a timer. 297 * del_timer - deactivate a timer.
296 * @timer: the timer to be deactivated 298 * @timer: the timer to be deactivated
297 * 299 *
@@ -323,7 +325,10 @@ int del_timer(struct timer_list *timer)
323EXPORT_SYMBOL(del_timer); 325EXPORT_SYMBOL(del_timer);
324 326
325#ifdef CONFIG_SMP 327#ifdef CONFIG_SMP
326/* 328/**
329 * try_to_del_timer_sync - Try to deactivate a timer
 330 * @timer: the timer to deactivate
331 *
327 * This function tries to deactivate a timer. Upon successful (ret >= 0) 332 * This function tries to deactivate a timer. Upon successful (ret >= 0)
328 * exit the timer is not queued and the handler is not running on any CPU. 333 * exit the timer is not queued and the handler is not running on any CPU.
329 * 334 *
@@ -351,7 +356,7 @@ out:
351 return ret; 356 return ret;
352} 357}
353 358
354/*** 359/**
355 * del_timer_sync - deactivate a timer and wait for the handler to finish. 360 * del_timer_sync - deactivate a timer and wait for the handler to finish.
356 * @timer: the timer to be deactivated 361 * @timer: the timer to be deactivated
357 * 362 *
@@ -401,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
401 return index; 406 return index;
402} 407}
403 408
404/*** 409#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
410
411/**
405 * __run_timers - run all expired timers (if any) on this CPU. 412 * __run_timers - run all expired timers (if any) on this CPU.
406 * @base: the timer vector to be processed. 413 * @base: the timer vector to be processed.
407 * 414 *
408 * This function cascades all vectors and executes all expired timer 415 * This function cascades all vectors and executes all expired timer
409 * vectors. 416 * vectors.
410 */ 417 */
411#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
412
413static inline void __run_timers(tvec_base_t *base) 418static inline void __run_timers(tvec_base_t *base)
414{ 419{
415 struct timer_list *timer; 420 struct timer_list *timer;
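`INDEX(N)` selects the slot in cascade level N from successive bit fields of `timer_jiffies`: the bottom `TVR_BITS` index tv1, and each further level consumes `TVN_BITS` more. A worked example with the stock constants from kernel/timer.c, using a file-scope variable where the kernel reads `base->timer_jiffies`:

/* Worked example of the INDEX() slot math moved above __run_timers().
 * Constants copied from kernel/timer.c (non-BASE_SMALL configuration). */
#include <stdio.h>

#define TVN_BITS	6
#define TVR_BITS	8
#define TVN_MASK	((1 << TVN_BITS) - 1)
#define TVR_MASK	((1 << TVR_BITS) - 1)

static unsigned long timer_jiffies = 0x12345678UL;

#define INDEX(N) ((timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

int main(void)
{
	printf("tv1 slot: %lu\n", timer_jiffies & TVR_MASK);
	for (int n = 0; n < 4; n++)
		printf("tv%d slot: %lu\n", n + 2, INDEX(n));
	return 0;
}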
@@ -970,7 +975,7 @@ void __init timekeeping_init(void)
970 975
971 976
972static int timekeeping_suspended; 977static int timekeeping_suspended;
973/* 978/**
974 * timekeeping_resume - Resumes the generic timekeeping subsystem. 979 * timekeeping_resume - Resumes the generic timekeeping subsystem.
975 * @dev: unused 980 * @dev: unused
976 * 981 *
@@ -1106,7 +1111,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
1106 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1111 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1107} 1112}
1108 1113
1109/* 1114/**
1110 * update_wall_time - Uses the current clocksource to increment the wall time 1115 * update_wall_time - Uses the current clocksource to increment the wall time
1111 * 1116 *
1112 * Called from the timer interrupt, must hold a write on xtime_lock. 1117 * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -1217,10 +1222,8 @@ static inline void calc_load(unsigned long ticks)
1217 unsigned long active_tasks; /* fixed-point */ 1222 unsigned long active_tasks; /* fixed-point */
1218 static int count = LOAD_FREQ; 1223 static int count = LOAD_FREQ;
1219 1224
1220 count -= ticks; 1225 active_tasks = count_active_tasks();
1221 if (count < 0) { 1226 for (count -= ticks; count < 0; count += LOAD_FREQ) {
1222 count += LOAD_FREQ;
1223 active_tasks = count_active_tasks();
1224 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 1227 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1225 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 1228 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1226 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 1229 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
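The rewritten `calc_load()` samples the active-task count once and then folds one decay step per missed `LOAD_FREQ` window, where the old code folded at most one step per call. A runnable fixed-point demo of the catch-up loop; the FSHIFT/EXP_* constants are copied from the kernel, and the tick gap is invented:

/* Demo of the catch-up loop: one fixed-point decay fold per missed
 * LOAD_FREQ window (1/5/15-minute averages). */
#include <stdio.h>

#define HZ		100
#define FSHIFT		11
#define FIXED_1		(1 << FSHIFT)
#define LOAD_FREQ	(5 * HZ)
#define EXP_1		1884
#define EXP_5		2014
#define EXP_15		2037

#define CALC_LOAD(load, exp, n) \
	load = ((load) * (exp) + (n) * (FIXED_1 - (exp))) >> FSHIFT

static unsigned long avenrun[3];

static void calc_load(long ticks, unsigned long active_tasks)
{
	static int count = LOAD_FREQ;

	for (count -= ticks; count < 0; count += LOAD_FREQ) {
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}

int main(void)
{
	/* pretend 3 runnable tasks and a 20-window gap of lost ticks */
	calc_load(20L * LOAD_FREQ, 3 * FIXED_1);
	printf("load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}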
@@ -1265,11 +1268,8 @@ void run_local_timers(void)
1265 * Called by the timer interrupt. xtime_lock must already be taken 1268 * Called by the timer interrupt. xtime_lock must already be taken
1266 * by the timer IRQ! 1269 * by the timer IRQ!
1267 */ 1270 */
1268static inline void update_times(void) 1271static inline void update_times(unsigned long ticks)
1269{ 1272{
1270 unsigned long ticks;
1271
1272 ticks = jiffies - wall_jiffies;
1273 wall_jiffies += ticks; 1273 wall_jiffies += ticks;
1274 update_wall_time(); 1274 update_wall_time();
1275 calc_load(ticks); 1275 calc_load(ticks);
@@ -1281,12 +1281,10 @@ static inline void update_times(void)
1281 * jiffies is defined in the linker script... 1281 * jiffies is defined in the linker script...
1282 */ 1282 */
1283 1283
1284void do_timer(struct pt_regs *regs) 1284void do_timer(unsigned long ticks)
1285{ 1285{
1286 jiffies_64++; 1286 jiffies_64 += ticks;
1287 /* prevent loading jiffies before storing new jiffies_64 value. */ 1287 update_times(ticks);
1288 barrier();
1289 update_times();
1290} 1288}
1291 1289
1292#ifdef __ARCH_WANT_SYS_ALARM 1290#ifdef __ARCH_WANT_SYS_ALARM
@@ -1470,8 +1468,9 @@ asmlinkage long sys_gettid(void)
1470 return current->pid; 1468 return current->pid;
1471} 1469}
1472 1470
1473/* 1471/**
1474 * sys_sysinfo - fill in sysinfo struct 1472 * sys_sysinfo - fill in sysinfo struct
1473 * @info: pointer to buffer to fill
1475 */ 1474 */
1476asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1475asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1477{ 1476{
@@ -1688,8 +1687,10 @@ static struct notifier_block __cpuinitdata timers_nb = {
1688 1687
1689void __init init_timers(void) 1688void __init init_timers(void)
1690{ 1689{
1691 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1690 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1692 (void *)(long)smp_processor_id()); 1691 (void *)(long)smp_processor_id());
1692
1693 BUG_ON(err == NOTIFY_BAD);
1693 register_cpu_notifier(&timers_nb); 1694 register_cpu_notifier(&timers_nb);
1694 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1695 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1695} 1696}
diff --git a/kernel/unwind.c b/kernel/unwind.c
index f69c804c8e62..2e2368607aab 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -102,7 +102,7 @@ static struct unwind_table {
102 unsigned long size; 102 unsigned long size;
103 struct unwind_table *link; 103 struct unwind_table *link;
104 const char *name; 104 const char *name;
105} root_table, *last_table; 105} root_table;
106 106
107struct unwind_item { 107struct unwind_item {
108 enum item_location { 108 enum item_location {
@@ -174,6 +174,8 @@ void __init unwind_init(void)
174 174
175#ifdef CONFIG_MODULES 175#ifdef CONFIG_MODULES
176 176
177static struct unwind_table *last_table;
178
177/* Must be called with module_mutex held. */ 179/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module, 180void *unwind_add_table(struct module *module,
179 const void *table_start, 181 const void *table_start,
@@ -603,6 +605,7 @@ int unwind(struct unwind_frame_info *frame)
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) 605#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL; 606 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL; 607 const u8 *ptr = NULL, *end = NULL;
608 unsigned long pc = UNW_PC(frame) - frame->call_frame;
606 unsigned long startLoc = 0, endLoc = 0, cfa; 609 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i; 610 unsigned i;
608 signed ptrType = -1; 611 signed ptrType = -1;
@@ -612,7 +615,7 @@ int unwind(struct unwind_frame_info *frame)
612 615
613 if (UNW_PC(frame) == 0) 616 if (UNW_PC(frame) == 0)
614 return -EINVAL; 617 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL 618 if ((table = find_table(pc)) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) { 619 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size; 620 unsigned long tableSize = table->size;
618 621
@@ -647,7 +650,7 @@ int unwind(struct unwind_frame_info *frame)
647 ptrType & DW_EH_PE_indirect 650 ptrType & DW_EH_PE_indirect
648 ? ptrType 651 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); 652 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) 653 if (pc >= startLoc && pc < endLoc)
651 break; 654 break;
652 cie = NULL; 655 cie = NULL;
653 } 656 }
@@ -657,16 +660,28 @@ int unwind(struct unwind_frame_info *frame)
657 state.cieEnd = ptr; /* keep here temporarily */ 660 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2); 661 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie; 662 end = (const u8 *)(cie + 1) + *cie;
663 frame->call_frame = 1;
660 if ((state.version = *ptr) != 1) 664 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */ 665 cie = NULL; /* unsupported version */
662 else if (*++ptr) { 666 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */ 667 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') { 668 if (*ptr == 'z') {
665 /* check for ignorable (or already handled) 669 while (++ptr < end && *ptr) {
666 * nul-terminated augmentation string */ 670 switch(*ptr) {
667 while (++ptr < end && *ptr) 671 /* check for ignorable (or already handled)
668 if (strchr("LPR", *ptr) == NULL) 672 * nul-terminated augmentation string */
673 case 'L':
674 case 'P':
675 case 'R':
676 continue;
677 case 'S':
678 frame->call_frame = 0;
679 continue;
680 default:
669 break; 681 break;
682 }
683 break;
684 }
670 } 685 }
671 if (ptr >= end || *ptr) 686 if (ptr >= end || *ptr)
672 cie = NULL; 687 cie = NULL;
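The new augmentation scan still skips the 'L', 'P' and 'R' entries of a 'z' augmentation string, but now recognizes 'S', which marks a signal frame, so `frame->call_frame` is cleared and the lookup PC is not biased back by one. A standalone parser following the same convention; the function name and return convention here are illustrative:

/* Standalone rendition of the CIE augmentation scan: a string starting
 * with 'z' may contain L/P/R (ignorable here) and 'S', which marks a
 * signal frame. Anything else aborts the scan. */
#include <stdio.h>

static int parse_augmentation(const char *aug, int *signal_frame)
{
	*signal_frame = 0;
	if (*aug != 'z')
		return *aug == '\0';	/* empty augmentation is fine too */

	while (*++aug) {
		switch (*aug) {
		case 'L':	/* LSDA encoding: handled elsewhere */
		case 'P':	/* personality routine: handled elsewhere */
		case 'R':	/* FDE pointer encoding: handled elsewhere */
			continue;
		case 'S':	/* signal frame: do not bias the PC back */
			*signal_frame = 1;
			continue;
		default:
			return 0;	/* unknown augmentation: give up */
		}
	}
	return 1;
}

int main(void)
{
	int sig;

	printf("zRS ok=%d signal=%d\n", parse_augmentation("zRS", &sig), sig);
	printf("zX  ok=%d\n", parse_augmentation("zX", &sig));
	return 0;
}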
@@ -755,7 +770,7 @@ int unwind(struct unwind_frame_info *frame)
755 state.org = startLoc; 770 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); 771 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */ 772 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) 773 if (!processCFI(ptr, end, pc, ptrType, &state)
759 || state.loc > endLoc 774 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere 775 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info) 776 || state.cfa.reg >= ARRAY_SIZE(reg_info)
@@ -763,6 +778,11 @@ int unwind(struct unwind_frame_info *frame)
763 || state.cfa.offs % sizeof(unsigned long)) 778 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO; 779 return -EIO;
765 /* update frame */ 780 /* update frame */
781#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
782 if(frame->call_frame
783 && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
784 frame->call_frame = 0;
785#endif
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; 786 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa); 787 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa); 788 endLoc = max((unsigned long)UNW_SP(frame), cfa);
@@ -866,6 +886,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info,
866 /*const*/ struct pt_regs *regs) 886 /*const*/ struct pt_regs *regs)
867{ 887{
868 info->task = tsk; 888 info->task = tsk;
889 info->call_frame = 0;
869 arch_unw_init_frame_info(info, regs); 890 arch_unw_init_frame_info(info, regs);
870 891
871 return 0; 892 return 0;
@@ -879,6 +900,7 @@ int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk) 900 struct task_struct *tsk)
880{ 901{
881 info->task = tsk; 902 info->task = tsk;
903 info->call_frame = 0;
882 arch_unw_init_blocked(info); 904 arch_unw_init_blocked(info);
883 905
884 return 0; 906 return 0;
@@ -894,6 +916,7 @@ int unwind_init_running(struct unwind_frame_info *info,
894 void *arg) 916 void *arg)
895{ 917{
896 info->task = current; 918 info->task = current;
919 info->call_frame = 0;
897 920
898 return arch_unwind_init_running(info, callback, arg); 921 return arch_unwind_init_running(info, callback, arg);
899} 922}