aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c14
-rw-r--r--kernel/auditfilter.c2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/cpu.c138
-rw-r--r--kernel/cpuset.c6
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/power/Kconfig11
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/disk.c7
-rw-r--r--kernel/power/main.c40
-rw-r--r--kernel/power/power.h59
-rw-r--r--kernel/power/smp.c62
-rw-r--r--kernel/power/snapshot.c1155
-rw-r--r--kernel/power/swap.c270
-rw-r--r--kernel/power/swsusp.c5
-rw-r--r--kernel/power/user.c15
-rw-r--r--kernel/printk.c3
-rw-r--r--kernel/profile.c16
-rw-r--r--kernel/sched.c54
-rw-r--r--kernel/sysctl.c11
20 files changed, 1299 insertions, 579 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 963fd15c9621..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
244 char *ctx = NULL; 244 char *ctx = NULL;
245 u32 len; 245 u32 len;
246 int rc; 246 int rc;
247 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 247 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
248 return rc; 248 return rc;
249 else 249 else
250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
267 char *ctx = NULL; 267 char *ctx = NULL;
268 u32 len; 268 u32 len;
269 int rc; 269 int rc;
270 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 270 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
271 return rc; 271 return rc;
272 else 272 else
273 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 273 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
293 char *ctx = NULL; 293 char *ctx = NULL;
294 u32 len; 294 u32 len;
295 int rc; 295 int rc;
296 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 296 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
297 return rc; 297 return rc;
298 else 298 else
299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
321 char *ctx = NULL; 321 char *ctx = NULL;
322 u32 len; 322 u32 len;
323 int rc; 323 int rc;
324 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 324 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
325 return rc; 325 return rc;
326 else 326 else
327 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 327 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
538 if (status_get->mask & AUDIT_STATUS_PID) { 538 if (status_get->mask & AUDIT_STATUS_PID) {
539 int old = audit_pid; 539 int old = audit_pid;
540 if (sid) { 540 if (sid) {
541 if ((err = selinux_ctxid_to_string( 541 if ((err = selinux_sid_to_string(
542 sid, &ctx, &len))) 542 sid, &ctx, &len)))
543 return err; 543 return err;
544 else 544 else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
576 "user pid=%d uid=%u auid=%u", 576 "user pid=%d uid=%u auid=%u",
577 pid, uid, loginuid); 577 pid, uid, loginuid);
578 if (sid) { 578 if (sid) {
579 if (selinux_ctxid_to_string( 579 if (selinux_sid_to_string(
580 sid, &ctx, &len)) { 580 sid, &ctx, &len)) {
581 audit_log_format(ab, 581 audit_log_format(ab,
582 " ssid=%u", sid); 582 " ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
614 loginuid, sid); 614 loginuid, sid);
615 break; 615 break;
616 case AUDIT_SIGNAL_INFO: 616 case AUDIT_SIGNAL_INFO:
617 err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); 617 err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
618 if (err) 618 if (err)
619 return err; 619 return err;
620 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 620 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a44879b0c72f..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1398 if (sid) { 1398 if (sid) {
1399 char *ctx = NULL; 1399 char *ctx = NULL;
1400 u32 len; 1400 u32 len;
1401 if (selinux_ctxid_to_string(sid, &ctx, &len)) 1401 if (selinux_sid_to_string(sid, &ctx, &len))
1402 audit_log_format(ab, " ssid=%u", sid); 1402 audit_log_format(ab, " ssid=%u", sid);
1403 else 1403 else
1404 audit_log_format(ab, " subj=%s", ctx); 1404 audit_log_format(ab, " subj=%s", ctx);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1bd8827a0102..fb83c5cb8c32 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
385 logged upon error */ 385 logged upon error */
386 if (f->se_rule) { 386 if (f->se_rule) {
387 if (need_sid) { 387 if (need_sid) {
388 selinux_task_ctxid(tsk, &sid); 388 selinux_get_task_sid(tsk, &sid);
389 need_sid = 0; 389 need_sid = 0;
390 } 390 }
391 result = selinux_audit_rule_match(sid, f->type, 391 result = selinux_audit_rule_match(sid, f->type,
@@ -898,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
898 if (axi->osid != 0) { 898 if (axi->osid != 0) {
899 char *ctx = NULL; 899 char *ctx = NULL;
900 u32 len; 900 u32 len;
901 if (selinux_ctxid_to_string( 901 if (selinux_sid_to_string(
902 axi->osid, &ctx, &len)) { 902 axi->osid, &ctx, &len)) {
903 audit_log_format(ab, " osid=%u", 903 audit_log_format(ab, " osid=%u",
904 axi->osid); 904 axi->osid);
@@ -1005,7 +1005,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1005 if (n->osid != 0) { 1005 if (n->osid != 0) {
1006 char *ctx = NULL; 1006 char *ctx = NULL;
1007 u32 len; 1007 u32 len;
1008 if (selinux_ctxid_to_string( 1008 if (selinux_sid_to_string(
1009 n->osid, &ctx, &len)) { 1009 n->osid, &ctx, &len)) {
1010 audit_log_format(ab, " osid=%u", n->osid); 1010 audit_log_format(ab, " osid=%u", n->osid);
1011 call_panic = 2; 1011 call_panic = 2;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f230f9ae01c2..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock);
21 21
22static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
23 23
24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
25 * Should always be manipulated under cpu_add_remove_lock
26 */
27static int cpu_hotplug_disabled;
28
24#ifdef CONFIG_HOTPLUG_CPU 29#ifdef CONFIG_HOTPLUG_CPU
25 30
26/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ 31/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -108,30 +113,25 @@ static int take_cpu_down(void *unused)
108 return 0; 113 return 0;
109} 114}
110 115
111int cpu_down(unsigned int cpu) 116/* Requires cpu_add_remove_lock to be held */
117static int _cpu_down(unsigned int cpu)
112{ 118{
113 int err; 119 int err;
114 struct task_struct *p; 120 struct task_struct *p;
115 cpumask_t old_allowed, tmp; 121 cpumask_t old_allowed, tmp;
116 122
117 mutex_lock(&cpu_add_remove_lock); 123 if (num_online_cpus() == 1)
118 if (num_online_cpus() == 1) { 124 return -EBUSY;
119 err = -EBUSY;
120 goto out;
121 }
122 125
123 if (!cpu_online(cpu)) { 126 if (!cpu_online(cpu))
124 err = -EINVAL; 127 return -EINVAL;
125 goto out;
126 }
127 128
128 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 129 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
129 (void *)(long)cpu); 130 (void *)(long)cpu);
130 if (err == NOTIFY_BAD) { 131 if (err == NOTIFY_BAD) {
131 printk("%s: attempt to take down CPU %u failed\n", 132 printk("%s: attempt to take down CPU %u failed\n",
132 __FUNCTION__, cpu); 133 __FUNCTION__, cpu);
133 err = -EINVAL; 134 return -EINVAL;
134 goto out;
135 } 135 }
136 136
137 /* Ensure that we are not runnable on dying cpu */ 137 /* Ensure that we are not runnable on dying cpu */
@@ -179,22 +179,32 @@ out_thread:
179 err = kthread_stop(p); 179 err = kthread_stop(p);
180out_allowed: 180out_allowed:
181 set_cpus_allowed(current, old_allowed); 181 set_cpus_allowed(current, old_allowed);
182out: 182 return err;
183}
184
185int cpu_down(unsigned int cpu)
186{
187 int err = 0;
188
189 mutex_lock(&cpu_add_remove_lock);
190 if (cpu_hotplug_disabled)
191 err = -EBUSY;
192 else
193 err = _cpu_down(cpu);
194
183 mutex_unlock(&cpu_add_remove_lock); 195 mutex_unlock(&cpu_add_remove_lock);
184 return err; 196 return err;
185} 197}
186#endif /*CONFIG_HOTPLUG_CPU*/ 198#endif /*CONFIG_HOTPLUG_CPU*/
187 199
188int __devinit cpu_up(unsigned int cpu) 200/* Requires cpu_add_remove_lock to be held */
201static int __devinit _cpu_up(unsigned int cpu)
189{ 202{
190 int ret; 203 int ret;
191 void *hcpu = (void *)(long)cpu; 204 void *hcpu = (void *)(long)cpu;
192 205
193 mutex_lock(&cpu_add_remove_lock); 206 if (cpu_online(cpu) || !cpu_present(cpu))
194 if (cpu_online(cpu) || !cpu_present(cpu)) { 207 return -EINVAL;
195 ret = -EINVAL;
196 goto out;
197 }
198 208
199 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 209 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
200 if (ret == NOTIFY_BAD) { 210 if (ret == NOTIFY_BAD) {
@@ -219,7 +229,95 @@ out_notify:
219 if (ret != 0) 229 if (ret != 0)
220 blocking_notifier_call_chain(&cpu_chain, 230 blocking_notifier_call_chain(&cpu_chain,
221 CPU_UP_CANCELED, hcpu); 231 CPU_UP_CANCELED, hcpu);
232
233 return ret;
234}
235
236int __devinit cpu_up(unsigned int cpu)
237{
238 int err = 0;
239
240 mutex_lock(&cpu_add_remove_lock);
241 if (cpu_hotplug_disabled)
242 err = -EBUSY;
243 else
244 err = _cpu_up(cpu);
245
246 mutex_unlock(&cpu_add_remove_lock);
247 return err;
248}
249
250#ifdef CONFIG_SUSPEND_SMP
251static cpumask_t frozen_cpus;
252
253int disable_nonboot_cpus(void)
254{
255 int cpu, first_cpu, error;
256
257 mutex_lock(&cpu_add_remove_lock);
258 first_cpu = first_cpu(cpu_present_map);
259 if (!cpu_online(first_cpu)) {
260 error = _cpu_up(first_cpu);
261 if (error) {
262 printk(KERN_ERR "Could not bring CPU%d up.\n",
263 first_cpu);
264 goto out;
265 }
266 }
267 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
268 if (error) {
269 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
270 goto out;
271 }
272 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time
274 */
275 cpus_clear(frozen_cpus);
276 printk("Disabling non-boot CPUs ...\n");
277 for_each_online_cpu(cpu) {
278 if (cpu == first_cpu)
279 continue;
280 error = _cpu_down(cpu);
281 if (!error) {
282 cpu_set(cpu, frozen_cpus);
283 printk("CPU%d is down\n", cpu);
284 } else {
285 printk(KERN_ERR "Error taking CPU%d down: %d\n",
286 cpu, error);
287 break;
288 }
289 }
290 if (!error) {
291 BUG_ON(num_online_cpus() > 1);
292 /* Make sure the CPUs won't be enabled by someone else */
293 cpu_hotplug_disabled = 1;
294 } else {
295 printk(KERN_ERR "Non-boot CPUs are not disabled");
296 }
222out: 297out:
223 mutex_unlock(&cpu_add_remove_lock); 298 mutex_unlock(&cpu_add_remove_lock);
224 return ret; 299 return error;
300}
301
302void enable_nonboot_cpus(void)
303{
304 int cpu, error;
305
306 /* Allow everyone to use the CPU hotplug again */
307 mutex_lock(&cpu_add_remove_lock);
308 cpu_hotplug_disabled = 0;
309 mutex_unlock(&cpu_add_remove_lock);
310
311 printk("Enabling non-boot CPUs ...\n");
312 for_each_cpu_mask(cpu, frozen_cpus) {
313 error = cpu_up(cpu);
314 if (!error) {
315 printk("CPU%d is up\n", cpu);
316 continue;
317 }
318 printk(KERN_WARNING "Error taking CPU%d up: %d\n",
319 cpu, error);
320 }
321 cpus_clear(frozen_cpus);
225} 322}
323#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc5..cff41511269f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2245,7 +2245,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2245 int i; 2245 int i;
2246 2246
2247 for (i = 0; zl->zones[i]; i++) { 2247 for (i = 0; zl->zones[i]; i++) {
2248 int nid = zl->zones[i]->zone_pgdat->node_id; 2248 int nid = zone_to_nid(zl->zones[i]);
2249 2249
2250 if (node_isset(nid, current->mems_allowed)) 2250 if (node_isset(nid, current->mems_allowed))
2251 return 1; 2251 return 1;
@@ -2316,9 +2316,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2316 const struct cpuset *cs; /* current cpuset ancestors */ 2316 const struct cpuset *cs; /* current cpuset ancestors */
2317 int allowed; /* is allocation in zone z allowed? */ 2317 int allowed; /* is allocation in zone z allowed? */
2318 2318
2319 if (in_interrupt()) 2319 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2320 return 1; 2320 return 1;
2321 node = z->zone_pgdat->node_id; 2321 node = zone_to_nid(z);
2322 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2322 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2323 if (node_isset(node, current->mems_allowed)) 2323 if (node_isset(node, current->mems_allowed))
2324 return 1; 2324 return 1;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48a53f68af96..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
154 return retval; 154 return retval;
155} 155}
156 156
157#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
157/** 158/**
158 * __do_IRQ - original all in one highlevel IRQ handler 159 * __do_IRQ - original all in one highlevel IRQ handler
159 * @irq: the interrupt number 160 * @irq: the interrupt number
@@ -253,6 +254,7 @@ out:
253 254
254 return 1; 255 return 1;
255} 256}
257#endif
256 258
257#ifdef CONFIG_TRACE_IRQFLAGS 259#ifdef CONFIG_TRACE_IRQFLAGS
258 260
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 1ed972070d19..825068ca3479 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,17 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config DISABLE_CONSOLE_SUSPEND
40 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
41 depends on PM && PM_DEBUG
42 default n
43 ---help---
44 This option turns off the console suspend mechanism that prevents
45 debug messages from reaching the console during the suspend/resume
46 operations. This may be helpful when debugging device drivers'
47 suspend/resume routines, but may itself lead to problems, for example
48 if netconsole is used.
49
39config PM_TRACE 50config PM_TRACE
40 bool "Suspend/resume event tracing" 51 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL 52 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index a3c34fb14321..d72234942798 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/cpu.h>
21 22
22#include "power.h" 23#include "power.h"
23 24
@@ -72,7 +73,10 @@ static int prepare_processes(void)
72 int error; 73 int error;
73 74
74 pm_prepare_console(); 75 pm_prepare_console();
75 disable_nonboot_cpus(); 76
77 error = disable_nonboot_cpus();
78 if (error)
79 goto enable_cpus;
76 80
77 if (freeze_processes()) { 81 if (freeze_processes()) {
78 error = -EBUSY; 82 error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
84 return 0; 88 return 0;
85thaw: 89thaw:
86 thaw_processes(); 90 thaw_processes();
91enable_cpus:
87 enable_nonboot_cpus(); 92 enable_nonboot_cpus();
88 pm_restore_console(); 93 pm_restore_console();
89 return error; 94 return error;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c776794..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18#include <linux/console.h> 18#include <linux/console.h>
19#include <linux/cpu.h>
20#include <linux/resume-trace.h>
19 21
20#include "power.h" 22#include "power.h"
21 23
@@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops)
51 53
52static int suspend_prepare(suspend_state_t state) 54static int suspend_prepare(suspend_state_t state)
53{ 55{
54 int error = 0; 56 int error;
55 unsigned int free_pages; 57 unsigned int free_pages;
56 58
57 if (!pm_ops || !pm_ops->enter) 59 if (!pm_ops || !pm_ops->enter)
@@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state)
59 61
60 pm_prepare_console(); 62 pm_prepare_console();
61 63
62 disable_nonboot_cpus(); 64 error = disable_nonboot_cpus();
63 65 if (error)
64 if (num_online_cpus() != 1) {
65 error = -EPERM;
66 goto Enable_cpu; 66 goto Enable_cpu;
67 }
68 67
69 if (freeze_processes()) { 68 if (freeze_processes()) {
70 error = -EAGAIN; 69 error = -EAGAIN;
@@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
283 282
284power_attr(state); 283power_attr(state);
285 284
285#ifdef CONFIG_PM_TRACE
286int pm_trace_enabled;
287
288static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
289{
290 return sprintf(buf, "%d\n", pm_trace_enabled);
291}
292
293static ssize_t
294pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
295{
296 int val;
297
298 if (sscanf(buf, "%d", &val) == 1) {
299 pm_trace_enabled = !!val;
300 return n;
301 }
302 return -EINVAL;
303}
304
305power_attr(pm_trace);
306
307static struct attribute * g[] = {
308 &state_attr.attr,
309 &pm_trace_attr.attr,
310 NULL,
311};
312#else
286static struct attribute * g[] = { 313static struct attribute * g[] = {
287 &state_attr.attr, 314 &state_attr.attr,
288 NULL, 315 NULL,
289}; 316};
317#endif /* CONFIG_PM_TRACE */
290 318
291static struct attribute_group attr_group = { 319static struct attribute_group attr_group = {
292 .attrs = g, 320 .attrs = g,
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 57a792982fb9..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
38/* References to section boundaries */ 38/* References to section boundaries */
39extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
40 40
41extern struct pbe *pagedir_nosave;
42
43/* Preferred image size in bytes (default 500 MB) */ 41/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 42extern unsigned long image_size;
45extern int in_suspend; 43extern int in_suspend;
@@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void);
50 48
51extern unsigned int count_data_pages(void); 49extern unsigned int count_data_pages(void);
52 50
51/**
52 * Auxiliary structure used for reading the snapshot image data and
53 * metadata from and writing them to the list of page backup entries
54 * (PBEs) which is the main data structure of swsusp.
55 *
56 * Using struct snapshot_handle we can transfer the image, including its
57 * metadata, as a continuous sequence of bytes with the help of
58 * snapshot_read_next() and snapshot_write_next().
59 *
60 * The code that writes the image to a storage or transfers it to
61 * the user land is required to use snapshot_read_next() for this
62 * purpose and it should not make any assumptions regarding the internal
63 * structure of the image. Similarly, the code that reads the image from
64 * a storage or transfers it from the user land is required to use
65 * snapshot_write_next().
66 *
67 * This may allow us to change the internal structure of the image
68 * in the future with considerably less effort.
69 */
70
53struct snapshot_handle { 71struct snapshot_handle {
54 loff_t offset; 72 loff_t offset; /* number of the last byte ready for reading
55 unsigned int page; 73 * or writing in the sequence
56 unsigned int page_offset; 74 */
57 unsigned int prev; 75 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
58 struct pbe *pbe, *last_pbe; 76 * next operation will refer to (ie. current)
59 void *buffer; 77 */
60 unsigned int buf_offset; 78 unsigned int cur_offset; /* offset with respect to the current
79 * block (for the next operation)
80 */
81 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
82 * was the current one previously
83 */
84 void *buffer; /* address of the block to read from
85 * or write to
86 */
87 unsigned int buf_offset; /* location to read from or write to,
88 * given as a displacement from 'buffer'
89 */
90 int sync_read; /* Set to one to notify the caller of
91 * snapshot_write_next() that it may
92 * need to call wait_on_bio_chain()
93 */
61}; 94};
62 95
96/* This macro returns the address from/to which the caller of
97 * snapshot_read_next()/snapshot_write_next() is allowed to
98 * read/write data after the function returns
99 */
63#define data_of(handle) ((handle).buffer + (handle).buf_offset) 100#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64 101
102extern unsigned int snapshot_additional_pages(struct zone *zone);
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle); 105extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
68 107
69#define SNAPSHOT_IOC_MAGIC '3' 108#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
1/*
2 * drivers/power/smp.c - Functions for stopping other CPUs.
3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#undef DEBUG
11
12#include <linux/smp_lock.h>
13#include <linux/interrupt.h>
14#include <linux/suspend.h>
15#include <linux/module.h>
16#include <linux/cpu.h>
17#include <asm/atomic.h>
18#include <asm/tlbflush.h>
19
20/* This is protected by pm_sem semaphore */
21static cpumask_t frozen_cpus;
22
23void disable_nonboot_cpus(void)
24{
25 int cpu, error;
26
27 error = 0;
28 cpus_clear(frozen_cpus);
29 printk("Freezing cpus ...\n");
30 for_each_online_cpu(cpu) {
31 if (cpu == 0)
32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
40 }
41 BUG_ON(raw_smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
44}
45
46void enable_nonboot_cpus(void)
47{
48 int cpu, error;
49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = cpu_up(cpu);
53 if (!error) {
54 printk("CPU%d is up\n", cpu);
55 continue;
56 }
57 printk("Error taking cpu %d up: %d\n", cpu, error);
58 panic("Not enough cpus");
59 }
60 cpus_clear(frozen_cpus);
61}
62
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 75d4886e648e..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,10 +34,12 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37struct pbe *pagedir_nosave; 37/* List of PBEs used for creating and restoring the suspend image */
38struct pbe *restore_pblist;
39
38static unsigned int nr_copy_pages; 40static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages; 41static unsigned int nr_meta_pages;
40static unsigned long *buffer; 42static void *buffer;
41 43
42#ifdef CONFIG_HIGHMEM 44#ifdef CONFIG_HIGHMEM
43unsigned int count_highmem_pages(void) 45unsigned int count_highmem_pages(void)
@@ -156,240 +158,637 @@ static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;} 158static inline int restore_highmem(void) {return 0;}
157#endif 159#endif
158 160
159static int pfn_is_nosave(unsigned long pfn) 161/**
162 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages
164 * used before suspend.
165 *
166 * The unsafe pages are marked with the PG_nosave_free flag
167 * and we count them using unsafe_pages
168 */
169
170#define PG_ANY 0
171#define PG_SAFE 1
172#define PG_UNSAFE_CLEAR 1
173#define PG_UNSAFE_KEEP 0
174
175static unsigned int allocated_unsafe_pages;
176
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
160{ 178{
161 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; 179 void *res;
162 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; 180
163 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); 181 res = (void *)get_zeroed_page(gfp_mask);
182 if (safe_needed)
183 while (res && PageNosaveFree(virt_to_page(res))) {
184 /* The page is unsafe, mark it for swsusp_free() */
185 SetPageNosave(virt_to_page(res));
186 allocated_unsafe_pages++;
187 res = (void *)get_zeroed_page(gfp_mask);
188 }
189 if (res) {
190 SetPageNosave(virt_to_page(res));
191 SetPageNosaveFree(virt_to_page(res));
192 }
193 return res;
194}
195
196unsigned long get_safe_page(gfp_t gfp_mask)
197{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
164} 199}
165 200
166/** 201/**
167 * saveable - Determine whether a page should be cloned or not. 202 * free_image_page - free page represented by @addr, allocated with
168 * @pfn: The page 203 * alloc_image_page (page flags set by it must be cleared)
169 *
170 * We save a page if it's Reserved, and not in the range of pages
171 * statically defined as 'unsaveable', or if it isn't reserved, and
172 * isn't part of a free chunk of pages.
173 */ 204 */
174 205
175static int saveable(struct zone *zone, unsigned long *zone_pfn) 206static inline void free_image_page(void *addr, int clear_nosave_free)
176{ 207{
177 unsigned long pfn = *zone_pfn + zone->zone_start_pfn; 208 ClearPageNosave(virt_to_page(addr));
178 struct page *page; 209 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr));
211 free_page((unsigned long)addr);
212}
179 213
180 if (!pfn_valid(pfn)) 214/* struct linked_page is used to build chains of pages */
181 return 0;
182 215
183 page = pfn_to_page(pfn); 216#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
184 BUG_ON(PageReserved(page) && PageNosave(page));
185 if (PageNosave(page))
186 return 0;
187 if (PageReserved(page) && pfn_is_nosave(pfn))
188 return 0;
189 if (PageNosaveFree(page))
190 return 0;
191 217
192 return 1; 218struct linked_page {
193} 219 struct linked_page *next;
220 char data[LINKED_PAGE_DATA_SIZE];
221} __attribute__((packed));
194 222
195unsigned int count_data_pages(void) 223static inline void
224free_list_of_pages(struct linked_page *list, int clear_page_nosave)
196{ 225{
197 struct zone *zone; 226 while (list) {
198 unsigned long zone_pfn; 227 struct linked_page *lp = list->next;
199 unsigned int n = 0;
200 228
201 for_each_zone (zone) { 229 free_image_page(list, clear_page_nosave);
202 if (is_highmem(zone)) 230 list = lp;
203 continue;
204 mark_free_pages(zone);
205 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
206 n += saveable(zone, &zone_pfn);
207 } 231 }
208 return n;
209} 232}
210 233
211static void copy_data_pages(struct pbe *pblist) 234/**
235 * struct chain_allocator is used for allocating small objects out of
236 * a linked list of pages called 'the chain'.
237 *
238 * The chain grows each time when there is no room for a new object in
239 * the current page. The allocated objects cannot be freed individually.
240 * It is only possible to free them all at once, by freeing the entire
241 * chain.
242 *
243 * NOTE: The chain allocator may be inefficient if the allocated objects
244 * are not much smaller than PAGE_SIZE.
245 */
246
247struct chain_allocator {
248 struct linked_page *chain; /* the chain */
249 unsigned int used_space; /* total size of objects allocated out
250 * of the current page
251 */
252 gfp_t gfp_mask; /* mask for allocating pages */
253 int safe_needed; /* if set, only "safe" pages are allocated */
254};
255
256static void
257chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
212{ 258{
213 struct zone *zone; 259 ca->chain = NULL;
214 unsigned long zone_pfn; 260 ca->used_space = LINKED_PAGE_DATA_SIZE;
215 struct pbe *pbe, *p; 261 ca->gfp_mask = gfp_mask;
262 ca->safe_needed = safe_needed;
263}
216 264
217 pbe = pblist; 265static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
218 for_each_zone (zone) { 266{
219 if (is_highmem(zone)) 267 void *ret;
220 continue; 268
221 mark_free_pages(zone); 269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
222 /* This is necessary for swsusp_free() */ 270 struct linked_page *lp;
223 for_each_pb_page (p, pblist) 271
224 SetPageNosaveFree(virt_to_page(p)); 272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
225 for_each_pbe (p, pblist) 273 if (!lp)
226 SetPageNosaveFree(virt_to_page(p->address)); 274 return NULL;
227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 275
228 if (saveable(zone, &zone_pfn)) { 276 lp->next = ca->chain;
229 struct page *page; 277 ca->chain = lp;
230 long *src, *dst; 278 ca->used_space = 0;
231 int n;
232
233 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
234 BUG_ON(!pbe);
235 pbe->orig_address = (unsigned long)page_address(page);
236 /* copy_page and memcpy are not usable for copying task structs. */
237 dst = (long *)pbe->address;
238 src = (long *)pbe->orig_address;
239 for (n = PAGE_SIZE / sizeof(long); n; n--)
240 *dst++ = *src++;
241 pbe = pbe->next;
242 }
243 }
244 } 279 }
245 BUG_ON(pbe); 280 ret = ca->chain->data + ca->used_space;
281 ca->used_space += size;
282 return ret;
246} 283}
247 284
285static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
286{
287 free_list_of_pages(ca->chain, clear_page_nosave);
288 memset(ca, 0, sizeof(struct chain_allocator));
289}
248 290
249/** 291/**
250 * free_pagedir - free pages allocated with alloc_pagedir() 292 * Data types related to memory bitmaps.
293 *
294 * Memory bitmap is a structure consiting of many linked lists of
295 * objects. The main list's elements are of type struct zone_bitmap
296 * and each of them corresonds to one zone. For each zone bitmap
297 * object there is a list of objects of type struct bm_block that
298 * represent each blocks of bit chunks in which information is
299 * stored.
300 *
301 * struct memory_bitmap contains a pointer to the main list of zone
302 * bitmap objects, a struct bm_position used for browsing the bitmap,
303 * and a pointer to the list of pages used for allocating all of the
304 * zone bitmap objects and bitmap block objects.
305 *
306 * NOTE: It has to be possible to lay out the bitmap in memory
307 * using only allocations of order 0. Additionally, the bitmap is
308 * designed to work with arbitrary number of zones (this is over the
309 * top for now, but let's avoid making unnecessary assumptions ;-).
310 *
311 * struct zone_bitmap contains a pointer to a list of bitmap block
312 * objects and a pointer to the bitmap block object that has been
313 * most recently used for setting bits. Additionally, it contains the
314 * pfns that correspond to the start and end of the represented zone.
315 *
316 * struct bm_block contains a pointer to the memory page in which
317 * information is stored (in the form of a block of bit chunks
318 * of type unsigned long each). It also contains the pfns that
319 * correspond to the start and end of the represented memory area and
320 * the number of bit chunks in the block.
321 *
322 * NOTE: Memory bitmaps are used for two types of operations only:
323 * "set a bit" and "find the next bit set". Moreover, the searching
324 * is always carried out after all of the "set a bit" operations
325 * on given bitmap.
251 */ 326 */
252 327
253static void free_pagedir(struct pbe *pblist, int clear_nosave_free) 328#define BM_END_OF_MAP (~0UL)
329
330#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
331#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
332#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
333
334struct bm_block {
335 struct bm_block *next; /* next element of the list */
336 unsigned long start_pfn; /* pfn represented by the first bit */
337 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
338 unsigned int size; /* number of bit chunks */
339 unsigned long *data; /* chunks of bits representing pages */
340};
341
342struct zone_bitmap {
343 struct zone_bitmap *next; /* next element of the list */
344 unsigned long start_pfn; /* minimal pfn in this zone */
345 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
346 struct bm_block *bm_blocks; /* list of bitmap blocks */
347 struct bm_block *cur_block; /* recently used bitmap block */
348};
349
350 /* struct bm_position is used for browsing memory bitmaps */
351
352struct bm_position {
353 struct zone_bitmap *zone_bm;
354 struct bm_block *block;
355 int chunk;
356 int bit;
357};
358
359struct memory_bitmap {
360 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
361 struct linked_page *p_list; /* list of pages used to store zone
362 * bitmap objects and bitmap block
363 * objects
364 */
365 struct bm_position cur; /* most recently used bit position */
366};
367
368/* Functions that operate on memory bitmaps */
369
370static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
254{ 371{
255 struct pbe *pbe; 372 bm->cur.chunk = 0;
373 bm->cur.bit = -1;
374}
256 375
257 while (pblist) { 376static void memory_bm_position_reset(struct memory_bitmap *bm)
258 pbe = (pblist + PB_PAGE_SKIP)->next; 377{
259 ClearPageNosave(virt_to_page(pblist)); 378 struct zone_bitmap *zone_bm;
260 if (clear_nosave_free) 379
261 ClearPageNosaveFree(virt_to_page(pblist)); 380 zone_bm = bm->zone_bm_list;
262 free_page((unsigned long)pblist); 381 bm->cur.zone_bm = zone_bm;
263 pblist = pbe; 382 bm->cur.block = zone_bm->bm_blocks;
264 } 383 memory_bm_reset_chunk(bm);
265} 384}
266 385
386static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
387
267/** 388/**
268 * fill_pb_page - Create a list of PBEs on a given memory page 389 * create_bm_block_list - create a list of block bitmap objects
269 */ 390 */
270 391
271static inline void fill_pb_page(struct pbe *pbpage) 392static inline struct bm_block *
393create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
272{ 394{
273 struct pbe *p; 395 struct bm_block *bblist = NULL;
396
397 while (nr_blocks-- > 0) {
398 struct bm_block *bb;
274 399
275 p = pbpage; 400 bb = chain_alloc(ca, sizeof(struct bm_block));
276 pbpage += PB_PAGE_SKIP; 401 if (!bb)
277 do 402 return NULL;
278 p->next = p + 1; 403
279 while (++p < pbpage); 404 bb->next = bblist;
405 bblist = bb;
406 }
407 return bblist;
280} 408}
281 409
282/** 410/**
283 * create_pbe_list - Create a list of PBEs on top of a given chain 411 * create_zone_bm_list - create a list of zone bitmap objects
284 * of memory pages allocated with alloc_pagedir()
285 */ 412 */
286 413
287static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 414static inline struct zone_bitmap *
415create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
288{ 416{
289 struct pbe *pbpage, *p; 417 struct zone_bitmap *zbmlist = NULL;
290 unsigned int num = PBES_PER_PAGE;
291 418
292 for_each_pb_page (pbpage, pblist) { 419 while (nr_zones-- > 0) {
293 if (num >= nr_pages) 420 struct zone_bitmap *zbm;
294 break; 421
422 zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
423 if (!zbm)
424 return NULL;
425
426 zbm->next = zbmlist;
427 zbmlist = zbm;
428 }
429 return zbmlist;
430}
431
432/**
433 * memory_bm_create - allocate memory for a memory bitmap
434 */
435
436static int
437memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
438{
439 struct chain_allocator ca;
440 struct zone *zone;
441 struct zone_bitmap *zone_bm;
442 struct bm_block *bb;
443 unsigned int nr;
444
445 chain_init(&ca, gfp_mask, safe_needed);
295 446
296 fill_pb_page(pbpage); 447 /* Compute the number of zones */
297 num += PBES_PER_PAGE; 448 nr = 0;
449 for_each_zone (zone)
450 if (populated_zone(zone) && !is_highmem(zone))
451 nr++;
452
453 /* Allocate the list of zones bitmap objects */
454 zone_bm = create_zone_bm_list(nr, &ca);
455 bm->zone_bm_list = zone_bm;
456 if (!zone_bm) {
457 chain_free(&ca, PG_UNSAFE_CLEAR);
458 return -ENOMEM;
298 } 459 }
299 if (pbpage) { 460
300 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) 461 /* Initialize the zone bitmap objects */
301 p->next = p + 1; 462 for_each_zone (zone) {
302 p->next = NULL; 463 unsigned long pfn;
464
465 if (!populated_zone(zone) || is_highmem(zone))
466 continue;
467
468 zone_bm->start_pfn = zone->zone_start_pfn;
469 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
470 /* Allocate the list of bitmap block objects */
471 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
472 bb = create_bm_block_list(nr, &ca);
473 zone_bm->bm_blocks = bb;
474 zone_bm->cur_block = bb;
475 if (!bb)
476 goto Free;
477
478 nr = zone->spanned_pages;
479 pfn = zone->zone_start_pfn;
480 /* Initialize the bitmap block objects */
481 while (bb) {
482 unsigned long *ptr;
483
484 ptr = alloc_image_page(gfp_mask, safe_needed);
485 bb->data = ptr;
486 if (!ptr)
487 goto Free;
488
489 bb->start_pfn = pfn;
490 if (nr >= BM_BITS_PER_BLOCK) {
491 pfn += BM_BITS_PER_BLOCK;
492 bb->size = BM_CHUNKS_PER_BLOCK;
493 nr -= BM_BITS_PER_BLOCK;
494 } else {
495 /* This is executed only once in the loop */
496 pfn += nr;
497 bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
498 }
499 bb->end_pfn = pfn;
500 bb = bb->next;
501 }
502 zone_bm = zone_bm->next;
303 } 503 }
504 bm->p_list = ca.chain;
505 memory_bm_position_reset(bm);
506 return 0;
507
508Free:
509 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM;
304} 512}
305 513
306static unsigned int unsafe_pages; 514/**
515 * memory_bm_free - free memory occupied by the memory bitmap @bm
516 */
517
518static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
519{
520 struct zone_bitmap *zone_bm;
521
522 /* Free the list of bit blocks for each zone_bitmap object */
523 zone_bm = bm->zone_bm_list;
524 while (zone_bm) {
525 struct bm_block *bb;
526
527 bb = zone_bm->bm_blocks;
528 while (bb) {
529 if (bb->data)
530 free_image_page(bb->data, clear_nosave_free);
531 bb = bb->next;
532 }
533 zone_bm = zone_bm->next;
534 }
535 free_list_of_pages(bm->p_list, clear_nosave_free);
536 bm->zone_bm_list = NULL;
537}
307 538
308/** 539/**
309 * @safe_needed - on resume, for storing the PBE list and the image, 540 * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
310 * we can only use memory pages that do not conflict with the pages 541 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
311 * used before suspend. 542 * of @bm->cur_zone_bm are updated.
312 * 543 *
313 * The unsafe pages are marked with the PG_nosave_free flag 544 * If the bit cannot be set, the function returns -EINVAL .
314 * and we count them using unsafe_pages
315 */ 545 */
316 546
317static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 547static int
548memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
318{ 549{
319 void *res; 550 struct zone_bitmap *zone_bm;
320 551 struct bm_block *bb;
321 res = (void *)get_zeroed_page(gfp_mask); 552
322 if (safe_needed) 553 /* Check if the pfn is from the current zone */
323 while (res && PageNosaveFree(virt_to_page(res))) { 554 zone_bm = bm->cur.zone_bm;
324 /* The page is unsafe, mark it for swsusp_free() */ 555 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
325 SetPageNosave(virt_to_page(res)); 556 zone_bm = bm->zone_bm_list;
326 unsafe_pages++; 557 /* We don't assume that the zones are sorted by pfns */
327 res = (void *)get_zeroed_page(gfp_mask); 558 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
559 zone_bm = zone_bm->next;
560 if (unlikely(!zone_bm))
561 return -EINVAL;
328 } 562 }
329 if (res) { 563 bm->cur.zone_bm = zone_bm;
330 SetPageNosave(virt_to_page(res));
331 SetPageNosaveFree(virt_to_page(res));
332 } 564 }
333 return res; 565 /* Check if the pfn corresponds to the current bitmap block */
566 bb = zone_bm->cur_block;
567 if (pfn < bb->start_pfn)
568 bb = zone_bm->bm_blocks;
569
570 while (pfn >= bb->end_pfn) {
571 bb = bb->next;
572 if (unlikely(!bb))
573 return -EINVAL;
574 }
575 zone_bm->cur_block = bb;
576 pfn -= bb->start_pfn;
577 set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
578 return 0;
334} 579}
335 580
336unsigned long get_safe_page(gfp_t gfp_mask) 581/* Two auxiliary functions for memory_bm_next_pfn */
582
583/* Find the first set bit in the given chunk, if there is one */
584
585static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
337{ 586{
338 return (unsigned long)alloc_image_page(gfp_mask, 1); 587 bit++;
588 while (bit < BM_BITS_PER_CHUNK) {
589 if (test_bit(bit, chunk_p))
590 return bit;
591
592 bit++;
593 }
594 return -1;
595}
596
597/* Find a chunk containing some bits set in given block of bits */
598
599static inline int next_chunk_in_block(int n, struct bm_block *bb)
600{
601 n++;
602 while (n < bb->size) {
603 if (bb->data[n])
604 return n;
605
606 n++;
607 }
608 return -1;
339} 609}
340 610
341/** 611/**
342 * alloc_pagedir - Allocate the page directory. 612 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
343 * 613 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
344 * First, determine exactly how many pages we need and 614 * returned.
345 * allocate them.
346 * 615 *
347 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE 616 * It is required to run memory_bm_position_reset() before the first call to
348 * struct pbe elements (pbes) and the last element in the page points 617 * this function.
349 * to the next page. 618 */
619
620static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
621{
622 struct zone_bitmap *zone_bm;
623 struct bm_block *bb;
624 int chunk;
625 int bit;
626
627 do {
628 bb = bm->cur.block;
629 do {
630 chunk = bm->cur.chunk;
631 bit = bm->cur.bit;
632 do {
633 bit = next_bit_in_chunk(bit, bb->data + chunk);
634 if (bit >= 0)
635 goto Return_pfn;
636
637 chunk = next_chunk_in_block(chunk, bb);
638 bit = -1;
639 } while (chunk >= 0);
640 bb = bb->next;
641 bm->cur.block = bb;
642 memory_bm_reset_chunk(bm);
643 } while (bb);
644 zone_bm = bm->cur.zone_bm->next;
645 if (zone_bm) {
646 bm->cur.zone_bm = zone_bm;
647 bm->cur.block = zone_bm->bm_blocks;
648 memory_bm_reset_chunk(bm);
649 }
650 } while (zone_bm);
651 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP;
653
654Return_pfn:
655 bm->cur.chunk = chunk;
656 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
658}
659
660/**
661 * snapshot_additional_pages - estimate the number of additional pages
662 * needed for setting up the suspend image data structures for given
663 * zone (usually the returned value is greater than the exact number)
664 */
665
666unsigned int snapshot_additional_pages(struct zone *zone)
667{
668 unsigned int res;
669
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res;
673}
674
675/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */
678
679static inline int pfn_is_nosave(unsigned long pfn)
680{
681 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
682 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
683 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
684}
685
686/**
687 * saveable_page - Determine whether a page should be cloned or not.
688 * @pfn: The page
350 * 689 *
351 * On each page we set up a list of struct_pbe elements. 690 * We save a page if it isn't Nosave, and is not in the range of pages
691 * statically defined as 'unsaveable', and it
692 * isn't a part of a free chunk of pages.
352 */ 693 */
353 694
354static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, 695static struct page *saveable_page(unsigned long pfn)
355 int safe_needed)
356{ 696{
357 unsigned int num; 697 struct page *page;
358 struct pbe *pblist, *pbe; 698
699 if (!pfn_valid(pfn))
700 return NULL;
359 701
360 if (!nr_pages) 702 page = pfn_to_page(pfn);
703
704 if (PageNosave(page))
705 return NULL;
706 if (PageReserved(page) && pfn_is_nosave(pfn))
361 return NULL; 707 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710
711 return page;
712}
713
714unsigned int count_data_pages(void)
715{
716 struct zone *zone;
717 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0;
362 719
363 pblist = alloc_image_page(gfp_mask, safe_needed); 720 for_each_zone (zone) {
364 /* FIXME: rewrite this ugly loop */ 721 if (is_highmem(zone))
365 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 722 continue;
366 pbe = pbe->next, num += PBES_PER_PAGE) { 723 mark_free_pages(zone);
367 pbe += PB_PAGE_SKIP; 724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 pbe->next = alloc_image_page(gfp_mask, safe_needed); 725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn);
369 } 727 }
370 if (!pbe) { /* get_zeroed_page() failed */ 728 return n;
371 free_pagedir(pblist, 1); 729}
372 pblist = NULL; 730
373 } else 731static inline void copy_data_page(long *dst, long *src)
374 create_pbe_list(pblist, nr_pages); 732{
375 return pblist; 733 int n;
734
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++;
738}
739
740static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{
743 struct zone *zone;
744 unsigned long pfn;
745
746 for_each_zone (zone) {
747 unsigned long max_zone_pfn;
748
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn))
756 memory_bm_set_bit(orig_bm, pfn);
757 }
758 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm);
760 do {
761 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) {
763 struct page *page;
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP);
376} 772}
377 773
378/** 774/**
379 * Free pages we allocated for suspend. Suspend pages are alocated 775 * swsusp_free - free pages allocated for the suspend.
380 * before atomic copy, so we need to free them after resume. 776 *
777 * Suspend pages are allocated before the atomic copy is made, so we
778 * need to release them after the resume.
381 */ 779 */
382 780
383void swsusp_free(void) 781void swsusp_free(void)
384{ 782{
385 struct zone *zone; 783 struct zone *zone;
386 unsigned long zone_pfn; 784 unsigned long pfn, max_zone_pfn;
387 785
388 for_each_zone(zone) { 786 for_each_zone(zone) {
389 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
390 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
391 struct page *page; 789 if (pfn_valid(pfn)) {
392 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 790 struct page *page = pfn_to_page(pfn);
791
393 if (PageNosave(page) && PageNosaveFree(page)) { 792 if (PageNosave(page) && PageNosaveFree(page)) {
394 ClearPageNosave(page); 793 ClearPageNosave(page);
395 ClearPageNosaveFree(page); 794 ClearPageNosaveFree(page);
@@ -399,7 +798,7 @@ void swsusp_free(void)
399 } 798 }
400 nr_copy_pages = 0; 799 nr_copy_pages = 0;
401 nr_meta_pages = 0; 800 nr_meta_pages = 0;
402 pagedir_nosave = NULL; 801 restore_pblist = NULL;
403 buffer = NULL; 802 buffer = NULL;
404} 803}
405 804
@@ -414,46 +813,57 @@ void swsusp_free(void)
414static int enough_free_mem(unsigned int nr_pages) 813static int enough_free_mem(unsigned int nr_pages)
415{ 814{
416 struct zone *zone; 815 struct zone *zone;
417 unsigned int n = 0; 816 unsigned int free = 0, meta = 0;
418 817
419 for_each_zone (zone) 818 for_each_zone (zone)
420 if (!is_highmem(zone)) 819 if (!is_highmem(zone)) {
421 n += zone->free_pages; 820 free += zone->free_pages;
422 pr_debug("swsusp: available memory: %u pages\n", n); 821 meta += snapshot_additional_pages(zone);
423 return n > (nr_pages + PAGES_FOR_IO + 822 }
424 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
425}
426 823
427static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
428{ 825 nr_pages, PAGES_FOR_IO, meta, free);
429 struct pbe *p;
430 826
431 for_each_pbe (p, pblist) { 827 return free > nr_pages + PAGES_FOR_IO + meta;
432 p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
433 if (!p->address)
434 return -ENOMEM;
435 }
436 return 0;
437} 828}
438 829
439static struct pbe *swsusp_alloc(unsigned int nr_pages) 830static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages)
440{ 833{
441 struct pbe *pblist; 834 int error;
442 835
443 if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { 836 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
444 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 837 if (error)
445 return NULL; 838 goto Free;
446 }
447 839
448 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 840 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
449 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 841 if (error)
450 swsusp_free(); 842 goto Free;
451 return NULL; 843
844 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
846 if (!page)
847 goto Free;
848
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page));
452 } 852 }
853 return 0;
453 854
454 return pblist; 855Free:
856 swsusp_free();
857 return -ENOMEM;
455} 858}
456 859
860/* Memory bitmap used for marking saveable pages */
861static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies
863 * of saveable pages
864 */
865static struct memory_bitmap copy_bm;
866
457asmlinkage int swsusp_save(void) 867asmlinkage int swsusp_save(void)
458{ 868{
459 unsigned int nr_pages; 869 unsigned int nr_pages;
@@ -464,25 +874,19 @@ asmlinkage int swsusp_save(void)
464 nr_pages = count_data_pages(); 874 nr_pages = count_data_pages();
465 printk("swsusp: Need to copy %u pages\n", nr_pages); 875 printk("swsusp: Need to copy %u pages\n", nr_pages);
466 876
467 pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
468 nr_pages,
469 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
470 PAGES_FOR_IO, nr_free_pages());
471
472 if (!enough_free_mem(nr_pages)) { 877 if (!enough_free_mem(nr_pages)) {
473 printk(KERN_ERR "swsusp: Not enough free memory\n"); 878 printk(KERN_ERR "swsusp: Not enough free memory\n");
474 return -ENOMEM; 879 return -ENOMEM;
475 } 880 }
476 881
477 pagedir_nosave = swsusp_alloc(nr_pages); 882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
478 if (!pagedir_nosave)
479 return -ENOMEM; 883 return -ENOMEM;
480 884
481 /* During allocating of suspend pagedir, new cold pages may appear. 885 /* During allocating of suspend pagedir, new cold pages may appear.
482 * Kill them. 886 * Kill them.
483 */ 887 */
484 drain_local_pages(); 888 drain_local_pages();
485 copy_data_pages(pagedir_nosave); 889 copy_data_pages(&copy_bm, &orig_bm);
486 890
487 /* 891 /*
488 * End of critical section. From now on, we can write to memory, 892 * End of critical section. From now on, we can write to memory,
@@ -511,22 +915,20 @@ static void init_header(struct swsusp_info *info)
511} 915}
512 916
513/** 917/**
514 * pack_orig_addresses - the .orig_address fields of the PBEs from the 918 * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
515 * list starting at @pbe are stored in the array @buf[] (1 page) 919 * are stored in the array @buf[] (1 page at a time)
516 */ 920 */
517 921
518static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) 922static inline void
923pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
519{ 924{
520 int j; 925 int j;
521 926
522 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 927 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
523 buf[j] = pbe->orig_address; 928 buf[j] = memory_bm_next_pfn(bm);
524 pbe = pbe->next; 929 if (unlikely(buf[j] == BM_END_OF_MAP))
930 break;
525 } 931 }
526 if (!pbe)
527 for (; j < PAGE_SIZE / sizeof(long); j++)
528 buf[j] = 0;
529 return pbe;
530} 932}
531 933
532/** 934/**
@@ -553,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
553 955
554int snapshot_read_next(struct snapshot_handle *handle, size_t count) 956int snapshot_read_next(struct snapshot_handle *handle, size_t count)
555{ 957{
556 if (handle->page > nr_meta_pages + nr_copy_pages) 958 if (handle->cur > nr_meta_pages + nr_copy_pages)
557 return 0; 959 return 0;
960
558 if (!buffer) { 961 if (!buffer) {
559 /* This makes the buffer be freed by swsusp_free() */ 962 /* This makes the buffer be freed by swsusp_free() */
560 buffer = alloc_image_page(GFP_ATOMIC, 0); 963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
561 if (!buffer) 964 if (!buffer)
562 return -ENOMEM; 965 return -ENOMEM;
563 } 966 }
564 if (!handle->offset) { 967 if (!handle->offset) {
565 init_header((struct swsusp_info *)buffer); 968 init_header((struct swsusp_info *)buffer);
566 handle->buffer = buffer; 969 handle->buffer = buffer;
567 handle->pbe = pagedir_nosave; 970 memory_bm_position_reset(&orig_bm);
971 memory_bm_position_reset(&copy_bm);
568 } 972 }
569 if (handle->prev < handle->page) { 973 if (handle->prev < handle->cur) {
570 if (handle->page <= nr_meta_pages) { 974 if (handle->cur <= nr_meta_pages) {
571 handle->pbe = pack_orig_addresses(buffer, handle->pbe); 975 memset(buffer, 0, PAGE_SIZE);
572 if (!handle->pbe) 976 pack_pfns(buffer, &orig_bm);
573 handle->pbe = pagedir_nosave;
574 } else { 977 } else {
575 handle->buffer = (void *)handle->pbe->address; 978 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
576 handle->pbe = handle->pbe->next; 979
980 handle->buffer = page_address(pfn_to_page(pfn));
577 } 981 }
578 handle->prev = handle->page; 982 handle->prev = handle->cur;
579 } 983 }
580 handle->buf_offset = handle->page_offset; 984 handle->buf_offset = handle->cur_offset;
581 if (handle->page_offset + count >= PAGE_SIZE) { 985 if (handle->cur_offset + count >= PAGE_SIZE) {
582 count = PAGE_SIZE - handle->page_offset; 986 count = PAGE_SIZE - handle->cur_offset;
583 handle->page_offset = 0; 987 handle->cur_offset = 0;
584 handle->page++; 988 handle->cur++;
585 } else { 989 } else {
586 handle->page_offset += count; 990 handle->cur_offset += count;
587 } 991 }
588 handle->offset += count; 992 handle->offset += count;
589 return count; 993 return count;
@@ -595,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
595 * had been used before suspend 999 * had been used before suspend
596 */ 1000 */
597 1001
598static int mark_unsafe_pages(struct pbe *pblist) 1002static int mark_unsafe_pages(struct memory_bitmap *bm)
599{ 1003{
600 struct zone *zone; 1004 struct zone *zone;
601 unsigned long zone_pfn; 1005 unsigned long pfn, max_zone_pfn;
602 struct pbe *p;
603
604 if (!pblist) /* a sanity check */
605 return -EINVAL;
606 1006
607 /* Clear page flags */ 1007 /* Clear page flags */
608 for_each_zone (zone) { 1008 for_each_zone (zone) {
609 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
610 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) 1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
611 ClearPageNosaveFree(pfn_to_page(zone_pfn + 1011 if (pfn_valid(pfn))
612 zone->zone_start_pfn)); 1012 ClearPageNosaveFree(pfn_to_page(pfn));
613 } 1013 }
614 1014
615 /* Mark orig addresses */ 1015 /* Mark pages that correspond to the "original" pfns as "unsafe" */
616 for_each_pbe (p, pblist) { 1016 memory_bm_position_reset(bm);
617 if (virt_addr_valid(p->orig_address)) 1017 do {
618 SetPageNosaveFree(virt_to_page(p->orig_address)); 1018 pfn = memory_bm_next_pfn(bm);
619 else 1019 if (likely(pfn != BM_END_OF_MAP)) {
620 return -EFAULT; 1020 if (likely(pfn_valid(pfn)))
621 } 1021 SetPageNosaveFree(pfn_to_page(pfn));
1022 else
1023 return -EFAULT;
1024 }
1025 } while (pfn != BM_END_OF_MAP);
622 1026
623 unsafe_pages = 0; 1027 allocated_unsafe_pages = 0;
624 1028
625 return 0; 1029 return 0;
626} 1030}
627 1031
628static void copy_page_backup_list(struct pbe *dst, struct pbe *src) 1032static void
1033duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
629{ 1034{
630 /* We assume both lists contain the same number of elements */ 1035 unsigned long pfn;
631 while (src) { 1036
632 dst->orig_address = src->orig_address; 1037 memory_bm_position_reset(src);
633 dst = dst->next; 1038 pfn = memory_bm_next_pfn(src);
634 src = src->next; 1039 while (pfn != BM_END_OF_MAP) {
1040 memory_bm_set_bit(dst, pfn);
1041 pfn = memory_bm_next_pfn(src);
635 } 1042 }
636} 1043}
637 1044
638static int check_header(struct swsusp_info *info) 1045static inline int check_header(struct swsusp_info *info)
639{ 1046{
640 char *reason = NULL; 1047 char *reason = NULL;
641 1048
@@ -662,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
662 * load header - check the image header and copy data from it 1069 * load header - check the image header and copy data from it
663 */ 1070 */
664 1071
665static int load_header(struct snapshot_handle *handle, 1072static int
666 struct swsusp_info *info) 1073load_header(struct swsusp_info *info)
667{ 1074{
668 int error; 1075 int error;
669 struct pbe *pblist;
670 1076
1077 restore_pblist = NULL;
671 error = check_header(info); 1078 error = check_header(info);
672 if (!error) { 1079 if (!error) {
673 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
674 if (!pblist)
675 return -ENOMEM;
676 pagedir_nosave = pblist;
677 handle->pbe = pblist;
678 nr_copy_pages = info->image_pages; 1080 nr_copy_pages = info->image_pages;
679 nr_meta_pages = info->pages - info->image_pages - 1; 1081 nr_meta_pages = info->pages - info->image_pages - 1;
680 } 1082 }
@@ -682,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
682} 1084}
683 1085
684/** 1086/**
685 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to 1087 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
686 * the PBEs in the list starting at @pbe 1088 * the corresponding bit in the memory bitmap @bm
687 */ 1089 */
688 1090
689static inline struct pbe *unpack_orig_addresses(unsigned long *buf, 1091static inline void
690 struct pbe *pbe) 1092unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
691{ 1093{
692 int j; 1094 int j;
693 1095
694 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 1096 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
695 pbe->orig_address = buf[j]; 1097 if (unlikely(buf[j] == BM_END_OF_MAP))
696 pbe = pbe->next; 1098 break;
1099
1100 memory_bm_set_bit(bm, buf[j]);
697 } 1101 }
698 return pbe;
699} 1102}
700 1103
701/** 1104/**
702 * prepare_image - use metadata contained in the PBE list 1105 * prepare_image - use the memory bitmap @bm to mark the pages that will
703 * pointed to by pagedir_nosave to mark the pages that will 1106 * be overwritten in the process of restoring the system memory state
704 * be overwritten in the process of restoring the system 1107 * from the suspend image ("unsafe" pages) and allocate memory for the
705 * memory state from the image ("unsafe" pages) and allocate 1108 * image.
706 * memory for the image
707 * 1109 *
708 * The idea is to allocate the PBE list first and then 1110 * The idea is to allocate a new memory bitmap first and then allocate
709 * allocate as many pages as it's needed for the image data, 1111 * as many pages as needed for the image data, but not to assign these
710 * but not to assign these pages to the PBEs initially. 1112 * pages to specific tasks initially. Instead, we just mark them as
711 * Instead, we just mark them as allocated and create a list 1113 * allocated and create a list of "safe" pages that will be used later.
712 * of "safe" which will be used later
713 */ 1114 */
714 1115
715struct safe_page { 1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
716 struct safe_page *next;
717 char padding[PAGE_SIZE - sizeof(void *)];
718};
719 1117
720static struct safe_page *safe_pages; 1118static struct linked_page *safe_pages_list;
721 1119
722static int prepare_image(struct snapshot_handle *handle) 1120static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
723{ 1122{
724 int error = 0; 1123 unsigned int nr_pages;
725 unsigned int nr_pages = nr_copy_pages; 1124 struct linked_page *sp_list, *lp;
726 struct pbe *p, *pblist = NULL; 1125 int error;
727 1126
728 p = pagedir_nosave; 1127 error = mark_unsafe_pages(bm);
729 error = mark_unsafe_pages(p); 1128 if (error)
730 if (!error) { 1129 goto Free;
731 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); 1130
732 if (pblist) 1131 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
733 copy_page_backup_list(pblist, p); 1132 if (error)
734 free_pagedir(p, 0); 1133 goto Free;
735 if (!pblist) 1134
1135 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP);
1137 /* Reserve some safe pages for potential later use.
1138 *
1139 * NOTE: This way we make sure there will be enough safe pages for the
1140 * chain_alloc() in get_buffer(). It is a bit wasteful, but
1141 * nr_copy_pages cannot be greater than 50% of the memory anyway.
1142 */
1143 sp_list = NULL;
1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) {
736 error = -ENOMEM; 1150 error = -ENOMEM;
1151 goto Free;
1152 }
1153 lp->next = sp_list;
1154 sp_list = lp;
1155 nr_pages--;
737 } 1156 }
738 safe_pages = NULL; 1157 /* Preallocate memory for the image */
739 if (!error && nr_pages > unsafe_pages) { 1158 safe_pages_list = NULL;
740 nr_pages -= unsafe_pages; 1159 nr_pages = nr_copy_pages - allocated_unsafe_pages;
741 while (nr_pages--) { 1160 while (nr_pages > 0) {
742 struct safe_page *ptr; 1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
743 1162 if (!lp) {
744 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); 1163 error = -ENOMEM;
745 if (!ptr) { 1164 goto Free;
746 error = -ENOMEM; 1165 }
747 break; 1166 if (!PageNosaveFree(virt_to_page(lp))) {
748 } 1167 /* The page is "safe", add it to the list */
749 if (!PageNosaveFree(virt_to_page(ptr))) { 1168 lp->next = safe_pages_list;
750 /* The page is "safe", add it to the list */ 1169 safe_pages_list = lp;
751 ptr->next = safe_pages;
752 safe_pages = ptr;
753 }
754 /* Mark the page as allocated */
755 SetPageNosave(virt_to_page(ptr));
756 SetPageNosaveFree(virt_to_page(ptr));
757 } 1170 }
1171 /* Mark the page as allocated */
1172 SetPageNosave(virt_to_page(lp));
1173 SetPageNosaveFree(virt_to_page(lp));
1174 nr_pages--;
758 } 1175 }
759 if (!error) { 1176 /* Free the reserved safe pages so that chain_alloc() can use them */
760 pagedir_nosave = pblist; 1177 while (sp_list) {
761 } else { 1178 lp = sp_list->next;
762 handle->pbe = NULL; 1179 free_image_page(sp_list, PG_UNSAFE_CLEAR);
763 swsusp_free(); 1180 sp_list = lp;
764 } 1181 }
1182 return 0;
1183
1184Free:
1185 swsusp_free();
765 return error; 1186 return error;
766} 1187}
767 1188
768static void *get_buffer(struct snapshot_handle *handle) 1189/**
1190 * get_buffer - compute the address that snapshot_write_next() should
1191 * set for its caller to write to.
1192 */
1193
1194static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
769{ 1195{
770 struct pbe *pbe = handle->pbe, *last = handle->last_pbe; 1196 struct pbe *pbe;
771 struct page *page = virt_to_page(pbe->orig_address); 1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
772 1198
773 if (PageNosave(page) && PageNosaveFree(page)) { 1199 if (PageNosave(page) && PageNosaveFree(page))
774 /* 1200 /* We have allocated the "original" page frame and we can
775 * We have allocated the "original" page frame and we can 1201 * use it directly to store the loaded page.
776 * use it directly to store the read page
777 */ 1202 */
778 pbe->address = 0; 1203 return page_address(page);
779 if (last && last->next) 1204
780 last->next = NULL; 1205 /* The "original" page frame has not been allocated and we have to
781 return (void *)pbe->orig_address; 1206 * use a "safe" page frame to store the loaded page.
782 }
783 /*
784 * The "original" page frame has not been allocated and we have to
785 * use a "safe" page frame to store the read page
786 */ 1207 */
787 pbe->address = (unsigned long)safe_pages; 1208 pbe = chain_alloc(ca, sizeof(struct pbe));
788 safe_pages = safe_pages->next; 1209 if (!pbe) {
789 if (last) 1210 swsusp_free();
790 last->next = pbe; 1211 return NULL;
791 handle->last_pbe = pbe; 1212 }
1213 pbe->orig_address = (unsigned long)page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list;
1215 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist;
1217 restore_pblist = pbe;
792 return (void *)pbe->address; 1218 return (void *)pbe->address;
793} 1219}
794 1220
@@ -816,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle)
816 1242
817int snapshot_write_next(struct snapshot_handle *handle, size_t count) 1243int snapshot_write_next(struct snapshot_handle *handle, size_t count)
818{ 1244{
1245 static struct chain_allocator ca;
819 int error = 0; 1246 int error = 0;
820 1247
821 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) 1248 /* Check if we have already loaded the entire image */
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
822 return 0; 1250 return 0;
1251
823 if (!buffer) { 1252 if (!buffer) {
824 /* This makes the buffer be freed by swsusp_free() */ 1253 /* This makes the buffer be freed by swsusp_free() */
825 buffer = alloc_image_page(GFP_ATOMIC, 0); 1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
826 if (!buffer) 1255 if (!buffer)
827 return -ENOMEM; 1256 return -ENOMEM;
828 } 1257 }
829 if (!handle->offset) 1258 if (!handle->offset)
830 handle->buffer = buffer; 1259 handle->buffer = buffer;
831 if (handle->prev < handle->page) { 1260 handle->sync_read = 1;
832 if (!handle->prev) { 1261 if (handle->prev < handle->cur) {
833 error = load_header(handle, (struct swsusp_info *)buffer); 1262 if (handle->prev == 0) {
1263 error = load_header(buffer);
834 if (error) 1264 if (error)
835 return error; 1265 return error;
1266
1267 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
1268 if (error)
1269 return error;
1270
836 } else if (handle->prev <= nr_meta_pages) { 1271 } else if (handle->prev <= nr_meta_pages) {
837 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 1272 unpack_orig_pfns(buffer, &copy_bm);
838 if (!handle->pbe) { 1273 if (handle->prev == nr_meta_pages) {
839 error = prepare_image(handle); 1274 error = prepare_image(&orig_bm, &copy_bm);
840 if (error) 1275 if (error)
841 return error; 1276 return error;
842 handle->pbe = pagedir_nosave; 1277
843 handle->last_pbe = NULL; 1278 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
844 handle->buffer = get_buffer(handle); 1279 memory_bm_position_reset(&orig_bm);
1280 restore_pblist = NULL;
1281 handle->buffer = get_buffer(&orig_bm, &ca);
1282 handle->sync_read = 0;
1283 if (!handle->buffer)
1284 return -ENOMEM;
845 } 1285 }
846 } else { 1286 } else {
847 handle->pbe = handle->pbe->next; 1287 handle->buffer = get_buffer(&orig_bm, &ca);
848 handle->buffer = get_buffer(handle); 1288 handle->sync_read = 0;
849 } 1289 }
850 handle->prev = handle->page; 1290 handle->prev = handle->cur;
851 } 1291 }
852 handle->buf_offset = handle->page_offset; 1292 handle->buf_offset = handle->cur_offset;
853 if (handle->page_offset + count >= PAGE_SIZE) { 1293 if (handle->cur_offset + count >= PAGE_SIZE) {
854 count = PAGE_SIZE - handle->page_offset; 1294 count = PAGE_SIZE - handle->cur_offset;
855 handle->page_offset = 0; 1295 handle->cur_offset = 0;
856 handle->page++; 1296 handle->cur++;
857 } else { 1297 } else {
858 handle->page_offset += count; 1298 handle->cur_offset += count;
859 } 1299 }
860 handle->offset += count; 1300 handle->offset += count;
861 return count; 1301 return count;
@@ -863,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
863 1303
864int snapshot_image_loaded(struct snapshot_handle *handle) 1304int snapshot_image_loaded(struct snapshot_handle *handle)
865{ 1305{
866 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || 1306 return !(!nr_copy_pages ||
867 handle->page <= nr_meta_pages + nr_copy_pages); 1307 handle->cur <= nr_meta_pages + nr_copy_pages);
1308}
1309
1310void snapshot_free_unused_memory(struct snapshot_handle *handle)
1311{
1312 /* Free only if we have loaded the image entirely */
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
868} 1315}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1dd146bd64d..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/bio.h> 24#include <linux/bio.h>
25#include <linux/blkdev.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/swapops.h> 27#include <linux/swapops.h>
27#include <linux/pm.h> 28#include <linux/pm.h>
@@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start)
49{ 50{
50 int error; 51 int error;
51 52
52 rw_swap_page_sync(READ, 53 rw_swap_page_sync(READ, swp_entry(root_swap, 0),
53 swp_entry(root_swap, 0), 54 virt_to_page((unsigned long)&swsusp_header), NULL);
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, 60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
61 swp_entry(root_swap, 0), 61 virt_to_page((unsigned long)&swsusp_header),
62 virt_to_page((unsigned long) 62 NULL);
63 &swsusp_header));
64 } else { 63 } else {
65 pr_debug("swsusp: Partition is not swap space.\n"); 64 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV; 65 error = -ENODEV;
@@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
88 * write_page - Write one page to given swap location. 87 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing. 88 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to. 89 * @offset: Offset of the swap page we're writing to.
90 * @bio_chain: Link the next write BIO here
91 */ 91 */
92 92
93static int write_page(void *buf, unsigned long offset) 93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
94{ 94{
95 swp_entry_t entry; 95 swp_entry_t entry;
96 int error = -ENOSPC; 96 int error = -ENOSPC;
97 97
98 if (offset) { 98 if (offset) {
99 struct page *page = virt_to_page(buf);
100
101 if (bio_chain) {
102 /*
103 * Whether or not we successfully allocated a copy page,
104 * we take a ref on the page here. It gets undone in
105 * wait_on_bio_chain().
106 */
107 struct page *page_copy;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 }
99 entry = swp_entry(root_swap, offset); 119 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); 120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
101 } 121 }
102 return error; 122 return error;
103} 123}
@@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
146 handle->bitmap = NULL; 166 handle->bitmap = NULL;
147} 167}
148 168
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
149static int get_swap_writer(struct swap_map_handle *handle) 189static int get_swap_writer(struct swap_map_handle *handle)
150{ 190{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle)
165 return 0; 205 return 0;
166} 206}
167 207
168static int swap_write_page(struct swap_map_handle *handle, void *buf) 208static int wait_on_bio_chain(struct bio **bio_chain)
169{ 209{
170 int error; 210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain)
238{
239 int error = 0;
171 unsigned long offset; 240 unsigned long offset;
172 241
173 if (!handle->cur) 242 if (!handle->cur)
174 return -EINVAL; 243 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap); 244 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset); 245 error = write_page(buf, offset, bio_chain);
177 if (error) 246 if (error)
178 return error; 247 return error;
179 handle->cur->entries[handle->k++] = offset; 248 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) { 249 if (handle->k >= MAP_PAGE_ENTRIES) {
250 error = wait_on_bio_chain(bio_chain);
251 if (error)
252 goto out;
181 offset = alloc_swap_page(root_swap, handle->bitmap); 253 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset) 254 if (!offset)
183 return -ENOSPC; 255 return -ENOSPC;
184 handle->cur->next_swap = offset; 256 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap); 257 error = write_page(handle->cur, handle->cur_swap, NULL);
186 if (error) 258 if (error)
187 return error; 259 goto out;
188 memset(handle->cur, 0, PAGE_SIZE); 260 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset; 261 handle->cur_swap = offset;
190 handle->k = 0; 262 handle->k = 0;
191 } 263 }
192 return 0; 264out:
265 return error;
193} 266}
194 267
195static int flush_swap_writer(struct swap_map_handle *handle) 268static int flush_swap_writer(struct swap_map_handle *handle)
196{ 269{
197 if (handle->cur && handle->cur_swap) 270 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap); 271 return write_page(handle->cur, handle->cur_swap, NULL);
199 else 272 else
200 return -EINVAL; 273 return -EINVAL;
201} 274}
@@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle)
206 279
207static int save_image(struct swap_map_handle *handle, 280static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot, 281 struct snapshot_handle *snapshot,
209 unsigned int nr_pages) 282 unsigned int nr_to_write)
210{ 283{
211 unsigned int m; 284 unsigned int m;
212 int ret; 285 int ret;
213 int error = 0; 286 int error = 0;
287 int nr_pages;
288 int err2;
289 struct bio *bio;
290 struct timeval start;
291 struct timeval stop;
214 292
215 printk("Saving image data pages (%u pages) ... ", nr_pages); 293 printk("Saving image data pages (%u pages) ... ", nr_to_write);
216 m = nr_pages / 100; 294 m = nr_to_write / 100;
217 if (!m) 295 if (!m)
218 m = 1; 296 m = 1;
219 nr_pages = 0; 297 nr_pages = 0;
298 bio = NULL;
299 do_gettimeofday(&start);
220 do { 300 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE); 301 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) { 302 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot)); 303 error = swap_write_page(handle, data_of(*snapshot),
304 &bio);
224 if (error) 305 if (error)
225 break; 306 break;
226 if (!(nr_pages % m)) 307 if (!(nr_pages % m))
@@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle,
228 nr_pages++; 309 nr_pages++;
229 } 310 }
230 } while (ret > 0); 311 } while (ret > 0);
312 err2 = wait_on_bio_chain(&bio);
313 do_gettimeofday(&stop);
314 if (!error)
315 error = err2;
231 if (!error) 316 if (!error)
232 printk("\b\b\b\bdone\n"); 317 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote");
233 return error; 319 return error;
234} 320}
235 321
@@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
245 unsigned int free_swap = count_swap_pages(root_swap, 1); 331 unsigned int free_swap = count_swap_pages(root_swap, 1);
246 332
247 pr_debug("swsusp: free swap pages: %u\n", free_swap); 333 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO + 334 return free_swap > nr_pages + PAGES_FOR_IO;
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250} 335}
251 336
252/** 337/**
@@ -266,7 +351,8 @@ int swsusp_write(void)
266 int error; 351 int error;
267 352
268 if ((error = swsusp_swap_check())) { 353 if ((error = swsusp_swap_check())) {
269 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); 354 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n");
270 return error; 356 return error;
271 } 357 }
272 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 358 memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -281,7 +367,7 @@ int swsusp_write(void)
281 error = get_swap_writer(&handle); 367 error = get_swap_writer(&handle);
282 if (!error) { 368 if (!error) {
283 unsigned long start = handle.cur_swap; 369 unsigned long start = handle.cur_swap;
284 error = swap_write_page(&handle, header); 370 error = swap_write_page(&handle, header, NULL);
285 if (!error) 371 if (!error)
286 error = save_image(&handle, &snapshot, 372 error = save_image(&handle, &snapshot,
287 header->pages - 1); 373 header->pages - 1);
@@ -298,27 +384,6 @@ int swsusp_write(void)
298 return error; 384 return error;
299} 385}
300 386
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
304 * but, it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
315 printk(KERN_ERR "I/O error reading swsusp image.\n");
316 return -EIO;
317 }
318 atomic_set(&io_done, 0);
319 return 0;
320}
321
322static struct block_device *resume_bdev; 387static struct block_device *resume_bdev;
323 388
324/** 389/**
@@ -326,15 +391,15 @@ static struct block_device *resume_bdev;
326 * @rw: READ or WRITE. 391 * @rw: READ or WRITE.
327 * @off physical offset of page. 392 * @off physical offset of page.
328 * @page: page we're reading or writing. 393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
329 * 395 *
330 * Straight from the textbook - allocate and initialize the bio. 396 * Straight from the textbook - allocate and initialize the bio.
331 * If we're writing, make sure the page is marked as dirty. 397 * If we're reading, make sure the page is marked as dirty.
332 * Then submit it and wait. 398 * Then submit it and, if @bio_chain == NULL, wait.
333 */ 399 */
334 400static int submit(int rw, pgoff_t page_off, struct page *page,
335static int submit(int rw, pgoff_t page_off, void *page) 401 struct bio **bio_chain)
336{ 402{
337 int error = 0;
338 struct bio *bio; 403 struct bio *bio;
339 404
340 bio = bio_alloc(GFP_ATOMIC, 1); 405 bio = bio_alloc(GFP_ATOMIC, 1);
@@ -342,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
342 return -ENOMEM; 407 return -ENOMEM;
343 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
344 bio->bi_bdev = resume_bdev; 409 bio->bi_bdev = resume_bdev;
345 bio->bi_end_io = end_io; 410 bio->bi_end_io = end_swap_bio_read;
346 411
347 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { 412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
348 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); 413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
349 error = -EFAULT; 414 bio_put(bio);
350 goto Done; 415 return -EFAULT;
351 } 416 }
352 417
353 atomic_set(&io_done, 1); 418 lock_page(page);
354 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 419 bio_get(bio);
355 while (atomic_read(&io_done)) 420
356 yield(); 421 if (bio_chain == NULL) {
357 if (rw == READ) 422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
358 bio_set_pages_dirty(bio); 423 wait_on_page_locked(page);
359 Done: 424 if (rw == READ)
360 bio_put(bio); 425 bio_set_pages_dirty(bio);
361 return error; 426 bio_put(bio);
427 } else {
428 get_page(page);
429 bio->bi_private = *bio_chain;
430 *bio_chain = bio;
431 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
432 }
433 return 0;
362} 434}
363 435
364static int bio_read_page(pgoff_t page_off, void *page) 436static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
365{ 437{
366 return submit(READ, page_off, page); 438 return submit(READ, page_off, virt_to_page(addr), bio_chain);
367} 439}
368 440
369static int bio_write_page(pgoff_t page_off, void *page) 441static int bio_write_page(pgoff_t page_off, void *addr)
370{ 442{
371 return submit(WRITE, page_off, page); 443 return submit(WRITE, page_off, virt_to_page(addr), NULL);
372} 444}
373 445
374/** 446/**
@@ -393,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
393 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 465 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
394 if (!handle->cur) 466 if (!handle->cur)
395 return -ENOMEM; 467 return -ENOMEM;
396 error = bio_read_page(swp_offset(start), handle->cur); 468 error = bio_read_page(swp_offset(start), handle->cur, NULL);
397 if (error) { 469 if (error) {
398 release_swap_reader(handle); 470 release_swap_reader(handle);
399 return error; 471 return error;
@@ -402,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
402 return 0; 474 return 0;
403} 475}
404 476
405static int swap_read_page(struct swap_map_handle *handle, void *buf) 477static int swap_read_page(struct swap_map_handle *handle, void *buf,
478 struct bio **bio_chain)
406{ 479{
407 unsigned long offset; 480 unsigned long offset;
408 int error; 481 int error;
@@ -412,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
412 offset = handle->cur->entries[handle->k]; 485 offset = handle->cur->entries[handle->k];
413 if (!offset) 486 if (!offset)
414 return -EFAULT; 487 return -EFAULT;
415 error = bio_read_page(offset, buf); 488 error = bio_read_page(offset, buf, bio_chain);
416 if (error) 489 if (error)
417 return error; 490 return error;
418 if (++handle->k >= MAP_PAGE_ENTRIES) { 491 if (++handle->k >= MAP_PAGE_ENTRIES) {
492 error = wait_on_bio_chain(bio_chain);
419 handle->k = 0; 493 handle->k = 0;
420 offset = handle->cur->next_swap; 494 offset = handle->cur->next_swap;
421 if (!offset) 495 if (!offset)
422 release_swap_reader(handle); 496 release_swap_reader(handle);
423 else 497 else if (!error)
424 error = bio_read_page(offset, handle->cur); 498 error = bio_read_page(offset, handle->cur, NULL);
425 } 499 }
426 return error; 500 return error;
427} 501}
@@ -434,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
434 508
435static int load_image(struct swap_map_handle *handle, 509static int load_image(struct swap_map_handle *handle,
436 struct snapshot_handle *snapshot, 510 struct snapshot_handle *snapshot,
437 unsigned int nr_pages) 511 unsigned int nr_to_read)
438{ 512{
439 unsigned int m; 513 unsigned int m;
440 int ret;
441 int error = 0; 514 int error = 0;
515 struct timeval start;
516 struct timeval stop;
517 struct bio *bio;
518 int err2;
519 unsigned nr_pages;
442 520
443 printk("Loading image data pages (%u pages) ... ", nr_pages); 521 printk("Loading image data pages (%u pages) ... ", nr_to_read);
444 m = nr_pages / 100; 522 m = nr_to_read / 100;
445 if (!m) 523 if (!m)
446 m = 1; 524 m = 1;
447 nr_pages = 0; 525 nr_pages = 0;
448 do { 526 bio = NULL;
449 ret = snapshot_write_next(snapshot, PAGE_SIZE); 527 do_gettimeofday(&start);
450 if (ret > 0) { 528 for ( ; ; ) {
451 error = swap_read_page(handle, data_of(*snapshot)); 529 error = snapshot_write_next(snapshot, PAGE_SIZE);
452 if (error) 530 if (error <= 0)
453 break; 531 break;
454 if (!(nr_pages % m)) 532 error = swap_read_page(handle, data_of(*snapshot), &bio);
455 printk("\b\b\b\b%3d%%", nr_pages / m); 533 if (error)
456 nr_pages++; 534 break;
457 } 535 if (snapshot->sync_read)
458 } while (ret > 0); 536 error = wait_on_bio_chain(&bio);
537 if (error)
538 break;
539 if (!(nr_pages % m))
540 printk("\b\b\b\b%3d%%", nr_pages / m);
541 nr_pages++;
542 }
543 err2 = wait_on_bio_chain(&bio);
544 do_gettimeofday(&stop);
545 if (!error)
546 error = err2;
459 if (!error) { 547 if (!error) {
460 printk("\b\b\b\bdone\n"); 548 printk("\b\b\b\bdone\n");
549 snapshot_free_unused_memory(snapshot);
461 if (!snapshot_image_loaded(snapshot)) 550 if (!snapshot_image_loaded(snapshot))
462 error = -ENODATA; 551 error = -ENODATA;
463 } 552 }
553 show_speed(&start, &stop, nr_to_read, "Read");
464 return error; 554 return error;
465} 555}
466 556
@@ -483,7 +573,7 @@ int swsusp_read(void)
483 header = (struct swsusp_info *)data_of(snapshot); 573 header = (struct swsusp_info *)data_of(snapshot);
484 error = get_swap_reader(&handle, swsusp_header.image); 574 error = get_swap_reader(&handle, swsusp_header.image);
485 if (!error) 575 if (!error)
486 error = swap_read_page(&handle, header); 576 error = swap_read_page(&handle, header, NULL);
487 if (!error) 577 if (!error)
488 error = load_image(&handle, &snapshot, header->pages - 1); 578 error = load_image(&handle, &snapshot, header->pages - 1);
489 release_swap_reader(&handle); 579 release_swap_reader(&handle);
@@ -509,7 +599,7 @@ int swsusp_check(void)
509 if (!IS_ERR(resume_bdev)) { 599 if (!IS_ERR(resume_bdev)) {
510 set_blocksize(resume_bdev, PAGE_SIZE); 600 set_blocksize(resume_bdev, PAGE_SIZE);
511 memset(&swsusp_header, 0, sizeof(swsusp_header)); 601 memset(&swsusp_header, 0, sizeof(swsusp_header));
512 if ((error = bio_read_page(0, &swsusp_header))) 602 if ((error = bio_read_page(0, &swsusp_header, NULL)))
513 return error; 603 return error;
514 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 604 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
515 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 605 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 62752899b1a1..0b66659dc516 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -193,14 +193,13 @@ int swsusp_shrink_memory(void)
193 printk("Shrinking memory... "); 193 printk("Shrinking memory... ");
194 do { 194 do {
195 size = 2 * count_highmem_pages(); 195 size = 2 * count_highmem_pages();
196 size += size / 50 + count_data_pages(); 196 size += size / 50 + count_data_pages() + PAGES_FOR_IO;
197 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
198 PAGES_FOR_IO;
199 tmp = size; 197 tmp = size;
200 for_each_zone (zone) 198 for_each_zone (zone)
201 if (!is_highmem(zone) && populated_zone(zone)) { 199 if (!is_highmem(zone) && populated_zone(zone)) {
202 tmp -= zone->free_pages; 200 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 201 tmp += zone->lowmem_reserve[ZONE_NORMAL];
202 tmp += snapshot_additional_pages(zone);
204 } 203 }
205 if (tmp > 0) { 204 if (tmp > 0) {
206 tmp = __shrink_memory(tmp); 205 tmp = __shrink_memory(tmp);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 5a8d060d7909..72825c853cd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/cpu.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
139 if (data->frozen) 140 if (data->frozen)
140 break; 141 break;
141 down(&pm_sem); 142 down(&pm_sem);
142 disable_nonboot_cpus(); 143 error = disable_nonboot_cpus();
143 if (freeze_processes()) { 144 if (!error) {
144 thaw_processes(); 145 error = freeze_processes();
145 enable_nonboot_cpus(); 146 if (error) {
146 error = -EBUSY; 147 thaw_processes();
148 error = -EBUSY;
149 }
147 } 150 }
151 enable_nonboot_cpus();
148 up(&pm_sem); 152 up(&pm_sem);
149 if (!error) 153 if (!error)
150 data->frozen = 1; 154 data->frozen = 1;
@@ -189,6 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
189 error = -EPERM; 193 error = -EPERM;
190 break; 194 break;
191 } 195 }
196 snapshot_free_unused_memory(&data->handle);
192 down(&pm_sem); 197 down(&pm_sem);
193 pm_prepare_console(); 198 pm_prepare_console();
194 error = device_suspend(PMSG_PRETHAW); 199 error = device_suspend(PMSG_PRETHAW);
diff --git a/kernel/printk.c b/kernel/printk.c
index 1149365e989e..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
721 return 0; 721 return 0;
722} 722}
723 723
724#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
724/** 725/**
725 * suspend_console - suspend the console subsystem 726 * suspend_console - suspend the console subsystem
726 * 727 *
@@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
728 */ 729 */
729void suspend_console(void) 730void suspend_console(void)
730{ 731{
732 printk("Suspending console(s)\n");
731 acquire_console_sem(); 733 acquire_console_sem();
732 console_suspended = 1; 734 console_suspended = 1;
733} 735}
@@ -737,6 +739,7 @@ void resume_console(void)
737 console_suspended = 0; 739 console_suspended = 0;
738 release_console_sem(); 740 release_console_sem();
739} 741}
742#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
740 743
741/** 744/**
742 * acquire_console_sem - lock the console system for exclusive use. 745 * acquire_console_sem - lock the console system for exclusive use.
diff --git a/kernel/profile.c b/kernel/profile.c
index d5bd75e7501c..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
309 node = cpu_to_node(cpu); 309 node = cpu_to_node(cpu);
310 per_cpu(cpu_profile_flip, cpu) = 0; 310 per_cpu(cpu_profile_flip, cpu) = 0;
311 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 311 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
312 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 312 page = alloc_pages_node(node,
313 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
314 0);
313 if (!page) 315 if (!page)
314 return NOTIFY_BAD; 316 return NOTIFY_BAD;
315 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 317 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
316 } 318 }
317 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 319 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
318 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 320 page = alloc_pages_node(node,
321 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
322 0);
319 if (!page) 323 if (!page)
320 goto out_free; 324 goto out_free;
321 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 325 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -491,12 +495,16 @@ static int __init create_hash_tables(void)
491 int node = cpu_to_node(cpu); 495 int node = cpu_to_node(cpu);
492 struct page *page; 496 struct page *page;
493 497
494 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 498 page = alloc_pages_node(node,
499 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
500 0);
495 if (!page) 501 if (!page)
496 goto out_cleanup; 502 goto out_cleanup;
497 per_cpu(cpu_profile_hits, cpu)[1] 503 per_cpu(cpu_profile_hits, cpu)[1]
498 = (struct profile_hit *)page_address(page); 504 = (struct profile_hit *)page_address(page);
499 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 505 page = alloc_pages_node(node,
506 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
507 0);
500 if (!page) 508 if (!page)
501 goto out_cleanup; 509 goto out_cleanup;
502 per_cpu(cpu_profile_hits, cpu)[0] 510 per_cpu(cpu_profile_hits, cpu)[0]
diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee1238..5c848fd4e461 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
238 /* For active balancing */ 238 /* For active balancing */
239 int active_balance; 239 int active_balance;
240 int push_cpu; 240 int push_cpu;
241 int cpu; /* cpu of this runqueue */
241 242
242 struct task_struct *migration_thread; 243 struct task_struct *migration_thread;
243 struct list_head migration_queue; 244 struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
267 268
268static DEFINE_PER_CPU(struct rq, runqueues); 269static DEFINE_PER_CPU(struct rq, runqueues);
269 270
271static inline int cpu_of(struct rq *rq)
272{
273#ifdef CONFIG_SMP
274 return rq->cpu;
275#else
276 return 0;
277#endif
278}
279
270/* 280/*
271 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 281 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
272 * See detach_destroy_domains: synchronize_sched for details. 282 * See detach_destroy_domains: synchronize_sched for details.
@@ -2211,7 +2221,8 @@ out:
2211 */ 2221 */
2212static struct sched_group * 2222static struct sched_group *
2213find_busiest_group(struct sched_domain *sd, int this_cpu, 2223find_busiest_group(struct sched_domain *sd, int this_cpu,
2214 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 2224 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2225 cpumask_t *cpus)
2215{ 2226{
2216 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2227 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2217 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2228 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2248 sum_weighted_load = sum_nr_running = avg_load = 0; 2259 sum_weighted_load = sum_nr_running = avg_load = 0;
2249 2260
2250 for_each_cpu_mask(i, group->cpumask) { 2261 for_each_cpu_mask(i, group->cpumask) {
2251 struct rq *rq = cpu_rq(i); 2262 struct rq *rq;
2263
2264 if (!cpu_isset(i, *cpus))
2265 continue;
2266
2267 rq = cpu_rq(i);
2252 2268
2253 if (*sd_idle && !idle_cpu(i)) 2269 if (*sd_idle && !idle_cpu(i))
2254 *sd_idle = 0; 2270 *sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
2466 */ 2482 */
2467static struct rq * 2483static struct rq *
2468find_busiest_queue(struct sched_group *group, enum idle_type idle, 2484find_busiest_queue(struct sched_group *group, enum idle_type idle,
2469 unsigned long imbalance) 2485 unsigned long imbalance, cpumask_t *cpus)
2470{ 2486{
2471 struct rq *busiest = NULL, *rq; 2487 struct rq *busiest = NULL, *rq;
2472 unsigned long max_load = 0; 2488 unsigned long max_load = 0;
2473 int i; 2489 int i;
2474 2490
2475 for_each_cpu_mask(i, group->cpumask) { 2491 for_each_cpu_mask(i, group->cpumask) {
2492
2493 if (!cpu_isset(i, *cpus))
2494 continue;
2495
2476 rq = cpu_rq(i); 2496 rq = cpu_rq(i);
2477 2497
2478 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2498 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2511 struct sched_group *group; 2531 struct sched_group *group;
2512 unsigned long imbalance; 2532 unsigned long imbalance;
2513 struct rq *busiest; 2533 struct rq *busiest;
2534 cpumask_t cpus = CPU_MASK_ALL;
2514 2535
2515 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2536 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2516 !sched_smt_power_savings) 2537 !sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2518 2539
2519 schedstat_inc(sd, lb_cnt[idle]); 2540 schedstat_inc(sd, lb_cnt[idle]);
2520 2541
2521 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2542redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus);
2522 if (!group) { 2545 if (!group) {
2523 schedstat_inc(sd, lb_nobusyg[idle]); 2546 schedstat_inc(sd, lb_nobusyg[idle]);
2524 goto out_balanced; 2547 goto out_balanced;
2525 } 2548 }
2526 2549
2527 busiest = find_busiest_queue(group, idle, imbalance); 2550 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2528 if (!busiest) { 2551 if (!busiest) {
2529 schedstat_inc(sd, lb_nobusyq[idle]); 2552 schedstat_inc(sd, lb_nobusyq[idle]);
2530 goto out_balanced; 2553 goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2549 double_rq_unlock(this_rq, busiest); 2572 double_rq_unlock(this_rq, busiest);
2550 2573
2551 /* All tasks on this runqueue were pinned by CPU affinity */ 2574 /* All tasks on this runqueue were pinned by CPU affinity */
2552 if (unlikely(all_pinned)) 2575 if (unlikely(all_pinned)) {
2576 cpu_clear(cpu_of(busiest), cpus);
2577 if (!cpus_empty(cpus))
2578 goto redo;
2553 goto out_balanced; 2579 goto out_balanced;
2580 }
2554 } 2581 }
2555 2582
2556 if (!nr_moved) { 2583 if (!nr_moved) {
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2639 unsigned long imbalance; 2666 unsigned long imbalance;
2640 int nr_moved = 0; 2667 int nr_moved = 0;
2641 int sd_idle = 0; 2668 int sd_idle = 0;
2669 cpumask_t cpus = CPU_MASK_ALL;
2642 2670
2643 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) 2671 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2644 sd_idle = 1; 2672 sd_idle = 1;
2645 2673
2646 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2674 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2647 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2675redo:
2676 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2677 &sd_idle, &cpus);
2648 if (!group) { 2678 if (!group) {
2649 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2679 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2650 goto out_balanced; 2680 goto out_balanced;
2651 } 2681 }
2652 2682
2653 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); 2683 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2684 &cpus);
2654 if (!busiest) { 2685 if (!busiest) {
2655 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2686 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2656 goto out_balanced; 2687 goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2668 minus_1_or_zero(busiest->nr_running), 2699 minus_1_or_zero(busiest->nr_running),
2669 imbalance, sd, NEWLY_IDLE, NULL); 2700 imbalance, sd, NEWLY_IDLE, NULL);
2670 spin_unlock(&busiest->lock); 2701 spin_unlock(&busiest->lock);
2702
2703 if (!nr_moved) {
2704 cpu_clear(cpu_of(busiest), cpus);
2705 if (!cpus_empty(cpus))
2706 goto redo;
2707 }
2671 } 2708 }
2672 2709
2673 if (!nr_moved) { 2710 if (!nr_moved) {
@@ -6747,6 +6784,7 @@ void __init sched_init(void)
6747 rq->cpu_load[j] = 0; 6784 rq->cpu_load[j] = 0;
6748 rq->active_balance = 0; 6785 rq->active_balance = 0;
6749 rq->push_cpu = 0; 6786 rq->push_cpu = 0;
6787 rq->cpu = i;
6750 rq->migration_thread = NULL; 6788 rq->migration_thread = NULL;
6751 INIT_LIST_HEAD(&rq->migration_queue); 6789 INIT_LIST_HEAD(&rq->migration_queue);
6752#endif 6790#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc37138..fd43c3e6786b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -943,6 +943,17 @@ static ctl_table vm_table[] = {
943 .extra1 = &zero, 943 .extra1 = &zero,
944 .extra2 = &one_hundred, 944 .extra2 = &one_hundred,
945 }, 945 },
946 {
947 .ctl_name = VM_MIN_SLAB,
948 .procname = "min_slab_ratio",
949 .data = &sysctl_min_slab_ratio,
950 .maxlen = sizeof(sysctl_min_slab_ratio),
951 .mode = 0644,
952 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
953 .strategy = &sysctl_intvec,
954 .extra1 = &zero,
955 .extra2 = &one_hundred,
956 },
946#endif 957#endif
947#ifdef CONFIG_X86_32 958#ifdef CONFIG_X86_32
948 { 959 {