Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                 16
-rw-r--r--  kernel/auditsc.c               24
-rw-r--r--  kernel/cpu.c                   32
-rw-r--r--  kernel/cpuset.c                22
-rw-r--r--  kernel/delayacct.c              6
-rw-r--r--  kernel/exit.c                  19
-rw-r--r--  kernel/fork.c                   7
-rw-r--r--  kernel/futex.c                  2
-rw-r--r--  kernel/hrtimer.c               33
-rw-r--r--  kernel/irq/chip.c               3
-rw-r--r--  kernel/irq/devres.c             2
-rw-r--r--  kernel/kallsyms.c              23
-rw-r--r--  kernel/ksysfs.c                12
-rw-r--r--  kernel/lockdep.c                8
-rw-r--r--  kernel/module.c                40
-rw-r--r--  kernel/params.c                 6
-rw-r--r--  kernel/pid.c                    4
-rw-r--r--  kernel/power/Kconfig           11
-rw-r--r--  kernel/power/console.c         10
-rw-r--r--  kernel/power/disk.c           171
-rw-r--r--  kernel/power/main.c            70
-rw-r--r--  kernel/power/power.h           49
-rw-r--r--  kernel/power/process.c          6
-rw-r--r--  kernel/power/snapshot.c       309
-rw-r--r--  kernel/power/swap.c            60
-rw-r--r--  kernel/power/swsusp.c         141
-rw-r--r--  kernel/power/user.c            48
-rw-r--r--  kernel/resource.c              21
-rw-r--r--  kernel/sched.c                 46
-rw-r--r--  kernel/signal.c                 6
-rw-r--r--  kernel/sys.c                    2
-rw-r--r--  kernel/sysctl.c                 2
-rw-r--r--  kernel/taskstats.c              8
-rw-r--r--  kernel/time.c                   4
-rw-r--r--  kernel/time/clockevents.c      69
-rw-r--r--  kernel/time/clocksource.c       3
-rw-r--r--  kernel/time/jiffies.c           2
-rw-r--r--  kernel/time/ntp.c              30
-rw-r--r--  kernel/time/tick-broadcast.c   27
-rw-r--r--  kernel/time/tick-common.c      13
-rw-r--r--  kernel/time/tick-internal.h    11
-rw-r--r--  kernel/time/tick-oneshot.c     12
-rw-r--r--  kernel/time/timer_list.c        6
-rw-r--r--  kernel/timer.c                 23
44 files changed, 879 insertions, 540 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 76c9a11b72..4e9d208296 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -151,7 +151,7 @@ struct audit_buffer {
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 {
-	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+	struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 	nlh->nlmsg_pid = pid;
 }
 
@@ -750,7 +750,7 @@ static void audit_receive_skb(struct sk_buff *skb)
 	u32 rlen;
 
 	while (skb->len >= NLMSG_SPACE(0)) {
-		nlh = (struct nlmsghdr *)skb->data;
+		nlh = nlmsg_hdr(skb);
 		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
 			return;
 		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
@@ -795,7 +795,7 @@ static int __init audit_init(void)
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
-					   THIS_MODULE);
+					   NULL, THIS_MODULE);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
@@ -1073,7 +1073,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		goto out;
 	}
 	va_copy(args2, args);
-	len = vsnprintf(skb->tail, avail, fmt, args);
+	len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
 	if (len >= avail) {
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
@@ -1082,7 +1082,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		    max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
 		if (!avail)
 			goto out;
-		len = vsnprintf(skb->tail, avail, fmt, args2);
+		len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
 	}
 	if (len > 0)
 		skb_put(skb, len);
@@ -1143,7 +1143,7 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 		return;
 	}
 
-	ptr = skb->tail;
+	ptr = skb_tail_pointer(skb);
 	for (i=0; i<len; i++) {
 		*ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
 		*ptr++ = hex[buf[i] & 0x0F];	  /* Lower nibble */
@@ -1175,7 +1175,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
 		if (!avail)
 			return;
 	}
-	ptr = skb->tail;
+	ptr = skb_tail_pointer(skb);
 	*ptr++ = '"';
 	memcpy(ptr, string, slen);
 	ptr += slen;
@@ -1268,7 +1268,7 @@ void audit_log_end(struct audit_buffer *ab)
 		audit_log_lost("rate limit exceeded");
 	} else {
 		if (audit_pid) {
-			struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+			struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
 			skb_queue_tail(&audit_skb_queue, ab->skb);
 			ab->skb = NULL;
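
Two sk_buff accessors do the heavy lifting in this conversion. As a reference sketch (simplified; the real definitions live in <linux/netlink.h> and <linux/skbuff.h>, where skb_tail_pointer() also has an offset-based variant):

	/* Sketch of the accessors used above, in their simple form. */
	static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
	{
		return (struct nlmsghdr *)skb->data;	/* header at start of data */
	}

	static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
	{
		return skb->tail;	/* opaque once tail becomes an offset */
	}

Hiding the raw pointer arithmetic behind accessors is what later lets skb->tail switch to an offset representation without touching callers like audit.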
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 359955800d..628c7ac590 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -739,28 +739,26 @@ static inline void audit_free_context(struct audit_context *context)
 void audit_log_task_context(struct audit_buffer *ab)
 {
 	char *ctx = NULL;
-	ssize_t len = 0;
+	unsigned len;
+	int error;
+	u32 sid;
+
+	selinux_get_task_sid(current, &sid);
+	if (!sid)
+		return;
 
-	len = security_getprocattr(current, "current", NULL, 0);
-	if (len < 0) {
-		if (len != -EINVAL)
+	error = selinux_sid_to_string(sid, &ctx, &len);
+	if (error) {
+		if (error != -EINVAL)
 			goto error_path;
 		return;
 	}
 
-	ctx = kmalloc(len, GFP_KERNEL);
-	if (!ctx)
-		goto error_path;
-
-	len = security_getprocattr(current, "current", ctx, len);
-	if (len < 0 )
-		goto error_path;
-
 	audit_log_format(ab, " subj=%s", ctx);
+	kfree(ctx);
 	return;
 
 error_path:
-	kfree(ctx);
 	audit_panic("error in audit_log_task_context");
 	return;
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3d4206ada5..36e70845cf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -254,6 +254,12 @@ int __cpuinit cpu_up(unsigned int cpu)
 }
 
 #ifdef CONFIG_SUSPEND_SMP
+/* Needed to prevent the microcode driver from requesting firmware in its CPU
+ * hotplug notifier during the suspend/resume.
+ */
+int suspend_cpu_hotplug;
+EXPORT_SYMBOL(suspend_cpu_hotplug);
+
 static cpumask_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
@@ -261,16 +267,8 @@ int disable_nonboot_cpus(void)
 	int cpu, first_cpu, error = 0;
 
 	mutex_lock(&cpu_add_remove_lock);
-	first_cpu = first_cpu(cpu_present_map);
-	if (!cpu_online(first_cpu)) {
-		error = _cpu_up(first_cpu);
-		if (error) {
-			printk(KERN_ERR "Could not bring CPU%d up.\n",
-			       first_cpu);
-			goto out;
-		}
-	}
-
+	suspend_cpu_hotplug = 1;
+	first_cpu = first_cpu(cpu_online_map);
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
@@ -296,7 +294,7 @@ int disable_nonboot_cpus(void)
 	} else {
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
-out:
+	suspend_cpu_hotplug = 0;
 	mutex_unlock(&cpu_add_remove_lock);
 	return error;
 }
@@ -308,20 +306,22 @@ void enable_nonboot_cpus(void)
 	/* Allow everyone to use the CPU hotplug again */
 	mutex_lock(&cpu_add_remove_lock);
 	cpu_hotplug_disabled = 0;
-	mutex_unlock(&cpu_add_remove_lock);
 	if (cpus_empty(frozen_cpus))
-		return;
+		goto out;
 
+	suspend_cpu_hotplug = 1;
 	printk("Enabling non-boot CPUs ...\n");
 	for_each_cpu_mask(cpu, frozen_cpus) {
-		error = cpu_up(cpu);
+		error = _cpu_up(cpu);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
 			continue;
 		}
-		printk(KERN_WARNING "Error taking CPU%d up: %d\n",
-			cpu, error);
+		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
 	}
 	cpus_clear(frozen_cpus);
+	suspend_cpu_hotplug = 0;
+out:
+	mutex_unlock(&cpu_add_remove_lock);
 }
 #endif
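
A consumer of the new suspend_cpu_hotplug flag would test it in its CPU hotplug notifier; a hypothetical sketch (the actual microcode driver code is not part of this diff):

	/* Hypothetical notifier: skip request_firmware() while user space
	 * is frozen for suspend/resume, as signalled by suspend_cpu_hotplug. */
	static int mc_cpu_callback(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
	{
		if (suspend_cpu_hotplug)
			return NOTIFY_OK;	/* firmware loader unavailable */
		/* ... normal per-CPU microcode update ... */
		return NOTIFY_OK;
	}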
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f382b0f775..d240349cbf 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2351,6 +2351,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * z's node is in our tasks mems_allowed, yes.  If it's not a
  * __GFP_HARDWALL request and this zone's nodes is in the nearest
  * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * If the task has been OOM killed and has access to memory reserves
+ * as specified by the TIF_MEMDIE flag, yes.
  * Otherwise, no.
  *
  * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
@@ -2368,7 +2370,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * calls get to this routine, we should just shut up and say 'yes'.
  *
  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
- * and do not allow allocations outside the current tasks cpuset.
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed as is marked TIF_MEMDIE.
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest enclosing mem_exclusive ancestor cpuset.
  *
@@ -2392,6 +2395,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * affect that:
  *	in_interrupt - any node ok (current task context irrelevant)
  *	GFP_ATOMIC   - any node ok
+ *	TIF_MEMDIE   - any node ok
  *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  *
@@ -2413,6 +2417,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
+	/*
+	 * Allow tasks that have access to memory reserves because they have
+	 * been OOM killed to get memory anywhere.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)))
+		return 1;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
 		return 0;
 
@@ -2438,7 +2448,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
  *
  * If we're in interrupt, yes, we can always allocate.
  * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.  Otherwise, no.
+ * z's node is in our tasks mems_allowed, yes.  If the task has been
+ * OOM killed and has access to memory reserves as specified by the
+ * TIF_MEMDIE flag, yes.  Otherwise, no.
  *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
@@ -2462,6 +2474,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 	node = zone_to_nid(z);
 	if (node_isset(node, current->mems_allowed))
 		return 1;
+	/*
+	 * Allow tasks that have access to memory reserves because they have
+	 * been OOM killed to get memory anywhere.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)))
+		return 1;
 	return 0;
 }
 
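
For context, TIF_MEMDIE is set by the OOM killer on its victim; roughly, based on mm/oom_kill.c of the same era (simplified):

	/* The other half of the protocol: mark the victim so allocator
	 * checks like the two above let it dip into memory reserves,
	 * then kill it so the memory comes back quickly. */
	static void __oom_kill_task(struct task_struct *p)
	{
		set_tsk_thread_flag(p, TIF_MEMDIE);
		force_sig(SIGKILL, p);
	}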
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 766d5912b2..c0148ae992 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -31,11 +31,7 @@ __setup("nodelayacct", delayacct_setup_disable);
 
 void delayacct_init(void)
 {
-	delayacct_cache = kmem_cache_create("delayacct_cache",
-					sizeof(struct task_delay_info),
-					0,
-					SLAB_PANIC,
-					NULL, NULL);
+	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
 	delayacct_tsk_init(&init_task);
 }
 
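
KMEM_CACHE() derives the cache name, object size, and alignment from the struct type itself; its definition is roughly (paraphrased from <linux/slab.h> of this era, which still took ctor/dtor arguments):

	#define KMEM_CACHE(__struct, __flags)				\
		kmem_cache_create(#__struct, sizeof(struct __struct),	\
				  __alignof__(struct __struct),		\
				  (__flags), NULL, NULL)

Besides being shorter, it picks up __alignof__ automatically, which open-coded callers (see the pid cache below) had to spell out by hand or simply omitted.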
diff --git a/kernel/exit.c b/kernel/exit.c
index f132349c03..92369240d9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -790,7 +790,7 @@ static void exit_notify(struct task_struct *tsk)
 
 	pgrp = task_pgrp(tsk);
 	if ((task_pgrp(t) != pgrp) &&
-	    (task_session(t) != task_session(tsk)) &&
+	    (task_session(t) == task_session(tsk)) &&
 	    will_become_orphaned_pgrp(pgrp, tsk) &&
 	    has_stopped_jobs(pgrp)) {
 		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
@@ -1033,6 +1033,8 @@ asmlinkage void sys_exit_group(int error_code)
 
 static int eligible_child(pid_t pid, int options, struct task_struct *p)
 {
+	int err;
+
 	if (pid > 0) {
 		if (p->pid != pid)
 			return 0;
@@ -1066,8 +1068,9 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
 	if (delay_group_leader(p))
 		return 2;
 
-	if (security_task_wait(p))
-		return 0;
+	err = security_task_wait(p);
+	if (err)
+		return err;
 
 	return 1;
 }
@@ -1449,6 +1452,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
 	int flag, retval;
+	int allowed, denied;
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
@@ -1457,6 +1461,7 @@ repeat:
 	 * match our criteria, even if we are not able to reap it yet.
 	 */
 	flag = 0;
+	allowed = denied = 0;
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
@@ -1472,6 +1477,12 @@ repeat:
 			if (!ret)
 				continue;
 
+			if (unlikely(ret < 0)) {
+				denied = ret;
+				continue;
+			}
+			allowed = 1;
+
 			switch (p->state) {
 			case TASK_TRACED:
 				/*
@@ -1570,6 +1581,8 @@ check_continued:
 		goto repeat;
 	}
 	retval = -ECHILD;
+	if (unlikely(denied) && !allowed)
+		retval = denied;
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
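
security_task_wait() is the LSM hook consulted above; with this change a security module's error code reaches the wait*() caller instead of being flattened into -ECHILD when no other eligible child exists. A hypothetical hook implementation that exercises the path:

	/* Hypothetical LSM hook: refuse reaping across a label boundary.
	 * labels_compatible() is an invented helper for illustration. */
	static int mylsm_task_wait(struct task_struct *p)
	{
		if (!labels_compatible(current, p))
			return -EACCES;	/* now propagated by do_wait() */
		return 0;
	}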
diff --git a/kernel/fork.c b/kernel/fork.c
index d154cc7864..b7d169def9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -286,6 +286,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (retval)
 			goto out;
 	}
+	/* a new mm has just been created */
+	arch_dup_mmap(oldmm, mm);
 	retval = 0;
 out:
 	up_write(&mm->mmap_sem);
@@ -933,8 +935,8 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
 
 static inline void rt_mutex_init_task(struct task_struct *p)
 {
-#ifdef CONFIG_RT_MUTEXES
 	spin_lock_init(&p->pi_lock);
+#ifdef CONFIG_RT_MUTEXES
 	plist_head_init(&p->pi_waiters, &p->pi_lock);
 	p->pi_blocked_on = NULL;
 #endif
@@ -1423,8 +1425,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long fl
 {
 	struct sighand_struct *sighand = data;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		spin_lock_init(&sighand->siglock);
 }
 
diff --git a/kernel/futex.c b/kernel/futex.c
index e749e7df14..5a270b5e3f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -565,6 +565,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!pi_state)
 		return -EINVAL;
 
+	spin_lock(&pi_state->pi_mutex.wait_lock);
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
 	/*
@@ -604,6 +605,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	pi_state->owner = new_owner;
 	spin_unlock_irq(&new_owner->pi_lock);
 
+	spin_unlock(&pi_state->pi_mutex.wait_lock);
 	rt_mutex_unlock(&pi_state->pi_mutex);
 
 	return 0;
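
The two added lines bracket the whole owner handoff in the rt-mutex's internal wait_lock, so the next-owner lookup and the pi_state owner update cannot race with waiters being added or removed; the resulting shape, condensed:

	/* Condensed shape of wake_futex_pi() after this change. */
	spin_lock(&pi_state->pi_mutex.wait_lock);
	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
	/* ... validate uval, move pi_state to new_owner ... */
	spin_unlock(&pi_state->pi_mutex.wait_lock);
	rt_mutex_unlock(&pi_state->pi_mutex);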
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ec4cb9f3e3..1b3033105b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -59,6 +59,7 @@ ktime_t ktime_get(void)
 
 	return timespec_to_ktime(now);
 }
+EXPORT_SYMBOL_GPL(ktime_get);
 
 /**
  * ktime_get_real - get the real (wall-) time in ktime_t format
@@ -135,7 +136,7 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
 static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 {
 	ktime_t xtim, tomono;
-	struct timespec xts;
+	struct timespec xts, tom;
 	unsigned long seq;
 
 	do {
@@ -145,10 +146,11 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 #else
 		xts = xtime;
 #endif
+		tom = wall_to_monotonic;
 	} while (read_seqretry(&xtime_lock, seq));
 
 	xtim = timespec_to_ktime(xts);
-	tomono = timespec_to_ktime(wall_to_monotonic);
+	tomono = timespec_to_ktime(tom);
 	base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
 	base->clock_base[CLOCK_MONOTONIC].softirq_time =
 		ktime_add(xtim, tomono);
@@ -277,6 +279,8 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 
 	return ktime_add(kt, tmp);
 }
+
+EXPORT_SYMBOL_GPL(ktime_add_ns);
 # endif /* !CONFIG_KTIME_SCALAR */
 
 /*
@@ -458,6 +462,18 @@ void clock_was_set(void)
 }
 
 /*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt (on the local CPU):
+ */
+void hres_timers_resume(void)
+{
+	WARN_ON_ONCE(num_online_cpus() > 1);
+
+	/* Retrigger the CPU local events: */
+	retrigger_next_event(NULL);
+}
+
+/*
  * Check, whether the timer is on the callback pending list
  */
 static inline int hrtimer_cb_pending(const struct hrtimer *timer)
@@ -644,6 +660,12 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 		orun++;
 	}
 	timer->expires = ktime_add(timer->expires, interval);
+	/*
+	 * Make sure, that the result did not wrap with a very large
+	 * interval.
+	 */
+	if (timer->expires.tv64 < 0)
+		timer->expires = ktime_set(KTIME_SEC_MAX, 0);
 
 	return orun;
 }
@@ -807,7 +829,12 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 
 	timer_stats_hrtimer_set_start_info(timer);
 
-	enqueue_hrtimer(timer, new_base, base == new_base);
+	/*
+	 * Only allow reprogramming if the new base is on this CPU.
+	 * (it might still be on another CPU if the timer was pending)
+	 */
+	enqueue_hrtimer(timer, new_base,
+			new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
 
 	unlock_hrtimer_base(timer, &flags);
 
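
The overrun clamp in hrtimer_forward() matters to periodic users, which re-arm from their callback; a minimal sketch against the 2.6.21-era callback API:

	/* Minimal periodic callback: hrtimer_forward() advances ->expires
	 * past 'now' in whole periods; the clamp above keeps a huge period
	 * from wrapping ->expires negative. */
	static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
	{
		ktime_t period = ktime_set(1, 0);	/* 1 s */

		/* ... periodic work ... */
		hrtimer_forward(timer, timer->base->get_time(), period);
		return HRTIMER_RESTART;
	}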
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0133f4f9e9..615ce97c6c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/msi.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
@@ -185,6 +186,8 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 	desc = irq_desc + irq;
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->msi_desc = entry;
+	if (entry)
+		entry->irq = irq;
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 85a430da0f..d8ee241115 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -54,7 +54,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 
 	rc = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (rc) {
-		kfree(dr);
+		devres_free(dr);
 		return rc;
 	}
 
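
The one-liner is a real bug fix: devres_alloc() returns a pointer just past a hidden resource header, so a plain kfree() lands in the middle of the underlying allocation. The correct pairing, as used earlier in this same function:

	/* devres_alloc()/devres_free() must pair; devres_free() walks back
	 * to the real allocation that kfree() cannot see. */
	struct irq_devres *dr;

	dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
			  GFP_KERNEL);
	if (!dr)
		return -ENOMEM;
	/* ... and on request_irq() failure: */
	devres_free(dr);		/* not kfree(dr) */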
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f294ff4f9..5a0de84097 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -267,27 +267,33 @@ const char *kallsyms_lookup(unsigned long addr,
 	return NULL;
 }
 
-/* Replace "%s" in format with address, or returns -errno. */
-void __print_symbol(const char *fmt, unsigned long address)
+/* Look up a kernel symbol and return it in a text buffer. */
+int sprint_symbol(char *buffer, unsigned long address)
 {
 	char *modname;
 	const char *name;
 	unsigned long offset, size;
 	char namebuf[KSYM_NAME_LEN+1];
-	char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
-		    2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];
 
 	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
-
 	if (!name)
-		sprintf(buffer, "0x%lx", address);
+		return sprintf(buffer, "0x%lx", address);
 	else {
 		if (modname)
-			sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
+			return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
 				size, modname);
 		else
-			sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+			return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
 	}
+}
+
+/* Look up a kernel symbol and print it to the kernel messages. */
+void __print_symbol(const char *fmt, unsigned long address)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(buffer, address);
+
 	printk(fmt, buffer);
 }
 
@@ -452,3 +458,4 @@ static int __init kallsyms_init(void)
 __initcall(kallsyms_init);
 
 EXPORT_SYMBOL(__print_symbol);
+EXPORT_SYMBOL_GPL(sprint_symbol);
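
Factoring sprint_symbol() out of __print_symbol() lets callers format a symbol without forcing a printk; a minimal usage sketch:

	/* Resolve a code address into "func+0x0/0x10 [module]" form.
	 * KSYM_SYMBOL_LEN bounds the worst-case expansion. */
	char sym[KSYM_SYMBOL_LEN];

	sprint_symbol(sym, (unsigned long)__builtin_return_address(0));
	printk(KERN_DEBUG "called from %s\n", sym);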
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e0ffe4ab09..559deca5ed 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,18 +24,18 @@ static struct subsys_attribute _name##_attr = \
 
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 /* current uevent sequence number */
-static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
+static ssize_t uevent_seqnum_show(struct kset *kset, char *page)
 {
 	return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
 /* uevent helper program, used during early boo */
-static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
+static ssize_t uevent_helper_show(struct kset *kset, char *page)
 {
 	return sprintf(page, "%s\n", uevent_helper);
 }
-static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
+static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count)
 {
 	if (count+1 > UEVENT_HELPER_PATH_LEN)
 		return -ENOENT;
@@ -49,13 +49,13 @@ KERNEL_ATTR_RW(uevent_helper);
 #endif
 
 #ifdef CONFIG_KEXEC
-static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
+static ssize_t kexec_loaded_show(struct kset *kset, char *page)
 {
 	return sprintf(page, "%d\n", !!kexec_image);
 }
 KERNEL_ATTR_RO(kexec_loaded);
 
-static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
+static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
 {
 	return sprintf(page, "%d\n", !!kexec_crash_image);
 }
@@ -85,7 +85,7 @@ static int __init ksysfs_init(void)
 {
 	int error = subsystem_register(&kernel_subsys);
 	if (!error)
-		error = sysfs_create_group(&kernel_subsys.kset.kobj,
+		error = sysfs_create_group(&kernel_subsys.kobj,
 					   &kernel_attr_group);
 
 	return error;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8dc24c92dc..7065a687ac 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2742,6 +2742,10 @@ void debug_show_all_locks(void)
 	int count = 10;
 	int unlock = 1;
 
+	if (unlikely(!debug_locks)) {
+		printk("INFO: lockdep is turned off.\n");
+		return;
+	}
 	printk("\nShowing all locks held in the system:\n");
 
 	/*
@@ -2785,6 +2789,10 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
 
 void debug_show_held_locks(struct task_struct *task)
 {
+	if (unlikely(!debug_locks)) {
+		printk("INFO: lockdep is turned off.\n");
+		return;
+	}
 	lockdep_print_held_locks(task);
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index fbc51de644..1eb8ca565b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -45,6 +45,8 @@
 #include <asm/cacheflush.h>
 #include <linux/license.h>
 
+extern int module_sysfs_initialized;
+
 #if 0
 #define DEBUGP printk
 #else
@@ -346,10 +348,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 	unsigned int i;
 	void *ptr;
 
-	if (align > SMP_CACHE_BYTES) {
-		printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
-		       name, align, SMP_CACHE_BYTES);
-		align = SMP_CACHE_BYTES;
+	if (align > PAGE_SIZE) {
+		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+		       name, align, PAGE_SIZE);
+		align = PAGE_SIZE;
 	}
 
 	ptr = __per_cpu_start;
@@ -430,7 +432,7 @@ static int percpu_modinit(void)
 	pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
 			    GFP_KERNEL);
 	/* Static in-kernel percpu data (used). */
-	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+	pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
 	/* Free room. */
 	pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
 	if (pcpu_size[1] < 0) {
@@ -1117,8 +1119,8 @@ int mod_sysfs_init(struct module *mod)
 {
 	int err;
 
-	if (!module_subsys.kset.subsys) {
-		printk(KERN_ERR "%s: module_subsys not initialized\n",
+	if (!module_sysfs_initialized) {
+		printk(KERN_ERR "%s: module sysfs not initialized\n",
 		       mod->name);
 		err = -EINVAL;
 		goto out;
@@ -1148,8 +1150,10 @@ int mod_sysfs_setup(struct module *mod,
 		goto out;
 
 	mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
-	if (!mod->holders_dir)
+	if (!mod->holders_dir) {
+		err = -ENOMEM;
 		goto out_unreg;
+	}
 
 	err = module_param_sysfs_setup(mod, kparam, num_params);
 	if (err)
@@ -2383,9 +2387,14 @@ void module_add_driver(struct module *mod, struct device_driver *drv)
 		struct kobject *mkobj;
 
 		/* Lookup built-in module entry in /sys/modules */
-		mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name);
-		if (mkobj)
+		mkobj = kset_find_obj(&module_subsys, drv->mod_name);
+		if (mkobj) {
 			mk = container_of(mkobj, struct module_kobject, kobj);
+			/* remember our module structure */
+			drv->mkobj = mk;
+			/* kset_find_obj took a reference */
+			kobject_put(mkobj);
+		}
 	}
 
 	if (!mk)
@@ -2405,17 +2414,22 @@ EXPORT_SYMBOL(module_add_driver);
 
 void module_remove_driver(struct device_driver *drv)
 {
+	struct module_kobject *mk = NULL;
 	char *driver_name;
 
 	if (!drv)
 		return;
 
 	sysfs_remove_link(&drv->kobj, "module");
-	if (drv->owner && drv->owner->mkobj.drivers_dir) {
+
+	if (drv->owner)
+		mk = &drv->owner->mkobj;
+	else if (drv->mkobj)
+		mk = drv->mkobj;
+	if (mk && mk->drivers_dir) {
 		driver_name = make_driver_name(drv);
 		if (driver_name) {
-			sysfs_remove_link(drv->owner->mkobj.drivers_dir,
-					  driver_name);
+			sysfs_remove_link(mk->drivers_dir, driver_name);
 			kfree(driver_name);
 		}
 	}
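
One detail worth calling out: kset_find_obj() returns its match with an extra reference taken, which is why the hunk above adds a kobject_put(). The general pattern ("mymod" is an illustrative name):

	/* kset_find_obj() kobject_get()s the result; the caller owns that
	 * reference and must drop it when the lookup is done. */
	struct kobject *k = kset_find_obj(&module_subsys, "mymod");

	if (k) {
		/* ... use container_of(k, ...) while the ref is held ... */
		kobject_put(k);
	}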
diff --git a/kernel/params.c b/kernel/params.c
index e265b13195..312172320b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -356,6 +356,10 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
 {
 	struct kparam_string *kps = kp->arg;
 
+	if (!val) {
+		printk(KERN_ERR "%s: missing param set value\n", kp->name);
+		return -EINVAL;
+	}
 	if (strlen(val)+1 > kps->maxlen) {
 		printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
 		       kp->name, kps->maxlen-1);
@@ -687,6 +691,7 @@ static struct kset_uevent_ops module_uevent_ops = {
 };
 
 decl_subsys(module, &module_ktype, &module_uevent_ops);
+int module_sysfs_initialized;
 
 static struct kobj_type module_ktype = {
 	.sysfs_ops =	&module_sysfs_ops,
@@ -705,6 +710,7 @@ static int __init param_sysfs_init(void)
 			__FILE__, __LINE__, ret);
 		return ret;
 	}
+	module_sysfs_initialized = 1;
 
 	param_sysfs_builtin();
 
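
The new NULL check matters because a parameter given on the command line without '=value' reaches its setter with val == NULL. A hypothetical module parameter that would previously oops in param_set_copystring():

	/* Hypothetical example: booting with "mymod.ifname" (no '=value')
	 * now fails cleanly with -EINVAL instead of dereferencing NULL. */
	static char ifname[16] = "eth0";
	module_param_string(ifname, ifname, sizeof(ifname), 0444);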
diff --git a/kernel/pid.c b/kernel/pid.c
index 78f2aee90f..9c80bc23d6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -412,7 +412,5 @@ void __init pidmap_init(void)
 	set_bit(0, init_pid_ns.pidmap[0].page);
 	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
 
-	pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
-			__alignof__(struct pid),
-			SLAB_PANIC, NULL, NULL);
+	pid_cachep = KMEM_CACHE(pid, SLAB_PANIC);
 }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 51a4dd0f1b..877721708f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,17 +78,22 @@ config PM_SYSFS_DEPRECATED
 	  are likely to be bus or driver specific.
 
 config SOFTWARE_SUSPEND
-	bool "Software Suspend"
+	bool "Software Suspend (Hibernation)"
 	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
 	---help---
-	  Enable the suspend to disk (STD) functionality.
+	  Enable the suspend to disk (STD) functionality, which is usually
+	  called "hibernation" in user interfaces.  STD checkpoints the
+	  system and powers it off; and restores that checkpoint on reboot.
 
 	  You can suspend your machine with 'echo disk > /sys/power/state'.
 	  Alternatively, you can use the additional userland tools available
 	  from <http://suspend.sf.net>.
 
 	  In principle it does not require ACPI or APM, although for example
-	  ACPI will be used if available.
+	  ACPI will be used for the final steps when it is available.  One
+	  of the reasons to use software suspend is that the firmware hooks
+	  for suspend states like suspend-to-RAM (STR) often don't work very
+	  well with Linux.
 
 	  It creates an image which is saved in your active swap. Upon the next
 	  boot, pass the 'resume=/dev/swappartition' argument to the kernel to
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 623786d441..89bcf4973e 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -27,7 +27,15 @@ int pm_prepare_console(void)
 		return 1;
 	}
 
-	set_console(SUSPEND_CONSOLE);
+	if (set_console(SUSPEND_CONSOLE)) {
+		/*
+		 * We're unable to switch to the SUSPEND_CONSOLE.
+		 * Let the calling function know so it can decide
+		 * what to do.
+		 */
+		release_console_sem();
+		return 1;
+	}
 	release_console_sem();
 
 	if (vt_waitactive(SUSPEND_CONSOLE)) {
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 406b20adb2..06331374d8 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -39,7 +39,13 @@ static inline int platform_prepare(void)
 {
 	int error = 0;
 
-	if (pm_disk_mode == PM_DISK_PLATFORM) {
+	switch (pm_disk_mode) {
+	case PM_DISK_TEST:
+	case PM_DISK_TESTPROC:
+	case PM_DISK_SHUTDOWN:
+	case PM_DISK_REBOOT:
+		break;
+	default:
 		if (pm_ops && pm_ops->prepare)
 			error = pm_ops->prepare(PM_SUSPEND_DISK);
 	}
@@ -48,40 +54,48 @@ static inline int platform_prepare(void)
 
 /**
  *	power_down - Shut machine down for hibernate.
- *	@mode:		Suspend-to-disk mode
  *
- *	Use the platform driver, if configured so, and return gracefully if it
- *	fails.
- *	Otherwise, try to power off and reboot. If they fail, halt the machine,
- *	there ain't no turning back.
+ *	Use the platform driver, if configured so; otherwise try
+ *	to power off or reboot.
  */
 
-static void power_down(suspend_disk_method_t mode)
+static void power_down(void)
 {
-	switch(mode) {
-	case PM_DISK_PLATFORM:
-		if (pm_ops && pm_ops->enter) {
-			kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
-			pm_ops->enter(PM_SUSPEND_DISK);
-			break;
-		}
+	switch (pm_disk_mode) {
+	case PM_DISK_TEST:
+	case PM_DISK_TESTPROC:
+		break;
 	case PM_DISK_SHUTDOWN:
 		kernel_power_off();
 		break;
 	case PM_DISK_REBOOT:
 		kernel_restart(NULL);
 		break;
+	default:
+		if (pm_ops && pm_ops->enter) {
+			kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+			pm_ops->enter(PM_SUSPEND_DISK);
+			break;
+		}
 	}
 	kernel_halt();
-	/* Valid image is on the disk, if we continue we risk serious data corruption
-	   after resume. */
+	/*
+	 * Valid image is on the disk, if we continue we risk serious data
+	 * corruption after resume.
+	 */
 	printk(KERN_CRIT "Please power me down manually\n");
 	while(1);
 }
 
 static inline void platform_finish(void)
 {
-	if (pm_disk_mode == PM_DISK_PLATFORM) {
+	switch (pm_disk_mode) {
+	case PM_DISK_TEST:
+	case PM_DISK_TESTPROC:
+	case PM_DISK_SHUTDOWN:
+	case PM_DISK_REBOOT:
+		break;
+	default:
 		if (pm_ops && pm_ops->finish)
 			pm_ops->finish(PM_SUSPEND_DISK);
 	}
@@ -108,8 +122,6 @@ static int prepare_processes(void)
 /**
  *	pm_suspend_disk - The granpappy of hibernation power management.
  *
- *	If we're going through the firmware, then get it over with quickly.
- *
  *	If not, then call swsusp to do its thing, then figure out how
  *	to power down the system.
  */
@@ -118,15 +130,25 @@ int pm_suspend_disk(void)
 {
 	int error;
 
+	/* The snapshot device should not be opened while we're running */
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0))
+		return -EBUSY;
+
+	/* Allocate memory management structures */
+	error = create_basic_memory_bitmaps();
+	if (error)
+		goto Exit;
+
 	error = prepare_processes();
 	if (error)
-		return error;
+		goto Finish;
 
 	if (pm_disk_mode == PM_DISK_TESTPROC) {
 		printk("swsusp debug: Waiting for 5 seconds.\n");
 		mdelay(5000);
 		goto Thaw;
 	}
+
 	/* Free memory before shutting down devices. */
 	error = swsusp_shrink_memory();
 	if (error)
@@ -166,7 +188,7 @@ int pm_suspend_disk(void)
 	pr_debug("PM: writing image.\n");
 	error = swsusp_write();
 	if (!error)
-		power_down(pm_disk_mode);
+		power_down();
 	else {
 		swsusp_free();
 		goto Thaw;
@@ -184,6 +206,10 @@ int pm_suspend_disk(void)
 	resume_console();
  Thaw:
 	unprepare_processes();
+ Finish:
+	free_basic_memory_bitmaps();
+ Exit:
+	atomic_inc(&snapshot_device_available);
 	return error;
 }
 
@@ -227,25 +253,27 @@ static int software_resume(void)
 	}
 
 	pr_debug("PM: Checking swsusp image.\n");
-
 	error = swsusp_check();
 	if (error)
-		goto Done;
+		goto Unlock;
 
-	pr_debug("PM: Preparing processes for restore.\n");
+	/* The snapshot device should not be opened while we're running */
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+		error = -EBUSY;
+		goto Unlock;
+	}
 
+	error = create_basic_memory_bitmaps();
+	if (error)
+		goto Finish;
+
+	pr_debug("PM: Preparing processes for restore.\n");
 	error = prepare_processes();
 	if (error) {
 		swsusp_close();
 		goto Done;
 	}
 
-	error = platform_prepare();
-	if (error) {
-		swsusp_free();
-		goto Thaw;
-	}
-
 	pr_debug("PM: Reading swsusp image.\n");
 
 	error = swsusp_read();
@@ -268,14 +296,17 @@ static int software_resume(void)
 	enable_nonboot_cpus();
  Free:
 	swsusp_free();
-	platform_finish();
 	device_resume();
 	resume_console();
  Thaw:
 	printk(KERN_ERR "PM: Restore failed, recovering.\n");
 	unprepare_processes();
  Done:
+	free_basic_memory_bitmaps();
+ Finish:
+	atomic_inc(&snapshot_device_available);
 	/* For success case, the suspend path will release the lock */
+ Unlock:
 	mutex_unlock(&pm_mutex);
 	pr_debug("PM: Resume from disk failed.\n");
 	return 0;
@@ -285,7 +316,6 @@ late_initcall(software_resume);
 
 
 static const char * const pm_disk_modes[] = {
-	[PM_DISK_FIRMWARE]	= "firmware",
 	[PM_DISK_PLATFORM]	= "platform",
 	[PM_DISK_SHUTDOWN]	= "shutdown",
 	[PM_DISK_REBOOT]	= "reboot",
@@ -296,37 +326,62 @@ static const char * const pm_disk_modes[] = {
 /**
  *	disk - Control suspend-to-disk mode
  *
- *	Suspend-to-disk can be handled in several ways. The greatest
- *	distinction is who writes memory to disk - the firmware or the OS.
- *	If the firmware does it, we assume that it also handles suspending
- *	the system.
- *	If the OS does it, then we have three options for putting the system
- *	to sleep - using the platform driver (e.g. ACPI or other PM registers),
- *	powering off the system or rebooting the system (for testing).
+ *	Suspend-to-disk can be handled in several ways. We have a few options
+ *	for putting the system to sleep - using the platform driver (e.g. ACPI
+ *	or other pm_ops), powering off the system or rebooting the system
+ *	(for testing) as well as the two test modes.
  *
- *	The system will support either 'firmware' or 'platform', and that is
- *	known a priori (and encoded in pm_ops). But, the user may choose
- *	'shutdown' or 'reboot' as alternatives.
+ *	The system can support 'platform', and that is known a priori (and
+ *	encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot'
+ *	as alternatives, as well as the test modes 'test' and 'testproc'.
  *
  *	show() will display what the mode is currently set to.
  *	store() will accept one of
  *
- *	'firmware'
  *	'platform'
  *	'shutdown'
  *	'reboot'
+ *	'test'
+ *	'testproc'
  *
- *	It will only change to 'firmware' or 'platform' if the system
+ *	It will only change to 'platform' if the system
  *	supports it (as determined from pm_ops->pm_disk_mode).
  */
 
-static ssize_t disk_show(struct subsystem * subsys, char * buf)
+static ssize_t disk_show(struct kset *kset, char *buf)
 {
-	return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
+	int i;
+	char *start = buf;
+
+	for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
+		if (!pm_disk_modes[i])
+			continue;
+		switch (i) {
+		case PM_DISK_SHUTDOWN:
+		case PM_DISK_REBOOT:
+		case PM_DISK_TEST:
+		case PM_DISK_TESTPROC:
+			break;
+		default:
+			if (pm_ops && pm_ops->enter &&
+			    (i == pm_ops->pm_disk_mode))
+				break;
+			/* not a valid mode, continue with loop */
+			continue;
+		}
+		if (i == pm_disk_mode)
+			buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+		else
+			buf += sprintf(buf, "%s", pm_disk_modes[i]);
+		if (i+1 != PM_DISK_MAX)
+			buf += sprintf(buf, " ");
+	}
+	buf += sprintf(buf, "\n");
+	return buf-start;
 }
 
 
-static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
+static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
 {
 	int error = 0;
 	int i;
@@ -338,17 +393,21 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 	len = p ? p - buf : n;
 
 	mutex_lock(&pm_mutex);
-	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
+	for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
 		if (!strncmp(buf, pm_disk_modes[i], len)) {
 			mode = i;
 			break;
 		}
 	}
 	if (mode) {
-		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
-		     mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
+		switch (mode) {
+		case PM_DISK_SHUTDOWN:
+		case PM_DISK_REBOOT:
+		case PM_DISK_TEST:
+		case PM_DISK_TESTPROC:
 			pm_disk_mode = mode;
-		} else {
+			break;
+		default:
 			if (pm_ops && pm_ops->enter &&
 			    (mode == pm_ops->pm_disk_mode))
 				pm_disk_mode = mode;
@@ -367,13 +426,13 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 
 power_attr(disk);
 
-static ssize_t resume_show(struct subsystem * subsys, char *buf)
+static ssize_t resume_show(struct kset *kset, char *buf)
 {
 	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
 		       MINOR(swsusp_resume_device));
 }
 
-static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
+static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
 {
 	unsigned int maj, min;
 	dev_t res;
@@ -399,12 +458,12 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 
 power_attr(resume);
 
-static ssize_t image_size_show(struct subsystem * subsys, char *buf)
+static ssize_t image_size_show(struct kset *kset, char *buf)
 {
 	return sprintf(buf, "%lu\n", image_size);
 }
 
-static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n)
 {
 	unsigned long size;
 
@@ -433,7 +492,7 @@ static struct attribute_group attr_group = {
 
 static int __init pm_disk_init(void)
 {
-	return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+	return sysfs_create_group(&power_subsys.kobj, &attr_group);
 }
 
 core_initcall(pm_disk_init);
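
The snapshot_device_available guard added throughout this file is a small try-lock built from atomic_add_unless(); the idiom in isolation:

	/* Counter starts at 1. The first taker decrements to 0; anyone else
	 * sees 0 and fails immediately instead of blocking. atomic_inc()
	 * releases. This keeps /dev/snapshot and the in-kernel paths
	 * mutually exclusive without a sleeping lock. */
	static atomic_t available = ATOMIC_INIT(1);

	if (!atomic_add_unless(&available, -1, 0))
		return -EBUSY;
	/* ... exclusive section ... */
	atomic_inc(&available);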
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a064dfd887..f6dda685e7 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,7 @@
 DEFINE_MUTEX(pm_mutex);
 
 struct pm_ops *pm_ops;
-suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
+suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
 
 /**
  *	pm_set_ops - Set the global power method table.
@@ -41,9 +41,26 @@ void pm_set_ops(struct pm_ops * ops)
 {
 	mutex_lock(&pm_mutex);
 	pm_ops = ops;
+	if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
+		pm_disk_mode = ops->pm_disk_mode;
+	} else
+		pm_disk_mode = PM_DISK_SHUTDOWN;
 	mutex_unlock(&pm_mutex);
 }
 
+/**
+ * pm_valid_only_mem - generic memory-only valid callback
+ *
+ * pm_ops drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int pm_valid_only_mem(suspend_state_t state)
+{
+	return state == PM_SUSPEND_MEM;
+}
+
+
 static inline void pm_finish(suspend_state_t state)
 {
 	if (pm_ops->finish)
@@ -111,13 +128,24 @@ static int suspend_prepare(suspend_state_t state)
 	return error;
 }
 
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+	local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+	local_irq_enable();
+}
 
 int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
-	unsigned long flags;
 
-	local_irq_save(flags);
+	arch_suspend_disable_irqs();
+	BUG_ON(!irqs_disabled());
 
 	if ((error = device_power_down(PMSG_SUSPEND))) {
 		printk(KERN_ERR "Some devices failed to power down\n");
@@ -126,7 +154,8 @@ int suspend_enter(suspend_state_t state)
 	error = pm_ops->enter(state);
 	device_power_up();
  Done:
-	local_irq_restore(flags);
+	arch_suspend_enable_irqs();
+	BUG_ON(irqs_disabled());
 	return error;
 }
 
@@ -155,22 +184,26 @@ static void suspend_finish(suspend_state_t state)
 static const char * const pm_states[PM_SUSPEND_MAX] = {
 	[PM_SUSPEND_STANDBY]	= "standby",
 	[PM_SUSPEND_MEM]	= "mem",
-#ifdef CONFIG_SOFTWARE_SUSPEND
 	[PM_SUSPEND_DISK]	= "disk",
-#endif
 };
 
 static inline int valid_state(suspend_state_t state)
 {
 	/* Suspend-to-disk does not really need low-level support.
-	 * It can work with reboot if needed. */
+	 * It can work with shutdown/reboot if needed. If it isn't
+	 * configured, then it cannot be supported.
+	 */
 	if (state == PM_SUSPEND_DISK)
+#ifdef CONFIG_SOFTWARE_SUSPEND
 		return 1;
+#else
+		return 0;
+#endif
 
 	/* all other states need lowlevel support and need to be
 	 * valid to the lowlevel implementation, no valid callback
-	 * implies that all are valid. */
-	if (!pm_ops || (pm_ops->valid && !pm_ops->valid(state)))
+	 * implies that none are valid. */
+	if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
 		return 0;
 	return 1;
 }
@@ -215,15 +248,6 @@ static int enter_state(suspend_state_t state)
 	return error;
 }
 
-/*
- * This is main interface to the outside world. It needs to be
- * called from process context.
- */
-int software_suspend(void)
-{
-	return enter_state(PM_SUSPEND_DISK);
-}
-
 
 /**
  *	pm_suspend - Externally visible function for suspending system.
@@ -256,7 +280,7 @@ decl_subsys(power,NULL,NULL);
256 * proper enumerated value, and initiates a suspend transition. 280 * proper enumerated value, and initiates a suspend transition.
257 */ 281 */
258 282
259static ssize_t state_show(struct subsystem * subsys, char * buf) 283static ssize_t state_show(struct kset *kset, char *buf)
260{ 284{
261 int i; 285 int i;
262 char * s = buf; 286 char * s = buf;
@@ -269,7 +293,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
269 return (s - buf); 293 return (s - buf);
270} 294}
271 295
272static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 296static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
273{ 297{
274 suspend_state_t state = PM_SUSPEND_STANDBY; 298 suspend_state_t state = PM_SUSPEND_STANDBY;
275 const char * const *s; 299 const char * const *s;
@@ -296,13 +320,13 @@ power_attr(state);
296#ifdef CONFIG_PM_TRACE 320#ifdef CONFIG_PM_TRACE
297int pm_trace_enabled; 321int pm_trace_enabled;
298 322
299static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) 323static ssize_t pm_trace_show(struct kset *kset, char *buf)
300{ 324{
301 return sprintf(buf, "%d\n", pm_trace_enabled); 325 return sprintf(buf, "%d\n", pm_trace_enabled);
302} 326}
303 327
304static ssize_t 328static ssize_t
305pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) 329pm_trace_store(struct kset *kset, const char *buf, size_t n)
306{ 330{
307 int val; 331 int val;
308 332
@@ -336,7 +360,7 @@ static int __init pm_init(void)
336{ 360{
337 int error = subsystem_register(&power_subsys); 361 int error = subsystem_register(&power_subsys);
338 if (!error) 362 if (!error)
339 error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); 363 error = sysfs_create_group(&power_subsys.kobj,&attr_group);
340 return error; 364 return error;
341} 365}
342 366
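
The show/store conversions in this file follow from struct subsystem being folded into struct kset: subsys attributes now receive the kset directly, and the embedded kobject is reached as power_subsys.kobj rather than power_subsys.kset.kobj. Sketch of a converted attribute pair using the power_attr() macro from kernel/power/power.h (names and value are placeholders):

static ssize_t example_show(struct kset *kset, char *buf)
{
        return sprintf(buf, "%d\n", 0);         /* placeholder value */
}

static ssize_t example_store(struct kset *kset, const char *buf, size_t n)
{
        return n;                               /* accept and discard */
}

power_attr(example);
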
diff --git a/kernel/power/power.h b/kernel/power/power.h
index eb461b816b..34b4354278 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,8 +14,18 @@ struct swsusp_info {
14 14
15 15
16#ifdef CONFIG_SOFTWARE_SUSPEND 16#ifdef CONFIG_SOFTWARE_SUSPEND
17extern int pm_suspend_disk(void); 17/*
18 * Keep some memory free so that I/O operations can succeed without paging
19 * [Might this be more than 4 MB?]
20 */
21#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
22/*
23 * Keep 1 MB of memory free so that device drivers can allocate some pages in
24 * their .suspend() routines without breaking the suspend to disk.
25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
18 27
28extern int pm_suspend_disk(void);
19#else 29#else
20static inline int pm_suspend_disk(void) 30static inline int pm_suspend_disk(void)
21{ 31{
@@ -23,6 +33,8 @@ static inline int pm_suspend_disk(void)
23} 33}
24#endif 34#endif
25 35
36extern int pfn_is_nosave(unsigned long);
37
26extern struct mutex pm_mutex; 38extern struct mutex pm_mutex;
27 39
28#define power_attr(_name) \ 40#define power_attr(_name) \
@@ -35,10 +47,7 @@ static struct subsys_attribute _name##_attr = { \
35 .store = _name##_store, \ 47 .store = _name##_store, \
36} 48}
37 49
38extern struct subsystem power_subsys; 50extern struct kset power_subsys;
39
40/* References to section boundaries */
41extern const void __nosave_begin, __nosave_end;
42 51
43/* Preferred image size in bytes (default 500 MB) */ 52/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 53extern unsigned long image_size;
@@ -49,6 +58,8 @@ extern sector_t swsusp_resume_block;
49extern asmlinkage int swsusp_arch_suspend(void); 58extern asmlinkage int swsusp_arch_suspend(void);
50extern asmlinkage int swsusp_arch_resume(void); 59extern asmlinkage int swsusp_arch_resume(void);
51 60
61extern int create_basic_memory_bitmaps(void);
62extern void free_basic_memory_bitmaps(void);
52extern unsigned int count_data_pages(void); 63extern unsigned int count_data_pages(void);
53 64
54/** 65/**
@@ -139,30 +150,12 @@ struct resume_swap_area {
139#define PMOPS_ENTER 2 150#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3 151#define PMOPS_FINISH 3
141 152
142/** 153/* If unset, the snapshot device cannot be opened. */
143 * The bitmap is used for tracing allocated swap pages 154extern atomic_t snapshot_device_available;
144 *
145 * The entire bitmap consists of a number of bitmap_page
146 * structures linked with the help of the .next member.
147 * Thus each page can be allocated individually, so we only
148 * need to make 0-order memory allocations to create
149 * the bitmap.
150 */
151
152#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
153#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
154#define BITS_PER_CHUNK (sizeof(long) * 8)
155#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
156
157struct bitmap_page {
158 unsigned long chunks[BITMAP_PAGE_CHUNKS];
159 struct bitmap_page *next;
160};
161 155
162extern void free_bitmap(struct bitmap_page *bitmap); 156extern sector_t alloc_swapdev_block(int swap);
163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 157extern void free_all_swap_pages(int swap);
164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); 158extern int swsusp_swap_in_use(void);
165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
166 159
167extern int swsusp_check(void); 160extern int swsusp_check(void);
168extern int swsusp_shrink_memory(void); 161extern int swsusp_shrink_memory(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6d566bf708..0eb5c420e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -47,8 +47,10 @@ void refrigerator(void)
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 47 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 48 spin_unlock_irq(&current->sighand->siglock);
49 49
50 while (frozen(current)) { 50 for (;;) {
51 current->state = TASK_UNINTERRUPTIBLE; 51 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current))
53 break;
52 schedule(); 54 schedule();
53 } 55 }
54 pr_debug("%s left refrigerator\n", current->comm); 56 pr_debug("%s left refrigerator\n", current->comm);
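
The rewritten loop is the canonical sleep/wait idiom, and the ordering is the whole point: set_current_state() publishes TASK_UNINTERRUPTIBLE before frozen() is re-tested, so a thaw that clears the flag and wakes the task between the re-test and schedule() simply flips the task back to TASK_RUNNING and schedule() returns. The old form tested the flag while still runnable, leaving a lost-wakeup window:

/* old form -- racy */
while (frozen(current)) {
        /* a thaw arriving here is lost: the flag was already tested,
         * but the task state has not been set yet */
        current->state = TASK_UNINTERRUPTIBLE;
        schedule();
}
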
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fc53ad0681..128da11f01 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -21,6 +21,7 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/pm.h> 22#include <linux/pm.h>
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/init.h>
24#include <linux/bootmem.h> 25#include <linux/bootmem.h>
25#include <linux/syscalls.h> 26#include <linux/syscalls.h>
26#include <linux/console.h> 27#include <linux/console.h>
@@ -34,6 +35,10 @@
34 35
35#include "power.h" 36#include "power.h"
36 37
38static int swsusp_page_is_free(struct page *);
39static void swsusp_set_page_forbidden(struct page *);
40static void swsusp_unset_page_forbidden(struct page *);
41
37/* List of PBEs needed for restoring the pages that were allocated before 42/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been 43 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written 44 * allocated by the "resume" kernel, so their contents cannot be written
@@ -67,15 +72,15 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
67 72
68 res = (void *)get_zeroed_page(gfp_mask); 73 res = (void *)get_zeroed_page(gfp_mask);
69 if (safe_needed) 74 if (safe_needed)
70 while (res && PageNosaveFree(virt_to_page(res))) { 75 while (res && swsusp_page_is_free(virt_to_page(res))) {
71 /* The page is unsafe, mark it for swsusp_free() */ 76 /* The page is unsafe, mark it for swsusp_free() */
72 SetPageNosave(virt_to_page(res)); 77 swsusp_set_page_forbidden(virt_to_page(res));
73 allocated_unsafe_pages++; 78 allocated_unsafe_pages++;
74 res = (void *)get_zeroed_page(gfp_mask); 79 res = (void *)get_zeroed_page(gfp_mask);
75 } 80 }
76 if (res) { 81 if (res) {
77 SetPageNosave(virt_to_page(res)); 82 swsusp_set_page_forbidden(virt_to_page(res));
78 SetPageNosaveFree(virt_to_page(res)); 83 swsusp_set_page_free(virt_to_page(res));
79 } 84 }
80 return res; 85 return res;
81} 86}
@@ -91,8 +96,8 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
91 96
92 page = alloc_page(gfp_mask); 97 page = alloc_page(gfp_mask);
93 if (page) { 98 if (page) {
94 SetPageNosave(page); 99 swsusp_set_page_forbidden(page);
95 SetPageNosaveFree(page); 100 swsusp_set_page_free(page);
96 } 101 }
97 return page; 102 return page;
98} 103}
@@ -110,9 +115,9 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
110 115
111 page = virt_to_page(addr); 116 page = virt_to_page(addr);
112 117
113 ClearPageNosave(page); 118 swsusp_unset_page_forbidden(page);
114 if (clear_nosave_free) 119 if (clear_nosave_free)
115 ClearPageNosaveFree(page); 120 swsusp_unset_page_free(page);
116 121
117 __free_page(page); 122 __free_page(page);
118} 123}
@@ -224,11 +229,6 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
224 * of type unsigned long each). It also contains the pfns that 229 * of type unsigned long each). It also contains the pfns that
225 * correspond to the start and end of the represented memory area and 230 * correspond to the start and end of the represented memory area and
226 * the number of bit chunks in the block. 231 * the number of bit chunks in the block.
227 *
228 * NOTE: Memory bitmaps are used for two types of operations only:
229 * "set a bit" and "find the next bit set". Moreover, the searching
230 * is always carried out after all of the "set a bit" operations
231 * on given bitmap.
232 */ 232 */
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
@@ -443,15 +443,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
443} 443}
444 444
445/** 445/**
446 * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds 446 * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
447 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 447 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
448 * of @bm->cur_zone_bm are updated. 448 * of @bm->cur_zone_bm are updated.
449 *
450 * If the bit cannot be set, the function returns -EINVAL .
451 */ 449 */
452 450
453static int 451static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
454memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) 452 void **addr, unsigned int *bit_nr)
455{ 453{
456 struct zone_bitmap *zone_bm; 454 struct zone_bitmap *zone_bm;
457 struct bm_block *bb; 455 struct bm_block *bb;
@@ -463,8 +461,8 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
463 /* We don't assume that the zones are sorted by pfns */ 461 /* We don't assume that the zones are sorted by pfns */
464 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { 462 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
465 zone_bm = zone_bm->next; 463 zone_bm = zone_bm->next;
466 if (unlikely(!zone_bm)) 464
467 return -EINVAL; 465 BUG_ON(!zone_bm);
468 } 466 }
469 bm->cur.zone_bm = zone_bm; 467 bm->cur.zone_bm = zone_bm;
470 } 468 }
@@ -475,13 +473,40 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
475 473
476 while (pfn >= bb->end_pfn) { 474 while (pfn >= bb->end_pfn) {
477 bb = bb->next; 475 bb = bb->next;
478 if (unlikely(!bb)) 476
479 return -EINVAL; 477 BUG_ON(!bb);
480 } 478 }
481 zone_bm->cur_block = bb; 479 zone_bm->cur_block = bb;
482 pfn -= bb->start_pfn; 480 pfn -= bb->start_pfn;
483 set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); 481 *bit_nr = pfn % BM_BITS_PER_CHUNK;
484 return 0; 482 *addr = bb->data + pfn / BM_BITS_PER_CHUNK;
483}
484
485static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
486{
487 void *addr;
488 unsigned int bit;
489
490 memory_bm_find_bit(bm, pfn, &addr, &bit);
491 set_bit(bit, addr);
492}
493
494static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
495{
496 void *addr;
497 unsigned int bit;
498
499 memory_bm_find_bit(bm, pfn, &addr, &bit);
500 clear_bit(bit, addr);
501}
502
503static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
504{
505 void *addr;
506 unsigned int bit;
507
508 memory_bm_find_bit(bm, pfn, &addr, &bit);
509 return test_bit(bit, addr);
485} 510}
486 511
487/* Two auxiliary functions for memory_bm_next_pfn */ 512/* Two auxiliary functions for memory_bm_next_pfn */
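
Factoring the position lookup into memory_bm_find_bit() leaves set/clear/test as thin wrappers sharing the cur.zone_bm/cur_block cache; usage sketch:

memory_bm_set_bit(bm, pfn);
if (memory_bm_test_bit(bm, pfn))
        memory_bm_clear_bit(bm, pfn);

The error handling also changes character: an out-of-range pfn is now a BUG_ON() rather than a quiet -EINVAL, on the grounds that callers only pass pfns the bitmap was sized for.
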
@@ -564,6 +589,199 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
564} 589}
565 590
566/** 591/**
592 * This structure represents a range of page frames the contents of which
593 * should not be saved during the suspend.
594 */
595
596struct nosave_region {
597 struct list_head list;
598 unsigned long start_pfn;
599 unsigned long end_pfn;
600};
601
602static LIST_HEAD(nosave_regions);
603
604/**
605 * register_nosave_region - register a range of page frames the contents
606 * of which should not be saved during the suspend (to be used in the early
607 * initialization code)
608 */
609
610void __init
611register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
612{
613 struct nosave_region *region;
614
615 if (start_pfn >= end_pfn)
616 return;
617
618 if (!list_empty(&nosave_regions)) {
619 /* Try to extend the previous region (they should be sorted) */
620 region = list_entry(nosave_regions.prev,
621 struct nosave_region, list);
622 if (region->end_pfn == start_pfn) {
623 region->end_pfn = end_pfn;
624 goto Report;
625 }
626 }
627 /* This allocation cannot fail */
628 region = alloc_bootmem_low(sizeof(struct nosave_region));
629 region->start_pfn = start_pfn;
630 region->end_pfn = end_pfn;
631 list_add_tail(&region->list, &nosave_regions);
632 Report:
633 printk("swsusp: Registered nosave memory region: %016lx - %016lx\n",
634 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
635}
636
637/*
638 * Set bits in this map correspond to the page frames the contents of which
639 * should not be saved during the suspend.
640 */
641static struct memory_bitmap *forbidden_pages_map;
642
643/* Set bits in this map correspond to free page frames. */
644static struct memory_bitmap *free_pages_map;
645
646/*
647 * Each page frame allocated for creating the image is marked by setting the
648 * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
649 */
650
651void swsusp_set_page_free(struct page *page)
652{
653 if (free_pages_map)
654 memory_bm_set_bit(free_pages_map, page_to_pfn(page));
655}
656
657static int swsusp_page_is_free(struct page *page)
658{
659 return free_pages_map ?
660 memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
661}
662
663void swsusp_unset_page_free(struct page *page)
664{
665 if (free_pages_map)
666 memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
667}
668
669static void swsusp_set_page_forbidden(struct page *page)
670{
671 if (forbidden_pages_map)
672 memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
673}
674
675int swsusp_page_is_forbidden(struct page *page)
676{
677 return forbidden_pages_map ?
678 memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
679}
680
681static void swsusp_unset_page_forbidden(struct page *page)
682{
683 if (forbidden_pages_map)
684 memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
685}
686
687/**
688 * mark_nosave_pages - set bits corresponding to the page frames the
689 * contents of which should not be saved in a given bitmap.
690 */
691
692static void mark_nosave_pages(struct memory_bitmap *bm)
693{
694 struct nosave_region *region;
695
696 if (list_empty(&nosave_regions))
697 return;
698
699 list_for_each_entry(region, &nosave_regions, list) {
700 unsigned long pfn;
701
702 printk("swsusp: Marking nosave pages: %016lx - %016lx\n",
703 region->start_pfn << PAGE_SHIFT,
704 region->end_pfn << PAGE_SHIFT);
705
706 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
707 memory_bm_set_bit(bm, pfn);
708 }
709}
710
711/**
712 * create_basic_memory_bitmaps - create bitmaps needed for marking page
713 * frames that should not be saved and free page frames. The pointers
714 * forbidden_pages_map and free_pages_map are only modified if everything
715 * goes well, because we don't want the bits to be used before both bitmaps
716 * are set up.
717 */
718
719int create_basic_memory_bitmaps(void)
720{
721 struct memory_bitmap *bm1, *bm2;
722 int error = 0;
723
724 BUG_ON(forbidden_pages_map || free_pages_map);
725
726 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
727 if (!bm1)
728 return -ENOMEM;
729
730 error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
731 if (error)
732 goto Free_first_object;
733
734 bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
735 if (!bm2)
736 goto Free_first_bitmap;
737
738 error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
739 if (error)
740 goto Free_second_object;
741
742 forbidden_pages_map = bm1;
743 free_pages_map = bm2;
744 mark_nosave_pages(forbidden_pages_map);
745
746 printk("swsusp: Basic memory bitmaps created\n");
747
748 return 0;
749
750 Free_second_object:
751 kfree(bm2);
752 Free_first_bitmap:
753 memory_bm_free(bm1, PG_UNSAFE_CLEAR);
754 Free_first_object:
755 kfree(bm1);
756 return -ENOMEM;
757}
758
759/**
760 * free_basic_memory_bitmaps - free memory bitmaps allocated by
761 * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
762 * so that the bitmaps themselves are not referred to while they are being
763 * freed.
764 */
765
766void free_basic_memory_bitmaps(void)
767{
768 struct memory_bitmap *bm1, *bm2;
769
770 BUG_ON(!(forbidden_pages_map && free_pages_map));
771
772 bm1 = forbidden_pages_map;
773 bm2 = free_pages_map;
774 forbidden_pages_map = NULL;
775 free_pages_map = NULL;
776 memory_bm_free(bm1, PG_UNSAFE_CLEAR);
777 kfree(bm1);
778 memory_bm_free(bm2, PG_UNSAFE_CLEAR);
779 kfree(bm2);
780
781 printk("swsusp: Basic memory bitmaps freed\n");
782}
783
784/**
567 * snapshot_additional_pages - estimate the number of additional pages 785 * snapshot_additional_pages - estimate the number of additional pages
568 * needed for setting up the suspend image data structures for given 786
569 * zone (usually the returned value is greater than the exact number) 787 * zone (usually the returned value is greater than the exact number)
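
register_nosave_region() is meant for early platform code, while the bootmem allocator is still live (hence alloc_bootmem_low() and the __init marking). A usage sketch with a made-up physical range; adjacent registrations merge into the previous region, so marking a window page-by-page stays cheap:

/* illustrative only: exclude a firmware-reserved window from the image */
register_nosave_region(PFN_DOWN(0x000a0000), PFN_UP(0x00100000));
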
@@ -615,7 +833,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
615 833
616 BUG_ON(!PageHighMem(page)); 834 BUG_ON(!PageHighMem(page));
617 835
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) 836 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
837 PageReserved(page))
619 return NULL; 838 return NULL;
620 839
621 return page; 840 return page;
@@ -651,17 +870,6 @@ static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */ 870#endif /* CONFIG_HIGHMEM */
652 871
653/** 872/**
654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
655 */
656
657static inline int pfn_is_nosave(unsigned long pfn)
658{
659 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
660 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
661 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
662}
663
664/**
665 * saveable - Determine whether a non-highmem page should be included in 873 * saveable - Determine whether a non-highmem page should be included in
666 * the suspend image. 874 * the suspend image.
667 * 875 *
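
pfn_is_nosave() is no longer defined here; the extern declaration added to power.h hands it to the architecture, which can point it at its own linker-script symbols. The expected per-arch implementation is essentially the deleted body:

int pfn_is_nosave(unsigned long pfn)
{
        unsigned long begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
        unsigned long end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;

        return (pfn >= begin_pfn) && (pfn < end_pfn);
}
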
@@ -681,7 +889,7 @@ static struct page *saveable_page(unsigned long pfn)
681 889
682 BUG_ON(PageHighMem(page)); 890 BUG_ON(PageHighMem(page));
683 891
684 if (PageNosave(page) || PageNosaveFree(page)) 892 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
685 return NULL; 893 return NULL;
686 894
687 if (PageReserved(page) && pfn_is_nosave(pfn)) 895 if (PageReserved(page) && pfn_is_nosave(pfn))
@@ -821,9 +1029,10 @@ void swsusp_free(void)
821 if (pfn_valid(pfn)) { 1029 if (pfn_valid(pfn)) {
822 struct page *page = pfn_to_page(pfn); 1030 struct page *page = pfn_to_page(pfn);
823 1031
824 if (PageNosave(page) && PageNosaveFree(page)) { 1032 if (swsusp_page_is_forbidden(page) &&
825 ClearPageNosave(page); 1033 swsusp_page_is_free(page)) {
826 ClearPageNosaveFree(page); 1034 swsusp_unset_page_forbidden(page);
1035 swsusp_unset_page_free(page);
827 __free_page(page); 1036 __free_page(page);
828 } 1037 }
829 } 1038 }
@@ -1146,7 +1355,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1355 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1356 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1148 if (pfn_valid(pfn)) 1357 if (pfn_valid(pfn))
1149 ClearPageNosaveFree(pfn_to_page(pfn)); 1358 swsusp_unset_page_free(pfn_to_page(pfn));
1150 } 1359 }
1151 1360
1152 /* Mark pages that correspond to the "original" pfns as "unsafe" */ 1361 /* Mark pages that correspond to the "original" pfns as "unsafe" */
@@ -1155,7 +1364,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1155 pfn = memory_bm_next_pfn(bm); 1364 pfn = memory_bm_next_pfn(bm);
1156 if (likely(pfn != BM_END_OF_MAP)) { 1365 if (likely(pfn != BM_END_OF_MAP)) {
1157 if (likely(pfn_valid(pfn))) 1366 if (likely(pfn_valid(pfn)))
1158 SetPageNosaveFree(pfn_to_page(pfn)); 1367 swsusp_set_page_free(pfn_to_page(pfn));
1159 else 1368 else
1160 return -EFAULT; 1369 return -EFAULT;
1161 } 1370 }
@@ -1321,14 +1530,14 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1321 struct page *page; 1530 struct page *page;
1322 1531
1323 page = alloc_page(__GFP_HIGHMEM); 1532 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) { 1533 if (!swsusp_page_is_free(page)) {
1325 /* The page is "safe", set its bit in the bitmap */ 1534
1326 memory_bm_set_bit(bm, page_to_pfn(page)); 1535 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++; 1536 safe_highmem_pages++;
1328 } 1537 }
1329 /* Mark the page as allocated */ 1538 /* Mark the page as allocated */
1330 SetPageNosave(page); 1539 swsusp_set_page_forbidden(page);
1331 SetPageNosaveFree(page); 1540 swsusp_set_page_free(page);
1332 } 1541 }
1333 memory_bm_position_reset(bm); 1542 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm; 1543 safe_highmem_bm = bm;
@@ -1360,7 +1569,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1360 struct highmem_pbe *pbe; 1569 struct highmem_pbe *pbe;
1361 void *kaddr; 1570 void *kaddr;
1362 1571
1363 if (PageNosave(page) && PageNosaveFree(page)) { 1572 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
1364 /* We have allocated the "original" page frame and we can 1573 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page. 1574 * use it directly to store the loaded page.
1366 */ 1575 */
@@ -1522,14 +1731,14 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1522 error = -ENOMEM; 1731 error = -ENOMEM;
1523 goto Free; 1732 goto Free;
1524 } 1733 }
1525 if (!PageNosaveFree(virt_to_page(lp))) { 1734 if (!swsusp_page_is_free(virt_to_page(lp))) {
1526 /* The page is "safe", add it to the list */ 1735 /* The page is "safe", add it to the list */
1527 lp->next = safe_pages_list; 1736 lp->next = safe_pages_list;
1528 safe_pages_list = lp; 1737 safe_pages_list = lp;
1529 } 1738 }
1530 /* Mark the page as allocated */ 1739 /* Mark the page as allocated */
1531 SetPageNosave(virt_to_page(lp)); 1740 swsusp_set_page_forbidden(virt_to_page(lp));
1532 SetPageNosaveFree(virt_to_page(lp)); 1741 swsusp_set_page_free(virt_to_page(lp));
1533 nr_pages--; 1742 nr_pages--;
1534 } 1743 }
1535 /* Free the reserved safe pages so that chain_alloc() can use them */ 1744 /* Free the reserved safe pages so that chain_alloc() can use them */
@@ -1558,7 +1767,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1558 if (PageHighMem(page)) 1767 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca); 1768 return get_highmem_page_buffer(page, ca);
1560 1769
1561 if (PageNosave(page) && PageNosaveFree(page)) 1770 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
1562 /* We have allocated the "original" page frame and we can 1771 /* We have allocated the "original" page frame and we can
1563 * use it directly to store the loaded page. 1772 * use it directly to store the loaded page.
1564 */ 1773 */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3581f8f86a..e83ed9945a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -33,12 +33,14 @@ extern char resume_file[];
33 33
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 sector_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed));
42
43static struct swsusp_header *swsusp_header;
42 44
43/* 45/*
44 * General things 46 * General things
@@ -141,14 +143,14 @@ static int mark_swapfiles(sector_t start)
141{ 143{
142 int error; 144 int error;
143 145
144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL); 146 bio_read_page(swsusp_resume_block, swsusp_header, NULL);
145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 147 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 148 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 149 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 150 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
149 swsusp_header.image = start; 151 swsusp_header->image = start;
150 error = bio_write_page(swsusp_resume_block, 152 error = bio_write_page(swsusp_resume_block,
151 &swsusp_header, NULL); 153 swsusp_header, NULL);
152 } else { 154 } else {
153 printk(KERN_ERR "swsusp: Swap header not found!\n"); 155 printk(KERN_ERR "swsusp: Swap header not found!\n");
154 error = -ENODEV; 156 error = -ENODEV;
@@ -241,7 +243,6 @@ struct swap_map_page {
241struct swap_map_handle { 243struct swap_map_handle {
242 struct swap_map_page *cur; 244 struct swap_map_page *cur;
243 sector_t cur_swap; 245 sector_t cur_swap;
244 struct bitmap_page *bitmap;
245 unsigned int k; 246 unsigned int k;
246}; 247};
247 248
@@ -250,9 +251,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
250 if (handle->cur) 251 if (handle->cur)
251 free_page((unsigned long)handle->cur); 252 free_page((unsigned long)handle->cur);
252 handle->cur = NULL; 253 handle->cur = NULL;
253 if (handle->bitmap)
254 free_bitmap(handle->bitmap);
255 handle->bitmap = NULL;
256} 254}
257 255
258static int get_swap_writer(struct swap_map_handle *handle) 256static int get_swap_writer(struct swap_map_handle *handle)
@@ -260,12 +258,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
260 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 258 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
261 if (!handle->cur) 259 if (!handle->cur)
262 return -ENOMEM; 260 return -ENOMEM;
263 handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); 261 handle->cur_swap = alloc_swapdev_block(root_swap);
264 if (!handle->bitmap) {
265 release_swap_writer(handle);
266 return -ENOMEM;
267 }
268 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
269 if (!handle->cur_swap) { 262 if (!handle->cur_swap) {
270 release_swap_writer(handle); 263 release_swap_writer(handle);
271 return -ENOSPC; 264 return -ENOSPC;
@@ -282,7 +275,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
282 275
283 if (!handle->cur) 276 if (!handle->cur)
284 return -EINVAL; 277 return -EINVAL;
285 offset = alloc_swapdev_block(root_swap, handle->bitmap); 278 offset = alloc_swapdev_block(root_swap);
286 error = write_page(buf, offset, bio_chain); 279 error = write_page(buf, offset, bio_chain);
287 if (error) 280 if (error)
288 return error; 281 return error;
@@ -291,7 +284,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
291 error = wait_on_bio_chain(bio_chain); 284 error = wait_on_bio_chain(bio_chain);
292 if (error) 285 if (error)
293 goto out; 286 goto out;
294 offset = alloc_swapdev_block(root_swap, handle->bitmap); 287 offset = alloc_swapdev_block(root_swap);
295 if (!offset) 288 if (!offset)
296 return -ENOSPC; 289 return -ENOSPC;
297 handle->cur->next_swap = offset; 290 handle->cur->next_swap = offset;
@@ -428,7 +421,8 @@ int swsusp_write(void)
428 } 421 }
429 } 422 }
430 if (error) 423 if (error)
431 free_all_swap_pages(root_swap, handle.bitmap); 424 free_all_swap_pages(root_swap);
425
432 release_swap_writer(&handle); 426 release_swap_writer(&handle);
433 out: 427 out:
434 swsusp_close(); 428 swsusp_close();
@@ -564,7 +558,7 @@ int swsusp_read(void)
564 if (error < PAGE_SIZE) 558 if (error < PAGE_SIZE)
565 return error < 0 ? error : -EFAULT; 559 return error < 0 ? error : -EFAULT;
566 header = (struct swsusp_info *)data_of(snapshot); 560 header = (struct swsusp_info *)data_of(snapshot);
567 error = get_swap_reader(&handle, swsusp_header.image); 561 error = get_swap_reader(&handle, swsusp_header->image);
568 if (!error) 562 if (!error)
569 error = swap_read_page(&handle, header, NULL); 563 error = swap_read_page(&handle, header, NULL);
570 if (!error) 564 if (!error)
@@ -591,17 +585,17 @@ int swsusp_check(void)
591 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 585 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
592 if (!IS_ERR(resume_bdev)) { 586 if (!IS_ERR(resume_bdev)) {
593 set_blocksize(resume_bdev, PAGE_SIZE); 587 set_blocksize(resume_bdev, PAGE_SIZE);
594 memset(&swsusp_header, 0, sizeof(swsusp_header)); 588 memset(swsusp_header, 0, PAGE_SIZE);
595 error = bio_read_page(swsusp_resume_block, 589 error = bio_read_page(swsusp_resume_block,
596 &swsusp_header, NULL); 590 swsusp_header, NULL);
597 if (error) 591 if (error)
598 return error; 592 return error;
599 593
600 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 594 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
601 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 595 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
602 /* Reset swap signature now */ 596 /* Reset swap signature now */
603 error = bio_write_page(swsusp_resume_block, 597 error = bio_write_page(swsusp_resume_block,
604 &swsusp_header, NULL); 598 swsusp_header, NULL);
605 } else { 599 } else {
606 return -EINVAL; 600 return -EINVAL;
607 } 601 }
@@ -632,3 +626,13 @@ void swsusp_close(void)
632 626
633 blkdev_put(resume_bdev); 627 blkdev_put(resume_bdev);
634} 628}
629
630static int swsusp_header_init(void)
631{
632 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
633 if (!swsusp_header)
634 panic("Could not allocate memory for swsusp_header\n");
635 return 0;
636}
637
638core_initcall(swsusp_header_init);
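
With the static instance gone, page alignment now comes from __get_free_page() itself, so the aligned(PAGE_SIZE) attribute is no longer needed; reserved[] still pads the signature fields to the very end of the page. A compile-time check one could add to document the layout (not part of the patch, purely illustrative):

static int __init swsusp_header_layout_check(void)
{
        /* the packed struct must fill exactly one page */
        BUILD_BUG_ON(sizeof(struct swsusp_header) != PAGE_SIZE);
        return 0;
}
core_initcall(swsusp_header_layout_check);
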
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 7fb834397a..5da304c8f1 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -50,6 +50,7 @@
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h> 52#include <linux/time.h>
53#include <linux/rbtree.h>
53 54
54#include "power.h" 55#include "power.h"
55 56
@@ -74,72 +75,69 @@ static inline unsigned int count_highmem_pages(void) { return 0; }
74/** 75/**
75 * The following functions are used for tracing the allocated 76 * The following functions are used for tracing the allocated
76 * swap pages, so that they can be freed in case of an error. 77 * swap pages, so that they can be freed in case of an error.
77 *
78 * The functions operate on a linked bitmap structure defined
79 * in power.h
80 */ 78 */
81 79
82void free_bitmap(struct bitmap_page *bitmap) 80struct swsusp_extent {
83{ 81 struct rb_node node;
84 struct bitmap_page *bp; 82 unsigned long start;
83 unsigned long end;
84};
85 85
86 while (bitmap) { 86static struct rb_root swsusp_extents = RB_ROOT;
87 bp = bitmap->next;
88 free_page((unsigned long)bitmap);
89 bitmap = bp;
90 }
91}
92 87
93struct bitmap_page *alloc_bitmap(unsigned int nr_bits) 88static int swsusp_extents_insert(unsigned long swap_offset)
94{ 89{
95 struct bitmap_page *bitmap, *bp; 90 struct rb_node **new = &(swsusp_extents.rb_node);
96 unsigned int n; 91 struct rb_node *parent = NULL;
97 92 struct swsusp_extent *ext;
98 if (!nr_bits) 93
99 return NULL; 94 /* Figure out where to put the new node */
100 95 while (*new) {
101 bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); 96 ext = container_of(*new, struct swsusp_extent, node);
102 bp = bitmap; 97 parent = *new;
103 for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { 98 if (swap_offset < ext->start) {
104 bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); 99 /* Try to merge */
105 bp = bp->next; 100 if (swap_offset == ext->start - 1) {
106 if (!bp) { 101 ext->start--;
107 free_bitmap(bitmap); 102 return 0;
108 return NULL; 103 }
104 new = &((*new)->rb_left);
105 } else if (swap_offset > ext->end) {
106 /* Try to merge */
107 if (swap_offset == ext->end + 1) {
108 ext->end++;
109 return 0;
110 }
111 new = &((*new)->rb_right);
112 } else {
113 /* It already is in the tree */
114 return -EINVAL;
109 } 115 }
110 } 116 }
111 return bitmap; 117 /* Add the new node and rebalance the tree. */
112} 118 ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
113 119 if (!ext)
114static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) 120 return -ENOMEM;
115{ 121
116 unsigned int n; 122 ext->start = swap_offset;
117 123 ext->end = swap_offset;
118 n = BITMAP_PAGE_BITS; 124 rb_link_node(&ext->node, parent, new);
119 while (bitmap && n <= bit) { 125 rb_insert_color(&ext->node, &swsusp_extents);
120 n += BITMAP_PAGE_BITS;
121 bitmap = bitmap->next;
122 }
123 if (!bitmap)
124 return -EINVAL;
125 n -= BITMAP_PAGE_BITS;
126 bit -= n;
127 n = 0;
128 while (bit >= BITS_PER_CHUNK) {
129 bit -= BITS_PER_CHUNK;
130 n++;
131 }
132 bitmap->chunks[n] |= (1UL << bit);
133 return 0; 126 return 0;
134} 127}
135 128
136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) 129/**
130 * alloc_swapdev_block - allocate a swap page and register that it has
131 * been allocated, so that it can be freed in case of an error.
132 */
133
134sector_t alloc_swapdev_block(int swap)
137{ 135{
138 unsigned long offset; 136 unsigned long offset;
139 137
140 offset = swp_offset(get_swap_page_of_type(swap)); 138 offset = swp_offset(get_swap_page_of_type(swap));
141 if (offset) { 139 if (offset) {
142 if (bitmap_set(bitmap, offset)) 140 if (swsusp_extents_insert(offset))
143 swap_free(swp_entry(swap, offset)); 141 swap_free(swp_entry(swap, offset));
144 else 142 else
145 return swapdev_block(swap, offset); 143 return swapdev_block(swap, offset);
@@ -147,23 +145,34 @@ sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
147 return 0; 145 return 0;
148} 146}
149 147
150void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 148/**
149 * free_all_swap_pages - free swap pages allocated for saving image data.
150 * It also frees the extents used to register which swap entries had been
151 * allocated.
152 */
153
154void free_all_swap_pages(int swap)
151{ 155{
152 unsigned int bit, n; 156 struct rb_node *node;
153 unsigned long test; 157
154 158 while ((node = swsusp_extents.rb_node)) {
155 bit = 0; 159 struct swsusp_extent *ext;
156 while (bitmap) { 160 unsigned long offset;
157 for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) 161
158 for (test = 1UL; test; test <<= 1) { 162 ext = container_of(node, struct swsusp_extent, node);
159 if (bitmap->chunks[n] & test) 163 rb_erase(node, &swsusp_extents);
160 swap_free(swp_entry(swap, bit)); 164 for (offset = ext->start; offset <= ext->end; offset++)
161 bit++; 165 swap_free(swp_entry(swap, offset));
162 } 166
163 bitmap = bitmap->next; 167 kfree(ext);
164 } 168 }
165} 169}
166 170
171int swsusp_swap_in_use(void)
172{
173 return (swsusp_extents.rb_node != NULL);
174}
175
167/** 176/**
168 * swsusp_show_speed - print the time elapsed between two events represented by 177 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop 178 * @start and @stop
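
Behavior sketch for the extent tree: consecutive offsets merge into a single node, so the mostly-contiguous allocations of an image cost a few small kzalloc()s instead of one bitmap page per roughly 32k swap pages, and duplicates are detected for free:

swsusp_extents_insert(10);      /* new extent [10, 10] */
swsusp_extents_insert(11);      /* merges upward: [10, 11] */
swsusp_extents_insert(12);      /* merges upward: [10, 12] */
swsusp_extents_insert(11);      /* already covered: returns -EINVAL */
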
@@ -224,18 +233,18 @@ int swsusp_shrink_memory(void)
224 long size, highmem_size; 233 long size, highmem_size;
225 234
226 highmem_size = count_highmem_pages(); 235 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO; 236 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
228 tmp = size; 237 tmp = size;
229 size += highmem_size; 238 size += highmem_size;
230 for_each_zone (zone) 239 for_each_zone (zone)
231 if (populated_zone(zone)) { 240 if (populated_zone(zone)) {
241 tmp += snapshot_additional_pages(zone);
232 if (is_highmem(zone)) { 242 if (is_highmem(zone)) {
233 highmem_size -= 243 highmem_size -=
234 zone_page_state(zone, NR_FREE_PAGES); 244 zone_page_state(zone, NR_FREE_PAGES);
235 } else { 245 } else {
236 tmp -= zone_page_state(zone, NR_FREE_PAGES); 246 tmp -= zone_page_state(zone, NR_FREE_PAGES);
237 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 247 tmp += zone->lowmem_reserve[ZONE_NORMAL];
238 tmp += snapshot_additional_pages(zone);
239 } 248 }
240 } 249 }
241 250
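
Worked numbers for the new reserve, assuming 4 KB pages (PAGE_SHIFT == 12):

PAGES_FOR_IO = (4096 * 1024) >> 12 = 1024 pages (4 MB)
SPARE_PAGES  = (1024 * 1024) >> 12 =  256 pages (1 MB)

so swsusp_shrink_memory() now aims to leave an extra megabyte of headroom for driver .suspend() allocations, and snapshot_additional_pages() is counted for highmem zones as well as lowmem.
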
diff --git a/kernel/power/user.c b/kernel/power/user.c
index dd09efe7df..040560d9c3 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -33,25 +33,29 @@
33static struct snapshot_data { 33static struct snapshot_data {
34 struct snapshot_handle handle; 34 struct snapshot_handle handle;
35 int swap; 35 int swap;
36 struct bitmap_page *bitmap;
37 int mode; 36 int mode;
38 char frozen; 37 char frozen;
39 char ready; 38 char ready;
40 char platform_suspend; 39 char platform_suspend;
41} snapshot_state; 40} snapshot_state;
42 41
43static atomic_t device_available = ATOMIC_INIT(1); 42atomic_t snapshot_device_available = ATOMIC_INIT(1);
44 43
45static int snapshot_open(struct inode *inode, struct file *filp) 44static int snapshot_open(struct inode *inode, struct file *filp)
46{ 45{
47 struct snapshot_data *data; 46 struct snapshot_data *data;
48 47
49 if (!atomic_add_unless(&device_available, -1, 0)) 48 if (!atomic_add_unless(&snapshot_device_available, -1, 0))
50 return -EBUSY; 49 return -EBUSY;
51 50
52 if ((filp->f_flags & O_ACCMODE) == O_RDWR) 51 if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
52 atomic_inc(&snapshot_device_available);
53 return -ENOSYS; 53 return -ENOSYS;
54 54 }
55 if (create_basic_memory_bitmaps()) {
56 atomic_inc(&snapshot_device_available);
57 return -ENOMEM;
58 }
55 nonseekable_open(inode, filp); 59 nonseekable_open(inode, filp);
56 data = &snapshot_state; 60 data = &snapshot_state;
57 filp->private_data = data; 61 filp->private_data = data;
@@ -64,7 +68,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
64 data->swap = -1; 68 data->swap = -1;
65 data->mode = O_WRONLY; 69 data->mode = O_WRONLY;
66 } 70 }
67 data->bitmap = NULL;
68 data->frozen = 0; 71 data->frozen = 0;
69 data->ready = 0; 72 data->ready = 0;
70 data->platform_suspend = 0; 73 data->platform_suspend = 0;
@@ -77,16 +80,15 @@ static int snapshot_release(struct inode *inode, struct file *filp)
77 struct snapshot_data *data; 80 struct snapshot_data *data;
78 81
79 swsusp_free(); 82 swsusp_free();
83 free_basic_memory_bitmaps();
80 data = filp->private_data; 84 data = filp->private_data;
81 free_all_swap_pages(data->swap, data->bitmap); 85 free_all_swap_pages(data->swap);
82 free_bitmap(data->bitmap);
83 if (data->frozen) { 86 if (data->frozen) {
84 mutex_lock(&pm_mutex); 87 mutex_lock(&pm_mutex);
85 thaw_processes(); 88 thaw_processes();
86 enable_nonboot_cpus();
87 mutex_unlock(&pm_mutex); 89 mutex_unlock(&pm_mutex);
88 } 90 }
89 atomic_inc(&device_available); 91 atomic_inc(&snapshot_device_available);
90 return 0; 92 return 0;
91} 93}
92 94
@@ -294,14 +296,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
294 error = -ENODEV; 296 error = -ENODEV;
295 break; 297 break;
296 } 298 }
297 if (!data->bitmap) { 299 offset = alloc_swapdev_block(data->swap);
298 data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
299 if (!data->bitmap) {
300 error = -ENOMEM;
301 break;
302 }
303 }
304 offset = alloc_swapdev_block(data->swap, data->bitmap);
305 if (offset) { 300 if (offset) {
306 offset <<= PAGE_SHIFT; 301 offset <<= PAGE_SHIFT;
307 error = put_user(offset, (sector_t __user *)arg); 302 error = put_user(offset, (sector_t __user *)arg);
@@ -315,13 +310,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
315 error = -ENODEV; 310 error = -ENODEV;
316 break; 311 break;
317 } 312 }
318 free_all_swap_pages(data->swap, data->bitmap); 313 free_all_swap_pages(data->swap);
319 free_bitmap(data->bitmap);
320 data->bitmap = NULL;
321 break; 314 break;
322 315
323 case SNAPSHOT_SET_SWAP_FILE: 316 case SNAPSHOT_SET_SWAP_FILE:
324 if (!data->bitmap) { 317 if (!swsusp_swap_in_use()) {
325 /* 318 /*
326 * User space encodes device types as two-byte values, 319 * User space encodes device types as two-byte values,
327 * so we need to recode them 320 * so we need to recode them
@@ -368,9 +361,12 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
368 if (error) { 361 if (error) {
369 printk(KERN_ERR "Failed to suspend some devices.\n"); 362 printk(KERN_ERR "Failed to suspend some devices.\n");
370 } else { 363 } else {
371 /* Enter S3, system is already frozen */ 364 error = disable_nonboot_cpus();
372 suspend_enter(PM_SUSPEND_MEM); 365 if (!error) {
373 366 /* Enter S3, system is already frozen */
367 suspend_enter(PM_SUSPEND_MEM);
368 enable_nonboot_cpus();
369 }
374 /* Wake up devices */ 370 /* Wake up devices */
375 device_resume(); 371 device_resume();
376 } 372 }
@@ -417,7 +413,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
417 break; 413 break;
418 414
419 case SNAPSHOT_SET_SWAP_AREA: 415 case SNAPSHOT_SET_SWAP_AREA:
420 if (data->bitmap) { 416 if (swsusp_swap_in_use()) {
421 error = -EPERM; 417 error = -EPERM;
422 } else { 418 } else {
423 struct resume_swap_area swap_area; 419 struct resume_swap_area swap_area;
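
The open/release protocol around snapshot_device_available is a lock-free single-open guard: the counter starts at 1, atomic_add_unless(..., -1, 0) decrements only while it is non-zero, so exactly one opener wins, and every exit path (including the new error paths above) must give the slot back:

if (!atomic_add_unless(&snapshot_device_available, -1, 0))
        return -EBUSY;
/* ... */
atomic_inc(&snapshot_device_available); /* on release or on failure */
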
diff --git a/kernel/resource.c b/kernel/resource.c
index bdb55a33f9..9bd14fd3e6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -213,27 +213,6 @@ int request_resource(struct resource *root, struct resource *new)
213EXPORT_SYMBOL(request_resource); 213EXPORT_SYMBOL(request_resource);
214 214
215/** 215/**
216 * ____request_resource - reserve a resource, with resource conflict returned
217 * @root: root resource descriptor
218 * @new: resource descriptor desired by caller
219 *
220 * Returns:
221 * On success, NULL is returned.
222 * On error, a pointer to the conflicting resource is returned.
223 */
224struct resource *____request_resource(struct resource *root, struct resource *new)
225{
226 struct resource *conflict;
227
228 write_lock(&resource_lock);
229 conflict = __request_resource(root, new);
230 write_unlock(&resource_lock);
231 return conflict;
232}
233
234EXPORT_SYMBOL(____request_resource);
235
236/**
237 * release_resource - release a previously reserved resource 216 * release_resource - release a previously reserved resource
238 * @old: resource pointer 217 * @old: resource pointer
239 */ 218 */
diff --git a/kernel/sched.c b/kernel/sched.c
index a4ca632c47..0227f1625a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4687,32 +4687,10 @@ out_unlock:
4687 return retval; 4687 return retval;
4688} 4688}
4689 4689
4690static inline struct task_struct *eldest_child(struct task_struct *p)
4691{
4692 if (list_empty(&p->children))
4693 return NULL;
4694 return list_entry(p->children.next,struct task_struct,sibling);
4695}
4696
4697static inline struct task_struct *older_sibling(struct task_struct *p)
4698{
4699 if (p->sibling.prev==&p->parent->children)
4700 return NULL;
4701 return list_entry(p->sibling.prev,struct task_struct,sibling);
4702}
4703
4704static inline struct task_struct *younger_sibling(struct task_struct *p)
4705{
4706 if (p->sibling.next==&p->parent->children)
4707 return NULL;
4708 return list_entry(p->sibling.next,struct task_struct,sibling);
4709}
4710
4711static const char stat_nam[] = "RSDTtZX"; 4690static const char stat_nam[] = "RSDTtZX";
4712 4691
4713static void show_task(struct task_struct *p) 4692static void show_task(struct task_struct *p)
4714{ 4693{
4715 struct task_struct *relative;
4716 unsigned long free = 0; 4694 unsigned long free = 0;
4717 unsigned state; 4695 unsigned state;
4718 4696
@@ -4738,19 +4716,7 @@ static void show_task(struct task_struct *p)
4738 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4716 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4739 } 4717 }
4740#endif 4718#endif
4741 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 4719 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4742 if ((relative = eldest_child(p)))
4743 printk("%5d ", relative->pid);
4744 else
4745 printk(" ");
4746 if ((relative = younger_sibling(p)))
4747 printk("%7d", relative->pid);
4748 else
4749 printk(" ");
4750 if ((relative = older_sibling(p)))
4751 printk(" %5d", relative->pid);
4752 else
4753 printk(" ");
4754 if (!p->mm) 4720 if (!p->mm)
4755 printk(" (L-TLB)\n"); 4721 printk(" (L-TLB)\n");
4756 else 4722 else
@@ -4780,7 +4746,7 @@ void show_state_filter(unsigned long state_filter)
4780 * console might take a lot of time: 4746
4781 */ 4747 */
4782 touch_nmi_watchdog(); 4748 touch_nmi_watchdog();
4783 if (p->state & state_filter) 4749 if (!state_filter || (p->state & state_filter))
4784 show_task(p); 4750 show_task(p);
4785 } while_each_thread(g, p); 4751 } while_each_thread(g, p);
4786 4752
@@ -5278,6 +5244,11 @@ int __init migration_init(void)
5278#endif 5244#endif
5279 5245
5280#ifdef CONFIG_SMP 5246#ifdef CONFIG_SMP
5247
5248/* Number of possible processor ids */
5249int nr_cpu_ids __read_mostly = NR_CPUS;
5250EXPORT_SYMBOL(nr_cpu_ids);
5251
5281#undef SCHED_DOMAIN_DEBUG 5252#undef SCHED_DOMAIN_DEBUG
5282#ifdef SCHED_DOMAIN_DEBUG 5253#ifdef SCHED_DOMAIN_DEBUG
5283static void sched_domain_debug(struct sched_domain *sd, int cpu) 5254static void sched_domain_debug(struct sched_domain *sd, int cpu)
@@ -6760,6 +6731,7 @@ int in_sched_functions(unsigned long addr)
6760void __init sched_init(void) 6731void __init sched_init(void)
6761{ 6732{
6762 int i, j, k; 6733 int i, j, k;
6734 int highest_cpu = 0;
6763 6735
6764 for_each_possible_cpu(i) { 6736 for_each_possible_cpu(i) {
6765 struct prio_array *array; 6737 struct prio_array *array;
@@ -6794,11 +6766,13 @@ void __init sched_init(void)
6794 // delimiter for bitsearch 6766 // delimiter for bitsearch
6795 __set_bit(MAX_PRIO, array->bitmap); 6767 __set_bit(MAX_PRIO, array->bitmap);
6796 } 6768 }
6769 highest_cpu = i;
6797 } 6770 }
6798 6771
6799 set_load_weight(&init_task); 6772 set_load_weight(&init_task);
6800 6773
6801#ifdef CONFIG_SMP 6774#ifdef CONFIG_SMP
6775 nr_cpu_ids = highest_cpu + 1;
6802 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 6776 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6803#endif 6777#endif
6804 6778
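
nr_cpu_ids is one more than the highest possible CPU id, not a count of possible CPUs: with a sparse possible map such as {0, 1, 4} it becomes 5, because per-CPU arrays are indexed by id. The computation as done in sched_init() above, in isolation:

int i, highest_cpu = 0;

for_each_possible_cpu(i)
        highest_cpu = i;        /* iterates in ascending order */
nr_cpu_ids = highest_cpu + 1;
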
diff --git a/kernel/signal.c b/kernel/signal.c
index 3670225ecb..2b4087d545 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2636,9 +2636,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
2636 2636
2637void __init signals_init(void) 2637void __init signals_init(void)
2638{ 2638{
2639 sigqueue_cachep = 2639 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2640 kmem_cache_create("sigqueue",
2641 sizeof(struct sigqueue),
2642 __alignof__(struct sigqueue),
2643 SLAB_PANIC, NULL, NULL);
2644} 2640}
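
KMEM_CACHE() is shorthand for the open-coded call it replaces; in this era of the slab API it expands to roughly the following, naming the cache after the struct and aligning objects to it:

#define KMEM_CACHE(__struct, __flags) \
        kmem_cache_create(#__struct, sizeof(struct __struct), \
                          __alignof__(struct __struct), (__flags), NULL, NULL)
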
diff --git a/kernel/sys.c b/kernel/sys.c
index 123b165080..fe1f3ab204 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -881,7 +881,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
881#ifdef CONFIG_SOFTWARE_SUSPEND 881#ifdef CONFIG_SOFTWARE_SUSPEND
882 case LINUX_REBOOT_CMD_SW_SUSPEND: 882 case LINUX_REBOOT_CMD_SW_SUSPEND:
883 { 883 {
884 int ret = software_suspend(); 884 int ret = pm_suspend(PM_SUSPEND_DISK);
885 unlock_kernel(); 885 unlock_kernel();
886 return ret; 886 return ret;
887 } 887 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1b255df4fc..c904748f22 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1676,7 +1676,7 @@ static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp,
1676{ 1676{
1677 int op; 1677 int op;
1678 1678
1679 if (!capable(CAP_SYS_ADMIN)) 1679 if (write && !capable(CAP_SYS_ADMIN))
1680 return -EPERM; 1680 return -EPERM;
1681 1681
1682 op = OP_OR; 1682 op = OP_OR;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4c3476fa05..906cae7715 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -102,7 +102,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
102 */ 102 */
103static int send_reply(struct sk_buff *skb, pid_t pid) 103static int send_reply(struct sk_buff *skb, pid_t pid)
104{ 104{
105 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 105 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
106 void *reply = genlmsg_data(genlhdr); 106 void *reply = genlmsg_data(genlhdr);
107 int rc; 107 int rc;
108 108
@@ -121,7 +121,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
121static void send_cpu_listeners(struct sk_buff *skb, 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners) 122 struct listener_list *listeners)
123{ 123{
124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
125 struct listener *s, *tmp; 125 struct listener *s, *tmp;
126 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
127 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
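
nlmsg_hdr() in the hunks above is the type-safe accessor for the netlink header sitting at the head of the skb; it is equivalent to the cast it replaces:

static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
{
        return (struct nlmsghdr *)skb->data;
}
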
@@ -524,9 +524,7 @@ void __init taskstats_init_early(void)
524{ 524{
525 unsigned int i; 525 unsigned int i;
526 526
527 taskstats_cache = kmem_cache_create("taskstats_cache", 527 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
528 sizeof(struct taskstats),
529 0, SLAB_PANIC, NULL, NULL);
530 for_each_possible_cpu(i) { 528 for_each_possible_cpu(i) {
531 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 529 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
532 init_rwsem(&(per_cpu(listener_array, i).sem)); 530 init_rwsem(&(per_cpu(listener_array, i).sem));
diff --git a/kernel/time.c b/kernel/time.c
index c6c80ea5d0..ba18ec4899 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -452,6 +452,7 @@ struct timespec ns_to_timespec(const s64 nsec)
452 452
453 return ts; 453 return ts;
454} 454}
455EXPORT_SYMBOL(ns_to_timespec);
455 456
456/** 457/**
457 * ns_to_timeval - Convert nanoseconds to timeval 458 * ns_to_timeval - Convert nanoseconds to timeval
@@ -469,6 +470,7 @@ struct timeval ns_to_timeval(const s64 nsec)
469 470
470 return tv; 471 return tv;
471} 472}
473EXPORT_SYMBOL(ns_to_timeval);
472 474
473/* 475/*
474 * Convert jiffies to milliseconds and back. 476 * Convert jiffies to milliseconds and back.
@@ -635,6 +637,7 @@ timeval_to_jiffies(const struct timeval *value)
635 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> 637 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
636 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; 638 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
637} 639}
640EXPORT_SYMBOL(timeval_to_jiffies);
638 641
639void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) 642void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
640{ 643{
@@ -649,6 +652,7 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
649 tv_usec /= NSEC_PER_USEC; 652 tv_usec /= NSEC_PER_USEC;
650 value->tv_usec = tv_usec; 653 value->tv_usec = tv_usec;
651} 654}
655EXPORT_SYMBOL(jiffies_to_timeval);
652 656
653/* 657/*
654 * Convert jiffies/jiffies_64 to clock_t and back. 658 * Convert jiffies/jiffies_64 to clock_t and back.
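
The new exports make the time-conversion helpers callable from modules; usage sketch (the nanosecond value is arbitrary):

s64 ns = 1500000000LL;                          /* 1.5 s */
struct timespec ts = ns_to_timespec(ns);        /* { 1, 500000000 } */
struct timeval tv = ns_to_timeval(ns);          /* { 1, 500000 } */
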
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 67932ea78c..76212b2a99 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -274,72 +274,3 @@ void clockevents_notify(unsigned long reason, void *arg)
274} 274}
275EXPORT_SYMBOL_GPL(clockevents_notify); 275EXPORT_SYMBOL_GPL(clockevents_notify);
276 276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5b0e46b56f..fe5c7db242 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -151,7 +151,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
151 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 151 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
152 add_timer(&watchdog_timer); 152 add_timer(&watchdog_timer);
153 } 153 }
154 } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { 154 } else {
155 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
155 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 156 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
156 157
157 if (!watchdog || cs->rating > watchdog->rating) { 158 if (!watchdog || cs->rating > watchdog->rating) {
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 3be8da8fed..4c256fdb88 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -69,4 +69,4 @@ static int __init init_jiffies_clocksource(void)
69 return clocksource_register(&clocksource_jiffies); 69 return clocksource_register(&clocksource_jiffies);
70} 70}
71 71
72module_init(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
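
For built-in code, module_init() maps to device_initcall(); moving to core_initcall() promotes the jiffies fallback clocksource from initcall level 6 to level 1, so it is registered before any driver initcall that might need timekeeping. Abridged level map for reference:

/* core_initcall      -> level 1  (now used here)
 * postcore_initcall  -> level 2
 * subsys_initcall    -> level 4
 * device_initcall    -> level 6  (module_init() for built-in code)
 */
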
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index eb12509e00..cb25649c6f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -32,7 +32,7 @@ static u64 tick_length, tick_length_base;
32/* TIME_ERROR prevents overwriting the CMOS clock */ 32/* TIME_ERROR prevents overwriting the CMOS clock */
33static int time_state = TIME_OK; /* clock synchronization status */ 33static int time_state = TIME_OK; /* clock synchronization status */
34int time_status = STA_UNSYNC; /* clock status bits */ 34int time_status = STA_UNSYNC; /* clock status bits */
35static long time_offset; /* time adjustment (ns) */ 35static s64 time_offset; /* time adjustment (ns) */
36static long time_constant = 2; /* pll time constant */ 36static long time_constant = 2; /* pll time constant */
37long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 37long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
38long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 38long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
@@ -196,7 +196,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void)
196 */ 196 */
197int do_adjtimex(struct timex *txc) 197int do_adjtimex(struct timex *txc)
198{ 198{
199 long ltemp, mtemp, save_adjust; 199 long mtemp, save_adjust, rem;
200 s64 freq_adj, temp64; 200 s64 freq_adj, temp64;
201 int result; 201 int result;
202 202
@@ -277,14 +277,14 @@ int do_adjtimex(struct timex *txc)
277 time_adjust = txc->offset; 277 time_adjust = txc->offset;
278 } 278 }
279 else if (time_status & STA_PLL) { 279 else if (time_status & STA_PLL) {
280 ltemp = txc->offset * NSEC_PER_USEC; 280 time_offset = txc->offset * NSEC_PER_USEC;
281 281
282 /* 282 /*
283 * Scale the phase adjustment and 283 * Scale the phase adjustment and
284 * clamp to the operating range. 284 * clamp to the operating range.
285 */ 285 */
286 time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC); 286 time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC);
287 time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC); 287 time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC);
288 288
289 /* 289 /*
290 * Select whether the frequency is to be controlled 290 * Select whether the frequency is to be controlled
@@ -297,11 +297,11 @@ int do_adjtimex(struct timex *txc)
297 mtemp = xtime.tv_sec - time_reftime; 297 mtemp = xtime.tv_sec - time_reftime;
298 time_reftime = xtime.tv_sec; 298 time_reftime = xtime.tv_sec;
299 299
300 freq_adj = (s64)time_offset * mtemp; 300 freq_adj = time_offset * mtemp;
301 freq_adj = shift_right(freq_adj, time_constant * 2 + 301 freq_adj = shift_right(freq_adj, time_constant * 2 +
302 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); 302 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
303 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { 303 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
304 temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL); 304 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
305 if (time_offset < 0) { 305 if (time_offset < 0) {
306 temp64 = -temp64; 306 temp64 = -temp64;
307 do_div(temp64, mtemp); 307 do_div(temp64, mtemp);
@@ -314,8 +314,10 @@ int do_adjtimex(struct timex *txc)
314 freq_adj += time_freq; 314 freq_adj += time_freq;
315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); 315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); 316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
317 time_offset = (time_offset / NTP_INTERVAL_FREQ) 317 time_offset = div_long_long_rem_signed(time_offset,
318 << SHIFT_UPDATE; 318 NTP_INTERVAL_FREQ,
319 &rem);
320 time_offset <<= SHIFT_UPDATE;
319 } /* STA_PLL */ 321 } /* STA_PLL */
320 } /* txc->modes & ADJ_OFFSET */ 322 } /* txc->modes & ADJ_OFFSET */
321 if (txc->modes & ADJ_TICK) 323 if (txc->modes & ADJ_TICK)
@@ -328,12 +330,12 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
328 result = TIME_ERROR; 330 result = TIME_ERROR;
329 331
330 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 332 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
331 txc->offset = save_adjust; 333 txc->offset = save_adjust;
332 else 334 else
333 txc->offset = shift_right(time_offset, SHIFT_UPDATE) 335 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) *
334 * NTP_INTERVAL_FREQ / 1000; 336 NTP_INTERVAL_FREQ / 1000;
335 txc->freq = (time_freq / NSEC_PER_USEC) 337 txc->freq = (time_freq / NSEC_PER_USEC) <<
336 << (SHIFT_USEC - SHIFT_NSEC); 338 (SHIFT_USEC - SHIFT_NSEC);
337 txc->maxerror = time_maxerror; 339 txc->maxerror = time_maxerror;
338 txc->esterror = time_esterror; 340 txc->esterror = time_esterror;
339 txc->status = time_status; 341 txc->status = time_status;
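
The reason time_offset had to grow to s64 is visible in the arithmetic: the phase clamp admits up to MAXPHASE = 500000 us, i.e. 5e8 ns, which still fits a 32-bit long, but the later << SHIFT_UPDATE does not. A standalone sketch of the bound; the SHIFT_UPDATE value here is assumed for illustration (any shift of three or more already overflows 32 bits):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC   1000LL
#define MAXPHASE        500000LL        /* us, classic NTP phase limit */
#define SHIFT_UPDATE    12              /* assumed value */

int main(void)
{
        int64_t max_offset_ns = MAXPHASE * NSEC_PER_USEC;  /* 5e8: fits */
        int64_t shifted = max_offset_ns << SHIFT_UPDATE;   /* ~2e12 */

        printf("max offset  : %lld ns\n", (long long)max_offset_ns);
        printf("after << %d : %lld (INT32_MAX = %d)\n",
               SHIFT_UPDATE, (long long)shifted, INT32_MAX);
        return 0;
}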
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5567745470..eadfce2fff 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -307,12 +307,19 @@ int tick_resume_broadcast(void)
307 spin_lock_irqsave(&tick_broadcast_lock, flags); 307 spin_lock_irqsave(&tick_broadcast_lock, flags);
308 308
309 bc = tick_broadcast_device.evtdev; 309 bc = tick_broadcast_device.evtdev;
310 if (bc) {
311 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC &&
312 !cpus_empty(tick_broadcast_mask))
313 tick_broadcast_start_periodic(bc);
314 310
315 broadcast = cpu_isset(smp_processor_id(), tick_broadcast_mask); 311 if (bc) {
312 switch (tick_broadcast_device.mode) {
313 case TICKDEV_MODE_PERIODIC:
 314			if (!cpus_empty(tick_broadcast_mask))
315 tick_broadcast_start_periodic(bc);
316 broadcast = cpu_isset(smp_processor_id(),
317 tick_broadcast_mask);
318 break;
319 case TICKDEV_MODE_ONESHOT:
320 broadcast = tick_resume_broadcast_oneshot(bc);
321 break;
322 }
316 } 323 }
317 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 324 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
318 325
@@ -347,6 +354,16 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
347 } 354 }
348} 355}
349 356
357int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
358{
359 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
360
 361	if (!cpus_empty(tick_broadcast_oneshot_mask))
362 tick_broadcast_set_event(ktime_get(), 1);
363
364 return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask);
365}
366
350/* 367/*
351 * Reprogram the broadcast device: 368 * Reprogram the broadcast device:
352 * 369 *
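
The return value of tick_resume_broadcast_oneshot() tells the resume path whether this cpu's ticks come from the broadcast device, in which case the per-cpu tick device is left alone. A sketch of that decision with the cpumask reduced to a plain bitmask; the names and the printf stand-in for the reprogramming call are illustrative:

#include <stdio.h>

/* stand-in for tick_broadcast_set_event(ktime_get(), 1) */
static void reprogram_broadcast(void)
{
        printf("reprogram broadcast device to fire now\n");
}

static int resume_broadcast_oneshot(unsigned long oneshot_mask, int cpu)
{
        /* device back in oneshot mode; re-arm it if anyone depends on it */
        if (oneshot_mask)
                reprogram_broadcast();

        /* nonzero: this cpu is served by the broadcast device */
        return !!(oneshot_mask & (1UL << cpu));
}

int main(void)
{
        printf("cpu1: %d\n", resume_broadcast_oneshot(0x2UL, 1));   /* 1 */
        printf("cpu0: %d\n", resume_broadcast_oneshot(0x2UL, 0));   /* 0 */
        return 0;
}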
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 43ba1bdec1..bfda3f7f07 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -298,18 +298,17 @@ static void tick_shutdown(unsigned int *cpup)
298 spin_unlock_irqrestore(&tick_device_lock, flags); 298 spin_unlock_irqrestore(&tick_device_lock, flags);
299} 299}
300 300
301static void tick_suspend_periodic(void) 301static void tick_suspend(void)
302{ 302{
303 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 303 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
304 unsigned long flags; 304 unsigned long flags;
305 305
306 spin_lock_irqsave(&tick_device_lock, flags); 306 spin_lock_irqsave(&tick_device_lock, flags);
307 if (td->mode == TICKDEV_MODE_PERIODIC) 307 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
308 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
309 spin_unlock_irqrestore(&tick_device_lock, flags); 308 spin_unlock_irqrestore(&tick_device_lock, flags);
310} 309}
311 310
312static void tick_resume_periodic(void) 311static void tick_resume(void)
313{ 312{
314 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 313 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
315 unsigned long flags; 314 unsigned long flags;
@@ -317,6 +316,8 @@ static void tick_resume_periodic(void)
317 spin_lock_irqsave(&tick_device_lock, flags); 316 spin_lock_irqsave(&tick_device_lock, flags);
318 if (td->mode == TICKDEV_MODE_PERIODIC) 317 if (td->mode == TICKDEV_MODE_PERIODIC)
319 tick_setup_periodic(td->evtdev, 0); 318 tick_setup_periodic(td->evtdev, 0);
319 else
320 tick_resume_oneshot();
320 spin_unlock_irqrestore(&tick_device_lock, flags); 321 spin_unlock_irqrestore(&tick_device_lock, flags);
321} 322}
322 323
@@ -348,13 +349,13 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
348 break; 349 break;
349 350
350 case CLOCK_EVT_NOTIFY_SUSPEND: 351 case CLOCK_EVT_NOTIFY_SUSPEND:
351 tick_suspend_periodic(); 352 tick_suspend();
352 tick_suspend_broadcast(); 353 tick_suspend_broadcast();
353 break; 354 break;
354 355
355 case CLOCK_EVT_NOTIFY_RESUME: 356 case CLOCK_EVT_NOTIFY_RESUME:
356 if (!tick_resume_broadcast()) 357 if (!tick_resume_broadcast())
357 tick_resume_periodic(); 358 tick_resume();
358 break; 359 break;
359 360
360 default: 361 default:
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 75890efd24..c9d203bde5 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -19,12 +19,13 @@ extern void tick_setup_oneshot(struct clock_event_device *newdev,
19extern int tick_program_event(ktime_t expires, int force); 19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void); 20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22 22extern void tick_resume_oneshot(void);
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); 24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason); 25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void); 26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
28# else /* BROADCAST */ 29# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 30static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{ 31{
@@ -43,6 +44,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
43{ 44{
44 BUG(); 45 BUG();
45} 46}
47static inline void tick_resume_oneshot(void)
48{
49 BUG();
50}
46static inline int tick_program_event(ktime_t expires, int force) 51static inline int tick_program_event(ktime_t expires, int force)
47{ 52{
48 return 0; 53 return 0;
@@ -54,6 +59,10 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
54} 59}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 60static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 61static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
62static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
63{
64 return 0;
65}
57#endif /* !TICK_ONESHOT */ 66#endif /* !TICK_ONESHOT */
58 67
59/* 68/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8b7ff863..f6997ab0c3 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -41,6 +41,18 @@ int tick_program_event(ktime_t expires, int force)
41} 41}
42 42
43/** 43/**
 44 * tick_resume_oneshot - resume oneshot mode
45 */
46void tick_resume_oneshot(void)
47{
48 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
49 struct clock_event_device *dev = td->evtdev;
50
51 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
52 tick_program_event(ktime_get(), 1);
53}
54
55/**
44 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) 56 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
45 */ 57 */
46void tick_setup_oneshot(struct clock_event_device *newdev, 58void tick_setup_oneshot(struct clock_event_device *newdev,
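
Taken together with the tick-common.c and tick-broadcast.c hunks above, the resume pieces form a single path down to the per-cpu device; a sketch of the call graph as reconstructed from the hunks in this series:

timekeeping_resume()
    clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL)
        tick_resume_broadcast()         /* re-arms the broadcast device,
                                           periodic or oneshot */
        tick_resume()                   /* only if broadcast does not
                                           cover this cpu */
            tick_setup_periodic()       /* TICKDEV_MODE_PERIODIC */
            tick_resume_oneshot()       /* otherwise: back to ONESHOT,
                                           program an immediate event */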
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f82c635c3d..59df5e8555 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -194,9 +194,9 @@ print_tickdevice(struct seq_file *m, struct tick_device *td)
194 return; 194 return;
195 } 195 }
196 SEQ_printf(m, "%s\n", dev->name); 196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); 197 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); 198 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult); 199 SEQ_printf(m, " mult: %lu\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift); 200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode); 201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n", 202 SEQ_printf(m, " next_event: %Ld nsecs\n",
diff --git a/kernel/timer.c b/kernel/timer.c
index 797cccb864..b22bd39740 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -505,6 +505,8 @@ out:
505 return ret; 505 return ret;
506} 506}
507 507
508EXPORT_SYMBOL(try_to_del_timer_sync);
509
508/** 510/**
509 * del_timer_sync - deactivate a timer and wait for the handler to finish. 511 * del_timer_sync - deactivate a timer and wait for the handler to finish.
510 * @timer: the timer to be deactivated 512 * @timer: the timer to be deactivated
@@ -695,15 +697,28 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
695{ 697{
696 ktime_t hr_delta = hrtimer_get_next_event(); 698 ktime_t hr_delta = hrtimer_get_next_event();
697 struct timespec tsdelta; 699 struct timespec tsdelta;
700 unsigned long delta;
698 701
699 if (hr_delta.tv64 == KTIME_MAX) 702 if (hr_delta.tv64 == KTIME_MAX)
700 return expires; 703 return expires;
701 704
702 if (hr_delta.tv64 <= TICK_NSEC) 705 /*
703 return now; 706 * Expired timer available, let it expire in the next tick
707 */
708 if (hr_delta.tv64 <= 0)
709 return now + 1;
704 710
705 tsdelta = ktime_to_timespec(hr_delta); 711 tsdelta = ktime_to_timespec(hr_delta);
706 now += timespec_to_jiffies(&tsdelta); 712 delta = timespec_to_jiffies(&tsdelta);
713 /*
 714	 * Take rounding errors into account and make sure that it
 715	 * expires in the next tick. Otherwise we go into an endless
 716	 * ping-pong due to tick_nohz_stop_sched_tick() retriggering
 717	 * the timer softirq.
718 */
719 if (delta < 1)
720 delta = 1;
721 now += delta;
707 if (time_before(now, expires)) 722 if (time_before(now, expires))
708 return now; 723 return now;
709 return expires; 724 return expires;
@@ -1003,7 +1018,7 @@ static int timekeeping_resume(struct sys_device *dev)
1003 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 1018 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
1004 1019
1005 /* Resume hrtimers */ 1020 /* Resume hrtimers */
1006 clock_was_set(); 1021 hres_timers_resume();
1007 1022
1008 return 0; 1023 return 0;
1009} 1024}
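
The clamping added to cmp_next_hrtimer_event() is worth seeing in isolation: an hrtimer that has already expired, or whose delta rounds down to zero jiffies, must still be reported one tick in the future, otherwise tick_nohz_stop_sched_tick() keeps retriggering the timer softirq. A standalone sketch; the jiffy length assumes HZ=250, and the time_before() wraparound handling is omitted:

#include <stdio.h>

#define NSEC_PER_JIFFY  4000000LL       /* assumes HZ=250 */

static unsigned long next_timer_event(unsigned long now,
                                      unsigned long expires,
                                      long long hr_delta_ns)
{
        unsigned long delta;

        /* expired hrtimer pending: let it fire on the next tick */
        if (hr_delta_ns <= 0)
                return now + 1;

        delta = (unsigned long)(hr_delta_ns / NSEC_PER_JIFFY);
        /* conversion may round down to 0: force at least one tick */
        if (delta < 1)
                delta = 1;
        now += delta;
        return now < expires ? now : expires;
}

int main(void)
{
        printf("%lu\n", next_timer_event(1000, 1050, -5));         /* 1001 */
        printf("%lu\n", next_timer_event(1000, 1050, 1000));       /* 1001 */
        printf("%lu\n", next_timer_event(1000, 1050, 80000000LL)); /* 1020 */
        printf("%lu\n", next_timer_event(1000, 1010, 80000000LL)); /* 1010 */
        return 0;
}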