Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz           |  20
-rw-r--r--  kernel/acct.c               |   3
-rw-r--r--  kernel/audit.c              |   1
-rw-r--r--  kernel/auditfilter.c        |   3
-rw-r--r--  kernel/configs.c            |   2
-rw-r--r--  kernel/cpu.c                |   6
-rw-r--r--  kernel/cpuset.c             |  22
-rw-r--r--  kernel/delayacct.c          |   4
-rw-r--r--  kernel/dma.c                |   2
-rw-r--r--  kernel/exit.c               |   8
-rw-r--r--  kernel/fork.c               |  42
-rw-r--r--  kernel/futex.c              |  45
-rw-r--r--  kernel/irq/handle.c         |   2
-rw-r--r--  kernel/kallsyms.c           |  17
-rw-r--r--  kernel/kexec.c              |  59
-rw-r--r--  kernel/kmod.c               |  16
-rw-r--r--  kernel/kprobes.c            | 117
-rw-r--r--  kernel/kthread.c            |  13
-rw-r--r--  kernel/lockdep.c            |  48
-rw-r--r--  kernel/lockdep_internals.h  |   2
-rw-r--r--  kernel/lockdep_proc.c       |   6
-rw-r--r--  kernel/module.c             |   2
-rw-r--r--  kernel/mutex-debug.c        |   3
-rw-r--r--  kernel/pid.c                |   2
-rw-r--r--  kernel/posix-timers.c       |   2
-rw-r--r--  kernel/power/Kconfig        |   2
-rw-r--r--  kernel/power/disk.c         |  66
-rw-r--r--  kernel/power/main.c         |  14
-rw-r--r--  kernel/power/power.h        |  32
-rw-r--r--  kernel/power/poweroff.c     |   4
-rw-r--r--  kernel/power/process.c      | 130
-rw-r--r--  kernel/power/snapshot.c     | 860
-rw-r--r--  kernel/power/swap.c         | 347
-rw-r--r--  kernel/power/swsusp.c       |  98
-rw-r--r--  kernel/power/user.c         | 102
-rw-r--r--  kernel/printk.c             |  24
-rw-r--r--  kernel/profile.c            |  47
-rw-r--r--  kernel/rcupdate.c           |   4
-rw-r--r--  kernel/rcutorture.c         |   4
-rw-r--r--  kernel/relay.c              |  12
-rw-r--r--  kernel/resource.c           |   6
-rw-r--r--  kernel/rtmutex-tester.c     |   1
-rw-r--r--  kernel/sched.c              |  39
-rw-r--r--  kernel/signal.c             |   6
-rw-r--r--  kernel/softirq.c            |   2
-rw-r--r--  kernel/sys.c                |   8
-rw-r--r--  kernel/sysctl.c             |  17
-rw-r--r--  kernel/taskstats.c          | 169
-rw-r--r--  kernel/user.c               |   4
-rw-r--r--  kernel/workqueue.c          | 144
50 files changed, 1729 insertions, 860 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
 	default HZ_250
 	help
 	 Allows the configuration of the timer frequency. It is customary
-	 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+	 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
 	 beneficial for servers and NUMA systems that do not need to have
 	 a fast response for user interaction and that may experience bus
 	 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
 	config HZ_100
 		bool "100 HZ"
 	help
-	  100 HZ is a typical choice for servers, SMP and NUMA systems
+	  100 Hz is a typical choice for servers, SMP and NUMA systems
 	  with lots of processors that may show reduced performance if
 	  too many timer interrupts are occurring.
 
 	config HZ_250
 		bool "250 HZ"
 	help
-	  250 HZ is a good compromise choice allowing server performance
+	  250 Hz is a good compromise choice allowing server performance
 	  while also showing good interactive responsiveness even
-	  on SMP and NUMA systems.
+	  on SMP and NUMA systems. If you are going to be using NTSC video
+	  or multimedia, select 300Hz instead.
+
+	config HZ_300
+		bool "300 HZ"
+	help
+	  300 Hz is a good compromise choice allowing server performance
+	  while also showing good interactive responsiveness even
+	  on SMP and NUMA systems and exactly dividing by both PAL and
+	  NTSC frame rates for video and multimedia work.
 
 	config HZ_1000
 		bool "1000 HZ"
 	help
-	  1000 HZ is the preferred choice for desktop systems and other
+	  1000 Hz is the preferred choice for desktop systems and other
 	  systems requiring fast interactive responses to events.
 
 endchoice
@@ -42,5 +51,6 @@ config HZ
 	int
 	default 100 if HZ_100
 	default 250 if HZ_250
+	default 300 if HZ_300
 	default 1000 if HZ_1000
 
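The new HZ_300 option is aimed at video work: 300 divides evenly by both the PAL frame rate (25 fps) and the nominal NTSC rate (30 fps), so a frame period lands on a whole number of timer ticks. A standalone userspace C sketch of that arithmetic, using the rates named in the help text above:

#include <stdio.h>

int main(void)
{
	const int hz[]  = { 100, 250, 300, 1000 };	/* the selectable HZ values */
	const int fps[] = { 25, 30 };			/* PAL, nominal NTSC */

	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 2; j++)
			printf("HZ=%-4d fps=%d -> %2d ticks/frame, remainder %d\n",
			       hz[i], fps[j], hz[i] / fps[j], hz[i] % fps[j]);
	return 0;
}

Only HZ=300 gives a zero remainder for both rates; every other option drifts against the 30 fps stream.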
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..dc12db8600e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
 	struct timer_list timer;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+static struct acct_glbs acct_globals __cacheline_aligned =
+	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
 
 /*
  * Called whenever the timer says to check the free space.
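__SPIN_LOCK_UNLOCKED(name) replaces the old SPIN_LOCK_UNLOCKED initializer so that every statically initialized lock gets its own lockdep key. A minimal sketch of the pattern for a lock embedded in a struct (the struct and field names here are illustrative, not from acct.c):

#include <linux/spinlock.h>

struct my_globals {
	spinlock_t	lock;
	unsigned long	active;
};

/* Naming the lock (my_globals.lock) gives lockdep a distinct key for
 * this instance instead of one key shared by every static spinlock. */
static struct my_globals my_globals = {
	.lock = __SPIN_LOCK_UNLOCKED(my_globals.lock),
};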
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
 #include <linux/netlink.h>
 #include <linux/selinux.h>
 #include <linux/inotify.h>
+#include <linux/freezer.h>
 
 #include "audit.h"
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 	struct audit_rule *rule;
 	int i;
 
-	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
 	if (unlikely(!rule))
 		return NULL;
-	memset(rule, 0, sizeof(*rule));
 
 	rule->flags = krule->flags | krule->listnr;
 	rule->action = krule->action;
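kzalloc() is kmalloc() plus a memset() to zero, folded into one allocator call; the same substitution recurs in futex.c and kexec.c below. A minimal before/after sketch of the idiom:

#include <linux/slab.h>
#include <linux/string.h>

static void *alloc_zeroed_old(size_t size)
{
	void *p = kmalloc(size, GFP_KERNEL);	/* may return NULL */

	if (p)
		memset(p, 0, size);		/* second pass over the object */
	return p;
}

static void *alloc_zeroed_new(size_t size)
{
	return kzalloc(size, GFP_KERNEL);	/* zeroed (or NULL) in one call */
}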
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
 	return count;
 }
 
-static struct file_operations ikconfig_file_ops = {
+static const struct file_operations ikconfig_file_ops = {
 	.owner = THIS_MODULE,
 	.read = ikconfig_read_current,
 };
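Constifying the operations tables (here and again in cpuset.c, dma.c, futex.c, kallsyms.c, lockdep_proc.c and module.c below) lets the compiler place them in read-only data. A minimal sketch of the pattern, with an illustrative stub handler:

#include <linux/fs.h>
#include <linux/module.h>

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	return 0;	/* stub: report EOF */
}

/* const: the method table lands in .rodata and cannot be
 * overwritten at runtime, accidentally or otherwise. */
static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.read	= example_read,
};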
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f20d97..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
 			goto out;
 		}
 	}
-	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
-	if (error) {
-		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
-		goto out;
-	}
+
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..0a6b4d89f9a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	}
 
 	/* Remaining checks don't apply to root cpuset */
-	if ((par = cur->parent) == NULL)
+	if (cur == &top_cpuset)
 		return 0;
 
+	par = cur->parent;
+
 	/* We must be a subset of our parent cpuset */
 	if (!is_cpuset_subset(trial, par))
 		return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
 	mutex_lock(&callback_mutex);
-	if (turning_on)
-		set_bit(bit, &cs->flags);
-	else
-		clear_bit(bit, &cs->flags);
+	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
@@ -1281,7 +1280,8 @@ typedef enum {
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct file *file,
+					const char __user *userbuf,
 					size_t nbytes, loff_t *unused_ppos)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1292,7 +1292,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100 + 6 * NR_CPUS)
+	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
-static struct file_operations cpuset_file_operations = {
+static const struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
 	.llseek = generic_file_llseek,
@@ -2045,7 +2045,6 @@ out:
 	return err;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
 }
-#endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
@@ -2610,7 +2606,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cpuset_show, pid);
 }
 
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
 	.open = cpuset_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea48751d..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
 #include <linux/delayacct.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
-kmem_cache_t *delayacct_cache;
+struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
 {
@@ -41,7 +41,7 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
-	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
 }
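Two retirements meet in this hunk: the kmem_cache_t typedef gives way to the plain struct kmem_cache, and the SLAB_* allocation flags give way to GFP_* (SLAB_KERNEL was an alias for GFP_KERNEL). A hedged sketch of the slab-cache lifecycle in the newer spelling; the cache and struct names are illustrative:

#include <linux/slab.h>

struct delay_rec {
	spinlock_t	lock;
	u64		blkio_delay;
};

static struct kmem_cache *delay_cachep;		/* was: kmem_cache_t * */

static void delay_cache_init(void)
{
	delay_cachep = kmem_cache_create("delay_rec",
					 sizeof(struct delay_rec),
					 0, SLAB_PANIC, NULL, NULL);
}

static struct delay_rec *delay_alloc(void)
{
	/* GFP_KERNEL names the allocation context, not the cache type. */
	return kmem_cache_zalloc(delay_cachep, GFP_KERNEL);
}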
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_dma_show, NULL);
 }
 
-static struct file_operations proc_dma_operations = {
+static const struct file_operations proc_dma_operations = {
 	.open = proc_dma_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..4e3f919edc48 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats;
 	int group_dead;
-	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &mycpu);
-
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
-	taskstats_exit_free(tidstats);
+
+	taskstats_exit(tsk, group_dead);
 
 	exit_mm(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index fd22245e3881..7f2e31ba33af 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -82,26 +82,26 @@ int nr_processes(void)
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 # define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
 # define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
-static kmem_cache_t *task_struct_cachep;
+static struct kmem_cache *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-static kmem_cache_t *signal_cachep;
+static struct kmem_cache *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
-kmem_cache_t *sighand_cachep;
+struct kmem_cache *sighand_cachep;
 
 /* SLAB cache for files_struct structures (tsk->files) */
-kmem_cache_t *files_cachep;
+struct kmem_cache *files_cachep;
 
 /* SLAB cache for fs_struct structures (tsk->fs) */
-kmem_cache_t *fs_cachep;
+struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-kmem_cache_t *vm_area_cachep;
+struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
-static kmem_cache_t *mm_cachep;
+static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
@@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 #include <linux/init_task.h>
@@ -448,7 +448,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->vfork_done = NULL;
 		complete(vfork_done);
 	}
-	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+
+	/*
+	 * If we're exiting normally, clear a user-space tid field if
+	 * requested.  We leave this alone when dying by signal, to leave
+	 * the value intact in a core dump, and to save the unnecessary
+	 * trouble otherwise.  Userland only wants this done for a sys_exit.
+	 */
+	if (tsk->clear_child_tid
+	    && !(tsk->flags & PF_SIGNALED)
+	    && atomic_read(&mm->mm_users) > 1) {
 		u32 __user * tidptr = tsk->clear_child_tid;
 		tsk->clear_child_tid = NULL;
 
@@ -479,6 +488,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
 
 	memcpy(mm, oldmm, sizeof(*mm));
 
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	if (!mm_init(mm))
 		goto fail_nomem;
 
@@ -542,6 +555,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 		goto fail_nomem;
 
 good_mm:
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
@@ -613,7 +630,7 @@ static struct files_struct *alloc_files(void)
 	struct files_struct *newf;
 	struct fdtable *fdt;
 
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 	if (!newf)
 		goto out;
 
@@ -830,7 +847,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
-		taskstats_tgid_alloc(current);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1413,7 +1429,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
-static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sighand_struct *sighand = data;
 
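The extra PF_SIGNALED test means the child-tid word is cleared, and its futex woken, only on a normal exit, so a core dump keeps the value intact. A condensed sketch of the path the new condition guards, paraphrased from mm_release(); the FUTEX_WAKE is what lets a pthread_join() caller return:

#include <linux/futex.h>
#include <linux/sched.h>
#include <asm/uaccess.h>

static void clear_child_tid_sketch(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	if (tsk->clear_child_tid
	    && !(tsk->flags & PF_SIGNALED)	  /* normal exit only */
	    && atomic_read(&mm->mm_users) > 1) {  /* word is still mapped */
		u32 __user *tidptr = tsk->clear_child_tid;

		tsk->clear_child_tid = NULL;
		put_user(0, tidptr);		  /* zero the TID word... */
		sys_futex(tidptr, FUTEX_WAKE, 1,  /* ...and wake one waiter */
			  NULL, NULL, 0);
	}
}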
diff --git a/kernel/futex.c b/kernel/futex.c
index 93ef30ba209f..95989a3b4168 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
-	inc_preempt_count();
+	pagefault_disable();
 	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
-	dec_preempt_count();
+	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
 }
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
 	if (likely(current->pi_state_cache))
 		return 0;
 
-	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 
 	if (!pi_state)
 		return -ENOMEM;
 
-	memset(pi_state, 0, sizeof(*pi_state));
 	INIT_LIST_HEAD(&pi_state->list);
 	/* pi_mutex gets initialized later */
 	pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
 	 * at the end of wake_up_all() does not prevent this store from
 	 * moving.
 	 */
-	wmb();
+	smp_wmb();
 	q->lock_ptr = NULL;
 }
 
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
 
-		inc_preempt_count();
+		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-		dec_preempt_count();
+		pagefault_enable();
 		if (curval == -EFAULT)
 			return -EFAULT;
 		if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 	 * There is no waiter, so we unlock the futex. The owner died
 	 * bit has not to be preserved here. We are the owner:
 	 */
-	inc_preempt_count();
+	pagefault_disable();
 	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (oldval == -EFAULT)
 		return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 */
 	newval = current->pid;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	uval = curval;
 	newval = uval | FUTEX_WAITERS;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 			newval = current->pid |
 				FUTEX_OWNER_DIED | FUTEX_WAITERS;
 
-			inc_preempt_count();
+			pagefault_disable();
 			curval = futex_atomic_cmpxchg_inatomic(uaddr,
 							       uval, newval);
-			dec_preempt_count();
+			pagefault_enable();
 
 			if (unlikely(curval == -EFAULT))
 				goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
 	 * anyone else up:
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
-		inc_preempt_count();
+		pagefault_disable();
 		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-		dec_preempt_count();
+		pagefault_enable();
 	}
 
 	if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
 	return ret;
 }
 
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
 	.release	= futex_close,
 	.poll		= futex_poll,
 };
@@ -1858,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
 
 static int __init init(void)
 {
-	unsigned int i;
+	int i = register_filesystem(&futex_fs_type);
+
+	if (i)
+		return i;
 
-	register_filesystem(&futex_fs_type);
 	futex_mnt = kern_mount(&futex_fs_type);
+	if (IS_ERR(futex_mnt)) {
+		unregister_filesystem(&futex_fs_type);
+		return PTR_ERR(futex_mnt);
+	}
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
 		INIT_LIST_HEAD(&futex_queues[i].chain);
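pagefault_disable()/pagefault_enable() are the 2.6.20-era replacements for bumping the preempt count by hand: they state the intent (force the fault handler down its atomic, non-sleeping path) rather than the mechanism. A minimal sketch of the idiom, mirroring get_futex_value_locked() above:

#include <linux/uaccess.h>

/* Attempt an atomic read of user memory while a spinlock is held;
 * -EFAULT tells the caller to drop locks and retry the slow way. */
static int get_user_u32_atomic(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();	/* faults now fail fast instead of sleeping */
	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}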
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a681912bc89a..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
 		.chip = &no_irq_chip,
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
-		.lock = SPIN_LOCK_UNLOCKED,
+		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
 #endif
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..ab63cfc42992 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sched.h>	/* for cond_resched */
 #include <linux/mm.h>
+#include <linux/ctype.h>
 
 #include <asm/sections.h>
 
@@ -301,13 +302,6 @@ struct kallsym_iter
 	char name[KSYM_NAME_LEN+1];
 };
 
-/* Only label it "global" if it is exported. */
-static void upcase_if_global(struct kallsym_iter *iter)
-{
-	if (is_exported(iter->name, iter->owner))
-		iter->type += 'A' - 'a';
-}
-
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
 	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	if (iter->owner == NULL)
 		return 0;
 
-	upcase_if_global(iter);
+	/* Label it "global" if it is exported, "local" if not exported. */
+	iter->type = is_exported(iter->name, iter->owner)
+		     ? toupper(iter->type) : tolower(iter->type);
+
 	return 1;
 }
 
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static struct seq_operations kallsyms_op = {
+static const struct seq_operations kallsyms_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
 	return seq_release(inode, file);
 }
 
-static struct file_operations kallsyms_operations = {
+static const struct file_operations kallsyms_operations = {
 	.open = kallsyms_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..afbbbe981be2 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
 #include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 
 	/* Allocate a controlling structure */
 	result = -ENOMEM;
-	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	image = kzalloc(sizeof(*image), GFP_KERNEL);
 	if (!image)
 		goto out;
 
-	memset(image, 0, sizeof(*image));
 	image->head = 0;
 	image->entry = &image->head;
 	image->last_entry = &image->head;
@@ -1067,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs)
 	}
 }
 
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+			      sizeof(prstatus));
+	final_note(buf);
+}
+
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
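Each ELF note is a fixed header followed by a name and a descriptor, each padded out to a 4-byte boundary; that is why append_elf_note() advances its u32 cursor by (len + 3)/4 words after every field. A standalone userspace sketch of the same size arithmetic (the "CORE" name and the 148-byte descriptor are just example inputs):

#include <elf.h>
#include <stdio.h>
#include <string.h>

static size_t note_words(const char *name, size_t descsz)
{
	size_t namesz = strlen(name) + 1;	/* NUL included */

	return sizeof(Elf32_Nhdr) / 4		/* 3 header words */
	       + (namesz + 3) / 4		/* name, padded to 4 bytes */
	       + (descsz + 3) / 4;		/* descriptor, padded */
}

int main(void)
{
	printf("note occupies %zu 32-bit words\n", note_words("CORE", 148));
	return 0;
}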
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2b76dee28496..8d2bea09a4ec 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
 #endif /* CONFIG_KMOD */
 
 struct subprocess_info {
+	struct work_struct work;
 	struct completion *complete;
 	char *path;
 	char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
 }
 
 /* This is run by khelper thread  */
-static void __call_usermodehelper(void *data)
+static void __call_usermodehelper(struct work_struct *work)
 {
-	struct subprocess_info *sub_info = data;
+	struct subprocess_info *sub_info =
+		container_of(work, struct subprocess_info, work);
 	pid_t pid;
 	int wait = sub_info->wait;
 
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 		.wait		= wait,
 		.retval		= 0,
 	};
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 	if (path[0] == '\0')
 		return 0;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 		.retval		= 0,
 	};
 	struct file *f;
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	}
 	sub_info.stdin = f;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
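This file (and kthread.c below) tracks the workqueue API change in which handlers receive the struct work_struct pointer itself instead of a void * cookie; per-request data travels by embedding the work item and recovering the container with container_of(). A minimal sketch with illustrative names:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct helper_req {
	struct work_struct work;	/* embedded, replaces DECLARE_WORK(..., data) */
	int arg;
};

static void helper_fn(struct work_struct *work)
{
	/* Recover the enclosing request from the embedded member. */
	struct helper_req *req = container_of(work, struct helper_req, work);

	printk(KERN_INFO "helper arg=%d\n", req->arg);
	kfree(req);
}

static int submit_helper(int arg)
{
	struct helper_req *req = kmalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return -ENOMEM;
	req->arg = arg;
	INIT_WORK(&req->work, helper_fn);	/* no data pointer any more */
	schedule_work(&req->work);
	return 0;
}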
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
+#include <linux/freezer.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
 	kprobe_opcode_t *insns;		/* Page of instruction slots */
 	char slot_used[INSNS_PER_PAGE];
 	int nused;
+	int ngarbage;
 };
 
 static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+	int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+	ret = freeze_processes();
+	if (ret == 0) {
+		struct task_struct *p, *q;
+		do_each_thread(p, q) {
+			if (p != current && p->state == TASK_RUNNING &&
+			    p->pid != 0) {
+				printk("Check failed: %s is running\n", p->comm);
+				ret = -1;
+				goto loop_end;
+			}
+		} while_each_thread(p, q);
+	}
+loop_end:
+	thaw_processes();
+#else
+	synchronize_sched();
+#endif
+	return ret;
+}
 
 /**
  * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
 
+ retry:
 	hlist_for_each(pos, &kprobe_insn_pages) {
 		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
 		if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 		}
 	}
 
-	/* All out of space.  Need to allocate a new page. Use slot 0. */
+	/* If there are any garbage slots, collect them and try again. */
+	if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+		goto retry;
+	}
+	/* All out of space.  Need to allocate a new page. Use slot 0. */
 	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
 	if (!kip) {
 		return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	memset(kip->slot_used, 0, INSNS_PER_PAGE);
 	kip->slot_used[0] = 1;
 	kip->nused = 1;
+	kip->ngarbage = 0;
 	return kip->insns;
 }
 
-void __kprobes free_insn_slot(kprobe_opcode_t *slot)
+/* Return 1 if all garbage slots are collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+	kip->slot_used[idx] = 0;
+	kip->nused--;
+	if (kip->nused == 0) {
+		/*
+		 * Page is no longer in use.  Free it unless
+		 * it's the last one.  We keep the last one
+		 * so as not to have to set it up again the
+		 * next time somebody inserts a probe.
+		 */
+		hlist_del(&kip->hlist);
+		if (hlist_empty(&kprobe_insn_pages)) {
+			INIT_HLIST_NODE(&kip->hlist);
+			hlist_add_head(&kip->hlist,
+				       &kprobe_insn_pages);
+		} else {
+			module_free(NULL, kip->insns);
+			kfree(kip);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos, *next;
+
+	/* Ensure no-one is preempted on the garbage slots */
+	if (check_safety() != 0)
+		return -EAGAIN;
+
+	hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+		int i;
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->ngarbage == 0)
+			continue;
+		kip->ngarbage = 0;	/* we will collect all garbage slots */
+		for (i = 0; i < INSNS_PER_PAGE; i++) {
+			if (kip->slot_used[i] == -1 &&
+			    collect_one_slot(kip, i))
+				break;
+		}
+	}
+	kprobe_garbage_slots = 0;
+	return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t *slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 		if (kip->insns <= slot &&
 		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
 			int i = (slot - kip->insns) / MAX_INSN_SIZE;
-			kip->slot_used[i] = 0;
-			kip->nused--;
-			if (kip->nused == 0) {
-				/*
-				 * Page is no longer in use.  Free it unless
-				 * it's the last one.  We keep the last one
-				 * so as not to have to set it up again the
-				 * next time somebody inserts a probe.
-				 */
-				hlist_del(&kip->hlist);
-				if (hlist_empty(&kprobe_insn_pages)) {
-					INIT_HLIST_NODE(&kip->hlist);
-					hlist_add_head(&kip->hlist,
-						       &kprobe_insn_pages);
-				} else {
-					module_free(NULL, kip->insns);
-					kfree(kip);
-				}
+			if (dirty) {
+				kip->slot_used[i] = -1;
+				kip->ngarbage++;
+			} else {
+				collect_one_slot(kip, i);
 			}
-			return;
+			break;
 		}
 	}
+	if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+		collect_garbage_slots();
+	}
 }
 #endif
 
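With the collector in place, each slot_used[] entry encodes one of three states; the code above stores them as bare char values, spelled out here only for clarity:

/* Illustrative names only; kprobes.c uses the raw values directly. */
enum slot_state {
	SLOT_CLEAN = 0,		/* free; get_insn_slot() may hand it out */
	SLOT_USED  = 1,		/* holds a live instruction copy */
	SLOT_DIRTY = -1,	/* freed while possibly still executing;
				   reclaimed later by collect_garbage_slots() */
};

A dirty slot is reclaimed only after check_safety() has ruled out a preempted thread still single-stepping inside it.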
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
 	/* Result passed back to kthread_create() from keventd. */
 	struct task_struct *result;
 	struct completion done;
+
+	struct work_struct work;
 };
 
 struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
 }
 
 /* We are keventd: create a thread. */
-static void keventd_create_kthread(void *_create)
+static void keventd_create_kthread(struct work_struct *work)
 {
-	struct kthread_create_info *create = _create;
+	struct kthread_create_info *create =
+		container_of(work, struct kthread_create_info, work);
 	int pid;
 
 	/* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   ...)
{
 	struct kthread_create_info create;
-	DECLARE_WORK(work, keventd_create_kthread, &create);
 
 	create.threadfn = threadfn;
 	create.data = data;
 	init_completion(&create.started);
 	init_completion(&create.done);
+	INIT_WORK(&create.work, keventd_create_kthread);
 
 	/*
 	 * The workqueue needs to start up first:
 	 */
 	if (!helper_wq)
-		work.func(work.data);
+		create.work.func(&create.work);
 	else {
-		queue_work(helper_wq, &work);
+		queue_work(helper_wq, &create.work);
 		wait_for_completion(&create.done);
 	}
 	if (!IS_ERR(create.result)) {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9bb8d784eb02..b02032476dc2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,13 +140,6 @@ void lockdep_on(void)
 
 EXPORT_SYMBOL(lockdep_on);
 
-int lockdep_internal(void)
-{
-	return current->lockdep_recursion != 0;
-}
-
-EXPORT_SYMBOL(lockdep_internal);
-
 /*
  * Debugging switches:
  */
@@ -233,8 +226,10 @@ static int save_trace(struct stack_trace *trace)
 	trace->max_entries = trace->nr_entries;
 
 	nr_stack_trace_entries += trace->nr_entries;
-	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
+	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) {
+		__raw_spin_unlock(&hash_lock);
 		return 0;
+	}
 
 	if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
 		__raw_spin_unlock(&hash_lock);
@@ -353,7 +348,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[128], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
 	const char *name;
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -375,7 +370,7 @@ static void print_lock_name(struct lock_class *class)
 static void print_lockdep_cache(struct lockdep_map *lock)
 {
 	const char *name;
-	char str[128];
+	char str[KSYM_NAME_LEN + 1];
 
 	name = lock->name;
 	if (!name)
@@ -445,7 +440,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 	print_lock_class_header(class, depth);
 
 	list_for_each_entry(entry, &class->locks_after, entry) {
-		DEBUG_LOCKS_WARN_ON(!entry->class);
+		if (DEBUG_LOCKS_WARN_ON(!entry->class))
+			return;
+
 		print_lock_dependencies(entry->class, depth + 1);
 
 		printk("%*s ... acquired at:\n",depth,"");
@@ -470,7 +467,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 		return 0;
 
 	entry->class = this;
-	save_trace(&entry->trace);
+	if (!save_trace(&entry->trace))
+		return 0;
 
 	/*
 	 * Since we never remove from the dependency list, the list can
@@ -558,8 +556,12 @@ static noinline int print_circular_bug_tail(void)
 	if (debug_locks_silent)
 		return 0;
 
+	/* hash_lock unlocked by the header */
+	__raw_spin_lock(&hash_lock);
 	this.class = check_source->class;
-	save_trace(&this.trace);
+	if (!save_trace(&this.trace))
+		return 0;
+	__raw_spin_unlock(&hash_lock);
 	print_circular_bug_entry(&this, 0);
 
 	printk("\nother info that might help us debug this:\n\n");
@@ -962,14 +964,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 			       &prev->class->locks_after, next->acquire_ip);
 	if (!ret)
 		return 0;
-	/*
-	 * Return value of 2 signals 'dependency already added',
-	 * in that case we dont have to add the backlink either.
-	 */
-	if (ret == 2)
-		return 2;
+
 	ret = add_lock_to_list(next->class, prev->class,
 			       &next->class->locks_before, next->acquire_ip);
+	if (!ret)
+		return 0;
 
 	/*
 	 * Debugging printouts:
@@ -1021,7 +1020,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 * added:
 		 */
 		if (hlock->read != 2) {
-			check_prev_add(curr, hlock, next);
+			if (!check_prev_add(curr, hlock, next))
+				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
@@ -1178,6 +1178,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
+	unsigned long flags;
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -1199,6 +1200,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
+	raw_local_irq_save(flags);
 	__raw_spin_lock(&hash_lock);
 	/*
 	 * We have to do the hash-walk again, to avoid races
@@ -1213,6 +1215,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		debug_locks_off();
 		printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
 		printk("turning off the locking correctness validator.\n");
@@ -1235,15 +1238,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
 			printk("#%d", class->name_version);
 		printk("\n");
 		dump_stack();
+		raw_local_irq_save(flags);
 		__raw_spin_lock(&hash_lock);
 	}
 out_unlock_set:
 	__raw_spin_unlock(&hash_lock);
+	raw_local_irq_restore(flags);
 
 	if (!subclass || force)
 		lock->class_cache = class;
@@ -1724,6 +1730,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		debug_atomic_dec(&nr_unused_locks);
 		break;
 	default:
+		__raw_spin_unlock(&hash_lock);
 		debug_locks_off();
 		WARN_ON(1);
 		return 0;
@@ -2641,6 +2648,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 	}
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
 static void print_held_locks_bug(struct task_struct *curr)
 {
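The register_lock_class() hunks all follow one rule: hash_lock is a raw lock taken with IRQs disabled, and the saved flags must be restored on every exit path, including the early-unlock error and verbose paths. A condensed sketch of the pairing; the hash_lock declaration mirrors the one in lockdep.c of this era:

#include <linux/irqflags.h>
#include <linux/spinlock.h>

static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

static void graph_update_sketch(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);	/* no interrupts while the raw lock is held */
	__raw_spin_lock(&hash_lock);

	/* ... mutate the dependency graph ... */

	__raw_spin_unlock(&hash_lock);
	raw_local_irq_restore(flags);	/* mirrored on every return path */
}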
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
 #define MAX_LOCKDEP_KEYS_BITS	11
 #define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
 
-#define MAX_LOCKDEP_CHAINS_BITS	13
+#define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
 /*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations lockdep_ops = {
+static const struct seq_operations lockdep_ops = {
 	.start	= l_start,
 	.next	= l_next,
 	.stop	= l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-static struct file_operations proc_lockdep_operations = {
+static const struct file_operations proc_lockdep_operations = {
 	.open	= lockdep_open,
 	.read	= seq_read,
 	.llseek	= seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
 	return single_open(file, lockdep_stats_show, NULL);
 }
 
-static struct file_operations proc_lockdep_stats_operations = {
+static const struct file_operations proc_lockdep_stats_operations = {
 	.open	= lockdep_stats_open,
 	.read	= seq_read,
 	.llseek	= seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index e2d09d604ca0..d9eae45d0145 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2209,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p)
    Where refcount is a number or -, and deps is a comma-separated list
    of depends or -.
 */
-struct seq_operations modules_op = {
+const struct seq_operations modules_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 void debug_mutex_unlock(struct mutex *lock)
 {
+	if (unlikely(!debug_locks))
+		return;
+
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..a48879b0b921 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -31,7 +31,7 @@
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash;
 static int pidhash_shift;
-static kmem_cache_t *pid_cachep;
+static struct kmem_cache *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
 /*
  * Lets keep our timers in a slab cache :-)
  */
-static kmem_cache_t *posix_timers_cache;
+static struct kmem_cache *posix_timers_cache;
 static struct idr posix_timers_id;
 static DEFINE_SPINLOCK(idr_lock);
 
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..710ed084e7c5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
+	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b1fb7866b0b3..0b00f56c2ad0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,12 +58,10 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 63 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
48 error = pm_ops->enter(PM_SUSPEND_DISK); 64 pm_ops->enter(PM_SUSPEND_DISK);
49 break; 65 break;
50 case PM_DISK_SHUTDOWN: 66 case PM_DISK_SHUTDOWN:
51 kernel_power_off(); 67 kernel_power_off();
@@ -90,12 +106,18 @@ static int prepare_processes(void)
90 goto thaw; 106 goto thaw;
91 } 107 }
92 108
109 error = platform_prepare();
110 if (error)
111 goto thaw;
112
93 /* Free memory before shutting down devices. */ 113 /* Free memory before shutting down devices. */
94 if (!(error = swsusp_shrink_memory())) 114 if (!(error = swsusp_shrink_memory()))
95 return 0; 115 return 0;
96thaw: 116
117 platform_finish();
118 thaw:
97 thaw_processes(); 119 thaw_processes();
98enable_cpus: 120 enable_cpus:
99 enable_nonboot_cpus(); 121 enable_nonboot_cpus();
100 pm_restore_console(); 122 pm_restore_console();
101 return error; 123 return error;
@@ -127,7 +149,7 @@ int pm_suspend_disk(void)
127 return error; 149 return error;
128 150
129 if (pm_disk_mode == PM_DISK_TESTPROC) 151 if (pm_disk_mode == PM_DISK_TESTPROC)
130 goto Thaw; 152 return 0;
131 153
132 suspend_console(); 154 suspend_console();
133 error = device_suspend(PMSG_FREEZE); 155 error = device_suspend(PMSG_FREEZE);
@@ -189,10 +211,10 @@ static int software_resume(void)
189{ 211{
190 int error; 212 int error;
191 213
192 down(&pm_sem); 214 mutex_lock(&pm_mutex);
193 if (!swsusp_resume_device) { 215 if (!swsusp_resume_device) {
194 if (!strlen(resume_file)) { 216 if (!strlen(resume_file)) {
195 up(&pm_sem); 217 mutex_unlock(&pm_mutex);
196 return -ENOENT; 218 return -ENOENT;
197 } 219 }
198 swsusp_resume_device = name_to_dev_t(resume_file); 220 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -207,7 +229,7 @@ static int software_resume(void)
207 * FIXME: If noresume is specified, we need to find the partition 229 * FIXME: If noresume is specified, we need to find the partition
208 * and reset it back to normal swap space. 230 * and reset it back to normal swap space.
209 */ 231 */
210 up(&pm_sem); 232 mutex_unlock(&pm_mutex);
211 return 0; 233 return 0;
212 } 234 }
213 235
@@ -251,7 +273,7 @@ static int software_resume(void)
251 unprepare_processes(); 273 unprepare_processes();
252 Done: 274 Done:
253 /* For success case, the suspend path will release the lock */ 275 /* For success case, the suspend path will release the lock */
254 up(&pm_sem); 276 mutex_unlock(&pm_mutex);
255 pr_debug("PM: Resume from disk failed.\n"); 277 pr_debug("PM: Resume from disk failed.\n");
256 return 0; 278 return 0;
257} 279}
@@ -312,7 +334,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
312 p = memchr(buf, '\n', n); 334 p = memchr(buf, '\n', n);
313 len = p ? p - buf : n; 335 len = p ? p - buf : n;
314 336
315 down(&pm_sem); 337 mutex_lock(&pm_mutex);
316 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 338 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
317 if (!strncmp(buf, pm_disk_modes[i], len)) { 339 if (!strncmp(buf, pm_disk_modes[i], len)) {
318 mode = i; 340 mode = i;
@@ -336,7 +358,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
336 358
337 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 359 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
338 pm_disk_modes[mode]); 360 pm_disk_modes[mode]);
339 up(&pm_sem); 361 mutex_unlock(&pm_mutex);
340 return error ? error : n; 362 return error ? error : n;
341} 363}
342 364
@@ -361,14 +383,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
361 if (maj != MAJOR(res) || min != MINOR(res)) 383 if (maj != MAJOR(res) || min != MINOR(res))
362 goto out; 384 goto out;
363 385
364 down(&pm_sem); 386 mutex_lock(&pm_mutex);
365 swsusp_resume_device = res; 387 swsusp_resume_device = res;
366 up(&pm_sem); 388 mutex_unlock(&pm_mutex);
367 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
368 noresume = 0; 390 noresume = 0;
369 software_resume(); 391 software_resume();
370 ret = n; 392 ret = n;
371out: 393 out:
372 return ret; 394 return ret;
373} 395}
374 396
@@ -423,6 +445,19 @@ static int __init resume_setup(char *str)
423 return 1; 445 return 1;
424} 446}
425 447
448static int __init resume_offset_setup(char *str)
449{
450 unsigned long long offset;
451
452 if (noresume)
453 return 1;
454
455 if (sscanf(str, "%llu", &offset) == 1)
456 swsusp_resume_block = offset;
457
458 return 1;
459}
460
426static int __init noresume_setup(char *str) 461static int __init noresume_setup(char *str)
427{ 462{
428 noresume = 1; 463 noresume = 1;
@@ -430,4 +465,5 @@ static int __init noresume_setup(char *str)
430} 465}
431 466
432__setup("noresume", noresume_setup); 467__setup("noresume", noresume_setup);
468__setup("resume_offset=", resume_offset_setup);
433__setup("resume=", resume_setup); 469__setup("resume=", resume_setup);
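
resume_offset_setup() follows the usual __setup() convention: parse the value, store it, and return 1 so the option is treated as consumed. A sketch of the same convention under an illustrative name:

#include <linux/init.h>
#include <linux/kernel.h>

static unsigned long long example_offset;

static int __init example_offset_setup(char *str)
{
	unsigned long long offset;

	if (sscanf(str, "%llu", &offset) == 1)
		example_offset = offset;

	return 1;	/* option consumed, do not pass it on to init */
}
__setup("example_offset=", example_offset_setup);

Booting with example_offset=12345 on the command line would then populate the variable before initcalls run, which is how resume_offset= feeds swsusp_resume_block above.
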
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c71dab..500eb87f643d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,13 +19,14 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 32suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
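
Note the inverted test in enter_state(): down_trylock() returns nonzero on failure, while mutex_trylock() returns nonzero on success, so the conversion flips the condition. A minimal sketch of the converted pattern (example_mutex and example_enter are illustrative):

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(example_mutex);

static int example_enter(void)
{
	/* mutex_trylock() returns 1 on success and 0 on contention --
	 * the opposite sense of the old down_trylock().
	 */
	if (!mutex_trylock(&example_mutex))
		return -EBUSY;

	/* ... critical section ... */

	mutex_unlock(&example_mutex);
	return 0;
}
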
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
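
The new SNAPSHOT_SET_SWAP_AREA ioctl carries the packed resume_swap_area structure in from user space. A rough user-space sketch of issuing it — /dev/snapshot is the usual node for this interface, and the device numbers below are purely illustrative:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/ioctl.h>

/* Layout-compatible mirror of the kernel's packed structure */
struct resume_swap_area {
	long long offset;		/* loff_t in the kernel header */
	unsigned int dev;		/* u_int32_t in the kernel header */
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC	'3'
#define SNAPSHOT_SET_SWAP_AREA	_IOW(SNAPSHOT_IOC_MAGIC, 13, \
					struct resume_swap_area)

int main(void)
{
	struct resume_swap_area swap_area = {
		.offset	= 0,			/* 0 = start of a swap partition */
		.dev	= makedev(8, 2),	/* e.g. /dev/sda2; illustrative */
	};
	int fd = open("/dev/snapshot", O_RDONLY);

	if (fd < 0 || ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &swap_area)) {
		perror("SNAPSHOT_SET_SWAP_AREA");
		return 1;
	}
	close(fd);
	return 0;
}
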
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
16 * callback we use. 16 * callback we use.
17 */ 17 */
18 18
19static void do_poweroff(void *dummy) 19static void do_poweroff(struct work_struct *dummy)
20{ 20{
21 kernel_power_off(); 21 kernel_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff);
25 25
26static void handle_poweroff(int key, struct tty_struct *tty) 26static void handle_poweroff(int key, struct tty_struct *tty)
27{ 27{
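
do_poweroff() takes no payload, so its conversion to the reworked workqueue API is mechanical; handlers that did carry data now embed the work_struct in their context and recover it with container_of(). A sketch of that general pattern with illustrative names:

#include <linux/workqueue.h>
#include <linux/kernel.h>

struct example_ctx {
	int value;
	struct work_struct work;
};

static void example_handler(struct work_struct *work)
{
	/* The old void *data argument is gone; the handler gets the
	 * work item itself and digs out its container.
	 */
	struct example_ctx *ctx = container_of(work, struct example_ctx, work);

	printk(KERN_INFO "example: value=%d\n", ctx->value);
}

static void example_submit(struct example_ctx *ctx)
{
	INIT_WORK(&ctx->work, example_handler);
	schedule_work(&ctx->work);
}
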
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..99eeb119b06d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,12 +13,15 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
@@ -39,7 +42,6 @@ void refrigerator(void)
39 long save; 42 long save;
40 save = current->state; 43 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 44 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 45
44 frozen_process(current); 46 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 47 spin_lock_irq(&current->sighand->siglock);
@@ -79,96 +81,136 @@ static void cancel_freezing(struct task_struct *p)
79 } 81 }
80} 82}
81 83
82/* 0 = success, else # of processes that we failed to stop */ 84static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 85{
86 return p->mm && !(p->flags & PF_BORROWED_MM);
87}
88
89static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 90{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 91 struct task_struct *g, *p;
92 unsigned long end_time;
93 unsigned int todo;
88 94
89 printk( "Stopping tasks: " ); 95 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 96 do {
93 nr_user = todo = 0; 97 todo = 0;
94 read_lock(&tasklist_lock); 98 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 99 do_each_thread(g, p) {
96 if (!freezeable(p)) 100 if (!freezeable(p))
97 continue; 101 continue;
102
98 if (frozen(p)) 103 if (frozen(p))
99 continue; 104 continue;
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 105
106 if (p->state == TASK_TRACED &&
107 (frozen(p->parent) ||
108 p->parent->state == TASK_STOPPED)) {
101 cancel_freezing(p); 109 cancel_freezing(p);
102 continue; 110 continue;
103 } 111 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 112 if (is_user_space(p)) {
105 /* The task is a user-space one. 113 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 114 continue;
107 * pending 115
116 /* Freeze the task unless there is a vfork
117 * completion pending
108 */ 118 */
109 if (!p->vfork_done) 119 if (!p->vfork_done)
110 freeze_process(p); 120 freeze_process(p);
111 nr_user++;
112 } else { 121 } else {
113 /* Freeze only if the user space is frozen */ 122 if (freeze_user_space)
114 if (user_frozen) 123 continue;
115 freeze_process(p); 124
116 todo++; 125 freeze_process(p);
117 } 126 }
127 todo++;
118 } while_each_thread(g, p); 128 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 129 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 130 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 131 if (todo && time_after(jiffies, end_time))
128 break; 132 break;
129 } while(todo); 133 } while (todo);
130 134
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 135 if (todo) {
137 printk( "\n" ); 136 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 137 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 138 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 139 * but it cleans up leftover PF_FREEZE requests.
140 */
141 printk("\n");
142 printk(KERN_ERR "Stopping %s timed out after %d seconds "
143 "(%d tasks refusing to freeze):\n",
144 freeze_user_space ? "user space processes" :
145 "kernel threads",
146 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 147 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 148 do_each_thread(g, p) {
149 if (is_user_space(p) == !freeze_user_space)
150 continue;
151
143 if (freezeable(p) && !frozen(p)) 152 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 153 printk(KERN_ERR " %s\n", p->comm);
154
145 cancel_freezing(p); 155 cancel_freezing(p);
146 } while_each_thread(g, p); 156 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 157 read_unlock(&tasklist_lock);
148 return todo;
149 } 158 }
150 159
151 printk( "|\n" ); 160 return todo;
161}
162
163/**
164 * freeze_processes - tell processes to enter the refrigerator
165 *
166 * Returns 0 on success, or the number of processes that didn't freeze,
167 * although they were told to.
168 */
169int freeze_processes(void)
170{
171 unsigned int nr_unfrozen;
172
173 printk("Stopping tasks ... ");
174 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
175 if (nr_unfrozen)
176 return nr_unfrozen;
177
178 sys_sync();
179 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
180 if (nr_unfrozen)
181 return nr_unfrozen;
182
183 printk("done.\n");
152 BUG_ON(in_atomic()); 184 BUG_ON(in_atomic());
153 return 0; 185 return 0;
154} 186}
155 187
156void thaw_processes(void) 188static void thaw_tasks(int thaw_user_space)
157{ 189{
158 struct task_struct *g, *p; 190 struct task_struct *g, *p;
159 191
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 193 do_each_thread(g, p) {
163 if (!freezeable(p)) 194 if (!freezeable(p))
164 continue; 195 continue;
196
197 if (is_user_space(p) == !thaw_user_space)
198 continue;
199
165 if (!thaw_process(p)) 200 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 201 printk(KERN_WARNING " Strange, %s not stopped\n",
202 p->comm );
167 } while_each_thread(g, p); 203 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 204 read_unlock(&tasklist_lock);
205}
206
207void thaw_processes(void)
208{
209 printk("Restarting tasks ... ");
210 thaw_tasks(FREEZER_KERNEL_THREADS);
211 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 212 schedule();
171 printk( " done\n" ); 213 printk("done.\n");
172} 214}
173 215
174EXPORT_SYMBOL(refrigerator); 216EXPORT_SYMBOL(refrigerator);
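
With user space and kernel threads now frozen in separate passes, a cooperating kernel thread still only needs to poll try_to_freeze() in its main loop; refrigerator() parks it until thaw_processes() runs. A sketch of such a loop (example_thread is illustrative):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* sits in refrigerator() while freezing */

		/* ... do one unit of work ... */

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

A thread started with kthread_run(example_thread, NULL, "example") would then be stopped in the kernel-thread pass of try_to_freeze_tasks() above.
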
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 726 n += !!saveable_page(pfn); 711 if (saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
 897 * highmem pages is less than that, allocate them all.
898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
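
The metadata-page count now uses DIV_ROUND_UP() instead of the open-coded add-and-shift; both round nr_pages * sizeof(long) bytes up to whole pages. A worked instance, assuming the common case of 4 KB pages and 8-byte longs:

/* DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d), so for
 * 100000 image pages:
 *
 *	DIV_ROUND_UP(100000 * sizeof(long), PAGE_SIZE)
 *	= (800000 + 4095) / 4096
 *	= 196 metadata pages
 *
 * which matches the old ((n * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT).
 */
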
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
1247/* struct highmem_pbe is used for creating the list of highmem pages that
1248 * should be restored atomically during the resume from disk, because the page
1249 * frames they have occupied before the suspend are in use.
1250 */
1251struct highmem_pbe {
1252 struct page *copy_page; /* data is here now */
1253 struct page *orig_page; /* data was here before the suspend */
1254 struct highmem_pbe *next;
1255};
1256
1257/* List of highmem PBEs needed for restoring the highmem pages that were
1258 * allocated before the suspend and included in the suspend image, but have
1259 * also been allocated by the "resume" kernel, so their contents cannot be
1260 * written directly to their "original" page frames.
1261 */
1262static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
 1292 * uninitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
 1325 /* The page is "safe", set its bit in the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
1351 * @buffer is returned, @last_highmem page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
 1401 * @buffer, where the caller of snapshot_write_next() has placed them,
 1402 * to the right location represented by @last_highmem_page.
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
 1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create lists of "safe" pages that will be used
1463 * later. On systems with high memory a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */ 1503
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
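The comment above prepare_image() describes the core trick: memory for the restored image is grabbed early and threaded into a list of "safe" pages that later consumers simply pop from, with each free page doubling as its own list node. A minimal user-space model of that reservation scheme, assuming a fixed PAGE_SIZE and illustrative names (a sketch, not the kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096

    /* A free page doubles as a list node, as with struct linked_page. */
    struct linked_page {
        struct linked_page *next;
    };

    static struct linked_page *safe_pages_list;

    /* Reserve nr page-sized buffers up front; returns 0 on success. */
    static int reserve_safe_pages(unsigned int nr)
    {
        while (nr-- > 0) {
            struct linked_page *lp = malloc(PAGE_SIZE);
            if (!lp)
                return -1;
            lp->next = safe_pages_list;
            safe_pages_list = lp;
        }
        return 0;
    }

    /* Pop one reserved page; the caller now owns the whole buffer. */
    static void *get_safe_page(void)
    {
        struct linked_page *lp = safe_pages_list;
        if (lp)
            safe_pages_list = lp->next;
        return lp;
    }

    int main(void)
    {
        if (reserve_safe_pages(4))
            return 1;
        printf("first safe page at %p\n", get_safe_page());
        return 0;
    }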
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
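swap_two_pages_data() exchanges two page frames through a bounce buffer with three memcpy() calls: the frames must keep their physical locations, so only the contents can move. The same exchange on ordinary buffers, as a stand-alone sketch:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    /* Exchange the contents of p1 and p2 using buf as scratch space. */
    static void swap_data(void *p1, void *p2, void *buf)
    {
        memcpy(buf, p1, PAGE_SIZE);   /* save p1 */
        memcpy(p1, p2, PAGE_SIZE);    /* p2 -> p1 */
        memcpy(p2, buf, PAGE_SIZE);   /* saved p1 -> p2 */
    }

    int main(void)
    {
        static char a[PAGE_SIZE] = "before resume";
        static char b[PAGE_SIZE] = "before suspend";
        char scratch[PAGE_SIZE];

        swap_data(a, b, scratch);
        printf("a: %s, b: %s\n", a, b); /* a: before suspend, b: before resume */
        return 0;
    }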
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
1713 * allocated by the "resume" kernel, swap its current (i.e. "before
1714 * resume") contents with the previous (i.e. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
53 * @page_off: physical offset of page.
54 * @page: page we're reading or writing.
55 * @bio_chain: list of pending bios (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
81
82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
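submit() and wait_on_bio_chain() above form a fire-and-collect pair: asynchronous requests are linked into a caller-owned chain via bi_private, and a single later pass waits on and releases every request. A user-space model of the same pattern, with a plain linked list standing in for the bio chain (hypothetical names, no real I/O):

    #include <stdio.h>
    #include <stdlib.h>

    struct request {
        int id;
        int done;               /* set by the "completion" */
        struct request *next;   /* plays the role of bi_private */
    };

    /* Start a request; if chain is NULL, "wait" for it immediately. */
    static int submit(int id, struct request **chain)
    {
        struct request *rq = malloc(sizeof(*rq));
        if (!rq)
            return -1;
        rq->id = id;
        rq->done = 1;           /* pretend the request completed */
        if (!chain) {           /* synchronous mode */
            printf("request %d done synchronously\n", rq->id);
            free(rq);
            return 0;
        }
        rq->next = *chain;      /* link into the pending chain */
        *chain = rq;
        return 0;
    }

    /* Wait for every request on the chain, then empty it. */
    static int wait_on_chain(struct request **chain)
    {
        int ret = 0;
        struct request *rq = chain ? *chain : NULL;

        while (rq) {
            struct request *next = rq->next;
            if (!rq->done)
                ret = -1;       /* would be -EIO in the kernel */
            printf("collected request %d\n", rq->id);
            free(rq);
            rq = next;
        }
        if (chain)
            *chain = NULL;
        return ret;
    }

    int main(void)
    {
        struct request *chain = NULL;
        submit(1, &chain);
        submit(2, &chain);
        submit(3, NULL);        /* synchronous */
        return wait_on_chain(&chain);
    }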
135
136/*
137 * Saving part
138 */
48 139
49static int mark_swapfiles(swp_entry_t start) 140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
169 if (res < 0)
170 return res;
171
172 root_swap = res;
173 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
174 if (IS_ERR(resume_bdev))
175 return PTR_ERR(resume_bdev);
176
177 res = set_blocksize(resume_bdev, PAGE_SIZE);
178 if (res < 0)
179 blkdev_put(resume_bdev);
78 180
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 181 return res;
84} 182}
85 183
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 188 * @bio_chain: Link the next write BIO here
91 */ 189 */
92 190
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 191static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 192{
95 swp_entry_t entry; 193 void *src;
96 int error = -ENOSPC; 194
97 195 if (!offset)
98 if (offset) { 196 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 197
100 198 if (bio_chain) {
101 if (bio_chain) { 199 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 200 if (src) {
103 * Whether or not we successfully allocated a copy page, 201 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 202 } else {
105 * wait_on_bio_chain(). 203 WARN_ON_ONCE(1);
106 */ 204 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 205 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 206 }
119 entry = swp_entry(root_swap, offset); 207 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 208 src = buf;
121 } 209 }
122 return error; 210 return bio_write_page(offset, src, bio_chain);
123} 211}
124 212
125/* 213/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 225 * at a time.
138 */ 226 */
139 227
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 228#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 229
142struct swap_map_page { 230struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 231 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 232 sector_t next_swap;
145}; 233};
146 234
147/** 235/**
@@ -151,7 +239,7 @@ struct swap_map_page {
151 239
152struct swap_map_handle { 240struct swap_map_handle {
153 struct swap_map_page *cur; 241 struct swap_map_page *cur;
154 unsigned long cur_swap; 242 sector_t cur_swap;
155 struct bitmap_page *bitmap; 243 struct bitmap_page *bitmap;
156 unsigned int k; 244 unsigned int k;
157}; 245};
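A swap_map_page is an index block: MAP_PAGE_ENTRIES data sectors plus the sector of the next index block, so an image can be read back knowing nothing but the starting sector. A user-space sketch of walking such a chain, with in-memory structs standing in for on-disk sectors (illustrative only):

    #include <stdio.h>

    /* Tiny for the demo; really PAGE_SIZE / sizeof(sector_t) - 1. */
    #define MAP_PAGE_ENTRIES 3

    struct swap_map_page {
        unsigned long entries[MAP_PAGE_ENTRIES]; /* data "sectors" */
        struct swap_map_page *next;              /* stands in for next_swap */
    };

    /* Visit every data sector in image order, as swap_read_page() does. */
    static void walk_swap_map(struct swap_map_page *cur)
    {
        while (cur) {
            unsigned int k;
            for (k = 0; k < MAP_PAGE_ENTRIES && cur->entries[k]; k++)
                printf("read data sector %lu\n", cur->entries[k]);
            cur = cur->next;
        }
    }

    int main(void)
    {
        struct swap_map_page second = { { 40, 41, 0 }, NULL };
        struct swap_map_page first = { { 10, 11, 12 }, &second };
        walk_swap_map(&first);
        return 0;
    }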
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 254 handle->bitmap = NULL;
167} 255}
168 256
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 257static int get_swap_writer(struct swap_map_handle *handle)
190{ 258{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 259 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 264 release_swap_writer(handle);
197 return -ENOMEM; 265 return -ENOMEM;
198 } 266 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 267 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 268 if (!handle->cur_swap) {
201 release_swap_writer(handle); 269 release_swap_writer(handle);
202 return -ENOSPC; 270 return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 273 return 0;
206} 274}
207 275
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 276static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 277 struct bio **bio_chain)
238{ 278{
239 int error = 0; 279 int error = 0;
240 unsigned long offset; 280 sector_t offset;
241 281
242 if (!handle->cur) 282 if (!handle->cur)
243 return -EINVAL; 283 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 284 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 285 error = write_page(buf, offset, bio_chain);
246 if (error) 286 if (error)
247 return error; 287 return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 290 error = wait_on_bio_chain(bio_chain);
251 if (error) 291 if (error)
252 goto out; 292 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 293 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 294 if (!offset)
255 return -ENOSPC; 295 return -ENOSPC;
256 handle->cur->next_swap = offset; 296 handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 301 handle->cur_swap = offset;
262 handle->k = 0; 302 handle->k = 0;
263 } 303 }
264out: 304 out:
265 return error; 305 return error;
266} 306}
267 307
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 355 error = err2;
316 if (!error) 356 if (!error)
317 printk("\b\b\b\bdone\n"); 357 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 358 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 359 return error;
320} 360}
321 361
@@ -350,100 +390,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 390 struct swsusp_info *header;
351 int error; 391 int error;
352 392
353 if ((error = swsusp_swap_check())) { 393 error = swsusp_swap_check();
394 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 395 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 396 "swapon -a.\n");
356 return error; 397 return error;
357 } 398 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 399 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 400 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 401 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 402 if (error >= 0)
403 error = -EFAULT;
404
405 goto out;
406 }
362 header = (struct swsusp_info *)data_of(snapshot); 407 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 408 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 409 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 410 error = -ENOSPC;
411 goto out;
366 } 412 }
367 error = get_swap_writer(&handle); 413 error = get_swap_writer(&handle);
368 if (!error) { 414 if (!error) {
369 unsigned long start = handle.cur_swap; 415 sector_t start = handle.cur_swap;
416
370 error = swap_write_page(&handle, header, NULL); 417 error = swap_write_page(&handle, header, NULL);
371 if (!error) 418 if (!error)
372 error = save_image(&handle, &snapshot, 419 error = save_image(&handle, &snapshot,
373 header->pages - 1); 420 header->pages - 1);
421
374 if (!error) { 422 if (!error) {
375 flush_swap_writer(&handle); 423 flush_swap_writer(&handle);
376 printk("S"); 424 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 425 error = mark_swapfiles(start);
378 printk("|\n"); 426 printk("|\n");
379 } 427 }
380 } 428 }
381 if (error) 429 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 430 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 431 release_swap_writer(&handle);
432 out:
433 swsusp_close();
384 return error; 434 return error;
385} 435}
386 436
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 if (rw == READ)
429 get_page(page); /* These pages are freed later */
430 bio->bi_private = *bio_chain;
431 *bio_chain = bio;
432 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
433 }
434 return 0;
435}
436
437static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
438{
439 return submit(READ, page_off, virt_to_page(addr), bio_chain);
440}
441
442static int bio_write_page(pgoff_t page_off, void *addr)
443{
444 return submit(WRITE, page_off, virt_to_page(addr), NULL);
445}
446
447/** 437/**
448 * The following functions allow us to read data using a swap map 438 * The following functions allow us to read data using a swap map
449 * in a file-alike way 439 * in a file-alike way
@@ -456,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
456 handle->cur = NULL; 446 handle->cur = NULL;
457} 447}
458 448
459static int get_swap_reader(struct swap_map_handle *handle, 449static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
460 swp_entry_t start)
461{ 450{
462 int error; 451 int error;
463 452
464 if (!swp_offset(start)) 453 if (!start)
465 return -EINVAL; 454 return -EINVAL;
466 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 455
456 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
467 if (!handle->cur) 457 if (!handle->cur)
468 return -ENOMEM; 458 return -ENOMEM;
469 error = bio_read_page(swp_offset(start), handle->cur, NULL); 459
460 error = bio_read_page(start, handle->cur, NULL);
470 if (error) { 461 if (error) {
471 release_swap_reader(handle); 462 release_swap_reader(handle);
472 return error; 463 return error;
@@ -478,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
478static int swap_read_page(struct swap_map_handle *handle, void *buf, 469static int swap_read_page(struct swap_map_handle *handle, void *buf,
479 struct bio **bio_chain) 470 struct bio **bio_chain)
480{ 471{
481 unsigned long offset; 472 sector_t offset;
482 int error; 473 int error;
483 474
484 if (!handle->cur) 475 if (!handle->cur)
@@ -547,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
547 error = err2; 538 error = err2;
548 if (!error) { 539 if (!error) {
549 printk("\b\b\b\bdone\n"); 540 printk("\b\b\b\bdone\n");
550 snapshot_free_unused_memory(snapshot); 541 snapshot_write_finalize(snapshot);
551 if (!snapshot_image_loaded(snapshot)) 542 if (!snapshot_image_loaded(snapshot))
552 error = -ENODATA; 543 error = -ENODATA;
553 } 544 }
554 show_speed(&start, &stop, nr_to_read, "Read"); 545 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
555 return error; 546 return error;
556} 547}
557 548
@@ -600,12 +591,16 @@ int swsusp_check(void)
600 if (!IS_ERR(resume_bdev)) { 591 if (!IS_ERR(resume_bdev)) {
601 set_blocksize(resume_bdev, PAGE_SIZE); 592 set_blocksize(resume_bdev, PAGE_SIZE);
602 memset(&swsusp_header, 0, sizeof(swsusp_header)); 593 memset(&swsusp_header, 0, sizeof(swsusp_header));
603 if ((error = bio_read_page(0, &swsusp_header, NULL))) 594 error = bio_read_page(swsusp_resume_block,
595 &swsusp_header, NULL);
596 if (error)
604 return error; 597 return error;
598
605 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 599 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
606 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 600 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
607 /* Reset swap signature now */ 601 /* Reset swap signature now */
608 error = bio_write_page(0, &swsusp_header); 602 error = bio_write_page(swsusp_resume_block,
603 &swsusp_header, NULL);
609 } else { 604 } else {
610 return -EINVAL; 605 return -EINVAL;
611 } 606 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
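The computation in swsusp_show_speed() is pure integer arithmetic: elapsed time is first reduced to centiseconds, kilobytes per second fall out of k * 100 / centisecs, and MB/s with two decimals is printed by splitting the KB/s value. A stand-alone check of the formula with sample numbers (5000 pages of 4 KiB over 2.56 s):

    #include <stdio.h>

    int main(void)
    {
        int centisecs = 256;                /* 2.56 s elapsed */
        int nr_pages = 5000;                /* 4 KiB pages */
        int k = nr_pages * (4096 / 1024);   /* 20000 kbytes */
        int kps = (k * 100) / centisecs;    /* 7812 kbytes/s */

        printf("%d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
               k, centisecs / 100, centisecs % 100,
               kps / 1000, (kps % 1000) / 10); /* 7.81 MB/s */
        return 0;
    }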
194
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
316 /* This call to restore_highmem() undoes the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg), 0);
268 if (data->swap < 0) 272 if (data->swap < 0)
269 error = -ENODEV; 273 error = -ENODEV;
270 } else { 274 } else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 286 break;
283 } 287 }
284 288
285 if (down_trylock(&pm_sem)) { 289 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 290 error = -EBUSY;
287 break; 291 break;
288 } 292 }
@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 313 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 314 pm_ops->finish(PM_SUSPEND_MEM);
311 315
312OutS3: 316 OutS3:
313 up(&pm_sem); 317 mutex_unlock(&pm_mutex);
318 break;
319
320 case SNAPSHOT_PMOPS:
321 switch (arg) {
322
323 case PMOPS_PREPARE:
324 if (pm_ops->prepare) {
325 error = pm_ops->prepare(PM_SUSPEND_DISK);
326 }
327 break;
328
329 case PMOPS_ENTER:
330 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
331 error = pm_ops->enter(PM_SUSPEND_DISK);
332 break;
333
334 case PMOPS_FINISH:
335 if (pm_ops && pm_ops->finish) {
336 pm_ops->finish(PM_SUSPEND_DISK);
337 }
338 break;
339
340 default:
341 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
342 error = -EINVAL;
343
344 }
345 break;
346
347 case SNAPSHOT_SET_SWAP_AREA:
348 if (data->bitmap) {
349 error = -EPERM;
350 } else {
351 struct resume_swap_area swap_area;
352 dev_t swdev;
353
354 error = copy_from_user(&swap_area, (void __user *)arg,
355 sizeof(struct resume_swap_area));
356 if (error) {
357 error = -EFAULT;
358 break;
359 }
360
361 /*
362 * User space encodes device types as two-byte values,
363 * so we need to recode them
364 */
365 swdev = old_decode_dev(swap_area.dev);
366 if (swdev) {
367 offset = swap_area.offset;
368 data->swap = swap_type_of(swdev, offset);
369 if (data->swap < 0)
370 error = -ENODEV;
371 } else {
372 data->swap = -1;
373 error = -EINVAL;
374 }
375 }
314 break; 376 break;
315 377
316 default: 378 default:
@@ -321,7 +383,7 @@ OutS3:
321 return error; 383 return error;
322} 384}
323 385
324static struct file_operations snapshot_fops = { 386static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 387 .open = snapshot_open,
326 .release = snapshot_release, 388 .release = snapshot_release,
327 .read = snapshot_read, 389 .read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index 66426552fbfe..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,8 +53,6 @@ int console_printk[4] = {
53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
54}; 54};
55 55
56EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
57
58/* 56/*
59 * Low level drivers may need that to know if they can schedule in 57
60 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -335,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
335 } 333 }
336} 334}
337 335
336static int __read_mostly ignore_loglevel;
337
338int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
338/* 348/*
339 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
340 */ 350 */
341static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
342 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
343{ 353{
344 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
345 console_drivers && start != end) { 355 console_drivers && start != end) {
346 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
347 /* wrapped write */ 357 /* wrapped write */
@@ -631,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
631 641
632asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
633{ 643{
634 return 0; 644 return -ENOSYS;
635}
636
637int do_syslog(int type, char __user *buf, int len)
638{
639 return 0;
640} 645}
641 646
642static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -777,7 +782,6 @@ int is_console_locked(void)
777{ 782{
778 return console_locked; 783 return console_locked;
779} 784}
780EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
781 785
782/** 786/**
783 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43int prof_on __read_mostly;
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 44static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
52{ 52{
53 static char __initdata schedstr[] = "schedule"; 53 static char __initdata schedstr[] = "schedule";
54 static char __initdata sleepstr[] = "sleep";
54 int par; 55 int par;
55 56
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 57 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
58 prof_on = SLEEP_PROFILING;
59 if (str[strlen(sleepstr)] == ',')
60 str += strlen(sleepstr) + 1;
61 if (get_option(&str, &par))
62 prof_shift = par;
63 printk(KERN_INFO
64 "kernel sleep profiling enabled (shift: %ld)\n",
65 prof_shift);
66 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 67 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 68 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 69 str += strlen(schedstr) + 1;
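profile_setup() follows the usual boot-option idiom: match a keyword prefix, then optionally consume a ",N" suffix as the shift value. A user-space sketch of the same parsing, with strtol() standing in for the kernel's get_option() (the default shift here is made up for the demo):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Parse "schedule[,shift]" or "sleep[,shift]", as profile_setup() does. */
    static void parse_profile(const char *str)
    {
        static const char schedstr[] = "schedule";
        static const char sleepstr[] = "sleep";
        long shift = 2;   /* hypothetical default */

        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
            str += strlen(sleepstr);
            if (*str == ',')
                shift = strtol(str + 1, NULL, 10);
            printf("sleep profiling, shift %ld\n", shift);
        } else if (!strncmp(str, schedstr, strlen(schedstr))) {
            str += strlen(schedstr);
            if (*str == ',')
                shift = strtol(str + 1, NULL, 10);
            printf("schedule profiling, shift %ld\n", shift);
        } else {
            printf("unrecognized option: %s\n", str);
        }
    }

    int main(void)
    {
        parse_profile("sleep,3");   /* sleep profiling, shift 3 */
        parse_profile("schedule");  /* schedule profiling, shift 2 */
        return 0;
    }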
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 214 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 215 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 216 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 217 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
218 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 219 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 220 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 221 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 268 mutex_unlock(&profile_flip_mutex);
258} 269}
259 270
260void profile_hit(int type, void *__pc) 271void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 272{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 273 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 274 int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 285 put_cpu();
275 return; 286 return;
276 } 287 }
288 /*
289 * We buffer the global profiler buffer into a per-CPU
290 * queue and thus reduce the number of global (and possibly
291 * NUMA-alien) accesses. The write-queue is self-coalescing:
292 */
277 local_irq_save(flags); 293 local_irq_save(flags);
278 do { 294 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 296 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 297 hits[i + j].hits += nr_hits;
282 goto out; 298 goto out;
283 } else if (!hits[i + j].hits) { 299 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 300 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 301 hits[i + j].hits = nr_hits;
286 goto out; 302 goto out;
287 } 303 }
288 } 304 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 306 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 307
308 /*
309 * Add the current hit(s) and flush the write-queue out
310 * to the global buffer:
311 */
312 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 313 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 314 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 315 hits[i].pc = hits[i].hits = 0;
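The per-CPU write queue described in the comment above behaves like a small open-addressed table: a repeated pc coalesces into its existing slot, a new pc claims the first empty slot, and a full table is drained into the global buffer. A single-threaded model of that policy (sizes and hash are arbitrary for the demo; the real code additionally handles buffer flipping and IRQ masking):

    #include <stdio.h>

    #define NR_SLOTS 4
    #define GLOBAL_LEN 16

    struct hit { unsigned long pc; unsigned int hits; };

    static struct hit queue[NR_SLOTS];          /* per-CPU write queue */
    static unsigned int global_buf[GLOBAL_LEN]; /* shared profile buffer */

    static void flush_queue(void)
    {
        int i;

        /* Empty slots have hits == 0 and contribute nothing. */
        for (i = 0; i < NR_SLOTS; i++) {
            global_buf[queue[i].pc % GLOBAL_LEN] += queue[i].hits;
            queue[i].pc = 0;
            queue[i].hits = 0;
        }
    }

    static void profile_hit(unsigned long pc)
    {
        int i;

        for (i = 0; i < NR_SLOTS; i++) {
            if (queue[i].pc == pc) {  /* coalesce with an earlier hit */
                queue[i].hits++;
                return;
            }
            if (!queue[i].hits) {     /* claim an empty slot */
                queue[i].pc = pc;
                queue[i].hits = 1;
                return;
            }
        }
        /* Queue full: account this hit directly and drain the queue. */
        global_buf[pc % GLOBAL_LEN]++;
        flush_queue();
    }

    int main(void)
    {
        unsigned long samples[] = { 5, 5, 9, 5, 13, 17, 2, 5 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            profile_hit(samples[i]);
        flush_queue();
        printf("pc 5 accumulated %u hits\n", global_buf[5]); /* 4 */
        return 0;
    }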
@@ -298,7 +319,6 @@ out:
298 put_cpu(); 319 put_cpu();
299} 320}
300 321
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 322static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 323 unsigned long action, void *__cpu)
304{ 324{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 371 }
352 return NOTIFY_OK; 372 return NOTIFY_OK;
353} 373}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 374#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 375#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 376#define profile_discard_flip_buffers() do { } while (0)
377#define profile_cpu_callback NULL
358 378
359void profile_hit(int type, void *__pc) 379void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 380{
361 unsigned long pc; 381 unsigned long pc;
362 382
363 if (prof_on != type || !prof_buffer) 383 if (prof_on != type || !prof_buffer)
364 return; 384 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 385 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 386 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 387}
368#endif /* !CONFIG_SMP */ 388#endif /* !CONFIG_SMP */
369 389
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 462 read = 0;
443 463
444 while (p < sizeof(unsigned int) && count > 0) { 464 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 465 if (put_user(*((char *)(&sample_step)+p),buf))
466 return -EFAULT;
446 buf++; p++; count--; read++; 467 buf++; p++; count--; read++;
447 } 468 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 469 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 501 return count;
481} 502}
482 503
483static struct file_operations proc_profile_operations = { 504static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 505 .read = read_profile,
485 .write = write_profile, 506 .write = write_profile,
486}; 507};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..75a3a9a7efc2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
308 * reason waking is deferred is that calling directly from write 308 * reason waking is deferred is that calling directly from write
309 * causes problems if you're writing from say the scheduler. 309 * causes problems if you're writing from say the scheduler.
310 */ 310 */
311static void wakeup_readers(void *private) 311static void wakeup_readers(struct work_struct *work)
312{ 312{
313 struct rchan_buf *buf = private; 313 struct rchan_buf *buf =
314 container_of(work, struct rchan_buf, wake_readers.work);
314 wake_up_interruptible(&buf->read_wait); 315 wake_up_interruptible(&buf->read_wait);
315} 316}
316 317
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
328 if (init) { 329 if (init) {
329 init_waitqueue_head(&buf->read_wait); 330 init_waitqueue_head(&buf->read_wait);
330 kref_init(&buf->kref); 331 kref_init(&buf->kref);
331 INIT_WORK(&buf->wake_readers, NULL, NULL); 332 INIT_DELAYED_WORK(&buf->wake_readers, NULL);
332 } else { 333 } else {
333 cancel_delayed_work(&buf->wake_readers); 334 cancel_delayed_work(&buf->wake_readers);
334 flush_scheduled_work(); 335 flush_scheduled_work();
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
549 buf->padding[old_subbuf]; 550 buf->padding[old_subbuf];
550 smp_mb(); 551 smp_mb();
551 if (waitqueue_active(&buf->read_wait)) { 552 if (waitqueue_active(&buf->read_wait)) {
552 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); 553 PREPARE_DELAYED_WORK(&buf->wake_readers,
554 wakeup_readers);
553 schedule_delayed_work(&buf->wake_readers, 1); 555 schedule_delayed_work(&buf->wake_readers, 1);
554 } 556 }
555 } 557 }
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1011 actor, &desc); 1013 actor, &desc);
1012} 1014}
1013 1015
1014struct file_operations relay_file_operations = { 1016const struct file_operations relay_file_operations = {
1015 .open = relay_file_open, 1017 .open = relay_file_open,
1016 .poll = relay_file_poll, 1018 .poll = relay_file_poll,
1017 .mmap = relay_file_mmap, 1019 .mmap = relay_file_mmap,
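The relay changes are the mechanical half of the 2.6.20 work_struct rework: handlers now receive the work_struct itself rather than a void * payload, recover their enclosing object with container_of() (as wakeup_readers() does above), and delayed items move to the dedicated struct delayed_work, hence INIT_DELAYED_WORK()/PREPARE_DELAYED_WORK(). A standalone illustration of the container_of() recovery step, with demo type names invented here:

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work { int pending; };

	struct buf_demo {
		int id;
		struct work wake_readers;	/* work item embedded in its owner */
	};

	static void handler(struct work *w)
	{
		/* step back from the member to the structure embedding it */
		struct buf_demo *buf = container_of(w, struct buf_demo, wake_readers);

		printf("woke buf %d\n", buf->id);
	}

	int main(void)
	{
		struct buf_demo b = { .id = 42 };

		handler(&b.wake_readers);	/* prints "woke buf 42" */
		return 0;
	}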
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..f385eff4682d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -505,7 +505,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 505 return res;
506} 506}
507 507
508struct file_operations proc_schedstat_operations = { 508const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 509 .open = schedstat_open,
510 .read = seq_read, 510 .read = seq_read,
511 .llseek = seq_lseek, 511 .llseek = seq_lseek,
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
948 } 948 }
949#endif 949#endif
950 950
951 /*
952 * Sleep time is in units of nanosecs, so shift by 20 to get a
953 * milliseconds-range estimation of the amount of time that the task
954 * spent sleeping:
955 */
956 if (unlikely(prof_on == SLEEP_PROFILING)) {
957 if (p->state == TASK_UNINTERRUPTIBLE)
958 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
959 (now - p->timestamp) >> 20);
960 }
961
951 if (!rt_task(p)) 962 if (!rt_task(p))
952 p->prio = recalc_task_prio(p, now); 963 p->prio = recalc_task_prio(p, now);
953 964
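The shift is a cheap nanoseconds-to-milliseconds conversion: dividing by 2^20 = 1,048,576 instead of 1,000,000 needs no 64-bit division in this hot path, at the cost of under-reporting by about 4.6%, which is fine for profiling. Each hit credits the task's sleep site (get_wchan()) with roughly that many milliseconds:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long ns = 1500ULL * 1000 * 1000;	/* slept 1.5 s */

		printf("exact ms: %llu\n", ns / 1000000);	/* 1500 */
		printf("ns >> 20: %llu\n", ns >> 20);		/* 1430, ~4.6% low */
		return 0;
	}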
@@ -3333,6 +3344,7 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3344 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3345 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3346 current->comm, preempt_count(), current->pid);
3347 debug_show_held_locks(current);
3336 dump_stack(); 3348 dump_stack();
3337 } 3349 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3350 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4816,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4816 show_stack(p, NULL);
4805} 4817}
4806 4818
4807void show_state(void) 4819void show_state_filter(unsigned long state_filter)
4808{ 4820{
4809 struct task_struct *g, *p; 4821 struct task_struct *g, *p;
4810 4822
4811#if (BITS_PER_LONG == 32) 4823#if (BITS_PER_LONG == 32)
4812 printk("\n" 4824 printk("\n"
4813 " sibling\n"); 4825 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4826 printk(" task PC stack pid father child younger older\n");
4815#else 4827#else
4816 printk("\n" 4828 printk("\n"
4817 " sibling\n"); 4829 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4830 printk(" task PC stack pid father child younger older\n");
4819#endif 4831#endif
4820 read_lock(&tasklist_lock); 4832 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4833 do_each_thread(g, p) {
@@ -4824,11 +4836,16 @@ void show_state(void)
 4824 * console might take a lot of time: 4836
4825 */ 4837 */
4826 touch_nmi_watchdog(); 4838 touch_nmi_watchdog();
4827 show_task(p); 4839 if (p->state & state_filter)
4840 show_task(p);
4828 } while_each_thread(g, p); 4841 } while_each_thread(g, p);
4829 4842
4830 read_unlock(&tasklist_lock); 4843 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4844 /*
4845 * Only show locks if all tasks are dumped:
4846 */
4847 if (state_filter == -1)
4848 debug_show_all_locks();
4832} 4849}
4833 4850
4834/** 4851/**
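show_state_filter() makes the all-task dump selective: callers pass a TASK_* bitmask, e.g. TASK_UNINTERRUPTIBLE to list only D-state tasks, and the expensive lock dump is reserved for the dump-everything mask of -1. One subtlety of a pure bitmask test: TASK_RUNNING is 0, so `p->state & state_filter` can never match runnable tasks. Sketch using the real state values:

	#include <stdio.h>

	#define TASK_RUNNING		0
	#define TASK_INTERRUPTIBLE	1
	#define TASK_UNINTERRUPTIBLE	2

	int main(void)
	{
		long state[] = { TASK_RUNNING, TASK_INTERRUPTIBLE,
				 TASK_UNINTERRUPTIBLE };
		const char *name[] = { "R", "S", "D" };
		unsigned long filter = TASK_UNINTERRUPTIBLE;	/* "only D-state" */

		for (int i = 0; i < 3; i++)
			if (state[i] & filter)
				printf("would show %s task\n", name[i]);
		return 0;
	}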
@@ -6723,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6740 sched_smt_power_savings_store);
6724#endif 6741#endif
6725 6742
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6743/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6744 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6745 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6772
6758 return NOTIFY_OK; 6773 return NOTIFY_OK;
6759} 6774}
6760#endif
6761 6775
6762void __init sched_init_smp(void) 6776void __init sched_init_smp(void)
6763{ 6777{
@@ -6867,6 +6881,7 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6881 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6882 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6883 in_atomic(), irqs_disabled());
6884 debug_show_held_locks(current);
6870 dump_stack(); 6885 dump_stack();
6871 } 6886 }
6872#endif 6887#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index df18c167a2a7..ec81defde339 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
26#include <asm/param.h> 27#include <asm/param.h>
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28#include <asm/unistd.h> 29#include <asm/unistd.h>
@@ -33,7 +34,7 @@
33 * SLAB caches for signal bits. 34 * SLAB caches for signal bits.
34 */ 35 */
35 36
36static kmem_cache_t *sigqueue_cachep; 37static struct kmem_cache *sigqueue_cachep;
37 38
38/* 39/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 40 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -1133,8 +1134,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1133 return error; 1134 return error;
1134} 1135}
1135 1136
1136int 1137static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1137kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1138{ 1138{
1139 int error; 1139 int error;
1140 rcu_read_lock(); 1140 rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..a0c1a29a507f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
880 return 0; 880 return 0;
881} 881}
882 882
883static void deferred_cad(void *dummy) 883static void deferred_cad(struct work_struct *dummy)
884{ 884{
885 kernel_restart(NULL); 885 kernel_restart(NULL);
886} 886}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
892 */ 892 */
893void ctrl_alt_del(void) 893void ctrl_alt_del(void)
894{ 894{
895 static DECLARE_WORK(cad_work, deferred_cad, NULL); 895 static DECLARE_WORK(cad_work, deferred_cad);
896 896
897 if (C_A_D) 897 if (C_A_D)
898 schedule_work(&cad_work); 898 schedule_work(&cad_work);
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6fc5e17086f4..8e9f00fd6d18 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -171,7 +171,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
172static int proc_opensys(struct inode *, struct file *); 172static int proc_opensys(struct inode *, struct file *);
173 173
174struct file_operations proc_sys_file_operations = { 174const struct file_operations proc_sys_file_operations = {
175 .open = proc_opensys, 175 .open = proc_opensys,
176 .read = proc_readsys, 176 .read = proc_readsys,
177 .write = proc_writesys, 177 .write = proc_writesys,
@@ -986,17 +986,6 @@ static ctl_table vm_table[] = {
986 .extra1 = &zero, 986 .extra1 = &zero,
987 }, 987 },
988#endif 988#endif
989#ifdef CONFIG_SWAP
990 {
991 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
992 .procname = "swap_token_timeout",
993 .data = &swap_token_default_timeout,
994 .maxlen = sizeof(swap_token_default_timeout),
995 .mode = 0644,
996 .proc_handler = &proc_dointvec_jiffies,
997 .strategy = &sysctl_jiffies,
998 },
999#endif
1000#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
1001 { 990 {
1002 .ctl_name = VM_ZONE_RECLAIM_MODE, 991 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1895,7 +1884,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1895 p = buf; 1884 p = buf;
1896 if (*p == '-' && left > 1) { 1885 if (*p == '-' && left > 1) {
1897 neg = 1; 1886 neg = 1;
1898 left--, p++; 1887 p++;
1899 } 1888 }
1900 if (*p < '0' || *p > '9') 1889 if (*p < '0' || *p > '9')
1901 break; 1890 break;
@@ -2146,7 +2135,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2146 p = buf; 2135 p = buf;
2147 if (*p == '-' && left > 1) { 2136 if (*p == '-' && left > 1) {
2148 neg = 1; 2137 neg = 1;
2149 left--, p++; 2138 p++;
2150 } 2139 }
2151 if (*p < '0' || *p > '9') 2140 if (*p < '0' || *p > '9')
2152 break; 2141 break;
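The `left--, p++` edits in both parsers look cosmetic but fix a counting bug: after simple_strtoul() the function computes the consumed length as p - buf, which already includes the leading '-', so also decrementing `left` when the sign was consumed counted that byte twice. Presumably that is the motivation here; a self-contained rendition of the double-count:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const char buf[] = "-42";
		const char *p = buf;
		size_t left = sizeof(buf) - 1;		/* 3 bytes remaining */
		int neg = 0;

		if (*p == '-' && left > 1) {
			neg = 1;
			/* the buggy variant additionally did: left--; */
			p++;
		}

		char *end;
		unsigned long val = strtoul(p, &end, 10);
		size_t len = end - buf;			/* 3: the '-' already counted */
		left -= len;

		printf("val=%s%lu left=%zu\n", neg ? "-" : "", val, left);
		/* with the extra left-- above, left would wrap to (size_t)-1 */
		return 0;
	}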
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d3d28919d4b4..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -94,7 +94,6 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
94 } 94 }
95 95
96 *skbp = skb; 96 *skbp = skb;
97 *replyp = reply;
98 return 0; 97 return 0;
99} 98}
100 99
@@ -119,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
119/* 118/*
120 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
121 */ 120 */
122static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
123{ 123{
124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
125 struct listener_list *listeners;
126 struct listener *s, *tmp; 125 struct listener *s, *tmp;
127 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
128 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -135,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
135 } 134 }
136 135
137 rc = 0; 136 rc = 0;
138 listeners = &per_cpu(listener_array, cpu);
139 down_read(&listeners->sem); 137 down_read(&listeners->sem);
140 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
141 skb_next = NULL; 139 skb_next = NULL;
@@ -186,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
186 } else 184 } else
187 get_task_struct(tsk); 185 get_task_struct(tsk);
188 186
187 memset(stats, 0, sizeof(*stats));
189 /* 188 /*
190 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
 191 * fill in relevant parts of struct taskstats as follows 190
@@ -228,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 227
229 if (first->signal->stats) 228 if (first->signal->stats)
230 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
230 else
231 memset(stats, 0, sizeof(*stats));
231 232
232 tsk = first; 233 tsk = first;
233 do { 234 do {
@@ -344,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
344 return ret; 345 return ret;
345} 346}
346 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
347static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
348{ 373{
349 int rc = 0; 374 int rc = 0;
350 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
351 struct taskstats stats; 376 struct taskstats *stats;
352 void *reply;
353 size_t size; 377 size_t size;
354 struct nlattr *na;
355 cpumask_t mask; 378 cpumask_t mask;
356 379
357 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
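mk_reply() factors out the nested-attribute boilerplate and flips the construction strategy: instead of filling a struct taskstats on the stack and copying it into the message with NLA_PUT_TYPE(), it reserves the payload inside the skb with nla_reserve() and returns a pointer into it, so fill_pid()/fill_tgid() write the statistics in place. For the PID case the reply it builds looks like this (attribute tree, not literal code):

	/*
	 * TASKSTATS_TYPE_AGGR_PID	nest opened by nla_nest_start()
	 * +- TASKSTATS_TYPE_PID	u32, written by nla_put()
	 * +- TASKSTATS_TYPE_STATS	sizeof(struct taskstats) bytes, reserved
	 *				empty by nla_reserve(); nla_data(ret)
	 *				is returned so the caller fills it
	 *				in place
	 */

Since nothing is committed to the skb until the reservation succeeds, the NLA_PUT_*/genlmsg_cancel() unwinding disappears; the only error action left is nlmsg_free().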
@@ -372,83 +395,71 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
372 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
373 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
374 397
375 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
376 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
377 if (rc < 0) 399 if (rc < 0)
378 return rc; 400 return rc;
379 401
402 rc = -EINVAL;
380 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
381 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
382 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
383 if (rc < 0) 406 if (!stats)
384 goto err; 407 goto err;
385 408
386 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
387 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
388 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
389 stats);
390 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
391 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
392 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
393 if (rc < 0) 415 if (!stats)
394 goto err; 416 goto err;
395 417
396 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
397 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
398 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
399 stats); 421 } else
400 } else {
401 rc = -EINVAL;
402 goto err; 422 goto err;
403 }
404
405 nla_nest_end(rep_skb, na);
406 423
407 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
408
409nla_put_failure:
410 rc = genlmsg_cancel(rep_skb, reply);
411err: 425err:
412 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
413 return rc; 427 return rc;
414} 428}
415 429
416void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
417{ 431{
418 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
419 struct taskstats *tmp; 433 struct taskstats *stats;
420 /*
421 * This is the cpu on which the task is exiting currently and will
422 * be the one for which the exit event is sent, even if the cpu
423 * on which this function is running changes later.
424 */
425 *mycpu = raw_smp_processor_id();
426 434
427 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
428 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
429 if (!tmp)
430 return;
431 437
432 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
433 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
434 if (!list_empty(&listeners->list)) { 440
435 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
436 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
437 } 445 }
438 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
439 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
440} 452}
441 453
442/* Send pid data out on exit */ 454/* Send pid data out on exit */
443void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
444 int group_dead, unsigned int mycpu)
445{ 456{
446 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
447 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
448 void *reply;
449 size_t size; 461 size_t size;
450 int is_thread_group; 462 int is_thread_group;
451 struct nlattr *na;
452 463
453 if (!family_registered) 464 if (!family_registered)
454 return; 465 return;
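taskstats_tgid_alloc() is the classic optimistic-allocation idiom: allocate outside the lock, where GFP_KERNEL may sleep; install the buffer under ->siglock only if nobody raced us; free the loser's copy. Allocation failure is tolerated, as the caller simply sees a NULL sig->stats and falls back to per-task statistics. A generic pthread rendition of the same pattern, names invented for the sketch:

	#include <pthread.h>
	#include <stdlib.h>

	struct stats { long counter[16]; };

	struct group {
		pthread_mutex_t lock;
		struct stats *stats;		/* lazily allocated, shared */
	};

	/* Return the group's stats block, allocating it on first use. */
	static struct stats *group_stats(struct group *g)
	{
		struct stats *s;

		if (g->stats)
			return g->stats;	/* fast path: already installed */

		s = calloc(1, sizeof(*s));	/* may fail; that is fine */

		pthread_mutex_lock(&g->lock);
		if (!g->stats) {
			g->stats = s;		/* we won the race */
			s = NULL;
		}
		pthread_mutex_unlock(&g->lock);

		free(s);			/* loser's copy, or NULL */
		return g->stats;
	}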
@@ -459,7 +470,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
459 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
460 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
461 472
462 is_thread_group = (tsk->signal->stats != NULL); 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
463 if (is_thread_group) { 474 if (is_thread_group) {
464 /* PID + STATS + TGID + STATS */ 475 /* PID + STATS + TGID + STATS */
465 size = 2 * size; 476 size = 2 * size;
@@ -467,49 +478,39 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
467 fill_tgid_exit(tsk); 478 fill_tgid_exit(tsk);
468 } 479 }
469 480
470 if (!tidstats) 481 listeners = &__raw_get_cpu_var(listener_array);
482 if (list_empty(&listeners->list))
471 return; 483 return;
472 484
473 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
474 if (rc < 0)
475 goto ret;
476
477 rc = fill_pid(tsk->pid, tsk, tidstats);
478 if (rc < 0) 486 if (rc < 0)
479 goto err_skb; 487 return;
480 488
481 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
482 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
483 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
484 *tidstats);
485 nla_nest_end(rep_skb, na);
486 492
487 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
488 goto send; 494 if (rc < 0)
495 goto err;
489 496
490 /* 497 /*
491 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
492 */ 499 */
493 if (!group_dead) 500 if (!is_thread_group || !group_dead)
494 goto send; 501 goto send;
495 502
496 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
497 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
498 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
499 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
500 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
501 nla_nest_end(rep_skb, na);
502 508
503send: 509send:
504 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
505 return; 511 return;
506 512err:
507nla_put_failure:
508 genlmsg_cancel(rep_skb, reply);
509err_skb:
510 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
511ret:
512 return;
513} 514}
514 515
515static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
diff --git a/kernel/user.c b/kernel/user.c
index 220e586127a0..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c27..c5257316f4b9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -80,6 +85,29 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
80 return list_empty(&wq->list); 85 return list_empty(&wq->list);
81} 86}
82 87
88static inline void set_wq_data(struct work_struct *work, void *wq)
89{
90 unsigned long new, old, res;
91
92 /* assume the pending flag is already set and that the task has already
93 * been queued on this workqueue */
94 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
95 res = work->management;
96 if (res != new) {
97 do {
98 old = res;
99 new = (unsigned long) wq;
100 new |= (old & WORK_STRUCT_FLAG_MASK);
101 res = cmpxchg(&work->management, old, new);
102 } while (res != old);
103 }
104}
105
106static inline void *get_wq_data(struct work_struct *work)
107{
108 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
109}
110
83/* Preempt must be disabled. */ 111/* Preempt must be disabled. */
84static void __queue_work(struct cpu_workqueue_struct *cwq, 112static void __queue_work(struct cpu_workqueue_struct *cwq,
85 struct work_struct *work) 113 struct work_struct *work)
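set_wq_data()/get_wq_data() replace the old work->wq_data pointer by packing the cpu_workqueue pointer and the status bits into the single work->management word: a suitably aligned pointer has free low bits, the flags live there, WORK_STRUCT_WQ_DATA_MASK strips them off again, and cmpxchg() prevents a concurrent flag update from being lost. A userspace sketch of the tagged-pointer half of that (mask width chosen for the demo):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FLAG_MASK	3UL		/* low two bits carry flags */
	#define FLAG_PENDING	1UL

	struct queue { int id; } __attribute__((aligned(4)));

	static unsigned long pack(struct queue *q, unsigned long flags)
	{
		assert(((uintptr_t)q & FLAG_MASK) == 0);  /* alignment frees the bits */
		return (uintptr_t)q | flags;
	}

	static struct queue *unpack(unsigned long word)
	{
		return (struct queue *)(word & ~FLAG_MASK);
	}

	int main(void)
	{
		struct queue q = { 7 };
		unsigned long management = pack(&q, FLAG_PENDING);

		printf("queue %d, pending=%lu\n",
		       unpack(management)->id, management & FLAG_PENDING);
		return 0;
	}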
@@ -87,7 +115,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
87 unsigned long flags; 115 unsigned long flags;
88 116
89 spin_lock_irqsave(&cwq->lock, flags); 117 spin_lock_irqsave(&cwq->lock, flags);
90 work->wq_data = cwq; 118 set_wq_data(work, cwq);
91 list_add_tail(&work->entry, &cwq->worklist); 119 list_add_tail(&work->entry, &cwq->worklist);
92 cwq->insert_sequence++; 120 cwq->insert_sequence++;
93 wake_up(&cwq->more_work); 121 wake_up(&cwq->more_work);
@@ -108,7 +136,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
108{ 136{
109 int ret = 0, cpu = get_cpu(); 137 int ret = 0, cpu = get_cpu();
110 138
111 if (!test_and_set_bit(0, &work->pending)) { 139 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
112 if (unlikely(is_single_threaded(wq))) 140 if (unlikely(is_single_threaded(wq)))
113 cpu = singlethread_cpu; 141 cpu = singlethread_cpu;
114 BUG_ON(!list_empty(&work->entry)); 142 BUG_ON(!list_empty(&work->entry));
@@ -122,38 +150,42 @@ EXPORT_SYMBOL_GPL(queue_work);
122 150
123static void delayed_work_timer_fn(unsigned long __data) 151static void delayed_work_timer_fn(unsigned long __data)
124{ 152{
125 struct work_struct *work = (struct work_struct *)__data; 153 struct delayed_work *dwork = (struct delayed_work *)__data;
126 struct workqueue_struct *wq = work->wq_data; 154 struct workqueue_struct *wq = get_wq_data(&dwork->work);
127 int cpu = smp_processor_id(); 155 int cpu = smp_processor_id();
128 156
129 if (unlikely(is_single_threaded(wq))) 157 if (unlikely(is_single_threaded(wq)))
130 cpu = singlethread_cpu; 158 cpu = singlethread_cpu;
131 159
132 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 160 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
133} 161}
134 162
135/** 163/**
136 * queue_delayed_work - queue work on a workqueue after delay 164 * queue_delayed_work - queue work on a workqueue after delay
137 * @wq: workqueue to use 165 * @wq: workqueue to use
138 * @work: work to queue 166 * @work: delayable work to queue
139 * @delay: number of jiffies to wait before queueing 167 * @delay: number of jiffies to wait before queueing
140 * 168 *
141 * Returns 0 if @work was already on a queue, non-zero otherwise. 169 * Returns 0 if @work was already on a queue, non-zero otherwise.
142 */ 170 */
143int fastcall queue_delayed_work(struct workqueue_struct *wq, 171int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 struct work_struct *work, unsigned long delay) 172 struct delayed_work *dwork, unsigned long delay)
145{ 173{
146 int ret = 0; 174 int ret = 0;
147 struct timer_list *timer = &work->timer; 175 struct timer_list *timer = &dwork->timer;
176 struct work_struct *work = &dwork->work;
177
178 if (delay == 0)
179 return queue_work(wq, work);
148 180
149 if (!test_and_set_bit(0, &work->pending)) { 181 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
150 BUG_ON(timer_pending(timer)); 182 BUG_ON(timer_pending(timer));
151 BUG_ON(!list_empty(&work->entry)); 183 BUG_ON(!list_empty(&work->entry));
152 184
153 /* This stores wq for the moment, for the timer_fn */ 185 /* This stores wq for the moment, for the timer_fn */
154 work->wq_data = wq; 186 set_wq_data(work, wq);
155 timer->expires = jiffies + delay; 187 timer->expires = jiffies + delay;
156 timer->data = (unsigned long)work; 188 timer->data = (unsigned long)dwork;
157 timer->function = delayed_work_timer_fn; 189 timer->function = delayed_work_timer_fn;
158 add_timer(timer); 190 add_timer(timer);
159 ret = 1; 191 ret = 1;
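struct delayed_work is simply the pair that every work_struct used to drag around; the timer's data field now carries the delayed_work pointer so delayed_work_timer_fn() can reach both halves. Roughly, from include/linux/workqueue.h of this release:

	struct delayed_work {
		struct work_struct work;
		struct timer_list timer;
	};

The new delay == 0 shortcut also means a zero-delay submission degenerates straight to queue_work() instead of taking a pointless trip through the timer wheel.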
@@ -172,19 +204,20 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
172 * Returns 0 if @work was already on a queue, non-zero otherwise. 204 * Returns 0 if @work was already on a queue, non-zero otherwise.
173 */ 205 */
174int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 206int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
175 struct work_struct *work, unsigned long delay) 207 struct delayed_work *dwork, unsigned long delay)
176{ 208{
177 int ret = 0; 209 int ret = 0;
178 struct timer_list *timer = &work->timer; 210 struct timer_list *timer = &dwork->timer;
211 struct work_struct *work = &dwork->work;
179 212
180 if (!test_and_set_bit(0, &work->pending)) { 213 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
181 BUG_ON(timer_pending(timer)); 214 BUG_ON(timer_pending(timer));
182 BUG_ON(!list_empty(&work->entry)); 215 BUG_ON(!list_empty(&work->entry));
183 216
184 /* This stores wq for the moment, for the timer_fn */ 217 /* This stores wq for the moment, for the timer_fn */
185 work->wq_data = wq; 218 set_wq_data(work, wq);
186 timer->expires = jiffies + delay; 219 timer->expires = jiffies + delay;
187 timer->data = (unsigned long)work; 220 timer->data = (unsigned long)dwork;
188 timer->function = delayed_work_timer_fn; 221 timer->function = delayed_work_timer_fn;
189 add_timer_on(timer, cpu); 222 add_timer_on(timer, cpu);
190 ret = 1; 223 ret = 1;
@@ -212,15 +245,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
212 while (!list_empty(&cwq->worklist)) { 245 while (!list_empty(&cwq->worklist)) {
213 struct work_struct *work = list_entry(cwq->worklist.next, 246 struct work_struct *work = list_entry(cwq->worklist.next,
214 struct work_struct, entry); 247 struct work_struct, entry);
215 void (*f) (void *) = work->func; 248 work_func_t f = work->func;
216 void *data = work->data;
217 249
218 list_del_init(cwq->worklist.next); 250 list_del_init(cwq->worklist.next);
219 spin_unlock_irqrestore(&cwq->lock, flags); 251 spin_unlock_irqrestore(&cwq->lock, flags);
220 252
221 BUG_ON(work->wq_data != cwq); 253 BUG_ON(get_wq_data(work) != cwq);
222 clear_bit(0, &work->pending); 254 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
223 f(data); 255 work_release(work);
256 f(work);
257
258 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
259 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
260 "%s/0x%08x/%d\n",
261 current->comm, preempt_count(),
262 current->pid);
263 printk(KERN_ERR " last function: ");
264 print_symbol("%s\n", (unsigned long)f);
265 debug_show_held_locks(current);
266 dump_stack();
267 }
224 268
225 spin_lock_irqsave(&cwq->lock, flags); 269 spin_lock_irqsave(&cwq->lock, flags);
226 cwq->remove_sequence++; 270 cwq->remove_sequence++;
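Two behavioural points in the reworked run_workqueue() loop: unless the item was created non-auto-releasing (WORK_STRUCT_NOAUTOREL), work_release() clears the pending bit before f(work) runs, so a handler may requeue, or even free, the object its work_struct is embedded in; and after each handler the loop now flags functions that returned while atomic or still holding locks, naming the culprit via print_symbol(), matching the debug_show_held_locks() calls added to schedule() and __might_sleep() earlier in this commit. A handler freeing its own container, legal under the auto-release rule (my_obj/my_work_fn/process() invented):

	static void my_work_fn(struct work_struct *work)
	{
		struct my_obj *obj = container_of(work, struct my_obj, work);

		process(obj);	/* hypothetical payload handling */
		kfree(obj);	/* the work_struct lives inside obj */
	}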
@@ -237,7 +281,8 @@ static int worker_thread(void *__cwq)
237 struct k_sigaction sa; 281 struct k_sigaction sa;
238 sigset_t blocked; 282 sigset_t blocked;
239 283
240 current->flags |= PF_NOFREEZE; 284 if (!cwq->freezeable)
285 current->flags |= PF_NOFREEZE;
241 286
242 set_user_nice(current, -5); 287 set_user_nice(current, -5);
243 288
@@ -260,6 +305,9 @@ static int worker_thread(void *__cwq)
260 305
261 set_current_state(TASK_INTERRUPTIBLE); 306 set_current_state(TASK_INTERRUPTIBLE);
262 while (!kthread_should_stop()) { 307 while (!kthread_should_stop()) {
308 if (cwq->freezeable)
309 try_to_freeze();
310
263 add_wait_queue(&cwq->more_work, &wait); 311 add_wait_queue(&cwq->more_work, &wait);
264 if (list_empty(&cwq->worklist)) 312 if (list_empty(&cwq->worklist))
265 schedule(); 313 schedule();
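The freezeable flag addresses a suspend problem: worker threads used to mark themselves PF_NOFREEZE unconditionally, so the freezer skipped them and they could keep running, and submitting I/O, while a hibernation image was being written. A workqueue created with freezeable set instead lets its workers park in try_to_freeze() at the top of the loop, using the same <linux/freezer.h> API whose includes were added to rtmutex-tester.c, signal.c and sched.c above. A typical freezable kthread loop looks roughly like this (work_available()/do_work() invented):

	while (!kthread_should_stop()) {
		try_to_freeze();		/* park here during suspend */

		set_current_state(TASK_INTERRUPTIBLE);
		if (!work_available())
			schedule();
		__set_current_state(TASK_RUNNING);

		do_work();
	}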
@@ -336,7 +384,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
336EXPORT_SYMBOL_GPL(flush_workqueue); 384EXPORT_SYMBOL_GPL(flush_workqueue);
337 385
338static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 386static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
339 int cpu) 387 int cpu, int freezeable)
340{ 388{
341 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 389 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
342 struct task_struct *p; 390 struct task_struct *p;
@@ -346,6 +394,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
346 cwq->thread = NULL; 394 cwq->thread = NULL;
347 cwq->insert_sequence = 0; 395 cwq->insert_sequence = 0;
348 cwq->remove_sequence = 0; 396 cwq->remove_sequence = 0;
397 cwq->freezeable = freezeable;
349 INIT_LIST_HEAD(&cwq->worklist); 398 INIT_LIST_HEAD(&cwq->worklist);
350 init_waitqueue_head(&cwq->more_work); 399 init_waitqueue_head(&cwq->more_work);
351 init_waitqueue_head(&cwq->work_done); 400 init_waitqueue_head(&cwq->work_done);
@@ -361,7 +410,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
361} 410}
362 411
363struct workqueue_struct *__create_workqueue(const char *name, 412struct workqueue_struct *__create_workqueue(const char *name,
364 int singlethread) 413 int singlethread, int freezeable)
365{ 414{
366 int cpu, destroy = 0; 415 int cpu, destroy = 0;
367 struct workqueue_struct *wq; 416 struct workqueue_struct *wq;
@@ -381,7 +430,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
381 mutex_lock(&workqueue_mutex); 430 mutex_lock(&workqueue_mutex);
382 if (singlethread) { 431 if (singlethread) {
383 INIT_LIST_HEAD(&wq->list); 432 INIT_LIST_HEAD(&wq->list);
384 p = create_workqueue_thread(wq, singlethread_cpu); 433 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
385 if (!p) 434 if (!p)
386 destroy = 1; 435 destroy = 1;
387 else 436 else
@@ -389,7 +438,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
389 } else { 438 } else {
390 list_add(&wq->list, &workqueues); 439 list_add(&wq->list, &workqueues);
391 for_each_online_cpu(cpu) { 440 for_each_online_cpu(cpu) {
392 p = create_workqueue_thread(wq, cpu); 441 p = create_workqueue_thread(wq, cpu, freezeable);
393 if (p) { 442 if (p) {
394 kthread_bind(p, cpu); 443 kthread_bind(p, cpu);
395 wake_up_process(p); 444 wake_up_process(p);
@@ -468,38 +517,37 @@ EXPORT_SYMBOL(schedule_work);
468 517
469/** 518/**
470 * schedule_delayed_work - put work task in global workqueue after delay 519 * schedule_delayed_work - put work task in global workqueue after delay
471 * @work: job to be done 520 * @dwork: job to be done
472 * @delay: number of jiffies to wait 521 * @delay: number of jiffies to wait or 0 for immediate execution
473 * 522 *
474 * After waiting for a given time this puts a job in the kernel-global 523 * After waiting for a given time this puts a job in the kernel-global
475 * workqueue. 524 * workqueue.
476 */ 525 */
477int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 526int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
478{ 527{
479 return queue_delayed_work(keventd_wq, work, delay); 528 return queue_delayed_work(keventd_wq, dwork, delay);
480} 529}
481EXPORT_SYMBOL(schedule_delayed_work); 530EXPORT_SYMBOL(schedule_delayed_work);
482 531
483/** 532/**
484 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 533 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
485 * @cpu: cpu to use 534 * @cpu: cpu to use
486 * @work: job to be done 535 * @dwork: job to be done
487 * @delay: number of jiffies to wait 536 * @delay: number of jiffies to wait
488 * 537 *
489 * After waiting for a given time this puts a job in the kernel-global 538 * After waiting for a given time this puts a job in the kernel-global
490 * workqueue on the specified CPU. 539 * workqueue on the specified CPU.
491 */ 540 */
492int schedule_delayed_work_on(int cpu, 541int schedule_delayed_work_on(int cpu,
493 struct work_struct *work, unsigned long delay) 542 struct delayed_work *dwork, unsigned long delay)
494{ 543{
495 return queue_delayed_work_on(cpu, keventd_wq, work, delay); 544 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
496} 545}
497EXPORT_SYMBOL(schedule_delayed_work_on); 546EXPORT_SYMBOL(schedule_delayed_work_on);
498 547
499/** 548/**
500 * schedule_on_each_cpu - call a function on each online CPU from keventd 549 * schedule_on_each_cpu - call a function on each online CPU from keventd
501 * @func: the function to call 550 * @func: the function to call
502 * @info: a pointer to pass to func()
503 * 551 *
504 * Returns zero on success. 552 * Returns zero on success.
505 * Returns -ve errno on failure. 553 * Returns -ve errno on failure.
@@ -508,7 +556,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
508 * 556 *
509 * schedule_on_each_cpu() is very slow. 557 * schedule_on_each_cpu() is very slow.
510 */ 558 */
511int schedule_on_each_cpu(void (*func)(void *info), void *info) 559int schedule_on_each_cpu(work_func_t func)
512{ 560{
513 int cpu; 561 int cpu;
514 struct work_struct *works; 562 struct work_struct *works;
@@ -519,7 +567,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
519 567
520 mutex_lock(&workqueue_mutex); 568 mutex_lock(&workqueue_mutex);
521 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
522 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 570 INIT_WORK(per_cpu_ptr(works, cpu), func);
523 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 571 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
524 per_cpu_ptr(works, cpu)); 572 per_cpu_ptr(works, cpu));
525 } 573 }
@@ -539,12 +587,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
539 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 587 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
540 * work whose handler rearms the delayed work. 588 * work whose handler rearms the delayed work.
541 * @wq: the controlling workqueue structure 589 * @wq: the controlling workqueue structure
542 * @work: the delayed work struct 590 * @dwork: the delayed work struct
543 */ 591 */
544void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, 592void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
545 struct work_struct *work) 593 struct delayed_work *dwork)
546{ 594{
547 while (!cancel_delayed_work(work)) 595 while (!cancel_delayed_work(dwork))
548 flush_workqueue(wq); 596 flush_workqueue(wq);
549} 597}
550EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 598EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
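The cancel-then-flush loop exists because cancelling self-rearming work races with its own handler: cancel_delayed_work() only deletes a pending timer, and a handler running at that moment may rearm right after the cancel. flush_workqueue() waits for the in-flight handler to finish (and rearm), and the retry then deletes the freshly armed timer. A self-rearming handler under the converted API might look like this (my_poll/my_poll_fn invented):

	static struct delayed_work my_poll;

	static void my_poll_fn(struct work_struct *work)
	{
		struct delayed_work *dwork =
			container_of(work, struct delayed_work, work);

		/* ... periodic work ... */
		schedule_delayed_work(dwork, HZ);	/* rearm, one second out */
	}

	/* teardown: guaranteed not to leave a timer behind */
	cancel_rearming_delayed_work(&my_poll);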
@@ -552,18 +600,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
552/** 600/**
553 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 601 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
554 * work whose handler rearms the delayed work. 602 * work whose handler rearms the delayed work.
555 * @work: the delayed work struct 603 * @dwork: the delayed work struct
556 */ 604 */
557void cancel_rearming_delayed_work(struct work_struct *work) 605void cancel_rearming_delayed_work(struct delayed_work *dwork)
558{ 606{
559 cancel_rearming_delayed_workqueue(keventd_wq, work); 607 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
560} 608}
561EXPORT_SYMBOL(cancel_rearming_delayed_work); 609EXPORT_SYMBOL(cancel_rearming_delayed_work);
562 610
563/** 611/**
564 * execute_in_process_context - reliably execute the routine with user context 612 * execute_in_process_context - reliably execute the routine with user context
565 * @fn: the function to execute 613 * @fn: the function to execute
566 * @data: data to pass to the function
567 * @ew: guaranteed storage for the execute work structure (must 614 * @ew: guaranteed storage for the execute work structure (must
568 * be available when the work executes) 615 * be available when the work executes)
569 * 616 *
@@ -573,15 +620,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
573 * Returns: 0 - function was executed 620 * Returns: 0 - function was executed
574 * 1 - function was scheduled for execution 621 * 1 - function was scheduled for execution
575 */ 622 */
576int execute_in_process_context(void (*fn)(void *data), void *data, 623int execute_in_process_context(work_func_t fn, struct execute_work *ew)
577 struct execute_work *ew)
578{ 624{
579 if (!in_interrupt()) { 625 if (!in_interrupt()) {
580 fn(data); 626 fn(&ew->work);
581 return 0; 627 return 0;
582 } 628 }
583 629
584 INIT_WORK(&ew->work, fn, data); 630 INIT_WORK(&ew->work, fn);
585 schedule_work(&ew->work); 631 schedule_work(&ew->work);
586 632
587 return 1; 633 return 1;
@@ -609,7 +655,6 @@ int current_is_keventd(void)
609 655
610} 656}
611 657
612#ifdef CONFIG_HOTPLUG_CPU
613/* Take the work from this (downed) CPU. */ 658/* Take the work from this (downed) CPU. */
614static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 659static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
615{ 660{
@@ -642,7 +687,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
642 mutex_lock(&workqueue_mutex); 687 mutex_lock(&workqueue_mutex);
643 /* Create a new workqueue thread for it. */ 688 /* Create a new workqueue thread for it. */
644 list_for_each_entry(wq, &workqueues, list) { 689 list_for_each_entry(wq, &workqueues, list) {
645 if (!create_workqueue_thread(wq, hotcpu)) { 690 if (!create_workqueue_thread(wq, hotcpu, 0)) {
646 printk("workqueue for %i failed\n", hotcpu); 691 printk("workqueue for %i failed\n", hotcpu);
647 return NOTIFY_BAD; 692 return NOTIFY_BAD;
648 } 693 }
@@ -692,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
692 737
693 return NOTIFY_OK; 738 return NOTIFY_OK;
694} 739}
695#endif
696 740
697void init_workqueues(void) 741void init_workqueues(void)
698{ 742{