author     Jeff Garzik <jeff@garzik.org>   2006-03-23 17:13:43 -0500
committer  Jeff Garzik <jeff@garzik.org>   2006-03-23 17:13:43 -0500
commit     88e3c1da8b3258a81c5c81d4e7e22557b7d71ba7 (patch)
tree       ab518773c0ff4606f1a57d00b5931332a7e1d96e /kernel
parent     fa4fa40a990f8f4eff65476bef32007c154bbac0 (diff)
parent     b0e6e962992b76580f4900b166a337bad7c1e81b (diff)
Merge branch 'master'
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c         |   2
-rw-r--r--  kernel/cpuset.c          | 212
-rw-r--r--  kernel/exit.c            |   4
-rw-r--r--  kernel/fork.c            |   8
-rw-r--r--  kernel/kprobes.c         |  14
-rw-r--r--  kernel/kthread.c         |   7
-rw-r--r--  kernel/module.c          |  53
-rw-r--r--  kernel/panic.c           |  97
-rw-r--r--  kernel/posix-timers.c    |   1
-rw-r--r--  kernel/power/Makefile    |   2
-rw-r--r--  kernel/power/disk.c      |  20
-rw-r--r--  kernel/power/main.c      |   2
-rw-r--r--  kernel/power/pm.c        |  21
-rw-r--r--  kernel/power/power.h     |  75
-rw-r--r--  kernel/power/process.c   |  61
-rw-r--r--  kernel/power/snapshot.c  | 335
-rw-r--r--  kernel/power/swap.c      | 544
-rw-r--r--  kernel/power/swsusp.c    | 887
-rw-r--r--  kernel/power/user.c      | 333
-rw-r--r--  kernel/profile.c         |  11
-rw-r--r--  kernel/rcupdate.c        |  14
-rw-r--r--  kernel/sched.c           |  13
-rw-r--r--  kernel/signal.c          |  11
-rw-r--r--  kernel/spinlock.c        |   9
-rw-r--r--  kernel/sys.c             |  46
25 files changed, 1706 insertions, 1076 deletions
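
Most of the kernel/ changes this merge pulls in follow one mechanical pattern: binary semaphores used as locks are converted to the then-new mutex API, so DECLARE_MUTEX/down/up become DEFINE_MUTEX/mutex_lock/mutex_unlock plus a <linux/mutex.h> include. A minimal sketch of the before/after shape, with foo_mutex and foo_critical_section as purely illustrative names not taken from the patch:

#include <linux/mutex.h>

/* old form was: static DECLARE_MUTEX(foo_sem);  -- a semaphore initialized to 1 */
static DEFINE_MUTEX(foo_mutex);

static void foo_critical_section(void)
{
        mutex_lock(&foo_mutex);         /* was: down(&foo_sem); */
        /* ... code that must run under the lock ... */
        mutex_unlock(&foo_mutex);       /* was: up(&foo_sem);   */
}

The mutex variant blocks the same way but enforces owner-must-unlock semantics and hooks into the mutex debugging infrastructure introduced alongside the API.
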
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7e7e637b92a..c4394abcd5e6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -958,7 +958,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
958 * 958 *
959 * i386 no 959 * i386 no
960 * x86_64 no 960 * x86_64 no
961 * ppc64 yes (see arch/ppc64/kernel/misc.S) 961 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
962 * 962 *
963 * This also happens with vm86 emulation in a non-nested manner 963 * This also happens with vm86 emulation in a non-nested manner
964 * (entries without exits), so this case must be caught. 964 * (entries without exits), so this case must be caught.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a05..c86ee051b734 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -53,7 +53,7 @@
53 53
54#include <asm/uaccess.h> 54#include <asm/uaccess.h>
55#include <asm/atomic.h> 55#include <asm/atomic.h>
56#include <asm/semaphore.h> 56#include <linux/mutex.h>
57 57
58#define CPUSET_SUPER_MAGIC 0x27e0eb 58#define CPUSET_SUPER_MAGIC 0x27e0eb
59 59
@@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount;
168static struct super_block *cpuset_sb; 168static struct super_block *cpuset_sb;
169 169
170/* 170/*
171 * We have two global cpuset semaphores below. They can nest. 171 * We have two global cpuset mutexes below. They can nest.
172 * It is ok to first take manage_sem, then nest callback_sem. We also 172 * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 * require taking task_lock() when dereferencing a tasks cpuset pointer. 173 * require taking task_lock() when dereferencing a tasks cpuset pointer.
174 * See "The task_lock() exception", at the end of this comment. 174 * See "The task_lock() exception", at the end of this comment.
175 * 175 *
176 * A task must hold both semaphores to modify cpusets. If a task 176 * A task must hold both mutexes to modify cpusets. If a task
177 * holds manage_sem, then it blocks others wanting that semaphore, 177 * holds manage_mutex, then it blocks others wanting that mutex,
178 * ensuring that it is the only task able to also acquire callback_sem 178 * ensuring that it is the only task able to also acquire callback_mutex
179 * and be able to modify cpusets. It can perform various checks on 179 * and be able to modify cpusets. It can perform various checks on
180 * the cpuset structure first, knowing nothing will change. It can 180 * the cpuset structure first, knowing nothing will change. It can
181 * also allocate memory while just holding manage_sem. While it is 181 * also allocate memory while just holding manage_mutex. While it is
182 * performing these checks, various callback routines can briefly 182 * performing these checks, various callback routines can briefly
183 * acquire callback_sem to query cpusets. Once it is ready to make 183 * acquire callback_mutex to query cpusets. Once it is ready to make
184 * the changes, it takes callback_sem, blocking everyone else. 184 * the changes, it takes callback_mutex, blocking everyone else.
185 * 185 *
186 * Calls to the kernel memory allocator can not be made while holding 186 * Calls to the kernel memory allocator can not be made while holding
187 * callback_sem, as that would risk double tripping on callback_sem 187 * callback_mutex, as that would risk double tripping on callback_mutex
188 * from one of the callbacks into the cpuset code from within 188 * from one of the callbacks into the cpuset code from within
189 * __alloc_pages(). 189 * __alloc_pages().
190 * 190 *
191 * If a task is only holding callback_sem, then it has read-only 191 * If a task is only holding callback_mutex, then it has read-only
192 * access to cpusets. 192 * access to cpusets.
193 * 193 *
194 * The task_struct fields mems_allowed and mems_generation may only 194 * The task_struct fields mems_allowed and mems_generation may only
195 * be accessed in the context of that task, so require no locks. 195 * be accessed in the context of that task, so require no locks.
196 * 196 *
197 * Any task can increment and decrement the count field without lock. 197 * Any task can increment and decrement the count field without lock.
198 * So in general, code holding manage_sem or callback_sem can't rely 198 * So in general, code holding manage_mutex or callback_mutex can't rely
199 * on the count field not changing. However, if the count goes to 199 * on the count field not changing. However, if the count goes to
200 * zero, then only attach_task(), which holds both semaphores, can 200 * zero, then only attach_task(), which holds both mutexes, can
201 * increment it again. Because a count of zero means that no tasks 201 * increment it again. Because a count of zero means that no tasks
202 * are currently attached, therefore there is no way a task attached 202 * are currently attached, therefore there is no way a task attached
203 * to that cpuset can fork (the other way to increment the count). 203 * to that cpuset can fork (the other way to increment the count).
204 * So code holding manage_sem or callback_sem can safely assume that 204 * So code holding manage_mutex or callback_mutex can safely assume that
205 * if the count is zero, it will stay zero. Similarly, if a task 205 * if the count is zero, it will stay zero. Similarly, if a task
206 * holds manage_sem or callback_sem on a cpuset with zero count, it 206 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs 207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208 * both of those semaphores. 208 * both of those mutexes.
209 *
210 * A possible optimization to improve parallelism would be to make
211 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212 * to proceed in parallel, with read access, until the holder of
213 * manage_sem needed to take this rwsem for exclusive write access
214 * and modify some cpusets.
215 * 209 *
216 * The cpuset_common_file_write handler for operations that modify 210 * The cpuset_common_file_write handler for operations that modify
217 * the cpuset hierarchy holds manage_sem across the entire operation, 211 * the cpuset hierarchy holds manage_mutex across the entire operation,
218 * single threading all such cpuset modifications across the system. 212 * single threading all such cpuset modifications across the system.
219 * 213 *
220 * The cpuset_common_file_read() handlers only hold callback_sem across 214 * The cpuset_common_file_read() handlers only hold callback_mutex across
221 * small pieces of code, such as when reading out possibly multi-word 215 * small pieces of code, such as when reading out possibly multi-word
222 * cpumasks and nodemasks. 216 * cpumasks and nodemasks.
223 * 217 *
224 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 218 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225 * (usually) take either semaphore. These are the two most performance 219 * (usually) take either mutex. These are the two most performance
226 * critical pieces of code here. The exception occurs on cpuset_exit(), 220 * critical pieces of code here. The exception occurs on cpuset_exit(),
227 * when a task in a notify_on_release cpuset exits. Then manage_sem 221 * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 * is taken, and if the cpuset count is zero, a usermode call made 222 * is taken, and if the cpuset count is zero, a usermode call made
229 * to /sbin/cpuset_release_agent with the name of the cpuset (path 223 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 * relative to the root of cpuset file system) as the argument. 224 * relative to the root of cpuset file system) as the argument.
@@ -242,9 +236,9 @@ static struct super_block *cpuset_sb;
242 * 236 *
243 * The need for this exception arises from the action of attach_task(), 237 * The need for this exception arises from the action of attach_task(),
244 * which overwrites one tasks cpuset pointer with another. It does 238 * which overwrites one tasks cpuset pointer with another. It does
245 * so using both semaphores, however there are several performance 239 * so using both mutexes, however there are several performance
246 * critical places that need to reference task->cpuset without the 240 * critical places that need to reference task->cpuset without the
247 * expense of grabbing a system global semaphore. Therefore except as 241 * expense of grabbing a system global mutex. Therefore except as
248 * noted below, when dereferencing or, as in attach_task(), modifying 242 * noted below, when dereferencing or, as in attach_task(), modifying
249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 243 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
250 * (task->alloc_lock) already in the task_struct routinely used for 244 * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +250,8 @@ static struct super_block *cpuset_sb;
256 * the routine cpuset_update_task_memory_state(). 250 * the routine cpuset_update_task_memory_state().
257 */ 251 */
258 252
259static DECLARE_MUTEX(manage_sem); 253static DEFINE_MUTEX(manage_mutex);
260static DECLARE_MUTEX(callback_sem); 254static DEFINE_MUTEX(callback_mutex);
261 255
262/* 256/*
263 * A couple of forward declarations required, due to cyclic reference loop: 257 * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
432} 426}
433 427
434/* 428/*
435 * Call with manage_sem held. Writes path of cpuset into buf. 429 * Call with manage_mutex held. Writes path of cpuset into buf.
436 * Returns 0 on success, -errno on error. 430 * Returns 0 on success, -errno on error.
437 */ 431 */
438 432
@@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
484 * status of the /sbin/cpuset_release_agent task, so no sense holding 478 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 * our caller up for that. 479 * our caller up for that.
486 * 480 *
487 * When we had only one cpuset semaphore, we had to call this 481 * When we had only one cpuset mutex, we had to call this
488 * without holding it, to avoid deadlock when call_usermodehelper() 482 * without holding it, to avoid deadlock when call_usermodehelper()
489 * allocated memory. With two locks, we could now call this while 483 * allocated memory. With two locks, we could now call this while
490 * holding manage_sem, but we still don't, so as to minimize 484 * holding manage_mutex, but we still don't, so as to minimize
491 * the time manage_sem is held. 485 * the time manage_mutex is held.
492 */ 486 */
493 487
494static void cpuset_release_agent(const char *pathbuf) 488static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf)
520 * cs is notify_on_release() and now both the user count is zero and 514 * cs is notify_on_release() and now both the user count is zero and
521 * the list of children is empty, prepare cpuset path in a kmalloc'd 515 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 * buffer, to be returned via ppathbuf, so that the caller can invoke 516 * buffer, to be returned via ppathbuf, so that the caller can invoke
523 * cpuset_release_agent() with it later on, once manage_sem is dropped. 517 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
524 * Call here with manage_sem held. 518 * Call here with manage_mutex held.
525 * 519 *
526 * This check_for_release() routine is responsible for kmalloc'ing 520 * This check_for_release() routine is responsible for kmalloc'ing
527 * pathbuf. The above cpuset_release_agent() is responsible for 521 * pathbuf. The above cpuset_release_agent() is responsible for
528 * kfree'ing pathbuf. The caller of these routines is responsible 522 * kfree'ing pathbuf. The caller of these routines is responsible
529 * for providing a pathbuf pointer, initialized to NULL, then 523 * for providing a pathbuf pointer, initialized to NULL, then
530 * calling check_for_release() with manage_sem held and the address 524 * calling check_for_release() with manage_mutex held and the address
531 * of the pathbuf pointer, then dropping manage_sem, then calling 525 * of the pathbuf pointer, then dropping manage_mutex, then calling
532 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 526 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 */ 527 */
534 528
@@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
559 * One way or another, we guarantee to return some non-empty subset 553 * One way or another, we guarantee to return some non-empty subset
560 * of cpu_online_map. 554 * of cpu_online_map.
561 * 555 *
562 * Call with callback_sem held. 556 * Call with callback_mutex held.
563 */ 557 */
564 558
565static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 559static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
583 * One way or another, we guarantee to return some non-empty subset 577 * One way or another, we guarantee to return some non-empty subset
584 * of node_online_map. 578 * of node_online_map.
585 * 579 *
586 * Call with callback_sem held. 580 * Call with callback_mutex held.
587 */ 581 */
588 582
589static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 583static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
608 * current->cpuset if a task has its memory placement changed. 602 * current->cpuset if a task has its memory placement changed.
609 * Do not call this routine if in_interrupt(). 603 * Do not call this routine if in_interrupt().
610 * 604 *
611 * Call without callback_sem or task_lock() held. May be called 605 * Call without callback_mutex or task_lock() held. May be called
612 * with or without manage_sem held. Doesn't need task_lock to guard 606 * with or without manage_mutex held. Doesn't need task_lock to guard
613 * against another task changing a non-NULL cpuset pointer to NULL, 607 * against another task changing a non-NULL cpuset pointer to NULL,
614 * as that is only done by a task on itself, and if the current task 608 * as that is only done by a task on itself, and if the current task
615 * is here, it is not simultaneously in the exit code NULL'ing its 609 * is here, it is not simultaneously in the exit code NULL'ing its
616 * cpuset pointer. This routine also might acquire callback_sem and 610 * cpuset pointer. This routine also might acquire callback_mutex and
617 * current->mm->mmap_sem during call. 611 * current->mm->mmap_sem during call.
618 * 612 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock 613 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void)
658 } 652 }
659 653
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 654 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661 down(&callback_sem); 655 mutex_lock(&callback_mutex);
662 task_lock(tsk); 656 task_lock(tsk);
663 cs = tsk->cpuset; /* Maybe changed when task not locked */ 657 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 guarantee_online_mems(cs, &tsk->mems_allowed); 658 guarantee_online_mems(cs, &tsk->mems_allowed);
665 tsk->cpuset_mems_generation = cs->mems_generation; 659 tsk->cpuset_mems_generation = cs->mems_generation;
666 task_unlock(tsk); 660 task_unlock(tsk);
667 up(&callback_sem); 661 mutex_unlock(&callback_mutex);
668 mpol_rebind_task(tsk, &tsk->mems_allowed); 662 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 } 663 }
670} 664}
@@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void)
674 * 668 *
675 * One cpuset is a subset of another if all its allowed CPUs and 669 * One cpuset is a subset of another if all its allowed CPUs and
676 * Memory Nodes are a subset of the other, and its exclusive flags 670 * Memory Nodes are a subset of the other, and its exclusive flags
677 * are only set if the other's are set. Call holding manage_sem. 671 * are only set if the other's are set. Call holding manage_mutex.
678 */ 672 */
679 673
680static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 674static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
692 * If we replaced the flag and mask values of the current cpuset 686 * If we replaced the flag and mask values of the current cpuset
693 * (cur) with those values in the trial cpuset (trial), would 687 * (cur) with those values in the trial cpuset (trial), would
694 * our various subset and exclusive rules still be valid? Presumes 688 * our various subset and exclusive rules still be valid? Presumes
695 * manage_sem held. 689 * manage_mutex held.
696 * 690 *
697 * 'cur' is the address of an actual, in-use cpuset. Operations 691 * 'cur' is the address of an actual, in-use cpuset. Operations
698 * such as list traversal that depend on the actual address of the 692 * such as list traversal that depend on the actual address of the
@@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
746 * exclusive child cpusets 740 * exclusive child cpusets
747 * Build these two partitions by calling partition_sched_domains 741 * Build these two partitions by calling partition_sched_domains
748 * 742 *
749 * Call with manage_sem held. May nest a call to the 743 * Call with manage_mutex held. May nest a call to the
750 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 744 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 */ 745 */
752 746
@@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur)
792} 786}
793 787
794/* 788/*
795 * Call with manage_sem held. May take callback_sem during call. 789 * Call with manage_mutex held. May take callback_mutex during call.
796 */ 790 */
797 791
798static int update_cpumask(struct cpuset *cs, char *buf) 792static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (retval < 0) 805 if (retval < 0)
812 return retval; 806 return retval;
813 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 807 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814 down(&callback_sem); 808 mutex_lock(&callback_mutex);
815 cs->cpus_allowed = trialcs.cpus_allowed; 809 cs->cpus_allowed = trialcs.cpus_allowed;
816 up(&callback_sem); 810 mutex_unlock(&callback_mutex);
817 if (is_cpu_exclusive(cs) && !cpus_unchanged) 811 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 update_cpu_domains(cs); 812 update_cpu_domains(cs);
819 return 0; 813 return 0;
@@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
827 * the cpuset is marked 'memory_migrate', migrate the tasks 821 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory. 822 * pages to the new memory.
829 * 823 *
830 * Call with manage_sem held. May take callback_sem during call. 824 * Call with manage_mutex held. May take callback_mutex during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 825 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 826 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed. 827 * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
862 if (retval < 0) 856 if (retval < 0)
863 goto done; 857 goto done;
864 858
865 down(&callback_sem); 859 mutex_lock(&callback_mutex);
866 cs->mems_allowed = trialcs.mems_allowed; 860 cs->mems_allowed = trialcs.mems_allowed;
867 atomic_inc(&cpuset_mems_generation); 861 atomic_inc(&cpuset_mems_generation);
868 cs->mems_generation = atomic_read(&cpuset_mems_generation); 862 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869 up(&callback_sem); 863 mutex_unlock(&callback_mutex);
870 864
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 865 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 866
@@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 * tasklist_lock. Forks can happen again now - the mpol_copy() 916 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind 917 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global 918 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will 919 * cpuset manage_mutex, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound. 920 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 921 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes. 922 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -948,7 +942,7 @@ done:
948} 942}
949 943
950/* 944/*
951 * Call with manage_sem held. 945 * Call with manage_mutex held.
952 */ 946 */
953 947
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 948static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
967 * cs: the cpuset to update 961 * cs: the cpuset to update
968 * buf: the buffer where we read the 0 or 1 962 * buf: the buffer where we read the 0 or 1
969 * 963 *
970 * Call with manage_sem held. 964 * Call with manage_mutex held.
971 */ 965 */
972 966
973static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 967static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
989 return err; 983 return err;
990 cpu_exclusive_changed = 984 cpu_exclusive_changed =
991 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 985 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992 down(&callback_sem); 986 mutex_lock(&callback_mutex);
993 if (turning_on) 987 if (turning_on)
994 set_bit(bit, &cs->flags); 988 set_bit(bit, &cs->flags);
995 else 989 else
996 clear_bit(bit, &cs->flags); 990 clear_bit(bit, &cs->flags);
997 up(&callback_sem); 991 mutex_unlock(&callback_mutex);
998 992
999 if (cpu_exclusive_changed) 993 if (cpu_exclusive_changed)
1000 update_cpu_domains(cs); 994 update_cpu_domains(cs);
@@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1098 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 * notified on release. 1099 * notified on release.
1106 * 1100 *
1107 * Call holding manage_sem. May take callback_sem and task_lock of 1101 * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 * the task 'pid' during call. 1102 * the task 'pid' during call.
1109 */ 1103 */
1110 1104
@@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1144 get_task_struct(tsk); 1138 get_task_struct(tsk);
1145 } 1139 }
1146 1140
1147 down(&callback_sem); 1141 mutex_lock(&callback_mutex);
1148 1142
1149 task_lock(tsk); 1143 task_lock(tsk);
1150 oldcs = tsk->cpuset; 1144 oldcs = tsk->cpuset;
1151 if (!oldcs) { 1145 if (!oldcs) {
1152 task_unlock(tsk); 1146 task_unlock(tsk);
1153 up(&callback_sem); 1147 mutex_unlock(&callback_mutex);
1154 put_task_struct(tsk); 1148 put_task_struct(tsk);
1155 return -ESRCH; 1149 return -ESRCH;
1156 } 1150 }
@@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1164 from = oldcs->mems_allowed; 1158 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed; 1159 to = cs->mems_allowed;
1166 1160
1167 up(&callback_sem); 1161 mutex_unlock(&callback_mutex);
1168 1162
1169 mm = get_task_mm(tsk); 1163 mm = get_task_mm(tsk);
1170 if (mm) { 1164 if (mm) {
@@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1221 } 1215 }
1222 buffer[nbytes] = 0; /* nul-terminate */ 1216 buffer[nbytes] = 0; /* nul-terminate */
1223 1217
1224 down(&manage_sem); 1218 mutex_lock(&manage_mutex);
1225 1219
1226 if (is_removed(cs)) { 1220 if (is_removed(cs)) {
1227 retval = -ENODEV; 1221 retval = -ENODEV;
@@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1264 if (retval == 0) 1258 if (retval == 0)
1265 retval = nbytes; 1259 retval = nbytes;
1266out2: 1260out2:
1267 up(&manage_sem); 1261 mutex_unlock(&manage_mutex);
1268 cpuset_release_agent(pathbuf); 1262 cpuset_release_agent(pathbuf);
1269out1: 1263out1:
1270 kfree(buffer); 1264 kfree(buffer);
@@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1304{ 1298{
1305 cpumask_t mask; 1299 cpumask_t mask;
1306 1300
1307 down(&callback_sem); 1301 mutex_lock(&callback_mutex);
1308 mask = cs->cpus_allowed; 1302 mask = cs->cpus_allowed;
1309 up(&callback_sem); 1303 mutex_unlock(&callback_mutex);
1310 1304
1311 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1305 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312} 1306}
@@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1315{ 1309{
1316 nodemask_t mask; 1310 nodemask_t mask;
1317 1311
1318 down(&callback_sem); 1312 mutex_lock(&callback_mutex);
1319 mask = cs->mems_allowed; 1313 mask = cs->mems_allowed;
1320 up(&callback_sem); 1314 mutex_unlock(&callback_mutex);
1321 1315
1322 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1316 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323} 1317}
@@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1598 * Handle an open on 'tasks' file. Prepare a buffer listing the 1592 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 * process id's of tasks currently attached to the cpuset being opened. 1593 * process id's of tasks currently attached to the cpuset being opened.
1600 * 1594 *
1601 * Does not require any specific cpuset semaphores, and does not take any. 1595 * Does not require any specific cpuset mutexes, and does not take any.
1602 */ 1596 */
1603static int cpuset_tasks_open(struct inode *unused, struct file *file) 1597static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604{ 1598{
@@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1754 * name: name of the new cpuset. Will be strcpy'ed. 1748 * name: name of the new cpuset. Will be strcpy'ed.
1755 * mode: mode to set on new inode 1749 * mode: mode to set on new inode
1756 * 1750 *
1757 * Must be called with the semaphore on the parent inode held 1751 * Must be called with the mutex on the parent inode held
1758 */ 1752 */
1759 1753
1760static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1754static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1766 if (!cs) 1760 if (!cs)
1767 return -ENOMEM; 1761 return -ENOMEM;
1768 1762
1769 down(&manage_sem); 1763 mutex_lock(&manage_mutex);
1770 cpuset_update_task_memory_state(); 1764 cpuset_update_task_memory_state();
1771 cs->flags = 0; 1765 cs->flags = 0;
1772 if (notify_on_release(parent)) 1766 if (notify_on_release(parent))
@@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1782 1776
1783 cs->parent = parent; 1777 cs->parent = parent;
1784 1778
1785 down(&callback_sem); 1779 mutex_lock(&callback_mutex);
1786 list_add(&cs->sibling, &cs->parent->children); 1780 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++; 1781 number_of_cpusets++;
1788 up(&callback_sem); 1782 mutex_unlock(&callback_mutex);
1789 1783
1790 err = cpuset_create_dir(cs, name, mode); 1784 err = cpuset_create_dir(cs, name, mode);
1791 if (err < 0) 1785 if (err < 0)
1792 goto err; 1786 goto err;
1793 1787
1794 /* 1788 /*
1795 * Release manage_sem before cpuset_populate_dir() because it 1789 * Release manage_mutex before cpuset_populate_dir() because it
1796 * will down() this new directory's i_mutex and if we race with 1790 * will down() this new directory's i_mutex and if we race with
1797 * another mkdir, we might deadlock. 1791 * another mkdir, we might deadlock.
1798 */ 1792 */
1799 up(&manage_sem); 1793 mutex_unlock(&manage_mutex);
1800 1794
1801 err = cpuset_populate_dir(cs->dentry); 1795 err = cpuset_populate_dir(cs->dentry);
1802 /* If err < 0, we have a half-filled directory - oh well ;) */ 1796 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 return 0; 1797 return 0;
1804err: 1798err:
1805 list_del(&cs->sibling); 1799 list_del(&cs->sibling);
1806 up(&manage_sem); 1800 mutex_unlock(&manage_mutex);
1807 kfree(cs); 1801 kfree(cs);
1808 return err; 1802 return err;
1809} 1803}
@@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1825 1819
1826 /* the vfs holds both inode->i_mutex already */ 1820 /* the vfs holds both inode->i_mutex already */
1827 1821
1828 down(&manage_sem); 1822 mutex_lock(&manage_mutex);
1829 cpuset_update_task_memory_state(); 1823 cpuset_update_task_memory_state();
1830 if (atomic_read(&cs->count) > 0) { 1824 if (atomic_read(&cs->count) > 0) {
1831 up(&manage_sem); 1825 mutex_unlock(&manage_mutex);
1832 return -EBUSY; 1826 return -EBUSY;
1833 } 1827 }
1834 if (!list_empty(&cs->children)) { 1828 if (!list_empty(&cs->children)) {
1835 up(&manage_sem); 1829 mutex_unlock(&manage_mutex);
1836 return -EBUSY; 1830 return -EBUSY;
1837 } 1831 }
1838 parent = cs->parent; 1832 parent = cs->parent;
1839 down(&callback_sem); 1833 mutex_lock(&callback_mutex);
1840 set_bit(CS_REMOVED, &cs->flags); 1834 set_bit(CS_REMOVED, &cs->flags);
1841 if (is_cpu_exclusive(cs)) 1835 if (is_cpu_exclusive(cs))
1842 update_cpu_domains(cs); 1836 update_cpu_domains(cs);
@@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1848 cpuset_d_remove_dir(d); 1842 cpuset_d_remove_dir(d);
1849 dput(d); 1843 dput(d);
1850 number_of_cpusets--; 1844 number_of_cpusets--;
1851 up(&callback_sem); 1845 mutex_unlock(&callback_mutex);
1852 if (list_empty(&parent->children)) 1846 if (list_empty(&parent->children))
1853 check_for_release(parent, &pathbuf); 1847 check_for_release(parent, &pathbuf);
1854 up(&manage_sem); 1848 mutex_unlock(&manage_mutex);
1855 cpuset_release_agent(pathbuf); 1849 cpuset_release_agent(pathbuf);
1856 return 0; 1850 return 0;
1857} 1851}
@@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child)
1960 * Description: Detach cpuset from @tsk and release it. 1954 * Description: Detach cpuset from @tsk and release it.
1961 * 1955 *
1962 * Note that cpusets marked notify_on_release force every task in 1956 * Note that cpusets marked notify_on_release force every task in
1963 * them to take the global manage_sem semaphore when exiting. 1957 * them to take the global manage_mutex mutex when exiting.
1964 * This could impact scaling on very large systems. Be reluctant to 1958 * This could impact scaling on very large systems. Be reluctant to
1965 * use notify_on_release cpusets where very high task exit scaling 1959 * use notify_on_release cpusets where very high task exit scaling
1966 * is required on large systems. 1960 * is required on large systems.
1967 * 1961 *
1968 * Don't even think about derefencing 'cs' after the cpuset use count 1962 * Don't even think about derefencing 'cs' after the cpuset use count
1969 * goes to zero, except inside a critical section guarded by manage_sem 1963 * goes to zero, except inside a critical section guarded by manage_mutex
1970 * or callback_sem. Otherwise a zero cpuset use count is a license to 1964 * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 * any other task to nuke the cpuset immediately, via cpuset_rmdir(). 1965 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 * 1966 *
1973 * This routine has to take manage_sem, not callback_sem, because 1967 * This routine has to take manage_mutex, not callback_mutex, because
1974 * it is holding that semaphore while calling check_for_release(), 1968 * it is holding that mutex while calling check_for_release(),
1975 * which calls kmalloc(), so can't be called holding callback__sem(). 1969 * which calls kmalloc(), so can't be called holding callback_mutex().
1976 * 1970 *
1977 * We don't need to task_lock() this reference to tsk->cpuset, 1971 * We don't need to task_lock() this reference to tsk->cpuset,
1978 * because tsk is already marked PF_EXITING, so attach_task() won't 1972 * because tsk is already marked PF_EXITING, so attach_task() won't
@@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk)
2022 if (notify_on_release(cs)) { 2016 if (notify_on_release(cs)) {
2023 char *pathbuf = NULL; 2017 char *pathbuf = NULL;
2024 2018
2025 down(&manage_sem); 2019 mutex_lock(&manage_mutex);
2026 if (atomic_dec_and_test(&cs->count)) 2020 if (atomic_dec_and_test(&cs->count))
2027 check_for_release(cs, &pathbuf); 2021 check_for_release(cs, &pathbuf);
2028 up(&manage_sem); 2022 mutex_unlock(&manage_mutex);
2029 cpuset_release_agent(pathbuf); 2023 cpuset_release_agent(pathbuf);
2030 } else { 2024 } else {
2031 atomic_dec(&cs->count); 2025 atomic_dec(&cs->count);
@@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2046{ 2040{
2047 cpumask_t mask; 2041 cpumask_t mask;
2048 2042
2049 down(&callback_sem); 2043 mutex_lock(&callback_mutex);
2050 task_lock(tsk); 2044 task_lock(tsk);
2051 guarantee_online_cpus(tsk->cpuset, &mask); 2045 guarantee_online_cpus(tsk->cpuset, &mask);
2052 task_unlock(tsk); 2046 task_unlock(tsk);
2053 up(&callback_sem); 2047 mutex_unlock(&callback_mutex);
2054 2048
2055 return mask; 2049 return mask;
2056} 2050}
@@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2074{ 2068{
2075 nodemask_t mask; 2069 nodemask_t mask;
2076 2070
2077 down(&callback_sem); 2071 mutex_lock(&callback_mutex);
2078 task_lock(tsk); 2072 task_lock(tsk);
2079 guarantee_online_mems(tsk->cpuset, &mask); 2073 guarantee_online_mems(tsk->cpuset, &mask);
2080 task_unlock(tsk); 2074 task_unlock(tsk);
2081 up(&callback_sem); 2075 mutex_unlock(&callback_mutex);
2082 2076
2083 return mask; 2077 return mask;
2084} 2078}
@@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2104 2098
2105/* 2099/*
2106 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 2100 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2107 * ancestor to the specified cpuset. Call holding callback_sem. 2101 * ancestor to the specified cpuset. Call holding callback_mutex.
2108 * If no ancestor is mem_exclusive (an unusual configuration), then 2102 * If no ancestor is mem_exclusive (an unusual configuration), then
2109 * returns the root cpuset. 2103 * returns the root cpuset.
2110 */ 2104 */
@@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2131 * GFP_KERNEL allocations are not so marked, so can escape to the 2125 * GFP_KERNEL allocations are not so marked, so can escape to the
2132 * nearest mem_exclusive ancestor cpuset. 2126 * nearest mem_exclusive ancestor cpuset.
2133 * 2127 *
2134 * Scanning up parent cpusets requires callback_sem. The __alloc_pages() 2128 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2135 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2129 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2136 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2130 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2137 * mems_allowed came up empty on the first pass over the zonelist. 2131 * mems_allowed came up empty on the first pass over the zonelist.
2138 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2132 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2139 * short of memory, might require taking the callback_sem semaphore. 2133 * short of memory, might require taking the callback_mutex mutex.
2140 * 2134 *
2141 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2135 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
2142 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2136 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2171 return 1; 2165 return 1;
2172 2166
2173 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2167 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2174 down(&callback_sem); 2168 mutex_lock(&callback_mutex);
2175 2169
2176 task_lock(current); 2170 task_lock(current);
2177 cs = nearest_exclusive_ancestor(current->cpuset); 2171 cs = nearest_exclusive_ancestor(current->cpuset);
2178 task_unlock(current); 2172 task_unlock(current);
2179 2173
2180 allowed = node_isset(node, cs->mems_allowed); 2174 allowed = node_isset(node, cs->mems_allowed);
2181 up(&callback_sem); 2175 mutex_unlock(&callback_mutex);
2182 return allowed; 2176 return allowed;
2183} 2177}
2184 2178
2185/** 2179/**
2186 * cpuset_lock - lock out any changes to cpuset structures 2180 * cpuset_lock - lock out any changes to cpuset structures
2187 * 2181 *
2188 * The out of memory (oom) code needs to lock down cpusets 2182 * The out of memory (oom) code needs to mutex_lock cpusets
2189 * from being changed while it scans the tasklist looking for a 2183 * from being changed while it scans the tasklist looking for a
2190 * task in an overlapping cpuset. Expose callback_sem via this 2184 * task in an overlapping cpuset. Expose callback_mutex via this
2191 * cpuset_lock() routine, so the oom code can lock it, before 2185 * cpuset_lock() routine, so the oom code can lock it, before
2192 * locking the task list. The tasklist_lock is a spinlock, so 2186 * locking the task list. The tasklist_lock is a spinlock, so
2193 * must be taken inside callback_sem. 2187 * must be taken inside callback_mutex.
2194 */ 2188 */
2195 2189
2196void cpuset_lock(void) 2190void cpuset_lock(void)
2197{ 2191{
2198 down(&callback_sem); 2192 mutex_lock(&callback_mutex);
2199} 2193}
2200 2194
2201/** 2195/**
@@ -2206,7 +2200,7 @@ void cpuset_lock(void)
2206 2200
2207void cpuset_unlock(void) 2201void cpuset_unlock(void)
2208{ 2202{
2209 up(&callback_sem); 2203 mutex_unlock(&callback_mutex);
2210} 2204}
2211 2205
2212/** 2206/**
@@ -2218,7 +2212,7 @@ void cpuset_unlock(void)
2218 * determine if task @p's memory usage might impact the memory 2212 * determine if task @p's memory usage might impact the memory
2219 * available to the current task. 2213 * available to the current task.
2220 * 2214 *
2221 * Call while holding callback_sem. 2215 * Call while holding callback_mutex.
2222 **/ 2216 **/
2223 2217
2224int cpuset_excl_nodes_overlap(const struct task_struct *p) 2218int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void)
2289 * - Used for /proc/<pid>/cpuset. 2283 * - Used for /proc/<pid>/cpuset.
2290 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2284 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2291 * doesn't really matter if tsk->cpuset changes after we read it, 2285 * doesn't really matter if tsk->cpuset changes after we read it,
2292 * and we take manage_sem, keeping attach_task() from changing it 2286 * and we take manage_mutex, keeping attach_task() from changing it
2293 * anyway. 2287 * anyway.
2294 */ 2288 */
2295 2289
@@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2305 return -ENOMEM; 2299 return -ENOMEM;
2306 2300
2307 tsk = m->private; 2301 tsk = m->private;
2308 down(&manage_sem); 2302 mutex_lock(&manage_mutex);
2309 cs = tsk->cpuset; 2303 cs = tsk->cpuset;
2310 if (!cs) { 2304 if (!cs) {
2311 retval = -EINVAL; 2305 retval = -EINVAL;
@@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2318 seq_puts(m, buf); 2312 seq_puts(m, buf);
2319 seq_putc(m, '\n'); 2313 seq_putc(m, '\n');
2320out: 2314out:
2321 up(&manage_sem); 2315 mutex_unlock(&manage_mutex);
2322 kfree(buf); 2316 kfree(buf);
2323 return retval; 2317 return retval;
2324} 2318}
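
The long comment rewritten above is the heart of this file's conversion: manage_mutex is the outer lock held across an entire cpuset modification, callback_mutex is the inner lock taken only briefly, and memory allocation is forbidden while callback_mutex is held. A sketch of that nesting, mirroring update_cpumask() and cpuset_sprintf_cpulist() from the patch; the cut-down struct cpuset and the function names modify_cpus_example/read_cpus_example are stand-ins, not code from the patch:

#include <linux/mutex.h>
#include <linux/cpumask.h>

struct cpuset { cpumask_t cpus_allowed; };      /* stand-in for the real structure */

static DEFINE_MUTEX(manage_mutex);      /* outer: single-threads all cpuset modifications */
static DEFINE_MUTEX(callback_mutex);    /* inner: guards short reads/updates of the masks  */

/* Writer: validation and allocation happen under manage_mutex alone;
 * callback_mutex is nested only around the actual update. */
static void modify_cpus_example(struct cpuset *cs, cpumask_t newmask)
{
        mutex_lock(&manage_mutex);
        /* ... validate the trial cpuset, allocate buffers, etc. ... */
        mutex_lock(&callback_mutex);
        cs->cpus_allowed = newmask;
        mutex_unlock(&callback_mutex);
        mutex_unlock(&manage_mutex);
}

/* Reader/callback: callback_mutex alone gives read-only access, and no
 * memory allocation may be attempted while it is held. */
static cpumask_t read_cpus_example(struct cpuset *cs)
{
        cpumask_t mask;

        mutex_lock(&callback_mutex);
        mask = cs->cpus_allowed;
        mutex_unlock(&callback_mutex);
        return mask;
}
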
diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7e1..8037405e136e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -345,9 +345,9 @@ void daemonize(const char *name, ...)
345 exit_mm(current); 345 exit_mm(current);
346 346
347 set_special_pids(1, 1); 347 set_special_pids(1, 1);
348 down(&tty_sem); 348 mutex_lock(&tty_mutex);
349 current->signal->tty = NULL; 349 current->signal->tty = NULL;
350 up(&tty_sem); 350 mutex_unlock(&tty_mutex);
351 351
352 /* Block and flush all signals */ 352 /* Block and flush all signals */
353 sigfillset(&blocked); 353 sigfillset(&blocked);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee418..c79ae0b19a49 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void)
607 atomic_set(&newf->count, 1); 607 atomic_set(&newf->count, 1);
608 608
609 spin_lock_init(&newf->file_lock); 609 spin_lock_init(&newf->file_lock);
610 newf->next_fd = 0;
610 fdt = &newf->fdtab; 611 fdt = &newf->fdtab;
611 fdt->next_fd = 0;
612 fdt->max_fds = NR_OPEN_DEFAULT; 612 fdt->max_fds = NR_OPEN_DEFAULT;
613 fdt->max_fdset = __FD_SETSIZE; 613 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
614 fdt->close_on_exec = &newf->close_on_exec_init; 614 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
615 fdt->open_fds = &newf->open_fds_init; 615 fdt->open_fds = (fd_set *)&newf->open_fds_init;
616 fdt->fd = &newf->fd_array[0]; 616 fdt->fd = &newf->fd_array[0];
617 INIT_RCU_HEAD(&fdt->rcu); 617 INIT_RCU_HEAD(&fdt->rcu);
618 fdt->free_files = NULL; 618 fdt->free_files = NULL;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73ce..1fb9f753ef60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 down(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479out: 479out:
480 up(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 down(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 up(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 up(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513valid_p: 513valid_p:
@@ -523,7 +523,7 @@ valid_p:
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 up(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a1092c..6a5373868a98 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
12#include <linux/unistd.h> 12#include <linux/unistd.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h>
15#include <asm/semaphore.h> 16#include <asm/semaphore.h>
16 17
17/* 18/*
@@ -41,7 +42,7 @@ struct kthread_stop_info
41 42
42/* Thread stopping is done by setthing this var: lock serializes 43/* Thread stopping is done by setthing this var: lock serializes
43 * multiple kthread_stop calls. */ 44 * multiple kthread_stop calls. */
44static DECLARE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
45static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
46 47
47int kthread_should_stop(void) 48int kthread_should_stop(void)
@@ -173,7 +174,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{ 174{
174 int ret; 175 int ret;
175 176
176 down(&kthread_stop_lock); 177 mutex_lock(&kthread_stop_lock);
177 178
178 /* It could exit after stop_info.k set, but before wake_up_process. */ 179 /* It could exit after stop_info.k set, but before wake_up_process. */
179 get_task_struct(k); 180 get_task_struct(k);
@@ -194,7 +195,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
194 wait_for_completion(&kthread_stop_info.done); 195 wait_for_completion(&kthread_stop_info.done);
195 kthread_stop_info.k = NULL; 196 kthread_stop_info.k = NULL;
196 ret = kthread_stop_info.err; 197 ret = kthread_stop_info.err;
197 up(&kthread_stop_lock); 198 mutex_unlock(&kthread_stop_lock);
198 199
199 return ret; 200 return ret;
200} 201}
diff --git a/kernel/module.c b/kernel/module.c
index 77764f22f021..fb404299082e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/semaphore.h> 44#include <asm/semaphore.h>
44#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
@@ -60,18 +61,18 @@
60static DEFINE_SPINLOCK(modlist_lock); 61static DEFINE_SPINLOCK(modlist_lock);
61 62
62/* List of modules, protected by module_mutex AND modlist_lock */ 63/* List of modules, protected by module_mutex AND modlist_lock */
63static DECLARE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
64static LIST_HEAD(modules); 65static LIST_HEAD(modules);
65 66
66static DECLARE_MUTEX(notify_mutex); 67static DEFINE_MUTEX(notify_mutex);
67static struct notifier_block * module_notify_list; 68static struct notifier_block * module_notify_list;
68 69
69int register_module_notifier(struct notifier_block * nb) 70int register_module_notifier(struct notifier_block * nb)
70{ 71{
71 int err; 72 int err;
72 down(&notify_mutex); 73 mutex_lock(&notify_mutex);
73 err = notifier_chain_register(&module_notify_list, nb); 74 err = notifier_chain_register(&module_notify_list, nb);
74 up(&notify_mutex); 75 mutex_unlock(&notify_mutex);
75 return err; 76 return err;
76} 77}
77EXPORT_SYMBOL(register_module_notifier); 78EXPORT_SYMBOL(register_module_notifier);
@@ -79,9 +80,9 @@ EXPORT_SYMBOL(register_module_notifier);
79int unregister_module_notifier(struct notifier_block * nb) 80int unregister_module_notifier(struct notifier_block * nb)
80{ 81{
81 int err; 82 int err;
82 down(&notify_mutex); 83 mutex_lock(&notify_mutex);
83 err = notifier_chain_unregister(&module_notify_list, nb); 84 err = notifier_chain_unregister(&module_notify_list, nb);
84 up(&notify_mutex); 85 mutex_unlock(&notify_mutex);
85 return err; 86 return err;
86} 87}
87EXPORT_SYMBOL(unregister_module_notifier); 88EXPORT_SYMBOL(unregister_module_notifier);
@@ -601,7 +602,7 @@ static void free_module(struct module *mod);
601static void wait_for_zero_refcount(struct module *mod) 602static void wait_for_zero_refcount(struct module *mod)
602{ 603{
603 /* Since we might sleep for some time, drop the semaphore first */ 604 /* Since we might sleep for some time, drop the semaphore first */
604 up(&module_mutex); 605 mutex_unlock(&module_mutex);
605 for (;;) { 606 for (;;) {
606 DEBUGP("Looking at refcount...\n"); 607 DEBUGP("Looking at refcount...\n");
607 set_current_state(TASK_UNINTERRUPTIBLE); 608 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -610,7 +611,7 @@ static void wait_for_zero_refcount(struct module *mod)
610 schedule(); 611 schedule();
611 } 612 }
612 current->state = TASK_RUNNING; 613 current->state = TASK_RUNNING;
613 down(&module_mutex); 614 mutex_lock(&module_mutex);
614} 615}
615 616
616asmlinkage long 617asmlinkage long
@@ -627,7 +628,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
627 return -EFAULT; 628 return -EFAULT;
628 name[MODULE_NAME_LEN-1] = '\0'; 629 name[MODULE_NAME_LEN-1] = '\0';
629 630
630 if (down_interruptible(&module_mutex) != 0) 631 if (mutex_lock_interruptible(&module_mutex) != 0)
631 return -EINTR; 632 return -EINTR;
632 633
633 mod = find_module(name); 634 mod = find_module(name);
@@ -676,14 +677,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
676 677
677 /* Final destruction now noone is using it. */ 678 /* Final destruction now noone is using it. */
678 if (mod->exit != NULL) { 679 if (mod->exit != NULL) {
679 up(&module_mutex); 680 mutex_unlock(&module_mutex);
680 mod->exit(); 681 mod->exit();
681 down(&module_mutex); 682 mutex_lock(&module_mutex);
682 } 683 }
683 free_module(mod); 684 free_module(mod);
684 685
685 out: 686 out:
686 up(&module_mutex); 687 mutex_unlock(&module_mutex);
687 return ret; 688 return ret;
688} 689}
689 690
@@ -1972,13 +1973,13 @@ sys_init_module(void __user *umod,
1972 return -EPERM; 1973 return -EPERM;
1973 1974
1974 /* Only one module load at a time, please */ 1975 /* Only one module load at a time, please */
1975 if (down_interruptible(&module_mutex) != 0) 1976 if (mutex_lock_interruptible(&module_mutex) != 0)
1976 return -EINTR; 1977 return -EINTR;
1977 1978
1978 /* Do all the hard work */ 1979 /* Do all the hard work */
1979 mod = load_module(umod, len, uargs); 1980 mod = load_module(umod, len, uargs);
1980 if (IS_ERR(mod)) { 1981 if (IS_ERR(mod)) {
1981 up(&module_mutex); 1982 mutex_unlock(&module_mutex);
1982 return PTR_ERR(mod); 1983 return PTR_ERR(mod);
1983 } 1984 }
1984 1985
@@ -1987,11 +1988,11 @@ sys_init_module(void __user *umod,
1987 stop_machine_run(__link_module, mod, NR_CPUS); 1988 stop_machine_run(__link_module, mod, NR_CPUS);
1988 1989
1989 /* Drop lock so they can recurse */ 1990 /* Drop lock so they can recurse */
1990 up(&module_mutex); 1991 mutex_unlock(&module_mutex);
1991 1992
1992 down(&notify_mutex); 1993 mutex_lock(&notify_mutex);
1993 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1994 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
1994 up(&notify_mutex); 1995 mutex_unlock(&notify_mutex);
1995 1996
1996 /* Start the module */ 1997 /* Start the module */
1997 if (mod->init != NULL) 1998 if (mod->init != NULL)
@@ -2006,15 +2007,15 @@ sys_init_module(void __user *umod,
2006 mod->name); 2007 mod->name);
2007 else { 2008 else {
2008 module_put(mod); 2009 module_put(mod);
2009 down(&module_mutex); 2010 mutex_lock(&module_mutex);
2010 free_module(mod); 2011 free_module(mod);
2011 up(&module_mutex); 2012 mutex_unlock(&module_mutex);
2012 } 2013 }
2013 return ret; 2014 return ret;
2014 } 2015 }
2015 2016
2016 /* Now it's a first class citizen! */ 2017 /* Now it's a first class citizen! */
2017 down(&module_mutex); 2018 mutex_lock(&module_mutex);
2018 mod->state = MODULE_STATE_LIVE; 2019 mod->state = MODULE_STATE_LIVE;
2019 /* Drop initial reference. */ 2020 /* Drop initial reference. */
2020 module_put(mod); 2021 module_put(mod);
@@ -2022,7 +2023,7 @@ sys_init_module(void __user *umod,
2022 mod->module_init = NULL; 2023 mod->module_init = NULL;
2023 mod->init_size = 0; 2024 mod->init_size = 0;
2024 mod->init_text_size = 0; 2025 mod->init_text_size = 0;
2025 up(&module_mutex); 2026 mutex_unlock(&module_mutex);
2026 2027
2027 return 0; 2028 return 0;
2028} 2029}
@@ -2112,7 +2113,7 @@ struct module *module_get_kallsym(unsigned int symnum,
2112{ 2113{
2113 struct module *mod; 2114 struct module *mod;
2114 2115
2115 down(&module_mutex); 2116 mutex_lock(&module_mutex);
2116 list_for_each_entry(mod, &modules, list) { 2117 list_for_each_entry(mod, &modules, list) {
2117 if (symnum < mod->num_symtab) { 2118 if (symnum < mod->num_symtab) {
2118 *value = mod->symtab[symnum].st_value; 2119 *value = mod->symtab[symnum].st_value;
@@ -2120,12 +2121,12 @@ struct module *module_get_kallsym(unsigned int symnum,
2120 strncpy(namebuf, 2121 strncpy(namebuf,
2121 mod->strtab + mod->symtab[symnum].st_name, 2122 mod->strtab + mod->symtab[symnum].st_name,
2122 127); 2123 127);
2123 up(&module_mutex); 2124 mutex_unlock(&module_mutex);
2124 return mod; 2125 return mod;
2125 } 2126 }
2126 symnum -= mod->num_symtab; 2127 symnum -= mod->num_symtab;
2127 } 2128 }
2128 up(&module_mutex); 2129 mutex_unlock(&module_mutex);
2129 return NULL; 2130 return NULL;
2130} 2131}
2131 2132
@@ -2168,7 +2169,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
2168 struct list_head *i; 2169 struct list_head *i;
2169 loff_t n = 0; 2170 loff_t n = 0;
2170 2171
2171 down(&module_mutex); 2172 mutex_lock(&module_mutex);
2172 list_for_each(i, &modules) { 2173 list_for_each(i, &modules) {
2173 if (n++ == *pos) 2174 if (n++ == *pos)
2174 break; 2175 break;
@@ -2189,7 +2190,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2189 2190
2190static void m_stop(struct seq_file *m, void *p) 2191static void m_stop(struct seq_file *m, void *p)
2191{ 2192{
2192 up(&module_mutex); 2193 mutex_unlock(&module_mutex);
2193} 2194}
2194 2195
2195static int m_show(struct seq_file *m, void *p) 2196static int m_show(struct seq_file *m, void *p)
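
module.c gets the same treatment, with one extra wrinkle: the user-facing syscalls took module_mutex with down_interruptible(), and the conversion keeps that contract via mutex_lock_interruptible(), returning -EINTR if a signal arrives while waiting. A sketch of the pattern; example_module_op is an illustrative name, and the local DEFINE_MUTEX stands in for the static mutex defined in module.c above:

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(module_mutex);      /* stand-in for module.c's static lock */

static int example_module_op(void)
{
        /* Fail with -EINTR rather than sleeping uninterruptibly, exactly
         * as sys_init_module()/sys_delete_module() do above. */
        if (mutex_lock_interruptible(&module_mutex) != 0)
                return -EINTR;

        /* ... work that needs the module list to stay stable; the lock is
         * dropped around long-running steps such as mod->exit() ... */

        mutex_unlock(&module_mutex);
        return 0;
}
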
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c74..acd95adddb93 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,10 +20,13 @@
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h> 21#include <linux/kexec.h>
22 22
23int panic_timeout;
24int panic_on_oops; 23int panic_on_oops;
25int tainted; 24int tainted;
25static int pause_on_oops;
26static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock);
26 28
29int panic_timeout;
27EXPORT_SYMBOL(panic_timeout); 30EXPORT_SYMBOL(panic_timeout);
28 31
29struct notifier_block *panic_notifier_list; 32struct notifier_block *panic_notifier_list;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
174 tainted |= flag; 177 tainted |= flag;
175} 178}
176EXPORT_SYMBOL(add_taint); 179EXPORT_SYMBOL(add_taint);
180
181static int __init pause_on_oops_setup(char *str)
182{
183 pause_on_oops = simple_strtoul(str, NULL, 0);
184 return 1;
185}
186__setup("pause_on_oops=", pause_on_oops_setup);
187
188static void spin_msec(int msecs)
189{
190 int i;
191
192 for (i = 0; i < msecs; i++) {
193 touch_nmi_watchdog();
194 mdelay(1);
195 }
196}
197
198/*
199 * It just happens that oops_enter() and oops_exit() are identically
200 * implemented...
201 */
202static void do_oops_enter_exit(void)
203{
204 unsigned long flags;
205 static int spin_counter;
206
207 if (!pause_on_oops)
208 return;
209
210 spin_lock_irqsave(&pause_on_oops_lock, flags);
211 if (pause_on_oops_flag == 0) {
212 /* This CPU may now print the oops message */
213 pause_on_oops_flag = 1;
214 } else {
215 /* We need to stall this CPU */
216 if (!spin_counter) {
217 /* This CPU gets to do the counting */
218 spin_counter = pause_on_oops;
219 do {
220 spin_unlock(&pause_on_oops_lock);
221 spin_msec(MSEC_PER_SEC);
222 spin_lock(&pause_on_oops_lock);
223 } while (--spin_counter);
224 pause_on_oops_flag = 0;
225 } else {
226 /* This CPU waits for a different one */
227 while (spin_counter) {
228 spin_unlock(&pause_on_oops_lock);
229 spin_msec(1);
230 spin_lock(&pause_on_oops_lock);
231 }
232 }
233 }
234 spin_unlock_irqrestore(&pause_on_oops_lock, flags);
235}
236
237/*
238 * Return true if the calling CPU is allowed to print oops-related info. This
239 * is a bit racy..
240 */
241int oops_may_print(void)
242{
243 return pause_on_oops_flag == 0;
244}
245
246/*
247 * Called when the architecture enters its oops handler, before it prints
248 * anything. If this is the first CPU to oops, and it's oopsing the first time
249 * then let it proceed.
250 *
251 * This is all enabled by the pause_on_oops kernel boot option. We do all this
252 * to ensure that oopses don't scroll off the screen. It has the side-effect
253 * of preventing later-oopsing CPUs from mucking up the display, too.
254 *
255 * It turns out that the CPU which is allowed to print ends up pausing for the
256 * right duration, whereas all the other CPUs pause for twice as long: once in
257 * oops_enter(), once in oops_exit().
258 */
259void oops_enter(void)
260{
261 do_oops_enter_exit();
262}
263
264/*
265 * Called when the architecture exits its oops handler, after printing
266 * everything.
267 */
268void oops_exit(void)
269{
270 do_oops_enter_exit();
271}
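
The oops_enter()/oops_may_print()/oops_exit() hooks added above are meant to be called from each architecture's die path. A minimal sketch of the expected call sequence; the handler name and message are illustrative and not taken from any particular architecture:

void example_arch_die(const char *str, struct pt_regs *regs, long err)
{
        oops_enter();                   /* may stall this CPU if pause_on_oops= is set */
        if (oops_may_print()) {
                printk(KERN_EMERG "Oops: %s, error %ld\n", str, err);
                /* ... dump registers and the stack from *regs ... */
        }
        oops_exit();                    /* second half of the pause for non-printing CPUs */
}
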
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fa895fc2ecf5..9944379360b5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/mutex.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/semaphore.h> 41#include <asm/semaphore.h>
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96a7..8d0af3d37a4b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
5 5
6obj-y := main.o process.o console.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11 11
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc980..81d4d982f3f0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
22#include "power.h" 22#include "power.h"
23 23
24 24
25extern suspend_disk_method_t pm_disk_mode;
26
27extern int swsusp_shrink_memory(void);
28extern int swsusp_suspend(void);
29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
30extern int swsusp_check(void);
31extern int swsusp_read(struct pbe **pblist_ptr);
32extern void swsusp_close(void);
33extern int swsusp_resume(void);
34
35
36static int noresume = 0; 25static int noresume = 0;
37char resume_file[256] = CONFIG_PM_STD_PARTITION; 26char resume_file[256] = CONFIG_PM_STD_PARTITION;
38dev_t swsusp_resume_device; 27dev_t swsusp_resume_device;
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode)
70 while(1); 59 while(1);
71} 60}
72 61
73
74static int in_suspend __nosavedata = 0;
75
76
77static inline void platform_finish(void) 62static inline void platform_finish(void)
78{ 63{
79 if (pm_disk_mode == PM_DISK_PLATFORM) { 64 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -87,7 +72,6 @@ static int prepare_processes(void)
87 int error; 72 int error;
88 73
89 pm_prepare_console(); 74 pm_prepare_console();
90 sys_sync();
91 disable_nonboot_cpus(); 75 disable_nonboot_cpus();
92 76
93 if (freeze_processes()) { 77 if (freeze_processes()) {
@@ -145,7 +129,7 @@ int pm_suspend_disk(void)
145 if (in_suspend) { 129 if (in_suspend) {
146 device_resume(); 130 device_resume();
147 pr_debug("PM: writing image.\n"); 131 pr_debug("PM: writing image.\n");
148 error = swsusp_write(pagedir_nosave, nr_copy_pages); 132 error = swsusp_write();
149 if (!error) 133 if (!error)
150 power_down(pm_disk_mode); 134 power_down(pm_disk_mode);
151 else { 135 else {
@@ -216,7 +200,7 @@ static int software_resume(void)
216 200
217 pr_debug("PM: Reading swsusp image.\n"); 201 pr_debug("PM: Reading swsusp image.\n");
218 202
219 if ((error = swsusp_read(&pagedir_nosave))) { 203 if ((error = swsusp_read())) {
220 swsusp_free(); 204 swsusp_free();
221 goto Thaw; 205 goto Thaw;
222 } 206 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4a9..ee371f50ccaa 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
103} 103}
104 104
105 105
106static int suspend_enter(suspend_state_t state) 106int suspend_enter(suspend_state_t state)
107{ 107{
108 int error = 0; 108 int error = 0;
109 unsigned long flags; 109 unsigned long flags;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857dd..0f6908cce1dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h> 26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h>
28 29
29int pm_active; 30int pm_active;
30 31
@@ -40,7 +41,7 @@ int pm_active;
40 * until a resume but that will be fine. 41 * until a resume but that will be fine.
41 */ 42 */
42 43
43static DECLARE_MUTEX(pm_devs_lock); 44static DEFINE_MUTEX(pm_devs_lock);
44static LIST_HEAD(pm_devs); 45static LIST_HEAD(pm_devs);
45 46
46/** 47/**
@@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type,
67 dev->id = id; 68 dev->id = id;
68 dev->callback = callback; 69 dev->callback = callback;
69 70
70 down(&pm_devs_lock); 71 mutex_lock(&pm_devs_lock);
71 list_add(&dev->entry, &pm_devs); 72 list_add(&dev->entry, &pm_devs);
72 up(&pm_devs_lock); 73 mutex_unlock(&pm_devs_lock);
73 } 74 }
74 return dev; 75 return dev;
75} 76}
@@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type,
85void pm_unregister(struct pm_dev *dev) 86void pm_unregister(struct pm_dev *dev)
86{ 87{
87 if (dev) { 88 if (dev) {
88 down(&pm_devs_lock); 89 mutex_lock(&pm_devs_lock);
89 list_del(&dev->entry); 90 list_del(&dev->entry);
90 up(&pm_devs_lock); 91 mutex_unlock(&pm_devs_lock);
91 92
92 kfree(dev); 93 kfree(dev);
93 } 94 }
@@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback)
118 if (!callback) 119 if (!callback)
119 return; 120 return;
120 121
121 down(&pm_devs_lock); 122 mutex_lock(&pm_devs_lock);
122 entry = pm_devs.next; 123 entry = pm_devs.next;
123 while (entry != &pm_devs) { 124 while (entry != &pm_devs) {
124 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 125 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback)
126 if (dev->callback == callback) 127 if (dev->callback == callback)
127 __pm_unregister(dev); 128 __pm_unregister(dev);
128 } 129 }
129 up(&pm_devs_lock); 130 mutex_unlock(&pm_devs_lock);
130} 131}
131 132
132/** 133/**
@@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data)
234{ 235{
235 struct list_head *entry; 236 struct list_head *entry;
236 237
237 down(&pm_devs_lock); 238 mutex_lock(&pm_devs_lock);
238 entry = pm_devs.next; 239 entry = pm_devs.next;
239 while (entry != &pm_devs) { 240 while (entry != &pm_devs) {
240 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 241 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data)
246 */ 247 */
247 if (rqst == PM_SUSPEND) 248 if (rqst == PM_SUSPEND)
248 pm_undo_all(dev); 249 pm_undo_all(dev);
249 up(&pm_devs_lock); 250 mutex_unlock(&pm_devs_lock);
250 return status; 251 return status;
251 } 252 }
252 } 253 }
253 entry = entry->next; 254 entry = entry->next;
254 } 255 }
255 up(&pm_devs_lock); 256 mutex_unlock(&pm_devs_lock);
256 return 0; 257 return 0;
257} 258}
258 259
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba680841..f06f12f21767 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
8 int cpus; 8 int cpus;
9 unsigned long image_pages; 9 unsigned long image_pages;
10 unsigned long pages; 10 unsigned long pages;
11 unsigned long size;
11} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
12 13
13 14
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys;
37/* References to section boundaries */ 38/* References to section boundaries */
38extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
39 40
40extern unsigned int nr_copy_pages;
41extern struct pbe *pagedir_nosave; 41extern struct pbe *pagedir_nosave;
42 42
43/* Preferred image size in bytes (default 500 MB) */ 43/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 44extern unsigned long image_size;
45extern int in_suspend;
46extern dev_t swsusp_resume_device;
45 47
46extern asmlinkage int swsusp_arch_suspend(void); 48extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 49extern asmlinkage int swsusp_arch_resume(void);
48 50
49extern unsigned int count_data_pages(void); 51extern unsigned int count_data_pages(void);
50extern void free_pagedir(struct pbe *pblist); 52
51extern void release_eaten_pages(void); 53struct snapshot_handle {
52extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 54 loff_t offset;
55 unsigned int page;
56 unsigned int page_offset;
57 unsigned int prev;
58 struct pbe *pbe;
59 void *buffer;
60 unsigned int buf_offset;
61};
62
63#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle);
68
69#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
71#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
72#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
73#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
74#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
75#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
76#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
77#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
78#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
79#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
80#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
81#define SNAPSHOT_IOC_MAXNR 11
82
83/**
 84 * The bitmap is used for tracking allocated swap pages
85 *
86 * The entire bitmap consists of a number of bitmap_page
87 * structures linked with the help of the .next member.
88 * Thus each page can be allocated individually, so we only
89 * need to make 0-order memory allocations to create
90 * the bitmap.
91 */
92
93#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
94#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
95#define BITS_PER_CHUNK (sizeof(long) * 8)
96#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
97
98struct bitmap_page {
99 unsigned long chunks[BITMAP_PAGE_CHUNKS];
100 struct bitmap_page *next;
101};
102
103extern void free_bitmap(struct bitmap_page *bitmap);
104extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
105extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
106extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
107
108extern int swsusp_check(void);
109extern int swsusp_shrink_memory(void);
53extern void swsusp_free(void); 110extern void swsusp_free(void);
54extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 111extern int swsusp_suspend(void);
55extern unsigned int snapshot_nr_pages(void); 112extern int swsusp_resume(void);
56extern struct pbe *snapshot_pblist(void); 113extern int swsusp_read(void);
57extern void snapshot_pblist_set(struct pbe *pblist); 114extern int swsusp_write(void);
115extern void swsusp_close(void);
116extern int suspend_enter(suspend_state_t state);
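
The SNAPSHOT_* ioctls declared above form the userspace side of the interface implemented by the new kernel/power/user.c. A hedged userspace sketch using the definitions from this header; the /dev/snapshot node name is an assumption here, since the device registration is not shown in this diff:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

int example_freeze_and_snapshot(void)
{
        int in_suspend = 0;
        int fd = open("/dev/snapshot", O_RDONLY);       /* assumed device node */

        if (fd < 0)
                return -1;
        if (ioctl(fd, SNAPSHOT_FREEZE, 0) ||
            ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend)) {
                close(fd);
                return -1;
        }
        /* ... read() the image pages and store them somewhere, then ... */
        ioctl(fd, SNAPSHOT_UNFREEZE, 0);
        close(fd);
        return 0;
}
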
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a0b..8ac7c35fad77 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h>
15 16
16/* 17/*
17 * Timeout for stopping processes 18 * Timeout for stopping processes
18 */ 19 */
19#define TIMEOUT (6 * HZ) 20#define TIMEOUT (20 * HZ)
20 21
21 22
22static inline int freezeable(struct task_struct * p) 23static inline int freezeable(struct task_struct * p)
@@ -54,38 +55,62 @@ void refrigerator(void)
54 current->state = save; 55 current->state = save;
55} 56}
56 57
58static inline void freeze_process(struct task_struct *p)
59{
60 unsigned long flags;
61
62 if (!freezing(p)) {
63 freeze(p);
64 spin_lock_irqsave(&p->sighand->siglock, flags);
65 signal_wake_up(p, 0);
66 spin_unlock_irqrestore(&p->sighand->siglock, flags);
67 }
68}
69
57/* 0 = success, else # of processes that we failed to stop */ 70/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 71int freeze_processes(void)
59{ 72{
60 int todo; 73 int todo, nr_user, user_frozen;
61 unsigned long start_time; 74 unsigned long start_time;
62 struct task_struct *g, *p; 75 struct task_struct *g, *p;
63 unsigned long flags; 76 unsigned long flags;
64 77
65 printk( "Stopping tasks: " ); 78 printk( "Stopping tasks: " );
66 start_time = jiffies; 79 start_time = jiffies;
80 user_frozen = 0;
67 do { 81 do {
68 todo = 0; 82 nr_user = todo = 0;
69 read_lock(&tasklist_lock); 83 read_lock(&tasklist_lock);
70 do_each_thread(g, p) { 84 do_each_thread(g, p) {
71 if (!freezeable(p)) 85 if (!freezeable(p))
72 continue; 86 continue;
73 if (frozen(p)) 87 if (frozen(p))
74 continue; 88 continue;
75 89 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
76 freeze(p); 90 /* The task is a user-space one.
77 spin_lock_irqsave(&p->sighand->siglock, flags); 91 * Freeze it unless there's a vfork completion
78 signal_wake_up(p, 0); 92 * pending
79 spin_unlock_irqrestore(&p->sighand->siglock, flags); 93 */
80 todo++; 94 if (!p->vfork_done)
95 freeze_process(p);
96 nr_user++;
97 } else {
98 /* Freeze only if the user space is frozen */
99 if (user_frozen)
100 freeze_process(p);
101 todo++;
102 }
81 } while_each_thread(g, p); 103 } while_each_thread(g, p);
82 read_unlock(&tasklist_lock); 104 read_unlock(&tasklist_lock);
105 todo += nr_user;
106 if (!user_frozen && !nr_user) {
107 sys_sync();
108 start_time = jiffies;
109 }
110 user_frozen = !nr_user;
83 yield(); /* Yield is okay here */ 111 yield(); /* Yield is okay here */
84 if (todo && time_after(jiffies, start_time + TIMEOUT)) { 112 if (todo && time_after(jiffies, start_time + TIMEOUT))
85 printk( "\n" );
86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
87 break; 113 break;
88 }
89 } while(todo); 114 } while(todo);
90 115
91 /* This does not unfreeze processes that are already frozen 116 /* This does not unfreeze processes that are already frozen
@@ -94,8 +119,14 @@ int freeze_processes(void)
94 * but it cleans up leftover PF_FREEZE requests. 119 * but it cleans up leftover PF_FREEZE requests.
95 */ 120 */
96 if (todo) { 121 if (todo) {
122 printk( "\n" );
123 printk(KERN_ERR " stopping tasks timed out "
124 "after %d seconds (%d tasks remaining):\n",
125 TIMEOUT / HZ, todo);
97 read_lock(&tasklist_lock); 126 read_lock(&tasklist_lock);
98 do_each_thread(g, p) 127 do_each_thread(g, p) {
128 if (freezeable(p) && !frozen(p))
129 printk(KERN_ERR " %s\n", p->comm);
99 if (freezing(p)) { 130 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm); 131 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE; 132 p->flags &= ~PF_FREEZE;
@@ -103,7 +134,7 @@ int freeze_processes(void)
103 recalc_sigpending_tsk(p); 134 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags); 135 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 } 136 }
106 while_each_thread(g, p); 137 } while_each_thread(g, p);
107 read_unlock(&tasklist_lock); 138 read_unlock(&tasklist_lock);
108 return todo; 139 return todo;
109 } 140 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d621..c5863d02c89e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12 12
13#include <linux/version.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/suspend.h> 16#include <linux/suspend.h>
@@ -34,7 +35,9 @@
34#include "power.h" 35#include "power.h"
35 36
36struct pbe *pagedir_nosave; 37struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages; 38static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages;
40static unsigned long *buffer;
38 41
39#ifdef CONFIG_HIGHMEM 42#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void) 43unsigned int count_highmem_pages(void)
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
80 void *kaddr; 83 void *kaddr;
81 unsigned long pfn = zone_pfn + zone->zone_start_pfn; 84 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
82 85
83 if (!(pfn%1000)) 86 if (!(pfn%10000))
84 printk("."); 87 printk(".");
85 if (!pfn_valid(pfn)) 88 if (!pfn_valid(pfn))
86 continue; 89 continue;
@@ -119,13 +122,15 @@ int save_highmem(void)
119 struct zone *zone; 122 struct zone *zone;
120 int res = 0; 123 int res = 0;
121 124
122 pr_debug("swsusp: Saving Highmem\n"); 125 pr_debug("swsusp: Saving Highmem");
126 drain_local_pages();
123 for_each_zone (zone) { 127 for_each_zone (zone) {
124 if (is_highmem(zone)) 128 if (is_highmem(zone))
125 res = save_highmem_zone(zone); 129 res = save_highmem_zone(zone);
126 if (res) 130 if (res)
127 return res; 131 return res;
128 } 132 }
133 printk("\n");
129 return 0; 134 return 0;
130} 135}
131 136
@@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist)
235 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
236 */ 241 */
237 242
238void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist)
239{ 244{
240 struct pbe *pbe; 245 struct pbe *pbe;
241 246
@@ -301,7 +306,7 @@ struct eaten_page {
301 306
302static struct eaten_page *eaten_pages = NULL; 307static struct eaten_page *eaten_pages = NULL;
303 308
304void release_eaten_pages(void) 309static void release_eaten_pages(void)
305{ 310{
306 struct eaten_page *p, *q; 311 struct eaten_page *p, *q;
307 312
@@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
376 if (!nr_pages) 381 if (!nr_pages)
377 return NULL; 382 return NULL;
378 383
379 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
380 pblist = alloc_image_page(gfp_mask, safe_needed); 384 pblist = alloc_image_page(gfp_mask, safe_needed);
381 /* FIXME: rewrite this ugly loop */ 385 /* FIXME: rewrite this ugly loop */
382 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 386 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
388 free_pagedir(pblist); 392 free_pagedir(pblist);
389 pblist = NULL; 393 pblist = NULL;
390 } else 394 } else
391 create_pbe_list(pblist, nr_pages); 395 create_pbe_list(pblist, nr_pages);
392 return pblist; 396 return pblist;
393} 397}
394 398
@@ -414,6 +418,10 @@ void swsusp_free(void)
414 } 418 }
415 } 419 }
416 } 420 }
421 nr_copy_pages = 0;
422 nr_meta_pages = 0;
423 pagedir_nosave = NULL;
424 buffer = NULL;
417} 425}
418 426
419 427
@@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages)
437 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 445 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
438} 446}
439 447
440int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 448static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
441{ 449{
442 struct pbe *p; 450 struct pbe *p;
443 451
@@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void)
504 */ 512 */
505 513
506 nr_copy_pages = nr_pages; 514 nr_copy_pages = nr_pages;
515 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
507 516
508 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 517 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
509 return 0; 518 return 0;
510} 519}
520
521static void init_header(struct swsusp_info *info)
522{
523 memset(info, 0, sizeof(struct swsusp_info));
524 info->version_code = LINUX_VERSION_CODE;
525 info->num_physpages = num_physpages;
526 memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
527 info->cpus = num_online_cpus();
528 info->image_pages = nr_copy_pages;
529 info->pages = nr_copy_pages + nr_meta_pages + 1;
530 info->size = info->pages;
531 info->size <<= PAGE_SHIFT;
532}
533
534/**
535 * pack_orig_addresses - the .orig_address fields of the PBEs from the
536 * list starting at @pbe are stored in the array @buf[] (1 page)
537 */
538
539static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
540{
541 int j;
542
543 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
544 buf[j] = pbe->orig_address;
545 pbe = pbe->next;
546 }
547 if (!pbe)
548 for (; j < PAGE_SIZE / sizeof(long); j++)
549 buf[j] = 0;
550 return pbe;
551}
552
553/**
554 * snapshot_read_next - used for reading the system memory snapshot.
555 *
556 * On the first call to it @handle should point to a zeroed
557 * snapshot_handle structure. The structure gets updated and a pointer
 558 * to it should be passed to this function on each subsequent call.
559 *
560 * The @count parameter should contain the number of bytes the caller
561 * wants to read from the snapshot. It must not be zero.
562 *
563 * On success the function returns a positive number. Then, the caller
564 * is allowed to read up to the returned number of bytes from the memory
565 * location computed by the data_of() macro. The number returned
566 * may be smaller than @count, but this only happens if the read would
567 * cross a page boundary otherwise.
568 *
569 * The function returns 0 to indicate the end of data stream condition,
570 * and a negative number is returned on error. In such cases the
571 * structure pointed to by @handle is not updated and should not be used
572 * any more.
573 */
574
575int snapshot_read_next(struct snapshot_handle *handle, size_t count)
576{
577 if (handle->page > nr_meta_pages + nr_copy_pages)
578 return 0;
579 if (!buffer) {
580 /* This makes the buffer be freed by swsusp_free() */
581 buffer = alloc_image_page(GFP_ATOMIC, 0);
582 if (!buffer)
583 return -ENOMEM;
584 }
585 if (!handle->offset) {
586 init_header((struct swsusp_info *)buffer);
587 handle->buffer = buffer;
588 handle->pbe = pagedir_nosave;
589 }
590 if (handle->prev < handle->page) {
591 if (handle->page <= nr_meta_pages) {
592 handle->pbe = pack_orig_addresses(buffer, handle->pbe);
593 if (!handle->pbe)
594 handle->pbe = pagedir_nosave;
595 } else {
596 handle->buffer = (void *)handle->pbe->address;
597 handle->pbe = handle->pbe->next;
598 }
599 handle->prev = handle->page;
600 }
601 handle->buf_offset = handle->page_offset;
602 if (handle->page_offset + count >= PAGE_SIZE) {
603 count = PAGE_SIZE - handle->page_offset;
604 handle->page_offset = 0;
605 handle->page++;
606 } else {
607 handle->page_offset += count;
608 }
609 handle->offset += count;
610 return count;
611}
612
613/**
614 * mark_unsafe_pages - mark the pages that cannot be used for storing
615 * the image during resume, because they conflict with the pages that
616 * had been used before suspend
617 */
618
619static int mark_unsafe_pages(struct pbe *pblist)
620{
621 struct zone *zone;
622 unsigned long zone_pfn;
623 struct pbe *p;
624
625 if (!pblist) /* a sanity check */
626 return -EINVAL;
627
628 /* Clear page flags */
629 for_each_zone (zone) {
630 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
631 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
632 ClearPageNosaveFree(pfn_to_page(zone_pfn +
633 zone->zone_start_pfn));
634 }
635
636 /* Mark orig addresses */
637 for_each_pbe (p, pblist) {
638 if (virt_addr_valid(p->orig_address))
639 SetPageNosaveFree(virt_to_page(p->orig_address));
640 else
641 return -EFAULT;
642 }
643
644 return 0;
645}
646
647static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
648{
649 /* We assume both lists contain the same number of elements */
650 while (src) {
651 dst->orig_address = src->orig_address;
652 dst = dst->next;
653 src = src->next;
654 }
655}
656
657static int check_header(struct swsusp_info *info)
658{
659 char *reason = NULL;
660
661 if (info->version_code != LINUX_VERSION_CODE)
662 reason = "kernel version";
663 if (info->num_physpages != num_physpages)
664 reason = "memory size";
665 if (strcmp(info->uts.sysname,system_utsname.sysname))
666 reason = "system type";
667 if (strcmp(info->uts.release,system_utsname.release))
668 reason = "kernel release";
669 if (strcmp(info->uts.version,system_utsname.version))
670 reason = "version";
671 if (strcmp(info->uts.machine,system_utsname.machine))
672 reason = "machine";
673 if (reason) {
674 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
675 return -EPERM;
676 }
677 return 0;
678}
679
680/**
681 * load header - check the image header and copy data from it
682 */
683
684static int load_header(struct snapshot_handle *handle,
685 struct swsusp_info *info)
686{
687 int error;
688 struct pbe *pblist;
689
690 error = check_header(info);
691 if (!error) {
692 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
693 if (!pblist)
694 return -ENOMEM;
695 pagedir_nosave = pblist;
696 handle->pbe = pblist;
697 nr_copy_pages = info->image_pages;
698 nr_meta_pages = info->pages - info->image_pages - 1;
699 }
700 return error;
701}
702
703/**
704 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
705 * the PBEs in the list starting at @pbe
706 */
707
708static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
709 struct pbe *pbe)
710{
711 int j;
712
713 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
714 pbe->orig_address = buf[j];
715 pbe = pbe->next;
716 }
717 return pbe;
718}
719
720/**
721 * create_image - use metadata contained in the PBE list
722 * pointed to by pagedir_nosave to mark the pages that will
723 * be overwritten in the process of restoring the system
724 * memory state from the image and allocate memory for
725 * the image avoiding these pages
726 */
727
728static int create_image(struct snapshot_handle *handle)
729{
730 int error = 0;
731 struct pbe *p, *pblist;
732
733 p = pagedir_nosave;
734 error = mark_unsafe_pages(p);
735 if (!error) {
736 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
737 if (pblist)
738 copy_page_backup_list(pblist, p);
739 free_pagedir(p);
740 if (!pblist)
741 error = -ENOMEM;
742 }
743 if (!error)
744 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
745 if (!error) {
746 release_eaten_pages();
747 pagedir_nosave = pblist;
748 } else {
749 pagedir_nosave = NULL;
750 handle->pbe = NULL;
751 nr_copy_pages = 0;
752 nr_meta_pages = 0;
753 }
754 return error;
755}
756
757/**
758 * snapshot_write_next - used for writing the system memory snapshot.
759 *
760 * On the first call to it @handle should point to a zeroed
761 * snapshot_handle structure. The structure gets updated and a pointer
 762 * to it should be passed to this function on each subsequent call.
763 *
764 * The @count parameter should contain the number of bytes the caller
765 * wants to write to the image. It must not be zero.
766 *
767 * On success the function returns a positive number. Then, the caller
768 * is allowed to write up to the returned number of bytes to the memory
769 * location computed by the data_of() macro. The number returned
770 * may be smaller than @count, but this only happens if the write would
771 * cross a page boundary otherwise.
772 *
773 * The function returns 0 to indicate the "end of file" condition,
774 * and a negative number is returned on error. In such cases the
775 * structure pointed to by @handle is not updated and should not be used
776 * any more.
777 */
778
779int snapshot_write_next(struct snapshot_handle *handle, size_t count)
780{
781 int error = 0;
782
783 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
784 return 0;
785 if (!buffer) {
786 /* This makes the buffer be freed by swsusp_free() */
787 buffer = alloc_image_page(GFP_ATOMIC, 0);
788 if (!buffer)
789 return -ENOMEM;
790 }
791 if (!handle->offset)
792 handle->buffer = buffer;
793 if (handle->prev < handle->page) {
794 if (!handle->prev) {
795 error = load_header(handle, (struct swsusp_info *)buffer);
796 if (error)
797 return error;
798 } else if (handle->prev <= nr_meta_pages) {
799 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
800 if (!handle->pbe) {
801 error = create_image(handle);
802 if (error)
803 return error;
804 handle->pbe = pagedir_nosave;
805 handle->buffer = (void *)handle->pbe->address;
806 }
807 } else {
808 handle->pbe = handle->pbe->next;
809 handle->buffer = (void *)handle->pbe->address;
810 }
811 handle->prev = handle->page;
812 }
813 handle->buf_offset = handle->page_offset;
814 if (handle->page_offset + count >= PAGE_SIZE) {
815 count = PAGE_SIZE - handle->page_offset;
816 handle->page_offset = 0;
817 handle->page++;
818 } else {
819 handle->page_offset += count;
820 }
821 handle->offset += count;
822 return count;
823}
824
825int snapshot_image_loaded(struct snapshot_handle *handle)
826{
827 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
828 handle->page <= nr_meta_pages + nr_copy_pages);
829}
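
snapshot_read_next() and snapshot_write_next() turn the in-memory image into a byte stream consumed at most one page at a time. A minimal sketch of the read-side loop; this is essentially the pattern save_image() in the new kernel/power/swap.c follows below, and store_page() is a hypothetical sink, not a kernel function:

static int example_dump_image(struct snapshot_handle *handle)
{
        int ret;

        memset(handle, 0, sizeof(*handle));     /* the first call expects a zeroed handle */
        while ((ret = snapshot_read_next(handle, PAGE_SIZE)) > 0) {
                /* data_of(*handle) points at up to 'ret' bytes of image data */
                ret = store_page(data_of(*handle), ret);        /* hypothetical sink */
                if (ret)
                        return ret;
        }
        return ret;     /* 0 at end of stream, negative on error */
}
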
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 000000000000..9177f3f73a6c
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,544 @@
1/*
2 * linux/kernel/power/swap.c
3 *
4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition.
6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 *
10 * This file is released under the GPLv2.
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/version.h>
19#include <linux/delay.h>
20#include <linux/bitops.h>
21#include <linux/genhd.h>
22#include <linux/device.h>
23#include <linux/buffer_head.h>
24#include <linux/bio.h>
25#include <linux/swap.h>
26#include <linux/swapops.h>
27#include <linux/pm.h>
28
29#include "power.h"
30
31extern char resume_file[];
32
33#define SWSUSP_SIG "S1SUSPEND"
34
35static struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
37 swp_entry_t image;
38 char orig_sig[10];
39 char sig[10];
40} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
41
42/*
43 * Saving part...
44 */
45
46static unsigned short root_swap = 0xffff;
47
48static int mark_swapfiles(swp_entry_t start)
49{
50 int error;
51
52 rw_swap_page_sync(READ,
53 swp_entry(root_swap, 0),
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE,
61 swp_entry(root_swap, 0),
62 virt_to_page((unsigned long)
63 &swsusp_header));
64 } else {
65 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV;
67 }
68 return error;
69}
70
71/**
72 * swsusp_swap_check - check if the resume device is a swap device
73 * and get its index (if so)
74 */
75
76static int swsusp_swap_check(void) /* This is called before saving image */
77{
78 int res = swap_type_of(swsusp_resume_device);
79
80 if (res >= 0) {
81 root_swap = res;
82 return 0;
83 }
84 return res;
85}
86
87/**
88 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to.
91 */
92
93static int write_page(void *buf, unsigned long offset)
94{
95 swp_entry_t entry;
96 int error = -ENOSPC;
97
98 if (offset) {
99 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
101 }
102 return error;
103}
104
105/*
106 * The swap map is a data structure used for keeping track of each page
107 * written to a swap partition. It consists of many swap_map_page
108 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
109 * These structures are stored on the swap and linked together with the
110 * help of the .next_swap member.
111 *
112 * The swap map is created during suspend. The swap map pages are
113 * allocated and populated one at a time, so we only need one memory
114 * page to set up the entire structure.
115 *
116 * During resume we also only need to use one swap_map_page structure
117 * at a time.
118 */
119
120#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
121
122struct swap_map_page {
123 unsigned long entries[MAP_PAGE_ENTRIES];
124 unsigned long next_swap;
125};
126
127/**
128 * The swap_map_handle structure is used for handling swap in
129 * a file-alike way
130 */
131
132struct swap_map_handle {
133 struct swap_map_page *cur;
134 unsigned long cur_swap;
135 struct bitmap_page *bitmap;
136 unsigned int k;
137};
138
139static void release_swap_writer(struct swap_map_handle *handle)
140{
141 if (handle->cur)
142 free_page((unsigned long)handle->cur);
143 handle->cur = NULL;
144 if (handle->bitmap)
145 free_bitmap(handle->bitmap);
146 handle->bitmap = NULL;
147}
148
149static int get_swap_writer(struct swap_map_handle *handle)
150{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
152 if (!handle->cur)
153 return -ENOMEM;
154 handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
155 if (!handle->bitmap) {
156 release_swap_writer(handle);
157 return -ENOMEM;
158 }
159 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
160 if (!handle->cur_swap) {
161 release_swap_writer(handle);
162 return -ENOSPC;
163 }
164 handle->k = 0;
165 return 0;
166}
167
168static int swap_write_page(struct swap_map_handle *handle, void *buf)
169{
170 int error;
171 unsigned long offset;
172
173 if (!handle->cur)
174 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset);
177 if (error)
178 return error;
179 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) {
181 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset)
183 return -ENOSPC;
184 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap);
186 if (error)
187 return error;
188 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset;
190 handle->k = 0;
191 }
192 return 0;
193}
194
195static int flush_swap_writer(struct swap_map_handle *handle)
196{
197 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap);
199 else
200 return -EINVAL;
201}
202
203/**
204 * save_image - save the suspend image data
205 */
206
207static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot,
209 unsigned int nr_pages)
210{
211 unsigned int m;
212 int ret;
213 int error = 0;
214
215 printk("Saving image data pages (%u pages) ... ", nr_pages);
216 m = nr_pages / 100;
217 if (!m)
218 m = 1;
219 nr_pages = 0;
220 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot));
224 if (error)
225 break;
226 if (!(nr_pages % m))
227 printk("\b\b\b\b%3d%%", nr_pages / m);
228 nr_pages++;
229 }
230 } while (ret > 0);
231 if (!error)
232 printk("\b\b\b\bdone\n");
233 return error;
234}
235
236/**
237 * enough_swap - Make sure we have enough swap to save the image.
238 *
239 * Returns TRUE or FALSE after checking the total amount of swap
 240 * space available from the resume partition.
241 */
242
243static int enough_swap(unsigned int nr_pages)
244{
245 unsigned int free_swap = count_swap_pages(root_swap, 1);
246
247 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO +
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250}
251
252/**
253 * swsusp_write - Write entire image and metadata.
254 *
255 * It is important _NOT_ to umount filesystems at this point. We want
256 * them synced (in case something goes wrong) but we DO not want to mark
257 * filesystem clean: it is not. (And it does not matter, if we resume
258 * correctly, we'll mark system clean, anyway.)
259 */
260
261int swsusp_write(void)
262{
263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot;
265 struct swsusp_info *header;
266 unsigned long start;
267 int error;
268
269 if ((error = swsusp_swap_check())) {
270 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
271 return error;
272 }
273 memset(&snapshot, 0, sizeof(struct snapshot_handle));
274 error = snapshot_read_next(&snapshot, PAGE_SIZE);
275 if (error < PAGE_SIZE)
276 return error < 0 ? error : -EFAULT;
277 header = (struct swsusp_info *)data_of(snapshot);
278 if (!enough_swap(header->pages)) {
279 printk(KERN_ERR "swsusp: Not enough free swap\n");
280 return -ENOSPC;
281 }
282 error = get_swap_writer(&handle);
283 if (!error) {
284 start = handle.cur_swap;
285 error = swap_write_page(&handle, header);
286 }
287 if (!error)
288 error = save_image(&handle, &snapshot, header->pages - 1);
289 if (!error) {
290 flush_swap_writer(&handle);
291 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n");
294 }
295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap);
297 release_swap_writer(&handle);
298 return error;
299}
300
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
 304 * but it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 panic("I/O error reading memory image");
316 atomic_set(&io_done, 0);
317 return 0;
318}
319
320static struct block_device *resume_bdev;
321
322/**
323 * submit - submit BIO request.
324 * @rw: READ or WRITE.
 325 * @page_off: physical offset of page.
326 * @page: page we're reading or writing.
327 *
328 * Straight from the textbook - allocate and initialize the bio.
329 * If we're writing, make sure the page is marked as dirty.
330 * Then submit it and wait.
331 */
332
333static int submit(int rw, pgoff_t page_off, void *page)
334{
335 int error = 0;
336 struct bio *bio;
337
338 bio = bio_alloc(GFP_ATOMIC, 1);
339 if (!bio)
340 return -ENOMEM;
341 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
342 bio->bi_bdev = resume_bdev;
343 bio->bi_end_io = end_io;
344
345 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
346 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
347 error = -EFAULT;
348 goto Done;
349 }
350
351 atomic_set(&io_done, 1);
352 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
353 while (atomic_read(&io_done))
354 yield();
355 if (rw == READ)
356 bio_set_pages_dirty(bio);
357 Done:
358 bio_put(bio);
359 return error;
360}
361
362static int bio_read_page(pgoff_t page_off, void *page)
363{
364 return submit(READ, page_off, page);
365}
366
367static int bio_write_page(pgoff_t page_off, void *page)
368{
369 return submit(WRITE, page_off, page);
370}
371
372/**
373 * The following functions allow us to read data using a swap map
374 * in a file-alike way
375 */
376
377static void release_swap_reader(struct swap_map_handle *handle)
378{
379 if (handle->cur)
380 free_page((unsigned long)handle->cur);
381 handle->cur = NULL;
382}
383
384static int get_swap_reader(struct swap_map_handle *handle,
385 swp_entry_t start)
386{
387 int error;
388
389 if (!swp_offset(start))
390 return -EINVAL;
391 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
392 if (!handle->cur)
393 return -ENOMEM;
394 error = bio_read_page(swp_offset(start), handle->cur);
395 if (error) {
396 release_swap_reader(handle);
397 return error;
398 }
399 handle->k = 0;
400 return 0;
401}
402
403static int swap_read_page(struct swap_map_handle *handle, void *buf)
404{
405 unsigned long offset;
406 int error;
407
408 if (!handle->cur)
409 return -EINVAL;
410 offset = handle->cur->entries[handle->k];
411 if (!offset)
412 return -EFAULT;
413 error = bio_read_page(offset, buf);
414 if (error)
415 return error;
416 if (++handle->k >= MAP_PAGE_ENTRIES) {
417 handle->k = 0;
418 offset = handle->cur->next_swap;
419 if (!offset)
420 release_swap_reader(handle);
421 else
422 error = bio_read_page(offset, handle->cur);
423 }
424 return error;
425}
426
427/**
428 * load_image - load the image using the swap map handle
429 * @handle and the snapshot handle @snapshot
430 * (assume there are @nr_pages pages to load)
431 */
432
433static int load_image(struct swap_map_handle *handle,
434 struct snapshot_handle *snapshot,
435 unsigned int nr_pages)
436{
437 unsigned int m;
438 int ret;
439 int error = 0;
440
441 printk("Loading image data pages (%u pages) ... ", nr_pages);
442 m = nr_pages / 100;
443 if (!m)
444 m = 1;
445 nr_pages = 0;
446 do {
447 ret = snapshot_write_next(snapshot, PAGE_SIZE);
448 if (ret > 0) {
449 error = swap_read_page(handle, data_of(*snapshot));
450 if (error)
451 break;
452 if (!(nr_pages % m))
453 printk("\b\b\b\b%3d%%", nr_pages / m);
454 nr_pages++;
455 }
456 } while (ret > 0);
457 if (!error)
458 printk("\b\b\b\bdone\n");
459 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA;
461 return error;
462}
463
464int swsusp_read(void)
465{
466 int error;
467 struct swap_map_handle handle;
468 struct snapshot_handle snapshot;
469 struct swsusp_info *header;
470
471 if (IS_ERR(resume_bdev)) {
472 pr_debug("swsusp: block device not initialised\n");
473 return PTR_ERR(resume_bdev);
474 }
475
476 memset(&snapshot, 0, sizeof(struct snapshot_handle));
477 error = snapshot_write_next(&snapshot, PAGE_SIZE);
478 if (error < PAGE_SIZE)
479 return error < 0 ? error : -EFAULT;
480 header = (struct swsusp_info *)data_of(snapshot);
481 error = get_swap_reader(&handle, swsusp_header.image);
482 if (!error)
483 error = swap_read_page(&handle, header);
484 if (!error)
485 error = load_image(&handle, &snapshot, header->pages - 1);
486 release_swap_reader(&handle);
487
488 blkdev_put(resume_bdev);
489
490 if (!error)
491 pr_debug("swsusp: Reading resume file was successful\n");
492 else
493 pr_debug("swsusp: Error %d resuming\n", error);
494 return error;
495}
496
497/**
498 * swsusp_check - Check for swsusp signature in the resume device
499 */
500
501int swsusp_check(void)
502{
503 int error;
504
505 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
506 if (!IS_ERR(resume_bdev)) {
507 set_blocksize(resume_bdev, PAGE_SIZE);
508 memset(&swsusp_header, 0, sizeof(swsusp_header));
509 if ((error = bio_read_page(0, &swsusp_header)))
510 return error;
511 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
512 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
513 /* Reset swap signature now */
514 error = bio_write_page(0, &swsusp_header);
515 } else {
516 return -EINVAL;
517 }
518 if (error)
519 blkdev_put(resume_bdev);
520 else
521 pr_debug("swsusp: Signature found, resuming\n");
522 } else {
523 error = PTR_ERR(resume_bdev);
524 }
525
526 if (error)
527 pr_debug("swsusp: Error %d check for resume file\n", error);
528
529 return error;
530}
531
532/**
533 * swsusp_close - close swap device.
534 */
535
536void swsusp_close(void)
537{
538 if (IS_ERR(resume_bdev)) {
539 pr_debug("swsusp: block device not initialised\n");
540 return;
541 }
542
543 blkdev_put(resume_bdev);
544}
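
For a sense of scale of the swap map introduced above, each map page covers roughly 2 MB of image data with the usual page and word sizes; a short worked calculation under that assumption:

/* Assuming 4 KB pages and 8-byte longs (e.g. x86-64):
 *   MAP_PAGE_ENTRIES = PAGE_SIZE / sizeof(long) - 1 = 4096 / 8 - 1 = 511
 * so one swap_map_page indexes 511 image pages (about 2 MB), and the map
 * itself adds only one extra swap page per 511 data pages (~0.2% overhead).
 */
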
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d9d08f72f76..c4016cbbd3e0 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Rafael J. Wysocki <rjw@sisk.pl> 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added the swap map data structure and reworked the handling of swap 34 * Reworked the freeing of memory and the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */ 39 */
40 40
41#include <linux/module.h>
42#include <linux/mm.h> 41#include <linux/mm.h>
43#include <linux/suspend.h> 42#include <linux/suspend.h>
44#include <linux/smp_lock.h>
45#include <linux/file.h>
46#include <linux/utsname.h>
47#include <linux/version.h>
48#include <linux/delay.h>
49#include <linux/bitops.h>
50#include <linux/spinlock.h> 43#include <linux/spinlock.h>
51#include <linux/genhd.h>
52#include <linux/kernel.h> 44#include <linux/kernel.h>
53#include <linux/major.h> 45#include <linux/major.h>
54#include <linux/swap.h> 46#include <linux/swap.h>
55#include <linux/pm.h> 47#include <linux/pm.h>
56#include <linux/device.h>
57#include <linux/buffer_head.h>
58#include <linux/swapops.h> 48#include <linux/swapops.h>
59#include <linux/bootmem.h> 49#include <linux/bootmem.h>
60#include <linux/syscalls.h> 50#include <linux/syscalls.h>
61#include <linux/highmem.h> 51#include <linux/highmem.h>
62#include <linux/bio.h>
63
64#include <asm/uaccess.h>
65#include <asm/mmu_context.h>
66#include <asm/pgtable.h>
67#include <asm/tlbflush.h>
68#include <asm/io.h>
69 52
70#include "power.h" 53#include "power.h"
71 54
@@ -77,6 +60,8 @@
77 */ 60 */
78unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size = 500 * 1024 * 1024;
79 62
63int in_suspend __nosavedata = 0;
64
80#ifdef CONFIG_HIGHMEM 65#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void); 66unsigned int count_highmem_pages(void);
82int save_highmem(void); 67int save_highmem(void);
@@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; } 72static unsigned int count_highmem_pages(void) { return 0; }
88#endif 73#endif
89 74
90extern char resume_file[];
91
92#define SWSUSP_SIG "S1SUSPEND"
93
94static struct swsusp_header {
95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
96 swp_entry_t image;
97 char orig_sig[10];
98 char sig[10];
99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
100
101static struct swsusp_info swsusp_info;
102
103/*
104 * Saving part...
105 */
106
107static unsigned short root_swap = 0xffff;
108
109static int mark_swapfiles(swp_entry_t start)
110{
111 int error;
112
113 rw_swap_page_sync(READ,
114 swp_entry(root_swap, 0),
115 virt_to_page((unsigned long)&swsusp_header));
116 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
120 swsusp_header.image = start;
121 error = rw_swap_page_sync(WRITE,
122 swp_entry(root_swap, 0),
123 virt_to_page((unsigned long)
124 &swsusp_header));
125 } else {
126 pr_debug("swsusp: Partition is not swap space.\n");
127 error = -ENODEV;
128 }
129 return error;
130}
131
132/*
133 * Check whether the swap device is the specified resume
134 * device, irrespective of whether they are specified by
135 * identical names.
136 *
137 * (Thus, device inode aliasing is allowed. You can say /dev/hda4
138 * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
139 * and they'll be considered the same device. This is *necessary* for
140 * devfs, since the resume code can only recognize the form /dev/hda4,
141 * but the suspend code would see the long name.)
142 */
143static inline int is_resume_device(const struct swap_info_struct *swap_info)
144{
145 struct file *file = swap_info->swap_file;
146 struct inode *inode = file->f_dentry->d_inode;
147
148 return S_ISBLK(inode->i_mode) &&
149 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
150}
151
152static int swsusp_swap_check(void) /* This is called before saving image */
153{
154 int i;
155
156 spin_lock(&swap_lock);
157 for (i = 0; i < MAX_SWAPFILES; i++) {
158 if (!(swap_info[i].flags & SWP_WRITEOK))
159 continue;
160 if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
161 spin_unlock(&swap_lock);
162 root_swap = i;
163 return 0;
164 }
165 }
166 spin_unlock(&swap_lock);
167 return -ENODEV;
168}
169
170/**
171 * write_page - Write one page to a fresh swap location.
172 * @addr: Address we're writing.
173 * @loc: Place to store the entry we used.
174 *
175 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
176 * errors. That is an artifact left over from swsusp. It did not
177 * check the return of rw_swap_page_sync() at all, since most pages
178 * written back to swap would return -EIO.
179 * This is a partial improvement, since we will at least return other
180 * errors, though we need to eventually fix the damn code.
181 */
182static int write_page(unsigned long addr, swp_entry_t *loc)
183{
184 swp_entry_t entry;
185 int error = -ENOSPC;
186
187 entry = get_swap_page_of_type(root_swap);
188 if (swp_offset(entry)) {
189 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
190 if (!error || error == -EIO)
191 *loc = entry;
192 }
193 return error;
194}
195
196/** 75/**
 197 * Swap map-handling functions 76 * The following functions are used for tracking the allocated
198 * 77 * swap pages, so that they can be freed in case of an error.
199 * The swap map is a data structure used for keeping track of each page
200 * written to the swap. It consists of many swap_map_page structures
201 * that contain each an array of MAP_PAGE_SIZE swap entries.
202 * These structures are linked together with the help of either the
203 * .next (in memory) or the .next_swap (in swap) member.
204 * 78 *
205 * The swap map is created during suspend. At that time we need to keep 79 * The functions operate on a linked bitmap structure defined
206 * it in memory, because we have to free all of the allocated swap 80 * in power.h
207 * entries if an error occurs. The memory needed is preallocated
208 * so that we know in advance if there's enough of it.
209 *
210 * The first swap_map_page structure is filled with the swap entries that
211 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
212 * so on. After the all of the data pages have been written, the order
213 * of the swap_map_page structures in the map is reversed so that they
214 * can be read from swap in the original order. This causes the data
215 * pages to be loaded in exactly the same order in which they have been
216 * saved.
217 *
218 * During resume we only need to use one swap_map_page structure
219 * at a time, which means that we only need to use two memory pages for
220 * reading the image - one for reading the swap_map_page structures
221 * and the second for reading the data pages from swap.
222 */ 81 */
223 82
224#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ 83void free_bitmap(struct bitmap_page *bitmap)
225 / sizeof(swp_entry_t))
226
227struct swap_map_page {
228 swp_entry_t entries[MAP_PAGE_SIZE];
229 swp_entry_t next_swap;
230 struct swap_map_page *next;
231};
232
233static inline void free_swap_map(struct swap_map_page *swap_map)
234{ 84{
235 struct swap_map_page *swp; 85 struct bitmap_page *bp;
236 86
237 while (swap_map) { 87 while (bitmap) {
238 swp = swap_map->next; 88 bp = bitmap->next;
239 free_page((unsigned long)swap_map); 89 free_page((unsigned long)bitmap);
240 swap_map = swp; 90 bitmap = bp;
241 } 91 }
242} 92}
243 93
244static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) 94struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
245{ 95{
246 struct swap_map_page *swap_map, *swp; 96 struct bitmap_page *bitmap, *bp;
247 unsigned n = 0; 97 unsigned int n;
248 98
249 if (!nr_pages) 99 if (!nr_bits)
250 return NULL; 100 return NULL;
251 101
252 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); 102 bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
253 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 103 bp = bitmap;
254 swp = swap_map; 104 for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
255 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { 105 bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
256 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 106 bp = bp->next;
257 swp = swp->next; 107 if (!bp) {
258 if (!swp) { 108 free_bitmap(bitmap);
259 free_swap_map(swap_map);
260 return NULL; 109 return NULL;
261 } 110 }
262 } 111 }
263 return swap_map; 112 return bitmap;
264} 113}
265 114
266/** 115static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
267 * reverse_swap_map - reverse the order of pages in the swap map
268 * @swap_map
269 */
270
271static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
272{
273 struct swap_map_page *prev, *next;
274
275 prev = NULL;
276 while (swap_map) {
277 next = swap_map->next;
278 swap_map->next = prev;
279 prev = swap_map;
280 swap_map = next;
281 }
282 return prev;
283}
284
285/**
286 * free_swap_map_entries - free the swap entries allocated to store
287 * the swap map @swap_map (this is only called in case of an error)
288 */
289static inline void free_swap_map_entries(struct swap_map_page *swap_map)
290{
291 while (swap_map) {
292 if (swap_map->next_swap.val)
293 swap_free(swap_map->next_swap);
294 swap_map = swap_map->next;
295 }
296}
297
298/**
299 * save_swap_map - save the swap map used for tracing the data pages
300 * stored in the swap
301 */
302
303static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
304{
305 swp_entry_t entry = (swp_entry_t){0};
306 int error;
307
308 while (swap_map) {
309 swap_map->next_swap = entry;
310 if ((error = write_page((unsigned long)swap_map, &entry)))
311 return error;
312 swap_map = swap_map->next;
313 }
314 *start = entry;
315 return 0;
316}
317
318/**
319 * free_image_entries - free the swap entries allocated to store
320 * the image data pages (this is only called in case of an error)
321 */
322
323static inline void free_image_entries(struct swap_map_page *swp)
324{ 116{
325 unsigned k; 117 unsigned int n;
326 118
327 while (swp) { 119 n = BITMAP_PAGE_BITS;
328 for (k = 0; k < MAP_PAGE_SIZE; k++) 120 while (bitmap && n <= bit) {
329 if (swp->entries[k].val) 121 n += BITMAP_PAGE_BITS;
330 swap_free(swp->entries[k]); 122 bitmap = bitmap->next;
331 swp = swp->next;
332 } 123 }
333} 124 if (!bitmap)
334 125 return -EINVAL;
335/** 126 n -= BITMAP_PAGE_BITS;
336 * The swap_map_handle structure is used for handling the swap map in 127 bit -= n;
337 * a file-alike way 128 n = 0;
338 */ 129 while (bit >= BITS_PER_CHUNK) {
339 130 bit -= BITS_PER_CHUNK;
340struct swap_map_handle { 131 n++;
341 struct swap_map_page *cur;
342 unsigned int k;
343};
344
345static inline void init_swap_map_handle(struct swap_map_handle *handle,
346 struct swap_map_page *map)
347{
348 handle->cur = map;
349 handle->k = 0;
350}
351
352static inline int swap_map_write_page(struct swap_map_handle *handle,
353 unsigned long addr)
354{
355 int error;
356
357 error = write_page(addr, handle->cur->entries + handle->k);
358 if (error)
359 return error;
360 if (++handle->k >= MAP_PAGE_SIZE) {
361 handle->cur = handle->cur->next;
362 handle->k = 0;
363 } 132 }
133 bitmap->chunks[n] |= (1UL << bit);
364 return 0; 134 return 0;
365} 135}
366 136
367/** 137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
368 * save_image_data - save the data pages pointed to by the PBEs
369 * from the list @pblist using the swap map handle @handle
370 * (assume there are @nr_pages data pages to save)
371 */
372
373static int save_image_data(struct pbe *pblist,
374 struct swap_map_handle *handle,
375 unsigned int nr_pages)
376{
377 unsigned int m;
378 struct pbe *p;
379 int error = 0;
380
381 printk("Saving image data pages (%u pages) ... ", nr_pages);
382 m = nr_pages / 100;
383 if (!m)
384 m = 1;
385 nr_pages = 0;
386 for_each_pbe (p, pblist) {
387 error = swap_map_write_page(handle, p->address);
388 if (error)
389 break;
390 if (!(nr_pages % m))
391 printk("\b\b\b\b%3d%%", nr_pages / m);
392 nr_pages++;
393 }
394 if (!error)
395 printk("\b\b\b\bdone\n");
396 return error;
397}
398
399static void dump_info(void)
400{
401 pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
402 pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
403 pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
404 pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
405 pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
406 pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
407 pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
408 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
409 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
410 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
411 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
412}
413
414static void init_header(unsigned int nr_pages)
415{
416 memset(&swsusp_info, 0, sizeof(swsusp_info));
417 swsusp_info.version_code = LINUX_VERSION_CODE;
418 swsusp_info.num_physpages = num_physpages;
419 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
420
421 swsusp_info.cpus = num_online_cpus();
422 swsusp_info.image_pages = nr_pages;
423 swsusp_info.pages = nr_pages +
424 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
425}
426
427/**
428 * pack_orig_addresses - the .orig_address fields of the PBEs from the
429 * list starting at @pbe are stored in the array @buf[] (1 page)
430 */
431
432static inline struct pbe *pack_orig_addresses(unsigned long *buf,
433 struct pbe *pbe)
434{
435 int j;
436
437 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
438 buf[j] = pbe->orig_address;
439 pbe = pbe->next;
440 }
441 if (!pbe)
442 for (; j < PAGE_SIZE / sizeof(long); j++)
443 buf[j] = 0;
444 return pbe;
445}
446
447/**
448 * save_image_metadata - save the .orig_address fields of the PBEs
449 * from the list @pblist using the swap map handle @handle
450 */
451
452static int save_image_metadata(struct pbe *pblist,
453 struct swap_map_handle *handle)
454{ 138{
455 unsigned long *buf; 139 unsigned long offset;
456 unsigned int n = 0;
457 struct pbe *p;
458 int error = 0;
459 140
460 printk("Saving image metadata ... "); 141 offset = swp_offset(get_swap_page_of_type(swap));
461 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); 142 if (offset) {
462 if (!buf) 143 if (bitmap_set(bitmap, offset)) {
463 return -ENOMEM; 144 swap_free(swp_entry(swap, offset));
464 p = pblist; 145 offset = 0;
465 while (p) { 146 }
466 p = pack_orig_addresses(buf, p);
467 error = swap_map_write_page(handle, (unsigned long)buf);
468 if (error)
469 break;
470 n++;
471 } 147 }
472 free_page((unsigned long)buf); 148 return offset;
473 if (!error)
474 printk("done (%u pages saved)\n", n);
475 return error;
476} 149}
477 150
478/** 151void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
479 * enough_swap - Make sure we have enough swap to save the image.
480 *
481 * Returns TRUE or FALSE after checking the total amount of swap
482 * space available from the resume partition.
483 */
484
485static int enough_swap(unsigned int nr_pages)
486{ 152{
487 unsigned int free_swap = swap_info[root_swap].pages - 153 unsigned int bit, n;
488 swap_info[root_swap].inuse_pages; 154 unsigned long test;
489
490 pr_debug("swsusp: free swap pages: %u\n", free_swap);
491 return free_swap > (nr_pages + PAGES_FOR_IO +
492 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
493}
494 155
495/** 156 bit = 0;
496 * swsusp_write - Write entire image and metadata. 157 while (bitmap) {
497 * 158 for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
498 * It is important _NOT_ to umount filesystems at this point. We want 159 for (test = 1UL; test; test <<= 1) {
499 * them synced (in case something goes wrong) but we DO not want to mark 160 if (bitmap->chunks[n] & test)
500 * filesystem clean: it is not. (And it does not matter, if we resume 161 swap_free(swp_entry(swap, bit));
501 * correctly, we'll mark system clean, anyway.) 162 bit++;
502 */ 163 }
503 164 bitmap = bitmap->next;
504int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
505{
506 struct swap_map_page *swap_map;
507 struct swap_map_handle handle;
508 swp_entry_t start;
509 int error;
510
511 if ((error = swsusp_swap_check())) {
512 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
513 return error;
514 }
515 if (!enough_swap(nr_pages)) {
516 printk(KERN_ERR "swsusp: Not enough free swap\n");
517 return -ENOSPC;
518 } 165 }
519
520 init_header(nr_pages);
521 swap_map = alloc_swap_map(swsusp_info.pages);
522 if (!swap_map)
523 return -ENOMEM;
524 init_swap_map_handle(&handle, swap_map);
525
526 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
527 if (!error)
528 error = save_image_metadata(pblist, &handle);
529 if (!error)
530 error = save_image_data(pblist, &handle, nr_pages);
531 if (error)
532 goto Free_image_entries;
533
534 swap_map = reverse_swap_map(swap_map);
535 error = save_swap_map(swap_map, &start);
536 if (error)
537 goto Free_map_entries;
538
539 dump_info();
540 printk( "S" );
541 error = mark_swapfiles(start);
542 printk( "|\n" );
543 if (error)
544 goto Free_map_entries;
545
546Free_swap_map:
547 free_swap_map(swap_map);
548 return error;
549
550Free_map_entries:
551 free_swap_map_entries(swap_map);
552Free_image_entries:
553 free_image_entries(swap_map);
554 goto Free_swap_map;
555} 166}
556 167
557/** 168/**
@@ -660,379 +271,3 @@ int swsusp_resume(void)
660 local_irq_enable(); 271 local_irq_enable();
661 return error; 272 return error;
662} 273}
663
664/**
665 * mark_unsafe_pages - mark the pages that cannot be used for storing
666 * the image during resume, because they conflict with the pages that
667 * had been used before suspend
668 */
669
670static void mark_unsafe_pages(struct pbe *pblist)
671{
672 struct zone *zone;
673 unsigned long zone_pfn;
674 struct pbe *p;
675
676 if (!pblist) /* a sanity check */
677 return;
678
679 /* Clear page flags */
680 for_each_zone (zone) {
681 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
682 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
683 ClearPageNosaveFree(pfn_to_page(zone_pfn +
684 zone->zone_start_pfn));
685 }
686
687 /* Mark orig addresses */
688 for_each_pbe (p, pblist)
689 SetPageNosaveFree(virt_to_page(p->orig_address));
690
691}
692
693static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
694{
695 /* We assume both lists contain the same number of elements */
696 while (src) {
697 dst->orig_address = src->orig_address;
698 dst = dst->next;
699 src = src->next;
700 }
701}
702
703/*
704 * Using bio to read from swap.
705 * This code requires a bit more work than just using buffer heads
706 * but it is the recommended way for 2.5/2.6.
707 * The following are to signal the beginning and end of I/O. Bios
708 * finish asynchronously, while we want them to happen synchronously.
709 * A simple atomic_t, and a wait loop take care of this problem.
710 */
711
712static atomic_t io_done = ATOMIC_INIT(0);
713
714static int end_io(struct bio *bio, unsigned int num, int err)
715{
716 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
717 panic("I/O error reading memory image");
718 atomic_set(&io_done, 0);
719 return 0;
720}
721
722static struct block_device *resume_bdev;
723
724/**
725 * submit - submit BIO request.
726 * @rw: READ or WRITE.
727 * @off physical offset of page.
728 * @page: page we're reading or writing.
729 *
730 * Straight from the textbook - allocate and initialize the bio.
731 * If we're writing, make sure the page is marked as dirty.
732 * Then submit it and wait.
733 */
734
735static int submit(int rw, pgoff_t page_off, void *page)
736{
737 int error = 0;
738 struct bio *bio;
739
740 bio = bio_alloc(GFP_ATOMIC, 1);
741 if (!bio)
742 return -ENOMEM;
743 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
744 bio->bi_bdev = resume_bdev;
745 bio->bi_end_io = end_io;
746
747 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
748 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
749 error = -EFAULT;
750 goto Done;
751 }
752
753
754 atomic_set(&io_done, 1);
755 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
756 while (atomic_read(&io_done))
757 yield();
758 if (rw == READ)
759 bio_set_pages_dirty(bio);
760 Done:
761 bio_put(bio);
762 return error;
763}
764
765static int bio_read_page(pgoff_t page_off, void *page)
766{
767 return submit(READ, page_off, page);
768}
769
770static int bio_write_page(pgoff_t page_off, void *page)
771{
772 return submit(WRITE, page_off, page);
773}
774
775/**
776 * The following functions allow us to read data using a swap map
777 * in a file-like way
778 */
779
780static inline void release_swap_map_reader(struct swap_map_handle *handle)
781{
782 if (handle->cur)
783 free_page((unsigned long)handle->cur);
784 handle->cur = NULL;
785}
786
787static inline int get_swap_map_reader(struct swap_map_handle *handle,
788 swp_entry_t start)
789{
790 int error;
791
792 if (!swp_offset(start))
793 return -EINVAL;
794 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
795 if (!handle->cur)
796 return -ENOMEM;
797 error = bio_read_page(swp_offset(start), handle->cur);
798 if (error) {
799 release_swap_map_reader(handle);
800 return error;
801 }
802 handle->k = 0;
803 return 0;
804}
805
806static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
807{
808 unsigned long offset;
809 int error;
810
811 if (!handle->cur)
812 return -EINVAL;
813 offset = swp_offset(handle->cur->entries[handle->k]);
814 if (!offset)
815 return -EINVAL;
816 error = bio_read_page(offset, buf);
817 if (error)
818 return error;
819 if (++handle->k >= MAP_PAGE_SIZE) {
820 handle->k = 0;
821 offset = swp_offset(handle->cur->next_swap);
822 if (!offset)
823 release_swap_map_reader(handle);
824 else
825 error = bio_read_page(offset, handle->cur);
826 }
827 return error;
828}
829
830static int check_header(void)
831{
832 char *reason = NULL;
833
834 dump_info();
835 if (swsusp_info.version_code != LINUX_VERSION_CODE)
836 reason = "kernel version";
837 if (swsusp_info.num_physpages != num_physpages)
838 reason = "memory size";
839 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
840 reason = "system type";
841 if (strcmp(swsusp_info.uts.release,system_utsname.release))
842 reason = "kernel release";
843 if (strcmp(swsusp_info.uts.version,system_utsname.version))
844 reason = "version";
845 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
846 reason = "machine";
847 if (reason) {
848 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
849 return -EPERM;
850 }
851 return 0;
852}
853
854/**
855 * load_image_data - load the image data using the swap map handle
856 * @handle and store them using the page backup list @pblist
857 * (assume there are @nr_pages pages to load)
858 */
859
860static int load_image_data(struct pbe *pblist,
861 struct swap_map_handle *handle,
862 unsigned int nr_pages)
863{
864 int error;
865 unsigned int m;
866 struct pbe *p;
867
868 if (!pblist)
869 return -EINVAL;
870 printk("Loading image data pages (%u pages) ... ", nr_pages);
871 m = nr_pages / 100;
872 if (!m)
873 m = 1;
874 nr_pages = 0;
875 p = pblist;
876 while (p) {
877 error = swap_map_read_page(handle, (void *)p->address);
878 if (error)
879 break;
880 p = p->next;
881 if (!(nr_pages % m))
882 printk("\b\b\b\b%3d%%", nr_pages / m);
883 nr_pages++;
884 }
885 if (!error)
886 printk("\b\b\b\bdone\n");
887 return error;
888}
889
890/**
891 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
892 * the PBEs in the list starting at @pbe
893 */
894
895static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
896 struct pbe *pbe)
897{
898 int j;
899
900 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
901 pbe->orig_address = buf[j];
902 pbe = pbe->next;
903 }
904 return pbe;
905}
906
907/**
908 * load_image_metadata - load the image metadata using the swap map
909 * handle @handle and put them into the PBEs in the list @pblist
910 */
911
912static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
913{
914 struct pbe *p;
915 unsigned long *buf;
916 unsigned int n = 0;
917 int error = 0;
918
919 printk("Loading image metadata ... ");
920 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
921 if (!buf)
922 return -ENOMEM;
923 p = pblist;
924 while (p) {
925 error = swap_map_read_page(handle, buf);
926 if (error)
927 break;
928 p = unpack_orig_addresses(buf, p);
929 n++;
930 }
931 free_page((unsigned long)buf);
932 if (!error)
933 printk("done (%u pages loaded)\n", n);
934 return error;
935}
936
937int swsusp_read(struct pbe **pblist_ptr)
938{
939 int error;
940 struct pbe *p, *pblist;
941 struct swap_map_handle handle;
942 unsigned int nr_pages;
943
944 if (IS_ERR(resume_bdev)) {
945 pr_debug("swsusp: block device not initialised\n");
946 return PTR_ERR(resume_bdev);
947 }
948
949 error = get_swap_map_reader(&handle, swsusp_header.image);
950 if (!error)
951 error = swap_map_read_page(&handle, &swsusp_info);
952 if (!error)
953 error = check_header();
954 if (error)
955 return error;
956 nr_pages = swsusp_info.image_pages;
957 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
958 if (!p)
959 return -ENOMEM;
960 error = load_image_metadata(p, &handle);
961 if (!error) {
962 mark_unsafe_pages(p);
963 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
964 if (pblist)
965 copy_page_backup_list(pblist, p);
966 free_pagedir(p);
967 if (!pblist)
968 error = -ENOMEM;
969
970 /* Allocate memory for the image and read the data from swap */
971 if (!error)
972 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
973 if (!error) {
974 release_eaten_pages();
975 error = load_image_data(pblist, &handle, nr_pages);
976 }
977 if (!error)
978 *pblist_ptr = pblist;
979 }
980 release_swap_map_reader(&handle);
981
982 blkdev_put(resume_bdev);
983
984 if (!error)
985 pr_debug("swsusp: Reading resume file was successful\n");
986 else
987 pr_debug("swsusp: Error %d resuming\n", error);
988 return error;
989}
990
991/**
992 * swsusp_check - Check for swsusp signature in the resume device
993 */
994
995int swsusp_check(void)
996{
997 int error;
998
999 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1000 if (!IS_ERR(resume_bdev)) {
1001 set_blocksize(resume_bdev, PAGE_SIZE);
1002 memset(&swsusp_header, 0, sizeof(swsusp_header));
1003 if ((error = bio_read_page(0, &swsusp_header)))
1004 return error;
1005 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1006 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1007 /* Reset swap signature now */
1008 error = bio_write_page(0, &swsusp_header);
1009 } else {
1010 return -EINVAL;
1011 }
1012 if (error)
1013 blkdev_put(resume_bdev);
1014 else
1015 pr_debug("swsusp: Signature found, resuming\n");
1016 } else {
1017 error = PTR_ERR(resume_bdev);
1018 }
1019
1020 if (error)
1021 pr_debug("swsusp: Error %d check for resume file\n", error);
1022
1023 return error;
1024}
1025
1026/**
1027 * swsusp_close - close swap device.
1028 */
1029
1030void swsusp_close(void)
1031{
1032 if (IS_ERR(resume_bdev)) {
1033 pr_debug("swsusp: block device not initialised\n");
1034 return;
1035 }
1036
1037 blkdev_put(resume_bdev);
1038}
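
The bitmap helpers added on the right-hand side above (alloc_bitmap, bitmap_set, alloc_swap_page, free_all_swap_pages) keep a chained, page-sized bitmap of the swap offsets handed out for the image, so that every offset can be released again on error or when the snapshot device is closed. A minimal, self-contained user-space sketch of the same chained-bitmap idea follows; the node size, constants and helper names are chosen for the example and are not the kernel's definitions.

        /* Illustrative sketch of a chained bitmap like the one introduced above.
         * NODE_BITS, the struct layout and the helper names are assumptions
         * made for this example only. */
        #include <stdio.h>
        #include <stdlib.h>

        #define NODE_BITS   (4096u * 8u)                    /* bits tracked per node */
        #define CHUNK_BITS  (sizeof(unsigned long) * 8)
        #define NODE_CHUNKS (NODE_BITS / CHUNK_BITS)

        struct bitmap_node {
                struct bitmap_node *next;
                unsigned long chunks[NODE_CHUNKS];
        };

        static struct bitmap_node *bitmap_alloc(unsigned long nr_bits)
        {
                struct bitmap_node *head, *bp;
                unsigned long n;

                if (!nr_bits)
                        return NULL;
                head = calloc(1, sizeof(*head));
                bp = head;
                for (n = NODE_BITS; bp && n < nr_bits; n += NODE_BITS) {
                        bp->next = calloc(1, sizeof(*bp)); /* zeroed, like get_zeroed_page() */
                        bp = bp->next;
                }
                return head;
        }

        static int bitmap_set_bit(struct bitmap_node *bp, unsigned long bit)
        {
                /* Skip whole nodes first, then set the bit inside the right chunk. */
                while (bp && bit >= NODE_BITS) {
                        bit -= NODE_BITS;
                        bp = bp->next;
                }
                if (!bp)
                        return -1;                          /* offset not covered */
                bp->chunks[bit / CHUNK_BITS] |= 1UL << (bit % CHUNK_BITS);
                return 0;
        }

        int main(void)
        {
                struct bitmap_node *map = bitmap_alloc(3 * NODE_BITS);

                printf("bit in range:     %d\n", bitmap_set_bit(map, 40000));
                printf("bit out of range: %d\n", bitmap_set_bit(map, 10UL * NODE_BITS));
                return 0;
        }

Walking node by node keeps each allocation a single page, which is why bitmap_set() in the hunk above first skips whole BITMAP_PAGE_BITS-sized nodes and only then indexes into the chunks[] array; the kernel version additionally frees the partial chain and returns NULL if an allocation fails part-way through.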
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 000000000000..3f1539fbe48a
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,333 @@
1/*
2 * linux/kernel/power/user.c
3 *
4 * This file provides the user space interface for software suspend/resume.
5 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */
11
12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/string.h>
15#include <linux/device.h>
16#include <linux/miscdevice.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/swapops.h>
20#include <linux/pm.h>
21#include <linux/fs.h>
22
23#include <asm/uaccess.h>
24
25#include "power.h"
26
27#define SNAPSHOT_MINOR 231
28
29static struct snapshot_data {
30 struct snapshot_handle handle;
31 int swap;
32 struct bitmap_page *bitmap;
33 int mode;
34 char frozen;
35 char ready;
36} snapshot_state;
37
38static atomic_t device_available = ATOMIC_INIT(1);
39
40static int snapshot_open(struct inode *inode, struct file *filp)
41{
42 struct snapshot_data *data;
43
44 if (!atomic_add_unless(&device_available, -1, 0))
45 return -EBUSY;
46
47 if ((filp->f_flags & O_ACCMODE) == O_RDWR)
48 return -ENOSYS;
49
50 nonseekable_open(inode, filp);
51 data = &snapshot_state;
52 filp->private_data = data;
53 memset(&data->handle, 0, sizeof(struct snapshot_handle));
54 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
55 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
56 data->mode = O_RDONLY;
57 } else {
58 data->swap = -1;
59 data->mode = O_WRONLY;
60 }
61 data->bitmap = NULL;
62 data->frozen = 0;
63 data->ready = 0;
64
65 return 0;
66}
67
68static int snapshot_release(struct inode *inode, struct file *filp)
69{
70 struct snapshot_data *data;
71
72 swsusp_free();
73 data = filp->private_data;
74 free_all_swap_pages(data->swap, data->bitmap);
75 free_bitmap(data->bitmap);
76 if (data->frozen) {
77 down(&pm_sem);
78 thaw_processes();
79 enable_nonboot_cpus();
80 up(&pm_sem);
81 }
82 atomic_inc(&device_available);
83 return 0;
84}
85
86static ssize_t snapshot_read(struct file *filp, char __user *buf,
87 size_t count, loff_t *offp)
88{
89 struct snapshot_data *data;
90 ssize_t res;
91
92 data = filp->private_data;
93 res = snapshot_read_next(&data->handle, count);
94 if (res > 0) {
95 if (copy_to_user(buf, data_of(data->handle), res))
96 res = -EFAULT;
97 else
98 *offp = data->handle.offset;
99 }
100 return res;
101}
102
103static ssize_t snapshot_write(struct file *filp, const char __user *buf,
104 size_t count, loff_t *offp)
105{
106 struct snapshot_data *data;
107 ssize_t res;
108
109 data = filp->private_data;
110 res = snapshot_write_next(&data->handle, count);
111 if (res > 0) {
112 if (copy_from_user(data_of(data->handle), buf, res))
113 res = -EFAULT;
114 else
115 *offp = data->handle.offset;
116 }
117 return res;
118}
119
120static int snapshot_ioctl(struct inode *inode, struct file *filp,
121 unsigned int cmd, unsigned long arg)
122{
123 int error = 0;
124 struct snapshot_data *data;
125 loff_t offset, avail;
126
127 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
128 return -ENOTTY;
129 if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
130 return -ENOTTY;
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133
134 data = filp->private_data;
135
136 switch (cmd) {
137
138 case SNAPSHOT_FREEZE:
139 if (data->frozen)
140 break;
141 down(&pm_sem);
142 disable_nonboot_cpus();
143 if (freeze_processes()) {
144 thaw_processes();
145 enable_nonboot_cpus();
146 error = -EBUSY;
147 }
148 up(&pm_sem);
149 if (!error)
150 data->frozen = 1;
151 break;
152
153 case SNAPSHOT_UNFREEZE:
154 if (!data->frozen)
155 break;
156 down(&pm_sem);
157 thaw_processes();
158 enable_nonboot_cpus();
159 up(&pm_sem);
160 data->frozen = 0;
161 break;
162
163 case SNAPSHOT_ATOMIC_SNAPSHOT:
164 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
165 error = -EPERM;
166 break;
167 }
168 down(&pm_sem);
169 /* Free memory before shutting down devices. */
170 error = swsusp_shrink_memory();
171 if (!error) {
172 error = device_suspend(PMSG_FREEZE);
173 if (!error) {
174 in_suspend = 1;
175 error = swsusp_suspend();
176 device_resume();
177 }
178 }
179 up(&pm_sem);
180 if (!error)
181 error = put_user(in_suspend, (unsigned int __user *)arg);
182 if (!error)
183 data->ready = 1;
184 break;
185
186 case SNAPSHOT_ATOMIC_RESTORE:
187 if (data->mode != O_WRONLY || !data->frozen ||
188 !snapshot_image_loaded(&data->handle)) {
189 error = -EPERM;
190 break;
191 }
192 down(&pm_sem);
193 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE);
195 if (!error) {
196 error = swsusp_resume();
197 device_resume();
198 }
199 pm_restore_console();
200 up(&pm_sem);
201 break;
202
203 case SNAPSHOT_FREE:
204 swsusp_free();
205 memset(&data->handle, 0, sizeof(struct snapshot_handle));
206 data->ready = 0;
207 break;
208
209 case SNAPSHOT_SET_IMAGE_SIZE:
210 image_size = arg;
211 break;
212
213 case SNAPSHOT_AVAIL_SWAP:
214 avail = count_swap_pages(data->swap, 1);
215 avail <<= PAGE_SHIFT;
216 error = put_user(avail, (loff_t __user *)arg);
217 break;
218
219 case SNAPSHOT_GET_SWAP_PAGE:
220 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
221 error = -ENODEV;
222 break;
223 }
224 if (!data->bitmap) {
225 data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
226 if (!data->bitmap) {
227 error = -ENOMEM;
228 break;
229 }
230 }
231 offset = alloc_swap_page(data->swap, data->bitmap);
232 if (offset) {
233 offset <<= PAGE_SHIFT;
234 error = put_user(offset, (loff_t __user *)arg);
235 } else {
236 error = -ENOSPC;
237 }
238 break;
239
240 case SNAPSHOT_FREE_SWAP_PAGES:
241 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
242 error = -ENODEV;
243 break;
244 }
245 free_all_swap_pages(data->swap, data->bitmap);
246 free_bitmap(data->bitmap);
247 data->bitmap = NULL;
248 break;
249
250 case SNAPSHOT_SET_SWAP_FILE:
251 if (!data->bitmap) {
252 /*
253 * User space encodes device types as two-byte values,
254 * so we need to recode them
255 */
256 if (old_decode_dev(arg)) {
257 data->swap = swap_type_of(old_decode_dev(arg));
258 if (data->swap < 0)
259 error = -ENODEV;
260 } else {
261 data->swap = -1;
262 error = -EINVAL;
263 }
264 } else {
265 error = -EPERM;
266 }
267 break;
268
269 case SNAPSHOT_S2RAM:
270 if (!data->frozen) {
271 error = -EPERM;
272 break;
273 }
274
275 if (down_trylock(&pm_sem)) {
276 error = -EBUSY;
277 break;
278 }
279
280 if (pm_ops->prepare) {
281 error = pm_ops->prepare(PM_SUSPEND_MEM);
282 if (error)
283 goto OutS3;
284 }
285
286 /* Put devices to sleep */
287 error = device_suspend(PMSG_SUSPEND);
288 if (error) {
289 printk(KERN_ERR "Failed to suspend some devices.\n");
290 } else {
291 /* Enter S3, system is already frozen */
292 suspend_enter(PM_SUSPEND_MEM);
293
294 /* Wake up devices */
295 device_resume();
296 }
297
298 if (pm_ops->finish)
299 pm_ops->finish(PM_SUSPEND_MEM);
300
301OutS3:
302 up(&pm_sem);
303 break;
304
305 default:
306 error = -ENOTTY;
307
308 }
309
310 return error;
311}
312
313static struct file_operations snapshot_fops = {
314 .open = snapshot_open,
315 .release = snapshot_release,
316 .read = snapshot_read,
317 .write = snapshot_write,
318 .llseek = no_llseek,
319 .ioctl = snapshot_ioctl,
320};
321
322static struct miscdevice snapshot_device = {
323 .minor = SNAPSHOT_MINOR,
324 .name = "snapshot",
325 .fops = &snapshot_fops,
326};
327
328static int __init snapshot_device_init(void)
329{
330 return misc_register(&snapshot_device);
331};
332
333device_initcall(snapshot_device_init);
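
snapshot_open() above allows only one opener at a time: device_available starts at 1, atomic_add_unless(&device_available, -1, 0) claims the slot only while the counter is still non-zero, and snapshot_release() hands it back with atomic_inc(). A hedged user-space sketch of that single-open guard, using C11 atomics in place of the kernel's atomic_t; names here are illustrative only.

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static atomic_int device_available = 1;

        /* User-space equivalent of atomic_add_unless(&v, -1, 0). */
        static bool take_device(void)
        {
                int old = atomic_load(&device_available);

                while (old != 0) {
                        if (atomic_compare_exchange_weak(&device_available, &old, old - 1))
                                return true;        /* we got the single slot */
                }
                return false;                       /* already taken */
        }

        static void release_device(void)
        {
                atomic_fetch_add(&device_available, 1);   /* kernel: atomic_inc() */
        }

        int main(void)
        {
                printf("first open:  %s\n", take_device() ? "ok" : "busy");
                printf("second open: %s\n", take_device() ? "ok" : "busy");
                release_device();
                printf("after close: %s\n", take_device() ? "ok" : "busy");
                return 0;
        }

Returning false here corresponds to the -EBUSY the driver gives a second open() while the first holder has not yet released the device.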
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d704..ad81f799a9b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/mutex.h>
26#include <asm/sections.h> 27#include <asm/sections.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
46static DEFINE_PER_CPU(int, cpu_profile_flip); 47static DEFINE_PER_CPU(int, cpu_profile_flip);
47static DECLARE_MUTEX(profile_flip_mutex); 48static DEFINE_MUTEX(profile_flip_mutex);
48#endif /* CONFIG_SMP */ 49#endif /* CONFIG_SMP */
49 50
50static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
@@ -243,7 +244,7 @@ static void profile_flip_buffers(void)
243{ 244{
244 int i, j, cpu; 245 int i, j, cpu;
245 246
246 down(&profile_flip_mutex); 247 mutex_lock(&profile_flip_mutex);
247 j = per_cpu(cpu_profile_flip, get_cpu()); 248 j = per_cpu(cpu_profile_flip, get_cpu());
248 put_cpu(); 249 put_cpu();
249 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 250 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +260,14 @@ static void profile_flip_buffers(void)
259 hits[i].hits = hits[i].pc = 0; 260 hits[i].hits = hits[i].pc = 0;
260 } 261 }
261 } 262 }
262 up(&profile_flip_mutex); 263 mutex_unlock(&profile_flip_mutex);
263} 264}
264 265
265static void profile_discard_flip_buffers(void) 266static void profile_discard_flip_buffers(void)
266{ 267{
267 int i, cpu; 268 int i, cpu;
268 269
269 down(&profile_flip_mutex); 270 mutex_lock(&profile_flip_mutex);
270 i = per_cpu(cpu_profile_flip, get_cpu()); 271 i = per_cpu(cpu_profile_flip, get_cpu());
271 put_cpu(); 272 put_cpu();
272 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 273 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +275,7 @@ static void profile_discard_flip_buffers(void)
274 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 275 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
275 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 276 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
276 } 277 }
277 up(&profile_flip_mutex); 278 mutex_unlock(&profile_flip_mutex);
278} 279}
279 280
280void profile_hit(int type, void *__pc) 281void profile_hit(int type, void *__pc)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index fedf5e369755..6df1559b1c02 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,15 +47,16 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/mutex.h>
50 51
51/* Definition for rcupdate control block. */ 52/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
53 .cur = -300, 54 .cur = -300,
54 .completed = -300, 55 .completed = -300,
55 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = SPIN_LOCK_UNLOCKED,
56 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
57}; 58};
58struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 .cur = -300, 60 .cur = -300,
60 .completed = -300, 61 .completed = -300,
61 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = SPIN_LOCK_UNLOCKED,
@@ -75,7 +76,7 @@ static int rsinterval = 1000;
75#endif 76#endif
76 77
77static atomic_t rcu_barrier_cpu_count; 78static atomic_t rcu_barrier_cpu_count;
78static struct semaphore rcu_barrier_sema; 79static DEFINE_MUTEX(rcu_barrier_mutex);
79static struct completion rcu_barrier_completion; 80static struct completion rcu_barrier_completion;
80 81
81#ifdef CONFIG_SMP 82#ifdef CONFIG_SMP
@@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused)
207void rcu_barrier(void) 208void rcu_barrier(void)
208{ 209{
209 BUG_ON(in_interrupt()); 210 BUG_ON(in_interrupt());
210 /* Take cpucontrol semaphore to protect against CPU hotplug */ 211 /* Take cpucontrol mutex to protect against CPU hotplug */
211 down(&rcu_barrier_sema); 212 mutex_lock(&rcu_barrier_mutex);
212 init_completion(&rcu_barrier_completion); 213 init_completion(&rcu_barrier_completion);
213 atomic_set(&rcu_barrier_cpu_count, 0); 214 atomic_set(&rcu_barrier_cpu_count, 0);
214 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 215 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
215 wait_for_completion(&rcu_barrier_completion); 216 wait_for_completion(&rcu_barrier_completion);
216 up(&rcu_barrier_sema); 217 mutex_unlock(&rcu_barrier_mutex);
217} 218}
218EXPORT_SYMBOL_GPL(rcu_barrier); 219EXPORT_SYMBOL_GPL(rcu_barrier);
219 220
@@ -549,7 +550,6 @@ static struct notifier_block __devinitdata rcu_nb = {
549 */ 550 */
550void __init rcu_init(void) 551void __init rcu_init(void)
551{ 552{
552 sema_init(&rcu_barrier_sema, 1);
553 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 553 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
554 (void *)(long)smp_processor_id()); 554 (void *)(long)smp_processor_id());
555 /* Register notifier for non-boot CPUs */ 555 /* Register notifier for non-boot CPUs */
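
The rcupdate.c change above replaces a semaphore used as a mutex (rcu_barrier_sema, set up at runtime with sema_init() in rcu_init()) with a statically initialised mutex, which is why the init call disappears from rcu_init(). A user-space analogue of that statically initialised lock pattern with pthreads, shown only to illustrate why no setup call is needed; the names are invented for the example.

        #include <pthread.h>
        #include <stdio.h>

        /* Static initialisation: usable from the first call, no init function. */
        static pthread_mutex_t barrier_mutex = PTHREAD_MUTEX_INITIALIZER;

        static void barrier_section(const char *who)
        {
                pthread_mutex_lock(&barrier_mutex);     /* kernel: mutex_lock()   */
                printf("%s holds barrier_mutex\n", who);
                pthread_mutex_unlock(&barrier_mutex);   /* kernel: mutex_unlock() */
        }

        int main(void)
        {
                barrier_section("rcu_barrier caller");
                return 0;
        }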
diff --git a/kernel/sched.c b/kernel/sched.c
index 6b6e0d70eb30..7ffaabd64f89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -237,6 +237,7 @@ struct runqueue {
237 237
238 task_t *migration_thread; 238 task_t *migration_thread;
239 struct list_head migration_queue; 239 struct list_head migration_queue;
240 int cpu;
240#endif 241#endif
241 242
242#ifdef CONFIG_SCHEDSTATS 243#ifdef CONFIG_SCHEDSTATS
@@ -1654,6 +1655,9 @@ unsigned long nr_iowait(void)
1654/* 1655/*
1655 * double_rq_lock - safely lock two runqueues 1656 * double_rq_lock - safely lock two runqueues
1656 * 1657 *
1658 * We must take them in cpu order to match code in
1659 * dependent_sleeper and wake_dependent_sleeper.
1660 *
1657 * Note this does not disable interrupts like task_rq_lock, 1661 * Note this does not disable interrupts like task_rq_lock,
1658 * you need to do so manually before calling. 1662 * you need to do so manually before calling.
1659 */ 1663 */
@@ -1665,7 +1669,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1665 spin_lock(&rq1->lock); 1669 spin_lock(&rq1->lock);
1666 __acquire(rq2->lock); /* Fake it out ;) */ 1670 __acquire(rq2->lock); /* Fake it out ;) */
1667 } else { 1671 } else {
1668 if (rq1 < rq2) { 1672 if (rq1->cpu < rq2->cpu) {
1669 spin_lock(&rq1->lock); 1673 spin_lock(&rq1->lock);
1670 spin_lock(&rq2->lock); 1674 spin_lock(&rq2->lock);
1671 } else { 1675 } else {
@@ -1701,7 +1705,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1701 __acquires(this_rq->lock) 1705 __acquires(this_rq->lock)
1702{ 1706{
1703 if (unlikely(!spin_trylock(&busiest->lock))) { 1707 if (unlikely(!spin_trylock(&busiest->lock))) {
1704 if (busiest < this_rq) { 1708 if (busiest->cpu < this_rq->cpu) {
1705 spin_unlock(&this_rq->lock); 1709 spin_unlock(&this_rq->lock);
1706 spin_lock(&busiest->lock); 1710 spin_lock(&busiest->lock);
1707 spin_lock(&this_rq->lock); 1711 spin_lock(&this_rq->lock);
@@ -2869,7 +2873,7 @@ asmlinkage void __sched schedule(void)
2869 */ 2873 */
2870 if (likely(!current->exit_state)) { 2874 if (likely(!current->exit_state)) {
2871 if (unlikely(in_atomic())) { 2875 if (unlikely(in_atomic())) {
2872 printk(KERN_ERR "scheduling while atomic: " 2876 printk(KERN_ERR "BUG: scheduling while atomic: "
2873 "%s/0x%08x/%d\n", 2877 "%s/0x%08x/%d\n",
2874 current->comm, preempt_count(), current->pid); 2878 current->comm, preempt_count(), current->pid);
2875 dump_stack(); 2879 dump_stack();
@@ -6029,6 +6033,7 @@ void __init sched_init(void)
6029 rq->push_cpu = 0; 6033 rq->push_cpu = 0;
6030 rq->migration_thread = NULL; 6034 rq->migration_thread = NULL;
6031 INIT_LIST_HEAD(&rq->migration_queue); 6035 INIT_LIST_HEAD(&rq->migration_queue);
6036 rq->cpu = i;
6032#endif 6037#endif
6033 atomic_set(&rq->nr_iowait, 0); 6038 atomic_set(&rq->nr_iowait, 0);
6034 6039
@@ -6069,7 +6074,7 @@ void __might_sleep(char *file, int line)
6069 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6074 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6070 return; 6075 return;
6071 prev_jiffy = jiffies; 6076 prev_jiffy = jiffies;
6072 printk(KERN_ERR "Debug: sleeping function called from invalid" 6077 printk(KERN_ERR "BUG: sleeping function called from invalid"
6073 " context at %s:%d\n", file, line); 6078 " context at %s:%d\n", file, line);
6074 printk("in_atomic():%d, irqs_disabled():%d\n", 6079 printk("in_atomic():%d, irqs_disabled():%d\n",
6075 in_atomic(), irqs_disabled()); 6080 in_atomic(), irqs_disabled());
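
The double_rq_lock() and double_lock_balance() hunks above switch the lock-ordering rule from pointer comparison to the new rq->cpu field, so that both paths and dependent_sleeper()/wake_dependent_sleeper() agree on one acquisition order and cannot deadlock against each other. A small user-space sketch of ordering a lock pair by a stable index; the struct and helper names are illustrative, not kernel code.

        #include <pthread.h>
        #include <stdio.h>

        struct runqueue {
                int cpu;                       /* stable index used for ordering */
                pthread_mutex_t lock;
        };

        static void lock_pair(struct runqueue *a, struct runqueue *b)
        {
                if (a == b) {
                        pthread_mutex_lock(&a->lock);
                        return;
                }
                /* Lower cpu index first, regardless of argument order. */
                if (a->cpu > b->cpu) {
                        struct runqueue *tmp = a; a = b; b = tmp;
                }
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        }

        static void unlock_pair(struct runqueue *a, struct runqueue *b)
        {
                pthread_mutex_unlock(&a->lock);
                if (a != b)
                        pthread_mutex_unlock(&b->lock);
        }

        int main(void)
        {
                struct runqueue rq0 = { 0, PTHREAD_MUTEX_INITIALIZER };
                struct runqueue rq1 = { 1, PTHREAD_MUTEX_INITIALIZER };

                lock_pair(&rq1, &rq0);   /* same order as lock_pair(&rq0, &rq1) */
                printf("both runqueue locks held\n");
                unlock_pair(&rq1, &rq0);
                return 0;
        }

Two threads calling lock_pair() with the arguments swapped still take rq0's lock first, which is exactly the property the comment added to double_rq_lock() calls out.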
diff --git a/kernel/signal.c b/kernel/signal.c
index ea154104a00b..75f7341b0c39 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1922,6 +1922,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1922 sigset_t *mask = &current->blocked; 1922 sigset_t *mask = &current->blocked;
1923 int signr = 0; 1923 int signr = 0;
1924 1924
1925 try_to_freeze();
1926
1925relock: 1927relock:
1926 spin_lock_irq(&current->sighand->siglock); 1928 spin_lock_irq(&current->sighand->siglock);
1927 for (;;) { 1929 for (;;) {
@@ -2099,10 +2101,11 @@ long do_no_restart_syscall(struct restart_block *param)
2099int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2101int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2100{ 2102{
2101 int error; 2103 int error;
2102 sigset_t old_block;
2103 2104
2104 spin_lock_irq(&current->sighand->siglock); 2105 spin_lock_irq(&current->sighand->siglock);
2105 old_block = current->blocked; 2106 if (oldset)
2107 *oldset = current->blocked;
2108
2106 error = 0; 2109 error = 0;
2107 switch (how) { 2110 switch (how) {
2108 case SIG_BLOCK: 2111 case SIG_BLOCK:
@@ -2119,8 +2122,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2119 } 2122 }
2120 recalc_sigpending(); 2123 recalc_sigpending();
2121 spin_unlock_irq(&current->sighand->siglock); 2124 spin_unlock_irq(&current->sighand->siglock);
2122 if (oldset) 2125
2123 *oldset = old_block;
2124 return error; 2126 return error;
2125} 2127}
2126 2128
@@ -2307,7 +2309,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2307 2309
2308 timeout = schedule_timeout_interruptible(timeout); 2310 timeout = schedule_timeout_interruptible(timeout);
2309 2311
2310 try_to_freeze();
2311 spin_lock_irq(&current->sighand->siglock); 2312 spin_lock_irq(&current->sighand->siglock);
2312 sig = dequeue_signal(current, &these, &info); 2313 sig = dequeue_signal(current, &these, &info);
2313 current->blocked = current->real_blocked; 2314 current->blocked = current->real_blocked;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd5921d..d1b810782bc4 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
179#define BUILD_LOCK_OPS(op, locktype) \ 179#define BUILD_LOCK_OPS(op, locktype) \
180void __lockfunc _##op##_lock(locktype##_t *lock) \ 180void __lockfunc _##op##_lock(locktype##_t *lock) \
181{ \ 181{ \
182 preempt_disable(); \
183 for (;;) { \ 182 for (;;) { \
183 preempt_disable(); \
184 if (likely(_raw_##op##_trylock(lock))) \ 184 if (likely(_raw_##op##_trylock(lock))) \
185 break; \ 185 break; \
186 preempt_enable(); \ 186 preempt_enable(); \
187 \
187 if (!(lock)->break_lock) \ 188 if (!(lock)->break_lock) \
188 (lock)->break_lock = 1; \ 189 (lock)->break_lock = 1; \
189 while (!op##_can_lock(lock) && (lock)->break_lock) \ 190 while (!op##_can_lock(lock) && (lock)->break_lock) \
190 cpu_relax(); \ 191 cpu_relax(); \
191 preempt_disable(); \
192 } \ 192 } \
193 (lock)->break_lock = 0; \ 193 (lock)->break_lock = 0; \
194} \ 194} \
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
199{ \ 199{ \
200 unsigned long flags; \ 200 unsigned long flags; \
201 \ 201 \
202 preempt_disable(); \
203 for (;;) { \ 202 for (;;) { \
203 preempt_disable(); \
204 local_irq_save(flags); \ 204 local_irq_save(flags); \
205 if (likely(_raw_##op##_trylock(lock))) \ 205 if (likely(_raw_##op##_trylock(lock))) \
206 break; \ 206 break; \
207 local_irq_restore(flags); \ 207 local_irq_restore(flags); \
208 \
209 preempt_enable(); \ 208 preempt_enable(); \
209 \
210 if (!(lock)->break_lock) \ 210 if (!(lock)->break_lock) \
211 (lock)->break_lock = 1; \ 211 (lock)->break_lock = 1; \
212 while (!op##_can_lock(lock) && (lock)->break_lock) \ 212 while (!op##_can_lock(lock) && (lock)->break_lock) \
213 cpu_relax(); \ 213 cpu_relax(); \
214 preempt_disable(); \
215 } \ 214 } \
216 (lock)->break_lock = 0; \ 215 (lock)->break_lock = 0; \
217 return flags; \ 216 return flags; \
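
The BUILD_LOCK_OPS change above moves preempt_disable() inside the retry loop, so the busy-wait on break_lock and op##_can_lock() now runs with preemption enabled and only the trylock attempt itself is made non-preemptible. A rough user-space model of that loop shape, with C11 atomics standing in for the raw trylock and sched_yield() for cpu_relax(); the preempt_off flag is only a stand-in for the preempt counter.

        #include <sched.h>
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static atomic_int locked;                      /* 0 = free, 1 = held       */
        static _Thread_local bool preempt_off;         /* models preempt_disable() */

        static bool raw_trylock(void)
        {
                int expected = 0;
                return atomic_compare_exchange_strong(&locked, &expected, 1);
        }

        static void lock_with_preemptible_wait(void)
        {
                for (;;) {
                        preempt_off = true;            /* preempt_disable()             */
                        if (raw_trylock())
                                return;                /* success: stay non-preemptible */
                        preempt_off = false;           /* preempt_enable()              */

                        /* The wait happens here, where we may be scheduled away. */
                        while (atomic_load(&locked))
                                sched_yield();         /* kernel: cpu_relax()           */
                }
        }

        int main(void)
        {
                lock_with_preemptible_wait();
                printf("lock taken, preempt_off=%d\n", preempt_off);
                atomic_store(&locked, 0);              /* unlock */
                return 0;
        }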
diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a5463e..c0fcad9f826c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void)
1227 struct pid *pid; 1227 struct pid *pid;
1228 int err = -EPERM; 1228 int err = -EPERM;
1229 1229
1230 down(&tty_sem); 1230 mutex_lock(&tty_mutex);
1231 write_lock_irq(&tasklist_lock); 1231 write_lock_irq(&tasklist_lock);
1232 1232
1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void)
1241 err = process_group(group_leader); 1241 err = process_group(group_leader);
1242out: 1242out:
1243 write_unlock_irq(&tasklist_lock); 1243 write_unlock_irq(&tasklist_lock);
1244 up(&tty_sem); 1244 mutex_unlock(&tty_mutex);
1245 return err; 1245 return err;
1246} 1246}
1247 1247
@@ -1677,9 +1677,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1677 * a lot simpler! (Which we're not doing right now because we're not 1677 * a lot simpler! (Which we're not doing right now because we're not
1678 * measuring them yet). 1678 * measuring them yet).
1679 * 1679 *
1680 * This expects to be called with tasklist_lock read-locked or better,
1681 * and the siglock not locked. It may momentarily take the siglock.
1682 *
1683 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1680 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1684 * races with threads incrementing their own counters. But since word 1681 * races with threads incrementing their own counters. But since word
1685 * reads are atomic, we either get new values or old values and we don't 1682 * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1684,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1687 * the c* fields from p->signal from races with exit.c updating those 1684 * the c* fields from p->signal from races with exit.c updating those
1688 * fields when reaping, so a sample either gets all the additions of a 1685 * fields when reaping, so a sample either gets all the additions of a
1689 * given child after it's reaped, or none so this sample is before reaping. 1686 * given child after it's reaped, or none so this sample is before reaping.
1687 *
1688 * tasklist_lock locking optimisation:
1689 * If we are current and single threaded, we do not need to take the tasklist
1690 * lock or the siglock. No one else can take our signal_struct away,
1691 * no one else can reap the children to update signal->c* counters, and
1692 * no one else can race with the signal-> fields.
1693 * If we do not take the tasklist_lock, the signal-> fields could be read
1694 * out of order while another thread was just exiting. So we place a
1695 * read memory barrier when we avoid the lock. On the writer side,
1696 * write memory barrier is implied in __exit_signal as __exit_signal releases
1697 * the siglock spinlock after updating the signal-> fields.
1698 *
1699 * We don't really need the siglock when we access the non c* fields
1700 * of the signal_struct (for RUSAGE_SELF) even in multithreaded
1701 * case, since we take the tasklist lock for read and the non c* signal->
1702 * fields are updated only in __exit_signal, which is called with
1703 * tasklist_lock taken for write, hence these two threads cannot execute
1704 * concurrently.
1705 *
1690 */ 1706 */
1691 1707
1692static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1708static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
@@ -1694,13 +1710,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1694 struct task_struct *t; 1710 struct task_struct *t;
1695 unsigned long flags; 1711 unsigned long flags;
1696 cputime_t utime, stime; 1712 cputime_t utime, stime;
1713 int need_lock = 0;
1697 1714
1698 memset((char *) r, 0, sizeof *r); 1715 memset((char *) r, 0, sizeof *r);
1716 utime = stime = cputime_zero;
1699 1717
1700 if (unlikely(!p->signal)) 1718 if (p != current || !thread_group_empty(p))
1701 return; 1719 need_lock = 1;
1702 1720
1703 utime = stime = cputime_zero; 1721 if (need_lock) {
1722 read_lock(&tasklist_lock);
1723 if (unlikely(!p->signal)) {
1724 read_unlock(&tasklist_lock);
1725 return;
1726 }
1727 } else
1728 /* See locking comments above */
1729 smp_rmb();
1704 1730
1705 switch (who) { 1731 switch (who) {
1706 case RUSAGE_BOTH: 1732 case RUSAGE_BOTH:
@@ -1740,6 +1766,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1740 BUG(); 1766 BUG();
1741 } 1767 }
1742 1768
1769 if (need_lock)
1770 read_unlock(&tasklist_lock);
1743 cputime_to_timeval(utime, &r->ru_utime); 1771 cputime_to_timeval(utime, &r->ru_utime);
1744 cputime_to_timeval(stime, &r->ru_stime); 1772 cputime_to_timeval(stime, &r->ru_stime);
1745} 1773}
@@ -1747,9 +1775,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1747int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1775int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1748{ 1776{
1749 struct rusage r; 1777 struct rusage r;
1750 read_lock(&tasklist_lock);
1751 k_getrusage(p, who, &r); 1778 k_getrusage(p, who, &r);
1752 read_unlock(&tasklist_lock);
1753 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1779 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1754} 1780}
1755 1781
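
The k_getrusage() rework above implements the locking comment added to sys.c: a single-threaded task sampling its own counters skips tasklist_lock entirely and issues only a read barrier, while every other case still takes the lock for read. A hedged user-space sketch of that "lock only when someone can race with us" decision; the struct and field names are made up for the example.

        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        struct accounting {
                pthread_rwlock_t lock;
                long utime, stime;
                int nr_threads;                 /* >1 means others may update the fields */
        };

        static void sample(struct accounting *acct, bool self_only, long *u, long *s)
        {
                /* Lock needed unless we sample ourselves and are single threaded. */
                bool need_lock = !self_only || acct->nr_threads > 1;

                if (need_lock)
                        pthread_rwlock_rdlock(&acct->lock);
                else
                        atomic_thread_fence(memory_order_acquire);  /* kernel: smp_rmb() */

                *u = acct->utime;
                *s = acct->stime;

                if (need_lock)
                        pthread_rwlock_unlock(&acct->lock);
        }

        int main(void)
        {
                struct accounting acct = {
                        .lock = PTHREAD_RWLOCK_INITIALIZER,
                        .utime = 120, .stime = 30, .nr_threads = 1,
                };
                long u, s;

                sample(&acct, true, &u, &s);
                printf("utime=%ld stime=%ld (lock-free fast path)\n", u, s);
                return 0;
        }

The lock-free path is safe only because, as the new comment in the hunk explains, the writer side updates those fields under the siglock and its release provides the pairing write barrier.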