author     Jeff Garzik <jeff@garzik.org>   2006-03-23 17:13:43 -0500
committer  Jeff Garzik <jeff@garzik.org>   2006-03-23 17:13:43 -0500
commit     88e3c1da8b3258a81c5c81d4e7e22557b7d71ba7 (patch)
tree       ab518773c0ff4606f1a57d00b5931332a7e1d96e /kernel
parent     fa4fa40a990f8f4eff65476bef32007c154bbac0 (diff)
parent     b0e6e962992b76580f4900b166a337bad7c1e81b (diff)
Merge branch 'master'
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c         |   2
-rw-r--r--  kernel/cpuset.c          | 212
-rw-r--r--  kernel/exit.c            |   4
-rw-r--r--  kernel/fork.c            |   8
-rw-r--r--  kernel/kprobes.c         |  14
-rw-r--r--  kernel/kthread.c         |   7
-rw-r--r--  kernel/module.c          |  53
-rw-r--r--  kernel/panic.c           |  97
-rw-r--r--  kernel/posix-timers.c    |   1
-rw-r--r--  kernel/power/Makefile    |   2
-rw-r--r--  kernel/power/disk.c      |  20
-rw-r--r--  kernel/power/main.c      |   2
-rw-r--r--  kernel/power/pm.c        |  21
-rw-r--r--  kernel/power/power.h     |  75
-rw-r--r--  kernel/power/process.c   |  61
-rw-r--r--  kernel/power/snapshot.c  | 335
-rw-r--r--  kernel/power/swap.c      | 544
-rw-r--r--  kernel/power/swsusp.c    | 887
-rw-r--r--  kernel/power/user.c      | 333
-rw-r--r--  kernel/profile.c         |  11
-rw-r--r--  kernel/rcupdate.c        |  14
-rw-r--r--  kernel/sched.c           |  13
-rw-r--r--  kernel/signal.c          |  11
-rw-r--r--  kernel/spinlock.c        |   9
-rw-r--r--  kernel/sys.c             |  46
25 files changed, 1706 insertions, 1076 deletions
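
Most of the kernel/ changes this merge pulls in follow one mechanical pattern: binary semaphores used as locks are converted to the then-new mutex API, so DECLARE_MUTEX/down/up become DEFINE_MUTEX/mutex_lock/mutex_unlock plus a <linux/mutex.h> include. A minimal sketch of the before/after shape, with foo_mutex and foo_critical_section as purely illustrative names not taken from the patch:

#include <linux/mutex.h>

/* old form was: static DECLARE_MUTEX(foo_sem);  -- a semaphore initialized to 1 */
static DEFINE_MUTEX(foo_mutex);

static void foo_critical_section(void)
{
        mutex_lock(&foo_mutex);         /* was: down(&foo_sem); */
        /* ... code that must run under the lock ... */
        mutex_unlock(&foo_mutex);       /* was: up(&foo_sem);   */
}

The mutex variant blocks the same way but enforces owner-must-unlock semantics and hooks into the mutex debugging infrastructure introduced alongside the API.
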
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7e7e637b92a..c4394abcd5e6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -958,7 +958,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
958 * 958 *
959 * i386 no 959 * i386 no
960 * x86_64 no 960 * x86_64 no
961 * ppc64 yes (see arch/ppc64/kernel/misc.S) 961 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
962 * 962 *
963 * This also happens with vm86 emulation in a non-nested manner 963 * This also happens with vm86 emulation in a non-nested manner
964 * (entries without exits), so this case must be caught. 964 * (entries without exits), so this case must be caught.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a05..c86ee051b734 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -53,7 +53,7 @@
53 53
54#include <asm/uaccess.h> 54#include <asm/uaccess.h>
55#include <asm/atomic.h> 55#include <asm/atomic.h>
56#include <asm/semaphore.h> 56#include <linux/mutex.h>
57 57
58#define CPUSET_SUPER_MAGIC 0x27e0eb 58#define CPUSET_SUPER_MAGIC 0x27e0eb
59 59
@@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount;
168static struct super_block *cpuset_sb; 168static struct super_block *cpuset_sb;
169 169
170/* 170/*
171 * We have two global cpuset semaphores below. They can nest. 171 * We have two global cpuset mutexes below. They can nest.
172 * It is ok to first take manage_sem, then nest callback_sem. We also 172 * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 * require taking task_lock() when dereferencing a tasks cpuset pointer. 173 * require taking task_lock() when dereferencing a tasks cpuset pointer.
174 * See "The task_lock() exception", at the end of this comment. 174 * See "The task_lock() exception", at the end of this comment.
175 * 175 *
176 * A task must hold both semaphores to modify cpusets. If a task 176 * A task must hold both mutexes to modify cpusets. If a task
177 * holds manage_sem, then it blocks others wanting that semaphore, 177 * holds manage_mutex, then it blocks others wanting that mutex,
178 * ensuring that it is the only task able to also acquire callback_sem 178 * ensuring that it is the only task able to also acquire callback_mutex
179 * and be able to modify cpusets. It can perform various checks on 179 * and be able to modify cpusets. It can perform various checks on
180 * the cpuset structure first, knowing nothing will change. It can 180 * the cpuset structure first, knowing nothing will change. It can
181 * also allocate memory while just holding manage_sem. While it is 181 * also allocate memory while just holding manage_mutex. While it is
182 * performing these checks, various callback routines can briefly 182 * performing these checks, various callback routines can briefly
183 * acquire callback_sem to query cpusets. Once it is ready to make 183 * acquire callback_mutex to query cpusets. Once it is ready to make
184 * the changes, it takes callback_sem, blocking everyone else. 184 * the changes, it takes callback_mutex, blocking everyone else.
185 * 185 *
186 * Calls to the kernel memory allocator can not be made while holding 186 * Calls to the kernel memory allocator can not be made while holding
187 * callback_sem, as that would risk double tripping on callback_sem 187 * callback_mutex, as that would risk double tripping on callback_mutex
188 * from one of the callbacks into the cpuset code from within 188 * from one of the callbacks into the cpuset code from within
189 * __alloc_pages(). 189 * __alloc_pages().
190 * 190 *
191 * If a task is only holding callback_sem, then it has read-only 191 * If a task is only holding callback_mutex, then it has read-only
192 * access to cpusets. 192 * access to cpusets.
193 * 193 *
194 * The task_struct fields mems_allowed and mems_generation may only 194 * The task_struct fields mems_allowed and mems_generation may only
195 * be accessed in the context of that task, so require no locks. 195 * be accessed in the context of that task, so require no locks.
196 * 196 *
197 * Any task can increment and decrement the count field without lock. 197 * Any task can increment and decrement the count field without lock.
198 * So in general, code holding manage_sem or callback_sem can't rely 198 * So in general, code holding manage_mutex or callback_mutex can't rely
199 * on the count field not changing. However, if the count goes to 199 * on the count field not changing. However, if the count goes to
200 * zero, then only attach_task(), which holds both semaphores, can 200 * zero, then only attach_task(), which holds both mutexes, can
201 * increment it again. Because a count of zero means that no tasks 201 * increment it again. Because a count of zero means that no tasks
202 * are currently attached, therefore there is no way a task attached 202 * are currently attached, therefore there is no way a task attached
203 * to that cpuset can fork (the other way to increment the count). 203 * to that cpuset can fork (the other way to increment the count).
204 * So code holding manage_sem or callback_sem can safely assume that 204 * So code holding manage_mutex or callback_mutex can safely assume that
205 * if the count is zero, it will stay zero. Similarly, if a task 205 * if the count is zero, it will stay zero. Similarly, if a task
206 * holds manage_sem or callback_sem on a cpuset with zero count, it 206 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs 207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208 * both of those semaphores. 208 * both of those mutexes.
209 *
210 * A possible optimization to improve parallelism would be to make
211 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212 * to proceed in parallel, with read access, until the holder of
213 * manage_sem needed to take this rwsem for exclusive write access
214 * and modify some cpusets.
215 * 209 *
216 * The cpuset_common_file_write handler for operations that modify 210 * The cpuset_common_file_write handler for operations that modify
217 * the cpuset hierarchy holds manage_sem across the entire operation, 211 * the cpuset hierarchy holds manage_mutex across the entire operation,
218 * single threading all such cpuset modifications across the system. 212 * single threading all such cpuset modifications across the system.
219 * 213 *
220 * The cpuset_common_file_read() handlers only hold callback_sem across 214 * The cpuset_common_file_read() handlers only hold callback_mutex across
221 * small pieces of code, such as when reading out possibly multi-word 215 * small pieces of code, such as when reading out possibly multi-word
222 * cpumasks and nodemasks. 216 * cpumasks and nodemasks.
223 * 217 *
224 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 218 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225 * (usually) take either semaphore. These are the two most performance 219 * (usually) take either mutex. These are the two most performance
226 * critical pieces of code here. The exception occurs on cpuset_exit(), 220 * critical pieces of code here. The exception occurs on cpuset_exit(),
227 * when a task in a notify_on_release cpuset exits. Then manage_sem 221 * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 * is taken, and if the cpuset count is zero, a usermode call made 222 * is taken, and if the cpuset count is zero, a usermode call made
229 * to /sbin/cpuset_release_agent with the name of the cpuset (path 223 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 * relative to the root of cpuset file system) as the argument. 224 * relative to the root of cpuset file system) as the argument.
@@ -242,9 +236,9 @@ static struct super_block *cpuset_sb;
242 * 236 *
243 * The need for this exception arises from the action of attach_task(), 237 * The need for this exception arises from the action of attach_task(),
244 * which overwrites one tasks cpuset pointer with another. It does 238 * which overwrites one tasks cpuset pointer with another. It does
245 * so using both semaphores, however there are several performance 239 * so using both mutexes, however there are several performance
246 * critical places that need to reference task->cpuset without the 240 * critical places that need to reference task->cpuset without the
247 * expense of grabbing a system global semaphore. Therefore except as 241 * expense of grabbing a system global mutex. Therefore except as
248 * noted below, when dereferencing or, as in attach_task(), modifying 242 * noted below, when dereferencing or, as in attach_task(), modifying
249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 243 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
250 * (task->alloc_lock) already in the task_struct routinely used for 244 * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +250,8 @@ static struct super_block *cpuset_sb;
256 * the routine cpuset_update_task_memory_state(). 250 * the routine cpuset_update_task_memory_state().
257 */ 251 */
258 252
259static DECLARE_MUTEX(manage_sem); 253static DEFINE_MUTEX(manage_mutex);
260static DECLARE_MUTEX(callback_sem); 254static DEFINE_MUTEX(callback_mutex);
261 255
262/* 256/*
263 * A couple of forward declarations required, due to cyclic reference loop: 257 * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
432} 426}
433 427
434/* 428/*
435 * Call with manage_sem held. Writes path of cpuset into buf. 429 * Call with manage_mutex held. Writes path of cpuset into buf.
436 * Returns 0 on success, -errno on error. 430 * Returns 0 on success, -errno on error.
437 */ 431 */
438 432
@@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
484 * status of the /sbin/cpuset_release_agent task, so no sense holding 478 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 * our caller up for that. 479 * our caller up for that.
486 * 480 *
487 * When we had only one cpuset semaphore, we had to call this 481 * When we had only one cpuset mutex, we had to call this
488 * without holding it, to avoid deadlock when call_usermodehelper() 482 * without holding it, to avoid deadlock when call_usermodehelper()
489 * allocated memory. With two locks, we could now call this while 483 * allocated memory. With two locks, we could now call this while
490 * holding manage_sem, but we still don't, so as to minimize 484 * holding manage_mutex, but we still don't, so as to minimize
491 * the time manage_sem is held. 485 * the time manage_mutex is held.
492 */ 486 */
493 487
494static void cpuset_release_agent(const char *pathbuf) 488static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf)
520 * cs is notify_on_release() and now both the user count is zero and 514 * cs is notify_on_release() and now both the user count is zero and
521 * the list of children is empty, prepare cpuset path in a kmalloc'd 515 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 * buffer, to be returned via ppathbuf, so that the caller can invoke 516 * buffer, to be returned via ppathbuf, so that the caller can invoke
523 * cpuset_release_agent() with it later on, once manage_sem is dropped. 517 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
524 * Call here with manage_sem held. 518 * Call here with manage_mutex held.
525 * 519 *
526 * This check_for_release() routine is responsible for kmalloc'ing 520 * This check_for_release() routine is responsible for kmalloc'ing
527 * pathbuf. The above cpuset_release_agent() is responsible for 521 * pathbuf. The above cpuset_release_agent() is responsible for
528 * kfree'ing pathbuf. The caller of these routines is responsible 522 * kfree'ing pathbuf. The caller of these routines is responsible
529 * for providing a pathbuf pointer, initialized to NULL, then 523 * for providing a pathbuf pointer, initialized to NULL, then
530 * calling check_for_release() with manage_sem held and the address 524 * calling check_for_release() with manage_mutex held and the address
531 * of the pathbuf pointer, then dropping manage_sem, then calling 525 * of the pathbuf pointer, then dropping manage_mutex, then calling
532 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 526 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 */ 527 */
534 528
@@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
559 * One way or another, we guarantee to return some non-empty subset 553 * One way or another, we guarantee to return some non-empty subset
560 * of cpu_online_map. 554 * of cpu_online_map.
561 * 555 *
562 * Call with callback_sem held. 556 * Call with callback_mutex held.
563 */ 557 */
564 558
565static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 559static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
583 * One way or another, we guarantee to return some non-empty subset 577 * One way or another, we guarantee to return some non-empty subset
584 * of node_online_map. 578 * of node_online_map.
585 * 579 *
586 * Call with callback_sem held. 580 * Call with callback_mutex held.
587 */ 581 */
588 582
589static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 583static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
608 * current->cpuset if a task has its memory placement changed. 602 * current->cpuset if a task has its memory placement changed.
609 * Do not call this routine if in_interrupt(). 603 * Do not call this routine if in_interrupt().
610 * 604 *
611 * Call without callback_sem or task_lock() held. May be called 605 * Call without callback_mutex or task_lock() held. May be called
612 * with or without manage_sem held. Doesn't need task_lock to guard 606 * with or without manage_mutex held. Doesn't need task_lock to guard
613 * against another task changing a non-NULL cpuset pointer to NULL, 607 * against another task changing a non-NULL cpuset pointer to NULL,
614 * as that is only done by a task on itself, and if the current task 608 * as that is only done by a task on itself, and if the current task
615 * is here, it is not simultaneously in the exit code NULL'ing its 609 * is here, it is not simultaneously in the exit code NULL'ing its
616 * cpuset pointer. This routine also might acquire callback_sem and 610 * cpuset pointer. This routine also might acquire callback_mutex and
617 * current->mm->mmap_sem during call. 611 * current->mm->mmap_sem during call.
618 * 612 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock 613 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void)
658 } 652 }
659 653
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 654 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661 down(&callback_sem); 655 mutex_lock(&callback_mutex);
662 task_lock(tsk); 656 task_lock(tsk);
663 cs = tsk->cpuset; /* Maybe changed when task not locked */ 657 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 guarantee_online_mems(cs, &tsk->mems_allowed); 658 guarantee_online_mems(cs, &tsk->mems_allowed);
665 tsk->cpuset_mems_generation = cs->mems_generation; 659 tsk->cpuset_mems_generation = cs->mems_generation;
666 task_unlock(tsk); 660 task_unlock(tsk);
667 up(&callback_sem); 661 mutex_unlock(&callback_mutex);
668 mpol_rebind_task(tsk, &tsk->mems_allowed); 662 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 } 663 }
670} 664}
@@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void)
674 * 668 *
675 * One cpuset is a subset of another if all its allowed CPUs and 669 * One cpuset is a subset of another if all its allowed CPUs and
676 * Memory Nodes are a subset of the other, and its exclusive flags 670 * Memory Nodes are a subset of the other, and its exclusive flags
677 * are only set if the other's are set. Call holding manage_sem. 671 * are only set if the other's are set. Call holding manage_mutex.
678 */ 672 */
679 673
680static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 674static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
692 * If we replaced the flag and mask values of the current cpuset 686 * If we replaced the flag and mask values of the current cpuset
693 * (cur) with those values in the trial cpuset (trial), would 687 * (cur) with those values in the trial cpuset (trial), would
694 * our various subset and exclusive rules still be valid? Presumes 688 * our various subset and exclusive rules still be valid? Presumes
695 * manage_sem held. 689 * manage_mutex held.
696 * 690 *
697 * 'cur' is the address of an actual, in-use cpuset. Operations 691 * 'cur' is the address of an actual, in-use cpuset. Operations
698 * such as list traversal that depend on the actual address of the 692 * such as list traversal that depend on the actual address of the
@@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
746 * exclusive child cpusets 740 * exclusive child cpusets
747 * Build these two partitions by calling partition_sched_domains 741 * Build these two partitions by calling partition_sched_domains
748 * 742 *
749 * Call with manage_sem held. May nest a call to the 743 * Call with manage_mutex held. May nest a call to the
750 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 744 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 */ 745 */
752 746
@@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur)
792} 786}
793 787
794/* 788/*
795 * Call with manage_sem held. May take callback_sem during call. 789 * Call with manage_mutex held. May take callback_mutex during call.
796 */ 790 */
797 791
798static int update_cpumask(struct cpuset *cs, char *buf) 792static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (retval < 0) 805 if (retval < 0)
812 return retval; 806 return retval;
813 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 807 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814 down(&callback_sem); 808 mutex_lock(&callback_mutex);
815 cs->cpus_allowed = trialcs.cpus_allowed; 809 cs->cpus_allowed = trialcs.cpus_allowed;
816 up(&callback_sem); 810 mutex_unlock(&callback_mutex);
817 if (is_cpu_exclusive(cs) && !cpus_unchanged) 811 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 update_cpu_domains(cs); 812 update_cpu_domains(cs);
819 return 0; 813 return 0;
@@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
827 * the cpuset is marked 'memory_migrate', migrate the tasks 821 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory. 822 * pages to the new memory.
829 * 823 *
830 * Call with manage_sem held. May take callback_sem during call. 824 * Call with manage_mutex held. May take callback_mutex during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 825 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 826 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed. 827 * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
862 if (retval < 0) 856 if (retval < 0)
863 goto done; 857 goto done;
864 858
865 down(&callback_sem); 859 mutex_lock(&callback_mutex);
866 cs->mems_allowed = trialcs.mems_allowed; 860 cs->mems_allowed = trialcs.mems_allowed;
867 atomic_inc(&cpuset_mems_generation); 861 atomic_inc(&cpuset_mems_generation);
868 cs->mems_generation = atomic_read(&cpuset_mems_generation); 862 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869 up(&callback_sem); 863 mutex_unlock(&callback_mutex);
870 864
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 865 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 866
@@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 * tasklist_lock. Forks can happen again now - the mpol_copy() 916 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind 917 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global 918 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will 919 * cpuset manage_mutex, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound. 920 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 921 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes. 922 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -948,7 +942,7 @@ done:
948} 942}
949 943
950/* 944/*
951 * Call with manage_sem held. 945 * Call with manage_mutex held.
952 */ 946 */
953 947
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 948static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
967 * cs: the cpuset to update 961 * cs: the cpuset to update
968 * buf: the buffer where we read the 0 or 1 962 * buf: the buffer where we read the 0 or 1
969 * 963 *
970 * Call with manage_sem held. 964 * Call with manage_mutex held.
971 */ 965 */
972 966
973static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 967static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
989 return err; 983 return err;
990 cpu_exclusive_changed = 984 cpu_exclusive_changed =
991 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 985 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992 down(&callback_sem); 986 mutex_lock(&callback_mutex);
993 if (turning_on) 987 if (turning_on)
994 set_bit(bit, &cs->flags); 988 set_bit(bit, &cs->flags);
995 else 989 else
996 clear_bit(bit, &cs->flags); 990 clear_bit(bit, &cs->flags);
997 up(&callback_sem); 991 mutex_unlock(&callback_mutex);
998 992
999 if (cpu_exclusive_changed) 993 if (cpu_exclusive_changed)
1000 update_cpu_domains(cs); 994 update_cpu_domains(cs);
@@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1098 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 * notified on release. 1099 * notified on release.
1106 * 1100 *
1107 * Call holding manage_sem. May take callback_sem and task_lock of 1101 * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 * the task 'pid' during call. 1102 * the task 'pid' during call.
1109 */ 1103 */
1110 1104
@@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1144 get_task_struct(tsk); 1138 get_task_struct(tsk);
1145 } 1139 }
1146 1140
1147 down(&callback_sem); 1141 mutex_lock(&callback_mutex);
1148 1142
1149 task_lock(tsk); 1143 task_lock(tsk);
1150 oldcs = tsk->cpuset; 1144 oldcs = tsk->cpuset;
1151 if (!oldcs) { 1145 if (!oldcs) {
1152 task_unlock(tsk); 1146 task_unlock(tsk);
1153 up(&callback_sem); 1147 mutex_unlock(&callback_mutex);
1154 put_task_struct(tsk); 1148 put_task_struct(tsk);
1155 return -ESRCH; 1149 return -ESRCH;
1156 } 1150 }
@@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1164 from = oldcs->mems_allowed; 1158 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed; 1159 to = cs->mems_allowed;
1166 1160
1167 up(&callback_sem); 1161 mutex_unlock(&callback_mutex);
1168 1162
1169 mm = get_task_mm(tsk); 1163 mm = get_task_mm(tsk);
1170 if (mm) { 1164 if (mm) {
@@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1221 } 1215 }
1222 buffer[nbytes] = 0; /* nul-terminate */ 1216 buffer[nbytes] = 0; /* nul-terminate */
1223 1217
1224 down(&manage_sem); 1218 mutex_lock(&manage_mutex);
1225 1219
1226 if (is_removed(cs)) { 1220 if (is_removed(cs)) {
1227 retval = -ENODEV; 1221 retval = -ENODEV;
@@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1264 if (retval == 0) 1258 if (retval == 0)
1265 retval = nbytes; 1259 retval = nbytes;
1266out2: 1260out2:
1267 up(&manage_sem); 1261 mutex_unlock(&manage_mutex);
1268 cpuset_release_agent(pathbuf); 1262 cpuset_release_agent(pathbuf);
1269out1: 1263out1:
1270 kfree(buffer); 1264 kfree(buffer);
@@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1304{ 1298{
1305 cpumask_t mask; 1299 cpumask_t mask;
1306 1300
1307 down(&callback_sem); 1301 mutex_lock(&callback_mutex);
1308 mask = cs->cpus_allowed; 1302 mask = cs->cpus_allowed;
1309 up(&callback_sem); 1303 mutex_unlock(&callback_mutex);
1310 1304
1311 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1305 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312} 1306}
@@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1315{ 1309{
1316 nodemask_t mask; 1310 nodemask_t mask;
1317 1311
1318 down(&callback_sem); 1312 mutex_lock(&callback_mutex);
1319 mask = cs->mems_allowed; 1313 mask = cs->mems_allowed;
1320 up(&callback_sem); 1314 mutex_unlock(&callback_mutex);
1321 1315
1322 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1316 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323} 1317}
@@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1598 * Handle an open on 'tasks' file. Prepare a buffer listing the 1592 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 * process id's of tasks currently attached to the cpuset being opened. 1593 * process id's of tasks currently attached to the cpuset being opened.
1600 * 1594 *
1601 * Does not require any specific cpuset semaphores, and does not take any. 1595 * Does not require any specific cpuset mutexes, and does not take any.
1602 */ 1596 */
1603static int cpuset_tasks_open(struct inode *unused, struct file *file) 1597static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604{ 1598{
@@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1754 * name: name of the new cpuset. Will be strcpy'ed. 1748 * name: name of the new cpuset. Will be strcpy'ed.
1755 * mode: mode to set on new inode 1749 * mode: mode to set on new inode
1756 * 1750 *
1757 * Must be called with the semaphore on the parent inode held 1751 * Must be called with the mutex on the parent inode held
1758 */ 1752 */
1759 1753
1760static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1754static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1766 if (!cs) 1760 if (!cs)
1767 return -ENOMEM; 1761 return -ENOMEM;
1768 1762
1769 down(&manage_sem); 1763 mutex_lock(&manage_mutex);
1770 cpuset_update_task_memory_state(); 1764 cpuset_update_task_memory_state();
1771 cs->flags = 0; 1765 cs->flags = 0;
1772 if (notify_on_release(parent)) 1766 if (notify_on_release(parent))
@@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1782 1776
1783 cs->parent = parent; 1777 cs->parent = parent;
1784 1778
1785 down(&callback_sem); 1779 mutex_lock(&callback_mutex);
1786 list_add(&cs->sibling, &cs->parent->children); 1780 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++; 1781 number_of_cpusets++;
1788 up(&callback_sem); 1782 mutex_unlock(&callback_mutex);
1789 1783
1790 err = cpuset_create_dir(cs, name, mode); 1784 err = cpuset_create_dir(cs, name, mode);
1791 if (err < 0) 1785 if (err < 0)
1792 goto err; 1786 goto err;
1793 1787
1794 /* 1788 /*
1795 * Release manage_sem before cpuset_populate_dir() because it 1789 * Release manage_mutex before cpuset_populate_dir() because it
1796 * will down() this new directory's i_mutex and if we race with 1790 * will down() this new directory's i_mutex and if we race with
1797 * another mkdir, we might deadlock. 1791 * another mkdir, we might deadlock.
1798 */ 1792 */
1799 up(&manage_sem); 1793 mutex_unlock(&manage_mutex);
1800 1794
1801 err = cpuset_populate_dir(cs->dentry); 1795 err = cpuset_populate_dir(cs->dentry);
1802 /* If err < 0, we have a half-filled directory - oh well ;) */ 1796 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 return 0; 1797 return 0;
1804err: 1798err:
1805 list_del(&cs->sibling); 1799 list_del(&cs->sibling);
1806 up(&manage_sem); 1800 mutex_unlock(&manage_mutex);
1807 kfree(cs); 1801 kfree(cs);
1808 return err; 1802 return err;
1809} 1803}
@@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1825 1819
1826 /* the vfs holds both inode->i_mutex already */ 1820 /* the vfs holds both inode->i_mutex already */
1827 1821
1828 down(&manage_sem); 1822 mutex_lock(&manage_mutex);
1829 cpuset_update_task_memory_state(); 1823 cpuset_update_task_memory_state();
1830 if (atomic_read(&cs->count) > 0) { 1824 if (atomic_read(&cs->count) > 0) {
1831 up(&manage_sem); 1825 mutex_unlock(&manage_mutex);
1832 return -EBUSY; 1826 return -EBUSY;
1833 } 1827 }
1834 if (!list_empty(&cs->children)) { 1828 if (!list_empty(&cs->children)) {
1835 up(&manage_sem); 1829 mutex_unlock(&manage_mutex);
1836 return -EBUSY; 1830 return -EBUSY;
1837 } 1831 }
1838 parent = cs->parent; 1832 parent = cs->parent;
1839 down(&callback_sem); 1833 mutex_lock(&callback_mutex);
1840 set_bit(CS_REMOVED, &cs->flags); 1834 set_bit(CS_REMOVED, &cs->flags);
1841 if (is_cpu_exclusive(cs)) 1835 if (is_cpu_exclusive(cs))
1842 update_cpu_domains(cs); 1836 update_cpu_domains(cs);
@@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1848 cpuset_d_remove_dir(d); 1842 cpuset_d_remove_dir(d);
1849 dput(d); 1843 dput(d);
1850 number_of_cpusets--; 1844 number_of_cpusets--;
1851 up(&callback_sem); 1845 mutex_unlock(&callback_mutex);
1852 if (list_empty(&parent->children)) 1846 if (list_empty(&parent->children))
1853 check_for_release(parent, &pathbuf); 1847 check_for_release(parent, &pathbuf);
1854 up(&manage_sem); 1848 mutex_unlock(&manage_mutex);
1855 cpuset_release_agent(pathbuf); 1849 cpuset_release_agent(pathbuf);
1856 return 0; 1850 return 0;
1857} 1851}
@@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child)
1960 * Description: Detach cpuset from @tsk and release it. 1954 * Description: Detach cpuset from @tsk and release it.
1961 * 1955 *
1962 * Note that cpusets marked notify_on_release force every task in 1956 * Note that cpusets marked notify_on_release force every task in
1963 * them to take the global manage_sem semaphore when exiting. 1957 * them to take the global manage_mutex mutex when exiting.
1964 * This could impact scaling on very large systems. Be reluctant to 1958 * This could impact scaling on very large systems. Be reluctant to
1965 * use notify_on_release cpusets where very high task exit scaling 1959 * use notify_on_release cpusets where very high task exit scaling
1966 * is required on large systems. 1960 * is required on large systems.
1967 * 1961 *
1968 * Don't even think about derefencing 'cs' after the cpuset use count 1962 * Don't even think about derefencing 'cs' after the cpuset use count
1969 * goes to zero, except inside a critical section guarded by manage_sem 1963 * goes to zero, except inside a critical section guarded by manage_mutex
1970 * or callback_sem. Otherwise a zero cpuset use count is a license to 1964 * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 * any other task to nuke the cpuset immediately, via cpuset_rmdir(). 1965 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 * 1966 *
1973 * This routine has to take manage_sem, not callback_sem, because 1967 * This routine has to take manage_mutex, not callback_mutex, because
1974 * it is holding that semaphore while calling check_for_release(), 1968 * it is holding that mutex while calling check_for_release(),
1975 * which calls kmalloc(), so can't be called holding callback__sem(). 1969 * which calls kmalloc(), so can't be called holding callback_mutex().
1976 * 1970 *
1977 * We don't need to task_lock() this reference to tsk->cpuset, 1971 * We don't need to task_lock() this reference to tsk->cpuset,
1978 * because tsk is already marked PF_EXITING, so attach_task() won't 1972 * because tsk is already marked PF_EXITING, so attach_task() won't
@@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk)
2022 if (notify_on_release(cs)) { 2016 if (notify_on_release(cs)) {
2023 char *pathbuf = NULL; 2017 char *pathbuf = NULL;
2024 2018
2025 down(&manage_sem); 2019 mutex_lock(&manage_mutex);
2026 if (atomic_dec_and_test(&cs->count)) 2020 if (atomic_dec_and_test(&cs->count))
2027 check_for_release(cs, &pathbuf); 2021 check_for_release(cs, &pathbuf);
2028 up(&manage_sem); 2022 mutex_unlock(&manage_mutex);
2029 cpuset_release_agent(pathbuf); 2023 cpuset_release_agent(pathbuf);
2030 } else { 2024 } else {
2031 atomic_dec(&cs->count); 2025 atomic_dec(&cs->count);
@@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2046{ 2040{
2047 cpumask_t mask; 2041 cpumask_t mask;
2048 2042
2049 down(&callback_sem); 2043 mutex_lock(&callback_mutex);
2050 task_lock(tsk); 2044 task_lock(tsk);
2051 guarantee_online_cpus(tsk->cpuset, &mask); 2045 guarantee_online_cpus(tsk->cpuset, &mask);
2052 task_unlock(tsk); 2046 task_unlock(tsk);
2053 up(&callback_sem); 2047 mutex_unlock(&callback_mutex);
2054 2048
2055 return mask; 2049 return mask;
2056} 2050}
@@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2074{ 2068{
2075 nodemask_t mask; 2069 nodemask_t mask;
2076 2070
2077 down(&callback_sem); 2071 mutex_lock(&callback_mutex);
2078 task_lock(tsk); 2072 task_lock(tsk);
2079 guarantee_online_mems(tsk->cpuset, &mask); 2073 guarantee_online_mems(tsk->cpuset, &mask);
2080 task_unlock(tsk); 2074 task_unlock(tsk);
2081 up(&callback_sem); 2075 mutex_unlock(&callback_mutex);
2082 2076
2083 return mask; 2077 return mask;
2084} 2078}
@@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2104 2098
2105/* 2099/*
2106 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 2100 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2107 * ancestor to the specified cpuset. Call holding callback_sem. 2101 * ancestor to the specified cpuset. Call holding callback_mutex.
2108 * If no ancestor is mem_exclusive (an unusual configuration), then 2102 * If no ancestor is mem_exclusive (an unusual configuration), then
2109 * returns the root cpuset. 2103 * returns the root cpuset.
2110 */ 2104 */
@@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2131 * GFP_KERNEL allocations are not so marked, so can escape to the 2125 * GFP_KERNEL allocations are not so marked, so can escape to the
2132 * nearest mem_exclusive ancestor cpuset. 2126 * nearest mem_exclusive ancestor cpuset.
2133 * 2127 *
2134 * Scanning up parent cpusets requires callback_sem. The __alloc_pages() 2128 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2135 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2129 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2136 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2130 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2137 * mems_allowed came up empty on the first pass over the zonelist. 2131 * mems_allowed came up empty on the first pass over the zonelist.
2138 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2132 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2139 * short of memory, might require taking the callback_sem semaphore. 2133 * short of memory, might require taking the callback_mutex mutex.
2140 * 2134 *
2141 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2135 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
2142 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2136 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2171 return 1; 2165 return 1;
2172 2166
2173 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2167 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2174 down(&callback_sem); 2168 mutex_lock(&callback_mutex);
2175 2169
2176 task_lock(current); 2170 task_lock(current);
2177 cs = nearest_exclusive_ancestor(current->cpuset); 2171 cs = nearest_exclusive_ancestor(current->cpuset);
2178 task_unlock(current); 2172 task_unlock(current);
2179 2173
2180 allowed = node_isset(node, cs->mems_allowed); 2174 allowed = node_isset(node, cs->mems_allowed);
2181 up(&callback_sem); 2175 mutex_unlock(&callback_mutex);
2182 return allowed; 2176 return allowed;
2183} 2177}
2184 2178
2185/** 2179/**
2186 * cpuset_lock - lock out any changes to cpuset structures 2180 * cpuset_lock - lock out any changes to cpuset structures
2187 * 2181 *
2188 * The out of memory (oom) code needs to lock down cpusets 2182 * The out of memory (oom) code needs to mutex_lock cpusets
2189 * from being changed while it scans the tasklist looking for a 2183 * from being changed while it scans the tasklist looking for a
2190 * task in an overlapping cpuset. Expose callback_sem via this 2184 * task in an overlapping cpuset. Expose callback_mutex via this
2191 * cpuset_lock() routine, so the oom code can lock it, before 2185 * cpuset_lock() routine, so the oom code can lock it, before
2192 * locking the task list. The tasklist_lock is a spinlock, so 2186 * locking the task list. The tasklist_lock is a spinlock, so
2193 * must be taken inside callback_sem. 2187 * must be taken inside callback_mutex.
2194 */ 2188 */
2195 2189
2196void cpuset_lock(void) 2190void cpuset_lock(void)
2197{ 2191{
2198 down(&callback_sem); 2192 mutex_lock(&callback_mutex);
2199} 2193}
2200 2194
2201/** 2195/**
@@ -2206,7 +2200,7 @@ void cpuset_lock(void)
2206 2200
2207void cpuset_unlock(void) 2201void cpuset_unlock(void)
2208{ 2202{
2209 up(&callback_sem); 2203 mutex_unlock(&callback_mutex);
2210} 2204}
2211 2205
2212/** 2206/**
@@ -2218,7 +2212,7 @@ void cpuset_unlock(void)
2218 * determine if task @p's memory usage might impact the memory 2212 * determine if task @p's memory usage might impact the memory
2219 * available to the current task. 2213 * available to the current task.
2220 * 2214 *
2221 * Call while holding callback_sem. 2215 * Call while holding callback_mutex.
2222 **/ 2216 **/
2223 2217
2224int cpuset_excl_nodes_overlap(const struct task_struct *p) 2218int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void)
2289 * - Used for /proc/<pid>/cpuset. 2283 * - Used for /proc/<pid>/cpuset.
2290 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2284 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2291 * doesn't really matter if tsk->cpuset changes after we read it, 2285 * doesn't really matter if tsk->cpuset changes after we read it,
2292 * and we take manage_sem, keeping attach_task() from changing it 2286 * and we take manage_mutex, keeping attach_task() from changing it
2293 * anyway. 2287 * anyway.
2294 */ 2288 */
2295 2289
@@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2305 return -ENOMEM; 2299 return -ENOMEM;
2306 2300
2307 tsk = m->private; 2301 tsk = m->private;
2308 down(&manage_sem); 2302 mutex_lock(&manage_mutex);
2309 cs = tsk->cpuset; 2303 cs = tsk->cpuset;
2310 if (!cs) { 2304 if (!cs) {
2311 retval = -EINVAL; 2305 retval = -EINVAL;
@@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2318 seq_puts(m, buf); 2312 seq_puts(m, buf);
2319 seq_putc(m, '\n'); 2313 seq_putc(m, '\n');
2320out: 2314out:
2321 up(&manage_sem); 2315 mutex_unlock(&manage_mutex);
2322 kfree(buf); 2316 kfree(buf);
2323 return retval; 2317 return retval;
2324} 2318}
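
The long comment rewritten above is the heart of this file's conversion: manage_mutex is the outer lock held across an entire cpuset modification, callback_mutex is the inner lock taken only briefly, and memory allocation is forbidden while callback_mutex is held. A sketch of that nesting, mirroring update_cpumask() and cpuset_sprintf_cpulist() from the patch; the cut-down struct cpuset and the function names modify_cpus_example/read_cpus_example are stand-ins, not code from the patch:

#include <linux/mutex.h>
#include <linux/cpumask.h>

struct cpuset { cpumask_t cpus_allowed; };      /* stand-in for the real structure */

static DEFINE_MUTEX(manage_mutex);      /* outer: single-threads all cpuset modifications */
static DEFINE_MUTEX(callback_mutex);    /* inner: guards short reads/updates of the masks  */

/* Writer: validation and allocation happen under manage_mutex alone;
 * callback_mutex is nested only around the actual update. */
static void modify_cpus_example(struct cpuset *cs, cpumask_t newmask)
{
        mutex_lock(&manage_mutex);
        /* ... validate the trial cpuset, allocate buffers, etc. ... */
        mutex_lock(&callback_mutex);
        cs->cpus_allowed = newmask;
        mutex_unlock(&callback_mutex);
        mutex_unlock(&manage_mutex);
}

/* Reader/callback: callback_mutex alone gives read-only access, and no
 * memory allocation may be attempted while it is held. */
static cpumask_t read_cpus_example(struct cpuset *cs)
{
        cpumask_t mask;

        mutex_lock(&callback_mutex);
        mask = cs->cpus_allowed;
        mutex_unlock(&callback_mutex);
        return mask;
}
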
diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7e1..8037405e136e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -345,9 +345,9 @@ void daemonize(const char *name, ...)
345 exit_mm(current); 345 exit_mm(current);
346 346
347 set_special_pids(1, 1); 347 set_special_pids(1, 1);
348 down(&tty_sem); 348 mutex_lock(&tty_mutex);
349 current->signal->tty = NULL; 349 current->signal->tty = NULL;
350 up(&tty_sem); 350 mutex_unlock(&tty_mutex);
351 351
352 /* Block and flush all signals */ 352 /* Block and flush all signals */
353 sigfillset(&blocked); 353 sigfillset(&blocked);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee418..c79ae0b19a49 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void)
607 atomic_set(&newf->count, 1); 607 atomic_set(&newf->count, 1);
608 608
609 spin_lock_init(&newf->file_lock); 609 spin_lock_init(&newf->file_lock);
610 newf->next_fd = 0;
610 fdt = &newf->fdtab; 611 fdt = &newf->fdtab;
611 fdt->next_fd = 0;
612 fdt->max_fds = NR_OPEN_DEFAULT; 612 fdt->max_fds = NR_OPEN_DEFAULT;
613 fdt->max_fdset = __FD_SETSIZE; 613 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
614 fdt->close_on_exec = &newf->close_on_exec_init; 614 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
615 fdt->open_fds = &newf->open_fds_init; 615 fdt->open_fds = (fd_set *)&newf->open_fds_init;
616 fdt->fd = &newf->fd_array[0]; 616 fdt->fd = &newf->fd_array[0];
617 INIT_RCU_HEAD(&fdt->rcu); 617 INIT_RCU_HEAD(&fdt->rcu);
618 fdt->free_files = NULL; 618 fdt->free_files = NULL;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73ce..1fb9f753ef60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 down(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479out: 479out:
480 up(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 down(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 up(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 up(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513valid_p: 513valid_p:
@@ -523,7 +523,7 @@ valid_p:
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 up(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a1092c..6a5373868a98 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
12#include <linux/unistd.h> 12#include <linux/unistd.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h>
15#include <asm/semaphore.h> 16#include <asm/semaphore.h>
16 17
17/* 18/*
@@ -41,7 +42,7 @@ struct kthread_stop_info
41 42
42/* Thread stopping is done by setthing this var: lock serializes 43/* Thread stopping is done by setthing this var: lock serializes
43 * multiple kthread_stop calls. */ 44 * multiple kthread_stop calls. */
44static DECLARE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
45static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
46 47
47int kthread_should_stop(void) 48int kthread_should_stop(void)
@@ -173,7 +174,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{ 174{
174 int ret; 175 int ret;
175 176
176 down(&kthread_stop_lock); 177 mutex_lock(&kthread_stop_lock);
177 178
178 /* It could exit after stop_info.k set, but before wake_up_process. */ 179 /* It could exit after stop_info.k set, but before wake_up_process. */
179 get_task_struct(k); 180 get_task_struct(k);
@@ -194,7 +195,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
194 wait_for_completion(&kthread_stop_info.done); 195 wait_for_completion(&kthread_stop_info.done);
195 kthread_stop_info.k = NULL; 196 kthread_stop_info.k = NULL;
196 ret = kthread_stop_info.err; 197 ret = kthread_stop_info.err;
197 up(&kthread_stop_lock); 198 mutex_unlock(&kthread_stop_lock);
198 199
199 return ret; 200 return ret;
200} 201}
diff --git a/kernel/module.c b/kernel/module.c
index 77764f22f021..fb404299082e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/semaphore.h> 44#include <asm/semaphore.h>
44#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
@@ -60,18 +61,18 @@
60static DEFINE_SPINLOCK(modlist_lock); 61static DEFINE_SPINLOCK(modlist_lock);
61 62
62/* List of modules, protected by module_mutex AND modlist_lock */ 63/* List of modules, protected by module_mutex AND modlist_lock */
63static DECLARE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
64static LIST_HEAD(modules); 65static LIST_HEAD(modules);
65 66
66static DECLARE_MUTEX(notify_mutex); 67static DEFINE_MUTEX(notify_mutex);
67static struct notifier_block * module_notify_list; 68static struct notifier_block * module_notify_list;
68 69
69int register_module_notifier(struct notifier_block * nb) 70int register_module_notifier(struct notifier_block * nb)
70{ 71{
71 int err; 72 int err;
72 down(&notify_mutex); 73 mutex_lock(&notify_mutex);
73 err = notifier_chain_register(&module_notify_list, nb); 74 err = notifier_chain_register(&module_notify_list, nb);
74 up(&notify_mutex); 75 mutex_unlock(&notify_mutex);
75 return err; 76 return err;
76} 77}
77EXPORT_SYMBOL(register_module_notifier); 78EXPORT_SYMBOL(register_module_notifier);
@@ -79,9 +80,9 @@ EXPORT_SYMBOL(register_module_notifier);
79int unregister_module_notifier(struct notifier_block * nb) 80int unregister_module_notifier(struct notifier_block * nb)
80{ 81{
81 int err; 82 int err;
82 down(&notify_mutex); 83 mutex_lock(&notify_mutex);
83 err = notifier_chain_unregister(&module_notify_list, nb); 84 err = notifier_chain_unregister(&module_notify_list, nb);
84 up(&notify_mutex); 85 mutex_unlock(&notify_mutex);
85 return err; 86 return err;
86} 87}
87EXPORT_SYMBOL(unregister_module_notifier); 88EXPORT_SYMBOL(unregister_module_notifier);
@@ -601,7 +602,7 @@ static void free_module(struct module *mod);
601static void wait_for_zero_refcount(struct module *mod) 602static void wait_for_zero_refcount(struct module *mod)
602{ 603{
603 /* Since we might sleep for some time, drop the semaphore first */ 604 /* Since we might sleep for some time, drop the semaphore first */
604 up(&module_mutex); 605 mutex_unlock(&module_mutex);
605 for (;;) { 606 for (;;) {
606 DEBUGP("Looking at refcount...\n"); 607 DEBUGP("Looking at refcount...\n");
607 set_current_state(TASK_UNINTERRUPTIBLE); 608 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -610,7 +611,7 @@ static void wait_for_zero_refcount(struct module *mod)
610 schedule(); 611 schedule();
611 } 612 }
612 current->state = TASK_RUNNING; 613 current->state = TASK_RUNNING;
613 down(&module_mutex); 614 mutex_lock(&module_mutex);
614} 615}
615 616
616asmlinkage long 617asmlinkage long
@@ -627,7 +628,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
627 return -EFAULT; 628 return -EFAULT;
628 name[MODULE_NAME_LEN-1] = '\0'; 629 name[MODULE_NAME_LEN-1] = '\0';
629 630
630 if (down_interruptible(&module_mutex) != 0) 631 if (mutex_lock_interruptible(&module_mutex) != 0)
631 return -EINTR; 632 return -EINTR;
632 633
633 mod = find_module(name); 634 mod = find_module(name);
@@ -676,14 +677,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
676 677
677 /* Final destruction now noone is using it. */ 678 /* Final destruction now noone is using it. */
678 if (mod->exit != NULL) { 679 if (mod->exit != NULL) {
679 up(&module_mutex); 680 mutex_unlock(&module_mutex);
680 mod->exit(); 681 mod->exit();
681 down(&module_mutex); 682 mutex_lock(&module_mutex);
682 } 683 }
683 free_module(mod); 684 free_module(mod);
684 685
685 out: 686 out:
686 up(&module_mutex); 687 mutex_unlock(&module_mutex);
687 return ret; 688 return ret;
688} 689}
689 690
@@ -1972,13 +1973,13 @@ sys_init_module(void __user *umod,
1972 return -EPERM; 1973 return -EPERM;
1973 1974
1974 /* Only one module load at a time, please */ 1975 /* Only one module load at a time, please */
1975 if (down_interruptible(&module_mutex) != 0) 1976 if (mutex_lock_interruptible(&module_mutex) != 0)
1976 return -EINTR; 1977 return -EINTR;
1977 1978
1978 /* Do all the hard work */ 1979 /* Do all the hard work */
1979 mod = load_module(umod, len, uargs); 1980 mod = load_module(umod, len, uargs);
1980 if (IS_ERR(mod)) { 1981 if (IS_ERR(mod)) {
1981 up(&module_mutex); 1982 mutex_unlock(&module_mutex);
1982 return PTR_ERR(mod); 1983 return PTR_ERR(mod);
1983 } 1984 }
1984 1985
@@ -1987,11 +1988,11 @@ sys_init_module(void __user *umod,
1987 stop_machine_run(__link_module, mod, NR_CPUS); 1988 stop_machine_run(__link_module, mod, NR_CPUS);
1988 1989
1989 /* Drop lock so they can recurse */ 1990 /* Drop lock so they can recurse */
1990 up(&module_mutex); 1991 mutex_unlock(&module_mutex);
1991 1992
1992 down(&notify_mutex); 1993 mutex_lock(&notify_mutex);
1993 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1994 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
1994 up(&notify_mutex); 1995 mutex_unlock(&notify_mutex);
1995 1996
1996 /* Start the module */ 1997 /* Start the module */
1997 if (mod->init != NULL) 1998 if (mod->init != NULL)
@@ -2006,15 +2007,15 @@ sys_init_module(void __user *umod,
2006 mod->name); 2007 mod->name);
2007 else { 2008 else {
2008 module_put(mod); 2009 module_put(mod);
2009 down(&module_mutex); 2010 mutex_lock(&module_mutex);
2010 free_module(mod); 2011 free_module(mod);
2011 up(&module_mutex); 2012 mutex_unlock(&module_mutex);
2012 } 2013 }
2013 return ret; 2014 return ret;
2014 } 2015 }
2015 2016
2016 /* Now it's a first class citizen! */ 2017 /* Now it's a first class citizen! */
2017 down(&module_mutex); 2018 mutex_lock(&module_mutex);
2018 mod->state = MODULE_STATE_LIVE; 2019 mod->state = MODULE_STATE_LIVE;
2019 /* Drop initial reference. */ 2020 /* Drop initial reference. */
2020 module_put(mod); 2021 module_put(mod);
@@ -2022,7 +2023,7 @@ sys_init_module(void __user *umod,
2022 mod->module_init = NULL; 2023 mod->module_init = NULL;
2023 mod->init_size = 0; 2024 mod->init_size = 0;
2024 mod->init_text_size = 0; 2025 mod->init_text_size = 0;
2025 up(&module_mutex); 2026 mutex_unlock(&module_mutex);
2026 2027
2027 return 0; 2028 return 0;
2028} 2029}
@@ -2112,7 +2113,7 @@ struct module *module_get_kallsym(unsigned int symnum,
2112{ 2113{
2113 struct module *mod; 2114 struct module *mod;
2114 2115
2115 down(&module_mutex); 2116 mutex_lock(&module_mutex);
2116 list_for_each_entry(mod, &modules, list) { 2117 list_for_each_entry(mod, &modules, list) {
2117 if (symnum < mod->num_symtab) { 2118 if (symnum < mod->num_symtab) {
2118 *value = mod->symtab[symnum].st_value; 2119 *value = mod->symtab[symnum].st_value;
@@ -2120,12 +2121,12 @@ struct module *module_get_kallsym(unsigned int symnum,
2120 strncpy(namebuf, 2121 strncpy(namebuf,
2121 mod->strtab + mod->symtab[symnum].st_name, 2122 mod->strtab + mod->symtab[symnum].st_name,
2122 127); 2123 127);
2123 up(&module_mutex); 2124 mutex_unlock(&module_mutex);
2124 return mod; 2125 return mod;
2125 } 2126 }
2126 symnum -= mod->num_symtab; 2127 symnum -= mod->num_symtab;
2127 } 2128 }
2128 up(&module_mutex); 2129 mutex_unlock(&module_mutex);
2129 return NULL; 2130 return NULL;
2130} 2131}
2131 2132
@@ -2168,7 +2169,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
2168 struct list_head *i; 2169 struct list_head *i;
2169 loff_t n = 0; 2170 loff_t n = 0;
2170 2171
2171 down(&module_mutex); 2172 mutex_lock(&module_mutex);
2172 list_for_each(i, &modules) { 2173 list_for_each(i, &modules) {
2173 if (n++ == *pos) 2174 if (n++ == *pos)
2174 break; 2175 break;
@@ -2189,7 +2190,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2189 2190
2190static void m_stop(struct seq_file *m, void *p) 2191static void m_stop(struct seq_file *m, void *p)
2191{ 2192{
2192 up(&module_mutex); 2193 mutex_unlock(&module_mutex);
2193} 2194}
2194 2195
2195static int m_show(struct seq_file *m, void *p) 2196static int m_show(struct seq_file *m, void *p)
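
module.c gets the same treatment, with one extra wrinkle: the user-facing syscalls took module_mutex with down_interruptible(), and the conversion keeps that contract via mutex_lock_interruptible(), returning -EINTR if a signal arrives while waiting. A sketch of the pattern; example_module_op is an illustrative name, and the local DEFINE_MUTEX stands in for the static mutex defined in module.c above:

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(module_mutex);      /* stand-in for module.c's static lock */

static int example_module_op(void)
{
        /* Fail with -EINTR rather than sleeping uninterruptibly, exactly
         * as sys_init_module()/sys_delete_module() do above. */
        if (mutex_lock_interruptible(&module_mutex) != 0)
                return -EINTR;

        /* ... work that needs the module list to stay stable; the lock is
         * dropped around long-running steps such as mod->exit() ... */

        mutex_unlock(&module_mutex);
        return 0;
}
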
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c74..acd95adddb93 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,10 +20,13 @@
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h> 21#include <linux/kexec.h>
22 22
23int panic_timeout;
24int panic_on_oops; 23int panic_on_oops;
25int tainted; 24int tainted;
25static int pause_on_oops;
26static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock);
26 28
29int panic_timeout;
27EXPORT_SYMBOL(panic_timeout); 30EXPORT_SYMBOL(panic_timeout);
28 31
29struct notifier_block *panic_notifier_list; 32struct notifier_block *panic_notifier_list;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
174 tainted |= flag; 177 tainted |= flag;
175} 178}
176EXPORT_SYMBOL(add_taint); 179EXPORT_SYMBOL(add_taint);
180
181static int __init pause_on_oops_setup(char *str)
182{
183 pause_on_oops = simple_strtoul(str, NULL, 0);
184 return 1;
185}
186__setup("pause_on_oops=", pause_on_oops_setup);
187
188static void spin_msec(int msecs)
189{
190 int i;
191
192 for (i = 0; i < msecs; i++) {
193 touch_nmi_watchdog();
194 mdelay(1);
195 }
196}
197
198/*
199 * It just happens that oops_enter() and oops_exit() are identically
200 * implemented...
201 */
202static void do_oops_enter_exit(void)
203{
204 unsigned long flags;
205 static int spin_counter;
206
207 if (!pause_on_oops)
208 return;
209
210 spin_lock_irqsave(&pause_on_oops_lock, flags);
211 if (pause_on_oops_flag == 0) {
212 /* This CPU may now print the oops message */
213 pause_on_oops_flag = 1;
214 } else {
215 /* We need to stall this CPU */
216 if (!spin_counter) {
217 /* This CPU gets to do the counting */
218 spin_counter = pause_on_oops;
219 do {
220 spin_unlock(&pause_on_oops_lock);
221 spin_msec(MSEC_PER_SEC);
222 spin_lock(&pause_on_oops_lock);
223 } while (--spin_counter);
224 pause_on_oops_flag = 0;
225 } else {
226 /* This CPU waits for a different one */
227 while (spin_counter) {
228 spin_unlock(&pause_on_oops_lock);
229 spin_msec(1);
230 spin_lock(&pause_on_oops_lock);
231 }
232 }
233 }
234 spin_unlock_irqrestore(&pause_on_oops_lock, flags);
235}
236
237/*
238 * Return true if the calling CPU is allowed to print oops-related info. This
239 * is a bit racy..
240 */
241int oops_may_print(void)
242{
243 return pause_on_oops_flag == 0;
244}
245
246/*
247 * Called when the architecture enters its oops handler, before it prints
248 * anything. If this is the first CPU to oops, and it's oopsing the first time
249 * then let it proceed.
250 *
251 * This is all enabled by the pause_on_oops kernel boot option. We do all this
252 * to ensure that oopses don't scroll off the screen. It has the side-effect
253 * of preventing later-oopsing CPUs from mucking up the display, too.
254 *
255 * It turns out that the CPU which is allowed to print ends up pausing for the
256 * right duration, whereas all the other CPUs pause for twice as long: once in
257 * oops_enter(), once in oops_exit().
258 */
259void oops_enter(void)
260{
261 do_oops_enter_exit();
262}
263
264/*
265 * Called when the architecture exits its oops handler, after printing
266 * everything.
267 */
268void oops_exit(void)
269{
270 do_oops_enter_exit();
271}
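
The oops_enter()/oops_may_print()/oops_exit() hooks added above are meant to be called from each architecture's die path. A minimal sketch of the expected call sequence; the handler name and message are illustrative and not taken from any particular architecture:

void example_arch_die(const char *str, struct pt_regs *regs, long err)
{
        oops_enter();                   /* may stall this CPU if pause_on_oops= is set */
        if (oops_may_print()) {
                printk(KERN_EMERG "Oops: %s, error %ld\n", str, err);
                /* ... dump registers and the stack from *regs ... */
        }
        oops_exit();                    /* second half of the pause for non-printing CPUs */
}
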
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fa895fc2ecf5..9944379360b5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/mutex.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/semaphore.h> 41#include <asm/semaphore.h>
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96a7..8d0af3d37a4b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
5 5
6obj-y := main.o process.o console.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11 11
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc980..81d4d982f3f0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
22#include "power.h" 22#include "power.h"
23 23
24 24
25extern suspend_disk_method_t pm_disk_mode;
26
27extern int swsusp_shrink_memory(void);
28extern int swsusp_suspend(void);
29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
30extern int swsusp_check(void);
31extern int swsusp_read(struct pbe **pblist_ptr);
32extern void swsusp_close(void);
33extern int swsusp_resume(void);
34
35
36static int noresume = 0; 25static int noresume = 0;
37char resume_file[256] = CONFIG_PM_STD_PARTITION; 26char resume_file[256] = CONFIG_PM_STD_PARTITION;
38dev_t swsusp_resume_device; 27dev_t swsusp_resume_device;
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode)
70 while(1); 59 while(1);
71} 60}
72 61
73
74static int in_suspend __nosavedata = 0;
75
76
77static inline void platform_finish(void) 62static inline void platform_finish(void)
78{ 63{
79 if (pm_disk_mode == PM_DISK_PLATFORM) { 64 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -87,7 +72,6 @@ static int prepare_processes(void)
87 int error; 72 int error;
88 73
89 pm_prepare_console(); 74 pm_prepare_console();
90 sys_sync();
91 disable_nonboot_cpus(); 75 disable_nonboot_cpus();
92 76
93 if (freeze_processes()) { 77 if (freeze_processes()) {
@@ -145,7 +129,7 @@ int pm_suspend_disk(void)
145 if (in_suspend) { 129 if (in_suspend) {
146 device_resume(); 130 device_resume();
147 pr_debug("PM: writing image.\n"); 131 pr_debug("PM: writing image.\n");
148 error = swsusp_write(pagedir_nosave, nr_copy_pages); 132 error = swsusp_write();
149 if (!error) 133 if (!error)
150 power_down(pm_disk_mode); 134 power_down(pm_disk_mode);
151 else { 135 else {
@@ -216,7 +200,7 @@ static int software_resume(void)
216 200
217 pr_debug("PM: Reading swsusp image.\n"); 201 pr_debug("PM: Reading swsusp image.\n");
218 202
219 if ((error = swsusp_read(&pagedir_nosave))) { 203 if ((error = swsusp_read())) {
220 swsusp_free(); 204 swsusp_free();
221 goto Thaw; 205 goto Thaw;
222 } 206 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4a9..ee371f50ccaa 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
103} 103}
104 104
105 105
106static int suspend_enter(suspend_state_t state) 106int suspend_enter(suspend_state_t state)
107{ 107{
108 int error = 0; 108 int error = 0;
109 unsigned long flags; 109 unsigned long flags;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857dd..0f6908cce1dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h> 26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h>
28 29
29int pm_active; 30int pm_active;
30 31
@@ -40,7 +41,7 @@ int pm_active;
40 * until a resume but that will be fine. 41 * until a resume but that will be fine.
41 */ 42 */
42 43
43static DECLARE_MUTEX(pm_devs_lock); 44static DEFINE_MUTEX(pm_devs_lock);
44static LIST_HEAD(pm_devs); 45static LIST_HEAD(pm_devs);
45 46
46/** 47/**
@@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type,
67 dev->id = id; 68 dev->id = id;
68 dev->callback = callback; 69 dev->callback = callback;
69 70
70 down(&pm_devs_lock); 71 mutex_lock(&pm_devs_lock);
71 list_add(&dev->entry, &pm_devs); 72 list_add(&dev->entry, &pm_devs);
72 up(&pm_devs_lock); 73 mutex_unlock(&pm_devs_lock);
73 } 74 }
74 return dev; 75 return dev;
75} 76}
@@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type,
85void pm_unregister(struct pm_dev *dev) 86void pm_unregister(struct pm_dev *dev)
86{ 87{
87 if (dev) { 88 if (dev) {
88 down(&pm_devs_lock); 89 mutex_lock(&pm_devs_lock);
89 list_del(&dev->entry); 90 list_del(&dev->entry);
90 up(&pm_devs_lock); 91 mutex_unlock(&pm_devs_lock);
91 92
92 kfree(dev); 93 kfree(dev);
93 } 94 }
@@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback)
118 if (!callback) 119 if (!callback)
119 return; 120 return;
120 121
121 down(&pm_devs_lock); 122 mutex_lock(&pm_devs_lock);
122 entry = pm_devs.next; 123 entry = pm_devs.next;
123 while (entry != &pm_devs) { 124 while (entry != &pm_devs) {
124 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 125 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback)
126 if (dev->callback == callback) 127 if (dev->callback == callback)
127 __pm_unregister(dev); 128 __pm_unregister(dev);
128 } 129 }
129 up(&pm_devs_lock); 130 mutex_unlock(&pm_devs_lock);
130} 131}
131 132
132/** 133/**
@@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data)
234{ 235{
235 struct list_head *entry; 236 struct list_head *entry;
236 237
237 down(&pm_devs_lock); 238 mutex_lock(&pm_devs_lock);
238 entry = pm_devs.next; 239 entry = pm_devs.next;
239 while (entry != &pm_devs) { 240 while (entry != &pm_devs) {
240 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 241 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data)
246 */ 247 */
247 if (rqst == PM_SUSPEND) 248 if (rqst == PM_SUSPEND)
248 pm_undo_all(dev); 249 pm_undo_all(dev);
249 up(&pm_devs_lock); 250 mutex_unlock(&pm_devs_lock);
250 return status; 251 return status;
251 } 252 }
252 } 253 }
253 entry = entry->next; 254 entry = entry->next;
254 } 255 }
255 up(&pm_devs_lock); 256 mutex_unlock(&pm_devs_lock);
256 return 0; 257 return 0;
257} 258}
258 259
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba680841..f06f12f21767 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
8 int cpus; 8 int cpus;
9 unsigned long image_pages; 9 unsigned long image_pages;
10 unsigned long pages; 10 unsigned long pages;
11 unsigned long size;
11} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
12 13
13 14
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys;
37/* References to section boundaries */ 38/* References to section boundaries */
38extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
39 40
40extern unsigned int nr_copy_pages;
41extern struct pbe *pagedir_nosave; 41extern struct pbe *pagedir_nosave;
42 42
43/* Preferred image size in bytes (default 500 MB) */ 43/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 44extern unsigned long image_size;
45extern int in_suspend;
46extern dev_t swsusp_resume_device;
45 47
46extern asmlinkage int swsusp_arch_suspend(void); 48extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 49extern asmlinkage int swsusp_arch_resume(void);
48 50
49extern unsigned int count_data_pages(void); 51extern unsigned int count_data_pages(void);
50extern void free_pagedir(struct pbe *pblist); 52
51extern void release_eaten_pages(void); 53struct snapshot_handle {
52extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 54 loff_t offset;
55 unsigned int page;
56 unsigned int page_offset;
57 unsigned int prev;
58 struct pbe *pbe;
59 void *buffer;
60 unsigned int buf_offset;
61};
62
63#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle);
68
69#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
71#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
72#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
73#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
74#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
75#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
76#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
77#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
78#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
79#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
80#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
81#define SNAPSHOT_IOC_MAXNR 11
82
83/**
 84 * The bitmap is used for tracking allocated swap pages
85 *
86 * The entire bitmap consists of a number of bitmap_page
87 * structures linked with the help of the .next member.
88 * Thus each page can be allocated individually, so we only
89 * need to make 0-order memory allocations to create
90 * the bitmap.
91 */
92
93#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
94#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
95#define BITS_PER_CHUNK (sizeof(long) * 8)
96#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
97
98struct bitmap_page {
99 unsigned long chunks[BITMAP_PAGE_CHUNKS];
100 struct bitmap_page *next;
101};
102
103extern void free_bitmap(struct bitmap_page *bitmap);
104extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
105extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
106extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
107
108extern int swsusp_check(void);
109extern int swsusp_shrink_memory(void);
53extern void swsusp_free(void); 110extern void swsusp_free(void);
54extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 111extern int swsusp_suspend(void);
55extern unsigned int snapshot_nr_pages(void); 112extern int swsusp_resume(void);
56extern struct pbe *snapshot_pblist(void); 113extern int swsusp_read(void);
57extern void snapshot_pblist_set(struct pbe *pblist); 114extern int swsusp_write(void);
115extern void swsusp_close(void);
116extern int suspend_enter(suspend_state_t state);
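
The SNAPSHOT_* ioctls declared above form the userspace side of the interface implemented by the new kernel/power/user.c. A hedged userspace sketch using the definitions from this header; the /dev/snapshot node name is an assumption here, since the device registration is not shown in this diff:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

int example_freeze_and_snapshot(void)
{
        int in_suspend = 0;
        int fd = open("/dev/snapshot", O_RDONLY);       /* assumed device node */

        if (fd < 0)
                return -1;
        if (ioctl(fd, SNAPSHOT_FREEZE, 0) ||
            ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend)) {
                close(fd);
                return -1;
        }
        /* ... read() the image pages and store them somewhere, then ... */
        ioctl(fd, SNAPSHOT_UNFREEZE, 0);
        close(fd);
        return 0;
}
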
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a0b..8ac7c35fad77 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h>
15 16
16/* 17/*
17 * Timeout for stopping processes 18 * Timeout for stopping processes
18 */ 19 */
19#define TIMEOUT (6 * HZ) 20#define TIMEOUT (20 * HZ)
20 21
21 22
22static inline int freezeable(struct task_struct * p) 23static inline int freezeable(struct task_struct * p)
@@ -54,38 +55,62 @@ void refrigerator(void)
54 current->state = save; 55 current->state = save;
55} 56}
56 57
58static inline void freeze_process(struct task_struct *p)
59{
60 unsigned long flags;
61
62 if (!freezing(p)) {
63 freeze(p);
64 spin_lock_irqsave(&p->sighand->siglock, flags);
65 signal_wake_up(p, 0);
66 spin_unlock_irqrestore(&p->sighand->siglock, flags);
67 }
68}
69
57/* 0 = success, else # of processes that we failed to stop */ 70/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 71int freeze_processes(void)
59{ 72{
60 int todo; 73 int todo, nr_user, user_frozen;
61 unsigned long start_time; 74 unsigned long start_time;
62 struct task_struct *g, *p; 75 struct task_struct *g, *p;
63 unsigned long flags; 76 unsigned long flags;
64 77
65 printk( "Stopping tasks: " ); 78 printk( "Stopping tasks: " );
66 start_time = jiffies; 79 start_time = jiffies;
80 user_frozen = 0;
67 do { 81 do {
68 todo = 0; 82 nr_user = todo = 0;
69 read_lock(&tasklist_lock); 83 read_lock(&tasklist_lock);
70 do_each_thread(g, p) { 84 do_each_thread(g, p) {
71 if (!freezeable(p)) 85 if (!freezeable(p))
72 continue; 86 continue;
73 if (frozen(p)) 87 if (frozen(p))
74 continue; 88 continue;
75 89 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
76 freeze(p); 90 /* The task is a user-space one.
77 spin_lock_irqsave(&p->sighand->siglock, flags); 91 * Freeze it unless there's a vfork completion
78 signal_wake_up(p, 0); 92 * pending
79 spin_unlock_irqrestore(&p->sighand->siglock, flags); 93 */
80 todo++; 94 if (!p->vfork_done)
95 freeze_process(p);
96 nr_user++;
97 } else {
98 /* Freeze only if the user space is frozen */
99 if (user_frozen)
100 freeze_process(p);
101 todo++;
102 }
81 } while_each_thread(g, p); 103 } while_each_thread(g, p);
82 read_unlock(&tasklist_lock); 104 read_unlock(&tasklist_lock);
105 todo += nr_user;
106 if (!user_frozen && !nr_user) {
107 sys_sync();
108 start_time = jiffies;
109 }
110 user_frozen = !nr_user;
83 yield(); /* Yield is okay here */ 111 yield(); /* Yield is okay here */
84 if (todo && time_after(jiffies, start_time + TIMEOUT)) { 112 if (todo && time_after(jiffies, start_time + TIMEOUT))
85 printk( "\n" );
86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
87 break; 113 break;
88 }
89 } while(todo); 114 } while(todo);
90 115
91 /* This does not unfreeze processes that are already frozen 116 /* This does not unfreeze processes that are already frozen
@@ -94,8 +119,14 @@ int freeze_processes(void)
94 * but it cleans up leftover PF_FREEZE requests. 119 * but it cleans up leftover PF_FREEZE requests.
95 */ 120 */
96 if (todo) { 121 if (todo) {
122 printk( "\n" );
123 printk(KERN_ERR " stopping tasks timed out "
124 "after %d seconds (%d tasks remaining):\n",
125 TIMEOUT / HZ, todo);
97 read_lock(&tasklist_lock); 126 read_lock(&tasklist_lock);
98 do_each_thread(g, p) 127 do_each_thread(g, p) {
128 if (freezeable(p) && !frozen(p))
129 printk(KERN_ERR " %s\n", p->comm);
99 if (freezing(p)) { 130 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm); 131 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE; 132 p->flags &= ~PF_FREEZE;
@@ -103,7 +134,7 @@ int freeze_processes(void)
103 recalc_sigpending_tsk(p); 134 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags); 135 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 } 136 }
106 while_each_thread(g, p); 137 } while_each_thread(g, p);
107 read_unlock(&tasklist_lock); 138 read_unlock(&tasklist_lock);
108 return todo; 139 return todo;
109 } 140 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d621..c5863d02c89e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12 12
13#include <linux/version.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/suspend.h> 16#include <linux/suspend.h>
@@ -34,7 +35,9 @@
34#include "power.h" 35#include "power.h"
35 36
36struct pbe *pagedir_nosave; 37struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages; 38static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages;
40static unsigned long *buffer;
38 41
39#ifdef CONFIG_HIGHMEM 42#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void) 43unsigned int count_highmem_pages(void)
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
80 void *kaddr; 83 void *kaddr;
81 unsigned long pfn = zone_pfn + zone->zone_start_pfn; 84 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
82 85
83 if (!(pfn%1000)) 86 if (!(pfn%10000))
84 printk("."); 87 printk(".");
85 if (!pfn_valid(pfn)) 88 if (!pfn_valid(pfn))
86 continue; 89 continue;
@@ -119,13 +122,15 @@ int save_highmem(void)
119 struct zone *zone; 122 struct zone *zone;
120 int res = 0; 123 int res = 0;
121 124
122 pr_debug("swsusp: Saving Highmem\n"); 125 pr_debug("swsusp: Saving Highmem");
126 drain_local_pages();
123 for_each_zone (zone) { 127 for_each_zone (zone) {
124 if (is_highmem(zone)) 128 if (is_highmem(zone))
125 res = save_highmem_zone(zone); 129 res = save_highmem_zone(zone);
126 if (res) 130 if (res)
127 return res; 131 return res;
128 } 132 }
133 printk("\n");
129 return 0; 134 return 0;
130} 135}
131 136
@@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist)
235 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
236 */ 241 */
237 242
238void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist)
239{ 244{
240 struct pbe *pbe; 245 struct pbe *pbe;
241 246
@@ -301,7 +306,7 @@ struct eaten_page {
301 306
302static struct eaten_page *eaten_pages = NULL; 307static struct eaten_page *eaten_pages = NULL;
303 308
304void release_eaten_pages(void) 309static void release_eaten_pages(void)
305{ 310{
306 struct eaten_page *p, *q; 311 struct eaten_page *p, *q;
307 312
@@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
376 if (!nr_pages) 381 if (!nr_pages)
377 return NULL; 382 return NULL;
378 383
379 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
380 pblist = alloc_image_page(gfp_mask, safe_needed); 384 pblist = alloc_image_page(gfp_mask, safe_needed);
381 /* FIXME: rewrite this ugly loop */ 385 /* FIXME: rewrite this ugly loop */
382 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 386 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
388 free_pagedir(pblist); 392 free_pagedir(pblist);
389 pblist = NULL; 393 pblist = NULL;
390 } else 394 } else
391 create_pbe_list(pblist, nr_pages); 395 create_pbe_list(pblist, nr_pages);
392 return pblist; 396 return pblist;
393} 397}
394 398
@@ -414,6 +418,10 @@ void swsusp_free(void)
414 } 418 }
415 } 419 }
416 } 420 }
421 nr_copy_pages = 0;
422 nr_meta_pages = 0;
423 pagedir_nosave = NULL;
424 buffer = NULL;
417} 425}
418 426
419 427
@@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages)
437 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 445 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
438} 446}
439 447
440int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 448static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
441{ 449{
442 struct pbe *p; 450 struct pbe *p;
443 451
@@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void)
504 */ 512 */
505 513
506 nr_copy_pages = nr_pages; 514 nr_copy_pages = nr_pages;
515 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
507 516
508 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 517 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
509 return 0; 518 return 0;
510} 519}
520
521static void init_header(struct swsusp_info *info)
522{
523 memset(info, 0, sizeof(struct swsusp_info));
524 info->version_code = LINUX_VERSION_CODE;
525 info->num_physpages = num_physpages;
526 memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
527 info->cpus = num_online_cpus();
528 info->image_pages = nr_copy_pages;
529 info->pages = nr_copy_pages + nr_meta_pages + 1;
530 info->size = info->pages;
531 info->size <<= PAGE_SHIFT;
532}
533
534/**
535 * pack_orig_addresses - the .orig_address fields of the PBEs from the
536 * list starting at @pbe are stored in the array @buf[] (1 page)
537 */
538
539static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
540{
541 int j;
542
543 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
544 buf[j] = pbe->orig_address;
545 pbe = pbe->next;
546 }
547 if (!pbe)
548 for (; j < PAGE_SIZE / sizeof(long); j++)
549 buf[j] = 0;
550 return pbe;
551}
552
553/**
554 * snapshot_read_next - used for reading the system memory snapshot.
555 *
556 * On the first call to it @handle should point to a zeroed
557 * snapshot_handle structure. The structure gets updated and a pointer
 558 * to it should be passed to this function on each subsequent call.
559 *
560 * The @count parameter should contain the number of bytes the caller
561 * wants to read from the snapshot. It must not be zero.
562 *
563 * On success the function returns a positive number. Then, the caller
564 * is allowed to read up to the returned number of bytes from the memory
565 * location computed by the data_of() macro. The number returned
566 * may be smaller than @count, but this only happens if the read would
567 * cross a page boundary otherwise.
568 *
569 * The function returns 0 to indicate the end of data stream condition,
570 * and a negative number is returned on error. In such cases the
571 * structure pointed to by @handle is not updated and should not be used
572 * any more.
573 */
574
575int snapshot_read_next(struct snapshot_handle *handle, size_t count)
576{
577 if (handle->page > nr_meta_pages + nr_copy_pages)
578 return 0;
579 if (!buffer) {
580 /* This makes the buffer be freed by swsusp_free() */
581 buffer = alloc_image_page(GFP_ATOMIC, 0);
582 if (!buffer)
583 return -ENOMEM;
584 }
585 if (!handle->offset) {
586 init_header((struct swsusp_info *)buffer);
587 handle->buffer = buffer;
588 handle->pbe = pagedir_nosave;
589 }
590 if (handle->prev < handle->page) {
591 if (handle->page <= nr_meta_pages) {
592 handle->pbe = pack_orig_addresses(buffer, handle->pbe);
593 if (!handle->pbe)
594 handle->pbe = pagedir_nosave;
595 } else {
596 handle->buffer = (void *)handle->pbe->address;
597 handle->pbe = handle->pbe->next;
598 }
599 handle->prev = handle->page;
600 }
601 handle->buf_offset = handle->page_offset;
602 if (handle->page_offset + count >= PAGE_SIZE) {
603 count = PAGE_SIZE - handle->page_offset;
604 handle->page_offset = 0;
605 handle->page++;
606 } else {
607 handle->page_offset += count;
608 }
609 handle->offset += count;
610 return count;
611}
612
613/**
614 * mark_unsafe_pages - mark the pages that cannot be used for storing
615 * the image during resume, because they conflict with the pages that
616 * had been used before suspend
617 */
618
619static int mark_unsafe_pages(struct pbe *pblist)
620{
621 struct zone *zone;
622 unsigned long zone_pfn;
623 struct pbe *p;
624
625 if (!pblist) /* a sanity check */
626 return -EINVAL;
627
628 /* Clear page flags */
629 for_each_zone (zone) {
630 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
631 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
632 ClearPageNosaveFree(pfn_to_page(zone_pfn +
633 zone->zone_start_pfn));
634 }
635
636 /* Mark orig addresses */
637 for_each_pbe (p, pblist) {
638 if (virt_addr_valid(p->orig_address))
639 SetPageNosaveFree(virt_to_page(p->orig_address));
640 else
641 return -EFAULT;
642 }
643
644 return 0;
645}
646
647static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
648{
649 /* We assume both lists contain the same number of elements */
650 while (src) {
651 dst->orig_address = src->orig_address;
652 dst = dst->next;
653 src = src->next;
654 }
655}
656
657static int check_header(struct swsusp_info *info)
658{
659 char *reason = NULL;
660
661 if (info->version_code != LINUX_VERSION_CODE)
662 reason = "kernel version";
663 if (info->num_physpages != num_physpages)
664 reason = "memory size";
665 if (strcmp(info->uts.sysname,system_utsname.sysname))
666 reason = "system type";
667 if (strcmp(info->uts.release,system_utsname.release))
668 reason = "kernel release";
669 if (strcmp(info->uts.version,system_utsname.version))
670 reason = "version";
671 if (strcmp(info->uts.machine,system_utsname.machine))
672 reason = "machine";
673 if (reason) {
674 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
675 return -EPERM;
676 }
677 return 0;
678}
679
680/**
681 * load header - check the image header and copy data from it
682 */
683
684static int load_header(struct snapshot_handle *handle,
685 struct swsusp_info *info)
686{
687 int error;
688 struct pbe *pblist;
689
690 error = check_header(info);
691 if (!error) {
692 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
693 if (!pblist)
694 return -ENOMEM;
695 pagedir_nosave = pblist;
696 handle->pbe = pblist;
697 nr_copy_pages = info->image_pages;
698 nr_meta_pages = info->pages - info->image_pages - 1;
699 }
700 return error;
701}
702
703/**
704 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
705 * the PBEs in the list starting at @pbe
706 */
707
708static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
709 struct pbe *pbe)
710{
711 int j;
712
713 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
714 pbe->orig_address = buf[j];
715 pbe = pbe->next;
716 }
717 return pbe;
718}
719
720/**
721 * create_image - use metadata contained in the PBE list
722 * pointed to by pagedir_nosave to mark the pages that will
723 * be overwritten in the process of restoring the system
724 * memory state from the image and allocate memory for
725 * the image avoiding these pages
726 */
727
728static int create_image(struct snapshot_handle *handle)
729{
730 int error = 0;
731 struct pbe *p, *pblist;
732
733 p = pagedir_nosave;
734 error = mark_unsafe_pages(p);
735 if (!error) {
736 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
737 if (pblist)
738 copy_page_backup_list(pblist, p);
739 free_pagedir(p);
740 if (!pblist)
741 error = -ENOMEM;
742 }
743 if (!error)
744 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
745 if (!error) {
746 release_eaten_pages();
747 pagedir_nosave = pblist;
748 } else {
749 pagedir_nosave = NULL;
750 handle->pbe = NULL;
751 nr_copy_pages = 0;
752 nr_meta_pages = 0;
753 }
754 return error;
755}
756
757/**
758 * snapshot_write_next - used for writing the system memory snapshot.
759 *
760 * On the first call to it @handle should point to a zeroed
761 * snapshot_handle structure. The structure gets updated and a pointer
 762 * to it should be passed to this function on each subsequent call.
763 *
764 * The @count parameter should contain the number of bytes the caller
765 * wants to write to the image. It must not be zero.
766 *
767 * On success the function returns a positive number. Then, the caller
768 * is allowed to write up to the returned number of bytes to the memory
769 * location computed by the data_of() macro. The number returned
770 * may be smaller than @count, but this only happens if the write would
771 * cross a page boundary otherwise.
772 *
773 * The function returns 0 to indicate the "end of file" condition,
774 * and a negative number is returned on error. In such cases the
775 * structure pointed to by @handle is not updated and should not be used
776 * any more.
777 */
778
779int snapshot_write_next(struct snapshot_handle *handle, size_t count)
780{
781 int error = 0;
782
783 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
784 return 0;
785 if (!buffer) {
786 /* This makes the buffer be freed by swsusp_free() */
787 buffer = alloc_image_page(GFP_ATOMIC, 0);
788 if (!buffer)
789 return -ENOMEM;
790 }
791 if (!handle->offset)
792 handle->buffer = buffer;
793 if (handle->prev < handle->page) {
794 if (!handle->prev) {
795 error = load_header(handle, (struct swsusp_info *)buffer);
796 if (error)
797 return error;
798 } else if (handle->prev <= nr_meta_pages) {
799 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
800 if (!handle->pbe) {
801 error = create_image(handle);
802 if (error)
803 return error;
804 handle->pbe = pagedir_nosave;
805 handle->buffer = (void *)handle->pbe->address;
806 }
807 } else {
808 handle->pbe = handle->pbe->next;
809 handle->buffer = (void *)handle->pbe->address;
810 }
811 handle->prev = handle->page;
812 }
813 handle->buf_offset = handle->page_offset;
814 if (handle->page_offset + count >= PAGE_SIZE) {
815 count = PAGE_SIZE - handle->page_offset;
816 handle->page_offset = 0;
817 handle->page++;
818 } else {
819 handle->page_offset += count;
820 }
821 handle->offset += count;
822 return count;
823}
824
825int snapshot_image_loaded(struct snapshot_handle *handle)
826{
827 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
828 handle->page <= nr_meta_pages + nr_copy_pages);
829}
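
snapshot_read_next() and snapshot_write_next() turn the in-memory image into a byte stream consumed at most one page at a time. A minimal sketch of the read-side loop; this is essentially the pattern save_image() in the new kernel/power/swap.c follows below, and store_page() is a hypothetical sink, not a kernel function:

static int example_dump_image(struct snapshot_handle *handle)
{
        int ret;

        memset(handle, 0, sizeof(*handle));     /* the first call expects a zeroed handle */
        while ((ret = snapshot_read_next(handle, PAGE_SIZE)) > 0) {
                /* data_of(*handle) points at up to 'ret' bytes of image data */
                ret = store_page(data_of(*handle), ret);        /* hypothetical sink */
                if (ret)
                        return ret;
        }
        return ret;     /* 0 at end of stream, negative on error */
}
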
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 000000000000..9177f3f73a6c
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,544 @@
1/*
2 * linux/kernel/power/swap.c
3 *
4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition.
6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 *
10 * This file is released under the GPLv2.
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/version.h>
19#include <linux/delay.h>
20#include <linux/bitops.h>
21#include <linux/genhd.h>
22#include <linux/device.h>
23#include <linux/buffer_head.h>
24#include <linux/bio.h>
25#include <linux/swap.h>
26#include <linux/swapops.h>
27#include <linux/pm.h>
28
29#include "power.h"
30
31extern char resume_file[];
32
33#define SWSUSP_SIG "S1SUSPEND"
34
35static struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
37 swp_entry_t image;
38 char orig_sig[10];
39 char sig[10];
40} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
41
42/*
43 * Saving part...
44 */
45
46static unsigned short root_swap = 0xffff;
47
48static int mark_swapfiles(swp_entry_t start)
49{
50 int error;
51
52 rw_swap_page_sync(READ,
53 swp_entry(root_swap, 0),
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE,
61 swp_entry(root_swap, 0),
62 virt_to_page((unsigned long)
63 &swsusp_header));
64 } else {
65 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV;
67 }
68 return error;
69}
70
71/**
72 * swsusp_swap_check - check if the resume device is a swap device
73 * and get its index (if so)
74 */
75
76static int swsusp_swap_check(void) /* This is called before saving image */
77{
78 int res = swap_type_of(swsusp_resume_device);
79
80 if (res >= 0) {
81 root_swap = res;
82 return 0;
83 }
84 return res;
85}
86
87/**
88 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to.
91 */
92
93static int write_page(void *buf, unsigned long offset)
94{
95 swp_entry_t entry;
96 int error = -ENOSPC;
97
98 if (offset) {
99 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
101 }
102 return error;
103}
104
105/*
106 * The swap map is a data structure used for keeping track of each page
107 * written to a swap partition. It consists of many swap_map_page
108 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
109 * These structures are stored on the swap and linked together with the
110 * help of the .next_swap member.
111 *
112 * The swap map is created during suspend. The swap map pages are
113 * allocated and populated one at a time, so we only need one memory
114 * page to set up the entire structure.
115 *
116 * During resume we also only need to use one swap_map_page structure
117 * at a time.
118 */
119
120#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
121
122struct swap_map_page {
123 unsigned long entries[MAP_PAGE_ENTRIES];
124 unsigned long next_swap;
125};
126
127/**
128 * The swap_map_handle structure is used for handling swap in
129 * a file-alike way
130 */
131
132struct swap_map_handle {
133 struct swap_map_page *cur;
134 unsigned long cur_swap;
135 struct bitmap_page *bitmap;
136 unsigned int k;
137};
138
139static void release_swap_writer(struct swap_map_handle *handle)
140{
141 if (handle->cur)
142 free_page((unsigned long)handle->cur);
143 handle->cur = NULL;
144 if (handle->bitmap)
145 free_bitmap(handle->bitmap);
146 handle->bitmap = NULL;
147}
148
149static int get_swap_writer(struct swap_map_handle *handle)
150{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
152 if (!handle->cur)
153 return -ENOMEM;
154 handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
155 if (!handle->bitmap) {
156 release_swap_writer(handle);
157 return -ENOMEM;
158 }
159 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
160 if (!handle->cur_swap) {
161 release_swap_writer(handle);
162 return -ENOSPC;
163 }
164 handle->k = 0;
165 return 0;
166}
167
168static int swap_write_page(struct swap_map_handle *handle, void *buf)
169{
170 int error;
171 unsigned long offset;
172
173 if (!handle->cur)
174 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset);
177 if (error)
178 return error;
179 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) {
181 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset)
183 return -ENOSPC;
184 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap);
186 if (error)
187 return error;
188 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset;
190 handle->k = 0;
191 }
192 return 0;
193}
194
195static int flush_swap_writer(struct swap_map_handle *handle)
196{
197 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap);
199 else
200 return -EINVAL;
201}
202
203/**
204 * save_image - save the suspend image data
205 */
206
207static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot,
209 unsigned int nr_pages)
210{
211 unsigned int m;
212 int ret;
213 int error = 0;
214
215 printk("Saving image data pages (%u pages) ... ", nr_pages);
216 m = nr_pages / 100;
217 if (!m)
218 m = 1;
219 nr_pages = 0;
220 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot));
224 if (error)
225 break;
226 if (!(nr_pages % m))
227 printk("\b\b\b\b%3d%%", nr_pages / m);
228 nr_pages++;
229 }
230 } while (ret > 0);
231 if (!error)
232 printk("\b\b\b\bdone\n");
233 return error;
234}
235
236/**
237 * enough_swap - Make sure we have enough swap to save the image.
238 *
239 * Returns TRUE or FALSE after checking the total amount of swap
 240 * space available from the resume partition.
241 */
242
243static int enough_swap(unsigned int nr_pages)
244{
245 unsigned int free_swap = count_swap_pages(root_swap, 1);
246
247 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO +
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250}
251
252/**
253 * swsusp_write - Write entire image and metadata.
254 *
255 * It is important _NOT_ to umount filesystems at this point. We want
256 * them synced (in case something goes wrong) but we DO not want to mark
257 * filesystem clean: it is not. (And it does not matter, if we resume
258 * correctly, we'll mark system clean, anyway.)
259 */
260
261int swsusp_write(void)
262{
263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot;
265 struct swsusp_info *header;
266 unsigned long start;
267 int error;
268
269 if ((error = swsusp_swap_check())) {
270 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
271 return error;
272 }
273 memset(&snapshot, 0, sizeof(struct snapshot_handle));
274 error = snapshot_read_next(&snapshot, PAGE_SIZE);
275 if (error < PAGE_SIZE)
276 return error < 0 ? error : -EFAULT;
277 header = (struct swsusp_info *)data_of(snapshot);
278 if (!enough_swap(header->pages)) {
279 printk(KERN_ERR "swsusp: Not enough free swap\n");
280 return -ENOSPC;
281 }
282 error = get_swap_writer(&handle);
283 if (!error) {
284 start = handle.cur_swap;
285 error = swap_write_page(&handle, header);
286 }
287 if (!error)
288 error = save_image(&handle, &snapshot, header->pages - 1);
289 if (!error) {
290 flush_swap_writer(&handle);
291 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n");
294 }
295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap);
297 release_swap_writer(&handle);
298 return error;
299}
300
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
 304 * but it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 panic("I/O error reading memory image");
316 atomic_set(&io_done, 0);
317 return 0;
318}
319
320static struct block_device *resume_bdev;
321
322/**
323 * submit - submit BIO request.
324 * @rw: READ or WRITE.
 325 * @page_off: physical offset of page.
326 * @page: page we're reading or writing.
327 *
328 * Straight from the textbook - allocate and initialize the bio.
329 * If we're writing, make sure the page is marked as dirty.
330 * Then submit it and wait.
331 */
332
333static int submit(int rw, pgoff_t page_off, void *page)
334{
335 int error = 0;
336 struct bio *bio;
337
338 bio = bio_alloc(GFP_ATOMIC, 1);
339 if (!bio)
340 return -ENOMEM;
341 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
342 bio->bi_bdev = resume_bdev;
343 bio->bi_end_io = end_io;
344
345 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
346 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
347 error = -EFAULT;
348 goto Done;
349 }
350
351 atomic_set(&io_done, 1);
352 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
353 while (atomic_read(&io_done))
354 yield();
355 if (rw == READ)
356 bio_set_pages_dirty(bio);
357 Done:
358 bio_put(bio);
359 return error;
360}
361
362static int bio_read_page(pgoff_t page_off, void *page)
363{
364 return submit(READ, page_off, page);
365}
366
367static int bio_write_page(pgoff_t page_off, void *page)
368{
369 return submit(WRITE, page_off, page);
370}
371
372/**
373 * The following functions allow us to read data using a swap map
374 * in a file-alike way
375 */
376
377static void release_swap_reader(struct swap_map_handle *handle)
378{
379 if (handle->cur)
380 free_page((unsigned long)handle->cur);
381 handle->cur = NULL;
382}
383
384static int get_swap_reader(struct swap_map_handle *handle,
385 swp_entry_t start)
386{
387 int error;
388
389 if (!swp_offset(start))
390 return -EINVAL;
391 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
392 if (!handle->cur)
393 return -ENOMEM;
394 error = bio_read_page(swp_offset(start), handle->cur);
395 if (error) {
396 release_swap_reader(handle);
397 return error;
398 }
399 handle->k = 0;
400 return 0;
401}
402
403static int swap_read_page(struct swap_map_handle *handle, void *buf)
404{
405 unsigned long offset;
406 int error;
407
408 if (!handle->cur)
409 return -EINVAL;
410 offset = handle->cur->entries[handle->k];
411 if (!offset)
412 return -EFAULT;
413 error = bio_read_page(offset, buf);
414 if (error)
415 return error;
416 if (++handle->k >= MAP_PAGE_ENTRIES) {
417 handle->k = 0;
418 offset = handle->cur->next_swap;
419 if (!offset)
420 release_swap_reader(handle);
421 else
422 error = bio_read_page(offset, handle->cur);
423 }
424 return error;
425}
426
427/**
428 * load_image - load the image using the swap map handle
429 * @handle and the snapshot handle @snapshot
430 * (assume there are @nr_pages pages to load)
431 */
432
433static int load_image(struct swap_map_handle *handle,
434 struct snapshot_handle *snapshot,
435 unsigned int nr_pages)
436{
437 unsigned int m;
438 int ret;
439 int error = 0;
440
441 printk("Loading image data pages (%u pages) ... ", nr_pages);
442 m = nr_pages / 100;
443 if (!m)
444 m = 1;
445 nr_pages = 0;
446 do {
447 ret = snapshot_write_next(snapshot, PAGE_SIZE);
448 if (ret > 0) {
449 error = swap_read_page(handle, data_of(*snapshot));
450 if (error)
451 break;
452 if (!(nr_pages % m))
453 printk("\b\b\b\b%3d%%", nr_pages / m);
454 nr_pages++;
455 }
456 } while (ret > 0);
457 if (!error)
458 printk("\b\b\b\bdone\n");
459 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA;
461 return error;
462}
463
464int swsusp_read(void)
465{
466 int error;
467 struct swap_map_handle handle;
468 struct snapshot_handle snapshot;
469 struct swsusp_info *header;
470
471 if (IS_ERR(resume_bdev)) {
472 pr_debug("swsusp: block device not initialised\n");
473 return PTR_ERR(resume_bdev);
474 }
475
476 memset(&snapshot, 0, sizeof(struct snapshot_handle));
477 error = snapshot_write_next(&snapshot, PAGE_SIZE);
478 if (error < PAGE_SIZE)
479 return error < 0 ? error : -EFAULT;
480 header = (struct swsusp_info *)data_of(snapshot);
481 error = get_swap_reader(&handle, swsusp_header.image);
482 if (!error)
483 error = swap_read_page(&handle, header);
484 if (!error)
485 error = load_image(&handle, &snapshot, header->pages - 1);
486 release_swap_reader(&handle);
487
488 blkdev_put(resume_bdev);
489
490 if (!error)
491 pr_debug("swsusp: Reading resume file was successful\n");
492 else
493 pr_debug("swsusp: Error %d resuming\n", error);
494 return error;
495}
496
497/**
498 * swsusp_check - Check for swsusp signature in the resume device
499 */
500
501int swsusp_check(void)
502{
503 int error;
504
505 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
506 if (!IS_ERR(resume_bdev)) {
507 set_blocksize(resume_bdev, PAGE_SIZE);
508 memset(&swsusp_header, 0, sizeof(swsusp_header));
509 if ((error = bio_read_page(0, &swsusp_header)))
510 return error;
511 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
512 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
513 /* Reset swap signature now */
514 error = bio_write_page(0, &swsusp_header);
515 } else {
516 return -EINVAL;
517 }
518 if (error)
519 blkdev_put(resume_bdev);
520 else
521 pr_debug("swsusp: Signature found, resuming\n");
522 } else {
523 error = PTR_ERR(resume_bdev);
524 }
525
526 if (error)
527 pr_debug("swsusp: Error %d check for resume file\n", error);
528
529 return error;
530}
531
532/**
533 * swsusp_close - close swap device.
534 */
535
536void swsusp_close(void)
537{
538 if (IS_ERR(resume_bdev)) {
539 pr_debug("swsusp: block device not initialised\n");
540 return;
541 }
542
543 blkdev_put(resume_bdev);
544}
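
For a sense of scale of the swap map introduced above, each map page covers roughly 2 MB of image data with the usual page and word sizes; a short worked calculation under that assumption:

/* Assuming 4 KB pages and 8-byte longs (e.g. x86-64):
 *   MAP_PAGE_ENTRIES = PAGE_SIZE / sizeof(long) - 1 = 4096 / 8 - 1 = 511
 * so one swap_map_page indexes 511 image pages (about 2 MB), and the map
 * itself adds only one extra swap page per 511 data pages (~0.2% overhead).
 */
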
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d9d08f72f76..c4016cbbd3e0 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Rafael J. Wysocki <rjw@sisk.pl> 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added the swap map data structure and reworked the handling of swap 34 * Reworked the freeing of memory and the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */ 39 */
40 40
41#include <linux/module.h>
42#include <linux/mm.h> 41#include <linux/mm.h>
43#include <linux/suspend.h> 42#include <linux/suspend.h>
44#include <linux/smp_lock.h>
45#include <linux/file.h>
46#include <linux/utsname.h>
47#include <linux/version.h>
48#include <linux/delay.h>
49#include <linux/bitops.h>
50#include <linux/spinlock.h> 43#include <linux/spinlock.h>
51#include <linux/genhd.h>
52#include <linux/kernel.h> 44#include <linux/kernel.h>
53#include <linux/major.h> 45#include <linux/major.h>
54#include <linux/swap.h> 46#include <linux/swap.h>
55#include <linux/pm.h> 47#include <linux/pm.h>
56#include <linux/device.h>
57#include <linux/buffer_head.h>
58#include <linux/swapops.h> 48#include <linux/swapops.h>
59#include <linux/bootmem.h> 49#include <linux/bootmem.h>
60#include <linux/syscalls.h> 50#include <linux/syscalls.h>
61#include <linux/highmem.h> 51#include <linux/highmem.h>
62#include <linux/bio.h>
63
64#include <asm/uaccess.h>
65#include <asm/mmu_context.h>
66#include <asm/pgtable.h>
67#include <asm/tlbflush.h>
68#include <asm/io.h>
69 52
70#include "power.h" 53#include "power.h"
71 54
@@ -77,6 +60,8 @@
77 */ 60 */
78unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size = 500 * 1024 * 1024;
79 62
63int in_suspend __nosavedata = 0;
64
80#ifdef CONFIG_HIGHMEM 65#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void); 66unsigned int count_highmem_pages(void);
82int save_highmem(void); 67int save_highmem(void);
@@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; } 72static unsigned int count_highmem_pages(void) { return 0; }
88#endif 73#endif
89 74
90extern char resume_file[];
91
92#define SWSUSP_SIG "S1SUSPEND"
93
94static struct swsusp_header {
95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
96 swp_entry_t image;
97 char orig_sig[10];
98 char sig[10];
99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
100
101static struct swsusp_info swsusp_info;
102
103/*
104 * Saving part...
105 */
106
107static unsigned short root_swap = 0xffff;
108
109static int mark_swapfiles(swp_entry_t start)
110{
111 int error;
112
113 rw_swap_page_sync(READ,
114 swp_entry(root_swap, 0),
115 virt_to_page((unsigned long)&swsusp_header));
116 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
120 swsusp_header.image = start;
121 error = rw_swap_page_sync(WRITE,
122 swp_entry(root_swap, 0),
123 virt_to_page((unsigned long)
124 &swsusp_header));
125 } else {
126 pr_debug("swsusp: Partition is not swap space.\n");
127 error = -ENODEV;
128 }
129 return error;
130}
131
132/*
133 * Check whether the swap device is the specified resume
134 * device, irrespective of whether they are specified by
135 * identical names.
136 *
137 * (Thus, device inode aliasing is allowed. You can say /dev/hda4
138 * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
139 * and they'll be considered the same device. This is *necessary* for
140 * devfs, since the resume code can only recognize the form /dev/hda4,
141 * but the suspend code would see the long name.)
142 */
143static inline int is_resume_device(const struct swap_info_struct *swap_info)
144{
145 struct file *file = swap_info->swap_file;
146 struct inode *inode = file->f_dentry->d_inode;
147
148 return S_ISBLK(inode->i_mode) &&
149 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
150}
151
152static int swsusp_swap_check(void) /* This is called before saving image */
153{
154 int i;
155
156 spin_lock(&swap_lock);
157 for (i = 0; i < MAX_SWAPFILES; i++) {
158 if (!(swap_info[i].flags & SWP_WRITEOK))
159 continue;
160 if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
161 spin_unlock(&swap_lock);
162 root_swap = i;
163 return 0;
164 }
165 }
166 spin_unlock(&swap_lock);
167 return -ENODEV;
168}
169
170/**
171 * write_page - Write one page to a fresh swap location.
172 * @addr: Address we're writing.
173 * @loc: Place to store the entry we used.
174 *
175 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
176 * errors. That is an artifact left over from swsusp. It did not
177 * check the return of rw_swap_page_sync() at all, since most pages
178 * written back to swap would return -EIO.
179 * This is a partial improvement, since we will at least return other
180 * errors, though we need to eventually fix the damn code.
181 */
182static int write_page(unsigned long addr, swp_entry_t *loc)
183{
184 swp_entry_t entry;
185 int error = -ENOSPC;
186
187 entry = get_swap_page_of_type(root_swap);
188 if (swp_offset(entry)) {
189 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
190 if (!error || error == -EIO)
191 *loc = entry;
192 }
193 return error;
194}
195
196/** 75/**
 197 * Swap map-handling functions 76 * The following functions are used for tracking the allocated
198 * 77 * swap pages, so that they can be freed in case of an error.
199 * The swap map is a data structure used for keeping track of each page
200 * written to the swap. It consists of many swap_map_page structures
201 * that contain each an array of MAP_PAGE_SIZE swap entries.
202 * These structures are linked together with the help of either the
203 * .next (in memory) or the .next_swap (in swap) member.
204 * 78 *
205 * The swap map is created during suspend. At that time we need to keep 79 * The functions operate on a linked bitmap structure defined
206 * it in memory, because we have to free all of the allocated swap 80 * in power.h
207 * entries if an error occurs. The memory needed is preallocated
208 * so that we know in advance if there's enough of it.
209 *
210 * The first swap_map_page structure is filled with the swap entries that
211 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
212 * so on. After the all of the data pages have been written, the order
213 * of the swap_map_page structures in the map is reversed so that they
214 * can be read from swap in the original order. This causes the data
215 * pages to be loaded in exactly the same order in which they have been
216 * saved.
217 *
218 * During resume we only need to use one swap_map_page structure
219 * at a time, which means that we only need to use two memory pages for
220 * reading the image - one for reading the swap_map_page structures
221 * and the second for reading the data pages from swap.
222 */ 81 */
223 82
224#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ 83void free_bitmap(struct bitmap_page *bitmap)
225 / sizeof(swp_entry_t))
226
227struct swap_map_page {
228 swp_entry_t entries[MAP_PAGE_SIZE];
229 swp_entry_t next_swap;
230 struct swap_map_page *next;
231};
232
233static inline void free_swap_map(struct swap_map_page *swap_map)
234{ 84{
235 struct swap_map_page *swp; 85 struct bitmap_page *bp;
236 86
237 while (swap_map) { 87 while (bitmap) {
238 swp = swap_map->next; 88 bp = bitmap->next;
239 free_page((unsigned long)swap_map); 89 free_page((unsigned long)bitmap);
240 swap_map = swp; 90 bitmap = bp;
241 } 91 }
242} 92}
243 93
244static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) 94struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
245{ 95{
246 struct swap_map_page *swap_map, *swp; 96 struct bitmap_page *bitmap, *bp;
247 unsigned n = 0; 97 unsigned int n;
248 98
249 if (!nr_pages) 99 if (!nr_bits)
250 return NULL; 100 return NULL;
251 101
252 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); 102 bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
253 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 103 bp = bitmap;
254 swp = swap_map; 104 for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
255 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { 105 bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
256 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 106 bp = bp->next;
257 swp = swp->next; 107 if (!bp) {
258 if (!swp) { 108 free_bitmap(bitmap);
259 free_swap_map(swap_map);
260 return NULL; 109 return NULL;
261 } 110 }
262 } 111 }
263 return swap_map; 112 return bitmap;
264} 113}
265 114
266/** 115static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
267 * reverse_swap_map - reverse the order of pages in the swap map
268 * @swap_map
269 */
270
271static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
272{
273 struct swap_map_page *prev, *next;
274
275 prev = NULL;
276 while (swap_map) {
277 next = swap_map->next;
278 swap_map->next = prev;
279 prev = swap_map;
280 swap_map = next;
281 }
282 return prev;
283}
284
285/**
286 * free_swap_map_entries - free the swap entries allocated to store
287 * the swap map @swap_map (this is only called in case of an error)
288 */
289static inline void free_swap_map_entries(struct swap_map_page *swap_map)
290{
291 while (swap_map) {
292 if (swap_map->next_swap.val)
293 swap_free(swap_map->next_swap);
294 swap_map = swap_map->next;
295 }
296}
297
298/**
299 * save_swap_map - save the swap map used for tracing the data pages
300 * stored in the swap
301 */
302
303static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
304{
305 swp_entry_t entry = (swp_entry_t){0};
306 int error;
307
308 while (swap_map) {
309 swap_map->next_swap = entry;
310 if ((error = write_page((unsigned long)swap_map, &entry)))
311 return error;
312 swap_map = swap_map->next;
313 }
314 *start = entry;
315 return 0;
316}
317
318/**
319 * free_image_entries - free the swap entries allocated to store
320 * the image data pages (this is only called in case of an error)
321 */
322
323static inline void free_image_entries(struct swap_map_page *swp)
324{ 116{
325 unsigned k; 117 unsigned int n;
326 118
327 while (swp) { 119 n = BITMAP_PAGE_BITS;
328 for (k = 0; k < MAP_PAGE_SIZE; k++) 120 while (bitmap && n <= bit) {
329 if (swp->entries[k].val) 121 n += BITMAP_PAGE_BITS;
330 swap_free(swp->entries[k]); 122 bitmap = bitmap->next;
331 swp = swp->next;
332 } 123 }
333} 124 if (!bitmap)
334 125 return -EINVAL;
335/** 126 n -= BITMAP_PAGE_BITS;
336 * The swap_map_handle structure is used for handling the swap map in 127 bit -= n;
337 * a file-alike way 128 n = 0;
338 */ 129 while (bit >= BITS_PER_CHUNK) {
339 130 bit -= BITS_PER_CHUNK;
340struct swap_map_handle { 131 n++;
341 struct swap_map_page *cur;
342 unsigned int k;
343};
344
345static inline void init_swap_map_handle(struct swap_map_handle *handle,
346 struct swap_map_page *map)
347{
348 handle->cur = map;
349 handle->k = 0;
350}
351
352static inline int swap_map_write_page(struct swap_map_handle *handle,
353 unsigned long addr)
354{
355 int error;
356
357 error = write_page(addr, handle->cur->entries + handle->k);
358 if (error)
359 return error;
360 if (++handle->k >= MAP_PAGE_SIZE) {
361 handle->cur = handle->cur->next;
362 handle->k = 0;
363 } 132 }
133 bitmap->chunks[n] |= (1UL << bit);
364 return 0; 134 return 0;
365} 135}
366 136
367/** 137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
368 * save_image_data - save the data pages pointed to by the PBEs
369 * from the list @pblist using the swap map handle @handle
370 * (assume there are @nr_pages data pages to save)
371 */
372
373static int save_image_data(struct pbe *pblist,
374 struct swap_map_handle *handle,
375 unsigned int nr_pages)
376{
377 unsigned int m;
378 struct pbe *p;
379 int error = 0;
380
381 printk("Saving image data pages (%u pages) ... ", nr_pages);
382 m = nr_pages / 100;
383 if (!m)
384 m = 1;
385 nr_pages = 0;
386 for_each_pbe (p, pblist) {
387 error = swap_map_write_page(handle, p->address);
388 if (error)
389 break;
390 if (!(nr_pages % m))
391 printk("\b\b\b\b%3d%%", nr_pages / m);
392 nr_pages++;
393 }
394 if (!error)
395 printk("\b\b\b\bdone\n");
396 return error;
397}
398
399static void dump_info(void)
400{
401 pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
402 pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
403 pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
404 pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
405 pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
406 pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
407 pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
408 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
409 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
410 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
411 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
412}
413
414static void init_header(unsigned int nr_pages)
415{
416 memset(&swsusp_info, 0, sizeof(swsusp_info));
417 swsusp_info.version_code = LINUX_VERSION_CODE;
418 swsusp_info.num_physpages = num_physpages;
419 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
420
421 swsusp_info.cpus = num_online_cpus();
422 swsusp_info.image_pages = nr_pages;
423 swsusp_info.pages = nr_pages +
424 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
425}
426
427/**
428 * pack_orig_addresses - the .orig_address fields of the PBEs from the
429 * list starting at @pbe are stored in the array @buf[] (1 page)
430 */
431
432static inline struct pbe *pack_orig_addresses(unsigned long *buf,
433 struct pbe *pbe)
434{
435 int j;
436
437 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
438 buf[j] = pbe->orig_address;
439 pbe = pbe->next;
440 }
441 if (!pbe)
442 for (; j < PAGE_SIZE / sizeof(long); j++)
443 buf[j] = 0;
444 return pbe;
445}
446
447/**
448 * save_image_metadata - save the .orig_address fields of the PBEs
449 * from the list @pblist using the swap map handle @handle
450 */
451
452static int save_image_metadata(struct pbe *pblist,
453 struct swap_map_handle *handle)
454{ 138{
455 unsigned long *buf; 139 unsigned long offset;
456 unsigned int n = 0;
457 struct pbe *p;
458 int error = 0;
459 140
460 printk("Saving image metadata ... "); 141 offset = swp_offset(get_swap_page_of_type(swap));
461 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); 142 if (offset) {
462 if (!buf) 143 if (bitmap_set(bitmap, offset)) {
463 return -ENOMEM; 144 swap_free(swp_entry(swap, offset));
464 p = pblist; 145 offset = 0;
465 while (p) { 146 }
466 p = pack_orig_addresses(buf, p);
467 error = swap_map_write_page(handle, (unsigned long)buf);
468 if (error)
469 break;
470 n++;
471 } 147 }
472 free_page((unsigned long)buf); 148 return offset;
473 if (!error)
474 printk("done (%u pages saved)\n", n);
475 return error;
476} 149}
477 150
478/** 151void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
479 * enough_swap - Make sure we have enough swap to save the image.
480 *
481 * Returns TRUE or FALSE after checking the total amount of swap
482 * space available from the resume partition.
483 */
484
485static int enough_swap(unsigned int nr_pages)
486{ 152{
487 unsigned int free_swap = swap_info[root_swap].pages - 153 unsigned int bit, n;
488 swap_info[root_swap].inuse_pages; 154 unsigned long test;
489
490 pr_debug("swsusp: free swap pages: %u\n", free_swap);
491 return free_swap > (nr_pages + PAGES_FOR_IO +
492 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
493}
494 155
495/** 156 bit = 0;
496 * swsusp_write - Write entire image and metadata. 157 while (bitmap) {
497 * 158 for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
498 * It is important _NOT_ to umount filesystems at this point. We want 159 for (test = 1UL; test; test <<= 1) {
499 * them synced (in case something goes wrong) but we DO not want to mark 160 if (bitmap->chunks[n] & test)
500 * filesystem clean: it is not. (And it does not matter, if we resume 161 swap_free(swp_entry(swap, bit));
501 * correctly, we'll mark system clean, anyway.) 162 bit++;
502 */ 163 }
503 164 bitmap = bitmap->next;
504int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
505{
506 struct swap_map_page *swap_map;
507 struct swap_map_handle handle;
508 swp_entry_t start;
509 int error;
510
511 if ((error = swsusp_swap_check())) {
512 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
513 return error;
514 }
515 if (!enough_swap(nr_pages)) {
516 printk(KERN_ERR "swsusp: Not enough free swap\n");
517 return -ENOSPC;
518 } 165 }
519
520 init_header(nr_pages);
521 swap_map = alloc_swap_map(swsusp_info.pages);
522 if (!swap_map)
523 return -ENOMEM;
524 init_swap_map_handle(&handle, swap_map);
525
526 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
527 if (!error)
528 error = save_image_metadata(pblist, &handle);
529 if (!error)
530 error = save_image_data(pblist, &handle, nr_pages);
531 if (error)
532 goto Free_image_entries;
533
534 swap_map = reverse_swap_map(swap_map);
535 error = save_swap_map(swap_map, &start);
536 if (error)
537 goto Free_map_entries;
538
539 dump_info();
540 printk( "S" );
541 error = mark_swapfiles(start);
542 printk( "|\n" );
543 if (error)
544 goto Free_map_entries;
545
546Free_swap_map:
547 free_swap_map(swap_map);
548 return error;
549
550Free_map_entries:
551 free_swap_map_entries(swap_map);
552Free_image_entries:
553 free_image_entries(swap_map);
554 goto Free_swap_map;
555} 166}
556 167
557/** 168/**
@@ -660,379 +271,3 @@ int swsusp_resume(void)
660 local_irq_enable(); 271 local_irq_enable();
661 return error; 272 return error;
662} 273}
663
664/**
665 * mark_unsafe_pages - mark the pages that cannot be used for storing
666 * the image during resume, because they conflict with the pages that
667 * had been used before suspend
668 */
669
670static void mark_unsafe_pages(struct pbe *pblist)
671{
672 struct zone *zone;
673 unsigned long zone_pfn;
674 struct pbe *p;
675
676 if (!pblist) /* a sanity check */
677 return;
678
679 /* Clear page flags */
680 for_each_zone (zone) {
681 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
682 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
683 ClearPageNosaveFree(pfn_to_page(zone_pfn +
684 zone->zone_start_pfn));
685 }
686
687 /* Mark orig addresses */
688 for_each_pbe (p, pblist)
689 SetPageNosaveFree(virt_to_page(p->orig_address));
690
691}
692
693static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
694{
695 /* We assume both lists contain the same number of elements */
696 while (src) {
697 dst->orig_address = src->orig_address;
698 dst = dst->next;
699 src = src->next;
700 }
701}
702
703/*
704 * Using bio to read from swap.
705 * This code requires a bit more work than just using buffer heads
706 * but it is the recommended way for 2.5/2.6.
707 * The following are to signal the beginning and end of I/O. Bios
708 * finish asynchronously, while we want them to happen synchronously.
709 * A simple atomic_t, and a wait loop take care of this problem.
710 */
711
712static atomic_t io_done = ATOMIC_INIT(0);
713
714static int end_io(struct bio *bio, unsigned int num, int err)
715{
716 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
717 panic("I/O error reading memory image");
718 atomic_set(&io_done, 0);
719 return 0;
720}
721
722static struct block_device *resume_bdev;
723
724/**
725 * submit - submit BIO request.
726 * @rw: READ or WRITE.
727 * @off physical offset of page.
728 * @page: page we're reading or writing.
729 *
730 * Straight from the textbook - allocate and initialize the bio.
731 * If we're writing, make sure the page is marked as dirty.
732 * Then submit it and wait.
733 */
734
735static int submit(int rw, pgoff_t page_off, void *page)
736{
737 int error = 0;
738 struct bio *bio;
739
740 bio = bio_alloc(GFP_ATOMIC, 1);
741 if (!bio)
742 return -ENOMEM;
743 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
744 bio->bi_bdev = resume_bdev;
745 bio->bi_end_io = end_io;
746
747 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
748 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
749 error = -EFAULT;
750 goto Done;
751 }
752
753
754 atomic_set(&io_done, 1);
755 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
756 while (atomic_read(&io_done))
757 yield();
758 if (rw == READ)
759 bio_set_pages_dirty(bio);
760 Done:
761 bio_put(bio);
762 return error;
763}
764
765static int bio_read_page(pgoff_t page_off, void *page)
766{
767 return submit(READ, page_off, page);
768}
769
770static int bio_write_page(pgoff_t page_off, void *page)
771{
772 return submit(WRITE, page_off, page);
773}
774
775/**
776 * The following functions allow us to read data using a swap map
777 * in a file-like way
778 */
779
780static inline void release_swap_map_reader(struct swap_map_handle *handle)
781{
782 if (handle->cur)
783 free_page((unsigned long)handle->cur);
784 handle->cur = NULL;
785}
786
787static inline int get_swap_map_reader(struct swap_map_handle *handle,
788 swp_entry_t start)
789{
790 int error;
791
792 if (!swp_offset(start))
793 return -EINVAL;
794 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
795 if (!handle->cur)
796 return -ENOMEM;
797 error = bio_read_page(swp_offset(start), handle->cur);
798 if (error) {
799 release_swap_map_reader(handle);
800 return error;
801 }
802 handle->k = 0;
803 return 0;
804}
805
806static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
807{
808 unsigned long offset;
809 int error;
810
811 if (!handle->cur)
812 return -EINVAL;
813 offset = swp_offset(handle->cur->entries[handle->k]);
814 if (!offset)
815 return -EINVAL;
816 error = bio_read_page(offset, buf);
817 if (error)
818 return error;
819 if (++handle->k >= MAP_PAGE_SIZE) {
820 handle->k = 0;
821 offset = swp_offset(handle->cur->next_swap);
822 if (!offset)
823 release_swap_map_reader(handle);
824 else
825 error = bio_read_page(offset, handle->cur);
826 }
827 return error;
828}
829
830static int check_header(void)
831{
832 char *reason = NULL;
833
834 dump_info();
835 if (swsusp_info.version_code != LINUX_VERSION_CODE)
836 reason = "kernel version";
837 if (swsusp_info.num_physpages != num_physpages)
838 reason = "memory size";
839 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
840 reason = "system type";
841 if (strcmp(swsusp_info.uts.release,system_utsname.release))
842 reason = "kernel release";
843 if (strcmp(swsusp_info.uts.version,system_utsname.version))
844 reason = "version";
845 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
846 reason = "machine";
847 if (reason) {
848 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
849 return -EPERM;
850 }
851 return 0;
852}
853
854/**
855 * load_image_data - load the image data using the swap map handle
856 * @handle and store them using the page backup list @pblist
857 * (assume there are @nr_pages pages to load)
858 */
859
860static int load_image_data(struct pbe *pblist,
861 struct swap_map_handle *handle,
862 unsigned int nr_pages)
863{
864 int error;
865 unsigned int m;
866 struct pbe *p;
867
868 if (!pblist)
869 return -EINVAL;
870 printk("Loading image data pages (%u pages) ... ", nr_pages);
871 m = nr_pages / 100;
872 if (!m)
873 m = 1;
874 nr_pages = 0;
875 p = pblist;
876 while (p) {
877 error = swap_map_read_page(handle, (void *)p->address);
878 if (error)
879 break;
880 p = p->next;
881 if (!(nr_pages % m))
882 printk("\b\b\b\b%3d%%", nr_pages / m);
883 nr_pages++;
884 }
885 if (!error)
886 printk("\b\b\b\bdone\n");
887 return error;
888}
889
890/**
891 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
892 * the PBEs in the list starting at @pbe
893 */
894
895static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
896 struct pbe *pbe)
897{
898 int j;
899
900 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
901 pbe->orig_address = buf[j];
902 pbe = pbe->next;
903 }
904 return pbe;
905}
906
907/**
908 * load_image_metadata - load the image metadata using the swap map
909 * handle @handle and put them into the PBEs in the list @pblist
910 */
911
912static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
913{
914 struct pbe *p;
915 unsigned long *buf;
916 unsigned int n = 0;
917 int error = 0;
918
919 printk("Loading image metadata ... ");
920 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
921 if (!buf)
922 return -ENOMEM;
923 p = pblist;
924 while (p) {
925 error = swap_map_read_page(handle, buf);
926 if (error)
927 break;
928 p = unpack_orig_addresses(buf, p);
929 n++;
930 }
931 free_page((unsigned long)buf);
932 if (!error)
933 printk("done (%u pages loaded)\n", n);
934 return error;
935}
936
937int swsusp_read(struct pbe **pblist_ptr)
938{
939 int error;
940 struct pbe *p, *pblist;
941 struct swap_map_handle handle;
942 unsigned int nr_pages;
943
944 if (IS_ERR(resume_bdev)) {
945 pr_debug("swsusp: block device not initialised\n");
946 return PTR_ERR(resume_bdev);
947 }
948
949 error = get_swap_map_reader(&handle, swsusp_header.image);
950 if (!error)
951 error = swap_map_read_page(&handle, &swsusp_info);
952 if (!error)
953 error = check_header();
954 if (error)
955 return error;
956 nr_pages = swsusp_info.image_pages;
957 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
958 if (!p)
959 return -ENOMEM;
960 error = load_image_metadata(p, &handle);
961 if (!error) {
962 mark_unsafe_pages(p);
963 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
964 if (pblist)
965 copy_page_backup_list(pblist, p);
966 free_pagedir(p);
967 if (!pblist)
968 error = -ENOMEM;
969
970 /* Allocate memory for the image and read the data from swap */
971 if (!error)
972 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
973 if (!error) {
974 release_eaten_pages();
975 error = load_image_data(pblist, &handle, nr_pages);
976 }
977 if (!error)
978 *pblist_ptr = pblist;
979 }
980 release_swap_map_reader(&handle);
981
982 blkdev_put(resume_bdev);
983
984 if (!error)
985 pr_debug("swsusp: Reading resume file was successful\n");
986 else
987 pr_debug("swsusp: Error %d resuming\n", error);
988 return error;
989}
990
991/**
992 * swsusp_check - Check for swsusp signature in the resume device
993 */
994
995int swsusp_check(void)
996{
997 int error;
998
999 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1000 if (!IS_ERR(resume_bdev)) {
1001 set_blocksize(resume_bdev, PAGE_SIZE);
1002 memset(&swsusp_header, 0, sizeof(swsusp_header));
1003 if ((error = bio_read_page(0, &swsusp_header)))
1004 return error;
1005 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1006 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1007 /* Reset swap signature now */
1008 error = bio_write_page(0, &swsusp_header);
1009 } else {
1010 return -EINVAL;
1011 }
1012 if (error)
1013 blkdev_put(resume_bdev);
1014 else
1015 pr_debug("swsusp: Signature found, resuming\n");
1016 } else {
1017 error = PTR_ERR(resume_bdev);
1018 }
1019
1020 if (error)
1021 pr_debug("swsusp: Error %d check for resume file\n", error);
1022
1023 return error;
1024}
1025
1026/**
1027 * swsusp_close - close swap device.
1028 */
1029
1030void swsusp_close(void)
1031{
1032 if (IS_ERR(resume_bdev)) {
1033 pr_debug("swsusp: block device not initialised\n");
1034 return;
1035 }
1036
1037 blkdev_put(resume_bdev);
1038}
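
The bitmap helpers added on the right-hand side above (alloc_bitmap, bitmap_set, alloc_swap_page, free_all_swap_pages) keep a chained, page-sized bitmap of the swap offsets handed out for the image, so that every offset can be released again on error or when the snapshot device is closed. A minimal, self-contained user-space sketch of the same chained-bitmap idea follows; the node size, constants and helper names are chosen for the example and are not the kernel's definitions.

        /* Illustrative sketch of a chained bitmap like the one introduced above.
         * NODE_BITS, the struct layout and the helper names are assumptions
         * made for this example only. */
        #include <stdio.h>
        #include <stdlib.h>

        #define NODE_BITS   (4096u * 8u)                    /* bits tracked per node */
        #define CHUNK_BITS  (sizeof(unsigned long) * 8)
        #define NODE_CHUNKS (NODE_BITS / CHUNK_BITS)

        struct bitmap_node {
                struct bitmap_node *next;
                unsigned long chunks[NODE_CHUNKS];
        };

        static struct bitmap_node *bitmap_alloc(unsigned long nr_bits)
        {
                struct bitmap_node *head, *bp;
                unsigned long n;

                if (!nr_bits)
                        return NULL;
                head = calloc(1, sizeof(*head));
                bp = head;
                for (n = NODE_BITS; bp && n < nr_bits; n += NODE_BITS) {
                        bp->next = calloc(1, sizeof(*bp)); /* zeroed, like get_zeroed_page() */
                        bp = bp->next;
                }
                return head;
        }

        static int bitmap_set_bit(struct bitmap_node *bp, unsigned long bit)
        {
                /* Skip whole nodes first, then set the bit inside the right chunk. */
                while (bp && bit >= NODE_BITS) {
                        bit -= NODE_BITS;
                        bp = bp->next;
                }
                if (!bp)
                        return -1;                          /* offset not covered */
                bp->chunks[bit / CHUNK_BITS] |= 1UL << (bit % CHUNK_BITS);
                return 0;
        }

        int main(void)
        {
                struct bitmap_node *map = bitmap_alloc(3 * NODE_BITS);

                printf("bit in range:     %d\n", bitmap_set_bit(map, 40000));
                printf("bit out of range: %d\n", bitmap_set_bit(map, 10UL * NODE_BITS));
                return 0;
        }

Walking node by node keeps each allocation a single page, which is why bitmap_set() in the hunk above first skips whole BITMAP_PAGE_BITS-sized nodes and only then indexes into the chunks[] array; the kernel version additionally frees the partial chain and returns NULL if an allocation fails part-way through.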
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 000000000000..3f1539fbe48a
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,333 @@
1/*
2 * linux/kernel/power/user.c
3 *
4 * This file provides the user space interface for software suspend/resume.
5 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */
11
12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/string.h>
15#include <linux/device.h>
16#include <linux/miscdevice.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/swapops.h>
20#include <linux/pm.h>
21#include <linux/fs.h>
22
23#include <asm/uaccess.h>
24
25#include "power.h"
26
27#define SNAPSHOT_MINOR 231
28
29static struct snapshot_data {
30 struct snapshot_handle handle;
31 int swap;
32 struct bitmap_page *bitmap;
33 int mode;
34 char frozen;
35 char ready;
36} snapshot_state;
37
38static atomic_t device_available = ATOMIC_INIT(1);
39
40static int snapshot_open(struct inode *inode, struct file *filp)
41{
42 struct snapshot_data *data;
43
44 if (!atomic_add_unless(&device_available, -1, 0))
45 return -EBUSY;
46
47 if ((filp->f_flags & O_ACCMODE) == O_RDWR)
48 return -ENOSYS;
49
50 nonseekable_open(inode, filp);
51 data = &snapshot_state;
52 filp->private_data = data;
53 memset(&data->handle, 0, sizeof(struct snapshot_handle));
54 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
55 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
56 data->mode = O_RDONLY;
57 } else {
58 data->swap = -1;
59 data->mode = O_WRONLY;
60 }
61 data->bitmap = NULL;
62 data->frozen = 0;
63 data->ready = 0;
64
65 return 0;
66}
67
68static int snapshot_release(struct inode *inode, struct file *filp)
69{
70 struct snapshot_data *data;
71
72 swsusp_free();
73 data = filp->private_data;
74 free_all_swap_pages(data->swap, data->bitmap);
75 free_bitmap(data->bitmap);
76 if (data->frozen) {
77 down(&pm_sem);
78 thaw_processes();
79 enable_nonboot_cpus();
80 up(&pm_sem);
81 }
82 atomic_inc(&device_available);
83 return 0;
84}
85
86static ssize_t snapshot_read(struct file *filp, char __user *buf,
87 size_t count, loff_t *offp)
88{
89 struct snapshot_data *data;
90 ssize_t res;
91
92 data = filp->private_data;
93 res = snapshot_read_next(&data->handle, count);
94 if (res > 0) {
95 if (copy_to_user(buf, data_of(data->handle), res))
96 res = -EFAULT;
97 else
98 *offp = data->handle.offset;
99 }
100 return res;
101}
102
103static ssize_t snapshot_write(struct file *filp, const char __user *buf,
104 size_t count, loff_t *offp)
105{
106 struct snapshot_data *data;
107 ssize_t res;
108
109 data = filp->private_data;
110 res = snapshot_write_next(&data->handle, count);
111 if (res > 0) {
112 if (copy_from_user(data_of(data->handle), buf, res))
113 res = -EFAULT;
114 else
115 *offp = data->handle.offset;
116 }
117 return res;
118}
119
120static int snapshot_ioctl(struct inode *inode, struct file *filp,
121 unsigned int cmd, unsigned long arg)
122{
123 int error = 0;
124 struct snapshot_data *data;
125 loff_t offset, avail;
126
127 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
128 return -ENOTTY;
129 if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
130 return -ENOTTY;
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133
134 data = filp->private_data;
135
136 switch (cmd) {
137
138 case SNAPSHOT_FREEZE:
139 if (data->frozen)
140 break;
141 down(&pm_sem);
142 disable_nonboot_cpus();
143 if (freeze_processes()) {
144 thaw_processes();
145 enable_nonboot_cpus();
146 error = -EBUSY;
147 }
148 up(&pm_sem);
149 if (!error)
150 data->frozen = 1;
151 break;
152
153 case SNAPSHOT_UNFREEZE:
154 if (!data->frozen)
155 break;
156 down(&pm_sem);
157 thaw_processes();
158 enable_nonboot_cpus();
159 up(&pm_sem);
160 data->frozen = 0;
161 break;
162
163 case SNAPSHOT_ATOMIC_SNAPSHOT:
164 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
165 error = -EPERM;
166 break;
167 }
168 down(&pm_sem);
169 /* Free memory before shutting down devices. */
170 error = swsusp_shrink_memory();
171 if (!error) {
172 error = device_suspend(PMSG_FREEZE);
173 if (!error) {
174 in_suspend = 1;
175 error = swsusp_suspend();
176 device_resume();
177 }
178 }
179 up(&pm_sem);
180 if (!error)
181 error = put_user(in_suspend, (unsigned int __user *)arg);
182 if (!error)
183 data->ready = 1;
184 break;
185
186 case SNAPSHOT_ATOMIC_RESTORE:
187 if (data->mode != O_WRONLY || !data->frozen ||
188 !snapshot_image_loaded(&data->handle)) {
189 error = -EPERM;
190 break;
191 }
192 down(&pm_sem);
193 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE);
195 if (!error) {
196 error = swsusp_resume();
197 device_resume();
198 }
199 pm_restore_console();
200 up(&pm_sem);
201 break;
202
203 case SNAPSHOT_FREE:
204 swsusp_free();
205 memset(&data->handle, 0, sizeof(struct snapshot_handle));
206 data->ready = 0;
207 break;
208
209 case SNAPSHOT_SET_IMAGE_SIZE:
210 image_size = arg;
211 break;
212
213 case SNAPSHOT_AVAIL_SWAP:
214 avail = count_swap_pages(data->swap, 1);
215 avail <<= PAGE_SHIFT;
216 error = put_user(avail, (loff_t __user *)arg);
217 break;
218
219 case SNAPSHOT_GET_SWAP_PAGE:
220 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
221 error = -ENODEV;
222 break;
223 }
224 if (!data->bitmap) {
225 data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
226 if (!data->bitmap) {
227 error = -ENOMEM;
228 break;
229 }
230 }
231 offset = alloc_swap_page(data->swap, data->bitmap);
232 if (offset) {
233 offset <<= PAGE_SHIFT;
234 error = put_user(offset, (loff_t __user *)arg);
235 } else {
236 error = -ENOSPC;
237 }
238 break;
239
240 case SNAPSHOT_FREE_SWAP_PAGES:
241 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
242 error = -ENODEV;
243 break;
244 }
245 free_all_swap_pages(data->swap, data->bitmap);
246 free_bitmap(data->bitmap);
247 data->bitmap = NULL;
248 break;
249
250 case SNAPSHOT_SET_SWAP_FILE:
251 if (!data->bitmap) {
252 /*
253 * User space encodes device types as two-byte values,
254 * so we need to recode them
255 */
256 if (old_decode_dev(arg)) {
257 data->swap = swap_type_of(old_decode_dev(arg));
258 if (data->swap < 0)
259 error = -ENODEV;
260 } else {
261 data->swap = -1;
262 error = -EINVAL;
263 }
264 } else {
265 error = -EPERM;
266 }
267 break;
268
269 case SNAPSHOT_S2RAM:
270 if (!data->frozen) {
271 error = -EPERM;
272 break;
273 }
274
275 if (down_trylock(&pm_sem)) {
276 error = -EBUSY;
277 break;
278 }
279
280 if (pm_ops->prepare) {
281 error = pm_ops->prepare(PM_SUSPEND_MEM);
282 if (error)
283 goto OutS3;
284 }
285
286 /* Put devices to sleep */
287 error = device_suspend(PMSG_SUSPEND);
288 if (error) {
289 printk(KERN_ERR "Failed to suspend some devices.\n");
290 } else {
291 /* Enter S3, system is already frozen */
292 suspend_enter(PM_SUSPEND_MEM);
293
294 /* Wake up devices */
295 device_resume();
296 }
297
298 if (pm_ops->finish)
299 pm_ops->finish(PM_SUSPEND_MEM);
300
301OutS3:
302 up(&pm_sem);
303 break;
304
305 default:
306 error = -ENOTTY;
307
308 }
309
310 return error;
311}
312
313static struct file_operations snapshot_fops = {
314 .open = snapshot_open,
315 .release = snapshot_release,
316 .read = snapshot_read,
317 .write = snapshot_write,
318 .llseek = no_llseek,
319 .ioctl = snapshot_ioctl,
320};
321
322static struct miscdevice snapshot_device = {
323 .minor = SNAPSHOT_MINOR,
324 .name = "snapshot",
325 .fops = &snapshot_fops,
326};
327
328static int __init snapshot_device_init(void)
329{
330 return misc_register(&snapshot_device);
331};
332
333device_initcall(snapshot_device_init);
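
snapshot_open() above allows only one opener at a time: device_available starts at 1, atomic_add_unless(&device_available, -1, 0) claims the slot only while the counter is still non-zero, and snapshot_release() hands it back with atomic_inc(). A hedged user-space sketch of that single-open guard, using C11 atomics in place of the kernel's atomic_t; names here are illustrative only.

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static atomic_int device_available = 1;

        /* User-space equivalent of atomic_add_unless(&v, -1, 0). */
        static bool take_device(void)
        {
                int old = atomic_load(&device_available);

                while (old != 0) {
                        if (atomic_compare_exchange_weak(&device_available, &old, old - 1))
                                return true;        /* we got the single slot */
                }
                return false;                       /* already taken */
        }

        static void release_device(void)
        {
                atomic_fetch_add(&device_available, 1);   /* kernel: atomic_inc() */
        }

        int main(void)
        {
                printf("first open:  %s\n", take_device() ? "ok" : "busy");
                printf("second open: %s\n", take_device() ? "ok" : "busy");
                release_device();
                printf("after close: %s\n", take_device() ? "ok" : "busy");
                return 0;
        }

Returning false here corresponds to the -EBUSY the driver gives a second open() while the first holder has not yet released the device.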
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d704..ad81f799a9b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/mutex.h>
26#include <asm/sections.h> 27#include <asm/sections.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
46static DEFINE_PER_CPU(int, cpu_profile_flip); 47static DEFINE_PER_CPU(int, cpu_profile_flip);
47static DECLARE_MUTEX(profile_flip_mutex); 48static DEFINE_MUTEX(profile_flip_mutex);
48#endif /* CONFIG_SMP */ 49#endif /* CONFIG_SMP */
49 50
50static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
@@ -243,7 +244,7 @@ static void profile_flip_buffers(void)
243{ 244{
244 int i, j, cpu; 245 int i, j, cpu;
245 246
246 down(&profile_flip_mutex); 247 mutex_lock(&profile_flip_mutex);
247 j = per_cpu(cpu_profile_flip, get_cpu()); 248 j = per_cpu(cpu_profile_flip, get_cpu());
248 put_cpu(); 249 put_cpu();
249 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 250 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +260,14 @@ static void profile_flip_buffers(void)
259 hits[i].hits = hits[i].pc = 0; 260 hits[i].hits = hits[i].pc = 0;
260 } 261 }
261 } 262 }
262 up(&profile_flip_mutex); 263 mutex_unlock(&profile_flip_mutex);
263} 264}
264 265
265static void profile_discard_flip_buffers(void) 266static void profile_discard_flip_buffers(void)
266{ 267{
267 int i, cpu; 268 int i, cpu;
268 269
269 down(&profile_flip_mutex); 270 mutex_lock(&profile_flip_mutex);
270 i = per_cpu(cpu_profile_flip, get_cpu()); 271 i = per_cpu(cpu_profile_flip, get_cpu());
271 put_cpu(); 272 put_cpu();
272 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 273 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +275,7 @@ static void profile_discard_flip_buffers(void)
274 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 275 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
275 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 276 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
276 } 277 }
277 up(&profile_flip_mutex); 278 mutex_unlock(&profile_flip_mutex);
278} 279}
279 280
280void profile_hit(int type, void *__pc) 281void profile_hit(int type, void *__pc)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index fedf5e369755..6df1559b1c02 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,15 +47,16 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/mutex.h>
50 51
51/* Definition for rcupdate control block. */ 52/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
53 .cur = -300, 54 .cur = -300,
54 .completed = -300, 55 .completed = -300,
55 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = SPIN_LOCK_UNLOCKED,
56 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
57}; 58};
58struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 .cur = -300, 60 .cur = -300,
60 .completed = -300, 61 .completed = -300,
61 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = SPIN_LOCK_UNLOCKED,
@@ -75,7 +76,7 @@ static int rsinterval = 1000;
75#endif 76#endif
76 77
77static atomic_t rcu_barrier_cpu_count; 78static atomic_t rcu_barrier_cpu_count;
78static struct semaphore rcu_barrier_sema; 79static DEFINE_MUTEX(rcu_barrier_mutex);
79static struct completion rcu_barrier_completion; 80static struct completion rcu_barrier_completion;
80 81
81#ifdef CONFIG_SMP 82#ifdef CONFIG_SMP
@@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused)
207void rcu_barrier(void) 208void rcu_barrier(void)
208{ 209{
209 BUG_ON(in_interrupt()); 210 BUG_ON(in_interrupt());
210 /* Take cpucontrol semaphore to protect against CPU hotplug */ 211 /* Take cpucontrol mutex to protect against CPU hotplug */
211 down(&rcu_barrier_sema); 212 mutex_lock(&rcu_barrier_mutex);
212 init_completion(&rcu_barrier_completion); 213 init_completion(&rcu_barrier_completion);
213 atomic_set(&rcu_barrier_cpu_count, 0); 214 atomic_set(&rcu_barrier_cpu_count, 0);
214 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 215 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
215 wait_for_completion(&rcu_barrier_completion); 216 wait_for_completion(&rcu_barrier_completion);
216 up(&rcu_barrier_sema); 217 mutex_unlock(&rcu_barrier_mutex);
217} 218}
218EXPORT_SYMBOL_GPL(rcu_barrier); 219EXPORT_SYMBOL_GPL(rcu_barrier);
219 220
@@ -549,7 +550,6 @@ static struct notifier_block __devinitdata rcu_nb = {
549 */ 550 */
550void __init rcu_init(void) 551void __init rcu_init(void)
551{ 552{
552 sema_init(&rcu_barrier_sema, 1);
553 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 553 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
554 (void *)(long)smp_processor_id()); 554 (void *)(long)smp_processor_id());
555 /* Register notifier for non-boot CPUs */ 555 /* Register notifier for non-boot CPUs */
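
The rcupdate.c change above replaces a semaphore used as a mutex (rcu_barrier_sema, set up at runtime with sema_init() in rcu_init()) with a statically initialised mutex, which is why the init call disappears from rcu_init(). A user-space analogue of that statically initialised lock pattern with pthreads, shown only to illustrate why no setup call is needed; the names are invented for the example.

        #include <pthread.h>
        #include <stdio.h>

        /* Static initialisation: usable from the first call, no init function. */
        static pthread_mutex_t barrier_mutex = PTHREAD_MUTEX_INITIALIZER;

        static void barrier_section(const char *who)
        {
                pthread_mutex_lock(&barrier_mutex);     /* kernel: mutex_lock()   */
                printf("%s holds barrier_mutex\n", who);
                pthread_mutex_unlock(&barrier_mutex);   /* kernel: mutex_unlock() */
        }

        int main(void)
        {
                barrier_section("rcu_barrier caller");
                return 0;
        }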
diff --git a/kernel/sched.c b/kernel/sched.c
index 6b6e0d70eb30..7ffaabd64f89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -237,6 +237,7 @@ struct runqueue {
237 237
238 task_t *migration_thread; 238 task_t *migration_thread;
239 struct list_head migration_queue; 239 struct list_head migration_queue;
240 int cpu;
240#endif 241#endif
241 242
242#ifdef CONFIG_SCHEDSTATS 243#ifdef CONFIG_SCHEDSTATS
@@ -1654,6 +1655,9 @@ unsigned long nr_iowait(void)
1654/* 1655/*
1655 * double_rq_lock - safely lock two runqueues 1656 * double_rq_lock - safely lock two runqueues
1656 * 1657 *
1658 * We must take them in cpu order to match code in
1659 * dependent_sleeper and wake_dependent_sleeper.
1660 *
1657 * Note this does not disable interrupts like task_rq_lock, 1661 * Note this does not disable interrupts like task_rq_lock,
1658 * you need to do so manually before calling. 1662 * you need to do so manually before calling.
1659 */ 1663 */
@@ -1665,7 +1669,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1665 spin_lock(&rq1->lock); 1669 spin_lock(&rq1->lock);
1666 __acquire(rq2->lock); /* Fake it out ;) */ 1670 __acquire(rq2->lock); /* Fake it out ;) */
1667 } else { 1671 } else {
1668 if (rq1 < rq2) { 1672 if (rq1->cpu < rq2->cpu) {
1669 spin_lock(&rq1->lock); 1673 spin_lock(&rq1->lock);
1670 spin_lock(&rq2->lock); 1674 spin_lock(&rq2->lock);
1671 } else { 1675 } else {
@@ -1701,7 +1705,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1701 __acquires(this_rq->lock) 1705 __acquires(this_rq->lock)
1702{ 1706{
1703 if (unlikely(!spin_trylock(&busiest->lock))) { 1707 if (unlikely(!spin_trylock(&busiest->lock))) {
1704 if (busiest < this_rq) { 1708 if (busiest->cpu < this_rq->cpu) {
1705 spin_unlock(&this_rq->lock); 1709 spin_unlock(&this_rq->lock);
1706 spin_lock(&busiest->lock); 1710 spin_lock(&busiest->lock);
1707 spin_lock(&this_rq->lock); 1711 spin_lock(&this_rq->lock);
@@ -2869,7 +2873,7 @@ asmlinkage void __sched schedule(void)
2869 */ 2873 */
2870 if (likely(!current->exit_state)) { 2874 if (likely(!current->exit_state)) {
2871 if (unlikely(in_atomic())) { 2875 if (unlikely(in_atomic())) {
2872 printk(KERN_ERR "scheduling while atomic: " 2876 printk(KERN_ERR "BUG: scheduling while atomic: "
2873 "%s/0x%08x/%d\n", 2877 "%s/0x%08x/%d\n",
2874 current->comm, preempt_count(), current->pid); 2878 current->comm, preempt_count(), current->pid);
2875 dump_stack(); 2879 dump_stack();
@@ -6029,6 +6033,7 @@ void __init sched_init(void)
6029 rq->push_cpu = 0; 6033 rq->push_cpu = 0;
6030 rq->migration_thread = NULL; 6034 rq->migration_thread = NULL;
6031 INIT_LIST_HEAD(&rq->migration_queue); 6035 INIT_LIST_HEAD(&rq->migration_queue);
6036 rq->cpu = i;
6032#endif 6037#endif
6033 atomic_set(&rq->nr_iowait, 0); 6038 atomic_set(&rq->nr_iowait, 0);
6034 6039
@@ -6069,7 +6074,7 @@ void __might_sleep(char *file, int line)
6069 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6074 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6070 return; 6075 return;
6071 prev_jiffy = jiffies; 6076 prev_jiffy = jiffies;
6072 printk(KERN_ERR "Debug: sleeping function called from invalid" 6077 printk(KERN_ERR "BUG: sleeping function called from invalid"
6073 " context at %s:%d\n", file, line); 6078 " context at %s:%d\n", file, line);
6074 printk("in_atomic():%d, irqs_disabled():%d\n", 6079 printk("in_atomic():%d, irqs_disabled():%d\n",
6075 in_atomic(), irqs_disabled()); 6080 in_atomic(), irqs_disabled());
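
The double_rq_lock() and double_lock_balance() hunks above switch the lock-ordering rule from pointer comparison to the new rq->cpu field, so that both paths and dependent_sleeper()/wake_dependent_sleeper() agree on one acquisition order and cannot deadlock against each other. A small user-space sketch of ordering a lock pair by a stable index; the struct and helper names are illustrative, not kernel code.

        #include <pthread.h>
        #include <stdio.h>

        struct runqueue {
                int cpu;                       /* stable index used for ordering */
                pthread_mutex_t lock;
        };

        static void lock_pair(struct runqueue *a, struct runqueue *b)
        {
                if (a == b) {
                        pthread_mutex_lock(&a->lock);
                        return;
                }
                /* Lower cpu index first, regardless of argument order. */
                if (a->cpu > b->cpu) {
                        struct runqueue *tmp = a; a = b; b = tmp;
                }
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        }

        static void unlock_pair(struct runqueue *a, struct runqueue *b)
        {
                pthread_mutex_unlock(&a->lock);
                if (a != b)
                        pthread_mutex_unlock(&b->lock);
        }

        int main(void)
        {
                struct runqueue rq0 = { 0, PTHREAD_MUTEX_INITIALIZER };
                struct runqueue rq1 = { 1, PTHREAD_MUTEX_INITIALIZER };

                lock_pair(&rq1, &rq0);   /* same order as lock_pair(&rq0, &rq1) */
                printf("both runqueue locks held\n");
                unlock_pair(&rq1, &rq0);
                return 0;
        }

Two threads calling lock_pair() with the arguments swapped still take rq0's lock first, which is exactly the property the comment added to double_rq_lock() calls out.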
diff --git a/kernel/signal.c b/kernel/signal.c
index ea154104a00b..75f7341b0c39 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1922,6 +1922,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1922 sigset_t *mask = &current->blocked; 1922 sigset_t *mask = &current->blocked;
1923 int signr = 0; 1923 int signr = 0;
1924 1924
1925 try_to_freeze();
1926
1925relock: 1927relock:
1926 spin_lock_irq(&current->sighand->siglock); 1928 spin_lock_irq(&current->sighand->siglock);
1927 for (;;) { 1929 for (;;) {
@@ -2099,10 +2101,11 @@ long do_no_restart_syscall(struct restart_block *param)
2099int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2101int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2100{ 2102{
2101 int error; 2103 int error;
2102 sigset_t old_block;
2103 2104
2104 spin_lock_irq(&current->sighand->siglock); 2105 spin_lock_irq(&current->sighand->siglock);
2105 old_block = current->blocked; 2106 if (oldset)
2107 *oldset = current->blocked;
2108
2106 error = 0; 2109 error = 0;
2107 switch (how) { 2110 switch (how) {
2108 case SIG_BLOCK: 2111 case SIG_BLOCK:
@@ -2119,8 +2122,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2119 } 2122 }
2120 recalc_sigpending(); 2123 recalc_sigpending();
2121 spin_unlock_irq(&current->sighand->siglock); 2124 spin_unlock_irq(&current->sighand->siglock);
2122 if (oldset) 2125
2123 *oldset = old_block;
2124 return error; 2126 return error;
2125} 2127}
2126 2128
@@ -2307,7 +2309,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2307 2309
2308 timeout = schedule_timeout_interruptible(timeout); 2310 timeout = schedule_timeout_interruptible(timeout);
2309 2311
2310 try_to_freeze();
2311 spin_lock_irq(&current->sighand->siglock); 2312 spin_lock_irq(&current->sighand->siglock);
2312 sig = dequeue_signal(current, &these, &info); 2313 sig = dequeue_signal(current, &these, &info);
2313 current->blocked = current->real_blocked; 2314 current->blocked = current->real_blocked;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd5921d..d1b810782bc4 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
179#define BUILD_LOCK_OPS(op, locktype) \ 179#define BUILD_LOCK_OPS(op, locktype) \
180void __lockfunc _##op##_lock(locktype##_t *lock) \ 180void __lockfunc _##op##_lock(locktype##_t *lock) \
181{ \ 181{ \
182 preempt_disable(); \
183 for (;;) { \ 182 for (;;) { \
183 preempt_disable(); \
184 if (likely(_raw_##op##_trylock(lock))) \ 184 if (likely(_raw_##op##_trylock(lock))) \
185 break; \ 185 break; \
186 preempt_enable(); \ 186 preempt_enable(); \
187 \
187 if (!(lock)->break_lock) \ 188 if (!(lock)->break_lock) \
188 (lock)->break_lock = 1; \ 189 (lock)->break_lock = 1; \
189 while (!op##_can_lock(lock) && (lock)->break_lock) \ 190 while (!op##_can_lock(lock) && (lock)->break_lock) \
190 cpu_relax(); \ 191 cpu_relax(); \
191 preempt_disable(); \
192 } \ 192 } \
193 (lock)->break_lock = 0; \ 193 (lock)->break_lock = 0; \
194} \ 194} \
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
199{ \ 199{ \
200 unsigned long flags; \ 200 unsigned long flags; \
201 \ 201 \
202 preempt_disable(); \
203 for (;;) { \ 202 for (;;) { \
203 preempt_disable(); \
204 local_irq_save(flags); \ 204 local_irq_save(flags); \
205 if (likely(_raw_##op##_trylock(lock))) \ 205 if (likely(_raw_##op##_trylock(lock))) \
206 break; \ 206 break; \
207 local_irq_restore(flags); \ 207 local_irq_restore(flags); \
208 \
209 preempt_enable(); \ 208 preempt_enable(); \
209 \
210 if (!(lock)->break_lock) \ 210 if (!(lock)->break_lock) \
211 (lock)->break_lock = 1; \ 211 (lock)->break_lock = 1; \
212 while (!op##_can_lock(lock) && (lock)->break_lock) \ 212 while (!op##_can_lock(lock) && (lock)->break_lock) \
213 cpu_relax(); \ 213 cpu_relax(); \
214 preempt_disable(); \
215 } \ 214 } \
216 (lock)->break_lock = 0; \ 215 (lock)->break_lock = 0; \
217 return flags; \ 216 return flags; \
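
The BUILD_LOCK_OPS change above moves preempt_disable() inside the retry loop, so the busy-wait on break_lock and op##_can_lock() now runs with preemption enabled and only the trylock attempt itself is made non-preemptible. A rough user-space model of that loop shape, with C11 atomics standing in for the raw trylock and sched_yield() for cpu_relax(); the preempt_off flag is only a stand-in for the preempt counter.

        #include <sched.h>
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static atomic_int locked;                      /* 0 = free, 1 = held       */
        static _Thread_local bool preempt_off;         /* models preempt_disable() */

        static bool raw_trylock(void)
        {
                int expected = 0;
                return atomic_compare_exchange_strong(&locked, &expected, 1);
        }

        static void lock_with_preemptible_wait(void)
        {
                for (;;) {
                        preempt_off = true;            /* preempt_disable()             */
                        if (raw_trylock())
                                return;                /* success: stay non-preemptible */
                        preempt_off = false;           /* preempt_enable()              */

                        /* The wait happens here, where we may be scheduled away. */
                        while (atomic_load(&locked))
                                sched_yield();         /* kernel: cpu_relax()           */
                }
        }

        int main(void)
        {
                lock_with_preemptible_wait();
                printf("lock taken, preempt_off=%d\n", preempt_off);
                atomic_store(&locked, 0);              /* unlock */
                return 0;
        }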
diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a5463e..c0fcad9f826c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void)
1227 struct pid *pid; 1227 struct pid *pid;
1228 int err = -EPERM; 1228 int err = -EPERM;
1229 1229
1230 down(&tty_sem); 1230 mutex_lock(&tty_mutex);
1231 write_lock_irq(&tasklist_lock); 1231 write_lock_irq(&tasklist_lock);
1232 1232
1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void)
1241 err = process_group(group_leader); 1241 err = process_group(group_leader);
1242out: 1242out:
1243 write_unlock_irq(&tasklist_lock); 1243 write_unlock_irq(&tasklist_lock);
1244 up(&tty_sem); 1244 mutex_unlock(&tty_mutex);
1245 return err; 1245 return err;
1246} 1246}
1247 1247
@@ -1677,9 +1677,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1677 * a lot simpler! (Which we're not doing right now because we're not 1677 * a lot simpler! (Which we're not doing right now because we're not
1678 * measuring them yet). 1678 * measuring them yet).
1679 * 1679 *
1680 * This expects to be called with tasklist_lock read-locked or better,
1681 * and the siglock not locked. It may momentarily take the siglock.
1682 *
1683 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1680 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1684 * races with threads incrementing their own counters. But since word 1681 * races with threads incrementing their own counters. But since word
1685 * reads are atomic, we either get new values or old values and we don't 1682 * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1684,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1687 * the c* fields from p->signal from races with exit.c updating those 1684 * the c* fields from p->signal from races with exit.c updating those
1688 * fields when reaping, so a sample either gets all the additions of a 1685 * fields when reaping, so a sample either gets all the additions of a
1689 * given child after it's reaped, or none so this sample is before reaping. 1686 * given child after it's reaped, or none so this sample is before reaping.
1687 *
1688 * tasklist_lock locking optimisation:
1689 * If we are current and single threaded, we do not need to take the tasklist
1690 * lock or the siglock. No one else can take our signal_struct away,
1691 * no one else can reap the children to update signal->c* counters, and
1692 * no one else can race with the signal-> fields.
1693 * If we do not take the tasklist_lock, the signal-> fields could be read
1694 * out of order while another thread was just exiting. So we place a
1695 * read memory barrier when we avoid the lock. On the writer side,
1696 * write memory barrier is implied in __exit_signal as __exit_signal releases
1697 * the siglock spinlock after updating the signal-> fields.
1698 *
1699 * We don't really need the siglock when we access the non c* fields
1700 * of the signal_struct (for RUSAGE_SELF) even in multithreaded
1701 * case, since we take the tasklist lock for read and the non c* signal->
1702 * fields are updated only in __exit_signal, which is called with
1703 * tasklist_lock taken for write, hence these two threads cannot execute
1704 * concurrently.
1705 *
1690 */ 1706 */
1691 1707
1692static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1708static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
@@ -1694,13 +1710,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1694 struct task_struct *t; 1710 struct task_struct *t;
1695 unsigned long flags; 1711 unsigned long flags;
1696 cputime_t utime, stime; 1712 cputime_t utime, stime;
1713 int need_lock = 0;
1697 1714
1698 memset((char *) r, 0, sizeof *r); 1715 memset((char *) r, 0, sizeof *r);
1716 utime = stime = cputime_zero;
1699 1717
1700 if (unlikely(!p->signal)) 1718 if (p != current || !thread_group_empty(p))
1701 return; 1719 need_lock = 1;
1702 1720
1703 utime = stime = cputime_zero; 1721 if (need_lock) {
1722 read_lock(&tasklist_lock);
1723 if (unlikely(!p->signal)) {
1724 read_unlock(&tasklist_lock);
1725 return;
1726 }
1727 } else
1728 /* See locking comments above */
1729 smp_rmb();
1704 1730
1705 switch (who) { 1731 switch (who) {
1706 case RUSAGE_BOTH: 1732 case RUSAGE_BOTH:
@@ -1740,6 +1766,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1740 BUG(); 1766 BUG();
1741 } 1767 }
1742 1768
1769 if (need_lock)
1770 read_unlock(&tasklist_lock);
1743 cputime_to_timeval(utime, &r->ru_utime); 1771 cputime_to_timeval(utime, &r->ru_utime);
1744 cputime_to_timeval(stime, &r->ru_stime); 1772 cputime_to_timeval(stime, &r->ru_stime);
1745} 1773}
@@ -1747,9 +1775,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1747int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1775int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1748{ 1776{
1749 struct rusage r; 1777 struct rusage r;
1750 read_lock(&tasklist_lock);
1751 k_getrusage(p, who, &r); 1778 k_getrusage(p, who, &r);
1752 read_unlock(&tasklist_lock);
1753 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1779 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1754} 1780}
1755 1781
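
The k_getrusage() rework above implements the locking comment added to sys.c: a single-threaded task sampling its own counters skips tasklist_lock entirely and issues only a read barrier, while every other case still takes the lock for read. A hedged user-space sketch of that "lock only when someone can race with us" decision; the struct and field names are made up for the example.

        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        struct accounting {
                pthread_rwlock_t lock;
                long utime, stime;
                int nr_threads;                 /* >1 means others may update the fields */
        };

        static void sample(struct accounting *acct, bool self_only, long *u, long *s)
        {
                /* Lock needed unless we sample ourselves and are single threaded. */
                bool need_lock = !self_only || acct->nr_threads > 1;

                if (need_lock)
                        pthread_rwlock_rdlock(&acct->lock);
                else
                        atomic_thread_fence(memory_order_acquire);  /* kernel: smp_rmb() */

                *u = acct->utime;
                *s = acct->stime;

                if (need_lock)
                        pthread_rwlock_unlock(&acct->lock);
        }

        int main(void)
        {
                struct accounting acct = {
                        .lock = PTHREAD_RWLOCK_INITIALIZER,
                        .utime = 120, .stime = 30, .nr_threads = 1,
                };
                long u, s;

                sample(&acct, true, &u, &s);
                printf("utime=%ld stime=%ld (lock-free fast path)\n", u, s);
                return 0;
        }

The lock-free path is safe only because, as the new comment in the hunk explains, the writer side updates those fields under the siglock and its release provides the pairing write barrier.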