| author | Anton Altaparmakov <aia21@hera.kernel.org> | 2006-03-23 12:05:47 -0500 |
|---|---|---|
| committer | Anton Altaparmakov <aia21@hera.kernel.org> | 2006-03-23 12:05:47 -0500 |
| commit | a05ba4561fa3ad8b64a27577d0d38c190f60f762 (patch) | |
| tree | 5eb7561113e006b7bad0bef50dec6821962b1b36 /kernel | |
| parent | 74293759002aa7db0179158c20676a034614577b (diff) | |
| parent | b0e6e962992b76580f4900b166a337bad7c1e81b (diff) | |
Merge branch 'master' of /home/aia21/ntfs-2.6/
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cpuset.c | 212 |
| -rw-r--r-- | kernel/exit.c | 4 |
| -rw-r--r-- | kernel/fork.c | 8 |
| -rw-r--r-- | kernel/kprobes.c | 14 |
| -rw-r--r-- | kernel/kthread.c | 7 |
| -rw-r--r-- | kernel/module.c | 53 |
| -rw-r--r-- | kernel/panic.c | 97 |
| -rw-r--r-- | kernel/posix-timers.c | 1 |
| -rw-r--r-- | kernel/power/Makefile | 2 |
| -rw-r--r-- | kernel/power/disk.c | 20 |
| -rw-r--r-- | kernel/power/main.c | 2 |
| -rw-r--r-- | kernel/power/pm.c | 21 |
| -rw-r--r-- | kernel/power/power.h | 75 |
| -rw-r--r-- | kernel/power/process.c | 61 |
| -rw-r--r-- | kernel/power/snapshot.c | 335 |
| -rw-r--r-- | kernel/power/swap.c | 544 |
| -rw-r--r-- | kernel/power/swsusp.c | 887 |
| -rw-r--r-- | kernel/power/user.c | 333 |
| -rw-r--r-- | kernel/profile.c | 11 |
| -rw-r--r-- | kernel/rcupdate.c | 14 |
| -rw-r--r-- | kernel/sched.c | 13 |
| -rw-r--r-- | kernel/signal.c | 11 |
| -rw-r--r-- | kernel/spinlock.c | 9 |
| -rw-r--r-- | kernel/sys.c | 46 |
24 files changed, 1705 insertions, 1075 deletions
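Most of the kernel/ changes pulled in here are mechanical conversions of semaphores that were only ever used as sleeping mutual-exclusion locks over to the mutex API: `#include <asm/semaphore.h>` becomes `#include <linux/mutex.h>`, `DECLARE_MUTEX()` becomes `DEFINE_MUTEX()`, and `down()`/`up()` become `mutex_lock()`/`mutex_unlock()`. Below is a minimal sketch of the pattern; the lock and function names are invented for illustration and are not taken from any of the files in this diff.

```c
#include <linux/mutex.h>	/* replaces <asm/semaphore.h> for this use */

/* Before: a counting semaphore initialised to 1, used purely as a lock.
 *
 *	static DECLARE_MUTEX(foo_sem);
 *	down(&foo_sem);
 *	...critical section...
 *	up(&foo_sem);
 *
 * After: a real mutex with the equivalent calls.
 */
static DEFINE_MUTEX(foo_mutex);		/* hypothetical lock */

static void foo_update(void)		/* hypothetical function */
{
	mutex_lock(&foo_mutex);		/* may sleep: process context only */
	/* ...modify the data that foo_mutex protects... */
	mutex_unlock(&foo_mutex);
}
```

The same substitution appears throughout cpuset.c, kprobes.c, kthread.c and module.c below; only the lock names change.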
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a05..c86ee051b734 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -53,7 +53,7 @@ | |||
| 53 | 53 | ||
| 54 | #include <asm/uaccess.h> | 54 | #include <asm/uaccess.h> |
| 55 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
| 56 | #include <asm/semaphore.h> | 56 | #include <linux/mutex.h> |
| 57 | 57 | ||
| 58 | #define CPUSET_SUPER_MAGIC 0x27e0eb | 58 | #define CPUSET_SUPER_MAGIC 0x27e0eb |
| 59 | 59 | ||
| @@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount; | |||
| 168 | static struct super_block *cpuset_sb; | 168 | static struct super_block *cpuset_sb; |
| 169 | 169 | ||
| 170 | /* | 170 | /* |
| 171 | * We have two global cpuset semaphores below. They can nest. | 171 | * We have two global cpuset mutexes below. They can nest. |
| 172 | * It is ok to first take manage_sem, then nest callback_sem. We also | 172 | * It is ok to first take manage_mutex, then nest callback_mutex. We also |
| 173 | * require taking task_lock() when dereferencing a tasks cpuset pointer. | 173 | * require taking task_lock() when dereferencing a tasks cpuset pointer. |
| 174 | * See "The task_lock() exception", at the end of this comment. | 174 | * See "The task_lock() exception", at the end of this comment. |
| 175 | * | 175 | * |
| 176 | * A task must hold both semaphores to modify cpusets. If a task | 176 | * A task must hold both mutexes to modify cpusets. If a task |
| 177 | * holds manage_sem, then it blocks others wanting that semaphore, | 177 | * holds manage_mutex, then it blocks others wanting that mutex, |
| 178 | * ensuring that it is the only task able to also acquire callback_sem | 178 | * ensuring that it is the only task able to also acquire callback_mutex |
| 179 | * and be able to modify cpusets. It can perform various checks on | 179 | * and be able to modify cpusets. It can perform various checks on |
| 180 | * the cpuset structure first, knowing nothing will change. It can | 180 | * the cpuset structure first, knowing nothing will change. It can |
| 181 | * also allocate memory while just holding manage_sem. While it is | 181 | * also allocate memory while just holding manage_mutex. While it is |
| 182 | * performing these checks, various callback routines can briefly | 182 | * performing these checks, various callback routines can briefly |
| 183 | * acquire callback_sem to query cpusets. Once it is ready to make | 183 | * acquire callback_mutex to query cpusets. Once it is ready to make |
| 184 | * the changes, it takes callback_sem, blocking everyone else. | 184 | * the changes, it takes callback_mutex, blocking everyone else. |
| 185 | * | 185 | * |
| 186 | * Calls to the kernel memory allocator can not be made while holding | 186 | * Calls to the kernel memory allocator can not be made while holding |
| 187 | * callback_sem, as that would risk double tripping on callback_sem | 187 | * callback_mutex, as that would risk double tripping on callback_mutex |
| 188 | * from one of the callbacks into the cpuset code from within | 188 | * from one of the callbacks into the cpuset code from within |
| 189 | * __alloc_pages(). | 189 | * __alloc_pages(). |
| 190 | * | 190 | * |
| 191 | * If a task is only holding callback_sem, then it has read-only | 191 | * If a task is only holding callback_mutex, then it has read-only |
| 192 | * access to cpusets. | 192 | * access to cpusets. |
| 193 | * | 193 | * |
| 194 | * The task_struct fields mems_allowed and mems_generation may only | 194 | * The task_struct fields mems_allowed and mems_generation may only |
| 195 | * be accessed in the context of that task, so require no locks. | 195 | * be accessed in the context of that task, so require no locks. |
| 196 | * | 196 | * |
| 197 | * Any task can increment and decrement the count field without lock. | 197 | * Any task can increment and decrement the count field without lock. |
| 198 | * So in general, code holding manage_sem or callback_sem can't rely | 198 | * So in general, code holding manage_mutex or callback_mutex can't rely |
| 199 | * on the count field not changing. However, if the count goes to | 199 | * on the count field not changing. However, if the count goes to |
| 200 | * zero, then only attach_task(), which holds both semaphores, can | 200 | * zero, then only attach_task(), which holds both mutexes, can |
| 201 | * increment it again. Because a count of zero means that no tasks | 201 | * increment it again. Because a count of zero means that no tasks |
| 202 | * are currently attached, therefore there is no way a task attached | 202 | * are currently attached, therefore there is no way a task attached |
| 203 | * to that cpuset can fork (the other way to increment the count). | 203 | * to that cpuset can fork (the other way to increment the count). |
| 204 | * So code holding manage_sem or callback_sem can safely assume that | 204 | * So code holding manage_mutex or callback_mutex can safely assume that |
| 205 | * if the count is zero, it will stay zero. Similarly, if a task | 205 | * if the count is zero, it will stay zero. Similarly, if a task |
| 206 | * holds manage_sem or callback_sem on a cpuset with zero count, it | 206 | * holds manage_mutex or callback_mutex on a cpuset with zero count, it |
| 207 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | 207 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs |
| 208 | * both of those semaphores. | 208 | * both of those mutexes. |
| 209 | * | ||
| 210 | * A possible optimization to improve parallelism would be to make | ||
| 211 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
| 212 | * to proceed in parallel, with read access, until the holder of | ||
| 213 | * manage_sem needed to take this rwsem for exclusive write access | ||
| 214 | * and modify some cpusets. | ||
| 215 | * | 209 | * |
| 216 | * The cpuset_common_file_write handler for operations that modify | 210 | * The cpuset_common_file_write handler for operations that modify |
| 217 | * the cpuset hierarchy holds manage_sem across the entire operation, | 211 | * the cpuset hierarchy holds manage_mutex across the entire operation, |
| 218 | * single threading all such cpuset modifications across the system. | 212 | * single threading all such cpuset modifications across the system. |
| 219 | * | 213 | * |
| 220 | * The cpuset_common_file_read() handlers only hold callback_sem across | 214 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
| 221 | * small pieces of code, such as when reading out possibly multi-word | 215 | * small pieces of code, such as when reading out possibly multi-word |
| 222 | * cpumasks and nodemasks. | 216 | * cpumasks and nodemasks. |
| 223 | * | 217 | * |
| 224 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | 218 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't |
| 225 | * (usually) take either semaphore. These are the two most performance | 219 | * (usually) take either mutex. These are the two most performance |
| 226 | * critical pieces of code here. The exception occurs on cpuset_exit(), | 220 | * critical pieces of code here. The exception occurs on cpuset_exit(), |
| 227 | * when a task in a notify_on_release cpuset exits. Then manage_sem | 221 | * when a task in a notify_on_release cpuset exits. Then manage_mutex |
| 228 | * is taken, and if the cpuset count is zero, a usermode call made | 222 | * is taken, and if the cpuset count is zero, a usermode call made |
| 229 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 223 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
| 230 | * relative to the root of cpuset file system) as the argument. | 224 | * relative to the root of cpuset file system) as the argument. |
| @@ -242,9 +236,9 @@ static struct super_block *cpuset_sb; | |||
| 242 | * | 236 | * |
| 243 | * The need for this exception arises from the action of attach_task(), | 237 | * The need for this exception arises from the action of attach_task(), |
| 244 | * which overwrites one tasks cpuset pointer with another. It does | 238 | * which overwrites one tasks cpuset pointer with another. It does |
| 245 | * so using both semaphores, however there are several performance | 239 | * so using both mutexes, however there are several performance |
| 246 | * critical places that need to reference task->cpuset without the | 240 | * critical places that need to reference task->cpuset without the |
| 247 | * expense of grabbing a system global semaphore. Therefore except as | 241 | * expense of grabbing a system global mutex. Therefore except as |
| 248 | * noted below, when dereferencing or, as in attach_task(), modifying | 242 | * noted below, when dereferencing or, as in attach_task(), modifying |
| 249 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | 243 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock |
| 250 | * (task->alloc_lock) already in the task_struct routinely used for | 244 | * (task->alloc_lock) already in the task_struct routinely used for |
| @@ -256,8 +250,8 @@ static struct super_block *cpuset_sb; | |||
| 256 | * the routine cpuset_update_task_memory_state(). | 250 | * the routine cpuset_update_task_memory_state(). |
| 257 | */ | 251 | */ |
| 258 | 252 | ||
| 259 | static DECLARE_MUTEX(manage_sem); | 253 | static DEFINE_MUTEX(manage_mutex); |
| 260 | static DECLARE_MUTEX(callback_sem); | 254 | static DEFINE_MUTEX(callback_mutex); |
| 261 | 255 | ||
| 262 | /* | 256 | /* |
| 263 | * A couple of forward declarations required, due to cyclic reference loop: | 257 | * A couple of forward declarations required, due to cyclic reference loop: |
| @@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
| 432 | } | 426 | } |
| 433 | 427 | ||
| 434 | /* | 428 | /* |
| 435 | * Call with manage_sem held. Writes path of cpuset into buf. | 429 | * Call with manage_mutex held. Writes path of cpuset into buf. |
| 436 | * Returns 0 on success, -errno on error. | 430 | * Returns 0 on success, -errno on error. |
| 437 | */ | 431 | */ |
| 438 | 432 | ||
| @@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
| 484 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 478 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
| 485 | * our caller up for that. | 479 | * our caller up for that. |
| 486 | * | 480 | * |
| 487 | * When we had only one cpuset semaphore, we had to call this | 481 | * When we had only one cpuset mutex, we had to call this |
| 488 | * without holding it, to avoid deadlock when call_usermodehelper() | 482 | * without holding it, to avoid deadlock when call_usermodehelper() |
| 489 | * allocated memory. With two locks, we could now call this while | 483 | * allocated memory. With two locks, we could now call this while |
| 490 | * holding manage_sem, but we still don't, so as to minimize | 484 | * holding manage_mutex, but we still don't, so as to minimize |
| 491 | * the time manage_sem is held. | 485 | * the time manage_mutex is held. |
| 492 | */ | 486 | */ |
| 493 | 487 | ||
| 494 | static void cpuset_release_agent(const char *pathbuf) | 488 | static void cpuset_release_agent(const char *pathbuf) |
| @@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
| 520 | * cs is notify_on_release() and now both the user count is zero and | 514 | * cs is notify_on_release() and now both the user count is zero and |
| 521 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 515 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
| 522 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 516 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
| 523 | * cpuset_release_agent() with it later on, once manage_sem is dropped. | 517 | * cpuset_release_agent() with it later on, once manage_mutex is dropped. |
| 524 | * Call here with manage_sem held. | 518 | * Call here with manage_mutex held. |
| 525 | * | 519 | * |
| 526 | * This check_for_release() routine is responsible for kmalloc'ing | 520 | * This check_for_release() routine is responsible for kmalloc'ing |
| 527 | * pathbuf. The above cpuset_release_agent() is responsible for | 521 | * pathbuf. The above cpuset_release_agent() is responsible for |
| 528 | * kfree'ing pathbuf. The caller of these routines is responsible | 522 | * kfree'ing pathbuf. The caller of these routines is responsible |
| 529 | * for providing a pathbuf pointer, initialized to NULL, then | 523 | * for providing a pathbuf pointer, initialized to NULL, then |
| 530 | * calling check_for_release() with manage_sem held and the address | 524 | * calling check_for_release() with manage_mutex held and the address |
| 531 | * of the pathbuf pointer, then dropping manage_sem, then calling | 525 | * of the pathbuf pointer, then dropping manage_mutex, then calling |
| 532 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 526 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
| 533 | */ | 527 | */ |
| 534 | 528 | ||
| @@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
| 559 | * One way or another, we guarantee to return some non-empty subset | 553 | * One way or another, we guarantee to return some non-empty subset |
| 560 | * of cpu_online_map. | 554 | * of cpu_online_map. |
| 561 | * | 555 | * |
| 562 | * Call with callback_sem held. | 556 | * Call with callback_mutex held. |
| 563 | */ | 557 | */ |
| 564 | 558 | ||
| 565 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 559 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
| @@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
| 583 | * One way or another, we guarantee to return some non-empty subset | 577 | * One way or another, we guarantee to return some non-empty subset |
| 584 | * of node_online_map. | 578 | * of node_online_map. |
| 585 | * | 579 | * |
| 586 | * Call with callback_sem held. | 580 | * Call with callback_mutex held. |
| 587 | */ | 581 | */ |
| 588 | 582 | ||
| 589 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 583 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
| @@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 608 | * current->cpuset if a task has its memory placement changed. | 602 | * current->cpuset if a task has its memory placement changed. |
| 609 | * Do not call this routine if in_interrupt(). | 603 | * Do not call this routine if in_interrupt(). |
| 610 | * | 604 | * |
| 611 | * Call without callback_sem or task_lock() held. May be called | 605 | * Call without callback_mutex or task_lock() held. May be called |
| 612 | * with or without manage_sem held. Doesn't need task_lock to guard | 606 | * with or without manage_mutex held. Doesn't need task_lock to guard |
| 613 | * against another task changing a non-NULL cpuset pointer to NULL, | 607 | * against another task changing a non-NULL cpuset pointer to NULL, |
| 614 | * as that is only done by a task on itself, and if the current task | 608 | * as that is only done by a task on itself, and if the current task |
| 615 | * is here, it is not simultaneously in the exit code NULL'ing its | 609 | * is here, it is not simultaneously in the exit code NULL'ing its |
| 616 | * cpuset pointer. This routine also might acquire callback_sem and | 610 | * cpuset pointer. This routine also might acquire callback_mutex and |
| 617 | * current->mm->mmap_sem during call. | 611 | * current->mm->mmap_sem during call. |
| 618 | * | 612 | * |
| 619 | * Reading current->cpuset->mems_generation doesn't need task_lock | 613 | * Reading current->cpuset->mems_generation doesn't need task_lock |
| @@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void) | |||
| 658 | } | 652 | } |
| 659 | 653 | ||
| 660 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | 654 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { |
| 661 | down(&callback_sem); | 655 | mutex_lock(&callback_mutex); |
| 662 | task_lock(tsk); | 656 | task_lock(tsk); |
| 663 | cs = tsk->cpuset; /* Maybe changed when task not locked */ | 657 | cs = tsk->cpuset; /* Maybe changed when task not locked */ |
| 664 | guarantee_online_mems(cs, &tsk->mems_allowed); | 658 | guarantee_online_mems(cs, &tsk->mems_allowed); |
| 665 | tsk->cpuset_mems_generation = cs->mems_generation; | 659 | tsk->cpuset_mems_generation = cs->mems_generation; |
| 666 | task_unlock(tsk); | 660 | task_unlock(tsk); |
| 667 | up(&callback_sem); | 661 | mutex_unlock(&callback_mutex); |
| 668 | mpol_rebind_task(tsk, &tsk->mems_allowed); | 662 | mpol_rebind_task(tsk, &tsk->mems_allowed); |
| 669 | } | 663 | } |
| 670 | } | 664 | } |
| @@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void) | |||
| 674 | * | 668 | * |
| 675 | * One cpuset is a subset of another if all its allowed CPUs and | 669 | * One cpuset is a subset of another if all its allowed CPUs and |
| 676 | * Memory Nodes are a subset of the other, and its exclusive flags | 670 | * Memory Nodes are a subset of the other, and its exclusive flags |
| 677 | * are only set if the other's are set. Call holding manage_sem. | 671 | * are only set if the other's are set. Call holding manage_mutex. |
| 678 | */ | 672 | */ |
| 679 | 673 | ||
| 680 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 674 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
| @@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
| 692 | * If we replaced the flag and mask values of the current cpuset | 686 | * If we replaced the flag and mask values of the current cpuset |
| 693 | * (cur) with those values in the trial cpuset (trial), would | 687 | * (cur) with those values in the trial cpuset (trial), would |
| 694 | * our various subset and exclusive rules still be valid? Presumes | 688 | * our various subset and exclusive rules still be valid? Presumes |
| 695 | * manage_sem held. | 689 | * manage_mutex held. |
| 696 | * | 690 | * |
| 697 | * 'cur' is the address of an actual, in-use cpuset. Operations | 691 | * 'cur' is the address of an actual, in-use cpuset. Operations |
| 698 | * such as list traversal that depend on the actual address of the | 692 | * such as list traversal that depend on the actual address of the |
| @@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 746 | * exclusive child cpusets | 740 | * exclusive child cpusets |
| 747 | * Build these two partitions by calling partition_sched_domains | 741 | * Build these two partitions by calling partition_sched_domains |
| 748 | * | 742 | * |
| 749 | * Call with manage_sem held. May nest a call to the | 743 | * Call with manage_mutex held. May nest a call to the |
| 750 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 744 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
| 751 | */ | 745 | */ |
| 752 | 746 | ||
| @@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur) | |||
| 792 | } | 786 | } |
| 793 | 787 | ||
| 794 | /* | 788 | /* |
| 795 | * Call with manage_sem held. May take callback_sem during call. | 789 | * Call with manage_mutex held. May take callback_mutex during call. |
| 796 | */ | 790 | */ |
| 797 | 791 | ||
| 798 | static int update_cpumask(struct cpuset *cs, char *buf) | 792 | static int update_cpumask(struct cpuset *cs, char *buf) |
| @@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 811 | if (retval < 0) | 805 | if (retval < 0) |
| 812 | return retval; | 806 | return retval; |
| 813 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 807 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
| 814 | down(&callback_sem); | 808 | mutex_lock(&callback_mutex); |
| 815 | cs->cpus_allowed = trialcs.cpus_allowed; | 809 | cs->cpus_allowed = trialcs.cpus_allowed; |
| 816 | up(&callback_sem); | 810 | mutex_unlock(&callback_mutex); |
| 817 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 811 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
| 818 | update_cpu_domains(cs); | 812 | update_cpu_domains(cs); |
| 819 | return 0; | 813 | return 0; |
| @@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 827 | * the cpuset is marked 'memory_migrate', migrate the tasks | 821 | * the cpuset is marked 'memory_migrate', migrate the tasks |
| 828 | * pages to the new memory. | 822 | * pages to the new memory. |
| 829 | * | 823 | * |
| 830 | * Call with manage_sem held. May take callback_sem during call. | 824 | * Call with manage_mutex held. May take callback_mutex during call. |
| 831 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 825 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
| 832 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 826 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
| 833 | * their mempolicies to the cpusets new mems_allowed. | 827 | * their mempolicies to the cpusets new mems_allowed. |
| @@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 862 | if (retval < 0) | 856 | if (retval < 0) |
| 863 | goto done; | 857 | goto done; |
| 864 | 858 | ||
| 865 | down(&callback_sem); | 859 | mutex_lock(&callback_mutex); |
| 866 | cs->mems_allowed = trialcs.mems_allowed; | 860 | cs->mems_allowed = trialcs.mems_allowed; |
| 867 | atomic_inc(&cpuset_mems_generation); | 861 | atomic_inc(&cpuset_mems_generation); |
| 868 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 862 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
| 869 | up(&callback_sem); | 863 | mutex_unlock(&callback_mutex); |
| 870 | 864 | ||
| 871 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ | 865 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ |
| 872 | 866 | ||
| @@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 922 | * tasklist_lock. Forks can happen again now - the mpol_copy() | 916 | * tasklist_lock. Forks can happen again now - the mpol_copy() |
| 923 | * cpuset_being_rebound check will catch such forks, and rebind | 917 | * cpuset_being_rebound check will catch such forks, and rebind |
| 924 | * their vma mempolicies too. Because we still hold the global | 918 | * their vma mempolicies too. Because we still hold the global |
| 925 | * cpuset manage_sem, we know that no other rebind effort will | 919 | * cpuset manage_mutex, we know that no other rebind effort will |
| 926 | * be contending for the global variable cpuset_being_rebound. | 920 | * be contending for the global variable cpuset_being_rebound. |
| 927 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 921 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
| 928 | * is idempotent. Also migrate pages in each mm to new nodes. | 922 | * is idempotent. Also migrate pages in each mm to new nodes. |
| @@ -948,7 +942,7 @@ done: | |||
| 948 | } | 942 | } |
| 949 | 943 | ||
| 950 | /* | 944 | /* |
| 951 | * Call with manage_sem held. | 945 | * Call with manage_mutex held. |
| 952 | */ | 946 | */ |
| 953 | 947 | ||
| 954 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | 948 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) |
| @@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
| 967 | * cs: the cpuset to update | 961 | * cs: the cpuset to update |
| 968 | * buf: the buffer where we read the 0 or 1 | 962 | * buf: the buffer where we read the 0 or 1 |
| 969 | * | 963 | * |
| 970 | * Call with manage_sem held. | 964 | * Call with manage_mutex held. |
| 971 | */ | 965 | */ |
| 972 | 966 | ||
| 973 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 967 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
| @@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 989 | return err; | 983 | return err; |
| 990 | cpu_exclusive_changed = | 984 | cpu_exclusive_changed = |
| 991 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 985 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
| 992 | down(&callback_sem); | 986 | mutex_lock(&callback_mutex); |
| 993 | if (turning_on) | 987 | if (turning_on) |
| 994 | set_bit(bit, &cs->flags); | 988 | set_bit(bit, &cs->flags); |
| 995 | else | 989 | else |
| 996 | clear_bit(bit, &cs->flags); | 990 | clear_bit(bit, &cs->flags); |
| 997 | up(&callback_sem); | 991 | mutex_unlock(&callback_mutex); |
| 998 | 992 | ||
| 999 | if (cpu_exclusive_changed) | 993 | if (cpu_exclusive_changed) |
| 1000 | update_cpu_domains(cs); | 994 | update_cpu_domains(cs); |
| @@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
| 1104 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | 1098 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be |
| 1105 | * notified on release. | 1099 | * notified on release. |
| 1106 | * | 1100 | * |
| 1107 | * Call holding manage_sem. May take callback_sem and task_lock of | 1101 | * Call holding manage_mutex. May take callback_mutex and task_lock of |
| 1108 | * the task 'pid' during call. | 1102 | * the task 'pid' during call. |
| 1109 | */ | 1103 | */ |
| 1110 | 1104 | ||
| @@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 1144 | get_task_struct(tsk); | 1138 | get_task_struct(tsk); |
| 1145 | } | 1139 | } |
| 1146 | 1140 | ||
| 1147 | down(&callback_sem); | 1141 | mutex_lock(&callback_mutex); |
| 1148 | 1142 | ||
| 1149 | task_lock(tsk); | 1143 | task_lock(tsk); |
| 1150 | oldcs = tsk->cpuset; | 1144 | oldcs = tsk->cpuset; |
| 1151 | if (!oldcs) { | 1145 | if (!oldcs) { |
| 1152 | task_unlock(tsk); | 1146 | task_unlock(tsk); |
| 1153 | up(&callback_sem); | 1147 | mutex_unlock(&callback_mutex); |
| 1154 | put_task_struct(tsk); | 1148 | put_task_struct(tsk); |
| 1155 | return -ESRCH; | 1149 | return -ESRCH; |
| 1156 | } | 1150 | } |
| @@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 1164 | from = oldcs->mems_allowed; | 1158 | from = oldcs->mems_allowed; |
| 1165 | to = cs->mems_allowed; | 1159 | to = cs->mems_allowed; |
| 1166 | 1160 | ||
| 1167 | up(&callback_sem); | 1161 | mutex_unlock(&callback_mutex); |
| 1168 | 1162 | ||
| 1169 | mm = get_task_mm(tsk); | 1163 | mm = get_task_mm(tsk); |
| 1170 | if (mm) { | 1164 | if (mm) { |
| @@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 1221 | } | 1215 | } |
| 1222 | buffer[nbytes] = 0; /* nul-terminate */ | 1216 | buffer[nbytes] = 0; /* nul-terminate */ |
| 1223 | 1217 | ||
| 1224 | down(&manage_sem); | 1218 | mutex_lock(&manage_mutex); |
| 1225 | 1219 | ||
| 1226 | if (is_removed(cs)) { | 1220 | if (is_removed(cs)) { |
| 1227 | retval = -ENODEV; | 1221 | retval = -ENODEV; |
| @@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 1264 | if (retval == 0) | 1258 | if (retval == 0) |
| 1265 | retval = nbytes; | 1259 | retval = nbytes; |
| 1266 | out2: | 1260 | out2: |
| 1267 | up(&manage_sem); | 1261 | mutex_unlock(&manage_mutex); |
| 1268 | cpuset_release_agent(pathbuf); | 1262 | cpuset_release_agent(pathbuf); |
| 1269 | out1: | 1263 | out1: |
| 1270 | kfree(buffer); | 1264 | kfree(buffer); |
| @@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
| 1304 | { | 1298 | { |
| 1305 | cpumask_t mask; | 1299 | cpumask_t mask; |
| 1306 | 1300 | ||
| 1307 | down(&callback_sem); | 1301 | mutex_lock(&callback_mutex); |
| 1308 | mask = cs->cpus_allowed; | 1302 | mask = cs->cpus_allowed; |
| 1309 | up(&callback_sem); | 1303 | mutex_unlock(&callback_mutex); |
| 1310 | 1304 | ||
| 1311 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1305 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
| 1312 | } | 1306 | } |
| @@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 1315 | { | 1309 | { |
| 1316 | nodemask_t mask; | 1310 | nodemask_t mask; |
| 1317 | 1311 | ||
| 1318 | down(&callback_sem); | 1312 | mutex_lock(&callback_mutex); |
| 1319 | mask = cs->mems_allowed; | 1313 | mask = cs->mems_allowed; |
| 1320 | up(&callback_sem); | 1314 | mutex_unlock(&callback_mutex); |
| 1321 | 1315 | ||
| 1322 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1316 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
| 1323 | } | 1317 | } |
| @@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
| 1598 | * Handle an open on 'tasks' file. Prepare a buffer listing the | 1592 | * Handle an open on 'tasks' file. Prepare a buffer listing the |
| 1599 | * process id's of tasks currently attached to the cpuset being opened. | 1593 | * process id's of tasks currently attached to the cpuset being opened. |
| 1600 | * | 1594 | * |
| 1601 | * Does not require any specific cpuset semaphores, and does not take any. | 1595 | * Does not require any specific cpuset mutexes, and does not take any. |
| 1602 | */ | 1596 | */ |
| 1603 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1597 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
| 1604 | { | 1598 | { |
| @@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
| 1754 | * name: name of the new cpuset. Will be strcpy'ed. | 1748 | * name: name of the new cpuset. Will be strcpy'ed. |
| 1755 | * mode: mode to set on new inode | 1749 | * mode: mode to set on new inode |
| 1756 | * | 1750 | * |
| 1757 | * Must be called with the semaphore on the parent inode held | 1751 | * Must be called with the mutex on the parent inode held |
| 1758 | */ | 1752 | */ |
| 1759 | 1753 | ||
| 1760 | static long cpuset_create(struct cpuset *parent, const char *name, int mode) | 1754 | static long cpuset_create(struct cpuset *parent, const char *name, int mode) |
| @@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1766 | if (!cs) | 1760 | if (!cs) |
| 1767 | return -ENOMEM; | 1761 | return -ENOMEM; |
| 1768 | 1762 | ||
| 1769 | down(&manage_sem); | 1763 | mutex_lock(&manage_mutex); |
| 1770 | cpuset_update_task_memory_state(); | 1764 | cpuset_update_task_memory_state(); |
| 1771 | cs->flags = 0; | 1765 | cs->flags = 0; |
| 1772 | if (notify_on_release(parent)) | 1766 | if (notify_on_release(parent)) |
| @@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1782 | 1776 | ||
| 1783 | cs->parent = parent; | 1777 | cs->parent = parent; |
| 1784 | 1778 | ||
| 1785 | down(&callback_sem); | 1779 | mutex_lock(&callback_mutex); |
| 1786 | list_add(&cs->sibling, &cs->parent->children); | 1780 | list_add(&cs->sibling, &cs->parent->children); |
| 1787 | number_of_cpusets++; | 1781 | number_of_cpusets++; |
| 1788 | up(&callback_sem); | 1782 | mutex_unlock(&callback_mutex); |
| 1789 | 1783 | ||
| 1790 | err = cpuset_create_dir(cs, name, mode); | 1784 | err = cpuset_create_dir(cs, name, mode); |
| 1791 | if (err < 0) | 1785 | if (err < 0) |
| 1792 | goto err; | 1786 | goto err; |
| 1793 | 1787 | ||
| 1794 | /* | 1788 | /* |
| 1795 | * Release manage_sem before cpuset_populate_dir() because it | 1789 | * Release manage_mutex before cpuset_populate_dir() because it |
| 1796 | * will down() this new directory's i_mutex and if we race with | 1790 | * will down() this new directory's i_mutex and if we race with |
| 1797 | * another mkdir, we might deadlock. | 1791 | * another mkdir, we might deadlock. |
| 1798 | */ | 1792 | */ |
| 1799 | up(&manage_sem); | 1793 | mutex_unlock(&manage_mutex); |
| 1800 | 1794 | ||
| 1801 | err = cpuset_populate_dir(cs->dentry); | 1795 | err = cpuset_populate_dir(cs->dentry); |
| 1802 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1796 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
| 1803 | return 0; | 1797 | return 0; |
| 1804 | err: | 1798 | err: |
| 1805 | list_del(&cs->sibling); | 1799 | list_del(&cs->sibling); |
| 1806 | up(&manage_sem); | 1800 | mutex_unlock(&manage_mutex); |
| 1807 | kfree(cs); | 1801 | kfree(cs); |
| 1808 | return err; | 1802 | return err; |
| 1809 | } | 1803 | } |
| @@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1825 | 1819 | ||
| 1826 | /* the vfs holds both inode->i_mutex already */ | 1820 | /* the vfs holds both inode->i_mutex already */ |
| 1827 | 1821 | ||
| 1828 | down(&manage_sem); | 1822 | mutex_lock(&manage_mutex); |
| 1829 | cpuset_update_task_memory_state(); | 1823 | cpuset_update_task_memory_state(); |
| 1830 | if (atomic_read(&cs->count) > 0) { | 1824 | if (atomic_read(&cs->count) > 0) { |
| 1831 | up(&manage_sem); | 1825 | mutex_unlock(&manage_mutex); |
| 1832 | return -EBUSY; | 1826 | return -EBUSY; |
| 1833 | } | 1827 | } |
| 1834 | if (!list_empty(&cs->children)) { | 1828 | if (!list_empty(&cs->children)) { |
| 1835 | up(&manage_sem); | 1829 | mutex_unlock(&manage_mutex); |
| 1836 | return -EBUSY; | 1830 | return -EBUSY; |
| 1837 | } | 1831 | } |
| 1838 | parent = cs->parent; | 1832 | parent = cs->parent; |
| 1839 | down(&callback_sem); | 1833 | mutex_lock(&callback_mutex); |
| 1840 | set_bit(CS_REMOVED, &cs->flags); | 1834 | set_bit(CS_REMOVED, &cs->flags); |
| 1841 | if (is_cpu_exclusive(cs)) | 1835 | if (is_cpu_exclusive(cs)) |
| 1842 | update_cpu_domains(cs); | 1836 | update_cpu_domains(cs); |
| @@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1848 | cpuset_d_remove_dir(d); | 1842 | cpuset_d_remove_dir(d); |
| 1849 | dput(d); | 1843 | dput(d); |
| 1850 | number_of_cpusets--; | 1844 | number_of_cpusets--; |
| 1851 | up(&callback_sem); | 1845 | mutex_unlock(&callback_mutex); |
| 1852 | if (list_empty(&parent->children)) | 1846 | if (list_empty(&parent->children)) |
| 1853 | check_for_release(parent, &pathbuf); | 1847 | check_for_release(parent, &pathbuf); |
| 1854 | up(&manage_sem); | 1848 | mutex_unlock(&manage_mutex); |
| 1855 | cpuset_release_agent(pathbuf); | 1849 | cpuset_release_agent(pathbuf); |
| 1856 | return 0; | 1850 | return 0; |
| 1857 | } | 1851 | } |
| @@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child) | |||
| 1960 | * Description: Detach cpuset from @tsk and release it. | 1954 | * Description: Detach cpuset from @tsk and release it. |
| 1961 | * | 1955 | * |
| 1962 | * Note that cpusets marked notify_on_release force every task in | 1956 | * Note that cpusets marked notify_on_release force every task in |
| 1963 | * them to take the global manage_sem semaphore when exiting. | 1957 | * them to take the global manage_mutex mutex when exiting. |
| 1964 | * This could impact scaling on very large systems. Be reluctant to | 1958 | * This could impact scaling on very large systems. Be reluctant to |
| 1965 | * use notify_on_release cpusets where very high task exit scaling | 1959 | * use notify_on_release cpusets where very high task exit scaling |
| 1966 | * is required on large systems. | 1960 | * is required on large systems. |
| 1967 | * | 1961 | * |
| 1968 | * Don't even think about derefencing 'cs' after the cpuset use count | 1962 | * Don't even think about derefencing 'cs' after the cpuset use count |
| 1969 | * goes to zero, except inside a critical section guarded by manage_sem | 1963 | * goes to zero, except inside a critical section guarded by manage_mutex |
| 1970 | * or callback_sem. Otherwise a zero cpuset use count is a license to | 1964 | * or callback_mutex. Otherwise a zero cpuset use count is a license to |
| 1971 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). | 1965 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
| 1972 | * | 1966 | * |
| 1973 | * This routine has to take manage_sem, not callback_sem, because | 1967 | * This routine has to take manage_mutex, not callback_mutex, because |
| 1974 | * it is holding that semaphore while calling check_for_release(), | 1968 | * it is holding that mutex while calling check_for_release(), |
| 1975 | * which calls kmalloc(), so can't be called holding callback__sem(). | 1969 | * which calls kmalloc(), so can't be called holding callback_mutex(). |
| 1976 | * | 1970 | * |
| 1977 | * We don't need to task_lock() this reference to tsk->cpuset, | 1971 | * We don't need to task_lock() this reference to tsk->cpuset, |
| 1978 | * because tsk is already marked PF_EXITING, so attach_task() won't | 1972 | * because tsk is already marked PF_EXITING, so attach_task() won't |
| @@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk) | |||
| 2022 | if (notify_on_release(cs)) { | 2016 | if (notify_on_release(cs)) { |
| 2023 | char *pathbuf = NULL; | 2017 | char *pathbuf = NULL; |
| 2024 | 2018 | ||
| 2025 | down(&manage_sem); | 2019 | mutex_lock(&manage_mutex); |
| 2026 | if (atomic_dec_and_test(&cs->count)) | 2020 | if (atomic_dec_and_test(&cs->count)) |
| 2027 | check_for_release(cs, &pathbuf); | 2021 | check_for_release(cs, &pathbuf); |
| 2028 | up(&manage_sem); | 2022 | mutex_unlock(&manage_mutex); |
| 2029 | cpuset_release_agent(pathbuf); | 2023 | cpuset_release_agent(pathbuf); |
| 2030 | } else { | 2024 | } else { |
| 2031 | atomic_dec(&cs->count); | 2025 | atomic_dec(&cs->count); |
| @@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | |||
| 2046 | { | 2040 | { |
| 2047 | cpumask_t mask; | 2041 | cpumask_t mask; |
| 2048 | 2042 | ||
| 2049 | down(&callback_sem); | 2043 | mutex_lock(&callback_mutex); |
| 2050 | task_lock(tsk); | 2044 | task_lock(tsk); |
| 2051 | guarantee_online_cpus(tsk->cpuset, &mask); | 2045 | guarantee_online_cpus(tsk->cpuset, &mask); |
| 2052 | task_unlock(tsk); | 2046 | task_unlock(tsk); |
| 2053 | up(&callback_sem); | 2047 | mutex_unlock(&callback_mutex); |
| 2054 | 2048 | ||
| 2055 | return mask; | 2049 | return mask; |
| 2056 | } | 2050 | } |
| @@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | |||
| 2074 | { | 2068 | { |
| 2075 | nodemask_t mask; | 2069 | nodemask_t mask; |
| 2076 | 2070 | ||
| 2077 | down(&callback_sem); | 2071 | mutex_lock(&callback_mutex); |
| 2078 | task_lock(tsk); | 2072 | task_lock(tsk); |
| 2079 | guarantee_online_mems(tsk->cpuset, &mask); | 2073 | guarantee_online_mems(tsk->cpuset, &mask); |
| 2080 | task_unlock(tsk); | 2074 | task_unlock(tsk); |
| 2081 | up(&callback_sem); | 2075 | mutex_unlock(&callback_mutex); |
| 2082 | 2076 | ||
| 2083 | return mask; | 2077 | return mask; |
| 2084 | } | 2078 | } |
| @@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
| 2104 | 2098 | ||
| 2105 | /* | 2099 | /* |
| 2106 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 2100 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
| 2107 | * ancestor to the specified cpuset. Call holding callback_sem. | 2101 | * ancestor to the specified cpuset. Call holding callback_mutex. |
| 2108 | * If no ancestor is mem_exclusive (an unusual configuration), then | 2102 | * If no ancestor is mem_exclusive (an unusual configuration), then |
| 2109 | * returns the root cpuset. | 2103 | * returns the root cpuset. |
| 2110 | */ | 2104 | */ |
| @@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 2131 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2125 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 2132 | * nearest mem_exclusive ancestor cpuset. | 2126 | * nearest mem_exclusive ancestor cpuset. |
| 2133 | * | 2127 | * |
| 2134 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() | 2128 | * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() |
| 2135 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 2129 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
| 2136 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 2130 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
| 2137 | * mems_allowed came up empty on the first pass over the zonelist. | 2131 | * mems_allowed came up empty on the first pass over the zonelist. |
| 2138 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 2132 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
| 2139 | * short of memory, might require taking the callback_sem semaphore. | 2133 | * short of memory, might require taking the callback_mutex mutex. |
| 2140 | * | 2134 | * |
| 2141 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 2135 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
| 2142 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 2136 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
| @@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
| 2171 | return 1; | 2165 | return 1; |
| 2172 | 2166 | ||
| 2173 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 2167 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
| 2174 | down(&callback_sem); | 2168 | mutex_lock(&callback_mutex); |
| 2175 | 2169 | ||
| 2176 | task_lock(current); | 2170 | task_lock(current); |
| 2177 | cs = nearest_exclusive_ancestor(current->cpuset); | 2171 | cs = nearest_exclusive_ancestor(current->cpuset); |
| 2178 | task_unlock(current); | 2172 | task_unlock(current); |
| 2179 | 2173 | ||
| 2180 | allowed = node_isset(node, cs->mems_allowed); | 2174 | allowed = node_isset(node, cs->mems_allowed); |
| 2181 | up(&callback_sem); | 2175 | mutex_unlock(&callback_mutex); |
| 2182 | return allowed; | 2176 | return allowed; |
| 2183 | } | 2177 | } |
| 2184 | 2178 | ||
| 2185 | /** | 2179 | /** |
| 2186 | * cpuset_lock - lock out any changes to cpuset structures | 2180 | * cpuset_lock - lock out any changes to cpuset structures |
| 2187 | * | 2181 | * |
| 2188 | * The out of memory (oom) code needs to lock down cpusets | 2182 | * The out of memory (oom) code needs to mutex_lock cpusets |
| 2189 | * from being changed while it scans the tasklist looking for a | 2183 | * from being changed while it scans the tasklist looking for a |
| 2190 | * task in an overlapping cpuset. Expose callback_sem via this | 2184 | * task in an overlapping cpuset. Expose callback_mutex via this |
| 2191 | * cpuset_lock() routine, so the oom code can lock it, before | 2185 | * cpuset_lock() routine, so the oom code can lock it, before |
| 2192 | * locking the task list. The tasklist_lock is a spinlock, so | 2186 | * locking the task list. The tasklist_lock is a spinlock, so |
| 2193 | * must be taken inside callback_sem. | 2187 | * must be taken inside callback_mutex. |
| 2194 | */ | 2188 | */ |
| 2195 | 2189 | ||
| 2196 | void cpuset_lock(void) | 2190 | void cpuset_lock(void) |
| 2197 | { | 2191 | { |
| 2198 | down(&callback_sem); | 2192 | mutex_lock(&callback_mutex); |
| 2199 | } | 2193 | } |
| 2200 | 2194 | ||
| 2201 | /** | 2195 | /** |
| @@ -2206,7 +2200,7 @@ void cpuset_lock(void) | |||
| 2206 | 2200 | ||
| 2207 | void cpuset_unlock(void) | 2201 | void cpuset_unlock(void) |
| 2208 | { | 2202 | { |
| 2209 | up(&callback_sem); | 2203 | mutex_unlock(&callback_mutex); |
| 2210 | } | 2204 | } |
| 2211 | 2205 | ||
| 2212 | /** | 2206 | /** |
| @@ -2218,7 +2212,7 @@ void cpuset_unlock(void) | |||
| 2218 | * determine if task @p's memory usage might impact the memory | 2212 | * determine if task @p's memory usage might impact the memory |
| 2219 | * available to the current task. | 2213 | * available to the current task. |
| 2220 | * | 2214 | * |
| 2221 | * Call while holding callback_sem. | 2215 | * Call while holding callback_mutex. |
| 2222 | **/ | 2216 | **/ |
| 2223 | 2217 | ||
| 2224 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 2218 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
| @@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2289 | * - Used for /proc/<pid>/cpuset. | 2283 | * - Used for /proc/<pid>/cpuset. |
| 2290 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2284 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
| 2291 | * doesn't really matter if tsk->cpuset changes after we read it, | 2285 | * doesn't really matter if tsk->cpuset changes after we read it, |
| 2292 | * and we take manage_sem, keeping attach_task() from changing it | 2286 | * and we take manage_mutex, keeping attach_task() from changing it |
| 2293 | * anyway. | 2287 | * anyway. |
| 2294 | */ | 2288 | */ |
| 2295 | 2289 | ||
| @@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 2305 | return -ENOMEM; | 2299 | return -ENOMEM; |
| 2306 | 2300 | ||
| 2307 | tsk = m->private; | 2301 | tsk = m->private; |
| 2308 | down(&manage_sem); | 2302 | mutex_lock(&manage_mutex); |
| 2309 | cs = tsk->cpuset; | 2303 | cs = tsk->cpuset; |
| 2310 | if (!cs) { | 2304 | if (!cs) { |
| 2311 | retval = -EINVAL; | 2305 | retval = -EINVAL; |
| @@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 2318 | seq_puts(m, buf); | 2312 | seq_puts(m, buf); |
| 2319 | seq_putc(m, '\n'); | 2313 | seq_putc(m, '\n'); |
| 2320 | out: | 2314 | out: |
| 2321 | up(&manage_sem); | 2315 | mutex_unlock(&manage_mutex); |
| 2322 | kfree(buf); | 2316 | kfree(buf); |
| 2323 | return retval; | 2317 | return retval; |
| 2324 | } | 2318 | } |
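The rewritten comment block in cpuset.c above spells out the locking protocol these renames preserve: manage_mutex may nest callback_mutex, writers hold manage_mutex across the whole operation and take callback_mutex only around the actual update, and readers that merely query cpusets take callback_mutex alone. The sketch below illustrates that ordering only; it is not code from the patch, and validate_new_settings() is an invented helper.

```c
/* Sketch of the manage_mutex -> callback_mutex nesting described above. */
static int cpuset_modify_sketch(struct cpuset *cs)
{
	int err;

	mutex_lock(&manage_mutex);		/* outer: serialises all modifiers */

	/* Checks and allocations go here, holding only manage_mutex;
	 * per the comment, kmalloc() must not be called with
	 * callback_mutex held. */
	err = validate_new_settings(cs);	/* invented helper */
	if (err) {
		mutex_unlock(&manage_mutex);
		return err;
	}

	mutex_lock(&callback_mutex);		/* inner: briefly blocks readers */
	/* ...publish the new cpus_allowed / mems_allowed... */
	mutex_unlock(&callback_mutex);

	mutex_unlock(&manage_mutex);
	return 0;
}
```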
diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7e1..8037405e136e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -345,9 +345,9 @@ void daemonize(const char *name, ...) | |||
| 345 | exit_mm(current); | 345 | exit_mm(current); |
| 346 | 346 | ||
| 347 | set_special_pids(1, 1); | 347 | set_special_pids(1, 1); |
| 348 | down(&tty_sem); | 348 | mutex_lock(&tty_mutex); |
| 349 | current->signal->tty = NULL; | 349 | current->signal->tty = NULL; |
| 350 | up(&tty_sem); | 350 | mutex_unlock(&tty_mutex); |
| 351 | 351 | ||
| 352 | /* Block and flush all signals */ | 352 | /* Block and flush all signals */ |
| 353 | sigfillset(&blocked); | 353 | sigfillset(&blocked); |
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee418..c79ae0b19a49 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void) | |||
| 607 | atomic_set(&newf->count, 1); | 607 | atomic_set(&newf->count, 1); |
| 608 | 608 | ||
| 609 | spin_lock_init(&newf->file_lock); | 609 | spin_lock_init(&newf->file_lock); |
| 610 | newf->next_fd = 0; | ||
| 610 | fdt = &newf->fdtab; | 611 | fdt = &newf->fdtab; |
| 611 | fdt->next_fd = 0; | ||
| 612 | fdt->max_fds = NR_OPEN_DEFAULT; | 612 | fdt->max_fds = NR_OPEN_DEFAULT; |
| 613 | fdt->max_fdset = __FD_SETSIZE; | 613 | fdt->max_fdset = EMBEDDED_FD_SET_SIZE; |
| 614 | fdt->close_on_exec = &newf->close_on_exec_init; | 614 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; |
| 615 | fdt->open_fds = &newf->open_fds_init; | 615 | fdt->open_fds = (fd_set *)&newf->open_fds_init; |
| 616 | fdt->fd = &newf->fd_array[0]; | 616 | fdt->fd = &newf->fd_array[0]; |
| 617 | INIT_RCU_HEAD(&fdt->rcu); | 617 | INIT_RCU_HEAD(&fdt->rcu); |
| 618 | fdt->free_files = NULL; | 618 | fdt->free_files = NULL; |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73ce..1fb9f753ef60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
| @@ -48,7 +48,7 @@ | |||
| 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
| 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
| 50 | 50 | ||
| 51 | DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
| 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
| 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
| 54 | 54 | ||
| @@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
| 460 | } | 460 | } |
| 461 | 461 | ||
| 462 | p->nmissed = 0; | 462 | p->nmissed = 0; |
| 463 | down(&kprobe_mutex); | 463 | mutex_lock(&kprobe_mutex); |
| 464 | old_p = get_kprobe(p->addr); | 464 | old_p = get_kprobe(p->addr); |
| 465 | if (old_p) { | 465 | if (old_p) { |
| 466 | ret = register_aggr_kprobe(old_p, p); | 466 | ret = register_aggr_kprobe(old_p, p); |
| @@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
| 477 | arch_arm_kprobe(p); | 477 | arch_arm_kprobe(p); |
| 478 | 478 | ||
| 479 | out: | 479 | out: |
| 480 | up(&kprobe_mutex); | 480 | mutex_unlock(&kprobe_mutex); |
| 481 | 481 | ||
| 482 | if (ret && probed_mod) | 482 | if (ret && probed_mod) |
| 483 | module_put(probed_mod); | 483 | module_put(probed_mod); |
| @@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
| 496 | struct kprobe *old_p, *list_p; | 496 | struct kprobe *old_p, *list_p; |
| 497 | int cleanup_p; | 497 | int cleanup_p; |
| 498 | 498 | ||
| 499 | down(&kprobe_mutex); | 499 | mutex_lock(&kprobe_mutex); |
| 500 | old_p = get_kprobe(p->addr); | 500 | old_p = get_kprobe(p->addr); |
| 501 | if (unlikely(!old_p)) { | 501 | if (unlikely(!old_p)) { |
| 502 | up(&kprobe_mutex); | 502 | mutex_unlock(&kprobe_mutex); |
| 503 | return; | 503 | return; |
| 504 | } | 504 | } |
| 505 | if (p != old_p) { | 505 | if (p != old_p) { |
| @@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
| 507 | if (list_p == p) | 507 | if (list_p == p) |
| 508 | /* kprobe p is a valid probe */ | 508 | /* kprobe p is a valid probe */ |
| 509 | goto valid_p; | 509 | goto valid_p; |
| 510 | up(&kprobe_mutex); | 510 | mutex_unlock(&kprobe_mutex); |
| 511 | return; | 511 | return; |
| 512 | } | 512 | } |
| 513 | valid_p: | 513 | valid_p: |
| @@ -523,7 +523,7 @@ valid_p: | |||
| 523 | cleanup_p = 0; | 523 | cleanup_p = 0; |
| 524 | } | 524 | } |
| 525 | 525 | ||
| 526 | up(&kprobe_mutex); | 526 | mutex_unlock(&kprobe_mutex); |
| 527 | 527 | ||
| 528 | synchronize_sched(); | 528 | synchronize_sched(); |
| 529 | if (p->mod_refcounted && | 529 | if (p->mod_refcounted && |
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a1092c..6a5373868a98 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/unistd.h> | 12 | #include <linux/unistd.h> |
| 13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/mutex.h> | ||
| 15 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
| 16 | 17 | ||
| 17 | /* | 18 | /* |
| @@ -41,7 +42,7 @@ struct kthread_stop_info | |||
| 41 | 42 | ||
| 42 | /* Thread stopping is done by setthing this var: lock serializes | 43 | /* Thread stopping is done by setthing this var: lock serializes |
| 43 | * multiple kthread_stop calls. */ | 44 | * multiple kthread_stop calls. */ |
| 44 | static DECLARE_MUTEX(kthread_stop_lock); | 45 | static DEFINE_MUTEX(kthread_stop_lock); |
| 45 | static struct kthread_stop_info kthread_stop_info; | 46 | static struct kthread_stop_info kthread_stop_info; |
| 46 | 47 | ||
| 47 | int kthread_should_stop(void) | 48 | int kthread_should_stop(void) |
| @@ -173,7 +174,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
| 173 | { | 174 | { |
| 174 | int ret; | 175 | int ret; |
| 175 | 176 | ||
| 176 | down(&kthread_stop_lock); | 177 | mutex_lock(&kthread_stop_lock); |
| 177 | 178 | ||
| 178 | /* It could exit after stop_info.k set, but before wake_up_process. */ | 179 | /* It could exit after stop_info.k set, but before wake_up_process. */ |
| 179 | get_task_struct(k); | 180 | get_task_struct(k); |
| @@ -194,7 +195,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
| 194 | wait_for_completion(&kthread_stop_info.done); | 195 | wait_for_completion(&kthread_stop_info.done); |
| 195 | kthread_stop_info.k = NULL; | 196 | kthread_stop_info.k = NULL; |
| 196 | ret = kthread_stop_info.err; | 197 | ret = kthread_stop_info.err; |
| 197 | up(&kthread_stop_lock); | 198 | mutex_unlock(&kthread_stop_lock); |
| 198 | 199 | ||
| 199 | return ret; | 200 | return ret; |
| 200 | } | 201 | } |
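kernel/module.c, whose diff follows, also converts the interruptible variant in sys_delete_module() and sys_init_module(): down_interruptible() becomes mutex_lock_interruptible(), with the caller still returning -EINTR if a signal arrives while waiting for the lock. A hedged sketch of that variant, with invented names:

```c
static DEFINE_MUTEX(bar_mutex);		/* hypothetical lock */

static long bar_operation(void)		/* hypothetical syscall-style helper */
{
	/* Returns non-zero (-EINTR) if a signal interrupts the wait,
	 * just as down_interruptible() did on the old semaphore. */
	if (mutex_lock_interruptible(&bar_mutex) != 0)
		return -EINTR;

	/* ...work done under the lock... */

	mutex_unlock(&bar_mutex);
	return 0;
}
```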
diff --git a/kernel/module.c b/kernel/module.c
index 77764f22f021..fb404299082e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/device.h> | 39 | #include <linux/device.h> |
| 40 | #include <linux/string.h> | 40 | #include <linux/string.h> |
| 41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
| 42 | #include <linux/mutex.h> | ||
| 42 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
| 43 | #include <asm/semaphore.h> | 44 | #include <asm/semaphore.h> |
| 44 | #include <asm/cacheflush.h> | 45 | #include <asm/cacheflush.h> |
| @@ -60,18 +61,18 @@ | |||
| 60 | static DEFINE_SPINLOCK(modlist_lock); | 61 | static DEFINE_SPINLOCK(modlist_lock); |
| 61 | 62 | ||
| 62 | /* List of modules, protected by module_mutex AND modlist_lock */ | 63 | /* List of modules, protected by module_mutex AND modlist_lock */ |
| 63 | static DECLARE_MUTEX(module_mutex); | 64 | static DEFINE_MUTEX(module_mutex); |
| 64 | static LIST_HEAD(modules); | 65 | static LIST_HEAD(modules); |
| 65 | 66 | ||
| 66 | static DECLARE_MUTEX(notify_mutex); | 67 | static DEFINE_MUTEX(notify_mutex); |
| 67 | static struct notifier_block * module_notify_list; | 68 | static struct notifier_block * module_notify_list; |
| 68 | 69 | ||
| 69 | int register_module_notifier(struct notifier_block * nb) | 70 | int register_module_notifier(struct notifier_block * nb) |
| 70 | { | 71 | { |
| 71 | int err; | 72 | int err; |
| 72 | down(¬ify_mutex); | 73 | mutex_lock(¬ify_mutex); |
| 73 | err = notifier_chain_register(&module_notify_list, nb); | 74 | err = notifier_chain_register(&module_notify_list, nb); |
| 74 | up(¬ify_mutex); | 75 | mutex_unlock(¬ify_mutex); |
| 75 | return err; | 76 | return err; |
| 76 | } | 77 | } |
| 77 | EXPORT_SYMBOL(register_module_notifier); | 78 | EXPORT_SYMBOL(register_module_notifier); |
| @@ -79,9 +80,9 @@ EXPORT_SYMBOL(register_module_notifier); | |||
| 79 | int unregister_module_notifier(struct notifier_block * nb) | 80 | int unregister_module_notifier(struct notifier_block * nb) |
| 80 | { | 81 | { |
| 81 | int err; | 82 | int err; |
| 82 | down(¬ify_mutex); | 83 | mutex_lock(¬ify_mutex); |
| 83 | err = notifier_chain_unregister(&module_notify_list, nb); | 84 | err = notifier_chain_unregister(&module_notify_list, nb); |
| 84 | up(¬ify_mutex); | 85 | mutex_unlock(¬ify_mutex); |
| 85 | return err; | 86 | return err; |
| 86 | } | 87 | } |
| 87 | EXPORT_SYMBOL(unregister_module_notifier); | 88 | EXPORT_SYMBOL(unregister_module_notifier); |
| @@ -601,7 +602,7 @@ static void free_module(struct module *mod); | |||
| 601 | static void wait_for_zero_refcount(struct module *mod) | 602 | static void wait_for_zero_refcount(struct module *mod) |
| 602 | { | 603 | { |
| 603 | /* Since we might sleep for some time, drop the semaphore first */ | 604 | /* Since we might sleep for some time, drop the semaphore first */ |
| 604 | up(&module_mutex); | 605 | mutex_unlock(&module_mutex); |
| 605 | for (;;) { | 606 | for (;;) { |
| 606 | DEBUGP("Looking at refcount...\n"); | 607 | DEBUGP("Looking at refcount...\n"); |
| 607 | set_current_state(TASK_UNINTERRUPTIBLE); | 608 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -610,7 +611,7 @@ static void wait_for_zero_refcount(struct module *mod) | |||
| 610 | schedule(); | 611 | schedule(); |
| 611 | } | 612 | } |
| 612 | current->state = TASK_RUNNING; | 613 | current->state = TASK_RUNNING; |
| 613 | down(&module_mutex); | 614 | mutex_lock(&module_mutex); |
| 614 | } | 615 | } |
| 615 | 616 | ||
| 616 | asmlinkage long | 617 | asmlinkage long |
| @@ -627,7 +628,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
| 627 | return -EFAULT; | 628 | return -EFAULT; |
| 628 | name[MODULE_NAME_LEN-1] = '\0'; | 629 | name[MODULE_NAME_LEN-1] = '\0'; |
| 629 | 630 | ||
| 630 | if (down_interruptible(&module_mutex) != 0) | 631 | if (mutex_lock_interruptible(&module_mutex) != 0) |
| 631 | return -EINTR; | 632 | return -EINTR; |
| 632 | 633 | ||
| 633 | mod = find_module(name); | 634 | mod = find_module(name); |
| @@ -676,14 +677,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
| 676 | 677 | ||
| 677 | /* Final destruction now no one is using it. */ | 678 | /* Final destruction now no one is using it. */ |
| 678 | if (mod->exit != NULL) { | 679 | if (mod->exit != NULL) { |
| 679 | up(&module_mutex); | 680 | mutex_unlock(&module_mutex); |
| 680 | mod->exit(); | 681 | mod->exit(); |
| 681 | down(&module_mutex); | 682 | mutex_lock(&module_mutex); |
| 682 | } | 683 | } |
| 683 | free_module(mod); | 684 | free_module(mod); |
| 684 | 685 | ||
| 685 | out: | 686 | out: |
| 686 | up(&module_mutex); | 687 | mutex_unlock(&module_mutex); |
| 687 | return ret; | 688 | return ret; |
| 688 | } | 689 | } |
| 689 | 690 | ||
| @@ -1972,13 +1973,13 @@ sys_init_module(void __user *umod, | |||
| 1972 | return -EPERM; | 1973 | return -EPERM; |
| 1973 | 1974 | ||
| 1974 | /* Only one module load at a time, please */ | 1975 | /* Only one module load at a time, please */ |
| 1975 | if (down_interruptible(&module_mutex) != 0) | 1976 | if (mutex_lock_interruptible(&module_mutex) != 0) |
| 1976 | return -EINTR; | 1977 | return -EINTR; |
| 1977 | 1978 | ||
| 1978 | /* Do all the hard work */ | 1979 | /* Do all the hard work */ |
| 1979 | mod = load_module(umod, len, uargs); | 1980 | mod = load_module(umod, len, uargs); |
| 1980 | if (IS_ERR(mod)) { | 1981 | if (IS_ERR(mod)) { |
| 1981 | up(&module_mutex); | 1982 | mutex_unlock(&module_mutex); |
| 1982 | return PTR_ERR(mod); | 1983 | return PTR_ERR(mod); |
| 1983 | } | 1984 | } |
| 1984 | 1985 | ||
| @@ -1987,11 +1988,11 @@ sys_init_module(void __user *umod, | |||
| 1987 | stop_machine_run(__link_module, mod, NR_CPUS); | 1988 | stop_machine_run(__link_module, mod, NR_CPUS); |
| 1988 | 1989 | ||
| 1989 | /* Drop lock so they can recurse */ | 1990 | /* Drop lock so they can recurse */ |
| 1990 | up(&module_mutex); | 1991 | mutex_unlock(&module_mutex); |
| 1991 | 1992 | ||
| 1992 | down(¬ify_mutex); | 1993 | mutex_lock(¬ify_mutex); |
| 1993 | notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); | 1994 | notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); |
| 1994 | up(¬ify_mutex); | 1995 | mutex_unlock(¬ify_mutex); |
| 1995 | 1996 | ||
| 1996 | /* Start the module */ | 1997 | /* Start the module */ |
| 1997 | if (mod->init != NULL) | 1998 | if (mod->init != NULL) |
| @@ -2006,15 +2007,15 @@ sys_init_module(void __user *umod, | |||
| 2006 | mod->name); | 2007 | mod->name); |
| 2007 | else { | 2008 | else { |
| 2008 | module_put(mod); | 2009 | module_put(mod); |
| 2009 | down(&module_mutex); | 2010 | mutex_lock(&module_mutex); |
| 2010 | free_module(mod); | 2011 | free_module(mod); |
| 2011 | up(&module_mutex); | 2012 | mutex_unlock(&module_mutex); |
| 2012 | } | 2013 | } |
| 2013 | return ret; | 2014 | return ret; |
| 2014 | } | 2015 | } |
| 2015 | 2016 | ||
| 2016 | /* Now it's a first class citizen! */ | 2017 | /* Now it's a first class citizen! */ |
| 2017 | down(&module_mutex); | 2018 | mutex_lock(&module_mutex); |
| 2018 | mod->state = MODULE_STATE_LIVE; | 2019 | mod->state = MODULE_STATE_LIVE; |
| 2019 | /* Drop initial reference. */ | 2020 | /* Drop initial reference. */ |
| 2020 | module_put(mod); | 2021 | module_put(mod); |
| @@ -2022,7 +2023,7 @@ sys_init_module(void __user *umod, | |||
| 2022 | mod->module_init = NULL; | 2023 | mod->module_init = NULL; |
| 2023 | mod->init_size = 0; | 2024 | mod->init_size = 0; |
| 2024 | mod->init_text_size = 0; | 2025 | mod->init_text_size = 0; |
| 2025 | up(&module_mutex); | 2026 | mutex_unlock(&module_mutex); |
| 2026 | 2027 | ||
| 2027 | return 0; | 2028 | return 0; |
| 2028 | } | 2029 | } |
| @@ -2112,7 +2113,7 @@ struct module *module_get_kallsym(unsigned int symnum, | |||
| 2112 | { | 2113 | { |
| 2113 | struct module *mod; | 2114 | struct module *mod; |
| 2114 | 2115 | ||
| 2115 | down(&module_mutex); | 2116 | mutex_lock(&module_mutex); |
| 2116 | list_for_each_entry(mod, &modules, list) { | 2117 | list_for_each_entry(mod, &modules, list) { |
| 2117 | if (symnum < mod->num_symtab) { | 2118 | if (symnum < mod->num_symtab) { |
| 2118 | *value = mod->symtab[symnum].st_value; | 2119 | *value = mod->symtab[symnum].st_value; |
| @@ -2120,12 +2121,12 @@ struct module *module_get_kallsym(unsigned int symnum, | |||
| 2120 | strncpy(namebuf, | 2121 | strncpy(namebuf, |
| 2121 | mod->strtab + mod->symtab[symnum].st_name, | 2122 | mod->strtab + mod->symtab[symnum].st_name, |
| 2122 | 127); | 2123 | 127); |
| 2123 | up(&module_mutex); | 2124 | mutex_unlock(&module_mutex); |
| 2124 | return mod; | 2125 | return mod; |
| 2125 | } | 2126 | } |
| 2126 | symnum -= mod->num_symtab; | 2127 | symnum -= mod->num_symtab; |
| 2127 | } | 2128 | } |
| 2128 | up(&module_mutex); | 2129 | mutex_unlock(&module_mutex); |
| 2129 | return NULL; | 2130 | return NULL; |
| 2130 | } | 2131 | } |
| 2131 | 2132 | ||
| @@ -2168,7 +2169,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
| 2168 | struct list_head *i; | 2169 | struct list_head *i; |
| 2169 | loff_t n = 0; | 2170 | loff_t n = 0; |
| 2170 | 2171 | ||
| 2171 | down(&module_mutex); | 2172 | mutex_lock(&module_mutex); |
| 2172 | list_for_each(i, &modules) { | 2173 | list_for_each(i, &modules) { |
| 2173 | if (n++ == *pos) | 2174 | if (n++ == *pos) |
| 2174 | break; | 2175 | break; |
| @@ -2189,7 +2190,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos) | |||
| 2189 | 2190 | ||
| 2190 | static void m_stop(struct seq_file *m, void *p) | 2191 | static void m_stop(struct seq_file *m, void *p) |
| 2191 | { | 2192 | { |
| 2192 | up(&module_mutex); | 2193 | mutex_unlock(&module_mutex); |
| 2193 | } | 2194 | } |
| 2194 | 2195 | ||
| 2195 | static int m_show(struct seq_file *m, void *p) | 2196 | static int m_show(struct seq_file *m, void *p) |
diff --git a/kernel/panic.c b/kernel/panic.c index 126dc43f1c74..acd95adddb93 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -20,10 +20,13 @@ | |||
| 20 | #include <linux/nmi.h> | 20 | #include <linux/nmi.h> |
| 21 | #include <linux/kexec.h> | 21 | #include <linux/kexec.h> |
| 22 | 22 | ||
| 23 | int panic_timeout; | ||
| 24 | int panic_on_oops; | 23 | int panic_on_oops; |
| 25 | int tainted; | 24 | int tainted; |
| 25 | static int pause_on_oops; | ||
| 26 | static int pause_on_oops_flag; | ||
| 27 | static DEFINE_SPINLOCK(pause_on_oops_lock); | ||
| 26 | 28 | ||
| 29 | int panic_timeout; | ||
| 27 | EXPORT_SYMBOL(panic_timeout); | 30 | EXPORT_SYMBOL(panic_timeout); |
| 28 | 31 | ||
| 29 | struct notifier_block *panic_notifier_list; | 32 | struct notifier_block *panic_notifier_list; |
| @@ -174,3 +177,95 @@ void add_taint(unsigned flag) | |||
| 174 | tainted |= flag; | 177 | tainted |= flag; |
| 175 | } | 178 | } |
| 176 | EXPORT_SYMBOL(add_taint); | 179 | EXPORT_SYMBOL(add_taint); |
| 180 | |||
| 181 | static int __init pause_on_oops_setup(char *str) | ||
| 182 | { | ||
| 183 | pause_on_oops = simple_strtoul(str, NULL, 0); | ||
| 184 | return 1; | ||
| 185 | } | ||
| 186 | __setup("pause_on_oops=", pause_on_oops_setup); | ||
| 187 | |||
| 188 | static void spin_msec(int msecs) | ||
| 189 | { | ||
| 190 | int i; | ||
| 191 | |||
| 192 | for (i = 0; i < msecs; i++) { | ||
| 193 | touch_nmi_watchdog(); | ||
| 194 | mdelay(1); | ||
| 195 | } | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * It just happens that oops_enter() and oops_exit() are identically | ||
| 200 | * implemented... | ||
| 201 | */ | ||
| 202 | static void do_oops_enter_exit(void) | ||
| 203 | { | ||
| 204 | unsigned long flags; | ||
| 205 | static int spin_counter; | ||
| 206 | |||
| 207 | if (!pause_on_oops) | ||
| 208 | return; | ||
| 209 | |||
| 210 | spin_lock_irqsave(&pause_on_oops_lock, flags); | ||
| 211 | if (pause_on_oops_flag == 0) { | ||
| 212 | /* This CPU may now print the oops message */ | ||
| 213 | pause_on_oops_flag = 1; | ||
| 214 | } else { | ||
| 215 | /* We need to stall this CPU */ | ||
| 216 | if (!spin_counter) { | ||
| 217 | /* This CPU gets to do the counting */ | ||
| 218 | spin_counter = pause_on_oops; | ||
| 219 | do { | ||
| 220 | spin_unlock(&pause_on_oops_lock); | ||
| 221 | spin_msec(MSEC_PER_SEC); | ||
| 222 | spin_lock(&pause_on_oops_lock); | ||
| 223 | } while (--spin_counter); | ||
| 224 | pause_on_oops_flag = 0; | ||
| 225 | } else { | ||
| 226 | /* This CPU waits for a different one */ | ||
| 227 | while (spin_counter) { | ||
| 228 | spin_unlock(&pause_on_oops_lock); | ||
| 229 | spin_msec(1); | ||
| 230 | spin_lock(&pause_on_oops_lock); | ||
| 231 | } | ||
| 232 | } | ||
| 233 | } | ||
| 234 | spin_unlock_irqrestore(&pause_on_oops_lock, flags); | ||
| 235 | } | ||
| 236 | |||
| 237 | /* | ||
| 238 | * Return true if the calling CPU is allowed to print oops-related info. This | ||
| 239 | * is a bit racy.. | ||
| 240 | */ | ||
| 241 | int oops_may_print(void) | ||
| 242 | { | ||
| 243 | return pause_on_oops_flag == 0; | ||
| 244 | } | ||
| 245 | |||
| 246 | /* | ||
| 247 | * Called when the architecture enters its oops handler, before it prints | ||
| 248 | * anything. If this is the first CPU to oops, and it's oopsing the first time | ||
| 249 | * then let it proceed. | ||
| 250 | * | ||
| 251 | * This is all enabled by the pause_on_oops kernel boot option. We do all this | ||
| 252 | * to ensure that oopses don't scroll off the screen. It has the side-effect | ||
| 253 | * of preventing later-oopsing CPUs from mucking up the display, too. | ||
| 254 | * | ||
| 255 | * It turns out that the CPU which is allowed to print ends up pausing for the | ||
| 256 | * right duration, whereas all the other CPUs pause for twice as long: once in | ||
| 257 | * oops_enter(), once in oops_exit(). | ||
| 258 | */ | ||
| 259 | void oops_enter(void) | ||
| 260 | { | ||
| 261 | do_oops_enter_exit(); | ||
| 262 | } | ||
| 263 | |||
| 264 | /* | ||
| 265 | * Called when the architecture exits its oops handler, after printing | ||
| 266 | * everything. | ||
| 267 | */ | ||
| 268 | void oops_exit(void) | ||
| 269 | { | ||
| 270 | do_oops_enter_exit(); | ||
| 271 | } | ||
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fa895fc2ecf5..9944379360b5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/interrupt.h> | 35 | #include <linux/interrupt.h> |
| 36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
| 37 | #include <linux/time.h> | 37 | #include <linux/time.h> |
| 38 | #include <linux/mutex.h> | ||
| 38 | 39 | ||
| 39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
| 40 | #include <asm/semaphore.h> | 41 | #include <asm/semaphore.h> |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 04be7d0d96a7..8d0af3d37a4b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -5,7 +5,7 @@ endif | |||
| 5 | 5 | ||
| 6 | obj-y := main.o process.o console.o | 6 | obj-y := main.o process.o console.o |
| 7 | obj-$(CONFIG_PM_LEGACY) += pm.o | 7 | obj-$(CONFIG_PM_LEGACY) += pm.o |
| 8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o | 8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o |
| 9 | 9 | ||
| 10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | 10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o |
| 11 | 11 | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0b43847dc980..81d4d982f3f0 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -22,17 +22,6 @@ | |||
| 22 | #include "power.h" | 22 | #include "power.h" |
| 23 | 23 | ||
| 24 | 24 | ||
| 25 | extern suspend_disk_method_t pm_disk_mode; | ||
| 26 | |||
| 27 | extern int swsusp_shrink_memory(void); | ||
| 28 | extern int swsusp_suspend(void); | ||
| 29 | extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); | ||
| 30 | extern int swsusp_check(void); | ||
| 31 | extern int swsusp_read(struct pbe **pblist_ptr); | ||
| 32 | extern void swsusp_close(void); | ||
| 33 | extern int swsusp_resume(void); | ||
| 34 | |||
| 35 | |||
| 36 | static int noresume = 0; | 25 | static int noresume = 0; |
| 37 | char resume_file[256] = CONFIG_PM_STD_PARTITION; | 26 | char resume_file[256] = CONFIG_PM_STD_PARTITION; |
| 38 | dev_t swsusp_resume_device; | 27 | dev_t swsusp_resume_device; |
| @@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode) | |||
| 70 | while(1); | 59 | while(1); |
| 71 | } | 60 | } |
| 72 | 61 | ||
| 73 | |||
| 74 | static int in_suspend __nosavedata = 0; | ||
| 75 | |||
| 76 | |||
| 77 | static inline void platform_finish(void) | 62 | static inline void platform_finish(void) |
| 78 | { | 63 | { |
| 79 | if (pm_disk_mode == PM_DISK_PLATFORM) { | 64 | if (pm_disk_mode == PM_DISK_PLATFORM) { |
| @@ -87,7 +72,6 @@ static int prepare_processes(void) | |||
| 87 | int error; | 72 | int error; |
| 88 | 73 | ||
| 89 | pm_prepare_console(); | 74 | pm_prepare_console(); |
| 90 | sys_sync(); | ||
| 91 | disable_nonboot_cpus(); | 75 | disable_nonboot_cpus(); |
| 92 | 76 | ||
| 93 | if (freeze_processes()) { | 77 | if (freeze_processes()) { |
| @@ -145,7 +129,7 @@ int pm_suspend_disk(void) | |||
| 145 | if (in_suspend) { | 129 | if (in_suspend) { |
| 146 | device_resume(); | 130 | device_resume(); |
| 147 | pr_debug("PM: writing image.\n"); | 131 | pr_debug("PM: writing image.\n"); |
| 148 | error = swsusp_write(pagedir_nosave, nr_copy_pages); | 132 | error = swsusp_write(); |
| 149 | if (!error) | 133 | if (!error) |
| 150 | power_down(pm_disk_mode); | 134 | power_down(pm_disk_mode); |
| 151 | else { | 135 | else { |
| @@ -216,7 +200,7 @@ static int software_resume(void) | |||
| 216 | 200 | ||
| 217 | pr_debug("PM: Reading swsusp image.\n"); | 201 | pr_debug("PM: Reading swsusp image.\n"); |
| 218 | 202 | ||
| 219 | if ((error = swsusp_read(&pagedir_nosave))) { | 203 | if ((error = swsusp_read())) { |
| 220 | swsusp_free(); | 204 | swsusp_free(); |
| 221 | goto Thaw; | 205 | goto Thaw; |
| 222 | } | 206 | } |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 9cb235cba4a9..ee371f50ccaa 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state) | |||
| 103 | } | 103 | } |
| 104 | 104 | ||
| 105 | 105 | ||
| 106 | static int suspend_enter(suspend_state_t state) | 106 | int suspend_enter(suspend_state_t state) |
| 107 | { | 107 | { |
| 108 | int error = 0; | 108 | int error = 0; |
| 109 | unsigned long flags; | 109 | unsigned long flags; |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 33c508e857dd..0f6908cce1dd 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
| 26 | #include <linux/pm_legacy.h> | 26 | #include <linux/pm_legacy.h> |
| 27 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
| 28 | #include <linux/mutex.h> | ||
| 28 | 29 | ||
| 29 | int pm_active; | 30 | int pm_active; |
| 30 | 31 | ||
| @@ -40,7 +41,7 @@ int pm_active; | |||
| 40 | * until a resume but that will be fine. | 41 | * until a resume but that will be fine. |
| 41 | */ | 42 | */ |
| 42 | 43 | ||
| 43 | static DECLARE_MUTEX(pm_devs_lock); | 44 | static DEFINE_MUTEX(pm_devs_lock); |
| 44 | static LIST_HEAD(pm_devs); | 45 | static LIST_HEAD(pm_devs); |
| 45 | 46 | ||
| 46 | /** | 47 | /** |
| @@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
| 67 | dev->id = id; | 68 | dev->id = id; |
| 68 | dev->callback = callback; | 69 | dev->callback = callback; |
| 69 | 70 | ||
| 70 | down(&pm_devs_lock); | 71 | mutex_lock(&pm_devs_lock); |
| 71 | list_add(&dev->entry, &pm_devs); | 72 | list_add(&dev->entry, &pm_devs); |
| 72 | up(&pm_devs_lock); | 73 | mutex_unlock(&pm_devs_lock); |
| 73 | } | 74 | } |
| 74 | return dev; | 75 | return dev; |
| 75 | } | 76 | } |
| @@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
| 85 | void pm_unregister(struct pm_dev *dev) | 86 | void pm_unregister(struct pm_dev *dev) |
| 86 | { | 87 | { |
| 87 | if (dev) { | 88 | if (dev) { |
| 88 | down(&pm_devs_lock); | 89 | mutex_lock(&pm_devs_lock); |
| 89 | list_del(&dev->entry); | 90 | list_del(&dev->entry); |
| 90 | up(&pm_devs_lock); | 91 | mutex_unlock(&pm_devs_lock); |
| 91 | 92 | ||
| 92 | kfree(dev); | 93 | kfree(dev); |
| 93 | } | 94 | } |
| @@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback) | |||
| 118 | if (!callback) | 119 | if (!callback) |
| 119 | return; | 120 | return; |
| 120 | 121 | ||
| 121 | down(&pm_devs_lock); | 122 | mutex_lock(&pm_devs_lock); |
| 122 | entry = pm_devs.next; | 123 | entry = pm_devs.next; |
| 123 | while (entry != &pm_devs) { | 124 | while (entry != &pm_devs) { |
| 124 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | 125 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); |
| @@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback) | |||
| 126 | if (dev->callback == callback) | 127 | if (dev->callback == callback) |
| 127 | __pm_unregister(dev); | 128 | __pm_unregister(dev); |
| 128 | } | 129 | } |
| 129 | up(&pm_devs_lock); | 130 | mutex_unlock(&pm_devs_lock); |
| 130 | } | 131 | } |
| 131 | 132 | ||
| 132 | /** | 133 | /** |
| @@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
| 234 | { | 235 | { |
| 235 | struct list_head *entry; | 236 | struct list_head *entry; |
| 236 | 237 | ||
| 237 | down(&pm_devs_lock); | 238 | mutex_lock(&pm_devs_lock); |
| 238 | entry = pm_devs.next; | 239 | entry = pm_devs.next; |
| 239 | while (entry != &pm_devs) { | 240 | while (entry != &pm_devs) { |
| 240 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | 241 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); |
| @@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
| 246 | */ | 247 | */ |
| 247 | if (rqst == PM_SUSPEND) | 248 | if (rqst == PM_SUSPEND) |
| 248 | pm_undo_all(dev); | 249 | pm_undo_all(dev); |
| 249 | up(&pm_devs_lock); | 250 | mutex_unlock(&pm_devs_lock); |
| 250 | return status; | 251 | return status; |
| 251 | } | 252 | } |
| 252 | } | 253 | } |
| 253 | entry = entry->next; | 254 | entry = entry->next; |
| 254 | } | 255 | } |
| 255 | up(&pm_devs_lock); | 256 | mutex_unlock(&pm_devs_lock); |
| 256 | return 0; | 257 | return 0; |
| 257 | } | 258 | } |
| 258 | 259 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index 388dba680841..f06f12f21767 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -8,6 +8,7 @@ struct swsusp_info { | |||
| 8 | int cpus; | 8 | int cpus; |
| 9 | unsigned long image_pages; | 9 | unsigned long image_pages; |
| 10 | unsigned long pages; | 10 | unsigned long pages; |
| 11 | unsigned long size; | ||
| 11 | } __attribute__((aligned(PAGE_SIZE))); | 12 | } __attribute__((aligned(PAGE_SIZE))); |
| 12 | 13 | ||
| 13 | 14 | ||
| @@ -37,21 +38,79 @@ extern struct subsystem power_subsys; | |||
| 37 | /* References to section boundaries */ | 38 | /* References to section boundaries */ |
| 38 | extern const void __nosave_begin, __nosave_end; | 39 | extern const void __nosave_begin, __nosave_end; |
| 39 | 40 | ||
| 40 | extern unsigned int nr_copy_pages; | ||
| 41 | extern struct pbe *pagedir_nosave; | 41 | extern struct pbe *pagedir_nosave; |
| 42 | 42 | ||
| 43 | /* Preferred image size in bytes (default 500 MB) */ | 43 | /* Preferred image size in bytes (default 500 MB) */ |
| 44 | extern unsigned long image_size; | 44 | extern unsigned long image_size; |
| 45 | extern int in_suspend; | ||
| 46 | extern dev_t swsusp_resume_device; | ||
| 45 | 47 | ||
| 46 | extern asmlinkage int swsusp_arch_suspend(void); | 48 | extern asmlinkage int swsusp_arch_suspend(void); |
| 47 | extern asmlinkage int swsusp_arch_resume(void); | 49 | extern asmlinkage int swsusp_arch_resume(void); |
| 48 | 50 | ||
| 49 | extern unsigned int count_data_pages(void); | 51 | extern unsigned int count_data_pages(void); |
| 50 | extern void free_pagedir(struct pbe *pblist); | 52 | |
| 51 | extern void release_eaten_pages(void); | 53 | struct snapshot_handle { |
| 52 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); | 54 | loff_t offset; |
| 55 | unsigned int page; | ||
| 56 | unsigned int page_offset; | ||
| 57 | unsigned int prev; | ||
| 58 | struct pbe *pbe; | ||
| 59 | void *buffer; | ||
| 60 | unsigned int buf_offset; | ||
| 61 | }; | ||
| 62 | |||
| 63 | #define data_of(handle) ((handle).buffer + (handle).buf_offset) | ||
| 64 | |||
| 65 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); | ||
| 66 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); | ||
| 67 | int snapshot_image_loaded(struct snapshot_handle *handle); | ||
| 68 | |||
| 69 | #define SNAPSHOT_IOC_MAGIC '3' | ||
| 70 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) | ||
| 71 | #define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) | ||
| 72 | #define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) | ||
| 73 | #define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4) | ||
| 74 | #define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5) | ||
| 75 | #define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) | ||
| 76 | #define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) | ||
| 77 | #define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) | ||
| 78 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) | ||
| 79 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | ||
| 80 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) | ||
| 81 | #define SNAPSHOT_IOC_MAXNR 11 | ||
| 82 | |||
| 83 | /** | ||
| 84 | * The bitmap is used for tracing allocated swap pages | ||
| 85 | * | ||
| 86 | * The entire bitmap consists of a number of bitmap_page | ||
| 87 | * structures linked with the help of the .next member. | ||
| 88 | * Thus each page can be allocated individually, so we only | ||
| 89 | * need to make 0-order memory allocations to create | ||
| 90 | * the bitmap. | ||
| 91 | */ | ||
| 92 | |||
| 93 | #define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) | ||
| 94 | #define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) | ||
| 95 | #define BITS_PER_CHUNK (sizeof(long) * 8) | ||
| 96 | #define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) | ||
| 97 | |||
| 98 | struct bitmap_page { | ||
| 99 | unsigned long chunks[BITMAP_PAGE_CHUNKS]; | ||
| 100 | struct bitmap_page *next; | ||
| 101 | }; | ||
| 102 | |||
| 103 | extern void free_bitmap(struct bitmap_page *bitmap); | ||
| 104 | extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); | ||
| 105 | extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); | ||
| 106 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); | ||
| 107 | |||
| 108 | extern int swsusp_check(void); | ||
| 109 | extern int swsusp_shrink_memory(void); | ||
| 53 | extern void swsusp_free(void); | 110 | extern void swsusp_free(void); |
| 54 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); | 111 | extern int swsusp_suspend(void); |
| 55 | extern unsigned int snapshot_nr_pages(void); | 112 | extern int swsusp_resume(void); |
| 56 | extern struct pbe *snapshot_pblist(void); | 113 | extern int swsusp_read(void); |
| 57 | extern void snapshot_pblist_set(struct pbe *pblist); | 114 | extern int swsusp_write(void); |
| 115 | extern void swsusp_close(void); | ||
| 116 | extern int suspend_enter(suspend_state_t state); | ||
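Note: the bitmap_page geometry declared above is easiest to see with concrete numbers. Assuming, for illustration, 4 KB pages on a 64-bit machine:

    BITMAP_PAGE_SIZE   = PAGE_SIZE - sizeof(void *) = 4096 - 8 = 4088 bytes
    BITMAP_PAGE_CHUNKS = 4088 / sizeof(long)        = 4088 / 8 = 511
    BITS_PER_CHUNK     = 8 * sizeof(long)           = 64
    BITMAP_PAGE_BITS   = 511 * 64                   = 32704

So a single 0-order bitmap page tracks 32704 allocated swap pages (roughly 128 MB of image data), and larger images simply chain additional bitmap_page structures through the .next member.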
diff --git a/kernel/power/process.c b/kernel/power/process.c index 28de118f7a0b..8ac7c35fad77 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -12,11 +12,12 @@ | |||
| 12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
| 13 | #include <linux/suspend.h> | 13 | #include <linux/suspend.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/syscalls.h> | ||
| 15 | 16 | ||
| 16 | /* | 17 | /* |
| 17 | * Timeout for stopping processes | 18 | * Timeout for stopping processes |
| 18 | */ | 19 | */ |
| 19 | #define TIMEOUT (6 * HZ) | 20 | #define TIMEOUT (20 * HZ) |
| 20 | 21 | ||
| 21 | 22 | ||
| 22 | static inline int freezeable(struct task_struct * p) | 23 | static inline int freezeable(struct task_struct * p) |
| @@ -54,38 +55,62 @@ void refrigerator(void) | |||
| 54 | current->state = save; | 55 | current->state = save; |
| 55 | } | 56 | } |
| 56 | 57 | ||
| 58 | static inline void freeze_process(struct task_struct *p) | ||
| 59 | { | ||
| 60 | unsigned long flags; | ||
| 61 | |||
| 62 | if (!freezing(p)) { | ||
| 63 | freeze(p); | ||
| 64 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 65 | signal_wake_up(p, 0); | ||
| 66 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 57 | /* 0 = success, else # of processes that we failed to stop */ | 70 | /* 0 = success, else # of processes that we failed to stop */ |
| 58 | int freeze_processes(void) | 71 | int freeze_processes(void) |
| 59 | { | 72 | { |
| 60 | int todo; | 73 | int todo, nr_user, user_frozen; |
| 61 | unsigned long start_time; | 74 | unsigned long start_time; |
| 62 | struct task_struct *g, *p; | 75 | struct task_struct *g, *p; |
| 63 | unsigned long flags; | 76 | unsigned long flags; |
| 64 | 77 | ||
| 65 | printk( "Stopping tasks: " ); | 78 | printk( "Stopping tasks: " ); |
| 66 | start_time = jiffies; | 79 | start_time = jiffies; |
| 80 | user_frozen = 0; | ||
| 67 | do { | 81 | do { |
| 68 | todo = 0; | 82 | nr_user = todo = 0; |
| 69 | read_lock(&tasklist_lock); | 83 | read_lock(&tasklist_lock); |
| 70 | do_each_thread(g, p) { | 84 | do_each_thread(g, p) { |
| 71 | if (!freezeable(p)) | 85 | if (!freezeable(p)) |
| 72 | continue; | 86 | continue; |
| 73 | if (frozen(p)) | 87 | if (frozen(p)) |
| 74 | continue; | 88 | continue; |
| 75 | 89 | if (p->mm && !(p->flags & PF_BORROWED_MM)) { | |
| 76 | freeze(p); | 90 | /* The task is a user-space one. |
| 77 | spin_lock_irqsave(&p->sighand->siglock, flags); | 91 | * Freeze it unless there's a vfork completion |
| 78 | signal_wake_up(p, 0); | 92 | * pending |
| 79 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 93 | */ |
| 80 | todo++; | 94 | if (!p->vfork_done) |
| 95 | freeze_process(p); | ||
| 96 | nr_user++; | ||
| 97 | } else { | ||
| 98 | /* Freeze only if the user space is frozen */ | ||
| 99 | if (user_frozen) | ||
| 100 | freeze_process(p); | ||
| 101 | todo++; | ||
| 102 | } | ||
| 81 | } while_each_thread(g, p); | 103 | } while_each_thread(g, p); |
| 82 | read_unlock(&tasklist_lock); | 104 | read_unlock(&tasklist_lock); |
| 105 | todo += nr_user; | ||
| 106 | if (!user_frozen && !nr_user) { | ||
| 107 | sys_sync(); | ||
| 108 | start_time = jiffies; | ||
| 109 | } | ||
| 110 | user_frozen = !nr_user; | ||
| 83 | yield(); /* Yield is okay here */ | 111 | yield(); /* Yield is okay here */ |
| 84 | if (todo && time_after(jiffies, start_time + TIMEOUT)) { | 112 | if (todo && time_after(jiffies, start_time + TIMEOUT)) |
| 85 | printk( "\n" ); | ||
| 86 | printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); | ||
| 87 | break; | 113 | break; |
| 88 | } | ||
| 89 | } while(todo); | 114 | } while(todo); |
| 90 | 115 | ||
| 91 | /* This does not unfreeze processes that are already frozen | 116 | /* This does not unfreeze processes that are already frozen |
| @@ -94,8 +119,14 @@ int freeze_processes(void) | |||
| 94 | * but it cleans up leftover PF_FREEZE requests. | 119 | * but it cleans up leftover PF_FREEZE requests. |
| 95 | */ | 120 | */ |
| 96 | if (todo) { | 121 | if (todo) { |
| 122 | printk( "\n" ); | ||
| 123 | printk(KERN_ERR " stopping tasks timed out " | ||
| 124 | "after %d seconds (%d tasks remaining):\n", | ||
| 125 | TIMEOUT / HZ, todo); | ||
| 97 | read_lock(&tasklist_lock); | 126 | read_lock(&tasklist_lock); |
| 98 | do_each_thread(g, p) | 127 | do_each_thread(g, p) { |
| 128 | if (freezeable(p) && !frozen(p)) | ||
| 129 | printk(KERN_ERR " %s\n", p->comm); | ||
| 99 | if (freezing(p)) { | 130 | if (freezing(p)) { |
| 100 | pr_debug(" clean up: %s\n", p->comm); | 131 | pr_debug(" clean up: %s\n", p->comm); |
| 101 | p->flags &= ~PF_FREEZE; | 132 | p->flags &= ~PF_FREEZE; |
| @@ -103,7 +134,7 @@ int freeze_processes(void) | |||
| 103 | recalc_sigpending_tsk(p); | 134 | recalc_sigpending_tsk(p); |
| 104 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 135 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| 105 | } | 136 | } |
| 106 | while_each_thread(g, p); | 137 | } while_each_thread(g, p); |
| 107 | read_unlock(&tasklist_lock); | 138 | read_unlock(&tasklist_lock); |
| 108 | return todo; | 139 | return todo; |
| 109 | } | 140 | } |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 8d5a5986d621..c5863d02c89e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | #include <linux/version.h> | ||
| 13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 14 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| 15 | #include <linux/suspend.h> | 16 | #include <linux/suspend.h> |
| @@ -34,7 +35,9 @@ | |||
| 34 | #include "power.h" | 35 | #include "power.h" |
| 35 | 36 | ||
| 36 | struct pbe *pagedir_nosave; | 37 | struct pbe *pagedir_nosave; |
| 37 | unsigned int nr_copy_pages; | 38 | static unsigned int nr_copy_pages; |
| 39 | static unsigned int nr_meta_pages; | ||
| 40 | static unsigned long *buffer; | ||
| 38 | 41 | ||
| 39 | #ifdef CONFIG_HIGHMEM | 42 | #ifdef CONFIG_HIGHMEM |
| 40 | unsigned int count_highmem_pages(void) | 43 | unsigned int count_highmem_pages(void) |
| @@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone) | |||
| 80 | void *kaddr; | 83 | void *kaddr; |
| 81 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | 84 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; |
| 82 | 85 | ||
| 83 | if (!(pfn%1000)) | 86 | if (!(pfn%10000)) |
| 84 | printk("."); | 87 | printk("."); |
| 85 | if (!pfn_valid(pfn)) | 88 | if (!pfn_valid(pfn)) |
| 86 | continue; | 89 | continue; |
| @@ -119,13 +122,15 @@ int save_highmem(void) | |||
| 119 | struct zone *zone; | 122 | struct zone *zone; |
| 120 | int res = 0; | 123 | int res = 0; |
| 121 | 124 | ||
| 122 | pr_debug("swsusp: Saving Highmem\n"); | 125 | pr_debug("swsusp: Saving Highmem"); |
| 126 | drain_local_pages(); | ||
| 123 | for_each_zone (zone) { | 127 | for_each_zone (zone) { |
| 124 | if (is_highmem(zone)) | 128 | if (is_highmem(zone)) |
| 125 | res = save_highmem_zone(zone); | 129 | res = save_highmem_zone(zone); |
| 126 | if (res) | 130 | if (res) |
| 127 | return res; | 131 | return res; |
| 128 | } | 132 | } |
| 133 | printk("\n"); | ||
| 129 | return 0; | 134 | return 0; |
| 130 | } | 135 | } |
| 131 | 136 | ||
| @@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist) | |||
| 235 | * free_pagedir - free pages allocated with alloc_pagedir() | 240 | * free_pagedir - free pages allocated with alloc_pagedir() |
| 236 | */ | 241 | */ |
| 237 | 242 | ||
| 238 | void free_pagedir(struct pbe *pblist) | 243 | static void free_pagedir(struct pbe *pblist) |
| 239 | { | 244 | { |
| 240 | struct pbe *pbe; | 245 | struct pbe *pbe; |
| 241 | 246 | ||
| @@ -301,7 +306,7 @@ struct eaten_page { | |||
| 301 | 306 | ||
| 302 | static struct eaten_page *eaten_pages = NULL; | 307 | static struct eaten_page *eaten_pages = NULL; |
| 303 | 308 | ||
| 304 | void release_eaten_pages(void) | 309 | static void release_eaten_pages(void) |
| 305 | { | 310 | { |
| 306 | struct eaten_page *p, *q; | 311 | struct eaten_page *p, *q; |
| 307 | 312 | ||
| @@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed | |||
| 376 | if (!nr_pages) | 381 | if (!nr_pages) |
| 377 | return NULL; | 382 | return NULL; |
| 378 | 383 | ||
| 379 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
| 380 | pblist = alloc_image_page(gfp_mask, safe_needed); | 384 | pblist = alloc_image_page(gfp_mask, safe_needed); |
| 381 | /* FIXME: rewrite this ugly loop */ | 385 | /* FIXME: rewrite this ugly loop */ |
| 382 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | 386 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; |
| @@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed | |||
| 388 | free_pagedir(pblist); | 392 | free_pagedir(pblist); |
| 389 | pblist = NULL; | 393 | pblist = NULL; |
| 390 | } else | 394 | } else |
| 391 | create_pbe_list(pblist, nr_pages); | 395 | create_pbe_list(pblist, nr_pages); |
| 392 | return pblist; | 396 | return pblist; |
| 393 | } | 397 | } |
| 394 | 398 | ||
| @@ -414,6 +418,10 @@ void swsusp_free(void) | |||
| 414 | } | 418 | } |
| 415 | } | 419 | } |
| 416 | } | 420 | } |
| 421 | nr_copy_pages = 0; | ||
| 422 | nr_meta_pages = 0; | ||
| 423 | pagedir_nosave = NULL; | ||
| 424 | buffer = NULL; | ||
| 417 | } | 425 | } |
| 418 | 426 | ||
| 419 | 427 | ||
| @@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages) | |||
| 437 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | 445 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
| 438 | } | 446 | } |
| 439 | 447 | ||
| 440 | int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) | 448 | static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) |
| 441 | { | 449 | { |
| 442 | struct pbe *p; | 450 | struct pbe *p; |
| 443 | 451 | ||
| @@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void) | |||
| 504 | */ | 512 | */ |
| 505 | 513 | ||
| 506 | nr_copy_pages = nr_pages; | 514 | nr_copy_pages = nr_pages; |
| 515 | nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 507 | 516 | ||
| 508 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | 517 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); |
| 509 | return 0; | 518 | return 0; |
| 510 | } | 519 | } |
| 520 | |||
| 521 | static void init_header(struct swsusp_info *info) | ||
| 522 | { | ||
| 523 | memset(info, 0, sizeof(struct swsusp_info)); | ||
| 524 | info->version_code = LINUX_VERSION_CODE; | ||
| 525 | info->num_physpages = num_physpages; | ||
| 526 | memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); | ||
| 527 | info->cpus = num_online_cpus(); | ||
| 528 | info->image_pages = nr_copy_pages; | ||
| 529 | info->pages = nr_copy_pages + nr_meta_pages + 1; | ||
| 530 | info->size = info->pages; | ||
| 531 | info->size <<= PAGE_SHIFT; | ||
| 532 | } | ||
| 533 | |||
| 534 | /** | ||
| 535 | * pack_orig_addresses - the .orig_address fields of the PBEs from the | ||
| 536 | * list starting at @pbe are stored in the array @buf[] (1 page) | ||
| 537 | */ | ||
| 538 | |||
| 539 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) | ||
| 540 | { | ||
| 541 | int j; | ||
| 542 | |||
| 543 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
| 544 | buf[j] = pbe->orig_address; | ||
| 545 | pbe = pbe->next; | ||
| 546 | } | ||
| 547 | if (!pbe) | ||
| 548 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
| 549 | buf[j] = 0; | ||
| 550 | return pbe; | ||
| 551 | } | ||
| 552 | |||
| 553 | /** | ||
| 554 | * snapshot_read_next - used for reading the system memory snapshot. | ||
| 555 | * | ||
| 556 | * On the first call to it @handle should point to a zeroed | ||
| 557 | * snapshot_handle structure. The structure gets updated and a pointer | ||
| 558 | * to it should be passed to this function every next time. | ||
| 559 | * | ||
| 560 | * The @count parameter should contain the number of bytes the caller | ||
| 561 | * wants to read from the snapshot. It must not be zero. | ||
| 562 | * | ||
| 563 | * On success the function returns a positive number. Then, the caller | ||
| 564 | * is allowed to read up to the returned number of bytes from the memory | ||
| 565 | * location computed by the data_of() macro. The number returned | ||
| 566 | * may be smaller than @count, but this only happens if the read would | ||
| 567 | * cross a page boundary otherwise. | ||
| 568 | * | ||
| 569 | * The function returns 0 to indicate the end of data stream condition, | ||
| 570 | * and a negative number is returned on error. In such cases the | ||
| 571 | * structure pointed to by @handle is not updated and should not be used | ||
| 572 | * any more. | ||
| 573 | */ | ||
| 574 | |||
| 575 | int snapshot_read_next(struct snapshot_handle *handle, size_t count) | ||
| 576 | { | ||
| 577 | if (handle->page > nr_meta_pages + nr_copy_pages) | ||
| 578 | return 0; | ||
| 579 | if (!buffer) { | ||
| 580 | /* This makes the buffer be freed by swsusp_free() */ | ||
| 581 | buffer = alloc_image_page(GFP_ATOMIC, 0); | ||
| 582 | if (!buffer) | ||
| 583 | return -ENOMEM; | ||
| 584 | } | ||
| 585 | if (!handle->offset) { | ||
| 586 | init_header((struct swsusp_info *)buffer); | ||
| 587 | handle->buffer = buffer; | ||
| 588 | handle->pbe = pagedir_nosave; | ||
| 589 | } | ||
| 590 | if (handle->prev < handle->page) { | ||
| 591 | if (handle->page <= nr_meta_pages) { | ||
| 592 | handle->pbe = pack_orig_addresses(buffer, handle->pbe); | ||
| 593 | if (!handle->pbe) | ||
| 594 | handle->pbe = pagedir_nosave; | ||
| 595 | } else { | ||
| 596 | handle->buffer = (void *)handle->pbe->address; | ||
| 597 | handle->pbe = handle->pbe->next; | ||
| 598 | } | ||
| 599 | handle->prev = handle->page; | ||
| 600 | } | ||
| 601 | handle->buf_offset = handle->page_offset; | ||
| 602 | if (handle->page_offset + count >= PAGE_SIZE) { | ||
| 603 | count = PAGE_SIZE - handle->page_offset; | ||
| 604 | handle->page_offset = 0; | ||
| 605 | handle->page++; | ||
| 606 | } else { | ||
| 607 | handle->page_offset += count; | ||
| 608 | } | ||
| 609 | handle->offset += count; | ||
| 610 | return count; | ||
| 611 | } | ||
| 612 | |||
| 613 | /** | ||
| 614 | * mark_unsafe_pages - mark the pages that cannot be used for storing | ||
| 615 | * the image during resume, because they conflict with the pages that | ||
| 616 | * had been used before suspend | ||
| 617 | */ | ||
| 618 | |||
| 619 | static int mark_unsafe_pages(struct pbe *pblist) | ||
| 620 | { | ||
| 621 | struct zone *zone; | ||
| 622 | unsigned long zone_pfn; | ||
| 623 | struct pbe *p; | ||
| 624 | |||
| 625 | if (!pblist) /* a sanity check */ | ||
| 626 | return -EINVAL; | ||
| 627 | |||
| 628 | /* Clear page flags */ | ||
| 629 | for_each_zone (zone) { | ||
| 630 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
| 631 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) | ||
| 632 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
| 633 | zone->zone_start_pfn)); | ||
| 634 | } | ||
| 635 | |||
| 636 | /* Mark orig addresses */ | ||
| 637 | for_each_pbe (p, pblist) { | ||
| 638 | if (virt_addr_valid(p->orig_address)) | ||
| 639 | SetPageNosaveFree(virt_to_page(p->orig_address)); | ||
| 640 | else | ||
| 641 | return -EFAULT; | ||
| 642 | } | ||
| 643 | |||
| 644 | return 0; | ||
| 645 | } | ||
| 646 | |||
| 647 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | ||
| 648 | { | ||
| 649 | /* We assume both lists contain the same number of elements */ | ||
| 650 | while (src) { | ||
| 651 | dst->orig_address = src->orig_address; | ||
| 652 | dst = dst->next; | ||
| 653 | src = src->next; | ||
| 654 | } | ||
| 655 | } | ||
| 656 | |||
| 657 | static int check_header(struct swsusp_info *info) | ||
| 658 | { | ||
| 659 | char *reason = NULL; | ||
| 660 | |||
| 661 | if (info->version_code != LINUX_VERSION_CODE) | ||
| 662 | reason = "kernel version"; | ||
| 663 | if (info->num_physpages != num_physpages) | ||
| 664 | reason = "memory size"; | ||
| 665 | if (strcmp(info->uts.sysname,system_utsname.sysname)) | ||
| 666 | reason = "system type"; | ||
| 667 | if (strcmp(info->uts.release,system_utsname.release)) | ||
| 668 | reason = "kernel release"; | ||
| 669 | if (strcmp(info->uts.version,system_utsname.version)) | ||
| 670 | reason = "version"; | ||
| 671 | if (strcmp(info->uts.machine,system_utsname.machine)) | ||
| 672 | reason = "machine"; | ||
| 673 | if (reason) { | ||
| 674 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | ||
| 675 | return -EPERM; | ||
| 676 | } | ||
| 677 | return 0; | ||
| 678 | } | ||
| 679 | |||
| 680 | /** | ||
| 681 | * load header - check the image header and copy data from it | ||
| 682 | */ | ||
| 683 | |||
| 684 | static int load_header(struct snapshot_handle *handle, | ||
| 685 | struct swsusp_info *info) | ||
| 686 | { | ||
| 687 | int error; | ||
| 688 | struct pbe *pblist; | ||
| 689 | |||
| 690 | error = check_header(info); | ||
| 691 | if (!error) { | ||
| 692 | pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); | ||
| 693 | if (!pblist) | ||
| 694 | return -ENOMEM; | ||
| 695 | pagedir_nosave = pblist; | ||
| 696 | handle->pbe = pblist; | ||
| 697 | nr_copy_pages = info->image_pages; | ||
| 698 | nr_meta_pages = info->pages - info->image_pages - 1; | ||
| 699 | } | ||
| 700 | return error; | ||
| 701 | } | ||
| 702 | |||
| 703 | /** | ||
| 704 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | ||
| 705 | * the PBEs in the list starting at @pbe | ||
| 706 | */ | ||
| 707 | |||
| 708 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | ||
| 709 | struct pbe *pbe) | ||
| 710 | { | ||
| 711 | int j; | ||
| 712 | |||
| 713 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
| 714 | pbe->orig_address = buf[j]; | ||
| 715 | pbe = pbe->next; | ||
| 716 | } | ||
| 717 | return pbe; | ||
| 718 | } | ||
| 719 | |||
| 720 | /** | ||
| 721 | * create_image - use metadata contained in the PBE list | ||
| 722 | * pointed to by pagedir_nosave to mark the pages that will | ||
| 723 | * be overwritten in the process of restoring the system | ||
| 724 | * memory state from the image and allocate memory for | ||
| 725 | * the image avoiding these pages | ||
| 726 | */ | ||
| 727 | |||
| 728 | static int create_image(struct snapshot_handle *handle) | ||
| 729 | { | ||
| 730 | int error = 0; | ||
| 731 | struct pbe *p, *pblist; | ||
| 732 | |||
| 733 | p = pagedir_nosave; | ||
| 734 | error = mark_unsafe_pages(p); | ||
| 735 | if (!error) { | ||
| 736 | pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); | ||
| 737 | if (pblist) | ||
| 738 | copy_page_backup_list(pblist, p); | ||
| 739 | free_pagedir(p); | ||
| 740 | if (!pblist) | ||
| 741 | error = -ENOMEM; | ||
| 742 | } | ||
| 743 | if (!error) | ||
| 744 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | ||
| 745 | if (!error) { | ||
| 746 | release_eaten_pages(); | ||
| 747 | pagedir_nosave = pblist; | ||
| 748 | } else { | ||
| 749 | pagedir_nosave = NULL; | ||
| 750 | handle->pbe = NULL; | ||
| 751 | nr_copy_pages = 0; | ||
| 752 | nr_meta_pages = 0; | ||
| 753 | } | ||
| 754 | return error; | ||
| 755 | } | ||
| 756 | |||
| 757 | /** | ||
| 758 | * snapshot_write_next - used for writing the system memory snapshot. | ||
| 759 | * | ||
| 760 | * On the first call to it @handle should point to a zeroed | ||
| 761 | * snapshot_handle structure. The structure gets updated and a pointer | ||
| 762 | * to it should be passed to this function every next time. | ||
| 763 | * | ||
| 764 | * The @count parameter should contain the number of bytes the caller | ||
| 765 | * wants to write to the image. It must not be zero. | ||
| 766 | * | ||
| 767 | * On success the function returns a positive number. Then, the caller | ||
| 768 | * is allowed to write up to the returned number of bytes to the memory | ||
| 769 | * location computed by the data_of() macro. The number returned | ||
| 770 | * may be smaller than @count, but this only happens if the write would | ||
| 771 | * cross a page boundary otherwise. | ||
| 772 | * | ||
| 773 | * The function returns 0 to indicate the "end of file" condition, | ||
| 774 | * and a negative number is returned on error. In such cases the | ||
| 775 | * structure pointed to by @handle is not updated and should not be used | ||
| 776 | * any more. | ||
| 777 | */ | ||
| 778 | |||
| 779 | int snapshot_write_next(struct snapshot_handle *handle, size_t count) | ||
| 780 | { | ||
| 781 | int error = 0; | ||
| 782 | |||
| 783 | if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) | ||
| 784 | return 0; | ||
| 785 | if (!buffer) { | ||
| 786 | /* This makes the buffer be freed by swsusp_free() */ | ||
| 787 | buffer = alloc_image_page(GFP_ATOMIC, 0); | ||
| 788 | if (!buffer) | ||
| 789 | return -ENOMEM; | ||
| 790 | } | ||
| 791 | if (!handle->offset) | ||
| 792 | handle->buffer = buffer; | ||
| 793 | if (handle->prev < handle->page) { | ||
| 794 | if (!handle->prev) { | ||
| 795 | error = load_header(handle, (struct swsusp_info *)buffer); | ||
| 796 | if (error) | ||
| 797 | return error; | ||
| 798 | } else if (handle->prev <= nr_meta_pages) { | ||
| 799 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); | ||
| 800 | if (!handle->pbe) { | ||
| 801 | error = create_image(handle); | ||
| 802 | if (error) | ||
| 803 | return error; | ||
| 804 | handle->pbe = pagedir_nosave; | ||
| 805 | handle->buffer = (void *)handle->pbe->address; | ||
| 806 | } | ||
| 807 | } else { | ||
| 808 | handle->pbe = handle->pbe->next; | ||
| 809 | handle->buffer = (void *)handle->pbe->address; | ||
| 810 | } | ||
| 811 | handle->prev = handle->page; | ||
| 812 | } | ||
| 813 | handle->buf_offset = handle->page_offset; | ||
| 814 | if (handle->page_offset + count >= PAGE_SIZE) { | ||
| 815 | count = PAGE_SIZE - handle->page_offset; | ||
| 816 | handle->page_offset = 0; | ||
| 817 | handle->page++; | ||
| 818 | } else { | ||
| 819 | handle->page_offset += count; | ||
| 820 | } | ||
| 821 | handle->offset += count; | ||
| 822 | return count; | ||
| 823 | } | ||
| 824 | |||
| 825 | int snapshot_image_loaded(struct snapshot_handle *handle) | ||
| 826 | { | ||
| 827 | return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || | ||
| 828 | handle->page <= nr_meta_pages + nr_copy_pages); | ||
| 829 | } | ||
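Note: snapshot_read_next() and snapshot_write_next() turn the in-memory image into a byte stream that callers pull from (or push into) in at most page-sized pieces, as documented in the comments above. A minimal consumer sketch of the read side, assuming a hypothetical example_emit() that stores the bytes somewhere and that "power.h" is included for struct snapshot_handle and data_of(); this mirrors the save_image() loop in the new kernel/power/swap.c below but is not itself part of the commit:

    /* Hypothetical example_dump_image()/example_emit(); snapshot_read_next()
     * and data_of() are the interfaces added in this patch. */
    static int example_dump_image(void)
    {
            struct snapshot_handle snapshot;
            int ret;

            memset(&snapshot, 0, sizeof(struct snapshot_handle));  /* handle must start zeroed */
            while ((ret = snapshot_read_next(&snapshot, PAGE_SIZE)) > 0) {
                    /* data_of() points at up to 'ret' bytes of header, metadata or page data */
                    int error = example_emit(data_of(snapshot), ret);
                    if (error)
                            return error;
            }
            return ret;     /* 0 means end of image, negative means error */
    }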
diff --git a/kernel/power/swap.c b/kernel/power/swap.c new file mode 100644 index 000000000000..9177f3f73a6c --- /dev/null +++ b/kernel/power/swap.c | |||
| @@ -0,0 +1,544 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/power/swap.c | ||
| 3 | * | ||
| 4 | * This file provides functions for reading the suspend image from | ||
| 5 | * and writing it to a swap partition. | ||
| 6 | * | ||
| 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> | ||
| 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | ||
| 9 | * | ||
| 10 | * This file is released under the GPLv2. | ||
| 11 | * | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/smp_lock.h> | ||
| 16 | #include <linux/file.h> | ||
| 17 | #include <linux/utsname.h> | ||
| 18 | #include <linux/version.h> | ||
| 19 | #include <linux/delay.h> | ||
| 20 | #include <linux/bitops.h> | ||
| 21 | #include <linux/genhd.h> | ||
| 22 | #include <linux/device.h> | ||
| 23 | #include <linux/buffer_head.h> | ||
| 24 | #include <linux/bio.h> | ||
| 25 | #include <linux/swap.h> | ||
| 26 | #include <linux/swapops.h> | ||
| 27 | #include <linux/pm.h> | ||
| 28 | |||
| 29 | #include "power.h" | ||
| 30 | |||
| 31 | extern char resume_file[]; | ||
| 32 | |||
| 33 | #define SWSUSP_SIG "S1SUSPEND" | ||
| 34 | |||
| 35 | static struct swsusp_header { | ||
| 36 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | ||
| 37 | swp_entry_t image; | ||
| 38 | char orig_sig[10]; | ||
| 39 | char sig[10]; | ||
| 40 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | ||
| 41 | |||
| 42 | /* | ||
| 43 | * Saving part... | ||
| 44 | */ | ||
| 45 | |||
| 46 | static unsigned short root_swap = 0xffff; | ||
| 47 | |||
| 48 | static int mark_swapfiles(swp_entry_t start) | ||
| 49 | { | ||
| 50 | int error; | ||
| 51 | |||
| 52 | rw_swap_page_sync(READ, | ||
| 53 | swp_entry(root_swap, 0), | ||
| 54 | virt_to_page((unsigned long)&swsusp_header)); | ||
| 55 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | ||
| 56 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | ||
| 57 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | ||
| 58 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | ||
| 59 | swsusp_header.image = start; | ||
| 60 | error = rw_swap_page_sync(WRITE, | ||
| 61 | swp_entry(root_swap, 0), | ||
| 62 | virt_to_page((unsigned long) | ||
| 63 | &swsusp_header)); | ||
| 64 | } else { | ||
| 65 | pr_debug("swsusp: Partition is not swap space.\n"); | ||
| 66 | error = -ENODEV; | ||
| 67 | } | ||
| 68 | return error; | ||
| 69 | } | ||
| 70 | |||
| 71 | /** | ||
| 72 | * swsusp_swap_check - check if the resume device is a swap device | ||
| 73 | * and get its index (if so) | ||
| 74 | */ | ||
| 75 | |||
| 76 | static int swsusp_swap_check(void) /* This is called before saving image */ | ||
| 77 | { | ||
| 78 | int res = swap_type_of(swsusp_resume_device); | ||
| 79 | |||
| 80 | if (res >= 0) { | ||
| 81 | root_swap = res; | ||
| 82 | return 0; | ||
| 83 | } | ||
| 84 | return res; | ||
| 85 | } | ||
| 86 | |||
| 87 | /** | ||
| 88 | * write_page - Write one page to given swap location. | ||
| 89 | * @buf: Address we're writing. | ||
| 90 | * @offset: Offset of the swap page we're writing to. | ||
| 91 | */ | ||
| 92 | |||
| 93 | static int write_page(void *buf, unsigned long offset) | ||
| 94 | { | ||
| 95 | swp_entry_t entry; | ||
| 96 | int error = -ENOSPC; | ||
| 97 | |||
| 98 | if (offset) { | ||
| 99 | entry = swp_entry(root_swap, offset); | ||
| 100 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); | ||
| 101 | } | ||
| 102 | return error; | ||
| 103 | } | ||
| 104 | |||
| 105 | /* | ||
| 106 | * The swap map is a data structure used for keeping track of each page | ||
| 107 | * written to a swap partition. It consists of many swap_map_page | ||
| 108 | * structures that contain each an array of MAP_PAGE_SIZE swap entries. | ||
| 109 | * These structures are stored on the swap and linked together with the | ||
| 110 | * help of the .next_swap member. | ||
| 111 | * | ||
| 112 | * The swap map is created during suspend. The swap map pages are | ||
| 113 | * allocated and populated one at a time, so we only need one memory | ||
| 114 | * page to set up the entire structure. | ||
| 115 | * | ||
| 116 | * During resume we also only need to use one swap_map_page structure | ||
| 117 | * at a time. | ||
| 118 | */ | ||
| 119 | |||
| 120 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) | ||
| 121 | |||
| 122 | struct swap_map_page { | ||
| 123 | unsigned long entries[MAP_PAGE_ENTRIES]; | ||
| 124 | unsigned long next_swap; | ||
| 125 | }; | ||
| 126 | |||
| 127 | /** | ||
| 128 | * The swap_map_handle structure is used for handling swap in | ||
| 129 | * a file-alike way | ||
| 130 | */ | ||
| 131 | |||
| 132 | struct swap_map_handle { | ||
| 133 | struct swap_map_page *cur; | ||
| 134 | unsigned long cur_swap; | ||
| 135 | struct bitmap_page *bitmap; | ||
| 136 | unsigned int k; | ||
| 137 | }; | ||
| 138 | |||
| 139 | static void release_swap_writer(struct swap_map_handle *handle) | ||
| 140 | { | ||
| 141 | if (handle->cur) | ||
| 142 | free_page((unsigned long)handle->cur); | ||
| 143 | handle->cur = NULL; | ||
| 144 | if (handle->bitmap) | ||
| 145 | free_bitmap(handle->bitmap); | ||
| 146 | handle->bitmap = NULL; | ||
| 147 | } | ||
| 148 | |||
| 149 | static int get_swap_writer(struct swap_map_handle *handle) | ||
| 150 | { | ||
| 151 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); | ||
| 152 | if (!handle->cur) | ||
| 153 | return -ENOMEM; | ||
| 154 | handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); | ||
| 155 | if (!handle->bitmap) { | ||
| 156 | release_swap_writer(handle); | ||
| 157 | return -ENOMEM; | ||
| 158 | } | ||
| 159 | handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); | ||
| 160 | if (!handle->cur_swap) { | ||
| 161 | release_swap_writer(handle); | ||
| 162 | return -ENOSPC; | ||
| 163 | } | ||
| 164 | handle->k = 0; | ||
| 165 | return 0; | ||
| 166 | } | ||
| 167 | |||
| 168 | static int swap_write_page(struct swap_map_handle *handle, void *buf) | ||
| 169 | { | ||
| 170 | int error; | ||
| 171 | unsigned long offset; | ||
| 172 | |||
| 173 | if (!handle->cur) | ||
| 174 | return -EINVAL; | ||
| 175 | offset = alloc_swap_page(root_swap, handle->bitmap); | ||
| 176 | error = write_page(buf, offset); | ||
| 177 | if (error) | ||
| 178 | return error; | ||
| 179 | handle->cur->entries[handle->k++] = offset; | ||
| 180 | if (handle->k >= MAP_PAGE_ENTRIES) { | ||
| 181 | offset = alloc_swap_page(root_swap, handle->bitmap); | ||
| 182 | if (!offset) | ||
| 183 | return -ENOSPC; | ||
| 184 | handle->cur->next_swap = offset; | ||
| 185 | error = write_page(handle->cur, handle->cur_swap); | ||
| 186 | if (error) | ||
| 187 | return error; | ||
| 188 | memset(handle->cur, 0, PAGE_SIZE); | ||
| 189 | handle->cur_swap = offset; | ||
| 190 | handle->k = 0; | ||
| 191 | } | ||
| 192 | return 0; | ||
| 193 | } | ||
| 194 | |||
| 195 | static int flush_swap_writer(struct swap_map_handle *handle) | ||
| 196 | { | ||
| 197 | if (handle->cur && handle->cur_swap) | ||
| 198 | return write_page(handle->cur, handle->cur_swap); | ||
| 199 | else | ||
| 200 | return -EINVAL; | ||
| 201 | } | ||
| 202 | |||
| 203 | /** | ||
| 204 | * save_image - save the suspend image data | ||
| 205 | */ | ||
| 206 | |||
| 207 | static int save_image(struct swap_map_handle *handle, | ||
| 208 | struct snapshot_handle *snapshot, | ||
| 209 | unsigned int nr_pages) | ||
| 210 | { | ||
| 211 | unsigned int m; | ||
| 212 | int ret; | ||
| 213 | int error = 0; | ||
| 214 | |||
| 215 | printk("Saving image data pages (%u pages) ... ", nr_pages); | ||
| 216 | m = nr_pages / 100; | ||
| 217 | if (!m) | ||
| 218 | m = 1; | ||
| 219 | nr_pages = 0; | ||
| 220 | do { | ||
| 221 | ret = snapshot_read_next(snapshot, PAGE_SIZE); | ||
| 222 | if (ret > 0) { | ||
| 223 | error = swap_write_page(handle, data_of(*snapshot)); | ||
| 224 | if (error) | ||
| 225 | break; | ||
| 226 | if (!(nr_pages % m)) | ||
| 227 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 228 | nr_pages++; | ||
| 229 | } | ||
| 230 | } while (ret > 0); | ||
| 231 | if (!error) | ||
| 232 | printk("\b\b\b\bdone\n"); | ||
| 233 | return error; | ||
| 234 | } | ||
| 235 | |||
| 236 | /** | ||
| 237 | * enough_swap - Make sure we have enough swap to save the image. | ||
| 238 | * | ||
| 239 | * Returns TRUE or FALSE after checking the total amount of swap | ||
| 240 | * space available from the resume partition. | ||
| 241 | */ | ||
| 242 | |||
| 243 | static int enough_swap(unsigned int nr_pages) | ||
| 244 | { | ||
| 245 | unsigned int free_swap = count_swap_pages(root_swap, 1); | ||
| 246 | |||
| 247 | pr_debug("swsusp: free swap pages: %u\n", free_swap); | ||
| 248 | return free_swap > (nr_pages + PAGES_FOR_IO + | ||
| 249 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
| 250 | } | ||
| 251 | |||
| 252 | /** | ||
| 253 | * swsusp_write - Write entire image and metadata. | ||
| 254 | * | ||
| 255 | * It is important _NOT_ to umount filesystems at this point. We want | ||
| 256 | * them synced (in case something goes wrong) but we do NOT want to mark | ||
| 257 | * the filesystem clean: it is not. (And it does not matter; if we resume | ||
| 258 | * correctly, we'll mark the system clean anyway.) | ||
| 259 | */ | ||
| 260 | |||
| 261 | int swsusp_write(void) | ||
| 262 | { | ||
| 263 | struct swap_map_handle handle; | ||
| 264 | struct snapshot_handle snapshot; | ||
| 265 | struct swsusp_info *header; | ||
| 266 | unsigned long start; | ||
| 267 | int error; | ||
| 268 | |||
| 269 | if ((error = swsusp_swap_check())) { | ||
| 270 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | ||
| 271 | return error; | ||
| 272 | } | ||
| 273 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | ||
| 274 | error = snapshot_read_next(&snapshot, PAGE_SIZE); | ||
| 275 | if (error < PAGE_SIZE) | ||
| 276 | return error < 0 ? error : -EFAULT; | ||
| 277 | header = (struct swsusp_info *)data_of(snapshot); | ||
| 278 | if (!enough_swap(header->pages)) { | ||
| 279 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
| 280 | return -ENOSPC; | ||
| 281 | } | ||
| 282 | error = get_swap_writer(&handle); | ||
| 283 | if (!error) { | ||
| 284 | start = handle.cur_swap; | ||
| 285 | error = swap_write_page(&handle, header); | ||
| 286 | } | ||
| 287 | if (!error) | ||
| 288 | error = save_image(&handle, &snapshot, header->pages - 1); | ||
| 289 | if (!error) { | ||
| 290 | flush_swap_writer(&handle); | ||
| 291 | printk("S"); | ||
| 292 | error = mark_swapfiles(swp_entry(root_swap, start)); | ||
| 293 | printk("|\n"); | ||
| 294 | } | ||
| 295 | if (error) | ||
| 296 | free_all_swap_pages(root_swap, handle.bitmap); | ||
| 297 | release_swap_writer(&handle); | ||
| 298 | return error; | ||
| 299 | } | ||
| 300 | |||
| 301 | /* | ||
| 302 | * Using bio to read from swap. | ||
| 303 | * This code requires a bit more work than just using buffer heads | ||
| 304 | * but it is the recommended way for 2.5/2.6. | ||
| 305 | * The following are to signal the beginning and end of I/O. Bios | ||
| 306 | * finish asynchronously, while we want them to happen synchronously. | ||
| 307 | * A simple atomic_t and a wait loop take care of this problem. | ||
| 308 | */ | ||
| 309 | |||
| 310 | static atomic_t io_done = ATOMIC_INIT(0); | ||
| 311 | |||
| 312 | static int end_io(struct bio *bio, unsigned int num, int err) | ||
| 313 | { | ||
| 314 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
| 315 | panic("I/O error reading memory image"); | ||
| 316 | atomic_set(&io_done, 0); | ||
| 317 | return 0; | ||
| 318 | } | ||
| 319 | |||
| 320 | static struct block_device *resume_bdev; | ||
| 321 | |||
| 322 | /** | ||
| 323 | * submit - submit BIO request. | ||
| 324 | * @rw: READ or WRITE. | ||
| 325 | * @page_off: physical offset of page. | ||
| 326 | * @page: page we're reading or writing. | ||
| 327 | * | ||
| 328 | * Straight from the textbook - allocate and initialize the bio. | ||
| 329 | * If we're writing, make sure the page is marked as dirty. | ||
| 330 | * Then submit it and wait. | ||
| 331 | */ | ||
| 332 | |||
| 333 | static int submit(int rw, pgoff_t page_off, void *page) | ||
| 334 | { | ||
| 335 | int error = 0; | ||
| 336 | struct bio *bio; | ||
| 337 | |||
| 338 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
| 339 | if (!bio) | ||
| 340 | return -ENOMEM; | ||
| 341 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
| 342 | bio->bi_bdev = resume_bdev; | ||
| 343 | bio->bi_end_io = end_io; | ||
| 344 | |||
| 345 | if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { | ||
| 346 | printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); | ||
| 347 | error = -EFAULT; | ||
| 348 | goto Done; | ||
| 349 | } | ||
| 350 | |||
| 351 | atomic_set(&io_done, 1); | ||
| 352 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 353 | while (atomic_read(&io_done)) | ||
| 354 | yield(); | ||
| 355 | if (rw == READ) | ||
| 356 | bio_set_pages_dirty(bio); | ||
| 357 | Done: | ||
| 358 | bio_put(bio); | ||
| 359 | return error; | ||
| 360 | } | ||
| 361 | |||
| 362 | static int bio_read_page(pgoff_t page_off, void *page) | ||
| 363 | { | ||
| 364 | return submit(READ, page_off, page); | ||
| 365 | } | ||
| 366 | |||
| 367 | static int bio_write_page(pgoff_t page_off, void *page) | ||
| 368 | { | ||
| 369 | return submit(WRITE, page_off, page); | ||
| 370 | } | ||
| 371 | |||
| 372 | /** | ||
| 373 | * The following functions allow us to read data using a swap map | ||
| 374 | * in a file-like way | ||
| 375 | */ | ||
| 376 | |||
| 377 | static void release_swap_reader(struct swap_map_handle *handle) | ||
| 378 | { | ||
| 379 | if (handle->cur) | ||
| 380 | free_page((unsigned long)handle->cur); | ||
| 381 | handle->cur = NULL; | ||
| 382 | } | ||
| 383 | |||
| 384 | static int get_swap_reader(struct swap_map_handle *handle, | ||
| 385 | swp_entry_t start) | ||
| 386 | { | ||
| 387 | int error; | ||
| 388 | |||
| 389 | if (!swp_offset(start)) | ||
| 390 | return -EINVAL; | ||
| 391 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
| 392 | if (!handle->cur) | ||
| 393 | return -ENOMEM; | ||
| 394 | error = bio_read_page(swp_offset(start), handle->cur); | ||
| 395 | if (error) { | ||
| 396 | release_swap_reader(handle); | ||
| 397 | return error; | ||
| 398 | } | ||
| 399 | handle->k = 0; | ||
| 400 | return 0; | ||
| 401 | } | ||
| 402 | |||
| 403 | static int swap_read_page(struct swap_map_handle *handle, void *buf) | ||
| 404 | { | ||
| 405 | unsigned long offset; | ||
| 406 | int error; | ||
| 407 | |||
| 408 | if (!handle->cur) | ||
| 409 | return -EINVAL; | ||
| 410 | offset = handle->cur->entries[handle->k]; | ||
| 411 | if (!offset) | ||
| 412 | return -EFAULT; | ||
| 413 | error = bio_read_page(offset, buf); | ||
| 414 | if (error) | ||
| 415 | return error; | ||
| 416 | if (++handle->k >= MAP_PAGE_ENTRIES) { | ||
| 417 | handle->k = 0; | ||
| 418 | offset = handle->cur->next_swap; | ||
| 419 | if (!offset) | ||
| 420 | release_swap_reader(handle); | ||
| 421 | else | ||
| 422 | error = bio_read_page(offset, handle->cur); | ||
| 423 | } | ||
| 424 | return error; | ||
| 425 | } | ||
| 426 | |||
| 427 | /** | ||
| 428 | * load_image - load the image using the swap map handle | ||
| 429 | * @handle and the snapshot handle @snapshot | ||
| 430 | * (assume there are @nr_pages pages to load) | ||
| 431 | */ | ||
| 432 | |||
| 433 | static int load_image(struct swap_map_handle *handle, | ||
| 434 | struct snapshot_handle *snapshot, | ||
| 435 | unsigned int nr_pages) | ||
| 436 | { | ||
| 437 | unsigned int m; | ||
| 438 | int ret; | ||
| 439 | int error = 0; | ||
| 440 | |||
| 441 | printk("Loading image data pages (%u pages) ... ", nr_pages); | ||
| 442 | m = nr_pages / 100; | ||
| 443 | if (!m) | ||
| 444 | m = 1; | ||
| 445 | nr_pages = 0; | ||
| 446 | do { | ||
| 447 | ret = snapshot_write_next(snapshot, PAGE_SIZE); | ||
| 448 | if (ret > 0) { | ||
| 449 | error = swap_read_page(handle, data_of(*snapshot)); | ||
| 450 | if (error) | ||
| 451 | break; | ||
| 452 | if (!(nr_pages % m)) | ||
| 453 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 454 | nr_pages++; | ||
| 455 | } | ||
| 456 | } while (ret > 0); | ||
| 457 | if (!error) | ||
| 458 | printk("\b\b\b\bdone\n"); | ||
| 459 | if (!snapshot_image_loaded(snapshot)) | ||
| 460 | error = -ENODATA; | ||
| 461 | return error; | ||
| 462 | } | ||
| 463 | |||
| 464 | int swsusp_read(void) | ||
| 465 | { | ||
| 466 | int error; | ||
| 467 | struct swap_map_handle handle; | ||
| 468 | struct snapshot_handle snapshot; | ||
| 469 | struct swsusp_info *header; | ||
| 470 | |||
| 471 | if (IS_ERR(resume_bdev)) { | ||
| 472 | pr_debug("swsusp: block device not initialised\n"); | ||
| 473 | return PTR_ERR(resume_bdev); | ||
| 474 | } | ||
| 475 | |||
| 476 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | ||
| 477 | error = snapshot_write_next(&snapshot, PAGE_SIZE); | ||
| 478 | if (error < PAGE_SIZE) | ||
| 479 | return error < 0 ? error : -EFAULT; | ||
| 480 | header = (struct swsusp_info *)data_of(snapshot); | ||
| 481 | error = get_swap_reader(&handle, swsusp_header.image); | ||
| 482 | if (!error) | ||
| 483 | error = swap_read_page(&handle, header); | ||
| 484 | if (!error) | ||
| 485 | error = load_image(&handle, &snapshot, header->pages - 1); | ||
| 486 | release_swap_reader(&handle); | ||
| 487 | |||
| 488 | blkdev_put(resume_bdev); | ||
| 489 | |||
| 490 | if (!error) | ||
| 491 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
| 492 | else | ||
| 493 | pr_debug("swsusp: Error %d resuming\n", error); | ||
| 494 | return error; | ||
| 495 | } | ||
| 496 | |||
| 497 | /** | ||
| 498 | * swsusp_check - Check for swsusp signature in the resume device | ||
| 499 | */ | ||
| 500 | |||
| 501 | int swsusp_check(void) | ||
| 502 | { | ||
| 503 | int error; | ||
| 504 | |||
| 505 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | ||
| 506 | if (!IS_ERR(resume_bdev)) { | ||
| 507 | set_blocksize(resume_bdev, PAGE_SIZE); | ||
| 508 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | ||
| 509 | if ((error = bio_read_page(0, &swsusp_header))) | ||
| 510 | return error; | ||
| 511 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
| 512 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
| 513 | /* Reset swap signature now */ | ||
| 514 | error = bio_write_page(0, &swsusp_header); | ||
| 515 | } else { | ||
| 516 | return -EINVAL; | ||
| 517 | } | ||
| 518 | if (error) | ||
| 519 | blkdev_put(resume_bdev); | ||
| 520 | else | ||
| 521 | pr_debug("swsusp: Signature found, resuming\n"); | ||
| 522 | } else { | ||
| 523 | error = PTR_ERR(resume_bdev); | ||
| 524 | } | ||
| 525 | |||
| 526 | if (error) | ||
| 527 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
| 528 | |||
| 529 | return error; | ||
| 530 | } | ||
| 531 | |||
| 532 | /** | ||
| 533 | * swsusp_close - close swap device. | ||
| 534 | */ | ||
| 535 | |||
| 536 | void swsusp_close(void) | ||
| 537 | { | ||
| 538 | if (IS_ERR(resume_bdev)) { | ||
| 539 | pr_debug("swsusp: block device not initialised\n"); | ||
| 540 | return; | ||
| 541 | } | ||
| 542 | |||
| 543 | blkdev_put(resume_bdev); | ||
| 544 | } | ||
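The swap map writer added above (get_swap_writer(), swap_write_page() and flush_swap_writer() in the new kernel/power/swap.c) chains fixed-size index pages together: each swap_map_page collects up to MAP_PAGE_ENTRIES data-page offsets, and once it fills up the writer allocates a slot for the next map page, records it in next_swap, flushes the current page and starts over. The following is a minimal user-space sketch of that bookkeeping, not kernel code: the flat slot array stands in for the swap device, and PAGE_SLOTS and MAP_PAGE_ENTRIES are illustrative toy values rather than the kernel's definitions.

/*
 * map_chain_demo.c - user-space sketch of the swap map chaining used by
 * the new kernel/power/swap.c.  alloc_swap_page() here is a stand-in for
 * the kernel helper of the same name and does not share its signature.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SLOTS       64	/* size of the simulated swap device */
#define MAP_PAGE_ENTRIES  4	/* toy value; the kernel derives it from PAGE_SIZE */

struct swap_map_page {
	unsigned long entries[MAP_PAGE_ENTRIES];	/* offsets of data pages */
	unsigned long next_swap;			/* offset of the next map page */
};

static int slot_used[PAGE_SLOTS];

/* Pretend-allocate one swap slot; slot 0 is reserved and 0 means "no space". */
static unsigned long alloc_swap_page(void)
{
	unsigned long off;

	for (off = 1; off < PAGE_SLOTS; off++)
		if (!slot_used[off]) {
			slot_used[off] = 1;
			return off;
		}
	return 0;
}

int main(void)
{
	struct swap_map_page cur = { { 0 }, 0 };
	unsigned long cur_swap = alloc_swap_page();	/* slot for 'cur' itself */
	unsigned int k = 0;
	int page;

	for (page = 0; page < 10; page++) {
		unsigned long off = alloc_swap_page();	/* slot for this data page */

		cur.entries[k++] = off;
		printf("data page %2d -> swap offset %lu\n", page, off);
		if (k >= MAP_PAGE_ENTRIES) {
			/* map page full: link the next one, "write" this one out */
			unsigned long next = alloc_swap_page();

			cur.next_swap = next;
			printf("map page at %lu full, next map page at %lu\n",
			       cur_swap, next);
			memset(&cur, 0, sizeof(cur));
			cur_swap = next;
			k = 0;
		}
	}
	printf("final partial map page would be flushed to offset %lu\n", cur_swap);
	return 0;
}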
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d9d08f72f76..c4016cbbd3e0 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -31,41 +31,24 @@ | |||
| 31 | * Fixed runaway init | 31 | * Fixed runaway init |
| 32 | * | 32 | * |
| 33 | * Rafael J. Wysocki <rjw@sisk.pl> | 33 | * Rafael J. Wysocki <rjw@sisk.pl> |
| 34 | * Added the swap map data structure and reworked the handling of swap | 34 | * Reworked the freeing of memory and the handling of swap |
| 35 | * | 35 | * |
| 36 | * More state savers are welcome. Especially for the scsi layer... | 36 | * More state savers are welcome. Especially for the scsi layer... |
| 37 | * | 37 | * |
| 38 | * For TODOs, FIXMEs also look in Documentation/power/swsusp.txt | 38 | * For TODOs, FIXMEs also look in Documentation/power/swsusp.txt |
| 39 | */ | 39 | */ |
| 40 | 40 | ||
| 41 | #include <linux/module.h> | ||
| 42 | #include <linux/mm.h> | 41 | #include <linux/mm.h> |
| 43 | #include <linux/suspend.h> | 42 | #include <linux/suspend.h> |
| 44 | #include <linux/smp_lock.h> | ||
| 45 | #include <linux/file.h> | ||
| 46 | #include <linux/utsname.h> | ||
| 47 | #include <linux/version.h> | ||
| 48 | #include <linux/delay.h> | ||
| 49 | #include <linux/bitops.h> | ||
| 50 | #include <linux/spinlock.h> | 43 | #include <linux/spinlock.h> |
| 51 | #include <linux/genhd.h> | ||
| 52 | #include <linux/kernel.h> | 44 | #include <linux/kernel.h> |
| 53 | #include <linux/major.h> | 45 | #include <linux/major.h> |
| 54 | #include <linux/swap.h> | 46 | #include <linux/swap.h> |
| 55 | #include <linux/pm.h> | 47 | #include <linux/pm.h> |
| 56 | #include <linux/device.h> | ||
| 57 | #include <linux/buffer_head.h> | ||
| 58 | #include <linux/swapops.h> | 48 | #include <linux/swapops.h> |
| 59 | #include <linux/bootmem.h> | 49 | #include <linux/bootmem.h> |
| 60 | #include <linux/syscalls.h> | 50 | #include <linux/syscalls.h> |
| 61 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
| 62 | #include <linux/bio.h> | ||
| 63 | |||
| 64 | #include <asm/uaccess.h> | ||
| 65 | #include <asm/mmu_context.h> | ||
| 66 | #include <asm/pgtable.h> | ||
| 67 | #include <asm/tlbflush.h> | ||
| 68 | #include <asm/io.h> | ||
| 69 | 52 | ||
| 70 | #include "power.h" | 53 | #include "power.h" |
| 71 | 54 | ||
| @@ -77,6 +60,8 @@ | |||
| 77 | */ | 60 | */ |
| 78 | unsigned long image_size = 500 * 1024 * 1024; | 61 | unsigned long image_size = 500 * 1024 * 1024; |
| 79 | 62 | ||
| 63 | int in_suspend __nosavedata = 0; | ||
| 64 | |||
| 80 | #ifdef CONFIG_HIGHMEM | 65 | #ifdef CONFIG_HIGHMEM |
| 81 | unsigned int count_highmem_pages(void); | 66 | unsigned int count_highmem_pages(void); |
| 82 | int save_highmem(void); | 67 | int save_highmem(void); |
| @@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; } | |||
| 87 | static unsigned int count_highmem_pages(void) { return 0; } | 72 | static unsigned int count_highmem_pages(void) { return 0; } |
| 88 | #endif | 73 | #endif |
| 89 | 74 | ||
| 90 | extern char resume_file[]; | ||
| 91 | |||
| 92 | #define SWSUSP_SIG "S1SUSPEND" | ||
| 93 | |||
| 94 | static struct swsusp_header { | ||
| 95 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | ||
| 96 | swp_entry_t image; | ||
| 97 | char orig_sig[10]; | ||
| 98 | char sig[10]; | ||
| 99 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | ||
| 100 | |||
| 101 | static struct swsusp_info swsusp_info; | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Saving part... | ||
| 105 | */ | ||
| 106 | |||
| 107 | static unsigned short root_swap = 0xffff; | ||
| 108 | |||
| 109 | static int mark_swapfiles(swp_entry_t start) | ||
| 110 | { | ||
| 111 | int error; | ||
| 112 | |||
| 113 | rw_swap_page_sync(READ, | ||
| 114 | swp_entry(root_swap, 0), | ||
| 115 | virt_to_page((unsigned long)&swsusp_header)); | ||
| 116 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | ||
| 117 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | ||
| 118 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | ||
| 119 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | ||
| 120 | swsusp_header.image = start; | ||
| 121 | error = rw_swap_page_sync(WRITE, | ||
| 122 | swp_entry(root_swap, 0), | ||
| 123 | virt_to_page((unsigned long) | ||
| 124 | &swsusp_header)); | ||
| 125 | } else { | ||
| 126 | pr_debug("swsusp: Partition is not swap space.\n"); | ||
| 127 | error = -ENODEV; | ||
| 128 | } | ||
| 129 | return error; | ||
| 130 | } | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Check whether the swap device is the specified resume | ||
| 134 | * device, irrespective of whether they are specified by | ||
| 135 | * identical names. | ||
| 136 | * | ||
| 137 | * (Thus, device inode aliasing is allowed. You can say /dev/hda4 | ||
| 138 | * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] | ||
| 139 | * and they'll be considered the same device. This is *necessary* for | ||
| 140 | * devfs, since the resume code can only recognize the form /dev/hda4, | ||
| 141 | * but the suspend code would see the long name.) | ||
| 142 | */ | ||
| 143 | static inline int is_resume_device(const struct swap_info_struct *swap_info) | ||
| 144 | { | ||
| 145 | struct file *file = swap_info->swap_file; | ||
| 146 | struct inode *inode = file->f_dentry->d_inode; | ||
| 147 | |||
| 148 | return S_ISBLK(inode->i_mode) && | ||
| 149 | swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); | ||
| 150 | } | ||
| 151 | |||
| 152 | static int swsusp_swap_check(void) /* This is called before saving image */ | ||
| 153 | { | ||
| 154 | int i; | ||
| 155 | |||
| 156 | spin_lock(&swap_lock); | ||
| 157 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
| 158 | if (!(swap_info[i].flags & SWP_WRITEOK)) | ||
| 159 | continue; | ||
| 160 | if (!swsusp_resume_device || is_resume_device(swap_info + i)) { | ||
| 161 | spin_unlock(&swap_lock); | ||
| 162 | root_swap = i; | ||
| 163 | return 0; | ||
| 164 | } | ||
| 165 | } | ||
| 166 | spin_unlock(&swap_lock); | ||
| 167 | return -ENODEV; | ||
| 168 | } | ||
| 169 | |||
| 170 | /** | ||
| 171 | * write_page - Write one page to a fresh swap location. | ||
| 172 | * @addr: Address we're writing. | ||
| 173 | * @loc: Place to store the entry we used. | ||
| 174 | * | ||
| 175 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO | ||
| 176 | * errors. That is an artifact left over from swsusp. It did not | ||
| 177 | * check the return of rw_swap_page_sync() at all, since most pages | ||
| 178 | * written back to swap would return -EIO. | ||
| 179 | * This is a partial improvement, since we will at least return other | ||
| 180 | * errors, though we need to eventually fix the damn code. | ||
| 181 | */ | ||
| 182 | static int write_page(unsigned long addr, swp_entry_t *loc) | ||
| 183 | { | ||
| 184 | swp_entry_t entry; | ||
| 185 | int error = -ENOSPC; | ||
| 186 | |||
| 187 | entry = get_swap_page_of_type(root_swap); | ||
| 188 | if (swp_offset(entry)) { | ||
| 189 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); | ||
| 190 | if (!error || error == -EIO) | ||
| 191 | *loc = entry; | ||
| 192 | } | ||
| 193 | return error; | ||
| 194 | } | ||
| 195 | |||
| 196 | /** | 75 | /** |
| 197 | * Swap map-handling functions | 76 | * The following functions are used for tracing the allocated |
| 198 | * | 77 | * swap pages, so that they can be freed in case of an error. |
| 199 | * The swap map is a data structure used for keeping track of each page | ||
| 200 | * written to the swap. It consists of many swap_map_page structures | ||
| 201 | * that each contain an array of MAP_PAGE_SIZE swap entries. | ||
| 202 | * These structures are linked together with the help of either the | ||
| 203 | * .next (in memory) or the .next_swap (in swap) member. | ||
| 204 | * | 78 | * |
| 205 | * The swap map is created during suspend. At that time we need to keep | 79 | * The functions operate on a linked bitmap structure defined |
| 206 | * it in memory, because we have to free all of the allocated swap | 80 | * in power.h |
| 207 | * entries if an error occurs. The memory needed is preallocated | ||
| 208 | * so that we know in advance if there's enough of it. | ||
| 209 | * | ||
| 210 | * The first swap_map_page structure is filled with the swap entries that | ||
| 211 | * correspond to the first MAP_PAGE_SIZE data pages written to swap and | ||
| 212 | * so on. After all of the data pages have been written, the order | ||
| 213 | * of the swap_map_page structures in the map is reversed so that they | ||
| 214 | * can be read from swap in the original order. This causes the data | ||
| 215 | * pages to be loaded in exactly the same order in which they have been | ||
| 216 | * saved. | ||
| 217 | * | ||
| 218 | * During resume we only need to use one swap_map_page structure | ||
| 219 | * at a time, which means that we only need to use two memory pages for | ||
| 220 | * reading the image - one for reading the swap_map_page structures | ||
| 221 | * and the second for reading the data pages from swap. | ||
| 222 | */ | 81 | */ |
| 223 | 82 | ||
| 224 | #define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ | 83 | void free_bitmap(struct bitmap_page *bitmap) |
| 225 | / sizeof(swp_entry_t)) | ||
| 226 | |||
| 227 | struct swap_map_page { | ||
| 228 | swp_entry_t entries[MAP_PAGE_SIZE]; | ||
| 229 | swp_entry_t next_swap; | ||
| 230 | struct swap_map_page *next; | ||
| 231 | }; | ||
| 232 | |||
| 233 | static inline void free_swap_map(struct swap_map_page *swap_map) | ||
| 234 | { | 84 | { |
| 235 | struct swap_map_page *swp; | 85 | struct bitmap_page *bp; |
| 236 | 86 | ||
| 237 | while (swap_map) { | 87 | while (bitmap) { |
| 238 | swp = swap_map->next; | 88 | bp = bitmap->next; |
| 239 | free_page((unsigned long)swap_map); | 89 | free_page((unsigned long)bitmap); |
| 240 | swap_map = swp; | 90 | bitmap = bp; |
| 241 | } | 91 | } |
| 242 | } | 92 | } |
| 243 | 93 | ||
| 244 | static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) | 94 | struct bitmap_page *alloc_bitmap(unsigned int nr_bits) |
| 245 | { | 95 | { |
| 246 | struct swap_map_page *swap_map, *swp; | 96 | struct bitmap_page *bitmap, *bp; |
| 247 | unsigned n = 0; | 97 | unsigned int n; |
| 248 | 98 | ||
| 249 | if (!nr_pages) | 99 | if (!nr_bits) |
| 250 | return NULL; | 100 | return NULL; |
| 251 | 101 | ||
| 252 | pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); | 102 | bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); |
| 253 | swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 103 | bp = bitmap; |
| 254 | swp = swap_map; | 104 | for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { |
| 255 | for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { | 105 | bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); |
| 256 | swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 106 | bp = bp->next; |
| 257 | swp = swp->next; | 107 | if (!bp) { |
| 258 | if (!swp) { | 108 | free_bitmap(bitmap); |
| 259 | free_swap_map(swap_map); | ||
| 260 | return NULL; | 109 | return NULL; |
| 261 | } | 110 | } |
| 262 | } | 111 | } |
| 263 | return swap_map; | 112 | return bitmap; |
| 264 | } | 113 | } |
| 265 | 114 | ||
| 266 | /** | 115 | static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) |
| 267 | * reverse_swap_map - reverse the order of pages in the swap map | ||
| 268 | * @swap_map | ||
| 269 | */ | ||
| 270 | |||
| 271 | static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) | ||
| 272 | { | ||
| 273 | struct swap_map_page *prev, *next; | ||
| 274 | |||
| 275 | prev = NULL; | ||
| 276 | while (swap_map) { | ||
| 277 | next = swap_map->next; | ||
| 278 | swap_map->next = prev; | ||
| 279 | prev = swap_map; | ||
| 280 | swap_map = next; | ||
| 281 | } | ||
| 282 | return prev; | ||
| 283 | } | ||
| 284 | |||
| 285 | /** | ||
| 286 | * free_swap_map_entries - free the swap entries allocated to store | ||
| 287 | * the swap map @swap_map (this is only called in case of an error) | ||
| 288 | */ | ||
| 289 | static inline void free_swap_map_entries(struct swap_map_page *swap_map) | ||
| 290 | { | ||
| 291 | while (swap_map) { | ||
| 292 | if (swap_map->next_swap.val) | ||
| 293 | swap_free(swap_map->next_swap); | ||
| 294 | swap_map = swap_map->next; | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | /** | ||
| 299 | * save_swap_map - save the swap map used for tracing the data pages | ||
| 300 | * stored in the swap | ||
| 301 | */ | ||
| 302 | |||
| 303 | static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) | ||
| 304 | { | ||
| 305 | swp_entry_t entry = (swp_entry_t){0}; | ||
| 306 | int error; | ||
| 307 | |||
| 308 | while (swap_map) { | ||
| 309 | swap_map->next_swap = entry; | ||
| 310 | if ((error = write_page((unsigned long)swap_map, &entry))) | ||
| 311 | return error; | ||
| 312 | swap_map = swap_map->next; | ||
| 313 | } | ||
| 314 | *start = entry; | ||
| 315 | return 0; | ||
| 316 | } | ||
| 317 | |||
| 318 | /** | ||
| 319 | * free_image_entries - free the swap entries allocated to store | ||
| 320 | * the image data pages (this is only called in case of an error) | ||
| 321 | */ | ||
| 322 | |||
| 323 | static inline void free_image_entries(struct swap_map_page *swp) | ||
| 324 | { | 116 | { |
| 325 | unsigned k; | 117 | unsigned int n; |
| 326 | 118 | ||
| 327 | while (swp) { | 119 | n = BITMAP_PAGE_BITS; |
| 328 | for (k = 0; k < MAP_PAGE_SIZE; k++) | 120 | while (bitmap && n <= bit) { |
| 329 | if (swp->entries[k].val) | 121 | n += BITMAP_PAGE_BITS; |
| 330 | swap_free(swp->entries[k]); | 122 | bitmap = bitmap->next; |
| 331 | swp = swp->next; | ||
| 332 | } | 123 | } |
| 333 | } | 124 | if (!bitmap) |
| 334 | 125 | return -EINVAL; | |
| 335 | /** | 126 | n -= BITMAP_PAGE_BITS; |
| 336 | * The swap_map_handle structure is used for handling the swap map in | 127 | bit -= n; |
| 337 | * a file-like way | ||
| 338 | */ | 129 | while (bit >= BITS_PER_CHUNK) { |
| 339 | 130 | bit -= BITS_PER_CHUNK; | |
| 340 | struct swap_map_handle { | 131 | n++; |
| 341 | struct swap_map_page *cur; | ||
| 342 | unsigned int k; | ||
| 343 | }; | ||
| 344 | |||
| 345 | static inline void init_swap_map_handle(struct swap_map_handle *handle, | ||
| 346 | struct swap_map_page *map) | ||
| 347 | { | ||
| 348 | handle->cur = map; | ||
| 349 | handle->k = 0; | ||
| 350 | } | ||
| 351 | |||
| 352 | static inline int swap_map_write_page(struct swap_map_handle *handle, | ||
| 353 | unsigned long addr) | ||
| 354 | { | ||
| 355 | int error; | ||
| 356 | |||
| 357 | error = write_page(addr, handle->cur->entries + handle->k); | ||
| 358 | if (error) | ||
| 359 | return error; | ||
| 360 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
| 361 | handle->cur = handle->cur->next; | ||
| 362 | handle->k = 0; | ||
| 363 | } | 132 | } |
| 133 | bitmap->chunks[n] |= (1UL << bit); | ||
| 364 | return 0; | 134 | return 0; |
| 365 | } | 135 | } |
| 366 | 136 | ||
| 367 | /** | 137 | unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) |
| 368 | * save_image_data - save the data pages pointed to by the PBEs | ||
| 369 | * from the list @pblist using the swap map handle @handle | ||
| 370 | * (assume there are @nr_pages data pages to save) | ||
| 371 | */ | ||
| 372 | |||
| 373 | static int save_image_data(struct pbe *pblist, | ||
| 374 | struct swap_map_handle *handle, | ||
| 375 | unsigned int nr_pages) | ||
| 376 | { | ||
| 377 | unsigned int m; | ||
| 378 | struct pbe *p; | ||
| 379 | int error = 0; | ||
| 380 | |||
| 381 | printk("Saving image data pages (%u pages) ... ", nr_pages); | ||
| 382 | m = nr_pages / 100; | ||
| 383 | if (!m) | ||
| 384 | m = 1; | ||
| 385 | nr_pages = 0; | ||
| 386 | for_each_pbe (p, pblist) { | ||
| 387 | error = swap_map_write_page(handle, p->address); | ||
| 388 | if (error) | ||
| 389 | break; | ||
| 390 | if (!(nr_pages % m)) | ||
| 391 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 392 | nr_pages++; | ||
| 393 | } | ||
| 394 | if (!error) | ||
| 395 | printk("\b\b\b\bdone\n"); | ||
| 396 | return error; | ||
| 397 | } | ||
| 398 | |||
| 399 | static void dump_info(void) | ||
| 400 | { | ||
| 401 | pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); | ||
| 402 | pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); | ||
| 403 | pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); | ||
| 404 | pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); | ||
| 405 | pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); | ||
| 406 | pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); | ||
| 407 | pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); | ||
| 408 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); | ||
| 409 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); | ||
| 410 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); | ||
| 411 | pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); | ||
| 412 | } | ||
| 413 | |||
| 414 | static void init_header(unsigned int nr_pages) | ||
| 415 | { | ||
| 416 | memset(&swsusp_info, 0, sizeof(swsusp_info)); | ||
| 417 | swsusp_info.version_code = LINUX_VERSION_CODE; | ||
| 418 | swsusp_info.num_physpages = num_physpages; | ||
| 419 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); | ||
| 420 | |||
| 421 | swsusp_info.cpus = num_online_cpus(); | ||
| 422 | swsusp_info.image_pages = nr_pages; | ||
| 423 | swsusp_info.pages = nr_pages + | ||
| 424 | ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; | ||
| 425 | } | ||
| 426 | |||
| 427 | /** | ||
| 428 | * pack_orig_addresses - the .orig_address fields of the PBEs from the | ||
| 429 | * list starting at @pbe are stored in the array @buf[] (1 page) | ||
| 430 | */ | ||
| 431 | |||
| 432 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, | ||
| 433 | struct pbe *pbe) | ||
| 434 | { | ||
| 435 | int j; | ||
| 436 | |||
| 437 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
| 438 | buf[j] = pbe->orig_address; | ||
| 439 | pbe = pbe->next; | ||
| 440 | } | ||
| 441 | if (!pbe) | ||
| 442 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
| 443 | buf[j] = 0; | ||
| 444 | return pbe; | ||
| 445 | } | ||
| 446 | |||
| 447 | /** | ||
| 448 | * save_image_metadata - save the .orig_address fields of the PBEs | ||
| 449 | * from the list @pblist using the swap map handle @handle | ||
| 450 | */ | ||
| 451 | |||
| 452 | static int save_image_metadata(struct pbe *pblist, | ||
| 453 | struct swap_map_handle *handle) | ||
| 454 | { | 138 | { |
| 455 | unsigned long *buf; | 139 | unsigned long offset; |
| 456 | unsigned int n = 0; | ||
| 457 | struct pbe *p; | ||
| 458 | int error = 0; | ||
| 459 | 140 | ||
| 460 | printk("Saving image metadata ... "); | 141 | offset = swp_offset(get_swap_page_of_type(swap)); |
| 461 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); | 142 | if (offset) { |
| 462 | if (!buf) | 143 | if (bitmap_set(bitmap, offset)) { |
| 463 | return -ENOMEM; | 144 | swap_free(swp_entry(swap, offset)); |
| 464 | p = pblist; | 145 | offset = 0; |
| 465 | while (p) { | 146 | } |
| 466 | p = pack_orig_addresses(buf, p); | ||
| 467 | error = swap_map_write_page(handle, (unsigned long)buf); | ||
| 468 | if (error) | ||
| 469 | break; | ||
| 470 | n++; | ||
| 471 | } | 147 | } |
| 472 | free_page((unsigned long)buf); | 148 | return offset; |
| 473 | if (!error) | ||
| 474 | printk("done (%u pages saved)\n", n); | ||
| 475 | return error; | ||
| 476 | } | 149 | } |
| 477 | 150 | ||
| 478 | /** | 151 | void free_all_swap_pages(int swap, struct bitmap_page *bitmap) |
| 479 | * enough_swap - Make sure we have enough swap to save the image. | ||
| 480 | * | ||
| 481 | * Returns TRUE or FALSE after checking the total amount of swap | ||
| 482 | * space available from the resume partition. | ||
| 483 | */ | ||
| 484 | |||
| 485 | static int enough_swap(unsigned int nr_pages) | ||
| 486 | { | 152 | { |
| 487 | unsigned int free_swap = swap_info[root_swap].pages - | 153 | unsigned int bit, n; |
| 488 | swap_info[root_swap].inuse_pages; | 154 | unsigned long test; |
| 489 | |||
| 490 | pr_debug("swsusp: free swap pages: %u\n", free_swap); | ||
| 491 | return free_swap > (nr_pages + PAGES_FOR_IO + | ||
| 492 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
| 493 | } | ||
| 494 | 155 | ||
| 495 | /** | 156 | bit = 0; |
| 496 | * swsusp_write - Write entire image and metadata. | 157 | while (bitmap) { |
| 497 | * | 158 | for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) |
| 498 | * It is important _NOT_ to umount filesystems at this point. We want | 159 | for (test = 1UL; test; test <<= 1) { |
| 499 | * them synced (in case something goes wrong) but we do NOT want to mark | ||
| 500 | * the filesystem clean: it is not. (And it does not matter; if we resume | ||
| 501 | * correctly, we'll mark the system clean anyway.) | ||
| 502 | */ | 163 | } |
| 503 | 164 | bitmap = bitmap->next; | |
| 504 | int swsusp_write(struct pbe *pblist, unsigned int nr_pages) | ||
| 505 | { | ||
| 506 | struct swap_map_page *swap_map; | ||
| 507 | struct swap_map_handle handle; | ||
| 508 | swp_entry_t start; | ||
| 509 | int error; | ||
| 510 | |||
| 511 | if ((error = swsusp_swap_check())) { | ||
| 512 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | ||
| 513 | return error; | ||
| 514 | } | ||
| 515 | if (!enough_swap(nr_pages)) { | ||
| 516 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
| 517 | return -ENOSPC; | ||
| 518 | } | 165 | } |
| 519 | |||
| 520 | init_header(nr_pages); | ||
| 521 | swap_map = alloc_swap_map(swsusp_info.pages); | ||
| 522 | if (!swap_map) | ||
| 523 | return -ENOMEM; | ||
| 524 | init_swap_map_handle(&handle, swap_map); | ||
| 525 | |||
| 526 | error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); | ||
| 527 | if (!error) | ||
| 528 | error = save_image_metadata(pblist, &handle); | ||
| 529 | if (!error) | ||
| 530 | error = save_image_data(pblist, &handle, nr_pages); | ||
| 531 | if (error) | ||
| 532 | goto Free_image_entries; | ||
| 533 | |||
| 534 | swap_map = reverse_swap_map(swap_map); | ||
| 535 | error = save_swap_map(swap_map, &start); | ||
| 536 | if (error) | ||
| 537 | goto Free_map_entries; | ||
| 538 | |||
| 539 | dump_info(); | ||
| 540 | printk( "S" ); | ||
| 541 | error = mark_swapfiles(start); | ||
| 542 | printk( "|\n" ); | ||
| 543 | if (error) | ||
| 544 | goto Free_map_entries; | ||
| 545 | |||
| 546 | Free_swap_map: | ||
| 547 | free_swap_map(swap_map); | ||
| 548 | return error; | ||
| 549 | |||
| 550 | Free_map_entries: | ||
| 551 | free_swap_map_entries(swap_map); | ||
| 552 | Free_image_entries: | ||
| 553 | free_image_entries(swap_map); | ||
| 554 | goto Free_swap_map; | ||
| 555 | } | 166 | } |
| 556 | 167 | ||
| 557 | /** | 168 | /** |
| @@ -660,379 +271,3 @@ int swsusp_resume(void) | |||
| 660 | local_irq_enable(); | 271 | local_irq_enable(); |
| 661 | return error; | 272 | return error; |
| 662 | } | 273 | } |
| 663 | |||
| 664 | /** | ||
| 665 | * mark_unsafe_pages - mark the pages that cannot be used for storing | ||
| 666 | * the image during resume, because they conflict with the pages that | ||
| 667 | * had been used before suspend | ||
| 668 | */ | ||
| 669 | |||
| 670 | static void mark_unsafe_pages(struct pbe *pblist) | ||
| 671 | { | ||
| 672 | struct zone *zone; | ||
| 673 | unsigned long zone_pfn; | ||
| 674 | struct pbe *p; | ||
| 675 | |||
| 676 | if (!pblist) /* a sanity check */ | ||
| 677 | return; | ||
| 678 | |||
| 679 | /* Clear page flags */ | ||
| 680 | for_each_zone (zone) { | ||
| 681 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
| 682 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) | ||
| 683 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
| 684 | zone->zone_start_pfn)); | ||
| 685 | } | ||
| 686 | |||
| 687 | /* Mark orig addresses */ | ||
| 688 | for_each_pbe (p, pblist) | ||
| 689 | SetPageNosaveFree(virt_to_page(p->orig_address)); | ||
| 690 | |||
| 691 | } | ||
| 692 | |||
| 693 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | ||
| 694 | { | ||
| 695 | /* We assume both lists contain the same number of elements */ | ||
| 696 | while (src) { | ||
| 697 | dst->orig_address = src->orig_address; | ||
| 698 | dst = dst->next; | ||
| 699 | src = src->next; | ||
| 700 | } | ||
| 701 | } | ||
| 702 | |||
| 703 | /* | ||
| 704 | * Using bio to read from swap. | ||
| 705 | * This code requires a bit more work than just using buffer heads | ||
| 706 | * but it is the recommended way for 2.5/2.6. | ||
| 707 | * The following are to signal the beginning and end of I/O. Bios | ||
| 708 | * finish asynchronously, while we want them to happen synchronously. | ||
| 709 | * A simple atomic_t and a wait loop take care of this problem. | ||
| 710 | */ | ||
| 711 | |||
| 712 | static atomic_t io_done = ATOMIC_INIT(0); | ||
| 713 | |||
| 714 | static int end_io(struct bio *bio, unsigned int num, int err) | ||
| 715 | { | ||
| 716 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
| 717 | panic("I/O error reading memory image"); | ||
| 718 | atomic_set(&io_done, 0); | ||
| 719 | return 0; | ||
| 720 | } | ||
| 721 | |||
| 722 | static struct block_device *resume_bdev; | ||
| 723 | |||
| 724 | /** | ||
| 725 | * submit - submit BIO request. | ||
| 726 | * @rw: READ or WRITE. | ||
| 727 | * @page_off: physical offset of page. | ||
| 728 | * @page: page we're reading or writing. | ||
| 729 | * | ||
| 730 | * Straight from the textbook - allocate and initialize the bio. | ||
| 731 | * If we're writing, make sure the page is marked as dirty. | ||
| 732 | * Then submit it and wait. | ||
| 733 | */ | ||
| 734 | |||
| 735 | static int submit(int rw, pgoff_t page_off, void *page) | ||
| 736 | { | ||
| 737 | int error = 0; | ||
| 738 | struct bio *bio; | ||
| 739 | |||
| 740 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
| 741 | if (!bio) | ||
| 742 | return -ENOMEM; | ||
| 743 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
| 744 | bio->bi_bdev = resume_bdev; | ||
| 745 | bio->bi_end_io = end_io; | ||
| 746 | |||
| 747 | if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { | ||
| 748 | printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); | ||
| 749 | error = -EFAULT; | ||
| 750 | goto Done; | ||
| 751 | } | ||
| 752 | |||
| 753 | |||
| 754 | atomic_set(&io_done, 1); | ||
| 755 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 756 | while (atomic_read(&io_done)) | ||
| 757 | yield(); | ||
| 758 | if (rw == READ) | ||
| 759 | bio_set_pages_dirty(bio); | ||
| 760 | Done: | ||
| 761 | bio_put(bio); | ||
| 762 | return error; | ||
| 763 | } | ||
| 764 | |||
| 765 | static int bio_read_page(pgoff_t page_off, void *page) | ||
| 766 | { | ||
| 767 | return submit(READ, page_off, page); | ||
| 768 | } | ||
| 769 | |||
| 770 | static int bio_write_page(pgoff_t page_off, void *page) | ||
| 771 | { | ||
| 772 | return submit(WRITE, page_off, page); | ||
| 773 | } | ||
| 774 | |||
| 775 | /** | ||
| 776 | * The following functions allow us to read data using a swap map | ||
| 777 | * in a file-alike way | ||
| 778 | */ | ||
| 779 | |||
| 780 | static inline void release_swap_map_reader(struct swap_map_handle *handle) | ||
| 781 | { | ||
| 782 | if (handle->cur) | ||
| 783 | free_page((unsigned long)handle->cur); | ||
| 784 | handle->cur = NULL; | ||
| 785 | } | ||
| 786 | |||
| 787 | static inline int get_swap_map_reader(struct swap_map_handle *handle, | ||
| 788 | swp_entry_t start) | ||
| 789 | { | ||
| 790 | int error; | ||
| 791 | |||
| 792 | if (!swp_offset(start)) | ||
| 793 | return -EINVAL; | ||
| 794 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
| 795 | if (!handle->cur) | ||
| 796 | return -ENOMEM; | ||
| 797 | error = bio_read_page(swp_offset(start), handle->cur); | ||
| 798 | if (error) { | ||
| 799 | release_swap_map_reader(handle); | ||
| 800 | return error; | ||
| 801 | } | ||
| 802 | handle->k = 0; | ||
| 803 | return 0; | ||
| 804 | } | ||
| 805 | |||
| 806 | static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) | ||
| 807 | { | ||
| 808 | unsigned long offset; | ||
| 809 | int error; | ||
| 810 | |||
| 811 | if (!handle->cur) | ||
| 812 | return -EINVAL; | ||
| 813 | offset = swp_offset(handle->cur->entries[handle->k]); | ||
| 814 | if (!offset) | ||
| 815 | return -EINVAL; | ||
| 816 | error = bio_read_page(offset, buf); | ||
| 817 | if (error) | ||
| 818 | return error; | ||
| 819 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
| 820 | handle->k = 0; | ||
| 821 | offset = swp_offset(handle->cur->next_swap); | ||
| 822 | if (!offset) | ||
| 823 | release_swap_map_reader(handle); | ||
| 824 | else | ||
| 825 | error = bio_read_page(offset, handle->cur); | ||
| 826 | } | ||
| 827 | return error; | ||
| 828 | } | ||
| 829 | |||
| 830 | static int check_header(void) | ||
| 831 | { | ||
| 832 | char *reason = NULL; | ||
| 833 | |||
| 834 | dump_info(); | ||
| 835 | if (swsusp_info.version_code != LINUX_VERSION_CODE) | ||
| 836 | reason = "kernel version"; | ||
| 837 | if (swsusp_info.num_physpages != num_physpages) | ||
| 838 | reason = "memory size"; | ||
| 839 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) | ||
| 840 | reason = "system type"; | ||
| 841 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
| 842 | reason = "kernel release"; | ||
| 843 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
| 844 | reason = "version"; | ||
| 845 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
| 846 | reason = "machine"; | ||
| 847 | if (reason) { | ||
| 848 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | ||
| 849 | return -EPERM; | ||
| 850 | } | ||
| 851 | return 0; | ||
| 852 | } | ||
| 853 | |||
| 854 | /** | ||
| 855 | * load_image_data - load the image data using the swap map handle | ||
| 856 | * @handle and store them using the page backup list @pblist | ||
| 857 | * (assume there are @nr_pages pages to load) | ||
| 858 | */ | ||
| 859 | |||
| 860 | static int load_image_data(struct pbe *pblist, | ||
| 861 | struct swap_map_handle *handle, | ||
| 862 | unsigned int nr_pages) | ||
| 863 | { | ||
| 864 | int error; | ||
| 865 | unsigned int m; | ||
| 866 | struct pbe *p; | ||
| 867 | |||
| 868 | if (!pblist) | ||
| 869 | return -EINVAL; | ||
| 870 | printk("Loading image data pages (%u pages) ... ", nr_pages); | ||
| 871 | m = nr_pages / 100; | ||
| 872 | if (!m) | ||
| 873 | m = 1; | ||
| 874 | nr_pages = 0; | ||
| 875 | p = pblist; | ||
| 876 | while (p) { | ||
| 877 | error = swap_map_read_page(handle, (void *)p->address); | ||
| 878 | if (error) | ||
| 879 | break; | ||
| 880 | p = p->next; | ||
| 881 | if (!(nr_pages % m)) | ||
| 882 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 883 | nr_pages++; | ||
| 884 | } | ||
| 885 | if (!error) | ||
| 886 | printk("\b\b\b\bdone\n"); | ||
| 887 | return error; | ||
| 888 | } | ||
| 889 | |||
| 890 | /** | ||
| 891 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | ||
| 892 | * the PBEs in the list starting at @pbe | ||
| 893 | */ | ||
| 894 | |||
| 895 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | ||
| 896 | struct pbe *pbe) | ||
| 897 | { | ||
| 898 | int j; | ||
| 899 | |||
| 900 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
| 901 | pbe->orig_address = buf[j]; | ||
| 902 | pbe = pbe->next; | ||
| 903 | } | ||
| 904 | return pbe; | ||
| 905 | } | ||
| 906 | |||
| 907 | /** | ||
| 908 | * load_image_metadata - load the image metadata using the swap map | ||
| 909 | * handle @handle and put them into the PBEs in the list @pblist | ||
| 910 | */ | ||
| 911 | |||
| 912 | static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) | ||
| 913 | { | ||
| 914 | struct pbe *p; | ||
| 915 | unsigned long *buf; | ||
| 916 | unsigned int n = 0; | ||
| 917 | int error = 0; | ||
| 918 | |||
| 919 | printk("Loading image metadata ... "); | ||
| 920 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); | ||
| 921 | if (!buf) | ||
| 922 | return -ENOMEM; | ||
| 923 | p = pblist; | ||
| 924 | while (p) { | ||
| 925 | error = swap_map_read_page(handle, buf); | ||
| 926 | if (error) | ||
| 927 | break; | ||
| 928 | p = unpack_orig_addresses(buf, p); | ||
| 929 | n++; | ||
| 930 | } | ||
| 931 | free_page((unsigned long)buf); | ||
| 932 | if (!error) | ||
| 933 | printk("done (%u pages loaded)\n", n); | ||
| 934 | return error; | ||
| 935 | } | ||
| 936 | |||
| 937 | int swsusp_read(struct pbe **pblist_ptr) | ||
| 938 | { | ||
| 939 | int error; | ||
| 940 | struct pbe *p, *pblist; | ||
| 941 | struct swap_map_handle handle; | ||
| 942 | unsigned int nr_pages; | ||
| 943 | |||
| 944 | if (IS_ERR(resume_bdev)) { | ||
| 945 | pr_debug("swsusp: block device not initialised\n"); | ||
| 946 | return PTR_ERR(resume_bdev); | ||
| 947 | } | ||
| 948 | |||
| 949 | error = get_swap_map_reader(&handle, swsusp_header.image); | ||
| 950 | if (!error) | ||
| 951 | error = swap_map_read_page(&handle, &swsusp_info); | ||
| 952 | if (!error) | ||
| 953 | error = check_header(); | ||
| 954 | if (error) | ||
| 955 | return error; | ||
| 956 | nr_pages = swsusp_info.image_pages; | ||
| 957 | p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); | ||
| 958 | if (!p) | ||
| 959 | return -ENOMEM; | ||
| 960 | error = load_image_metadata(p, &handle); | ||
| 961 | if (!error) { | ||
| 962 | mark_unsafe_pages(p); | ||
| 963 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); | ||
| 964 | if (pblist) | ||
| 965 | copy_page_backup_list(pblist, p); | ||
| 966 | free_pagedir(p); | ||
| 967 | if (!pblist) | ||
| 968 | error = -ENOMEM; | ||
| 969 | |||
| 970 | /* Allocate memory for the image and read the data from swap */ | ||
| 971 | if (!error) | ||
| 972 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | ||
| 973 | if (!error) { | ||
| 974 | release_eaten_pages(); | ||
| 975 | error = load_image_data(pblist, &handle, nr_pages); | ||
| 976 | } | ||
| 977 | if (!error) | ||
| 978 | *pblist_ptr = pblist; | ||
| 979 | } | ||
| 980 | release_swap_map_reader(&handle); | ||
| 981 | |||
| 982 | blkdev_put(resume_bdev); | ||
| 983 | |||
| 984 | if (!error) | ||
| 985 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
| 986 | else | ||
| 987 | pr_debug("swsusp: Error %d resuming\n", error); | ||
| 988 | return error; | ||
| 989 | } | ||
| 990 | |||
| 991 | /** | ||
| 992 | * swsusp_check - Check for swsusp signature in the resume device | ||
| 993 | */ | ||
| 994 | |||
| 995 | int swsusp_check(void) | ||
| 996 | { | ||
| 997 | int error; | ||
| 998 | |||
| 999 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | ||
| 1000 | if (!IS_ERR(resume_bdev)) { | ||
| 1001 | set_blocksize(resume_bdev, PAGE_SIZE); | ||
| 1002 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | ||
| 1003 | if ((error = bio_read_page(0, &swsusp_header))) | ||
| 1004 | return error; | ||
| 1005 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
| 1006 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
| 1007 | /* Reset swap signature now */ | ||
| 1008 | error = bio_write_page(0, &swsusp_header); | ||
| 1009 | } else { | ||
| 1010 | return -EINVAL; | ||
| 1011 | } | ||
| 1012 | if (error) | ||
| 1013 | blkdev_put(resume_bdev); | ||
| 1014 | else | ||
| 1015 | pr_debug("swsusp: Signature found, resuming\n"); | ||
| 1016 | } else { | ||
| 1017 | error = PTR_ERR(resume_bdev); | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | if (error) | ||
| 1021 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
| 1022 | |||
| 1023 | return error; | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | /** | ||
| 1027 | * swsusp_close - close swap device. | ||
| 1028 | */ | ||
| 1029 | |||
| 1030 | void swsusp_close(void) | ||
| 1031 | { | ||
| 1032 | if (IS_ERR(resume_bdev)) { | ||
| 1033 | pr_debug("swsusp: block device not initialised\n"); | ||
| 1034 | return; | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | blkdev_put(resume_bdev); | ||
| 1038 | } | ||
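The reworked swsusp.c above drops the old swap-map bookkeeping and instead tracks the allocated swap offsets in a linked bitmap (free_bitmap(), alloc_bitmap(), bitmap_set(), free_all_swap_pages()). The sketch below mirrors that structure in ordinary user-space C so the chain-walking arithmetic in bitmap_set() is easier to follow; BITMAP_PAGE_CHUNKS is a deliberately tiny toy value, whereas the real constants are derived from PAGE_SIZE in kernel/power/power.h, which is not part of this hunk. Error handling and freeing are trimmed for brevity.

/*
 * bitmap_chain_demo.c - user-space sketch of the linked bitmap used by the
 * reworked swsusp.c to remember which swap offsets the image occupies.
 */
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_CHUNK     (8 * sizeof(unsigned long))
#define BITMAP_PAGE_CHUNKS 2	/* toy value, see note above */
#define BITMAP_PAGE_BITS   (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)

struct bitmap_page {
	unsigned long chunks[BITMAP_PAGE_CHUNKS];
	struct bitmap_page *next;
};

/* Allocate enough linked pages to cover nr_bits bits, like alloc_bitmap(). */
static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
{
	struct bitmap_page *bitmap, *bp;
	unsigned int n;

	if (!nr_bits)
		return NULL;
	bitmap = calloc(1, sizeof(*bitmap));
	bp = bitmap;
	for (n = BITMAP_PAGE_BITS; bp && n < nr_bits; n += BITMAP_PAGE_BITS) {
		bp->next = calloc(1, sizeof(*bp));
		bp = bp->next;
	}
	return bp ? bitmap : NULL;	/* give up if any allocation failed */
}

/* Walk to the page holding 'bit' and set it, mirroring bitmap_set(). */
static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
{
	unsigned long n = BITMAP_PAGE_BITS;

	while (bitmap && n <= bit) {
		n += BITMAP_PAGE_BITS;
		bitmap = bitmap->next;
	}
	if (!bitmap)
		return -1;			/* offset beyond the bitmap */
	bit -= n - BITMAP_PAGE_BITS;		/* bit index within this page */
	bitmap->chunks[bit / BITS_PER_CHUNK] |= 1UL << (bit % BITS_PER_CHUNK);
	return 0;
}

int main(void)
{
	struct bitmap_page *bm = alloc_bitmap(3 * BITMAP_PAGE_BITS);

	printf("bit 5:             %d\n", bitmap_set(bm, 5));
	printf("bit in third page: %d\n", bitmap_set(bm, 2 * BITMAP_PAGE_BITS + 7));
	printf("bit out of range:  %d\n", bitmap_set(bm, 5 * BITMAP_PAGE_BITS));
	return 0;
}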
diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 000000000000..3f1539fbe48a --- /dev/null +++ b/kernel/power/user.c | |||
| @@ -0,0 +1,333 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/power/user.c | ||
| 3 | * | ||
| 4 | * This file provides the user space interface for software suspend/resume. | ||
| 5 | * | ||
| 6 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | ||
| 7 | * | ||
| 8 | * This file is released under the GPLv2. | ||
| 9 | * | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/suspend.h> | ||
| 13 | #include <linux/syscalls.h> | ||
| 14 | #include <linux/string.h> | ||
| 15 | #include <linux/device.h> | ||
| 16 | #include <linux/miscdevice.h> | ||
| 17 | #include <linux/mm.h> | ||
| 18 | #include <linux/swap.h> | ||
| 19 | #include <linux/swapops.h> | ||
| 20 | #include <linux/pm.h> | ||
| 21 | #include <linux/fs.h> | ||
| 22 | |||
| 23 | #include <asm/uaccess.h> | ||
| 24 | |||
| 25 | #include "power.h" | ||
| 26 | |||
| 27 | #define SNAPSHOT_MINOR 231 | ||
| 28 | |||
| 29 | static struct snapshot_data { | ||
| 30 | struct snapshot_handle handle; | ||
| 31 | int swap; | ||
| 32 | struct bitmap_page *bitmap; | ||
| 33 | int mode; | ||
| 34 | char frozen; | ||
| 35 | char ready; | ||
| 36 | } snapshot_state; | ||
| 37 | |||
| 38 | static atomic_t device_available = ATOMIC_INIT(1); | ||
| 39 | |||
| 40 | static int snapshot_open(struct inode *inode, struct file *filp) | ||
| 41 | { | ||
| 42 | struct snapshot_data *data; | ||
| 43 | |||
| 44 | if (!atomic_add_unless(&device_available, -1, 0)) | ||
| 45 | return -EBUSY; | ||
| 46 | |||
| 47 | if ((filp->f_flags & O_ACCMODE) == O_RDWR) | ||
| 48 | return -ENOSYS; | ||
| 49 | |||
| 50 | nonseekable_open(inode, filp); | ||
| 51 | data = &snapshot_state; | ||
| 52 | filp->private_data = data; | ||
| 53 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | ||
| 54 | if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { | ||
| 55 | data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; | ||
| 56 | data->mode = O_RDONLY; | ||
| 57 | } else { | ||
| 58 | data->swap = -1; | ||
| 59 | data->mode = O_WRONLY; | ||
| 60 | } | ||
| 61 | data->bitmap = NULL; | ||
| 62 | data->frozen = 0; | ||
| 63 | data->ready = 0; | ||
| 64 | |||
| 65 | return 0; | ||
| 66 | } | ||
| 67 | |||
| 68 | static int snapshot_release(struct inode *inode, struct file *filp) | ||
| 69 | { | ||
| 70 | struct snapshot_data *data; | ||
| 71 | |||
| 72 | swsusp_free(); | ||
| 73 | data = filp->private_data; | ||
| 74 | free_all_swap_pages(data->swap, data->bitmap); | ||
| 75 | free_bitmap(data->bitmap); | ||
| 76 | if (data->frozen) { | ||
| 77 | down(&pm_sem); | ||
| 78 | thaw_processes(); | ||
| 79 | enable_nonboot_cpus(); | ||
| 80 | up(&pm_sem); | ||
| 81 | } | ||
| 82 | atomic_inc(&device_available); | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | |||
| 86 | static ssize_t snapshot_read(struct file *filp, char __user *buf, | ||
| 87 | size_t count, loff_t *offp) | ||
| 88 | { | ||
| 89 | struct snapshot_data *data; | ||
| 90 | ssize_t res; | ||
| 91 | |||
| 92 | data = filp->private_data; | ||
| 93 | res = snapshot_read_next(&data->handle, count); | ||
| 94 | if (res > 0) { | ||
| 95 | if (copy_to_user(buf, data_of(data->handle), res)) | ||
| 96 | res = -EFAULT; | ||
| 97 | else | ||
| 98 | *offp = data->handle.offset; | ||
| 99 | } | ||
| 100 | return res; | ||
| 101 | } | ||
| 102 | |||
| 103 | static ssize_t snapshot_write(struct file *filp, const char __user *buf, | ||
| 104 | size_t count, loff_t *offp) | ||
| 105 | { | ||
| 106 | struct snapshot_data *data; | ||
| 107 | ssize_t res; | ||
| 108 | |||
| 109 | data = filp->private_data; | ||
| 110 | res = snapshot_write_next(&data->handle, count); | ||
| 111 | if (res > 0) { | ||
| 112 | if (copy_from_user(data_of(data->handle), buf, res)) | ||
| 113 | res = -EFAULT; | ||
| 114 | else | ||
| 115 | *offp = data->handle.offset; | ||
| 116 | } | ||
| 117 | return res; | ||
| 118 | } | ||
| 119 | |||
| 120 | static int snapshot_ioctl(struct inode *inode, struct file *filp, | ||
| 121 | unsigned int cmd, unsigned long arg) | ||
| 122 | { | ||
| 123 | int error = 0; | ||
| 124 | struct snapshot_data *data; | ||
| 125 | loff_t offset, avail; | ||
| 126 | |||
| 127 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) | ||
| 128 | return -ENOTTY; | ||
| 129 | if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) | ||
| 130 | return -ENOTTY; | ||
| 131 | if (!capable(CAP_SYS_ADMIN)) | ||
| 132 | return -EPERM; | ||
| 133 | |||
| 134 | data = filp->private_data; | ||
| 135 | |||
| 136 | switch (cmd) { | ||
| 137 | |||
| 138 | case SNAPSHOT_FREEZE: | ||
| 139 | if (data->frozen) | ||
| 140 | break; | ||
| 141 | down(&pm_sem); | ||
| 142 | disable_nonboot_cpus(); | ||
| 143 | if (freeze_processes()) { | ||
| 144 | thaw_processes(); | ||
| 145 | enable_nonboot_cpus(); | ||
| 146 | error = -EBUSY; | ||
| 147 | } | ||
| 148 | up(&pm_sem); | ||
| 149 | if (!error) | ||
| 150 | data->frozen = 1; | ||
| 151 | break; | ||
| 152 | |||
| 153 | case SNAPSHOT_UNFREEZE: | ||
| 154 | if (!data->frozen) | ||
| 155 | break; | ||
| 156 | down(&pm_sem); | ||
| 157 | thaw_processes(); | ||
| 158 | enable_nonboot_cpus(); | ||
| 159 | up(&pm_sem); | ||
| 160 | data->frozen = 0; | ||
| 161 | break; | ||
| 162 | |||
| 163 | case SNAPSHOT_ATOMIC_SNAPSHOT: | ||
| 164 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | ||
| 165 | error = -EPERM; | ||
| 166 | break; | ||
| 167 | } | ||
| 168 | down(&pm_sem); | ||
| 169 | /* Free memory before shutting down devices. */ | ||
| 170 | error = swsusp_shrink_memory(); | ||
| 171 | if (!error) { | ||
| 172 | error = device_suspend(PMSG_FREEZE); | ||
| 173 | if (!error) { | ||
| 174 | in_suspend = 1; | ||
| 175 | error = swsusp_suspend(); | ||
| 176 | device_resume(); | ||
| 177 | } | ||
| 178 | } | ||
| 179 | up(&pm_sem); | ||
| 180 | if (!error) | ||
| 181 | error = put_user(in_suspend, (unsigned int __user *)arg); | ||
| 182 | if (!error) | ||
| 183 | data->ready = 1; | ||
| 184 | break; | ||
| 185 | |||
| 186 | case SNAPSHOT_ATOMIC_RESTORE: | ||
| 187 | if (data->mode != O_WRONLY || !data->frozen || | ||
| 188 | !snapshot_image_loaded(&data->handle)) { | ||
| 189 | error = -EPERM; | ||
| 190 | break; | ||
| 191 | } | ||
| 192 | down(&pm_sem); | ||
| 193 | pm_prepare_console(); | ||
| 194 | error = device_suspend(PMSG_FREEZE); | ||
| 195 | if (!error) { | ||
| 196 | error = swsusp_resume(); | ||
| 197 | device_resume(); | ||
| 198 | } | ||
| 199 | pm_restore_console(); | ||
| 200 | up(&pm_sem); | ||
| 201 | break; | ||
| 202 | |||
| 203 | case SNAPSHOT_FREE: | ||
| 204 | swsusp_free(); | ||
| 205 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | ||
| 206 | data->ready = 0; | ||
| 207 | break; | ||
| 208 | |||
| 209 | case SNAPSHOT_SET_IMAGE_SIZE: | ||
| 210 | image_size = arg; | ||
| 211 | break; | ||
| 212 | |||
| 213 | case SNAPSHOT_AVAIL_SWAP: | ||
| 214 | avail = count_swap_pages(data->swap, 1); | ||
| 215 | avail <<= PAGE_SHIFT; | ||
| 216 | error = put_user(avail, (loff_t __user *)arg); | ||
| 217 | break; | ||
| 218 | |||
| 219 | case SNAPSHOT_GET_SWAP_PAGE: | ||
| 220 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | ||
| 221 | error = -ENODEV; | ||
| 222 | break; | ||
| 223 | } | ||
| 224 | if (!data->bitmap) { | ||
| 225 | data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); | ||
| 226 | if (!data->bitmap) { | ||
| 227 | error = -ENOMEM; | ||
| 228 | break; | ||
| 229 | } | ||
| 230 | } | ||
| 231 | offset = alloc_swap_page(data->swap, data->bitmap); | ||
| 232 | if (offset) { | ||
| 233 | offset <<= PAGE_SHIFT; | ||
| 234 | error = put_user(offset, (loff_t __user *)arg); | ||
| 235 | } else { | ||
| 236 | error = -ENOSPC; | ||
| 237 | } | ||
| 238 | break; | ||
| 239 | |||
| 240 | case SNAPSHOT_FREE_SWAP_PAGES: | ||
| 241 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | ||
| 242 | error = -ENODEV; | ||
| 243 | break; | ||
| 244 | } | ||
| 245 | free_all_swap_pages(data->swap, data->bitmap); | ||
| 246 | free_bitmap(data->bitmap); | ||
| 247 | data->bitmap = NULL; | ||
| 248 | break; | ||
| 249 | |||
| 250 | case SNAPSHOT_SET_SWAP_FILE: | ||
| 251 | if (!data->bitmap) { | ||
| 252 | /* | ||
| 253 | * User space encodes device types as two-byte values, | ||
| 254 | * so we need to recode them | ||
| 255 | */ | ||
| 256 | if (old_decode_dev(arg)) { | ||
| 257 | data->swap = swap_type_of(old_decode_dev(arg)); | ||
| 258 | if (data->swap < 0) | ||
| 259 | error = -ENODEV; | ||
| 260 | } else { | ||
| 261 | data->swap = -1; | ||
| 262 | error = -EINVAL; | ||
| 263 | } | ||
| 264 | } else { | ||
| 265 | error = -EPERM; | ||
| 266 | } | ||
| 267 | break; | ||
| 268 | |||
| 269 | case SNAPSHOT_S2RAM: | ||
| 270 | if (!data->frozen) { | ||
| 271 | error = -EPERM; | ||
| 272 | break; | ||
| 273 | } | ||
| 274 | |||
| 275 | if (down_trylock(&pm_sem)) { | ||
| 276 | error = -EBUSY; | ||
| 277 | break; | ||
| 278 | } | ||
| 279 | |||
| 280 | if (pm_ops->prepare) { | ||
| 281 | error = pm_ops->prepare(PM_SUSPEND_MEM); | ||
| 282 | if (error) | ||
| 283 | goto OutS3; | ||
| 284 | } | ||
| 285 | |||
| 286 | /* Put devices to sleep */ | ||
| 287 | error = device_suspend(PMSG_SUSPEND); | ||
| 288 | if (error) { | ||
| 289 | printk(KERN_ERR "Failed to suspend some devices.\n"); | ||
| 290 | } else { | ||
| 291 | /* Enter S3, system is already frozen */ | ||
| 292 | suspend_enter(PM_SUSPEND_MEM); | ||
| 293 | |||
| 294 | /* Wake up devices */ | ||
| 295 | device_resume(); | ||
| 296 | } | ||
| 297 | |||
| 298 | if (pm_ops->finish) | ||
| 299 | pm_ops->finish(PM_SUSPEND_MEM); | ||
| 300 | |||
| 301 | OutS3: | ||
| 302 | up(&pm_sem); | ||
| 303 | break; | ||
| 304 | |||
| 305 | default: | ||
| 306 | error = -ENOTTY; | ||
| 307 | |||
| 308 | } | ||
| 309 | |||
| 310 | return error; | ||
| 311 | } | ||
| 312 | |||
| 313 | static struct file_operations snapshot_fops = { | ||
| 314 | .open = snapshot_open, | ||
| 315 | .release = snapshot_release, | ||
| 316 | .read = snapshot_read, | ||
| 317 | .write = snapshot_write, | ||
| 318 | .llseek = no_llseek, | ||
| 319 | .ioctl = snapshot_ioctl, | ||
| 320 | }; | ||
| 321 | |||
| 322 | static struct miscdevice snapshot_device = { | ||
| 323 | .minor = SNAPSHOT_MINOR, | ||
| 324 | .name = "snapshot", | ||
| 325 | .fops = &snapshot_fops, | ||
| 326 | }; | ||
| 327 | |||
| 328 | static int __init snapshot_device_init(void) | ||
| 329 | { | ||
| 330 | return misc_register(&snapshot_device); | ||
| 331 | }; | ||
| 332 | |||
| 333 | device_initcall(snapshot_device_init); | ||
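The hunk above exposes the swsusp snapshot interface to user space as a misc character device. For reference, the sketch below shows that registration pattern in isolation; the `demo_*` identifiers are hypothetical and this block is not part of the patch.

```c
/* Minimal misc-device registration, mirroring how kernel/power/user.c
 * registers /dev/snapshot.  All demo_* names are placeholders. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>

static int demo_open(struct inode *inode, struct file *filp)
{
	return 0;	/* a real driver would check callers and set filp->private_data */
}

static struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.open	= demo_open,
};

static struct miscdevice demo_device = {
	.minor	= MISC_DYNAMIC_MINOR,	/* let the misc layer pick a minor number */
	.name	= "demo",		/* appears as /dev/demo */
	.fops	= &demo_fops,
};

static int __init demo_init(void)
{
	return misc_register(&demo_device);
}

static void __exit demo_exit(void)
{
	misc_deregister(&demo_device);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```

User space then drives the real snapshot device through the SNAPSHOT_* ioctls handled in snapshot_ioctl() above (freezing, swap-page allocation, S2RAM, and so on).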
diff --git a/kernel/profile.c b/kernel/profile.c index f89248e6d704..ad81f799a9b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
| 24 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
| 25 | #include <linux/highmem.h> | 25 | #include <linux/highmem.h> |
| 26 | #include <linux/mutex.h> | ||
| 26 | #include <asm/sections.h> | 27 | #include <asm/sections.h> |
| 27 | #include <asm/semaphore.h> | 28 | #include <asm/semaphore.h> |
| 28 | 29 | ||
| @@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | |||
| 44 | #ifdef CONFIG_SMP | 45 | #ifdef CONFIG_SMP |
| 45 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
| 46 | static DEFINE_PER_CPU(int, cpu_profile_flip); | 47 | static DEFINE_PER_CPU(int, cpu_profile_flip); |
| 47 | static DECLARE_MUTEX(profile_flip_mutex); | 48 | static DEFINE_MUTEX(profile_flip_mutex); |
| 48 | #endif /* CONFIG_SMP */ | 49 | #endif /* CONFIG_SMP */ |
| 49 | 50 | ||
| 50 | static int __init profile_setup(char * str) | 51 | static int __init profile_setup(char * str) |
| @@ -243,7 +244,7 @@ static void profile_flip_buffers(void) | |||
| 243 | { | 244 | { |
| 244 | int i, j, cpu; | 245 | int i, j, cpu; |
| 245 | 246 | ||
| 246 | down(&profile_flip_mutex); | 247 | mutex_lock(&profile_flip_mutex); |
| 247 | j = per_cpu(cpu_profile_flip, get_cpu()); | 248 | j = per_cpu(cpu_profile_flip, get_cpu()); |
| 248 | put_cpu(); | 249 | put_cpu(); |
| 249 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); | 250 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); |
| @@ -259,14 +260,14 @@ static void profile_flip_buffers(void) | |||
| 259 | hits[i].hits = hits[i].pc = 0; | 260 | hits[i].hits = hits[i].pc = 0; |
| 260 | } | 261 | } |
| 261 | } | 262 | } |
| 262 | up(&profile_flip_mutex); | 263 | mutex_unlock(&profile_flip_mutex); |
| 263 | } | 264 | } |
| 264 | 265 | ||
| 265 | static void profile_discard_flip_buffers(void) | 266 | static void profile_discard_flip_buffers(void) |
| 266 | { | 267 | { |
| 267 | int i, cpu; | 268 | int i, cpu; |
| 268 | 269 | ||
| 269 | down(&profile_flip_mutex); | 270 | mutex_lock(&profile_flip_mutex); |
| 270 | i = per_cpu(cpu_profile_flip, get_cpu()); | 271 | i = per_cpu(cpu_profile_flip, get_cpu()); |
| 271 | put_cpu(); | 272 | put_cpu(); |
| 272 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); | 273 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); |
| @@ -274,7 +275,7 @@ static void profile_discard_flip_buffers(void) | |||
| 274 | struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; | 275 | struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; |
| 275 | memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); | 276 | memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); |
| 276 | } | 277 | } |
| 277 | up(&profile_flip_mutex); | 278 | mutex_unlock(&profile_flip_mutex); |
| 278 | } | 279 | } |
| 279 | 280 | ||
| 280 | void profile_hit(int type, void *__pc) | 281 | void profile_hit(int type, void *__pc) |
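profile.c is one of several files in this merge converted from the counting-semaphore API (`DECLARE_MUTEX`/`down`/`up`) to the dedicated mutex API. A minimal sketch of the pattern, using a hypothetical `example_lock`:

```c
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);	/* was: static DECLARE_MUTEX(example_lock); */

static void example_critical_section(void)
{
	mutex_lock(&example_lock);	/* was: down(&example_lock); */
	/* ... work serialized against other holders ... */
	mutex_unlock(&example_lock);	/* was: up(&example_lock); */
}
```

Apart from the stricter semantics of a mutex (the owning task must release it, and it cannot be used from interrupt context), the conversion is mechanical, which is exactly what the profile.c hunks show.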
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index fedf5e369755..6df1559b1c02 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -47,15 +47,16 @@ | |||
| 47 | #include <linux/notifier.h> | 47 | #include <linux/notifier.h> |
| 48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
| 49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
| 50 | #include <linux/mutex.h> | ||
| 50 | 51 | ||
| 51 | /* Definition for rcupdate control block. */ | 52 | /* Definition for rcupdate control block. */ |
| 52 | struct rcu_ctrlblk rcu_ctrlblk = { | 53 | static struct rcu_ctrlblk rcu_ctrlblk = { |
| 53 | .cur = -300, | 54 | .cur = -300, |
| 54 | .completed = -300, | 55 | .completed = -300, |
| 55 | .lock = SPIN_LOCK_UNLOCKED, | 56 | .lock = SPIN_LOCK_UNLOCKED, |
| 56 | .cpumask = CPU_MASK_NONE, | 57 | .cpumask = CPU_MASK_NONE, |
| 57 | }; | 58 | }; |
| 58 | struct rcu_ctrlblk rcu_bh_ctrlblk = { | 59 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { |
| 59 | .cur = -300, | 60 | .cur = -300, |
| 60 | .completed = -300, | 61 | .completed = -300, |
| 61 | .lock = SPIN_LOCK_UNLOCKED, | 62 | .lock = SPIN_LOCK_UNLOCKED, |
| @@ -75,7 +76,7 @@ static int rsinterval = 1000; | |||
| 75 | #endif | 76 | #endif |
| 76 | 77 | ||
| 77 | static atomic_t rcu_barrier_cpu_count; | 78 | static atomic_t rcu_barrier_cpu_count; |
| 78 | static struct semaphore rcu_barrier_sema; | 79 | static DEFINE_MUTEX(rcu_barrier_mutex); |
| 79 | static struct completion rcu_barrier_completion; | 80 | static struct completion rcu_barrier_completion; |
| 80 | 81 | ||
| 81 | #ifdef CONFIG_SMP | 82 | #ifdef CONFIG_SMP |
| @@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused) | |||
| 207 | void rcu_barrier(void) | 208 | void rcu_barrier(void) |
| 208 | { | 209 | { |
| 209 | BUG_ON(in_interrupt()); | 210 | BUG_ON(in_interrupt()); |
| 210 | /* Take cpucontrol semaphore to protect against CPU hotplug */ | 211 | /* Take cpucontrol mutex to protect against CPU hotplug */ |
| 211 | down(&rcu_barrier_sema); | 212 | mutex_lock(&rcu_barrier_mutex); |
| 212 | init_completion(&rcu_barrier_completion); | 213 | init_completion(&rcu_barrier_completion); |
| 213 | atomic_set(&rcu_barrier_cpu_count, 0); | 214 | atomic_set(&rcu_barrier_cpu_count, 0); |
| 214 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 215 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
| 215 | wait_for_completion(&rcu_barrier_completion); | 216 | wait_for_completion(&rcu_barrier_completion); |
| 216 | up(&rcu_barrier_sema); | 217 | mutex_unlock(&rcu_barrier_mutex); |
| 217 | } | 218 | } |
| 218 | EXPORT_SYMBOL_GPL(rcu_barrier); | 219 | EXPORT_SYMBOL_GPL(rcu_barrier); |
| 219 | 220 | ||
| @@ -549,7 +550,6 @@ static struct notifier_block __devinitdata rcu_nb = { | |||
| 549 | */ | 550 | */ |
| 550 | void __init rcu_init(void) | 551 | void __init rcu_init(void) |
| 551 | { | 552 | { |
| 552 | sema_init(&rcu_barrier_sema, 1); | ||
| 553 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 553 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, |
| 554 | (void *)(long)smp_processor_id()); | 554 | (void *)(long)smp_processor_id()); |
| 555 | /* Register notifier for non-boot CPUs */ | 555 | /* Register notifier for non-boot CPUs */ |
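rcu_barrier() now serializes callers with a statically initialized mutex instead of a semaphore set up in rcu_init(), which is why the sema_init() line disappears. The body of the barrier is a counter-plus-completion rendezvous; a hedged sketch of that shape, with hypothetical names rather than the actual RCU symbols:

```c
#include <linux/mutex.h>
#include <linux/completion.h>
#include <asm/atomic.h>

static DEFINE_MUTEX(barrier_mutex);	/* statically initialized: no *_init() call needed */
static atomic_t barrier_count;
static struct completion barrier_done;

/* Invoked once as each queued callback finishes. */
static void barrier_callback_done(void)
{
	if (atomic_dec_and_test(&barrier_count))
		complete(&barrier_done);
}

/* Wait until nr outstanding callbacks have all run. */
static void wait_for_callbacks(int nr)
{
	mutex_lock(&barrier_mutex);	/* one waiter at a time */
	init_completion(&barrier_done);
	atomic_set(&barrier_count, nr);
	/* ... queue nr callbacks, each ending in barrier_callback_done() ... */
	wait_for_completion(&barrier_done);
	mutex_unlock(&barrier_mutex);
}
```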
diff --git a/kernel/sched.c b/kernel/sched.c index 6b6e0d70eb30..7ffaabd64f89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -237,6 +237,7 @@ struct runqueue { | |||
| 237 | 237 | ||
| 238 | task_t *migration_thread; | 238 | task_t *migration_thread; |
| 239 | struct list_head migration_queue; | 239 | struct list_head migration_queue; |
| 240 | int cpu; | ||
| 240 | #endif | 241 | #endif |
| 241 | 242 | ||
| 242 | #ifdef CONFIG_SCHEDSTATS | 243 | #ifdef CONFIG_SCHEDSTATS |
| @@ -1654,6 +1655,9 @@ unsigned long nr_iowait(void) | |||
| 1654 | /* | 1655 | /* |
| 1655 | * double_rq_lock - safely lock two runqueues | 1656 | * double_rq_lock - safely lock two runqueues |
| 1656 | * | 1657 | * |
| 1658 | * We must take them in cpu order to match code in | ||
| 1659 | * dependent_sleeper and wake_dependent_sleeper. | ||
| 1660 | * | ||
| 1657 | * Note this does not disable interrupts like task_rq_lock, | 1661 | * Note this does not disable interrupts like task_rq_lock, |
| 1658 | * you need to do so manually before calling. | 1662 | * you need to do so manually before calling. |
| 1659 | */ | 1663 | */ |
| @@ -1665,7 +1669,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1665 | spin_lock(&rq1->lock); | 1669 | spin_lock(&rq1->lock); |
| 1666 | __acquire(rq2->lock); /* Fake it out ;) */ | 1670 | __acquire(rq2->lock); /* Fake it out ;) */ |
| 1667 | } else { | 1671 | } else { |
| 1668 | if (rq1 < rq2) { | 1672 | if (rq1->cpu < rq2->cpu) { |
| 1669 | spin_lock(&rq1->lock); | 1673 | spin_lock(&rq1->lock); |
| 1670 | spin_lock(&rq2->lock); | 1674 | spin_lock(&rq2->lock); |
| 1671 | } else { | 1675 | } else { |
| @@ -1701,7 +1705,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
| 1701 | __acquires(this_rq->lock) | 1705 | __acquires(this_rq->lock) |
| 1702 | { | 1706 | { |
| 1703 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1707 | if (unlikely(!spin_trylock(&busiest->lock))) { |
| 1704 | if (busiest < this_rq) { | 1708 | if (busiest->cpu < this_rq->cpu) { |
| 1705 | spin_unlock(&this_rq->lock); | 1709 | spin_unlock(&this_rq->lock); |
| 1706 | spin_lock(&busiest->lock); | 1710 | spin_lock(&busiest->lock); |
| 1707 | spin_lock(&this_rq->lock); | 1711 | spin_lock(&this_rq->lock); |
| @@ -2869,7 +2873,7 @@ asmlinkage void __sched schedule(void) | |||
| 2869 | */ | 2873 | */ |
| 2870 | if (likely(!current->exit_state)) { | 2874 | if (likely(!current->exit_state)) { |
| 2871 | if (unlikely(in_atomic())) { | 2875 | if (unlikely(in_atomic())) { |
| 2872 | printk(KERN_ERR "scheduling while atomic: " | 2876 | printk(KERN_ERR "BUG: scheduling while atomic: " |
| 2873 | "%s/0x%08x/%d\n", | 2877 | "%s/0x%08x/%d\n", |
| 2874 | current->comm, preempt_count(), current->pid); | 2878 | current->comm, preempt_count(), current->pid); |
| 2875 | dump_stack(); | 2879 | dump_stack(); |
| @@ -6029,6 +6033,7 @@ void __init sched_init(void) | |||
| 6029 | rq->push_cpu = 0; | 6033 | rq->push_cpu = 0; |
| 6030 | rq->migration_thread = NULL; | 6034 | rq->migration_thread = NULL; |
| 6031 | INIT_LIST_HEAD(&rq->migration_queue); | 6035 | INIT_LIST_HEAD(&rq->migration_queue); |
| 6036 | rq->cpu = i; | ||
| 6032 | #endif | 6037 | #endif |
| 6033 | atomic_set(&rq->nr_iowait, 0); | 6038 | atomic_set(&rq->nr_iowait, 0); |
| 6034 | 6039 | ||
| @@ -6069,7 +6074,7 @@ void __might_sleep(char *file, int line) | |||
| 6069 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 6074 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 6070 | return; | 6075 | return; |
| 6071 | prev_jiffy = jiffies; | 6076 | prev_jiffy = jiffies; |
| 6072 | printk(KERN_ERR "Debug: sleeping function called from invalid" | 6077 | printk(KERN_ERR "BUG: sleeping function called from invalid" |
| 6073 | " context at %s:%d\n", file, line); | 6078 | " context at %s:%d\n", file, line); |
| 6074 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6079 | printk("in_atomic():%d, irqs_disabled():%d\n", |
| 6075 | in_atomic(), irqs_disabled()); | 6080 | in_atomic(), irqs_disabled()); |
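The sched.c hunks make double_rq_lock() and double_lock_balance() order the two runqueue locks by the new rq->cpu field rather than by pointer value, so every path that takes both locks (including the dependent-sleeper code referenced in the comment) agrees on the order. The general shape of the pattern, with a hypothetical struct rather than the scheduler's runqueue:

```c
#include <linux/spinlock.h>

struct queue {
	spinlock_t lock;
	int id;			/* stable ordering key, like rq->cpu */
};

/* Always acquire the lock with the lower id first, so that every path
 * taking both locks uses the same order and AB-BA deadlock is impossible. */
static void lock_pair(struct queue *a, struct queue *b)
{
	if (a == b) {
		spin_lock(&a->lock);
	} else if (a->id < b->id) {
		spin_lock(&a->lock);
		spin_lock(&b->lock);
	} else {
		spin_lock(&b->lock);
		spin_lock(&a->lock);
	}
}
```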
diff --git a/kernel/signal.c b/kernel/signal.c index ea154104a00b..75f7341b0c39 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -1922,6 +1922,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
| 1922 | sigset_t *mask = ¤t->blocked; | 1922 | sigset_t *mask = ¤t->blocked; |
| 1923 | int signr = 0; | 1923 | int signr = 0; |
| 1924 | 1924 | ||
| 1925 | try_to_freeze(); | ||
| 1926 | |||
| 1925 | relock: | 1927 | relock: |
| 1926 | spin_lock_irq(¤t->sighand->siglock); | 1928 | spin_lock_irq(¤t->sighand->siglock); |
| 1927 | for (;;) { | 1929 | for (;;) { |
| @@ -2099,10 +2101,11 @@ long do_no_restart_syscall(struct restart_block *param) | |||
| 2099 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | 2101 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) |
| 2100 | { | 2102 | { |
| 2101 | int error; | 2103 | int error; |
| 2102 | sigset_t old_block; | ||
| 2103 | 2104 | ||
| 2104 | spin_lock_irq(¤t->sighand->siglock); | 2105 | spin_lock_irq(¤t->sighand->siglock); |
| 2105 | old_block = current->blocked; | 2106 | if (oldset) |
| 2107 | *oldset = current->blocked; | ||
| 2108 | |||
| 2106 | error = 0; | 2109 | error = 0; |
| 2107 | switch (how) { | 2110 | switch (how) { |
| 2108 | case SIG_BLOCK: | 2111 | case SIG_BLOCK: |
| @@ -2119,8 +2122,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | |||
| 2119 | } | 2122 | } |
| 2120 | recalc_sigpending(); | 2123 | recalc_sigpending(); |
| 2121 | spin_unlock_irq(¤t->sighand->siglock); | 2124 | spin_unlock_irq(¤t->sighand->siglock); |
| 2122 | if (oldset) | 2125 | |
| 2123 | *oldset = old_block; | ||
| 2124 | return error; | 2126 | return error; |
| 2125 | } | 2127 | } |
| 2126 | 2128 | ||
| @@ -2307,7 +2309,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, | |||
| 2307 | 2309 | ||
| 2308 | timeout = schedule_timeout_interruptible(timeout); | 2310 | timeout = schedule_timeout_interruptible(timeout); |
| 2309 | 2311 | ||
| 2310 | try_to_freeze(); | ||
| 2311 | spin_lock_irq(¤t->sighand->siglock); | 2312 | spin_lock_irq(¤t->sighand->siglock); |
| 2312 | sig = dequeue_signal(current, &these, &info); | 2313 | sig = dequeue_signal(current, &these, &info); |
| 2313 | current->blocked = current->real_blocked; | 2314 | current->blocked = current->real_blocked; |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0375fcd5921d..d1b810782bc4 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
| @@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock); | |||
| 179 | #define BUILD_LOCK_OPS(op, locktype) \ | 179 | #define BUILD_LOCK_OPS(op, locktype) \ |
| 180 | void __lockfunc _##op##_lock(locktype##_t *lock) \ | 180 | void __lockfunc _##op##_lock(locktype##_t *lock) \ |
| 181 | { \ | 181 | { \ |
| 182 | preempt_disable(); \ | ||
| 183 | for (;;) { \ | 182 | for (;;) { \ |
| 183 | preempt_disable(); \ | ||
| 184 | if (likely(_raw_##op##_trylock(lock))) \ | 184 | if (likely(_raw_##op##_trylock(lock))) \ |
| 185 | break; \ | 185 | break; \ |
| 186 | preempt_enable(); \ | 186 | preempt_enable(); \ |
| 187 | \ | ||
| 187 | if (!(lock)->break_lock) \ | 188 | if (!(lock)->break_lock) \ |
| 188 | (lock)->break_lock = 1; \ | 189 | (lock)->break_lock = 1; \ |
| 189 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | 190 | while (!op##_can_lock(lock) && (lock)->break_lock) \ |
| 190 | cpu_relax(); \ | 191 | cpu_relax(); \ |
| 191 | preempt_disable(); \ | ||
| 192 | } \ | 192 | } \ |
| 193 | (lock)->break_lock = 0; \ | 193 | (lock)->break_lock = 0; \ |
| 194 | } \ | 194 | } \ |
| @@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ | |||
| 199 | { \ | 199 | { \ |
| 200 | unsigned long flags; \ | 200 | unsigned long flags; \ |
| 201 | \ | 201 | \ |
| 202 | preempt_disable(); \ | ||
| 203 | for (;;) { \ | 202 | for (;;) { \ |
| 203 | preempt_disable(); \ | ||
| 204 | local_irq_save(flags); \ | 204 | local_irq_save(flags); \ |
| 205 | if (likely(_raw_##op##_trylock(lock))) \ | 205 | if (likely(_raw_##op##_trylock(lock))) \ |
| 206 | break; \ | 206 | break; \ |
| 207 | local_irq_restore(flags); \ | 207 | local_irq_restore(flags); \ |
| 208 | \ | ||
| 209 | preempt_enable(); \ | 208 | preempt_enable(); \ |
| 209 | \ | ||
| 210 | if (!(lock)->break_lock) \ | 210 | if (!(lock)->break_lock) \ |
| 211 | (lock)->break_lock = 1; \ | 211 | (lock)->break_lock = 1; \ |
| 212 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | 212 | while (!op##_can_lock(lock) && (lock)->break_lock) \ |
| 213 | cpu_relax(); \ | 213 | cpu_relax(); \ |
| 214 | preempt_disable(); \ | ||
| 215 | } \ | 214 | } \ |
| 216 | (lock)->break_lock = 0; \ | 215 | (lock)->break_lock = 0; \ |
| 217 | return flags; \ | 216 | return flags; \ |
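The spinlock.c change moves preempt_disable() inside the retry loop, so the busy-wait on break_lock runs with preemption enabled. A hand-expanded sketch of roughly what BUILD_LOCK_OPS now generates for the plain spin case under CONFIG_PREEMPT (an approximation for illustration, not literal preprocessor output):

```c
void __lockfunc _spin_lock(spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(_raw_spin_trylock(lock)))
			break;			/* acquired: return with preemption off */
		preempt_enable();		/* let other tasks run while we wait */

		if (!(lock)->break_lock)
			(lock)->break_lock = 1;	/* ask the holder to drop the lock early */
		while (!spin_can_lock(lock) && (lock)->break_lock)
			cpu_relax();		/* preemptible busy-wait */
	}
	(lock)->break_lock = 0;
}
```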
diff --git a/kernel/sys.c b/kernel/sys.c index f91218a5463e..c0fcad9f826c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void) | |||
| 1227 | struct pid *pid; | 1227 | struct pid *pid; |
| 1228 | int err = -EPERM; | 1228 | int err = -EPERM; |
| 1229 | 1229 | ||
| 1230 | down(&tty_sem); | 1230 | mutex_lock(&tty_mutex); |
| 1231 | write_lock_irq(&tasklist_lock); | 1231 | write_lock_irq(&tasklist_lock); |
| 1232 | 1232 | ||
| 1233 | pid = find_pid(PIDTYPE_PGID, group_leader->pid); | 1233 | pid = find_pid(PIDTYPE_PGID, group_leader->pid); |
| @@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void) | |||
| 1241 | err = process_group(group_leader); | 1241 | err = process_group(group_leader); |
| 1242 | out: | 1242 | out: |
| 1243 | write_unlock_irq(&tasklist_lock); | 1243 | write_unlock_irq(&tasklist_lock); |
| 1244 | up(&tty_sem); | 1244 | mutex_unlock(&tty_mutex); |
| 1245 | return err; | 1245 | return err; |
| 1246 | } | 1246 | } |
| 1247 | 1247 | ||
| @@ -1677,9 +1677,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
| 1677 | * a lot simpler! (Which we're not doing right now because we're not | 1677 | * a lot simpler! (Which we're not doing right now because we're not |
| 1678 | * measuring them yet). | 1678 | * measuring them yet). |
| 1679 | * | 1679 | * |
| 1680 | * This expects to be called with tasklist_lock read-locked or better, | ||
| 1681 | * and the siglock not locked. It may momentarily take the siglock. | ||
| 1682 | * | ||
| 1683 | * When sampling multiple threads for RUSAGE_SELF, under SMP we might have | 1680 | * When sampling multiple threads for RUSAGE_SELF, under SMP we might have |
| 1684 | * races with threads incrementing their own counters. But since word | 1681 | * races with threads incrementing their own counters. But since word |
| 1685 | * reads are atomic, we either get new values or old values and we don't | 1682 | * reads are atomic, we either get new values or old values and we don't |
| @@ -1687,6 +1684,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
| 1687 | * the c* fields from p->signal from races with exit.c updating those | 1684 | * the c* fields from p->signal from races with exit.c updating those |
| 1688 | * fields when reaping, so a sample either gets all the additions of a | 1685 | * fields when reaping, so a sample either gets all the additions of a |
| 1689 | * given child after it's reaped, or none so this sample is before reaping. | 1686 | * given child after it's reaped, or none so this sample is before reaping. |
| 1687 | * | ||
| 1688 | * tasklist_lock locking optimisation: | ||
| 1689 | * If we are current and single threaded, we do not need to take the tasklist | ||
| 1690 | * lock or the siglock. No one else can take our signal_struct away, | ||
| 1691 | * no one else can reap the children to update signal->c* counters, and | ||
| 1692 | * no one else can race with the signal-> fields. | ||
| 1693 | * If we do not take the tasklist_lock, the signal-> fields could be read | ||
| 1694 | * out of order while another thread was just exiting. So we place a | ||
| 1695 | * read memory barrier when we avoid the lock. On the writer side, | ||
| 1696 | * write memory barrier is implied in __exit_signal as __exit_signal releases | ||
| 1697 | * the siglock spinlock after updating the signal-> fields. | ||
| 1698 | * | ||
| 1699 | * We don't really need the siglock when we access the non c* fields | ||
| 1700 | * of the signal_struct (for RUSAGE_SELF) even in multithreaded | ||
| 1701 | * case, since we take the tasklist lock for read and the non c* signal-> | ||
| 1702 | * fields are updated only in __exit_signal, which is called with | ||
| 1703 | * tasklist_lock taken for write, hence these two threads cannot execute | ||
| 1704 | * concurrently. | ||
| 1705 | * | ||
| 1690 | */ | 1706 | */ |
| 1691 | 1707 | ||
| 1692 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | 1708 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) |
| @@ -1694,13 +1710,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1694 | struct task_struct *t; | 1710 | struct task_struct *t; |
| 1695 | unsigned long flags; | 1711 | unsigned long flags; |
| 1696 | cputime_t utime, stime; | 1712 | cputime_t utime, stime; |
| 1713 | int need_lock = 0; | ||
| 1697 | 1714 | ||
| 1698 | memset((char *) r, 0, sizeof *r); | 1715 | memset((char *) r, 0, sizeof *r); |
| 1716 | utime = stime = cputime_zero; | ||
| 1699 | 1717 | ||
| 1700 | if (unlikely(!p->signal)) | 1718 | if (p != current || !thread_group_empty(p)) |
| 1701 | return; | 1719 | need_lock = 1; |
| 1702 | 1720 | ||
| 1703 | utime = stime = cputime_zero; | 1721 | if (need_lock) { |
| 1722 | read_lock(&tasklist_lock); | ||
| 1723 | if (unlikely(!p->signal)) { | ||
| 1724 | read_unlock(&tasklist_lock); | ||
| 1725 | return; | ||
| 1726 | } | ||
| 1727 | } else | ||
| 1728 | /* See locking comments above */ | ||
| 1729 | smp_rmb(); | ||
| 1704 | 1730 | ||
| 1705 | switch (who) { | 1731 | switch (who) { |
| 1706 | case RUSAGE_BOTH: | 1732 | case RUSAGE_BOTH: |
| @@ -1740,6 +1766,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1740 | BUG(); | 1766 | BUG(); |
| 1741 | } | 1767 | } |
| 1742 | 1768 | ||
| 1769 | if (need_lock) | ||
| 1770 | read_unlock(&tasklist_lock); | ||
| 1743 | cputime_to_timeval(utime, &r->ru_utime); | 1771 | cputime_to_timeval(utime, &r->ru_utime); |
| 1744 | cputime_to_timeval(stime, &r->ru_stime); | 1772 | cputime_to_timeval(stime, &r->ru_stime); |
| 1745 | } | 1773 | } |
| @@ -1747,9 +1775,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1747 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1775 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
| 1748 | { | 1776 | { |
| 1749 | struct rusage r; | 1777 | struct rusage r; |
| 1750 | read_lock(&tasklist_lock); | ||
| 1751 | k_getrusage(p, who, &r); | 1778 | k_getrusage(p, who, &r); |
| 1752 | read_unlock(&tasklist_lock); | ||
| 1753 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | 1779 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; |
| 1754 | } | 1780 | } |
| 1755 | 1781 | ||
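With this change k_getrusage() takes tasklist_lock itself, and only when it actually needs it, so getrusage() drops its unconditional read_lock/read_unlock pair. The shape of the conditional-locking optimisation described in the new comment block, as a hedged sketch with a hypothetical reader function:

```c
#include <linux/sched.h>

/* Skip tasklist_lock when the caller samples itself and is single-threaded;
 * otherwise lock as before.  Not the actual k_getrusage(). */
static void sample_task_stats(struct task_struct *p)
{
	int need_lock = (p != current || !thread_group_empty(p));

	if (need_lock)
		read_lock(&tasklist_lock);	/* others may exit or reap under us */
	else
		smp_rmb();	/* pairs with the barrier implied by __exit_signal() */

	/* ... read p->signal->c* totals and walk the thread group ... */

	if (need_lock)
		read_unlock(&tasklist_lock);
}
```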
