Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            |   2
-rw-r--r--  kernel/cpu.c               |   1
-rw-r--r--  kernel/cpuset.c            | 466
-rw-r--r--  kernel/exit.c              |  29
-rw-r--r--  kernel/irq/handle.c        |   6
-rw-r--r--  kernel/kallsyms.c          |   1
-rw-r--r--  kernel/kmod.c              |   6
-rw-r--r--  kernel/kprobes.c           |   1
-rw-r--r--  kernel/kthread.c           |  13
-rw-r--r--  kernel/params.c            |   1
-rw-r--r--  kernel/posix-cpu-timers.c  |  16
-rw-r--r--  kernel/posix-timers.c      |  19
-rw-r--r--  kernel/power/Makefile      |   2
-rw-r--r--  kernel/power/disk.c        |  22
-rw-r--r--  kernel/power/main.c        |   5
-rw-r--r--  kernel/power/power.h       |  17
-rw-r--r--  kernel/power/snapshot.c    | 435
-rw-r--r--  kernel/power/swsusp.c      | 576
-rw-r--r--  kernel/printk.c            |  78
-rw-r--r--  kernel/ptrace.c            |   7
-rw-r--r--  kernel/rcupdate.c          |  10
-rw-r--r--  kernel/rcutorture.c        | 492
-rw-r--r--  kernel/sched.c             |   3
-rw-r--r--  kernel/signal.c            | 148
-rw-r--r--  kernel/time.c              |  25
-rw-r--r--  kernel/timer.c             | 342
-rw-r--r--  kernel/workqueue.c         |  33
27 files changed, 1631 insertions, 1125 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ff4dc02ce170..4f5a1453093a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o | |||
| 22 | obj-$(CONFIG_COMPAT) += compat.o | 22 | obj-$(CONFIG_COMPAT) += compat.o |
| 23 | obj-$(CONFIG_CPUSETS) += cpuset.o | 23 | obj-$(CONFIG_CPUSETS) += cpuset.o |
| 24 | obj-$(CONFIG_IKCONFIG) += configs.o | 24 | obj-$(CONFIG_IKCONFIG) += configs.o |
| 25 | obj-$(CONFIG_IKCONFIG_PROC) += configs.o | ||
| 26 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 25 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
| 27 | obj-$(CONFIG_AUDIT) += audit.o | 26 | obj-$(CONFIG_AUDIT) += audit.o |
| 28 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
| @@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
| 32 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 31 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
| 33 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 32 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
| 34 | obj-$(CONFIG_SECCOMP) += seccomp.o | 33 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 34 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||
| 35 | 35 | ||
| 36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
| 37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53d8263ae12e..3619e939182e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -17,6 +17,7 @@ | |||
| 17 | 17 | ||
| 18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
| 19 | DECLARE_MUTEX(cpucontrol); | 19 | DECLARE_MUTEX(cpucontrol); |
| 20 | EXPORT_SYMBOL_GPL(cpucontrol); | ||
| 20 | 21 | ||
| 21 | static struct notifier_block *cpu_chain; | 22 | static struct notifier_block *cpu_chain; |
| 22 | 23 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 28176d083f7b..5a737ed9dac7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
| 33 | #include <linux/kmod.h> | 33 | #include <linux/kmod.h> |
| 34 | #include <linux/list.h> | 34 | #include <linux/list.h> |
| 35 | #include <linux/mempolicy.h> | ||
| 35 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
| 36 | #include <linux/module.h> | 37 | #include <linux/module.h> |
| 37 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
| @@ -60,6 +61,9 @@ struct cpuset { | |||
| 60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 61 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
| 61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 62 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
| 62 | 63 | ||
| 64 | /* | ||
| 65 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
| 66 | */ | ||
| 63 | atomic_t count; /* count tasks using this cpuset */ | 67 | atomic_t count; /* count tasks using this cpuset */ |
| 64 | 68 | ||
| 65 | /* | 69 | /* |
| @@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; | |||
| 142 | static struct super_block *cpuset_sb = NULL; | 146 | static struct super_block *cpuset_sb = NULL; |
| 143 | 147 | ||
| 144 | /* | 148 | /* |
| 145 | * cpuset_sem should be held by anyone who is depending on the children | 149 | * We have two global cpuset semaphores below. They can nest. |
| 146 | * or sibling lists of any cpuset, or performing non-atomic operations | 150 | * It is ok to first take manage_sem, then nest callback_sem. We also |
| 147 | * on the flags or *_allowed values of a cpuset, such as raising the | 151 | * require taking task_lock() when dereferencing a tasks cpuset pointer. |
| 148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 152 | * See "The task_lock() exception", at the end of this comment. |
| 149 | * conditionally modifying the *_allowed values. One kernel global | 153 | * |
| 150 | * cpuset semaphore should be sufficient - these things don't change | 154 | * A task must hold both semaphores to modify cpusets. If a task |
| 151 | * that much. | 155 | * holds manage_sem, then it blocks others wanting that semaphore, |
| 152 | * | 156 | * ensuring that it is the only task able to also acquire callback_sem |
| 153 | * The code that modifies cpusets holds cpuset_sem across the entire | 157 | * and be able to modify cpusets. It can perform various checks on |
| 154 | * operation, from cpuset_common_file_write() down, single threading | 158 | * the cpuset structure first, knowing nothing will change. It can |
| 155 | * all cpuset modifications (except for counter manipulations from | 159 | * also allocate memory while just holding manage_sem. While it is |
| 156 | * fork and exit) across the system. This presumes that cpuset | 160 | * performing these checks, various callback routines can briefly |
| 157 | * modifications are rare - better kept simple and safe, even if slow. | 161 | * acquire callback_sem to query cpusets. Once it is ready to make |
| 158 | * | 162 | * the changes, it takes callback_sem, blocking everyone else. |
| 159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 163 | * |
| 160 | * and below, only holds cpuset_sem across small pieces of code, such | 164 | * Calls to the kernel memory allocator can not be made while holding |
| 161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 165 | * callback_sem, as that would risk double tripping on callback_sem |
| 162 | * the risks are less, and the desire for performance a little greater. | 166 | * from one of the callbacks into the cpuset code from within |
| 163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 167 | * __alloc_pages(). |
| 164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 168 | * |
| 165 | * | 169 | * If a task is only holding callback_sem, then it has read-only |
| 166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 170 | * access to cpusets. |
| 167 | * (usually) grab cpuset_sem. These are the two most performance | 171 | * |
| 168 | * critical pieces of code here. The exception occurs on exit(), | 172 | * The task_struct fields mems_allowed and mems_generation may only |
| 169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 173 | * be accessed in the context of that task, so require no locks. |
| 174 | * | ||
| 175 | * Any task can increment and decrement the count field without lock. | ||
| 176 | * So in general, code holding manage_sem or callback_sem can't rely | ||
| 177 | * on the count field not changing. However, if the count goes to | ||
| 178 | * zero, then only attach_task(), which holds both semaphores, can | ||
| 179 | * increment it again. Because a count of zero means that no tasks | ||
| 180 | * are currently attached, therefore there is no way a task attached | ||
| 181 | * to that cpuset can fork (the other way to increment the count). | ||
| 182 | * So code holding manage_sem or callback_sem can safely assume that | ||
| 183 | * if the count is zero, it will stay zero. Similarly, if a task | ||
| 184 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
| 185 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
| 186 | * both of those semaphores. | ||
| 187 | * | ||
| 188 | * A possible optimization to improve parallelism would be to make | ||
| 189 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
| 190 | * to proceed in parallel, with read access, until the holder of | ||
| 191 | * manage_sem needed to take this rwsem for exclusive write access | ||
| 192 | * and modify some cpusets. | ||
| 193 | * | ||
| 194 | * The cpuset_common_file_write handler for operations that modify | ||
| 195 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
| 196 | * single threading all such cpuset modifications across the system. | ||
| 197 | * | ||
| 198 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
| 199 | * small pieces of code, such as when reading out possibly multi-word | ||
| 200 | * cpumasks and nodemasks. | ||
| 201 | * | ||
| 202 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | ||
| 203 | * (usually) take either semaphore. These are the two most performance | ||
| 204 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
| 205 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
| 170 | * is taken, and if the cpuset count is zero, a usermode call made | 206 | * is taken, and if the cpuset count is zero, a usermode call made |
| 171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 207 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
| 172 | * relative to the root of cpuset file system) as the argument. | 208 | * relative to the root of cpuset file system) as the argument. |
| 173 | * | 209 | * |
| 174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 210 | * A cpuset can only be deleted if both its 'count' of using tasks |
| 175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 211 | * is zero, and its list of 'children' cpusets is empty. Since all |
| 176 | * in the system use _some_ cpuset, and since there is always at least | 212 | * tasks in the system use _some_ cpuset, and since there is always at |
| 177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 213 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
| 178 | * always has either children cpusets and/or using tasks. So no need | 214 | * always has either children cpusets and/or using tasks. So we don't |
| 179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 215 | * need a special hack to ensure that top_cpuset cannot be deleted. |
| 216 | * | ||
| 217 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
| 218 | * | ||
| 219 | * The task_lock() exception | ||
| 220 | * | ||
| 221 | * The need for this exception arises from the action of attach_task(), | ||
| 222 | * which overwrites one tasks cpuset pointer with another. It does | ||
| 223 | * so using both semaphores, however there are several performance | ||
| 224 | * critical places that need to reference task->cpuset without the | ||
| 225 | * expense of grabbing a system global semaphore. Therefore except as | ||
| 226 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
| 227 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | ||
| 228 | * (task->alloc_lock) already in the task_struct routinely used for | ||
| 229 | * such matters. | ||
| 180 | */ | 230 | */ |
| 181 | 231 | ||
| 182 | static DECLARE_MUTEX(cpuset_sem); | 232 | static DECLARE_MUTEX(manage_sem); |
| 183 | static struct task_struct *cpuset_sem_owner; | 233 | static DECLARE_MUTEX(callback_sem); |
| 184 | static int cpuset_sem_depth; | ||
| 185 | |||
| 186 | /* | ||
| 187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
| 188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
| 189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
| 190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
| 191 | * to cpuset_excl_nodes_overlap()). | ||
| 192 | * | ||
| 193 | * But if the memory allocation is being done by cpuset.c code, it | ||
| 194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
| 195 | * semaphore deadlocks the current task, and any other task that | ||
| 196 | * subsequently tries to obtain the lock. | ||
| 197 | * | ||
| 198 | * Run all up's and down's on cpuset_sem through the following | ||
| 199 | * wrappers, which will detect this nested locking, and avoid | ||
| 200 | * deadlocking. | ||
| 201 | */ | ||
| 202 | |||
| 203 | static inline void cpuset_down(struct semaphore *psem) | ||
| 204 | { | ||
| 205 | if (cpuset_sem_owner != current) { | ||
| 206 | down(psem); | ||
| 207 | cpuset_sem_owner = current; | ||
| 208 | } | ||
| 209 | cpuset_sem_depth++; | ||
| 210 | } | ||
| 211 | |||
| 212 | static inline void cpuset_up(struct semaphore *psem) | ||
| 213 | { | ||
| 214 | if (--cpuset_sem_depth == 0) { | ||
| 215 | cpuset_sem_owner = NULL; | ||
| 216 | up(psem); | ||
| 217 | } | ||
| 218 | } | ||
| 219 | 234 | ||
| 220 | /* | 235 | /* |
| 221 | * A couple of forward declarations required, due to cyclic reference loop: | 236 | * A couple of forward declarations required, due to cyclic reference loop: |
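The writer-side discipline described in the comment above can be condensed into a short sketch. This is an illustrative, hypothetical helper (the function name and the nodemask parameter are invented for the example, and it is not code from the patch); manage_sem, callback_sem and validate_change() are the real objects this patch introduces or uses:

static int modify_cpuset_mems(struct cpuset *cs, nodemask_t newmems)
{
	struct cpuset trialcs;
	int retval;

	down(&manage_sem);		/* single-thread all cpuset modifications */
	trialcs = *cs;
	trialcs.mems_allowed = newmems;
	retval = validate_change(cs, &trialcs);	/* checks and kmalloc() are safe here */
	if (retval == 0) {
		down(&callback_sem);	/* block readers only around the actual update */
		cs->mems_allowed = trialcs.mems_allowed;
		up(&callback_sem);
	}
	up(&manage_sem);
	return retval;
}

Readers that only query cpusets take callback_sem by itself, and any dereference of a task->cpuset pointer is wrapped in task_lock()/task_unlock(), because attach_task() rewrites that pointer under the same spinlock.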
| @@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
| 390 | } | 405 | } |
| 391 | 406 | ||
| 392 | /* | 407 | /* |
| 393 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 408 | * Call with manage_sem held. Writes path of cpuset into buf. |
| 394 | * Returns 0 on success, -errno on error. | 409 | * Returns 0 on success, -errno on error. |
| 395 | */ | 410 | */ |
| 396 | 411 | ||
| @@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
| 442 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 457 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
| 443 | * our caller up for that. | 458 | * our caller up for that. |
| 444 | * | 459 | * |
| 445 | * The simple act of forking that task might require more memory, | 460 | * When we had only one cpuset semaphore, we had to call this |
| 446 | * which might need cpuset_sem. So this routine must be called while | 461 | * without holding it, to avoid deadlock when call_usermodehelper() |
| 447 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 462 | * allocated memory. With two locks, we could now call this while |
| 448 | * comments for check_for_release(), below. | 463 | * holding manage_sem, but we still don't, so as to minimize |
| 464 | * the time manage_sem is held. | ||
| 449 | */ | 465 | */ |
| 450 | 466 | ||
| 451 | static void cpuset_release_agent(const char *pathbuf) | 467 | static void cpuset_release_agent(const char *pathbuf) |
| @@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
| 477 | * cs is notify_on_release() and now both the user count is zero and | 493 | * cs is notify_on_release() and now both the user count is zero and |
| 478 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 494 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
| 479 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 495 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
| 480 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 496 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
| 481 | * Call here with cpuset_sem held. | 497 | * Call here with manage_sem held. |
| 482 | * | 498 | * |
| 483 | * This check_for_release() routine is responsible for kmalloc'ing | 499 | * This check_for_release() routine is responsible for kmalloc'ing |
| 484 | * pathbuf. The above cpuset_release_agent() is responsible for | 500 | * pathbuf. The above cpuset_release_agent() is responsible for |
| 485 | * kfree'ing pathbuf. The caller of these routines is responsible | 501 | * kfree'ing pathbuf. The caller of these routines is responsible |
| 486 | * for providing a pathbuf pointer, initialized to NULL, then | 502 | * for providing a pathbuf pointer, initialized to NULL, then |
| 487 | * calling check_for_release() with cpuset_sem held and the address | 503 | * calling check_for_release() with manage_sem held and the address |
| 488 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 504 | * of the pathbuf pointer, then dropping manage_sem, then calling |
| 489 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 505 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
| 490 | */ | 506 | */ |
| 491 | 507 | ||
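The caller protocol spelled out above, which cpuset_exit() and cpuset_rmdir() follow later in this patch, amounts to roughly the following condensed sketch (not a new function in the patch; error handling elided):

	char *pathbuf = NULL;

	down(&manage_sem);
	/* ... drop the last user, or remove the last child, of 'cs' ... */
	if (atomic_dec_and_test(&cs->count))
		check_for_release(cs, &pathbuf);	/* may kmalloc() pathbuf */
	up(&manage_sem);
	cpuset_release_agent(pathbuf);	/* pathbuf may still be NULL; kfree()d here */

Keeping the usermode helper invocation outside manage_sem minimizes how long that semaphore is held.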
| @@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
| 516 | * One way or another, we guarantee to return some non-empty subset | 532 | * One way or another, we guarantee to return some non-empty subset |
| 517 | * of cpu_online_map. | 533 | * of cpu_online_map. |
| 518 | * | 534 | * |
| 519 | * Call with cpuset_sem held. | 535 | * Call with callback_sem held. |
| 520 | */ | 536 | */ |
| 521 | 537 | ||
| 522 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 538 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
| @@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
| 540 | * One way or another, we guarantee to return some non-empty subset | 556 | * One way or another, we guarantee to return some non-empty subset |
| 541 | * of node_online_map. | 557 | * of node_online_map. |
| 542 | * | 558 | * |
| 543 | * Call with cpuset_sem held. | 559 | * Call with callback_sem held. |
| 544 | */ | 560 | */ |
| 545 | 561 | ||
| 546 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 562 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
| @@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 555 | } | 571 | } |
| 556 | 572 | ||
| 557 | /* | 573 | /* |
| 558 | * Refresh current tasks mems_allowed and mems_generation from | 574 | * Refresh current tasks mems_allowed and mems_generation from current |
| 559 | * current tasks cpuset. Call with cpuset_sem held. | 575 | * tasks cpuset. |
| 560 | * | 576 | * |
| 561 | * This routine is needed to update the per-task mems_allowed | 577 | * Call without callback_sem or task_lock() held. May be called with |
| 562 | * data, within the tasks context, when it is trying to allocate | 578 | * or without manage_sem held. Will acquire task_lock() and might |
| 563 | * memory (in various mm/mempolicy.c routines) and notices | 579 | * acquire callback_sem during call. |
| 564 | * that some other task has been modifying its cpuset. | 580 | * |
| 581 | * The task_lock() is required to dereference current->cpuset safely. | ||
| 582 | * Without it, we could pick up the pointer value of current->cpuset | ||
| 583 | * in one instruction, and then attach_task could give us a different | ||
| 584 | * cpuset, and then the cpuset we had could be removed and freed, | ||
| 585 | * and then on our next instruction, we could dereference a no longer | ||
| 586 | * valid cpuset pointer to get its mems_generation field. | ||
| 587 | * | ||
| 588 | * This routine is needed to update the per-task mems_allowed data, | ||
| 589 | * within the tasks context, when it is trying to allocate memory | ||
| 590 | * (in various mm/mempolicy.c routines) and notices that some other | ||
| 591 | * task has been modifying its cpuset. | ||
| 565 | */ | 592 | */ |
| 566 | 593 | ||
| 567 | static void refresh_mems(void) | 594 | static void refresh_mems(void) |
| 568 | { | 595 | { |
| 569 | struct cpuset *cs = current->cpuset; | 596 | int my_cpusets_mem_gen; |
| 597 | |||
| 598 | task_lock(current); | ||
| 599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
| 600 | task_unlock(current); | ||
| 570 | 601 | ||
| 571 | if (current->cpuset_mems_generation != cs->mems_generation) { | 602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { |
| 603 | struct cpuset *cs; | ||
| 604 | nodemask_t oldmem = current->mems_allowed; | ||
| 605 | |||
| 606 | down(&callback_sem); | ||
| 607 | task_lock(current); | ||
| 608 | cs = current->cpuset; | ||
| 572 | guarantee_online_mems(cs, ¤t->mems_allowed); | 609 | guarantee_online_mems(cs, ¤t->mems_allowed); |
| 573 | current->cpuset_mems_generation = cs->mems_generation; | 610 | current->cpuset_mems_generation = cs->mems_generation; |
| 611 | task_unlock(current); | ||
| 612 | up(&callback_sem); | ||
| 613 | if (!nodes_equal(oldmem, current->mems_allowed)) | ||
| 614 | numa_policy_rebind(&oldmem, ¤t->mems_allowed); | ||
| 574 | } | 615 | } |
| 575 | } | 616 | } |
| 576 | 617 | ||
| @@ -579,7 +620,7 @@ static void refresh_mems(void) | |||
| 579 | * | 620 | * |
| 580 | * One cpuset is a subset of another if all its allowed CPUs and | 621 | * One cpuset is a subset of another if all its allowed CPUs and |
| 581 | * Memory Nodes are a subset of the other, and its exclusive flags | 622 | * Memory Nodes are a subset of the other, and its exclusive flags |
| 582 | * are only set if the other's are set. | 623 | * are only set if the other's are set. Call holding manage_sem. |
| 583 | */ | 624 | */ |
| 584 | 625 | ||
| 585 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 626 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
| @@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
| 597 | * If we replaced the flag and mask values of the current cpuset | 638 | * If we replaced the flag and mask values of the current cpuset |
| 598 | * (cur) with those values in the trial cpuset (trial), would | 639 | * (cur) with those values in the trial cpuset (trial), would |
| 599 | * our various subset and exclusive rules still be valid? Presumes | 640 | * our various subset and exclusive rules still be valid? Presumes |
| 600 | * cpuset_sem held. | 641 | * manage_sem held. |
| 601 | * | 642 | * |
| 602 | * 'cur' is the address of an actual, in-use cpuset. Operations | 643 | * 'cur' is the address of an actual, in-use cpuset. Operations |
| 603 | * such as list traversal that depend on the actual address of the | 644 | * such as list traversal that depend on the actual address of the |
| @@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 651 | * exclusive child cpusets | 692 | * exclusive child cpusets |
| 652 | * Build these two partitions by calling partition_sched_domains | 693 | * Build these two partitions by calling partition_sched_domains |
| 653 | * | 694 | * |
| 654 | * Call with cpuset_sem held. May nest a call to the | 695 | * Call with manage_sem held. May nest a call to the |
| 655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 696 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
| 656 | */ | 697 | */ |
| 657 | 698 | ||
| @@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
| 696 | unlock_cpu_hotplug(); | 737 | unlock_cpu_hotplug(); |
| 697 | } | 738 | } |
| 698 | 739 | ||
| 740 | /* | ||
| 741 | * Call with manage_sem held. May take callback_sem during call. | ||
| 742 | */ | ||
| 743 | |||
| 699 | static int update_cpumask(struct cpuset *cs, char *buf) | 744 | static int update_cpumask(struct cpuset *cs, char *buf) |
| 700 | { | 745 | { |
| 701 | struct cpuset trialcs; | 746 | struct cpuset trialcs; |
| @@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 712 | if (retval < 0) | 757 | if (retval < 0) |
| 713 | return retval; | 758 | return retval; |
| 714 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 759 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
| 760 | down(&callback_sem); | ||
| 715 | cs->cpus_allowed = trialcs.cpus_allowed; | 761 | cs->cpus_allowed = trialcs.cpus_allowed; |
| 762 | up(&callback_sem); | ||
| 716 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 763 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
| 717 | update_cpu_domains(cs); | 764 | update_cpu_domains(cs); |
| 718 | return 0; | 765 | return 0; |
| 719 | } | 766 | } |
| 720 | 767 | ||
| 768 | /* | ||
| 769 | * Call with manage_sem held. May take callback_sem during call. | ||
| 770 | */ | ||
| 771 | |||
| 721 | static int update_nodemask(struct cpuset *cs, char *buf) | 772 | static int update_nodemask(struct cpuset *cs, char *buf) |
| 722 | { | 773 | { |
| 723 | struct cpuset trialcs; | 774 | struct cpuset trialcs; |
| @@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 732 | return -ENOSPC; | 783 | return -ENOSPC; |
| 733 | retval = validate_change(cs, &trialcs); | 784 | retval = validate_change(cs, &trialcs); |
| 734 | if (retval == 0) { | 785 | if (retval == 0) { |
| 786 | down(&callback_sem); | ||
| 735 | cs->mems_allowed = trialcs.mems_allowed; | 787 | cs->mems_allowed = trialcs.mems_allowed; |
| 736 | atomic_inc(&cpuset_mems_generation); | 788 | atomic_inc(&cpuset_mems_generation); |
| 737 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
| 790 | up(&callback_sem); | ||
| 738 | } | 791 | } |
| 739 | return retval; | 792 | return retval; |
| 740 | } | 793 | } |
| @@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 745 | * CS_NOTIFY_ON_RELEASE) | 798 | * CS_NOTIFY_ON_RELEASE) |
| 746 | * cs: the cpuset to update | 799 | * cs: the cpuset to update |
| 747 | * buf: the buffer where we read the 0 or 1 | 800 | * buf: the buffer where we read the 0 or 1 |
| 801 | * | ||
| 802 | * Call with manage_sem held. | ||
| 748 | */ | 803 | */ |
| 749 | 804 | ||
| 750 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 805 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
| @@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 766 | return err; | 821 | return err; |
| 767 | cpu_exclusive_changed = | 822 | cpu_exclusive_changed = |
| 768 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 823 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
| 824 | down(&callback_sem); | ||
| 769 | if (turning_on) | 825 | if (turning_on) |
| 770 | set_bit(bit, &cs->flags); | 826 | set_bit(bit, &cs->flags); |
| 771 | else | 827 | else |
| 772 | clear_bit(bit, &cs->flags); | 828 | clear_bit(bit, &cs->flags); |
| 829 | up(&callback_sem); | ||
| 773 | 830 | ||
| 774 | if (cpu_exclusive_changed) | 831 | if (cpu_exclusive_changed) |
| 775 | update_cpu_domains(cs); | 832 | update_cpu_domains(cs); |
| 776 | return 0; | 833 | return 0; |
| 777 | } | 834 | } |
| 778 | 835 | ||
| 836 | /* | ||
| 837 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
| 838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
| 839 | * notified on release. | ||
| 840 | * | ||
| 841 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
| 842 | * the task 'pid' during call. | ||
| 843 | */ | ||
| 844 | |||
| 779 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 845 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
| 780 | { | 846 | { |
| 781 | pid_t pid; | 847 | pid_t pid; |
| @@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 792 | read_lock(&tasklist_lock); | 858 | read_lock(&tasklist_lock); |
| 793 | 859 | ||
| 794 | tsk = find_task_by_pid(pid); | 860 | tsk = find_task_by_pid(pid); |
| 795 | if (!tsk) { | 861 | if (!tsk || tsk->flags & PF_EXITING) { |
| 796 | read_unlock(&tasklist_lock); | 862 | read_unlock(&tasklist_lock); |
| 797 | return -ESRCH; | 863 | return -ESRCH; |
| 798 | } | 864 | } |
| @@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 810 | get_task_struct(tsk); | 876 | get_task_struct(tsk); |
| 811 | } | 877 | } |
| 812 | 878 | ||
| 879 | down(&callback_sem); | ||
| 880 | |||
| 813 | task_lock(tsk); | 881 | task_lock(tsk); |
| 814 | oldcs = tsk->cpuset; | 882 | oldcs = tsk->cpuset; |
| 815 | if (!oldcs) { | 883 | if (!oldcs) { |
| 816 | task_unlock(tsk); | 884 | task_unlock(tsk); |
| 885 | up(&callback_sem); | ||
| 817 | put_task_struct(tsk); | 886 | put_task_struct(tsk); |
| 818 | return -ESRCH; | 887 | return -ESRCH; |
| 819 | } | 888 | } |
| @@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 824 | guarantee_online_cpus(cs, &cpus); | 893 | guarantee_online_cpus(cs, &cpus); |
| 825 | set_cpus_allowed(tsk, cpus); | 894 | set_cpus_allowed(tsk, cpus); |
| 826 | 895 | ||
| 896 | up(&callback_sem); | ||
| 827 | put_task_struct(tsk); | 897 | put_task_struct(tsk); |
| 828 | if (atomic_dec_and_test(&oldcs->count)) | 898 | if (atomic_dec_and_test(&oldcs->count)) |
| 829 | check_for_release(oldcs, ppathbuf); | 899 | check_for_release(oldcs, ppathbuf); |
| @@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 867 | } | 937 | } |
| 868 | buffer[nbytes] = 0; /* nul-terminate */ | 938 | buffer[nbytes] = 0; /* nul-terminate */ |
| 869 | 939 | ||
| 870 | cpuset_down(&cpuset_sem); | 940 | down(&manage_sem); |
| 871 | 941 | ||
| 872 | if (is_removed(cs)) { | 942 | if (is_removed(cs)) { |
| 873 | retval = -ENODEV; | 943 | retval = -ENODEV; |
| @@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 901 | if (retval == 0) | 971 | if (retval == 0) |
| 902 | retval = nbytes; | 972 | retval = nbytes; |
| 903 | out2: | 973 | out2: |
| 904 | cpuset_up(&cpuset_sem); | 974 | up(&manage_sem); |
| 905 | cpuset_release_agent(pathbuf); | 975 | cpuset_release_agent(pathbuf); |
| 906 | out1: | 976 | out1: |
| 907 | kfree(buffer); | 977 | kfree(buffer); |
| @@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
| 941 | { | 1011 | { |
| 942 | cpumask_t mask; | 1012 | cpumask_t mask; |
| 943 | 1013 | ||
| 944 | cpuset_down(&cpuset_sem); | 1014 | down(&callback_sem); |
| 945 | mask = cs->cpus_allowed; | 1015 | mask = cs->cpus_allowed; |
| 946 | cpuset_up(&cpuset_sem); | 1016 | up(&callback_sem); |
| 947 | 1017 | ||
| 948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1018 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
| 949 | } | 1019 | } |
| @@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 952 | { | 1022 | { |
| 953 | nodemask_t mask; | 1023 | nodemask_t mask; |
| 954 | 1024 | ||
| 955 | cpuset_down(&cpuset_sem); | 1025 | down(&callback_sem); |
| 956 | mask = cs->mems_allowed; | 1026 | mask = cs->mems_allowed; |
| 957 | cpuset_up(&cpuset_sem); | 1027 | up(&callback_sem); |
| 958 | 1028 | ||
| 959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1029 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
| 960 | } | 1030 | } |
| @@ -995,7 +1065,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
| 995 | goto out; | 1065 | goto out; |
| 996 | } | 1066 | } |
| 997 | *s++ = '\n'; | 1067 | *s++ = '\n'; |
| 998 | *s = '\0'; | ||
| 999 | 1068 | ||
| 1000 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); | 1069 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); |
| 1001 | out: | 1070 | out: |
| @@ -1048,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) | |||
| 1048 | return 0; | 1117 | return 0; |
| 1049 | } | 1118 | } |
| 1050 | 1119 | ||
| 1120 | /* | ||
| 1121 | * cpuset_rename - Only allow simple rename of directories in place. | ||
| 1122 | */ | ||
| 1123 | static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
| 1124 | struct inode *new_dir, struct dentry *new_dentry) | ||
| 1125 | { | ||
| 1126 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||
| 1127 | return -ENOTDIR; | ||
| 1128 | if (new_dentry->d_inode) | ||
| 1129 | return -EEXIST; | ||
| 1130 | if (old_dir != new_dir) | ||
| 1131 | return -EIO; | ||
| 1132 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
| 1133 | } | ||
| 1134 | |||
| 1051 | static struct file_operations cpuset_file_operations = { | 1135 | static struct file_operations cpuset_file_operations = { |
| 1052 | .read = cpuset_file_read, | 1136 | .read = cpuset_file_read, |
| 1053 | .write = cpuset_file_write, | 1137 | .write = cpuset_file_write, |
| @@ -1060,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { | |||
| 1060 | .lookup = simple_lookup, | 1144 | .lookup = simple_lookup, |
| 1061 | .mkdir = cpuset_mkdir, | 1145 | .mkdir = cpuset_mkdir, |
| 1062 | .rmdir = cpuset_rmdir, | 1146 | .rmdir = cpuset_rmdir, |
| 1147 | .rename = cpuset_rename, | ||
| 1063 | }; | 1148 | }; |
| 1064 | 1149 | ||
| 1065 | static int cpuset_create_file(struct dentry *dentry, int mode) | 1150 | static int cpuset_create_file(struct dentry *dentry, int mode) |
| @@ -1163,7 +1248,9 @@ struct ctr_struct { | |||
| 1163 | 1248 | ||
| 1164 | /* | 1249 | /* |
| 1165 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1250 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
| 1166 | * Return actual number of pids loaded. | 1251 | * Return actual number of pids loaded. No need to task_lock(p) |
| 1252 | * when reading out p->cpuset, as we don't really care if it changes | ||
| 1253 | * on the next cycle, and we are not going to try to dereference it. | ||
| 1167 | */ | 1254 | */ |
| 1168 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1255 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
| 1169 | { | 1256 | { |
| @@ -1205,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
| 1205 | return cnt; | 1292 | return cnt; |
| 1206 | } | 1293 | } |
| 1207 | 1294 | ||
| 1295 | /* | ||
| 1296 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
| 1297 | * process id's of tasks currently attached to the cpuset being opened. | ||
| 1298 | * | ||
| 1299 | * Does not require any specific cpuset semaphores, and does not take any. | ||
| 1300 | */ | ||
| 1208 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1301 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
| 1209 | { | 1302 | { |
| 1210 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1303 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
| @@ -1352,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1352 | if (!cs) | 1445 | if (!cs) |
| 1353 | return -ENOMEM; | 1446 | return -ENOMEM; |
| 1354 | 1447 | ||
| 1355 | cpuset_down(&cpuset_sem); | 1448 | down(&manage_sem); |
| 1449 | refresh_mems(); | ||
| 1356 | cs->flags = 0; | 1450 | cs->flags = 0; |
| 1357 | if (notify_on_release(parent)) | 1451 | if (notify_on_release(parent)) |
| 1358 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
| @@ -1366,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1366 | 1460 | ||
| 1367 | cs->parent = parent; | 1461 | cs->parent = parent; |
| 1368 | 1462 | ||
| 1463 | down(&callback_sem); | ||
| 1369 | list_add(&cs->sibling, &cs->parent->children); | 1464 | list_add(&cs->sibling, &cs->parent->children); |
| 1465 | up(&callback_sem); | ||
| 1370 | 1466 | ||
| 1371 | err = cpuset_create_dir(cs, name, mode); | 1467 | err = cpuset_create_dir(cs, name, mode); |
| 1372 | if (err < 0) | 1468 | if (err < 0) |
| 1373 | goto err; | 1469 | goto err; |
| 1374 | 1470 | ||
| 1375 | /* | 1471 | /* |
| 1376 | * Release cpuset_sem before cpuset_populate_dir() because it | 1472 | * Release manage_sem before cpuset_populate_dir() because it |
| 1377 | * will down() this new directory's i_sem and if we race with | 1473 | * will down() this new directory's i_sem and if we race with |
| 1378 | * another mkdir, we might deadlock. | 1474 | * another mkdir, we might deadlock. |
| 1379 | */ | 1475 | */ |
| 1380 | cpuset_up(&cpuset_sem); | 1476 | up(&manage_sem); |
| 1381 | 1477 | ||
| 1382 | err = cpuset_populate_dir(cs->dentry); | 1478 | err = cpuset_populate_dir(cs->dentry); |
| 1383 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1479 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
| 1384 | return 0; | 1480 | return 0; |
| 1385 | err: | 1481 | err: |
| 1386 | list_del(&cs->sibling); | 1482 | list_del(&cs->sibling); |
| 1387 | cpuset_up(&cpuset_sem); | 1483 | up(&manage_sem); |
| 1388 | kfree(cs); | 1484 | kfree(cs); |
| 1389 | return err; | 1485 | return err; |
| 1390 | } | 1486 | } |
| @@ -1406,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1406 | 1502 | ||
| 1407 | /* the vfs holds both inode->i_sem already */ | 1503 | /* the vfs holds both inode->i_sem already */ |
| 1408 | 1504 | ||
| 1409 | cpuset_down(&cpuset_sem); | 1505 | down(&manage_sem); |
| 1506 | refresh_mems(); | ||
| 1410 | if (atomic_read(&cs->count) > 0) { | 1507 | if (atomic_read(&cs->count) > 0) { |
| 1411 | cpuset_up(&cpuset_sem); | 1508 | up(&manage_sem); |
| 1412 | return -EBUSY; | 1509 | return -EBUSY; |
| 1413 | } | 1510 | } |
| 1414 | if (!list_empty(&cs->children)) { | 1511 | if (!list_empty(&cs->children)) { |
| 1415 | cpuset_up(&cpuset_sem); | 1512 | up(&manage_sem); |
| 1416 | return -EBUSY; | 1513 | return -EBUSY; |
| 1417 | } | 1514 | } |
| 1418 | parent = cs->parent; | 1515 | parent = cs->parent; |
| 1516 | down(&callback_sem); | ||
| 1419 | set_bit(CS_REMOVED, &cs->flags); | 1517 | set_bit(CS_REMOVED, &cs->flags); |
| 1420 | if (is_cpu_exclusive(cs)) | 1518 | if (is_cpu_exclusive(cs)) |
| 1421 | update_cpu_domains(cs); | 1519 | update_cpu_domains(cs); |
| 1422 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1520 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
| 1423 | if (list_empty(&parent->children)) | ||
| 1424 | check_for_release(parent, &pathbuf); | ||
| 1425 | spin_lock(&cs->dentry->d_lock); | 1521 | spin_lock(&cs->dentry->d_lock); |
| 1426 | d = dget(cs->dentry); | 1522 | d = dget(cs->dentry); |
| 1427 | cs->dentry = NULL; | 1523 | cs->dentry = NULL; |
| 1428 | spin_unlock(&d->d_lock); | 1524 | spin_unlock(&d->d_lock); |
| 1429 | cpuset_d_remove_dir(d); | 1525 | cpuset_d_remove_dir(d); |
| 1430 | dput(d); | 1526 | dput(d); |
| 1431 | cpuset_up(&cpuset_sem); | 1527 | up(&callback_sem); |
| 1528 | if (list_empty(&parent->children)) | ||
| 1529 | check_for_release(parent, &pathbuf); | ||
| 1530 | up(&manage_sem); | ||
| 1432 | cpuset_release_agent(pathbuf); | 1531 | cpuset_release_agent(pathbuf); |
| 1433 | return 0; | 1532 | return 0; |
| 1434 | } | 1533 | } |
| @@ -1488,16 +1587,26 @@ void __init cpuset_init_smp(void) | |||
| 1488 | * cpuset_fork - attach newly forked task to its parents cpuset. | 1587 | * cpuset_fork - attach newly forked task to its parents cpuset. |
| 1489 | * @tsk: pointer to task_struct of forking parent process. | 1588 | * @tsk: pointer to task_struct of forking parent process. |
| 1490 | * | 1589 | * |
| 1491 | * Description: By default, on fork, a task inherits its | 1590 | * Description: A task inherits its parent's cpuset at fork(). |
| 1492 | * parent's cpuset. The pointer to the shared cpuset is | 1591 | * |
| 1493 | * automatically copied in fork.c by dup_task_struct(). | 1592 | * A pointer to the shared cpuset was automatically copied in fork.c |
| 1494 | * This cpuset_fork() routine need only increment the usage | 1593 | * by dup_task_struct(). However, we ignore that copy, since it was |
| 1495 | * counter in that cpuset. | 1594 | * not made under the protection of task_lock(), so might no longer be |
| 1595 | * a valid cpuset pointer. attach_task() might have already changed | ||
| 1596 | * current->cpuset, allowing the previously referenced cpuset to | ||
| 1597 | * be removed and freed. Instead, we task_lock(current) and copy | ||
| 1598 | * its present value of current->cpuset for our freshly forked child. | ||
| 1599 | * | ||
| 1600 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
| 1601 | * task, and the passed argument 'child' points to the child task. | ||
| 1496 | **/ | 1602 | **/ |
| 1497 | 1603 | ||
| 1498 | void cpuset_fork(struct task_struct *tsk) | 1604 | void cpuset_fork(struct task_struct *child) |
| 1499 | { | 1605 | { |
| 1500 | atomic_inc(&tsk->cpuset->count); | 1606 | task_lock(current); |
| 1607 | child->cpuset = current->cpuset; | ||
| 1608 | atomic_inc(&child->cpuset->count); | ||
| 1609 | task_unlock(current); | ||
| 1501 | } | 1610 | } |
| 1502 | 1611 | ||
| 1503 | /** | 1612 | /** |
| @@ -1506,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
| 1506 | * | 1615 | * |
| 1507 | * Description: Detach cpuset from @tsk and release it. | 1616 | * Description: Detach cpuset from @tsk and release it. |
| 1508 | * | 1617 | * |
| 1509 | * Note that cpusets marked notify_on_release force every task | 1618 | * Note that cpusets marked notify_on_release force every task in |
| 1510 | * in them to take the global cpuset_sem semaphore when exiting. | 1619 | * them to take the global manage_sem semaphore when exiting. |
| 1511 | * This could impact scaling on very large systems. Be reluctant | 1620 | * This could impact scaling on very large systems. Be reluctant to |
| 1512 | * to use notify_on_release cpusets where very high task exit | 1621 | * use notify_on_release cpusets where very high task exit scaling |
| 1513 | * scaling is required on large systems. | 1622 | * is required on large systems. |
| 1514 | * | 1623 | * |
| 1515 | * Don't even think about dereferencing 'cs' after the cpuset use | 1624 | * Don't even think about dereferencing 'cs' after the cpuset use count |
| 1516 | * count goes to zero, except inside a critical section guarded | 1625 | * goes to zero, except inside a critical section guarded by manage_sem |
| 1517 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1626 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
| 1518 | * then a zero cpuset use count is a license to any other task to | 1627 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
| 1519 | * nuke the cpuset immediately. | 1628 | * |
| 1629 | * This routine has to take manage_sem, not callback_sem, because | ||
| 1630 | * it is holding that semaphore while calling check_for_release(), | ||
| 1631 | * which calls kmalloc(), so can't be called holding callback_sem. | ||
| 1632 | * | ||
| 1633 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
| 1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
| 1635 | * mess with it. | ||
| 1520 | **/ | 1636 | **/ |
| 1521 | 1637 | ||
| 1522 | void cpuset_exit(struct task_struct *tsk) | 1638 | void cpuset_exit(struct task_struct *tsk) |
| 1523 | { | 1639 | { |
| 1524 | struct cpuset *cs; | 1640 | struct cpuset *cs; |
| 1525 | 1641 | ||
| 1526 | task_lock(tsk); | 1642 | BUG_ON(!(tsk->flags & PF_EXITING)); |
| 1643 | |||
| 1527 | cs = tsk->cpuset; | 1644 | cs = tsk->cpuset; |
| 1528 | tsk->cpuset = NULL; | 1645 | tsk->cpuset = NULL; |
| 1529 | task_unlock(tsk); | ||
| 1530 | 1646 | ||
| 1531 | if (notify_on_release(cs)) { | 1647 | if (notify_on_release(cs)) { |
| 1532 | char *pathbuf = NULL; | 1648 | char *pathbuf = NULL; |
| 1533 | 1649 | ||
| 1534 | cpuset_down(&cpuset_sem); | 1650 | down(&manage_sem); |
| 1535 | if (atomic_dec_and_test(&cs->count)) | 1651 | if (atomic_dec_and_test(&cs->count)) |
| 1536 | check_for_release(cs, &pathbuf); | 1652 | check_for_release(cs, &pathbuf); |
| 1537 | cpuset_up(&cpuset_sem); | 1653 | up(&manage_sem); |
| 1538 | cpuset_release_agent(pathbuf); | 1654 | cpuset_release_agent(pathbuf); |
| 1539 | } else { | 1655 | } else { |
| 1540 | atomic_dec(&cs->count); | 1656 | atomic_dec(&cs->count); |
| @@ -1555,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
| 1555 | { | 1671 | { |
| 1556 | cpumask_t mask; | 1672 | cpumask_t mask; |
| 1557 | 1673 | ||
| 1558 | cpuset_down(&cpuset_sem); | 1674 | down(&callback_sem); |
| 1559 | task_lock((struct task_struct *)tsk); | 1675 | task_lock((struct task_struct *)tsk); |
| 1560 | guarantee_online_cpus(tsk->cpuset, &mask); | 1676 | guarantee_online_cpus(tsk->cpuset, &mask); |
| 1561 | task_unlock((struct task_struct *)tsk); | 1677 | task_unlock((struct task_struct *)tsk); |
| 1562 | cpuset_up(&cpuset_sem); | 1678 | up(&callback_sem); |
| 1563 | 1679 | ||
| 1564 | return mask; | 1680 | return mask; |
| 1565 | } | 1681 | } |
| @@ -1575,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) | |||
| 1575 | * If the current tasks cpusets mems_allowed changed behind our backs, | 1691 | * If the current tasks cpusets mems_allowed changed behind our backs, |
| 1576 | * update current->mems_allowed and mems_generation to the new value. | 1692 | * update current->mems_allowed and mems_generation to the new value. |
| 1577 | * Do not call this routine if in_interrupt(). | 1693 | * Do not call this routine if in_interrupt(). |
| 1694 | * | ||
| 1695 | * Call without callback_sem or task_lock() held. May be called | ||
| 1696 | * with or without manage_sem held. Unless exiting, it will acquire | ||
| 1697 | * task_lock(). Also might acquire callback_sem during call to | ||
| 1698 | * refresh_mems(). | ||
| 1578 | */ | 1699 | */ |
| 1579 | 1700 | ||
| 1580 | void cpuset_update_current_mems_allowed(void) | 1701 | void cpuset_update_current_mems_allowed(void) |
| 1581 | { | 1702 | { |
| 1582 | struct cpuset *cs = current->cpuset; | 1703 | struct cpuset *cs; |
| 1704 | int need_to_refresh = 0; | ||
| 1583 | 1705 | ||
| 1706 | task_lock(current); | ||
| 1707 | cs = current->cpuset; | ||
| 1584 | if (!cs) | 1708 | if (!cs) |
| 1585 | return; /* task is exiting */ | 1709 | goto done; |
| 1586 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1710 | if (current->cpuset_mems_generation != cs->mems_generation) |
| 1587 | cpuset_down(&cpuset_sem); | 1711 | need_to_refresh = 1; |
| 1712 | done: | ||
| 1713 | task_unlock(current); | ||
| 1714 | if (need_to_refresh) | ||
| 1588 | refresh_mems(); | 1715 | refresh_mems(); |
| 1589 | cpuset_up(&cpuset_sem); | ||
| 1590 | } | ||
| 1591 | } | 1716 | } |
| 1592 | 1717 | ||
| 1593 | /** | 1718 | /** |
| @@ -1621,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
| 1621 | 1746 | ||
| 1622 | /* | 1747 | /* |
| 1623 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1748 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
| 1624 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1749 | * ancestor to the specified cpuset. Call holding callback_sem. |
| 1625 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1750 | * If no ancestor is mem_exclusive (an unusual configuration), then |
| 1626 | * returns the root cpuset. | 1751 | * returns the root cpuset. |
| 1627 | */ | 1752 | */ |
| @@ -1648,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 1648 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1773 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 1649 | * nearest mem_exclusive ancestor cpuset. | 1774 | * nearest mem_exclusive ancestor cpuset. |
| 1650 | * | 1775 | * |
| 1651 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1776 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
| 1652 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1777 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
| 1653 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 1778 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
| 1654 | * mems_allowed came up empty on the first pass over the zonelist. | 1779 | * mems_allowed came up empty on the first pass over the zonelist. |
| 1655 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1780 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
| 1656 | * short of memory, might require taking the cpuset_sem semaphore. | 1781 | * short of memory, might require taking the callback_sem semaphore. |
| 1657 | * | 1782 | * |
| 1658 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1783 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
| 1659 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1784 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
| @@ -1685,14 +1810,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
| 1685 | return 0; | 1810 | return 0; |
| 1686 | 1811 | ||
| 1687 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1812 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
| 1688 | cpuset_down(&cpuset_sem); | 1813 | down(&callback_sem); |
| 1689 | cs = current->cpuset; | 1814 | |
| 1690 | if (!cs) | 1815 | if (current->flags & PF_EXITING) /* Let dying task have memory */ |
| 1691 | goto done; /* current task exiting */ | 1816 | return 1; |
| 1692 | cs = nearest_exclusive_ancestor(cs); | 1817 | task_lock(current); |
| 1818 | cs = nearest_exclusive_ancestor(current->cpuset); | ||
| 1819 | task_unlock(current); | ||
| 1820 | |||
| 1693 | allowed = node_isset(node, cs->mems_allowed); | 1821 | allowed = node_isset(node, cs->mems_allowed); |
| 1694 | done: | 1822 | up(&callback_sem); |
| 1695 | cpuset_up(&cpuset_sem); | ||
| 1696 | return allowed; | 1823 | return allowed; |
| 1697 | } | 1824 | } |
| 1698 | 1825 | ||
| @@ -1705,7 +1832,7 @@ done: | |||
| 1705 | * determine if task @p's memory usage might impact the memory | 1832 | * determine if task @p's memory usage might impact the memory |
| 1706 | * available to the current task. | 1833 | * available to the current task. |
| 1707 | * | 1834 | * |
| 1708 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1835 | * Acquires callback_sem - not suitable for calling from a fast path. |
| 1709 | **/ | 1836 | **/ |
| 1710 | 1837 | ||
| 1711 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1838 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
| @@ -1713,18 +1840,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
| 1713 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1840 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
| 1714 | int overlap = 0; /* do cpusets overlap? */ | 1841 | int overlap = 0; /* do cpusets overlap? */ |
| 1715 | 1842 | ||
| 1716 | cpuset_down(&cpuset_sem); | 1843 | down(&callback_sem); |
| 1717 | cs1 = current->cpuset; | 1844 | |
| 1718 | if (!cs1) | 1845 | task_lock(current); |
| 1719 | goto done; /* current task exiting */ | 1846 | if (current->flags & PF_EXITING) { |
| 1720 | cs2 = p->cpuset; | 1847 | task_unlock(current); |
| 1721 | if (!cs2) | 1848 | goto done; |
| 1722 | goto done; /* task p is exiting */ | 1849 | } |
| 1723 | cs1 = nearest_exclusive_ancestor(cs1); | 1850 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
| 1724 | cs2 = nearest_exclusive_ancestor(cs2); | 1851 | task_unlock(current); |
| 1852 | |||
| 1853 | task_lock((struct task_struct *)p); | ||
| 1854 | if (p->flags & PF_EXITING) { | ||
| 1855 | task_unlock((struct task_struct *)p); | ||
| 1856 | goto done; | ||
| 1857 | } | ||
| 1858 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
| 1859 | task_unlock((struct task_struct *)p); | ||
| 1860 | |||
| 1725 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1861 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
| 1726 | done: | 1862 | done: |
| 1727 | cpuset_up(&cpuset_sem); | 1863 | up(&callback_sem); |
| 1728 | 1864 | ||
| 1729 | return overlap; | 1865 | return overlap; |
| 1730 | } | 1866 | } |
| @@ -1733,6 +1869,10 @@ done: | |||
| 1733 | * proc_cpuset_show() | 1869 | * proc_cpuset_show() |
| 1734 | * - Print tasks cpuset path into seq_file. | 1870 | * - Print tasks cpuset path into seq_file. |
| 1735 | * - Used for /proc/<pid>/cpuset. | 1871 | * - Used for /proc/<pid>/cpuset. |
| 1872 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
| 1873 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
| 1874 | * and we take manage_sem, keeping attach_task() from changing it | ||
| 1875 | * anyway. | ||
| 1736 | */ | 1876 | */ |
| 1737 | 1877 | ||
| 1738 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1878 | static int proc_cpuset_show(struct seq_file *m, void *v) |
| @@ -1747,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 1747 | return -ENOMEM; | 1887 | return -ENOMEM; |
| 1748 | 1888 | ||
| 1749 | tsk = m->private; | 1889 | tsk = m->private; |
| 1750 | cpuset_down(&cpuset_sem); | 1890 | down(&manage_sem); |
| 1751 | task_lock(tsk); | ||
| 1752 | cs = tsk->cpuset; | 1891 | cs = tsk->cpuset; |
| 1753 | task_unlock(tsk); | ||
| 1754 | if (!cs) { | 1892 | if (!cs) { |
| 1755 | retval = -EINVAL; | 1893 | retval = -EINVAL; |
| 1756 | goto out; | 1894 | goto out; |
| @@ -1762,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 1762 | seq_puts(m, buf); | 1900 | seq_puts(m, buf); |
| 1763 | seq_putc(m, '\n'); | 1901 | seq_putc(m, '\n'); |
| 1764 | out: | 1902 | out: |
| 1765 | cpuset_up(&cpuset_sem); | 1903 | up(&manage_sem); |
| 1766 | kfree(buf); | 1904 | kfree(buf); |
| 1767 | return retval; | 1905 | return retval; |
| 1768 | } | 1906 | } |
diff --git a/kernel/exit.c b/kernel/exit.c
index 79f52b85d6ed..537394b25e8d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
| 547 | 547 | ||
| 548 | if (p->pdeath_signal) | 548 | if (p->pdeath_signal) |
| 549 | /* We already hold the tasklist_lock here. */ | 549 | /* We already hold the tasklist_lock here. */ |
| 550 | group_send_sig_info(p->pdeath_signal, (void *) 0, p); | 550 | group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); |
| 551 | 551 | ||
| 552 | /* Move the child from its dying parent to the new one. */ | 552 | /* Move the child from its dying parent to the new one. */ |
| 553 | if (unlikely(traced)) { | 553 | if (unlikely(traced)) { |
| @@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
| 591 | int pgrp = process_group(p); | 591 | int pgrp = process_group(p); |
| 592 | 592 | ||
| 593 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 593 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { |
| 594 | __kill_pg_info(SIGHUP, (void *)1, pgrp); | 594 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
| 595 | __kill_pg_info(SIGCONT, (void *)1, pgrp); | 595 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
| 596 | } | 596 | } |
| 597 | } | 597 | } |
| 598 | } | 598 | } |
| @@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) | |||
| 727 | (t->signal->session == tsk->signal->session) && | 727 | (t->signal->session == tsk->signal->session) && |
| 728 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 728 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
| 729 | has_stopped_jobs(process_group(tsk))) { | 729 | has_stopped_jobs(process_group(tsk))) { |
| 730 | __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); | 730 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
| 731 | __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); | 731 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); |
| 732 | } | 732 | } |
| 733 | 733 | ||
| 734 | /* Let father know we died | 734 | /* Let father know we died |
| @@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) | |||
| 783 | /* If the process is dead, release it - nobody will wait for it */ | 783 | /* If the process is dead, release it - nobody will wait for it */ |
| 784 | if (state == EXIT_DEAD) | 784 | if (state == EXIT_DEAD) |
| 785 | release_task(tsk); | 785 | release_task(tsk); |
| 786 | |||
| 787 | /* PF_DEAD causes final put_task_struct after we schedule. */ | ||
| 788 | preempt_disable(); | ||
| 789 | tsk->flags |= PF_DEAD; | ||
| 790 | } | 786 | } |
| 791 | 787 | ||
| 792 | fastcall NORET_TYPE void do_exit(long code) | 788 | fastcall NORET_TYPE void do_exit(long code) |
| @@ -873,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 873 | tsk->mempolicy = NULL; | 869 | tsk->mempolicy = NULL; |
| 874 | #endif | 870 | #endif |
| 875 | 871 | ||
| 876 | BUG_ON(!(current->flags & PF_DEAD)); | 872 | /* PF_DEAD causes final put_task_struct after we schedule. */ |
| 873 | preempt_disable(); | ||
| 874 | BUG_ON(tsk->flags & PF_DEAD); | ||
| 875 | tsk->flags |= PF_DEAD; | ||
| 876 | |||
| 877 | schedule(); | 877 | schedule(); |
| 878 | BUG(); | 878 | BUG(); |
| 879 | /* Avoid "noreturn function does return". */ | 879 | /* Avoid "noreturn function does return". */ |
| @@ -1383,6 +1383,15 @@ repeat: | |||
| 1383 | 1383 | ||
| 1384 | switch (p->state) { | 1384 | switch (p->state) { |
| 1385 | case TASK_TRACED: | 1385 | case TASK_TRACED: |
| 1386 | /* | ||
| 1387 | * When we hit the race with PTRACE_ATTACH, | ||
| 1388 | * we will not report this child. But the | ||
| 1389 | * race means it has not yet been moved to | ||
| 1390 | * our ptrace_children list, so we need to | ||
| 1391 | * set the flag here to avoid a spurious ECHILD | ||
| 1392 | * when the race happens with the only child. | ||
| 1393 | */ | ||
| 1394 | flag = 1; | ||
| 1386 | if (!my_ptrace_child(p)) | 1395 | if (!my_ptrace_child(p)) |
| 1387 | continue; | 1396 | continue; |
| 1388 | /*FALLTHROUGH*/ | 1397 | /*FALLTHROUGH*/ |
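Note on the signal hunks above: the bare (void *) 0 and (void *) 1 casts handed to group_send_sig_info()/__kill_pg_info() are replaced by named siginfo markers. As a hedged sketch (the exact spellings are an assumption about this kernel generation, not quoted from the diff), the markers are special pointer values that the signal code compares against rather than dereferences:

	/* hedged sketch of the marker values assumed by the hunks above */
	#define SEND_SIG_NOINFO	((struct siginfo *) 0)	/* was the bare (void *) 0 case      */
	#define SEND_SIG_PRIV	((struct siginfo *) 1)	/* was the (void *) 1 "from kernel"  */

The call sites become self-documenting while the pointer-value semantics stay unchanged.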
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3ff7b925c387..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
| 117 | /* | 117 | /* |
| 118 | * No locking required for CPU-local interrupts: | 118 | * No locking required for CPU-local interrupts: |
| 119 | */ | 119 | */ |
| 120 | desc->handler->ack(irq); | 120 | if (desc->handler->ack) |
| 121 | desc->handler->ack(irq); | ||
| 121 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 122 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
| 122 | desc->handler->end(irq); | 123 | desc->handler->end(irq); |
| 123 | return 1; | 124 | return 1; |
| 124 | } | 125 | } |
| 125 | 126 | ||
| 126 | spin_lock(&desc->lock); | 127 | spin_lock(&desc->lock); |
| 127 | desc->handler->ack(irq); | 128 | if (desc->handler->ack) |
| 129 | desc->handler->ack(irq); | ||
| 128 | /* | 130 | /* |
| 129 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 131 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
| 130 | * WAITING is used by probe to mark irqs that are being tested | 132 | * WAITING is used by probe to mark irqs that are being tested |
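The two new NULL checks above make the ->ack hook optional in __do_IRQ(). A controller that has nothing to acknowledge can now simply leave the slot unset; the names below are hypothetical and only assume the usual hw_interrupt_type layout of this era:

	/* hypothetical controller with no ack cycle: the new guards skip
	 * the call instead of dereferencing a NULL method pointer */
	static struct hw_interrupt_type example_irq_type = {
		.typename	= "example-pic",
		.startup	= example_startup,
		.shutdown	= example_shutdown,
		.enable		= example_enable,
		.disable	= example_disable,
		/* .ack intentionally left unset */
		.end		= example_end,
	};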
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
| 19 | #include <linux/err.h> | 19 | #include <linux/err.h> |
| 20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
| 21 | #include <linux/sched.h> /* for cond_resched */ | ||
| 21 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
| 22 | 23 | ||
| 23 | #include <asm/sections.h> | 24 | #include <asm/sections.h> |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -131,14 +131,14 @@ struct subprocess_info { | |||
| 131 | static int ____call_usermodehelper(void *data) | 131 | static int ____call_usermodehelper(void *data) |
| 132 | { | 132 | { |
| 133 | struct subprocess_info *sub_info = data; | 133 | struct subprocess_info *sub_info = data; |
| 134 | struct key *old_session; | 134 | struct key *new_session, *old_session; |
| 135 | int retval; | 135 | int retval; |
| 136 | 136 | ||
| 137 | /* Unblock all signals and set the session keyring. */ | 137 | /* Unblock all signals and set the session keyring. */ |
| 138 | key_get(sub_info->ring); | 138 | new_session = key_get(sub_info->ring); |
| 139 | flush_signals(current); | 139 | flush_signals(current); |
| 140 | spin_lock_irq(¤t->sighand->siglock); | 140 | spin_lock_irq(¤t->sighand->siglock); |
| 141 | old_session = __install_session_keyring(current, sub_info->ring); | 141 | old_session = __install_session_keyring(current, new_session); |
| 142 | flush_signal_handlers(current, 1); | 142 | flush_signal_handlers(current, 1); |
| 143 | sigemptyset(¤t->blocked); | 143 | sigemptyset(¤t->blocked); |
| 144 | recalc_sigpending(); | 144 | recalc_sigpending(); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..ce4915dd683a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
| 36 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
| 37 | #include <linux/init.h> | 37 | #include <linux/init.h> |
| 38 | #include <linux/slab.h> | ||
| 38 | #include <linux/module.h> | 39 | #include <linux/module.h> |
| 39 | #include <linux/moduleloader.h> | 40 | #include <linux/moduleloader.h> |
| 40 | #include <asm-generic/sections.h> | 41 | #include <asm-generic/sections.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); | |||
| 165 | 165 | ||
| 166 | int kthread_stop(struct task_struct *k) | 166 | int kthread_stop(struct task_struct *k) |
| 167 | { | 167 | { |
| 168 | return kthread_stop_sem(k, NULL); | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL(kthread_stop); | ||
| 171 | |||
| 172 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | ||
| 173 | { | ||
| 168 | int ret; | 174 | int ret; |
| 169 | 175 | ||
| 170 | down(&kthread_stop_lock); | 176 | down(&kthread_stop_lock); |
| @@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) | |||
| 178 | 184 | ||
| 179 | /* Now set kthread_should_stop() to true, and wake it up. */ | 185 | /* Now set kthread_should_stop() to true, and wake it up. */ |
| 180 | kthread_stop_info.k = k; | 186 | kthread_stop_info.k = k; |
| 181 | wake_up_process(k); | 187 | if (s) |
| 188 | up(s); | ||
| 189 | else | ||
| 190 | wake_up_process(k); | ||
| 182 | put_task_struct(k); | 191 | put_task_struct(k); |
| 183 | 192 | ||
| 184 | /* Once it dies, reset stop ptr, gather result and we're done. */ | 193 | /* Once it dies, reset stop ptr, gather result and we're done. */ |
| @@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) | |||
| 189 | 198 | ||
| 190 | return ret; | 199 | return ret; |
| 191 | } | 200 | } |
| 192 | EXPORT_SYMBOL(kthread_stop); | 201 | EXPORT_SYMBOL(kthread_stop_sem); |
| 193 | 202 | ||
| 194 | static __init int helper_init(void) | 203 | static __init int helper_init(void) |
| 195 | { | 204 | { |
diff --git a/kernel/params.c b/kernel/params.c index 1a8614bac5d5..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
| 24 | #include <linux/device.h> | 24 | #include <linux/device.h> |
| 25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
| 26 | #include <linux/slab.h> | ||
| 26 | 27 | ||
| 27 | #if 0 | 28 | #if 0 |
| 28 | #define DEBUGP printk | 29 | #define DEBUGP printk |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bf374fceb39c..91a894264941 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1225 | /* | 1225 | /* |
| 1226 | * The task was cleaned up already, no future firings. | 1226 | * The task was cleaned up already, no future firings. |
| 1227 | */ | 1227 | */ |
| 1228 | return; | 1228 | goto out; |
| 1229 | 1229 | ||
| 1230 | /* | 1230 | /* |
| 1231 | * Fetch the current sample and update the timer's expiry time. | 1231 | * Fetch the current sample and update the timer's expiry time. |
| @@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1235 | bump_cpu_timer(timer, now); | 1235 | bump_cpu_timer(timer, now); |
| 1236 | if (unlikely(p->exit_state)) { | 1236 | if (unlikely(p->exit_state)) { |
| 1237 | clear_dead_task(timer, now); | 1237 | clear_dead_task(timer, now); |
| 1238 | return; | 1238 | goto out; |
| 1239 | } | 1239 | } |
| 1240 | read_lock(&tasklist_lock); /* arm_timer needs it. */ | 1240 | read_lock(&tasklist_lock); /* arm_timer needs it. */ |
| 1241 | } else { | 1241 | } else { |
| @@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1248 | put_task_struct(p); | 1248 | put_task_struct(p); |
| 1249 | timer->it.cpu.task = p = NULL; | 1249 | timer->it.cpu.task = p = NULL; |
| 1250 | timer->it.cpu.expires.sched = 0; | 1250 | timer->it.cpu.expires.sched = 0; |
| 1251 | read_unlock(&tasklist_lock); | 1251 | goto out_unlock; |
| 1252 | return; | ||
| 1253 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1252 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
| 1254 | /* | 1253 | /* |
| 1255 | * We've noticed that the thread is dead, but | 1254 | * We've noticed that the thread is dead, but |
| @@ -1257,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1257 | * drop our task ref. | 1256 | * drop our task ref. |
| 1258 | */ | 1257 | */ |
| 1259 | clear_dead_task(timer, now); | 1258 | clear_dead_task(timer, now); |
| 1260 | read_unlock(&tasklist_lock); | 1259 | goto out_unlock; |
| 1261 | return; | ||
| 1262 | } | 1260 | } |
| 1263 | cpu_clock_sample_group(timer->it_clock, p, &now); | 1261 | cpu_clock_sample_group(timer->it_clock, p, &now); |
| 1264 | bump_cpu_timer(timer, now); | 1262 | bump_cpu_timer(timer, now); |
| @@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1270 | */ | 1268 | */ |
| 1271 | arm_timer(timer, now); | 1269 | arm_timer(timer, now); |
| 1272 | 1270 | ||
| 1271 | out_unlock: | ||
| 1273 | read_unlock(&tasklist_lock); | 1272 | read_unlock(&tasklist_lock); |
| 1273 | |||
| 1274 | out: | ||
| 1275 | timer->it_overrun_last = timer->it_overrun; | ||
| 1276 | timer->it_overrun = -1; | ||
| 1277 | ++timer->it_requeue_pending; | ||
| 1274 | } | 1278 | } |
| 1275 | 1279 | ||
| 1276 | /* | 1280 | /* |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dda3cda73c77..ea55c7a1cd75 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | |||
| 1295 | return error; | 1295 | return error; |
| 1296 | } | 1296 | } |
| 1297 | 1297 | ||
| 1298 | static void nanosleep_wake_up(unsigned long __data) | ||
| 1299 | { | ||
| 1300 | struct task_struct *p = (struct task_struct *) __data; | ||
| 1301 | |||
| 1302 | wake_up_process(p); | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | /* | 1298 | /* |
| 1306 | * The standard says that an absolute nanosleep call MUST wake up at | 1299 | * The standard says that an absolute nanosleep call MUST wake up at |
| 1307 | * the requested time in spite of clock settings. Here is what we do: | 1300 | * the requested time in spite of clock settings. Here is what we do: |
| @@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock, | |||
| 1442 | int flags, struct timespec *tsave) | 1435 | int flags, struct timespec *tsave) |
| 1443 | { | 1436 | { |
| 1444 | struct timespec t, dum; | 1437 | struct timespec t, dum; |
| 1445 | struct timer_list new_timer; | ||
| 1446 | DECLARE_WAITQUEUE(abs_wqueue, current); | 1438 | DECLARE_WAITQUEUE(abs_wqueue, current); |
| 1447 | u64 rq_time = (u64)0; | 1439 | u64 rq_time = (u64)0; |
| 1448 | s64 left; | 1440 | s64 left; |
| @@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock, | |||
| 1451 | ¤t_thread_info()->restart_block; | 1443 | ¤t_thread_info()->restart_block; |
| 1452 | 1444 | ||
| 1453 | abs_wqueue.flags = 0; | 1445 | abs_wqueue.flags = 0; |
| 1454 | init_timer(&new_timer); | ||
| 1455 | new_timer.expires = 0; | ||
| 1456 | new_timer.data = (unsigned long) current; | ||
| 1457 | new_timer.function = nanosleep_wake_up; | ||
| 1458 | abs = flags & TIMER_ABSTIME; | 1446 | abs = flags & TIMER_ABSTIME; |
| 1459 | 1447 | ||
| 1460 | if (restart_block->fn == clock_nanosleep_restart) { | 1448 | if (restart_block->fn == clock_nanosleep_restart) { |
| @@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock, | |||
| 1490 | if (left < (s64)0) | 1478 | if (left < (s64)0) |
| 1491 | break; | 1479 | break; |
| 1492 | 1480 | ||
| 1493 | new_timer.expires = jiffies + left; | 1481 | schedule_timeout_interruptible(left); |
| 1494 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 1495 | add_timer(&new_timer); | ||
| 1496 | |||
| 1497 | schedule(); | ||
| 1498 | 1482 | ||
| 1499 | del_timer_sync(&new_timer); | ||
| 1500 | left = rq_time - get_jiffies_64(); | 1483 | left = rq_time - get_jiffies_64(); |
| 1501 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | 1484 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); |
| 1502 | 1485 | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..c71eb4579c07 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG | |||
| 4 | endif | 4 | endif |
| 5 | 5 | ||
| 6 | obj-y := main.o process.o console.o pm.o | 6 | obj-y := main.o process.o console.o pm.o |
| 7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o | 7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o |
| 8 | 8 | ||
| 9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | 9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o |
| 10 | 10 | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 761956e813f5..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -30,7 +30,6 @@ extern int swsusp_check(void); | |||
| 30 | extern int swsusp_read(void); | 30 | extern int swsusp_read(void); |
| 31 | extern void swsusp_close(void); | 31 | extern void swsusp_close(void); |
| 32 | extern int swsusp_resume(void); | 32 | extern int swsusp_resume(void); |
| 33 | extern int swsusp_free(void); | ||
| 34 | 33 | ||
| 35 | 34 | ||
| 36 | static int noresume = 0; | 35 | static int noresume = 0; |
| @@ -93,10 +92,7 @@ static void free_some_memory(void) | |||
| 93 | printk("Freeing memory... "); | 92 | printk("Freeing memory... "); |
| 94 | while ((tmp = shrink_all_memory(10000))) { | 93 | while ((tmp = shrink_all_memory(10000))) { |
| 95 | pages += tmp; | 94 | pages += tmp; |
| 96 | printk("\b%c", p[i]); | 95 | printk("\b%c", p[i++ % 4]); |
| 97 | i++; | ||
| 98 | if (i > 3) | ||
| 99 | i = 0; | ||
| 100 | } | 96 | } |
| 101 | printk("\bdone (%li pages freed)\n", pages); | 97 | printk("\bdone (%li pages freed)\n", pages); |
| 102 | } | 98 | } |
| @@ -178,13 +174,12 @@ int pm_suspend_disk(void) | |||
| 178 | goto Done; | 174 | goto Done; |
| 179 | 175 | ||
| 180 | if (in_suspend) { | 176 | if (in_suspend) { |
| 177 | device_resume(); | ||
| 181 | pr_debug("PM: writing image.\n"); | 178 | pr_debug("PM: writing image.\n"); |
| 182 | error = swsusp_write(); | 179 | error = swsusp_write(); |
| 183 | if (!error) | 180 | if (!error) |
| 184 | power_down(pm_disk_mode); | 181 | power_down(pm_disk_mode); |
| 185 | else { | 182 | else { |
| 186 | /* swsusp_write can not fail in device_resume, | ||
| 187 | no need to do second device_resume */ | ||
| 188 | swsusp_free(); | 183 | swsusp_free(); |
| 189 | unprepare_processes(); | 184 | unprepare_processes(); |
| 190 | return error; | 185 | return error; |
| @@ -252,14 +247,17 @@ static int software_resume(void) | |||
| 252 | 247 | ||
| 253 | pr_debug("PM: Reading swsusp image.\n"); | 248 | pr_debug("PM: Reading swsusp image.\n"); |
| 254 | 249 | ||
| 255 | if ((error = swsusp_read())) | 250 | if ((error = swsusp_read())) { |
| 256 | goto Cleanup; | 251 | swsusp_free(); |
| 252 | goto Thaw; | ||
| 253 | } | ||
| 257 | 254 | ||
| 258 | pr_debug("PM: Preparing devices for restore.\n"); | 255 | pr_debug("PM: Preparing devices for restore.\n"); |
| 259 | 256 | ||
| 260 | if ((error = device_suspend(PMSG_FREEZE))) { | 257 | if ((error = device_suspend(PMSG_FREEZE))) { |
| 261 | printk("Some devices failed to suspend\n"); | 258 | printk("Some devices failed to suspend\n"); |
| 262 | goto Free; | 259 | swsusp_free(); |
| 260 | goto Thaw; | ||
| 263 | } | 261 | } |
| 264 | 262 | ||
| 265 | mb(); | 263 | mb(); |
| @@ -268,9 +266,7 @@ static int software_resume(void) | |||
| 268 | swsusp_resume(); | 266 | swsusp_resume(); |
| 269 | pr_debug("PM: Restore failed, recovering.n"); | 267 | pr_debug("PM: Restore failed, recovering.n"); |
| 270 | device_resume(); | 268 | device_resume(); |
| 271 | Free: | 269 | Thaw: |
| 272 | swsusp_free(); | ||
| 273 | Cleanup: | ||
| 274 | unprepare_processes(); | 270 | unprepare_processes(); |
| 275 | Done: | 271 | Done: |
| 276 | /* For success case, the suspend path will release the lock */ | 272 | /* For success case, the suspend path will release the lock */ |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..18d7d693fbba 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state) | |||
| 167 | { | 167 | { |
| 168 | int error; | 168 | int error; |
| 169 | 169 | ||
| 170 | if (pm_ops->valid && !pm_ops->valid(state)) | ||
| 171 | return -ENODEV; | ||
| 170 | if (down_trylock(&pm_sem)) | 172 | if (down_trylock(&pm_sem)) |
| 171 | return -EBUSY; | 173 | return -EBUSY; |
| 172 | 174 | ||
| @@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
| 236 | char * s = buf; | 238 | char * s = buf; |
| 237 | 239 | ||
| 238 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 240 | for (i = 0; i < PM_SUSPEND_MAX; i++) { |
| 239 | if (pm_states[i]) | 241 | if (pm_states[i] && pm_ops && (!pm_ops->valid |
| 242 | ||(pm_ops->valid && pm_ops->valid(i)))) | ||
| 240 | s += sprintf(s,"%s ",pm_states[i]); | 243 | s += sprintf(s,"%s ",pm_states[i]); |
| 241 | } | 244 | } |
| 242 | s += sprintf(s,"\n"); | 245 | s += sprintf(s,"\n"); |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 6748de23e83c..d4fd96a135ab 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -53,3 +53,20 @@ extern void thaw_processes(void); | |||
| 53 | 53 | ||
| 54 | extern int pm_prepare_console(void); | 54 | extern int pm_prepare_console(void); |
| 55 | extern void pm_restore_console(void); | 55 | extern void pm_restore_console(void); |
| 56 | |||
| 57 | |||
| 58 | /* References to section boundaries */ | ||
| 59 | extern const void __nosave_begin, __nosave_end; | ||
| 60 | |||
| 61 | extern unsigned int nr_copy_pages; | ||
| 62 | extern suspend_pagedir_t *pagedir_nosave; | ||
| 63 | extern suspend_pagedir_t *pagedir_save; | ||
| 64 | |||
| 65 | extern asmlinkage int swsusp_arch_suspend(void); | ||
| 66 | extern asmlinkage int swsusp_arch_resume(void); | ||
| 67 | |||
| 68 | extern int restore_highmem(void); | ||
| 69 | extern struct pbe * alloc_pagedir(unsigned nr_pages); | ||
| 70 | extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); | ||
| 71 | extern void swsusp_free(void); | ||
| 72 | extern int enough_swap(unsigned nr_pages); | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..42a628704398 --- /dev/null +++ b/kernel/power/snapshot.c | |||
| @@ -0,0 +1,435 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/power/snapshot.c | ||
| 3 | * | ||
| 4 | * This file provides system snapshot/restore functionality. | ||
| 5 | * | ||
| 6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | ||
| 7 | * | ||
| 8 | * This file is released under the GPLv2, and is based on swsusp.c. | ||
| 9 | * | ||
| 10 | */ | ||
| 11 | |||
| 12 | |||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/mm.h> | ||
| 15 | #include <linux/suspend.h> | ||
| 16 | #include <linux/smp_lock.h> | ||
| 17 | #include <linux/delay.h> | ||
| 18 | #include <linux/bitops.h> | ||
| 19 | #include <linux/spinlock.h> | ||
| 20 | #include <linux/kernel.h> | ||
| 21 | #include <linux/pm.h> | ||
| 22 | #include <linux/device.h> | ||
| 23 | #include <linux/bootmem.h> | ||
| 24 | #include <linux/syscalls.h> | ||
| 25 | #include <linux/console.h> | ||
| 26 | #include <linux/highmem.h> | ||
| 27 | |||
| 28 | #include <asm/uaccess.h> | ||
| 29 | #include <asm/mmu_context.h> | ||
| 30 | #include <asm/pgtable.h> | ||
| 31 | #include <asm/tlbflush.h> | ||
| 32 | #include <asm/io.h> | ||
| 33 | |||
| 34 | #include "power.h" | ||
| 35 | |||
| 36 | #ifdef CONFIG_HIGHMEM | ||
| 37 | struct highmem_page { | ||
| 38 | char *data; | ||
| 39 | struct page *page; | ||
| 40 | struct highmem_page *next; | ||
| 41 | }; | ||
| 42 | |||
| 43 | static struct highmem_page *highmem_copy; | ||
| 44 | |||
| 45 | static int save_highmem_zone(struct zone *zone) | ||
| 46 | { | ||
| 47 | unsigned long zone_pfn; | ||
| 48 | mark_free_pages(zone); | ||
| 49 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
| 50 | struct page *page; | ||
| 51 | struct highmem_page *save; | ||
| 52 | void *kaddr; | ||
| 53 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
| 54 | |||
| 55 | if (!(pfn%1000)) | ||
| 56 | printk("."); | ||
| 57 | if (!pfn_valid(pfn)) | ||
| 58 | continue; | ||
| 59 | page = pfn_to_page(pfn); | ||
| 60 | /* | ||
| 61 | * This condition results from rvmalloc() sans vmalloc_32() | ||
| 62 | * and architectural memory reservations. This should be | ||
| 63 | * corrected eventually when the cases giving rise to this | ||
| 64 | * are better understood. | ||
| 65 | */ | ||
| 66 | if (PageReserved(page)) { | ||
| 67 | printk("highmem reserved page?!\n"); | ||
| 68 | continue; | ||
| 69 | } | ||
| 70 | BUG_ON(PageNosave(page)); | ||
| 71 | if (PageNosaveFree(page)) | ||
| 72 | continue; | ||
| 73 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
| 74 | if (!save) | ||
| 75 | return -ENOMEM; | ||
| 76 | save->next = highmem_copy; | ||
| 77 | save->page = page; | ||
| 78 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
| 79 | if (!save->data) { | ||
| 80 | kfree(save); | ||
| 81 | return -ENOMEM; | ||
| 82 | } | ||
| 83 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 84 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
| 85 | kunmap_atomic(kaddr, KM_USER0); | ||
| 86 | highmem_copy = save; | ||
| 87 | } | ||
| 88 | return 0; | ||
| 89 | } | ||
| 90 | |||
| 91 | |||
| 92 | static int save_highmem(void) | ||
| 93 | { | ||
| 94 | struct zone *zone; | ||
| 95 | int res = 0; | ||
| 96 | |||
| 97 | pr_debug("swsusp: Saving Highmem\n"); | ||
| 98 | for_each_zone (zone) { | ||
| 99 | if (is_highmem(zone)) | ||
| 100 | res = save_highmem_zone(zone); | ||
| 101 | if (res) | ||
| 102 | return res; | ||
| 103 | } | ||
| 104 | return 0; | ||
| 105 | } | ||
| 106 | |||
| 107 | int restore_highmem(void) | ||
| 108 | { | ||
| 109 | printk("swsusp: Restoring Highmem\n"); | ||
| 110 | while (highmem_copy) { | ||
| 111 | struct highmem_page *save = highmem_copy; | ||
| 112 | void *kaddr; | ||
| 113 | highmem_copy = save->next; | ||
| 114 | |||
| 115 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
| 116 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
| 117 | kunmap_atomic(kaddr, KM_USER0); | ||
| 118 | free_page((long) save->data); | ||
| 119 | kfree(save); | ||
| 120 | } | ||
| 121 | return 0; | ||
| 122 | } | ||
| 123 | #else | ||
| 124 | static int save_highmem(void) { return 0; } | ||
| 125 | int restore_highmem(void) { return 0; } | ||
| 126 | #endif /* CONFIG_HIGHMEM */ | ||
| 127 | |||
| 128 | |||
| 129 | static int pfn_is_nosave(unsigned long pfn) | ||
| 130 | { | ||
| 131 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
| 132 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
| 133 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
| 134 | } | ||
| 135 | |||
| 136 | /** | ||
| 137 | * saveable - Determine whether a page should be cloned or not. | ||
| 138 | * @pfn: The page | ||
| 139 | * | ||
| 140 | * We save a page if it's Reserved, and not in the range of pages | ||
| 141 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
| 142 | * isn't part of a free chunk of pages. | ||
| 143 | */ | ||
| 144 | |||
| 145 | static int saveable(struct zone *zone, unsigned long *zone_pfn) | ||
| 146 | { | ||
| 147 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
| 148 | struct page *page; | ||
| 149 | |||
| 150 | if (!pfn_valid(pfn)) | ||
| 151 | return 0; | ||
| 152 | |||
| 153 | page = pfn_to_page(pfn); | ||
| 154 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
| 155 | if (PageNosave(page)) | ||
| 156 | return 0; | ||
| 157 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
| 158 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
| 159 | return 0; | ||
| 160 | } | ||
| 161 | if (PageNosaveFree(page)) | ||
| 162 | return 0; | ||
| 163 | |||
| 164 | return 1; | ||
| 165 | } | ||
| 166 | |||
| 167 | static unsigned count_data_pages(void) | ||
| 168 | { | ||
| 169 | struct zone *zone; | ||
| 170 | unsigned long zone_pfn; | ||
| 171 | unsigned n; | ||
| 172 | |||
| 173 | n = 0; | ||
| 174 | for_each_zone (zone) { | ||
| 175 | if (is_highmem(zone)) | ||
| 176 | continue; | ||
| 177 | mark_free_pages(zone); | ||
| 178 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
| 179 | n += saveable(zone, &zone_pfn); | ||
| 180 | } | ||
| 181 | return n; | ||
| 182 | } | ||
| 183 | |||
| 184 | static void copy_data_pages(struct pbe *pblist) | ||
| 185 | { | ||
| 186 | struct zone *zone; | ||
| 187 | unsigned long zone_pfn; | ||
| 188 | struct pbe *pbe, *p; | ||
| 189 | |||
| 190 | pbe = pblist; | ||
| 191 | for_each_zone (zone) { | ||
| 192 | if (is_highmem(zone)) | ||
| 193 | continue; | ||
| 194 | mark_free_pages(zone); | ||
| 195 | /* This is necessary for swsusp_free() */ | ||
| 196 | for_each_pb_page (p, pblist) | ||
| 197 | SetPageNosaveFree(virt_to_page(p)); | ||
| 198 | for_each_pbe (p, pblist) | ||
| 199 | SetPageNosaveFree(virt_to_page(p->address)); | ||
| 200 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
| 201 | if (saveable(zone, &zone_pfn)) { | ||
| 202 | struct page *page; | ||
| 203 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
| 204 | BUG_ON(!pbe); | ||
| 205 | pbe->orig_address = (unsigned long)page_address(page); | ||
| 206 | /* copy_page is not usable for copying task structs. */ | ||
| 207 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
| 208 | pbe = pbe->next; | ||
| 209 | } | ||
| 210 | } | ||
| 211 | } | ||
| 212 | BUG_ON(pbe); | ||
| 213 | } | ||
| 214 | |||
| 215 | |||
| 216 | /** | ||
| 217 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
| 218 | */ | ||
| 219 | |||
| 220 | static void free_pagedir(struct pbe *pblist) | ||
| 221 | { | ||
| 222 | struct pbe *pbe; | ||
| 223 | |||
| 224 | while (pblist) { | ||
| 225 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
| 226 | ClearPageNosave(virt_to_page(pblist)); | ||
| 227 | ClearPageNosaveFree(virt_to_page(pblist)); | ||
| 228 | free_page((unsigned long)pblist); | ||
| 229 | pblist = pbe; | ||
| 230 | } | ||
| 231 | } | ||
| 232 | |||
| 233 | /** | ||
| 234 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
| 235 | */ | ||
| 236 | |||
| 237 | static inline void fill_pb_page(struct pbe *pbpage) | ||
| 238 | { | ||
| 239 | struct pbe *p; | ||
| 240 | |||
| 241 | p = pbpage; | ||
| 242 | pbpage += PB_PAGE_SKIP; | ||
| 243 | do | ||
| 244 | p->next = p + 1; | ||
| 245 | while (++p < pbpage); | ||
| 246 | } | ||
| 247 | |||
| 248 | /** | ||
| 249 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
| 250 | * of memory pages allocated with alloc_pagedir() | ||
| 251 | */ | ||
| 252 | |||
| 253 | void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
| 254 | { | ||
| 255 | struct pbe *pbpage, *p; | ||
| 256 | unsigned num = PBES_PER_PAGE; | ||
| 257 | |||
| 258 | for_each_pb_page (pbpage, pblist) { | ||
| 259 | if (num >= nr_pages) | ||
| 260 | break; | ||
| 261 | |||
| 262 | fill_pb_page(pbpage); | ||
| 263 | num += PBES_PER_PAGE; | ||
| 264 | } | ||
| 265 | if (pbpage) { | ||
| 266 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
| 267 | p->next = p + 1; | ||
| 268 | p->next = NULL; | ||
| 269 | } | ||
| 270 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
| 271 | } | ||
| 272 | |||
| 273 | static void *alloc_image_page(void) | ||
| 274 | { | ||
| 275 | void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
| 276 | if (res) { | ||
| 277 | SetPageNosave(virt_to_page(res)); | ||
| 278 | SetPageNosaveFree(virt_to_page(res)); | ||
| 279 | } | ||
| 280 | return res; | ||
| 281 | } | ||
| 282 | |||
| 283 | /** | ||
| 284 | * alloc_pagedir - Allocate the page directory. | ||
| 285 | * | ||
| 286 | * First, determine exactly how many pages we need and | ||
| 287 | * allocate them. | ||
| 288 | * | ||
| 289 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
| 290 | * struct pbe elements (pbes) and the last element in the page points | ||
| 291 | * to the next page. | ||
| 292 | * | ||
| 293 | * On each page we set up a list of struct pbe elements. | ||
| 294 | */ | ||
| 295 | |||
| 296 | struct pbe *alloc_pagedir(unsigned nr_pages) | ||
| 297 | { | ||
| 298 | unsigned num; | ||
| 299 | struct pbe *pblist, *pbe; | ||
| 300 | |||
| 301 | if (!nr_pages) | ||
| 302 | return NULL; | ||
| 303 | |||
| 304 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
| 305 | pblist = alloc_image_page(); | ||
| 306 | /* FIXME: rewrite this ugly loop */ | ||
| 307 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
| 308 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
| 309 | pbe += PB_PAGE_SKIP; | ||
| 310 | pbe->next = alloc_image_page(); | ||
| 311 | } | ||
| 312 | if (!pbe) { /* get_zeroed_page() failed */ | ||
| 313 | free_pagedir(pblist); | ||
| 314 | pblist = NULL; | ||
| 315 | } | ||
| 316 | return pblist; | ||
| 317 | } | ||
| 318 | |||
| 319 | /** | ||
| 320 | * Free pages we allocated for suspend. Suspend pages are allocated | ||
| 321 | * before atomic copy, so we need to free them after resume. | ||
| 322 | */ | ||
| 323 | |||
| 324 | void swsusp_free(void) | ||
| 325 | { | ||
| 326 | struct zone *zone; | ||
| 327 | unsigned long zone_pfn; | ||
| 328 | |||
| 329 | for_each_zone(zone) { | ||
| 330 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
| 331 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { | ||
| 332 | struct page * page; | ||
| 333 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
| 334 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
| 335 | ClearPageNosave(page); | ||
| 336 | ClearPageNosaveFree(page); | ||
| 337 | free_page((long) page_address(page)); | ||
| 338 | } | ||
| 339 | } | ||
| 340 | } | ||
| 341 | } | ||
| 342 | |||
| 343 | |||
| 344 | /** | ||
| 345 | * enough_free_mem - Make sure we have enough free memory to snapshot. | ||
| 346 | * | ||
| 347 | * Returns TRUE or FALSE after checking the number of available | ||
| 348 | * free pages. | ||
| 349 | */ | ||
| 350 | |||
| 351 | static int enough_free_mem(unsigned nr_pages) | ||
| 352 | { | ||
| 353 | pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); | ||
| 354 | return nr_free_pages() > (nr_pages + PAGES_FOR_IO + | ||
| 355 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
| 356 | } | ||
| 357 | |||
| 358 | |||
| 359 | static struct pbe *swsusp_alloc(unsigned nr_pages) | ||
| 360 | { | ||
| 361 | struct pbe *pblist, *p; | ||
| 362 | |||
| 363 | if (!(pblist = alloc_pagedir(nr_pages))) { | ||
| 364 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
| 365 | return NULL; | ||
| 366 | } | ||
| 367 | create_pbe_list(pblist, nr_pages); | ||
| 368 | |||
| 369 | for_each_pbe (p, pblist) { | ||
| 370 | p->address = (unsigned long)alloc_image_page(); | ||
| 371 | if (!p->address) { | ||
| 372 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
| 373 | swsusp_free(); | ||
| 374 | return NULL; | ||
| 375 | } | ||
| 376 | } | ||
| 377 | |||
| 378 | return pblist; | ||
| 379 | } | ||
| 380 | |||
| 381 | asmlinkage int swsusp_save(void) | ||
| 382 | { | ||
| 383 | unsigned nr_pages; | ||
| 384 | |||
| 385 | pr_debug("swsusp: critical section: \n"); | ||
| 386 | if (save_highmem()) { | ||
| 387 | printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); | ||
| 388 | restore_highmem(); | ||
| 389 | return -ENOMEM; | ||
| 390 | } | ||
| 391 | |||
| 392 | drain_local_pages(); | ||
| 393 | nr_pages = count_data_pages(); | ||
| 394 | printk("swsusp: Need to copy %u pages\n", nr_pages); | ||
| 395 | |||
| 396 | pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", | ||
| 397 | nr_pages, | ||
| 398 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | ||
| 399 | PAGES_FOR_IO, nr_free_pages()); | ||
| 400 | |||
| 401 | /* This is needed because of the fixed size of swsusp_info */ | ||
| 402 | if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) | ||
| 403 | return -ENOSPC; | ||
| 404 | |||
| 405 | if (!enough_free_mem(nr_pages)) { | ||
| 406 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | ||
| 407 | return -ENOMEM; | ||
| 408 | } | ||
| 409 | |||
| 410 | if (!enough_swap(nr_pages)) { | ||
| 411 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
| 412 | return -ENOSPC; | ||
| 413 | } | ||
| 414 | |||
| 415 | pagedir_nosave = swsusp_alloc(nr_pages); | ||
| 416 | if (!pagedir_nosave) | ||
| 417 | return -ENOMEM; | ||
| 418 | |||
| 419 | /* During allocation of the suspend pagedir, new cold pages may appear. | ||
| 420 | * Kill them. | ||
| 421 | */ | ||
| 422 | drain_local_pages(); | ||
| 423 | copy_data_pages(pagedir_nosave); | ||
| 424 | |||
| 425 | /* | ||
| 426 | * End of critical section. From now on, we can write to memory, | ||
| 427 | * but we should not touch disk. This especially means we must _not_ | ||
| 428 | * touch swap space! Except we must write out our image of course. | ||
| 429 | */ | ||
| 430 | |||
| 431 | nr_copy_pages = nr_pages; | ||
| 432 | |||
| 433 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | ||
| 434 | return 0; | ||
| 435 | } | ||
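The sizing logic in enough_free_mem() and swsusp_save() above charges three things against free memory: the data pages themselves, one extra page of pbe slots for every PBES_PER_PAGE data pages (the page directory), and the PAGES_FOR_IO reserve (512 pages in the definition removed from swsusp.c below). A worked example with hedged figures, assuming 4 KB pages and a three-word struct pbe on 32-bit, i.e. PBES_PER_PAGE = 4096 / 12 = 341:

	nr_pages       = 20480			/* an 80 MB image             */
	pagedir pages  = (20480 + 341 - 1) / 341 = 61
	pages required = 20480 + 61 + 512 = 21053	/* roughly 82 MB must be free */

If that check fails, swsusp_save() returns -ENOMEM before any data page is copied.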
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 016504ccfccf..12db1d2ad61f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -1,11 +1,10 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/kernel/power/swsusp.c | 2 | * linux/kernel/power/swsusp.c |
| 3 | * | 3 | * |
| 4 | * This file is to realize architecture-independent | 4 | * This file provides code to write suspend image to swap and read it back. |
| 5 | * machine suspend feature using pretty near only high-level routines | ||
| 6 | * | 5 | * |
| 7 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | 6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> |
| 8 | * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> |
| 9 | * | 8 | * |
| 10 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
| 11 | * | 10 | * |
| @@ -47,11 +46,7 @@ | |||
| 47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> |
| 48 | #include <linux/version.h> | 47 | #include <linux/version.h> |
| 49 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
| 50 | #include <linux/reboot.h> | ||
| 51 | #include <linux/bitops.h> | 49 | #include <linux/bitops.h> |
| 52 | #include <linux/vt_kern.h> | ||
| 53 | #include <linux/kbd_kern.h> | ||
| 54 | #include <linux/keyboard.h> | ||
| 55 | #include <linux/spinlock.h> | 50 | #include <linux/spinlock.h> |
| 56 | #include <linux/genhd.h> | 51 | #include <linux/genhd.h> |
| 57 | #include <linux/kernel.h> | 52 | #include <linux/kernel.h> |
| @@ -63,10 +58,8 @@ | |||
| 63 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
| 64 | #include <linux/bootmem.h> | 59 | #include <linux/bootmem.h> |
| 65 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
| 66 | #include <linux/console.h> | ||
| 67 | #include <linux/highmem.h> | 61 | #include <linux/highmem.h> |
| 68 | #include <linux/bio.h> | 62 | #include <linux/bio.h> |
| 69 | #include <linux/mount.h> | ||
| 70 | 63 | ||
| 71 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
| 72 | #include <asm/mmu_context.h> | 65 | #include <asm/mmu_context.h> |
| @@ -84,16 +77,10 @@ | |||
| 84 | #define MAXKEY 32 | 77 | #define MAXKEY 32 |
| 85 | #define MAXIV 32 | 78 | #define MAXIV 32 |
| 86 | 79 | ||
| 87 | /* References to section boundaries */ | ||
| 88 | extern const void __nosave_begin, __nosave_end; | ||
| 89 | |||
| 90 | /* Variables to be preserved over suspend */ | ||
| 91 | static int nr_copy_pages_check; | ||
| 92 | |||
| 93 | extern char resume_file[]; | 80 | extern char resume_file[]; |
| 94 | 81 | ||
| 95 | /* Local variables that should not be affected by save */ | 82 | /* Local variables that should not be affected by save */ |
| 96 | static unsigned int nr_copy_pages __nosavedata = 0; | 83 | unsigned int nr_copy_pages __nosavedata = 0; |
| 97 | 84 | ||
| 98 | /* Suspend pagedir is allocated before final copy, therefore it | 85 | /* Suspend pagedir is allocated before final copy, therefore it |
| 99 | must be freed after resume | 86 | must be freed after resume |
| @@ -109,7 +96,7 @@ static unsigned int nr_copy_pages __nosavedata = 0; | |||
| 109 | MMU hardware. | 96 | MMU hardware. |
| 110 | */ | 97 | */ |
| 111 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | 98 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; |
| 112 | static suspend_pagedir_t *pagedir_save; | 99 | suspend_pagedir_t *pagedir_save; |
| 113 | 100 | ||
| 114 | #define SWSUSP_SIG "S1SUSPEND" | 101 | #define SWSUSP_SIG "S1SUSPEND" |
| 115 | 102 | ||
| @@ -124,12 +111,6 @@ static struct swsusp_header { | |||
| 124 | static struct swsusp_info swsusp_info; | 111 | static struct swsusp_info swsusp_info; |
| 125 | 112 | ||
| 126 | /* | 113 | /* |
| 127 | * XXX: We try to keep some more pages free so that I/O operations succeed | ||
| 128 | * without paging. Might this be more? | ||
| 129 | */ | ||
| 130 | #define PAGES_FOR_IO 512 | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Saving part... | 114 | * Saving part... |
| 134 | */ | 115 | */ |
| 135 | 116 | ||
| @@ -552,353 +533,6 @@ static int write_suspend_image(void) | |||
| 552 | goto Done; | 533 | goto Done; |
| 553 | } | 534 | } |
| 554 | 535 | ||
| 555 | |||
| 556 | #ifdef CONFIG_HIGHMEM | ||
| 557 | struct highmem_page { | ||
| 558 | char *data; | ||
| 559 | struct page *page; | ||
| 560 | struct highmem_page *next; | ||
| 561 | }; | ||
| 562 | |||
| 563 | static struct highmem_page *highmem_copy; | ||
| 564 | |||
| 565 | static int save_highmem_zone(struct zone *zone) | ||
| 566 | { | ||
| 567 | unsigned long zone_pfn; | ||
| 568 | mark_free_pages(zone); | ||
| 569 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
| 570 | struct page *page; | ||
| 571 | struct highmem_page *save; | ||
| 572 | void *kaddr; | ||
| 573 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
| 574 | |||
| 575 | if (!(pfn%1000)) | ||
| 576 | printk("."); | ||
| 577 | if (!pfn_valid(pfn)) | ||
| 578 | continue; | ||
| 579 | page = pfn_to_page(pfn); | ||
| 580 | /* | ||
| 581 | * PageReserved results from rvmalloc() sans vmalloc_32() | ||
| 582 | * and architectural memory reservations. | ||
| 583 | * | ||
| 584 | * rvmalloc should not cause this, because all implementations | ||
| 585 | * appear to always be using vmalloc_32 on architectures with | ||
| 586 | * highmem. This is a good thing, because we would like to save | ||
| 587 | * rvmalloc pages. | ||
| 588 | * | ||
| 589 | * It appears to be triggered by pages which do not point to | ||
| 590 | * valid memory (see arch/i386/mm/init.c:one_highpage_init(), | ||
| 591 | * which sets PageReserved if the page does not point to valid | ||
| 592 | * RAM. | ||
| 593 | * | ||
| 594 | * XXX: must remove usage of PageReserved! | ||
| 595 | */ | ||
| 596 | if (PageReserved(page)) | ||
| 597 | continue; | ||
| 598 | BUG_ON(PageNosave(page)); | ||
| 599 | if (PageNosaveFree(page)) | ||
| 600 | continue; | ||
| 601 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
| 602 | if (!save) | ||
| 603 | return -ENOMEM; | ||
| 604 | save->next = highmem_copy; | ||
| 605 | save->page = page; | ||
| 606 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
| 607 | if (!save->data) { | ||
| 608 | kfree(save); | ||
| 609 | return -ENOMEM; | ||
| 610 | } | ||
| 611 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 612 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
| 613 | kunmap_atomic(kaddr, KM_USER0); | ||
| 614 | highmem_copy = save; | ||
| 615 | } | ||
| 616 | return 0; | ||
| 617 | } | ||
| 618 | #endif /* CONFIG_HIGHMEM */ | ||
| 619 | |||
| 620 | |||
| 621 | static int save_highmem(void) | ||
| 622 | { | ||
| 623 | #ifdef CONFIG_HIGHMEM | ||
| 624 | struct zone *zone; | ||
| 625 | int res = 0; | ||
| 626 | |||
| 627 | pr_debug("swsusp: Saving Highmem\n"); | ||
| 628 | for_each_zone (zone) { | ||
| 629 | if (is_highmem(zone)) | ||
| 630 | res = save_highmem_zone(zone); | ||
| 631 | if (res) | ||
| 632 | return res; | ||
| 633 | } | ||
| 634 | #endif | ||
| 635 | return 0; | ||
| 636 | } | ||
| 637 | |||
| 638 | static int restore_highmem(void) | ||
| 639 | { | ||
| 640 | #ifdef CONFIG_HIGHMEM | ||
| 641 | printk("swsusp: Restoring Highmem\n"); | ||
| 642 | while (highmem_copy) { | ||
| 643 | struct highmem_page *save = highmem_copy; | ||
| 644 | void *kaddr; | ||
| 645 | highmem_copy = save->next; | ||
| 646 | |||
| 647 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
| 648 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
| 649 | kunmap_atomic(kaddr, KM_USER0); | ||
| 650 | free_page((long) save->data); | ||
| 651 | kfree(save); | ||
| 652 | } | ||
| 653 | #endif | ||
| 654 | return 0; | ||
| 655 | } | ||
| 656 | |||
| 657 | |||
| 658 | static int pfn_is_nosave(unsigned long pfn) | ||
| 659 | { | ||
| 660 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
| 661 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
| 662 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
| 663 | } | ||
| 664 | |||
| 665 | /** | ||
| 666 | * saveable - Determine whether a page should be cloned or not. | ||
| 667 | * @pfn: The page | ||
| 668 | * | ||
| 669 | * We save a page if it's Reserved, and not in the range of pages | ||
| 670 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
| 671 | * isn't part of a free chunk of pages. | ||
| 672 | */ | ||
| 673 | |||
| 674 | static int saveable(struct zone * zone, unsigned long * zone_pfn) | ||
| 675 | { | ||
| 676 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
| 677 | struct page * page; | ||
| 678 | |||
| 679 | if (!pfn_valid(pfn)) | ||
| 680 | return 0; | ||
| 681 | |||
| 682 | page = pfn_to_page(pfn); | ||
| 683 | if (PageNosave(page)) | ||
| 684 | return 0; | ||
| 685 | if (pfn_is_nosave(pfn)) { | ||
| 686 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
| 687 | return 0; | ||
| 688 | } | ||
| 689 | if (PageNosaveFree(page)) | ||
| 690 | return 0; | ||
| 691 | |||
| 692 | return 1; | ||
| 693 | } | ||
| 694 | |||
| 695 | static void count_data_pages(void) | ||
| 696 | { | ||
| 697 | struct zone *zone; | ||
| 698 | unsigned long zone_pfn; | ||
| 699 | |||
| 700 | nr_copy_pages = 0; | ||
| 701 | |||
| 702 | for_each_zone (zone) { | ||
| 703 | if (is_highmem(zone)) | ||
| 704 | continue; | ||
| 705 | mark_free_pages(zone); | ||
| 706 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
| 707 | nr_copy_pages += saveable(zone, &zone_pfn); | ||
| 708 | } | ||
| 709 | } | ||
| 710 | |||
| 711 | |||
| 712 | static void copy_data_pages(void) | ||
| 713 | { | ||
| 714 | struct zone *zone; | ||
| 715 | unsigned long zone_pfn; | ||
| 716 | struct pbe * pbe = pagedir_nosave; | ||
| 717 | |||
| 718 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | ||
| 719 | for_each_zone (zone) { | ||
| 720 | if (is_highmem(zone)) | ||
| 721 | continue; | ||
| 722 | mark_free_pages(zone); | ||
| 723 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
| 724 | if (saveable(zone, &zone_pfn)) { | ||
| 725 | struct page * page; | ||
| 726 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
| 727 | BUG_ON(!pbe); | ||
| 728 | pbe->orig_address = (long) page_address(page); | ||
| 729 | /* copy_page is not usable for copying task structs. */ | ||
| 730 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
| 731 | pbe = pbe->next; | ||
| 732 | } | ||
| 733 | } | ||
| 734 | } | ||
| 735 | BUG_ON(pbe); | ||
| 736 | } | ||
| 737 | |||
| 738 | |||
| 739 | /** | ||
| 740 | * calc_nr - Determine the number of pages needed for a pbe list. | ||
| 741 | */ | ||
| 742 | |||
| 743 | static int calc_nr(int nr_copy) | ||
| 744 | { | ||
| 745 | return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); | ||
| 746 | } | ||
| 747 | |||
| 748 | /** | ||
| 749 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
| 750 | */ | ||
| 751 | |||
| 752 | static inline void free_pagedir(struct pbe *pblist) | ||
| 753 | { | ||
| 754 | struct pbe *pbe; | ||
| 755 | |||
| 756 | while (pblist) { | ||
| 757 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
| 758 | free_page((unsigned long)pblist); | ||
| 759 | pblist = pbe; | ||
| 760 | } | ||
| 761 | } | ||
| 762 | |||
| 763 | /** | ||
| 764 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
| 765 | */ | ||
| 766 | |||
| 767 | static inline void fill_pb_page(struct pbe *pbpage) | ||
| 768 | { | ||
| 769 | struct pbe *p; | ||
| 770 | |||
| 771 | p = pbpage; | ||
| 772 | pbpage += PB_PAGE_SKIP; | ||
| 773 | do | ||
| 774 | p->next = p + 1; | ||
| 775 | while (++p < pbpage); | ||
| 776 | } | ||
| 777 | |||
| 778 | /** | ||
| 779 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
| 780 | * of memory pages allocated with alloc_pagedir() | ||
| 781 | */ | ||
| 782 | |||
| 783 | static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
| 784 | { | ||
| 785 | struct pbe *pbpage, *p; | ||
| 786 | unsigned num = PBES_PER_PAGE; | ||
| 787 | |||
| 788 | for_each_pb_page (pbpage, pblist) { | ||
| 789 | if (num >= nr_pages) | ||
| 790 | break; | ||
| 791 | |||
| 792 | fill_pb_page(pbpage); | ||
| 793 | num += PBES_PER_PAGE; | ||
| 794 | } | ||
| 795 | if (pbpage) { | ||
| 796 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
| 797 | p->next = p + 1; | ||
| 798 | p->next = NULL; | ||
| 799 | } | ||
| 800 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
| 801 | } | ||
| 802 | |||
| 803 | /** | ||
| 804 | * alloc_pagedir - Allocate the page directory. | ||
| 805 | * | ||
| 806 | * First, determine exactly how many pages we need and | ||
| 807 | * allocate them. | ||
| 808 | * | ||
| 809 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
| 810 | * struct pbe elements (pbes) and the last element in the page points | ||
| 811 | * to the next page. | ||
| 812 | * | ||
| 813 | * On each page we set up a list of struct_pbe elements. | ||
| 814 | */ | ||
| 815 | |||
| 816 | static struct pbe * alloc_pagedir(unsigned nr_pages) | ||
| 817 | { | ||
| 818 | unsigned num; | ||
| 819 | struct pbe *pblist, *pbe; | ||
| 820 | |||
| 821 | if (!nr_pages) | ||
| 822 | return NULL; | ||
| 823 | |||
| 824 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
| 825 | pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
| 826 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
| 827 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
| 828 | pbe += PB_PAGE_SKIP; | ||
| 829 | pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
| 830 | } | ||
| 831 | if (!pbe) { /* get_zeroed_page() failed */ | ||
| 832 | free_pagedir(pblist); | ||
| 833 | pblist = NULL; | ||
| 834 | } | ||
| 835 | return pblist; | ||
| 836 | } | ||
| 837 | |||
| 838 | /** | ||
| 839 | * free_image_pages - Free pages allocated for snapshot | ||
| 840 | */ | ||
| 841 | |||
| 842 | static void free_image_pages(void) | ||
| 843 | { | ||
| 844 | struct pbe * p; | ||
| 845 | |||
| 846 | for_each_pbe (p, pagedir_save) { | ||
| 847 | if (p->address) { | ||
| 848 | ClearPageNosave(virt_to_page(p->address)); | ||
| 849 | free_page(p->address); | ||
| 850 | p->address = 0; | ||
| 851 | } | ||
| 852 | } | ||
| 853 | } | ||
| 854 | |||
| 855 | /** | ||
| 856 | * alloc_image_pages - Allocate pages for the snapshot. | ||
| 857 | */ | ||
| 858 | |||
| 859 | static int alloc_image_pages(void) | ||
| 860 | { | ||
| 861 | struct pbe * p; | ||
| 862 | |||
| 863 | for_each_pbe (p, pagedir_save) { | ||
| 864 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
| 865 | if (!p->address) | ||
| 866 | return -ENOMEM; | ||
| 867 | SetPageNosave(virt_to_page(p->address)); | ||
| 868 | } | ||
| 869 | return 0; | ||
| 870 | } | ||
| 871 | |||
| 872 | /* Free pages we allocated for suspend. Suspend pages are alocated | ||
| 873 | * before atomic copy, so we need to free them after resume. | ||
| 874 | */ | ||
| 875 | void swsusp_free(void) | ||
| 876 | { | ||
| 877 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); | ||
| 878 | BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); | ||
| 879 | free_image_pages(); | ||
| 880 | free_pagedir(pagedir_save); | ||
| 881 | } | ||
| 882 | |||
| 883 | |||
| 884 | /** | ||
| 885 | * enough_free_mem - Make sure we enough free memory to snapshot. | ||
| 886 | * | ||
| 887 | * Returns TRUE or FALSE after checking the number of available | ||
| 888 | * free pages. | ||
| 889 | */ | ||
| 890 | |||
| 891 | static int enough_free_mem(void) | ||
| 892 | { | ||
| 893 | if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { | ||
| 894 | pr_debug("swsusp: Not enough free pages: Have %d\n", | ||
| 895 | nr_free_pages()); | ||
| 896 | return 0; | ||
| 897 | } | ||
| 898 | return 1; | ||
| 899 | } | ||
| 900 | |||
| 901 | |||
| 902 | /** | 536 | /** |
| 903 | * enough_swap - Make sure we have enough swap to save the image. | 537 | * enough_swap - Make sure we have enough swap to save the image. |
| 904 | * | 538 | * |
| @@ -909,87 +543,14 @@ static int enough_free_mem(void) | |||
| 909 | * We should only consider resume_device. | 543 | * We should only consider resume_device. |
| 910 | */ | 544 | */ |
| 911 | 545 | ||
| 912 | static int enough_swap(void) | 546 | int enough_swap(unsigned nr_pages) |
| 913 | { | 547 | { |
| 914 | struct sysinfo i; | 548 | struct sysinfo i; |
| 915 | 549 | ||
| 916 | si_swapinfo(&i); | 550 | si_swapinfo(&i); |
| 917 | if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { | 551 | pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); |
| 918 | pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); | 552 | return i.freeswap > (nr_pages + PAGES_FOR_IO + |
| 919 | return 0; | 553 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
| 920 | } | ||
| 921 | return 1; | ||
| 922 | } | ||
| 923 | |||
| 924 | static int swsusp_alloc(void) | ||
| 925 | { | ||
| 926 | int error; | ||
| 927 | |||
| 928 | pagedir_nosave = NULL; | ||
| 929 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
| 930 | nr_copy_pages_check = nr_copy_pages; | ||
| 931 | |||
| 932 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | ||
| 933 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | ||
| 934 | |||
| 935 | if (!enough_free_mem()) | ||
| 936 | return -ENOMEM; | ||
| 937 | |||
| 938 | if (!enough_swap()) | ||
| 939 | return -ENOSPC; | ||
| 940 | |||
| 941 | if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + | ||
| 942 | !!(nr_copy_pages % PBES_PER_PAGE)) | ||
| 943 | return -ENOSPC; | ||
| 944 | |||
| 945 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | ||
| 946 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
| 947 | return -ENOMEM; | ||
| 948 | } | ||
| 949 | create_pbe_list(pagedir_save, nr_copy_pages); | ||
| 950 | pagedir_nosave = pagedir_save; | ||
| 951 | if ((error = alloc_image_pages())) { | ||
| 952 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
| 953 | swsusp_free(); | ||
| 954 | return error; | ||
| 955 | } | ||
| 956 | |||
| 957 | return 0; | ||
| 958 | } | ||
| 959 | |||
| 960 | static int suspend_prepare_image(void) | ||
| 961 | { | ||
| 962 | int error; | ||
| 963 | |||
| 964 | pr_debug("swsusp: critical section: \n"); | ||
| 965 | if (save_highmem()) { | ||
| 966 | printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); | ||
| 967 | restore_highmem(); | ||
| 968 | return -ENOMEM; | ||
| 969 | } | ||
| 970 | |||
| 971 | drain_local_pages(); | ||
| 972 | count_data_pages(); | ||
| 973 | printk("swsusp: Need to copy %u pages\n", nr_copy_pages); | ||
| 974 | |||
| 975 | error = swsusp_alloc(); | ||
| 976 | if (error) | ||
| 977 | return error; | ||
| 978 | |||
| 979 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
| 980 | * Kill them. | ||
| 981 | */ | ||
| 982 | drain_local_pages(); | ||
| 983 | copy_data_pages(); | ||
| 984 | |||
| 985 | /* | ||
| 986 | * End of critical section. From now on, we can write to memory, | ||
| 987 | * but we should not touch disk. This specially means we must _not_ | ||
| 988 | * touch swap space! Except we must write out our image of course. | ||
| 989 | */ | ||
| 990 | |||
| 991 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); | ||
| 992 | return 0; | ||
| 993 | } | 554 | } |
| 994 | 555 | ||
| 995 | 556 | ||
| @@ -1001,7 +562,7 @@ static int suspend_prepare_image(void) | |||
| 1001 | int swsusp_write(void) | 562 | int swsusp_write(void) |
| 1002 | { | 563 | { |
| 1003 | int error; | 564 | int error; |
| 1004 | device_resume(); | 565 | |
| 1005 | lock_swapdevices(); | 566 | lock_swapdevices(); |
| 1006 | error = write_suspend_image(); | 567 | error = write_suspend_image(); |
| 1007 | /* This will unlock ignored swap devices since writing is finished */ | 568 | /* This will unlock ignored swap devices since writing is finished */ |
| @@ -1011,14 +572,6 @@ int swsusp_write(void) | |||
| 1011 | } | 572 | } |
| 1012 | 573 | ||
| 1013 | 574 | ||
| 1014 | extern asmlinkage int swsusp_arch_suspend(void); | ||
| 1015 | extern asmlinkage int swsusp_arch_resume(void); | ||
| 1016 | |||
| 1017 | |||
| 1018 | asmlinkage int swsusp_save(void) | ||
| 1019 | { | ||
| 1020 | return suspend_prepare_image(); | ||
| 1021 | } | ||
| 1022 | 575 | ||
| 1023 | int swsusp_suspend(void) | 576 | int swsusp_suspend(void) |
| 1024 | { | 577 | { |
| @@ -1050,7 +603,6 @@ int swsusp_suspend(void) | |||
| 1050 | printk(KERN_ERR "Error %d suspending\n", error); | 603 | printk(KERN_ERR "Error %d suspending\n", error); |
| 1051 | /* Restore control flow magically appears here */ | 604 | /* Restore control flow magically appears here */ |
| 1052 | restore_processor_state(); | 605 | restore_processor_state(); |
| 1053 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | ||
| 1054 | restore_highmem(); | 606 | restore_highmem(); |
| 1055 | device_power_up(); | 607 | device_power_up(); |
| 1056 | local_irq_enable(); | 608 | local_irq_enable(); |
| @@ -1070,6 +622,11 @@ int swsusp_resume(void) | |||
| 1070 | * execution continues at place where swsusp_arch_suspend was called | 622 | * execution continues at place where swsusp_arch_suspend was called |
| 1071 | */ | 623 | */ |
| 1072 | BUG_ON(!error); | 624 | BUG_ON(!error); |
| 625 | /* The only reason why swsusp_arch_resume() can fail is memory being | ||
| 626 | * very tight, so we have to free it as soon as we can to avoid | ||
| 627 | * subsequent failures | ||
| 628 | */ | ||
| 629 | swsusp_free(); | ||
| 1073 | restore_processor_state(); | 630 | restore_processor_state(); |
| 1074 | restore_highmem(); | 631 | restore_highmem(); |
| 1075 | touch_softlockup_watchdog(); | 632 | touch_softlockup_watchdog(); |
| @@ -1085,54 +642,28 @@ int swsusp_resume(void) | |||
| 1085 | * | 642 | * |
| 1086 | * We don't know which pages are usable until we allocate them. | 643 | * We don't know which pages are usable until we allocate them. |
| 1087 | * | 644 | * |
| 1088 | * Allocated but unusable (ie eaten) memory pages are linked together | 645 | * Allocated but unusable (ie eaten) memory pages are marked so that |
| 1089 | * to create a list, so that we can free them easily | 646 | * swsusp_free() can release them |
| 1090 | * | ||
| 1091 | * We could have used a type other than (void *) | ||
| 1092 | * for this purpose, but ... | ||
| 1093 | */ | 647 | */ |
| 1094 | static void **eaten_memory = NULL; | ||
| 1095 | |||
| 1096 | static inline void eat_page(void *page) | ||
| 1097 | { | ||
| 1098 | void **c; | ||
| 1099 | |||
| 1100 | c = eaten_memory; | ||
| 1101 | eaten_memory = page; | ||
| 1102 | *eaten_memory = c; | ||
| 1103 | } | ||
| 1104 | 648 | ||
| 1105 | unsigned long get_usable_page(gfp_t gfp_mask) | 649 | unsigned long get_safe_page(gfp_t gfp_mask) |
| 1106 | { | 650 | { |
| 1107 | unsigned long m; | 651 | unsigned long m; |
| 1108 | 652 | ||
| 1109 | m = get_zeroed_page(gfp_mask); | 653 | do { |
| 1110 | while (!PageNosaveFree(virt_to_page(m))) { | ||
| 1111 | eat_page((void *)m); | ||
| 1112 | m = get_zeroed_page(gfp_mask); | 654 | m = get_zeroed_page(gfp_mask); |
| 1113 | if (!m) | 655 | if (m && PageNosaveFree(virt_to_page(m))) |
| 1114 | break; | 656 | /* This is for swsusp_free() */ |
| 657 | SetPageNosave(virt_to_page(m)); | ||
| 658 | } while (m && PageNosaveFree(virt_to_page(m))); | ||
| 659 | if (m) { | ||
| 660 | /* This is for swsusp_free() */ | ||
| 661 | SetPageNosave(virt_to_page(m)); | ||
| 662 | SetPageNosaveFree(virt_to_page(m)); | ||
| 1115 | } | 663 | } |
| 1116 | return m; | 664 | return m; |
| 1117 | } | 665 | } |
| 1118 | 666 | ||
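The new get_safe_page() above boils down to an allocate-and-mark loop: keep allocating until a block that does not collide with the saved image comes back, and tag every allocation (usable or not) so one later sweep can release them all. A minimal userspace sketch of that pattern, with is_safe(), mark_allocated() and free_all_tracked() as illustrative stand-ins for the Nosave page flags and swsusp_free():

#include <stdlib.h>

#define TRACK_MAX 1024

static void *tracked[TRACK_MAX];        /* stands in for the Nosave page bits */
static int ntracked;

static void mark_allocated(void *p)
{
        if (ntracked < TRACK_MAX)       /* bounded tracking, enough for a sketch */
                tracked[ntracked++] = p;
}

/* Collision predicate; in swsusp this is the PageNosaveFree() test. */
static int is_safe(void *p)
{
        return ((unsigned long)p & 0x1000) == 0;        /* arbitrary stand-in */
}

/* Keep allocating until a safe block comes back; track every allocation. */
void *get_safe_block(size_t size)
{
        void *m;

        do {
                m = calloc(1, size);
                if (m)
                        mark_allocated(m);
        } while (m && !is_safe(m));

        return m;       /* NULL on allocation failure */
}

/* The swsusp_free() analogue: releases every tracked block, including
 * the ones get_safe_block() handed out. */
void free_all_tracked(void)
{
        while (ntracked > 0)
                free(tracked[--ntracked]);
}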
| 1119 | void free_eaten_memory(void) | ||
| 1120 | { | ||
| 1121 | unsigned long m; | ||
| 1122 | void **c; | ||
| 1123 | int i = 0; | ||
| 1124 | |||
| 1125 | c = eaten_memory; | ||
| 1126 | while (c) { | ||
| 1127 | m = (unsigned long)c; | ||
| 1128 | c = *c; | ||
| 1129 | free_page(m); | ||
| 1130 | i++; | ||
| 1131 | } | ||
| 1132 | eaten_memory = NULL; | ||
| 1133 | pr_debug("swsusp: %d unused pages freed\n", i); | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | /** | 667 | /** |
| 1137 | * check_pagedir - We ensure here that pages that the PBEs point to | 668 | * check_pagedir - We ensure here that pages that the PBEs point to |
| 1138 | * won't collide with pages where we're going to restore from the loaded | 669 | * won't collide with pages where we're going to restore from the loaded |
| @@ -1150,7 +681,7 @@ static int check_pagedir(struct pbe *pblist) | |||
| 1150 | p->address = 0UL; | 681 | p->address = 0UL; |
| 1151 | 682 | ||
| 1152 | for_each_pbe (p, pblist) { | 683 | for_each_pbe (p, pblist) { |
| 1153 | p->address = get_usable_page(GFP_ATOMIC); | 684 | p->address = get_safe_page(GFP_ATOMIC); |
| 1154 | if (!p->address) | 685 | if (!p->address) |
| 1155 | return -ENOMEM; | 686 | return -ENOMEM; |
| 1156 | } | 687 | } |
| @@ -1169,7 +700,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
| 1169 | unsigned long zone_pfn; | 700 | unsigned long zone_pfn; |
| 1170 | struct pbe *pbpage, *tail, *p; | 701 | struct pbe *pbpage, *tail, *p; |
| 1171 | void *m; | 702 | void *m; |
| 1172 | int rel = 0, error = 0; | 703 | int rel = 0; |
| 1173 | 704 | ||
| 1174 | if (!pblist) /* a sanity check */ | 705 | if (!pblist) /* a sanity check */ |
| 1175 | return NULL; | 706 | return NULL; |
| @@ -1177,41 +708,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
| 1177 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", | 708 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", |
| 1178 | swsusp_info.pagedir_pages); | 709 | swsusp_info.pagedir_pages); |
| 1179 | 710 | ||
| 1180 | /* Set page flags */ | 711 | /* Clear page flags */ |
| 1181 | 712 | ||
| 1182 | for_each_zone (zone) { | 713 | for_each_zone (zone) { |
| 1183 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 714 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
| 1184 | SetPageNosaveFree(pfn_to_page(zone_pfn + | 715 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) |
| 716 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
| 1185 | zone->zone_start_pfn)); | 717 | zone->zone_start_pfn)); |
| 1186 | } | 718 | } |
| 1187 | 719 | ||
| 1188 | /* Clear orig addresses */ | 720 | /* Mark orig addresses */ |
| 1189 | 721 | ||
| 1190 | for_each_pbe (p, pblist) | 722 | for_each_pbe (p, pblist) |
| 1191 | ClearPageNosaveFree(virt_to_page(p->orig_address)); | 723 | SetPageNosaveFree(virt_to_page(p->orig_address)); |
| 1192 | 724 | ||
| 1193 | tail = pblist + PB_PAGE_SKIP; | 725 | tail = pblist + PB_PAGE_SKIP; |
| 1194 | 726 | ||
| 1195 | /* Relocate colliding pages */ | 727 | /* Relocate colliding pages */ |
| 1196 | 728 | ||
| 1197 | for_each_pb_page (pbpage, pblist) { | 729 | for_each_pb_page (pbpage, pblist) { |
| 1198 | if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { | 730 | if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) { |
| 1199 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | 731 | m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD); |
| 1200 | if (!m) { | 732 | if (!m) |
| 1201 | error = -ENOMEM; | 733 | return NULL; |
| 1202 | break; | ||
| 1203 | } | ||
| 1204 | memcpy(m, (void *)pbpage, PAGE_SIZE); | 734 | memcpy(m, (void *)pbpage, PAGE_SIZE); |
| 1205 | if (pbpage == pblist) | 735 | if (pbpage == pblist) |
| 1206 | pblist = (struct pbe *)m; | 736 | pblist = (struct pbe *)m; |
| 1207 | else | 737 | else |
| 1208 | tail->next = (struct pbe *)m; | 738 | tail->next = (struct pbe *)m; |
| 1209 | |||
| 1210 | eat_page((void *)pbpage); | ||
| 1211 | pbpage = (struct pbe *)m; | 739 | pbpage = (struct pbe *)m; |
| 1212 | 740 | ||
| 1213 | /* We have to link the PBEs again */ | 741 | /* We have to link the PBEs again */ |
| 1214 | |||
| 1215 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) | 742 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) |
| 1216 | if (p->next) /* needed to save the end */ | 743 | if (p->next) /* needed to save the end */ |
| 1217 | p->next = p + 1; | 744 | p->next = p + 1; |
| @@ -1221,15 +748,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
| 1221 | tail = pbpage + PB_PAGE_SKIP; | 748 | tail = pbpage + PB_PAGE_SKIP; |
| 1222 | } | 749 | } |
| 1223 | 750 | ||
| 1224 | if (error) { | 751 | /* This is for swsusp_free() */ |
| 1225 | printk("\nswsusp: Out of memory\n\n"); | 752 | for_each_pb_page (pbpage, pblist) { |
| 1226 | free_pagedir(pblist); | 753 | SetPageNosave(virt_to_page(pbpage)); |
| 1227 | free_eaten_memory(); | 754 | SetPageNosaveFree(virt_to_page(pbpage)); |
| 1228 | pblist = NULL; | 755 | } |
| 1229 | /* Is this even worth handling? It should never ever happen, and we | 756 | |
| 1230 | have just lost user's state, anyway... */ | 757 | printk("swsusp: Relocated %d pages\n", rel); |
| 1231 | } else | ||
| 1232 | printk("swsusp: Relocated %d pages\n", rel); | ||
| 1233 | 758 | ||
| 1234 | return pblist; | 759 | return pblist; |
| 1235 | } | 760 | } |
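The relocation loop above applies one idea: when a list node sits on a page the restore path will need, copy the node into known-safe memory and splice the copy in, leaving the original in place for the image to overwrite. A rough userspace sketch of the same splice, with collides() as an illustrative stand-in for the PageNosaveFree() test and plain malloc() standing in for get_safe_page():

#include <stdlib.h>
#include <string.h>

struct node {
        struct node *next;
        unsigned long payload;
};

/* In swsusp this is the PageNosaveFree() test on the node's page. */
static int collides(struct node *n)
{
        return ((unsigned long)n & 0x800) != 0;         /* arbitrary stand-in */
}

/* Walk the list; copy colliding nodes into fresh memory and splice the
 * copies in.  The originals are deliberately not freed - in swsusp that
 * memory belongs to the image about to be restored.  malloc() is assumed
 * to return non-colliding memory here; the kernel uses get_safe_page(). */
struct node *relocate_list(struct node *head)
{
        struct node *prev = NULL, *n = head;

        while (n) {
                if (collides(n)) {
                        struct node *copy = malloc(sizeof(*copy));

                        if (!copy)
                                return NULL;
                        memcpy(copy, n, sizeof(*copy));
                        if (prev)
                                prev->next = copy;
                        else
                                head = copy;
                        n = copy;       /* continue from the relocated node */
                }
                prev = n;
                n = n->next;
        }
        return head;
}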
| @@ -1447,9 +972,7 @@ static int read_pagedir(struct pbe *pblist) | |||
| 1447 | break; | 972 | break; |
| 1448 | } | 973 | } |
| 1449 | 974 | ||
| 1450 | if (error) | 975 | if (!error) |
| 1451 | free_pagedir(pblist); | ||
| 1452 | else | ||
| 1453 | BUG_ON(i != swsusp_info.pagedir_pages); | 976 | BUG_ON(i != swsusp_info.pagedir_pages); |
| 1454 | 977 | ||
| 1455 | return error; | 978 | return error; |
| @@ -1492,15 +1015,6 @@ static int read_suspend_image(void) | |||
| 1492 | if (!error) | 1015 | if (!error) |
| 1493 | error = data_read(pagedir_nosave); | 1016 | error = data_read(pagedir_nosave); |
| 1494 | 1017 | ||
| 1495 | if (error) { /* We fail cleanly */ | ||
| 1496 | free_eaten_memory(); | ||
| 1497 | for_each_pbe (p, pagedir_nosave) | ||
| 1498 | if (p->address) { | ||
| 1499 | free_page(p->address); | ||
| 1500 | p->address = 0UL; | ||
| 1501 | } | ||
| 1502 | free_pagedir(pagedir_nosave); | ||
| 1503 | } | ||
| 1504 | return error; | 1018 | return error; |
| 1505 | } | 1019 | } |
| 1506 | 1020 | ||
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b8f0f9230a4..3cb9708209bc 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
| @@ -10,7 +10,7 @@ | |||
| 10 | * elsewhere, in preparation for a serial line console (someday). | 10 | * elsewhere, in preparation for a serial line console (someday). |
| 11 | * Ted Ts'o, 2/11/93. | 11 | * Ted Ts'o, 2/11/93. |
| 12 | * Modified for sysctl support, 1/8/97, Chris Horn. | 12 | * Modified for sysctl support, 1/8/97, Chris Horn. |
| 13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul | 13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul |
| 14 | * manfreds@colorfullife.com | 14 | * manfreds@colorfullife.com |
| 15 | * Rewrote bits to get rid of console_lock | 15 | * Rewrote bits to get rid of console_lock |
| 16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> | 16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> |
| @@ -148,7 +148,7 @@ static int __init console_setup(char *str) | |||
| 148 | if (!strcmp(str, "ttyb")) | 148 | if (!strcmp(str, "ttyb")) |
| 149 | strcpy(name, "ttyS1"); | 149 | strcpy(name, "ttyS1"); |
| 150 | #endif | 150 | #endif |
| 151 | for(s = name; *s; s++) | 151 | for (s = name; *s; s++) |
| 152 | if ((*s >= '0' && *s <= '9') || *s == ',') | 152 | if ((*s >= '0' && *s <= '9') || *s == ',') |
| 153 | break; | 153 | break; |
| 154 | idx = simple_strtoul(s, NULL, 10); | 154 | idx = simple_strtoul(s, NULL, 10); |
| @@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) | |||
| 169 | size = roundup_pow_of_two(size); | 169 | size = roundup_pow_of_two(size); |
| 170 | if (size > log_buf_len) { | 170 | if (size > log_buf_len) { |
| 171 | unsigned long start, dest_idx, offset; | 171 | unsigned long start, dest_idx, offset; |
| 172 | char * new_log_buf; | 172 | char *new_log_buf; |
| 173 | 173 | ||
| 174 | new_log_buf = alloc_bootmem(size); | 174 | new_log_buf = alloc_bootmem(size); |
| 175 | if (!new_log_buf) { | 175 | if (!new_log_buf) { |
| 176 | printk("log_buf_len: allocation failed\n"); | 176 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); |
| 177 | goto out; | 177 | goto out; |
| 178 | } | 178 | } |
| 179 | 179 | ||
| @@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) | |||
| 193 | log_end -= offset; | 193 | log_end -= offset; |
| 194 | spin_unlock_irqrestore(&logbuf_lock, flags); | 194 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 195 | 195 | ||
| 196 | printk("log_buf_len: %d\n", log_buf_len); | 196 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); |
| 197 | } | 197 | } |
| 198 | out: | 198 | out: |
| 199 | |||
| 200 | return 1; | 199 | return 1; |
| 201 | } | 200 | } |
| 202 | 201 | ||
| @@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); | |||
| 217 | * 9 -- Return number of unread characters in the log buffer | 216 | * 9 -- Return number of unread characters in the log buffer |
| 218 | * 10 -- Return size of the log buffer | 217 | * 10 -- Return size of the log buffer |
| 219 | */ | 218 | */ |
| 220 | int do_syslog(int type, char __user * buf, int len) | 219 | int do_syslog(int type, char __user *buf, int len) |
| 221 | { | 220 | { |
| 222 | unsigned long i, j, limit, count; | 221 | unsigned long i, j, limit, count; |
| 223 | int do_clear = 0; | 222 | int do_clear = 0; |
| @@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) | |||
| 244 | error = -EFAULT; | 243 | error = -EFAULT; |
| 245 | goto out; | 244 | goto out; |
| 246 | } | 245 | } |
| 247 | error = wait_event_interruptible(log_wait, (log_start - log_end)); | 246 | error = wait_event_interruptible(log_wait, |
| 247 | (log_start - log_end)); | ||
| 248 | if (error) | 248 | if (error) |
| 249 | goto out; | 249 | goto out; |
| 250 | i = 0; | 250 | i = 0; |
| @@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) | |||
| 264 | error = i; | 264 | error = i; |
| 265 | break; | 265 | break; |
| 266 | case 4: /* Read/clear last kernel messages */ | 266 | case 4: /* Read/clear last kernel messages */ |
| 267 | do_clear = 1; | 267 | do_clear = 1; |
| 268 | /* FALL THRU */ | 268 | /* FALL THRU */ |
| 269 | case 3: /* Read last kernel messages */ | 269 | case 3: /* Read last kernel messages */ |
| 270 | error = -EINVAL; | 270 | error = -EINVAL; |
| @@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) | |||
| 288 | limit = log_end; | 288 | limit = log_end; |
| 289 | /* | 289 | /* |
| 290 | * __put_user() could sleep, and while we sleep | 290 | * __put_user() could sleep, and while we sleep |
| 291 | * printk() could overwrite the messages | 291 | * printk() could overwrite the messages |
| 292 | * we try to copy to user space. Therefore | 292 | * we try to copy to user space. Therefore |
| 293 | * the messages are copied in reverse. <manfreds> | 293 | * the messages are copied in reverse. <manfreds> |
| 294 | */ | 294 | */ |
| 295 | for(i = 0; i < count && !error; i++) { | 295 | for (i = 0; i < count && !error; i++) { |
| 296 | j = limit-1-i; | 296 | j = limit-1-i; |
| 297 | if (j + log_buf_len < log_end) | 297 | if (j + log_buf_len < log_end) |
| 298 | break; | 298 | break; |
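The reverse copy in the hunk above works because the newest bytes are taken first and the loop bails out as soon as a source index has aged out of the window the ring buffer still holds, so a writer that runs while the copy sleeps can only truncate the result, never corrupt it. A small single-threaded sketch of just that index arithmetic, with RING_SIZE, ring_append() and copy_tail_reverse() as illustrative names:

#include <string.h>

#define RING_SIZE 64                            /* must be a power of two */

static char ring[RING_SIZE];
static unsigned long ring_end;                  /* total bytes ever appended */

#define RING(idx)       (ring[(idx) & (RING_SIZE - 1)])

static void ring_append(char c)
{
        RING(ring_end) = c;
        ring_end++;
}

/* Copy up to count bytes ending at 'limit', newest first.  The caller
 * guarantees count <= limit.  If a source index has aged out of the
 * ring (overwritten), stop; then left-compact whatever was copied.
 * Returns the number of bytes placed at the start of dst. */
static size_t copy_tail_reverse(char *dst, size_t count, unsigned long limit)
{
        size_t i;

        for (i = 0; i < count; i++) {
                unsigned long j = limit - 1 - i;

                if (j + RING_SIZE < ring_end)
                        break;
                dst[count - 1 - i] = RING(j);
        }
        if (i != count)
                memmove(dst, dst + (count - i), i);
        return i;
}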
| @@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) | |||
| 306 | if (error) | 306 | if (error) |
| 307 | break; | 307 | break; |
| 308 | error = i; | 308 | error = i; |
| 309 | if(i != count) { | 309 | if (i != count) { |
| 310 | int offset = count-error; | 310 | int offset = count-error; |
| 311 | /* buffer overflow during copy, correct user buffer. */ | 311 | /* buffer overflow during copy, correct user buffer. */ |
| 312 | for(i=0;i<error;i++) { | 312 | for (i = 0; i < error; i++) { |
| 313 | if (__get_user(c,&buf[i+offset]) || | 313 | if (__get_user(c,&buf[i+offset]) || |
| 314 | __put_user(c,&buf[i])) { | 314 | __put_user(c,&buf[i])) { |
| 315 | error = -EFAULT; | 315 | error = -EFAULT; |
| @@ -351,7 +351,7 @@ out: | |||
| 351 | return error; | 351 | return error; |
| 352 | } | 352 | } |
| 353 | 353 | ||
| 354 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 354 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
| 355 | { | 355 | { |
| 356 | return do_syslog(type, buf, len); | 356 | return do_syslog(type, buf, len); |
| 357 | } | 357 | } |
| @@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
| 404 | cur_index = start; | 404 | cur_index = start; |
| 405 | start_print = start; | 405 | start_print = start; |
| 406 | while (cur_index != end) { | 406 | while (cur_index != end) { |
| 407 | if ( msg_level < 0 && | 407 | if (msg_level < 0 && ((end - cur_index) > 2) && |
| 408 | ((end - cur_index) > 2) && | 408 | LOG_BUF(cur_index + 0) == '<' && |
| 409 | LOG_BUF(cur_index + 0) == '<' && | 409 | LOG_BUF(cur_index + 1) >= '0' && |
| 410 | LOG_BUF(cur_index + 1) >= '0' && | 410 | LOG_BUF(cur_index + 1) <= '7' && |
| 411 | LOG_BUF(cur_index + 1) <= '7' && | 411 | LOG_BUF(cur_index + 2) == '>') { |
| 412 | LOG_BUF(cur_index + 2) == '>') | ||
| 413 | { | ||
| 414 | msg_level = LOG_BUF(cur_index + 1) - '0'; | 412 | msg_level = LOG_BUF(cur_index + 1) - '0'; |
| 415 | cur_index += 3; | 413 | cur_index += 3; |
| 416 | start_print = cur_index; | 414 | start_print = cur_index; |
| 417 | } | 415 | } |
| 418 | while (cur_index != end) { | 416 | while (cur_index != end) { |
| 419 | char c = LOG_BUF(cur_index); | 417 | char c = LOG_BUF(cur_index); |
| 420 | cur_index++; | ||
| 421 | 418 | ||
| 419 | cur_index++; | ||
| 422 | if (c == '\n') { | 420 | if (c == '\n') { |
| 423 | if (msg_level < 0) { | 421 | if (msg_level < 0) { |
| 424 | /* | 422 | /* |
| @@ -461,7 +459,7 @@ static void zap_locks(void) | |||
| 461 | static unsigned long oops_timestamp; | 459 | static unsigned long oops_timestamp; |
| 462 | 460 | ||
| 463 | if (time_after_eq(jiffies, oops_timestamp) && | 461 | if (time_after_eq(jiffies, oops_timestamp) && |
| 464 | !time_after(jiffies, oops_timestamp + 30*HZ)) | 462 | !time_after(jiffies, oops_timestamp + 30 * HZ)) |
| 465 | return; | 463 | return; |
| 466 | 464 | ||
| 467 | oops_timestamp = jiffies; | 465 | oops_timestamp = jiffies; |
| @@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
| 495 | 493 | ||
| 496 | /* | 494 | /* |
| 497 | * This is printk. It can be called from any context. We want it to work. | 495 | * This is printk. It can be called from any context. We want it to work. |
| 498 | * | 496 | * |
| 499 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 497 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and |
| 500 | * call the console drivers. If we fail to get the semaphore we place the output | 498 | * call the console drivers. If we fail to get the semaphore we place the output |
| 501 | * into the log buffer and return. The current holder of the console_sem will | 499 | * into the log buffer and return. The current holder of the console_sem will |
| @@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk); | |||
| 639 | 637 | ||
| 640 | #else | 638 | #else |
| 641 | 639 | ||
| 642 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 640 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
| 643 | { | 641 | { |
| 644 | return 0; | 642 | return 0; |
| 645 | } | 643 | } |
| 646 | 644 | ||
| 647 | int do_syslog(int type, char __user * buf, int len) { return 0; } | 645 | int do_syslog(int type, char __user *buf, int len) |
| 648 | static void call_console_drivers(unsigned long start, unsigned long end) {} | 646 | { |
| 647 | return 0; | ||
| 648 | } | ||
| 649 | |||
| 650 | static void call_console_drivers(unsigned long start, unsigned long end) | ||
| 651 | { | ||
| 652 | } | ||
| 649 | 653 | ||
| 650 | #endif | 654 | #endif |
| 651 | 655 | ||
| @@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start); | |||
| 851 | * print any messages that were printed by the kernel before the | 855 | * print any messages that were printed by the kernel before the |
| 852 | * console driver was initialized. | 856 | * console driver was initialized. |
| 853 | */ | 857 | */ |
| 854 | void register_console(struct console * console) | 858 | void register_console(struct console *console) |
| 855 | { | 859 | { |
| 856 | int i; | 860 | int i; |
| 857 | unsigned long flags; | 861 | unsigned long flags; |
| 858 | 862 | ||
| 859 | if (preferred_console < 0) | 863 | if (preferred_console < 0) |
| @@ -878,7 +882,8 @@ void register_console(struct console * console) | |||
| 878 | * See if this console matches one we selected on | 882 | * See if this console matches one we selected on |
| 879 | * the command line. | 883 | * the command line. |
| 880 | */ | 884 | */ |
| 881 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { | 885 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; |
| 886 | i++) { | ||
| 882 | if (strcmp(console_cmdline[i].name, console->name) != 0) | 887 | if (strcmp(console_cmdline[i].name, console->name) != 0) |
| 883 | continue; | 888 | continue; |
| 884 | if (console->index >= 0 && | 889 | if (console->index >= 0 && |
| @@ -933,9 +938,9 @@ void register_console(struct console * console) | |||
| 933 | } | 938 | } |
| 934 | EXPORT_SYMBOL(register_console); | 939 | EXPORT_SYMBOL(register_console); |
| 935 | 940 | ||
| 936 | int unregister_console(struct console * console) | 941 | int unregister_console(struct console *console) |
| 937 | { | 942 | { |
| 938 | struct console *a,*b; | 943 | struct console *a, *b; |
| 939 | int res = 1; | 944 | int res = 1; |
| 940 | 945 | ||
| 941 | acquire_console_sem(); | 946 | acquire_console_sem(); |
| @@ -949,10 +954,10 @@ int unregister_console(struct console * console) | |||
| 949 | b->next = a->next; | 954 | b->next = a->next; |
| 950 | res = 0; | 955 | res = 0; |
| 951 | break; | 956 | break; |
| 952 | } | 957 | } |
| 953 | } | 958 | } |
| 954 | } | 959 | } |
| 955 | 960 | ||
| 956 | /* If last console is removed, we re-enable picking the first | 961 | /* If last console is removed, we re-enable picking the first |
| 957 | * one that gets registered. Without that, pmac early boot console | 962 | * one that gets registered. Without that, pmac early boot console |
| 958 | * would prevent fbcon from taking over. | 963 | * would prevent fbcon from taking over. |
| @@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) | |||
| 994 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | 999 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) |
| 995 | { | 1000 | { |
| 996 | static DEFINE_SPINLOCK(ratelimit_lock); | 1001 | static DEFINE_SPINLOCK(ratelimit_lock); |
| 997 | static unsigned long toks = 10*5*HZ; | 1002 | static unsigned long toks = 10 * 5 * HZ; |
| 998 | static unsigned long last_msg; | 1003 | static unsigned long last_msg; |
| 999 | static int missed; | 1004 | static int missed; |
| 1000 | unsigned long flags; | 1005 | unsigned long flags; |
| @@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
| 1007 | toks = ratelimit_burst * ratelimit_jiffies; | 1012 | toks = ratelimit_burst * ratelimit_jiffies; |
| 1008 | if (toks >= ratelimit_jiffies) { | 1013 | if (toks >= ratelimit_jiffies) { |
| 1009 | int lost = missed; | 1014 | int lost = missed; |
| 1015 | |||
| 1010 | missed = 0; | 1016 | missed = 0; |
| 1011 | toks -= ratelimit_jiffies; | 1017 | toks -= ratelimit_jiffies; |
| 1012 | spin_unlock_irqrestore(&ratelimit_lock, flags); | 1018 | spin_unlock_irqrestore(&ratelimit_lock, flags); |
| @@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
| 1021 | EXPORT_SYMBOL(__printk_ratelimit); | 1027 | EXPORT_SYMBOL(__printk_ratelimit); |
| 1022 | 1028 | ||
| 1023 | /* minimum time in jiffies between messages */ | 1029 | /* minimum time in jiffies between messages */ |
| 1024 | int printk_ratelimit_jiffies = 5*HZ; | 1030 | int printk_ratelimit_jiffies = 5 * HZ; |
| 1025 | 1031 | ||
| 1026 | /* number of messages we send before ratelimiting */ | 1032 | /* number of messages we send before ratelimiting */ |
| 1027 | int printk_ratelimit_burst = 10; | 1033 | int printk_ratelimit_burst = 10; |
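__printk_ratelimit() above is a token bucket: elapsed time earns tokens, capped at burst * interval, and each message that gets through spends one interval's worth. A minimal userspace sketch of the same accounting, assuming a caller-supplied millisecond clock and no locking (the kernel version works in jiffies under a spinlock):

#include <stdio.h>

struct ratelimit {
        long interval_ms;       /* minimum time between messages */
        long burst;             /* messages allowed before limiting kicks in */
        long toks;              /* current token balance, in milliseconds */
        long long last_ms;      /* timestamp of the previous call */
        long missed;            /* suppressed messages since the last report */
};

/* Returns 1 if the caller may emit a message now, 0 if it should stay
 * quiet.  Initialize last_ms to the current time and toks to
 * interval_ms * burst before first use. */
int ratelimit_ok(struct ratelimit *rl, long long now_ms)
{
        rl->toks += now_ms - rl->last_ms;
        rl->last_ms = now_ms;
        if (rl->toks > rl->interval_ms * rl->burst)
                rl->toks = rl->interval_ms * rl->burst;         /* cap the burst */
        if (rl->toks >= rl->interval_ms) {
                long lost = rl->missed;

                rl->missed = 0;
                rl->toks -= rl->interval_ms;
                if (lost)
                        fprintf(stderr, "ratelimit: %ld messages suppressed\n", lost);
                return 1;
        }
        rl->missed++;
        return 0;
}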
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 019e04ec065a..863eee8bff47 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
| @@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) | |||
| 56 | signal_wake_up(child, 1); | 56 | signal_wake_up(child, 1); |
| 57 | } | 57 | } |
| 58 | } | 58 | } |
| 59 | if (child->signal->flags & SIGNAL_GROUP_EXIT) { | ||
| 60 | sigaddset(&child->pending.signal, SIGKILL); | ||
| 61 | signal_wake_up(child, 1); | ||
| 62 | } | ||
| 59 | spin_unlock(&child->sighand->siglock); | 63 | spin_unlock(&child->sighand->siglock); |
| 60 | } | 64 | } |
| 61 | 65 | ||
| @@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) | |||
| 77 | SET_LINKS(child); | 81 | SET_LINKS(child); |
| 78 | } | 82 | } |
| 79 | 83 | ||
| 80 | if (child->state == TASK_TRACED) | 84 | ptrace_untrace(child); |
| 81 | ptrace_untrace(child); | ||
| 82 | } | 85 | } |
| 83 | 86 | ||
| 84 | /* | 87 | /* |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2559d4b8f23f..c4d159a21e04 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
| @@ -154,6 +154,15 @@ void fastcall call_rcu_bh(struct rcu_head *head, | |||
| 154 | } | 154 | } |
| 155 | 155 | ||
| 156 | /* | 156 | /* |
| 157 | * Return the number of RCU batches processed thus far. Useful | ||
| 158 | * for debug and statistics. | ||
| 159 | */ | ||
| 160 | long rcu_batches_completed(void) | ||
| 161 | { | ||
| 162 | return rcu_ctrlblk.completed; | ||
| 163 | } | ||
| 164 | |||
| 165 | /* | ||
| 157 | * Invoke the completed RCU callbacks. They are expected to be in | 166 | * Invoke the completed RCU callbacks. They are expected to be in |
| 158 | * a per-cpu list. | 167 | * a per-cpu list. |
| 159 | */ | 168 | */ |
| @@ -501,6 +510,7 @@ void synchronize_kernel(void) | |||
| 501 | } | 510 | } |
| 502 | 511 | ||
| 503 | module_param(maxbatch, int, 0); | 512 | module_param(maxbatch, int, 0); |
| 513 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 504 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ | 514 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ |
| 505 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 515 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ |
| 506 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 516 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
new file mode 100644
index 000000000000..9b58f1eff3ca
--- /dev/null
+++ b/kernel/rcutorture.c
| @@ -0,0 +1,492 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update /proc-based torture test facility | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2005 | ||
| 19 | * | ||
| 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
| 21 | * | ||
| 22 | * See also: Documentation/RCU/torture.txt | ||
| 23 | */ | ||
| 24 | #include <linux/types.h> | ||
| 25 | #include <linux/kernel.h> | ||
| 26 | #include <linux/init.h> | ||
| 27 | #include <linux/module.h> | ||
| 28 | #include <linux/kthread.h> | ||
| 29 | #include <linux/err.h> | ||
| 30 | #include <linux/spinlock.h> | ||
| 31 | #include <linux/smp.h> | ||
| 32 | #include <linux/rcupdate.h> | ||
| 33 | #include <linux/interrupt.h> | ||
| 34 | #include <linux/sched.h> | ||
| 35 | #include <asm/atomic.h> | ||
| 36 | #include <linux/bitops.h> | ||
| 37 | #include <linux/module.h> | ||
| 38 | #include <linux/completion.h> | ||
| 39 | #include <linux/moduleparam.h> | ||
| 40 | #include <linux/percpu.h> | ||
| 41 | #include <linux/notifier.h> | ||
| 42 | #include <linux/rcuref.h> | ||
| 43 | #include <linux/cpu.h> | ||
| 44 | #include <linux/random.h> | ||
| 45 | #include <linux/delay.h> | ||
| 46 | #include <linux/byteorder/swabb.h> | ||
| 47 | #include <linux/stat.h> | ||
| 48 | |||
| 49 | MODULE_LICENSE("GPL"); | ||
| 50 | |||
| 51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | ||
| 52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | ||
| 53 | /* Defaults to "only at end of test". */ | ||
| 54 | static int verbose = 0; /* Print more debug info. */ | ||
| 55 | |||
| 56 | MODULE_PARM(nreaders, "i"); | ||
| 57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
| 58 | MODULE_PARM(stat_interval, "i"); | ||
| 59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
| 60 | MODULE_PARM(verbose, "i"); | ||
| 61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
| 62 | #define TORTURE_FLAG "rcutorture: " | ||
| 63 | #define PRINTK_STRING(s) \ | ||
| 64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
| 65 | #define VERBOSE_PRINTK_STRING(s) \ | ||
| 66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
| 67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | ||
| 68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | ||
| 69 | |||
| 70 | static char printk_buf[4096]; | ||
| 71 | |||
| 72 | static int nrealreaders; | ||
| 73 | static struct task_struct *writer_task; | ||
| 74 | static struct task_struct **reader_tasks; | ||
| 75 | static struct task_struct *stats_task; | ||
| 76 | |||
| 77 | #define RCU_TORTURE_PIPE_LEN 10 | ||
| 78 | |||
| 79 | struct rcu_torture { | ||
| 80 | struct rcu_head rtort_rcu; | ||
| 81 | int rtort_pipe_count; | ||
| 82 | struct list_head rtort_free; | ||
| 83 | }; | ||
| 84 | |||
| 85 | static int fullstop = 0; /* stop generating callbacks at test end. */ | ||
| 86 | static LIST_HEAD(rcu_torture_freelist); | ||
| 87 | static struct rcu_torture *rcu_torture_current = NULL; | ||
| 88 | static long rcu_torture_current_version = 0; | ||
| 89 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | ||
| 90 | static DEFINE_SPINLOCK(rcu_torture_lock); | ||
| 91 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | ||
| 92 | { 0 }; | ||
| 93 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | ||
| 94 | { 0 }; | ||
| 95 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | ||
| 96 | atomic_t n_rcu_torture_alloc; | ||
| 97 | atomic_t n_rcu_torture_alloc_fail; | ||
| 98 | atomic_t n_rcu_torture_free; | ||
| 99 | |||
| 100 | /* | ||
| 101 | * Allocate an element from the rcu_tortures pool. | ||
| 102 | */ | ||
| 103 | struct rcu_torture * | ||
| 104 | rcu_torture_alloc(void) | ||
| 105 | { | ||
| 106 | struct list_head *p; | ||
| 107 | |||
| 108 | spin_lock(&rcu_torture_lock); | ||
| 109 | if (list_empty(&rcu_torture_freelist)) { | ||
| 110 | atomic_inc(&n_rcu_torture_alloc_fail); | ||
| 111 | spin_unlock(&rcu_torture_lock); | ||
| 112 | return NULL; | ||
| 113 | } | ||
| 114 | atomic_inc(&n_rcu_torture_alloc); | ||
| 115 | p = rcu_torture_freelist.next; | ||
| 116 | list_del_init(p); | ||
| 117 | spin_unlock(&rcu_torture_lock); | ||
| 118 | return container_of(p, struct rcu_torture, rtort_free); | ||
| 119 | } | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Free an element to the rcu_tortures pool. | ||
| 123 | */ | ||
| 124 | static void | ||
| 125 | rcu_torture_free(struct rcu_torture *p) | ||
| 126 | { | ||
| 127 | atomic_inc(&n_rcu_torture_free); | ||
| 128 | spin_lock(&rcu_torture_lock); | ||
| 129 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | ||
| 130 | spin_unlock(&rcu_torture_lock); | ||
| 131 | } | ||
| 132 | |||
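rcu_torture_alloc() and rcu_torture_free() above implement a fixed pool: a static array threaded onto a free list, a lock around the list operations, and counters for successes, failures and frees. A userspace sketch of the same shape, with a pthread mutex standing in for the spinlock and pool_init()/pool_alloc()/pool_free() as illustrative names:

#include <pthread.h>
#include <stddef.h>

#define POOL_SIZE 100

struct elem {
        struct elem *next_free;
        int payload;
};

static struct elem pool[POOL_SIZE];
static struct elem *freelist;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long n_alloc, n_alloc_fail, n_free;      /* like the rta/rtaf/rtf counters */

void pool_init(void)
{
        int i;

        freelist = NULL;
        for (i = 0; i < POOL_SIZE; i++) {
                pool[i].next_free = freelist;
                freelist = &pool[i];
        }
}

struct elem *pool_alloc(void)
{
        struct elem *e;

        pthread_mutex_lock(&pool_lock);
        e = freelist;
        if (e) {
                freelist = e->next_free;
                n_alloc++;
        } else {
                n_alloc_fail++;                 /* pool exhausted */
        }
        pthread_mutex_unlock(&pool_lock);
        return e;
}

void pool_free(struct elem *e)
{
        pthread_mutex_lock(&pool_lock);
        e->next_free = freelist;
        freelist = e;
        n_free++;
        pthread_mutex_unlock(&pool_lock);
}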
| 133 | static void | ||
| 134 | rcu_torture_cb(struct rcu_head *p) | ||
| 135 | { | ||
| 136 | int i; | ||
| 137 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
| 138 | |||
| 139 | if (fullstop) { | ||
| 140 | /* Test is ending, just drop callbacks on the floor. */ | ||
| 141 | /* The next initialization will pick up the pieces. */ | ||
| 142 | return; | ||
| 143 | } | ||
| 144 | i = rp->rtort_pipe_count; | ||
| 145 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 146 | i = RCU_TORTURE_PIPE_LEN; | ||
| 147 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 148 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) | ||
| 149 | rcu_torture_free(rp); | ||
| 150 | else | ||
| 151 | call_rcu(p, rcu_torture_cb); | ||
| 152 | } | ||
| 153 | |||
| 154 | struct rcu_random_state { | ||
| 155 | unsigned long rrs_state; | ||
| 156 | unsigned long rrs_count; | ||
| 157 | }; | ||
| 158 | |||
| 159 | #define RCU_RANDOM_MULT 39916801 /* prime */ | ||
| 160 | #define RCU_RANDOM_ADD 479001701 /* prime */ | ||
| 161 | #define RCU_RANDOM_REFRESH 10000 | ||
| 162 | |||
| 163 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Crude but fast random-number generator. Uses a linear congruential | ||
| 167 | * generator, with occasional help from get_random_bytes(). | ||
| 168 | */ | ||
| 169 | static long | ||
| 170 | rcu_random(struct rcu_random_state *rrsp) | ||
| 171 | { | ||
| 172 | long refresh; | ||
| 173 | |||
| 174 | if (--rrsp->rrs_count < 0) { | ||
| 175 | get_random_bytes(&refresh, sizeof(refresh)); | ||
| 176 | rrsp->rrs_state += refresh; | ||
| 177 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | ||
| 178 | } | ||
| 179 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | ||
| 180 | return swahw32(rrsp->rrs_state); | ||
| 181 | } | ||
| 182 | |||
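rcu_random() above is a linear congruential generator that folds in fresh entropy every RCU_RANDOM_REFRESH calls so long runs do not settle into a fixed cycle. A compact userspace sketch using the same multiplier and increment, with getentropy() (where available) standing in for get_random_bytes(); the halfword swap the module applies to the result is omitted here:

#include <unistd.h>

#define LCG_MULT        39916801UL      /* prime, as in the module */
#define LCG_ADD         479001701UL     /* prime, as in the module */
#define LCG_RESEED      10000

struct lcg_state {
        unsigned long state;
        long count;
};

unsigned long lcg_next(struct lcg_state *s)
{
        if (--s->count < 0) {
                unsigned long refresh = 0;

                (void)getentropy(&refresh, sizeof(refresh));    /* occasional reseed */
                s->state += refresh;
                s->count = LCG_RESEED;
        }
        s->state = s->state * LCG_MULT + LCG_ADD;
        return s->state;
}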
| 183 | /* | ||
| 184 | * RCU torture writer kthread. Repeatedly substitutes a new structure | ||
| 185 | * for that pointed to by rcu_torture_current, freeing the old structure | ||
| 186 | * after a series of grace periods (the "pipeline"). | ||
| 187 | */ | ||
| 188 | static int | ||
| 189 | rcu_torture_writer(void *arg) | ||
| 190 | { | ||
| 191 | int i; | ||
| 192 | long oldbatch = rcu_batches_completed(); | ||
| 193 | struct rcu_torture *rp; | ||
| 194 | struct rcu_torture *old_rp; | ||
| 195 | static DEFINE_RCU_RANDOM(rand); | ||
| 196 | |||
| 197 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | ||
| 198 | do { | ||
| 199 | schedule_timeout_uninterruptible(1); | ||
| 200 | if (rcu_batches_completed() == oldbatch) | ||
| 201 | continue; | ||
| 202 | if ((rp = rcu_torture_alloc()) == NULL) | ||
| 203 | continue; | ||
| 204 | rp->rtort_pipe_count = 0; | ||
| 205 | udelay(rcu_random(&rand) & 0x3ff); | ||
| 206 | old_rp = rcu_torture_current; | ||
| 207 | rcu_assign_pointer(rcu_torture_current, rp); | ||
| 208 | smp_wmb(); | ||
| 209 | if (old_rp != NULL) { | ||
| 210 | i = old_rp->rtort_pipe_count; | ||
| 211 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 212 | i = RCU_TORTURE_PIPE_LEN; | ||
| 213 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 214 | old_rp->rtort_pipe_count++; | ||
| 215 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | ||
| 216 | } | ||
| 217 | rcu_torture_current_version++; | ||
| 218 | oldbatch = rcu_batches_completed(); | ||
| 219 | } while (!kthread_should_stop() && !fullstop); | ||
| 220 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | ||
| 221 | while (!kthread_should_stop()) | ||
| 222 | schedule_timeout_uninterruptible(1); | ||
| 223 | return 0; | ||
| 224 | } | ||
| 225 | |||
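The writer's "pipeline" above amounts to bookkeeping: once an object has been replaced it ages by one stage per grace period and may only be recycled after RCU_TORTURE_PIPE_LEN stages, so a reader that ever observes a larger count has caught a grace period that ended too early. A sketch of just that counting, with no real RCU involved and age_retired()/reader_check() as illustrative names:

#include <assert.h>
#include <stddef.h>

#define PIPE_LEN 10

struct tobj {
        int pipe_count;         /* grace periods since the writer retired it */
        int in_use;
};

static struct tobj objs[4 * PIPE_LEN];

/* Called once per simulated grace period, like the chained callbacks in
 * rcu_torture_cb(): retired objects age, and only fully aged ones may
 * be recycled. */
static void age_retired(void)
{
        size_t i;

        for (i = 0; i < sizeof(objs) / sizeof(objs[0]); i++) {
                if (!objs[i].in_use)
                        continue;
                if (++objs[i].pipe_count >= PIPE_LEN)
                        objs[i].in_use = 0;
        }
}

/* Reader-side check, like the pipe_count test in rcu_torture_reader():
 * an object visible to a reader must not have aged past the pipeline. */
static void reader_check(const struct tobj *p)
{
        assert(p->pipe_count <= PIPE_LEN);
}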
| 226 | /* | ||
| 227 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | ||
| 228 | * incrementing the corresponding element of the pipeline array. The | ||
| 229 | * counter in the element should never be greater than 1, otherwise, the | ||
| 230 | * RCU implementation is broken. | ||
| 231 | */ | ||
| 232 | static int | ||
| 233 | rcu_torture_reader(void *arg) | ||
| 234 | { | ||
| 235 | int completed; | ||
| 236 | DEFINE_RCU_RANDOM(rand); | ||
| 237 | struct rcu_torture *p; | ||
| 238 | int pipe_count; | ||
| 239 | |||
| 240 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | ||
| 241 | do { | ||
| 242 | rcu_read_lock(); | ||
| 243 | completed = rcu_batches_completed(); | ||
| 244 | p = rcu_dereference(rcu_torture_current); | ||
| 245 | if (p == NULL) { | ||
| 246 | /* Wait for rcu_torture_writer to get underway */ | ||
| 247 | rcu_read_unlock(); | ||
| 248 | schedule_timeout_interruptible(HZ); | ||
| 249 | continue; | ||
| 250 | } | ||
| 251 | udelay(rcu_random(&rand) & 0x7f); | ||
| 252 | preempt_disable(); | ||
| 253 | pipe_count = p->rtort_pipe_count; | ||
| 254 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | ||
| 255 | /* Should not happen, but... */ | ||
| 256 | pipe_count = RCU_TORTURE_PIPE_LEN; | ||
| 257 | } | ||
| 258 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | ||
| 259 | completed = rcu_batches_completed() - completed; | ||
| 260 | if (completed > RCU_TORTURE_PIPE_LEN) { | ||
| 261 | /* Should not happen, but... */ | ||
| 262 | completed = RCU_TORTURE_PIPE_LEN; | ||
| 263 | } | ||
| 264 | ++__get_cpu_var(rcu_torture_batch)[completed]; | ||
| 265 | preempt_enable(); | ||
| 266 | rcu_read_unlock(); | ||
| 267 | schedule(); | ||
| 268 | } while (!kthread_should_stop() && !fullstop); | ||
| 269 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | ||
| 270 | while (!kthread_should_stop()) | ||
| 271 | schedule_timeout_uninterruptible(1); | ||
| 272 | return 0; | ||
| 273 | } | ||
| 274 | |||
| 275 | /* | ||
| 276 | * Create an RCU-torture statistics message in the specified buffer. | ||
| 277 | */ | ||
| 278 | static int | ||
| 279 | rcu_torture_printk(char *page) | ||
| 280 | { | ||
| 281 | int cnt = 0; | ||
| 282 | int cpu; | ||
| 283 | int i; | ||
| 284 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
| 285 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
| 286 | |||
| 287 | for_each_cpu(cpu) { | ||
| 288 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
| 289 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | ||
| 290 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | ||
| 291 | } | ||
| 292 | } | ||
| 293 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | ||
| 294 | if (pipesummary[i] != 0) | ||
| 295 | break; | ||
| 296 | } | ||
| 297 | cnt += sprintf(&page[cnt], "rcutorture: "); | ||
| 298 | cnt += sprintf(&page[cnt], | ||
| 299 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", | ||
| 300 | rcu_torture_current, | ||
| 301 | rcu_torture_current_version, | ||
| 302 | list_empty(&rcu_torture_freelist), | ||
| 303 | atomic_read(&n_rcu_torture_alloc), | ||
| 304 | atomic_read(&n_rcu_torture_alloc_fail), | ||
| 305 | atomic_read(&n_rcu_torture_free)); | ||
| 306 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
| 307 | if (i > 1) | ||
| 308 | cnt += sprintf(&page[cnt], "!!! "); | ||
| 309 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | ||
| 310 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
| 311 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | ||
| 312 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
| 313 | cnt += sprintf(&page[cnt], "Reader Batch: "); | ||
| 314 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
| 315 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | ||
| 316 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
| 317 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | ||
| 318 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
| 319 | cnt += sprintf(&page[cnt], " %d", | ||
| 320 | atomic_read(&rcu_torture_wcount[i])); | ||
| 321 | } | ||
| 322 | cnt += sprintf(&page[cnt], "\n"); | ||
| 323 | return cnt; | ||
| 324 | } | ||
| 325 | |||
| 326 | /* | ||
| 327 | * Print torture statistics. Caller must ensure that there is only | ||
| 328 | * one call to this function at a given time!!! This is normally | ||
| 329 | * accomplished by relying on the module system to only have one copy | ||
| 330 | * of the module loaded, and then by giving the rcu_torture_stats | ||
| 331 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
| 332 | * thread is not running). | ||
| 333 | */ | ||
| 334 | static void | ||
| 335 | rcu_torture_stats_print(void) | ||
| 336 | { | ||
| 337 | int cnt; | ||
| 338 | |||
| 339 | cnt = rcu_torture_printk(printk_buf); | ||
| 340 | printk(KERN_ALERT "%s", printk_buf); | ||
| 341 | } | ||
| 342 | |||
| 343 | /* | ||
| 344 | * Periodically prints torture statistics, if periodic statistics printing | ||
| 345 | * was specified via the stat_interval module parameter. | ||
| 346 | * | ||
| 347 | * No need to worry about fullstop here, since this one doesn't reference | ||
| 348 | * volatile state or register callbacks. | ||
| 349 | */ | ||
| 350 | static int | ||
| 351 | rcu_torture_stats(void *arg) | ||
| 352 | { | ||
| 353 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | ||
| 354 | do { | ||
| 355 | schedule_timeout_interruptible(stat_interval * HZ); | ||
| 356 | rcu_torture_stats_print(); | ||
| 357 | } while (!kthread_should_stop()); | ||
| 358 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | ||
| 359 | return 0; | ||
| 360 | } | ||
| 361 | |||
| 362 | static void | ||
| 363 | rcu_torture_cleanup(void) | ||
| 364 | { | ||
| 365 | int i; | ||
| 366 | |||
| 367 | fullstop = 1; | ||
| 368 | if (writer_task != NULL) { | ||
| 369 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | ||
| 370 | kthread_stop(writer_task); | ||
| 371 | } | ||
| 372 | writer_task = NULL; | ||
| 373 | |||
| 374 | if (reader_tasks != NULL) { | ||
| 375 | for (i = 0; i < nrealreaders; i++) { | ||
| 376 | if (reader_tasks[i] != NULL) { | ||
| 377 | VERBOSE_PRINTK_STRING( | ||
| 378 | "Stopping rcu_torture_reader task"); | ||
| 379 | kthread_stop(reader_tasks[i]); | ||
| 380 | } | ||
| 381 | reader_tasks[i] = NULL; | ||
| 382 | } | ||
| 383 | kfree(reader_tasks); | ||
| 384 | reader_tasks = NULL; | ||
| 385 | } | ||
| 386 | rcu_torture_current = NULL; | ||
| 387 | |||
| 388 | if (stats_task != NULL) { | ||
| 389 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | ||
| 390 | kthread_stop(stats_task); | ||
| 391 | } | ||
| 392 | stats_task = NULL; | ||
| 393 | |||
| 394 | /* Wait for all RCU callbacks to fire. */ | ||
| 395 | |||
| 396 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
| 397 | synchronize_rcu(); | ||
| 398 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | ||
| 399 | PRINTK_STRING("--- End of test"); | ||
| 400 | } | ||
| 401 | |||
| 402 | static int | ||
| 403 | rcu_torture_init(void) | ||
| 404 | { | ||
| 405 | int i; | ||
| 406 | int cpu; | ||
| 407 | int firsterr = 0; | ||
| 408 | |||
| 409 | /* Process args and tell the world that the torturer is on the job. */ | ||
| 410 | |||
| 411 | if (nreaders >= 0) | ||
| 412 | nrealreaders = nreaders; | ||
| 413 | else | ||
| 414 | nrealreaders = 2 * num_online_cpus(); | ||
| 415 | printk(KERN_ALERT TORTURE_FLAG | ||
| 416 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | ||
| 417 | nrealreaders, stat_interval, verbose); | ||
| 418 | fullstop = 0; | ||
| 419 | |||
| 420 | /* Set up the freelist. */ | ||
| 421 | |||
| 422 | INIT_LIST_HEAD(&rcu_torture_freelist); | ||
| 423 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | ||
| 424 | list_add_tail(&rcu_tortures[i].rtort_free, | ||
| 425 | &rcu_torture_freelist); | ||
| 426 | } | ||
| 427 | |||
| 428 | /* Initialize the statistics so that each run gets its own numbers. */ | ||
| 429 | |||
| 430 | rcu_torture_current = NULL; | ||
| 431 | rcu_torture_current_version = 0; | ||
| 432 | atomic_set(&n_rcu_torture_alloc, 0); | ||
| 433 | atomic_set(&n_rcu_torture_alloc_fail, 0); | ||
| 434 | atomic_set(&n_rcu_torture_free, 0); | ||
| 435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
| 436 | atomic_set(&rcu_torture_wcount[i], 0); | ||
| 437 | for_each_cpu(cpu) { | ||
| 438 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
| 439 | per_cpu(rcu_torture_count, cpu)[i] = 0; | ||
| 440 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | ||
| 441 | } | ||
| 442 | } | ||
| 443 | |||
| 444 | /* Start up the kthreads. */ | ||
| 445 | |||
| 446 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | ||
| 447 | writer_task = kthread_run(rcu_torture_writer, NULL, | ||
| 448 | "rcu_torture_writer"); | ||
| 449 | if (IS_ERR(writer_task)) { | ||
| 450 | firsterr = PTR_ERR(writer_task); | ||
| 451 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | ||
| 452 | writer_task = NULL; | ||
| 453 | goto unwind; | ||
| 454 | } | ||
| 455 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), | ||
| 456 | GFP_KERNEL); | ||
| 457 | if (reader_tasks == NULL) { | ||
| 458 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | ||
| 459 | firsterr = -ENOMEM; | ||
| 460 | goto unwind; | ||
| 461 | } | ||
| 462 | for (i = 0; i < nrealreaders; i++) { | ||
| 463 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | ||
| 464 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | ||
| 465 | "rcu_torture_reader"); | ||
| 466 | if (IS_ERR(reader_tasks[i])) { | ||
| 467 | firsterr = PTR_ERR(reader_tasks[i]); | ||
| 468 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | ||
| 469 | reader_tasks[i] = NULL; | ||
| 470 | goto unwind; | ||
| 471 | } | ||
| 472 | } | ||
| 473 | if (stat_interval > 0) { | ||
| 474 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | ||
| 475 | stats_task = kthread_run(rcu_torture_stats, NULL, | ||
| 476 | "rcu_torture_stats"); | ||
| 477 | if (IS_ERR(stats_task)) { | ||
| 478 | firsterr = PTR_ERR(stats_task); | ||
| 479 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | ||
| 480 | stats_task = NULL; | ||
| 481 | goto unwind; | ||
| 482 | } | ||
| 483 | } | ||
| 484 | return 0; | ||
| 485 | |||
| 486 | unwind: | ||
| 487 | rcu_torture_cleanup(); | ||
| 488 | return firsterr; | ||
| 489 | } | ||
| 490 | |||
| 491 | module_init(rcu_torture_init); | ||
| 492 | module_exit(rcu_torture_cleanup); | ||
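Assuming the facility is built as a module, a run would typically be started with something like "modprobe rcutorture nreaders=8 stat_interval=5 verbose=1" and ended with "rmmod rcutorture", which invokes rcu_torture_cleanup() above and prints the final statistics; the exact invocation is illustrative, and only the three parameters declared above are defined by this file.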
diff --git a/kernel/sched.c b/kernel/sched.c
index 4f26c544d02c..b4f4eb613537 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -1468,7 +1468,7 @@ void fastcall sched_exit(task_t *p) | |||
| 1468 | * the sleep_avg of the parent as well. | 1468 | * the sleep_avg of the parent as well. |
| 1469 | */ | 1469 | */ |
| 1470 | rq = task_rq_lock(p->parent, &flags); | 1470 | rq = task_rq_lock(p->parent, &flags); |
| 1471 | if (p->first_time_slice) { | 1471 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
| 1472 | p->parent->time_slice += p->time_slice; | 1472 | p->parent->time_slice += p->time_slice; |
| 1473 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | 1473 | if (unlikely(p->parent->time_slice > task_timeslice(p))) |
| 1474 | p->parent->time_slice = task_timeslice(p); | 1474 | p->parent->time_slice = task_timeslice(p); |
| @@ -3877,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map); | |||
| 3877 | 3877 | ||
| 3878 | #ifndef CONFIG_SMP | 3878 | #ifndef CONFIG_SMP |
| 3879 | cpumask_t cpu_online_map = CPU_MASK_ALL; | 3879 | cpumask_t cpu_online_map = CPU_MASK_ALL; |
| 3880 | EXPORT_SYMBOL_GPL(cpu_online_map); | ||
| 3881 | cpumask_t cpu_possible_map = CPU_MASK_ALL; | 3880 | cpumask_t cpu_possible_map = CPU_MASK_ALL; |
| 3882 | #endif | 3881 | #endif |
| 3883 | 3882 | ||
diff --git a/kernel/signal.c b/kernel/signal.c
index 6904bbbfe116..1bf3c39d6109 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
| @@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, | |||
| 277 | } else { | 277 | } else { |
| 278 | INIT_LIST_HEAD(&q->list); | 278 | INIT_LIST_HEAD(&q->list); |
| 279 | q->flags = 0; | 279 | q->flags = 0; |
| 280 | q->lock = NULL; | ||
| 281 | q->user = get_uid(t->user); | 280 | q->user = get_uid(t->user); |
| 282 | } | 281 | } |
| 283 | return(q); | 282 | return(q); |
| @@ -652,8 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 652 | if (!valid_signal(sig)) | 651 | if (!valid_signal(sig)) |
| 653 | return error; | 652 | return error; |
| 654 | error = -EPERM; | 653 | error = -EPERM; |
| 655 | if ((!info || ((unsigned long)info != 1 && | 654 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
| 656 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
| 657 | && ((sig != SIGCONT) || | 655 | && ((sig != SIGCONT) || |
| 658 | (current->signal->session != t->signal->session)) | 656 | (current->signal->session != t->signal->session)) |
| 659 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 657 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
| @@ -790,7 +788,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 790 | * fast-pathed signals for kernel-internal things like SIGSTOP | 788 | * fast-pathed signals for kernel-internal things like SIGSTOP |
| 791 | * or SIGKILL. | 789 | * or SIGKILL. |
| 792 | */ | 790 | */ |
| 793 | if ((unsigned long)info == 2) | 791 | if (info == SEND_SIG_FORCED) |
| 794 | goto out_set; | 792 | goto out_set; |
| 795 | 793 | ||
| 796 | /* Real-time signals must be queued if sent by sigqueue, or | 794 | /* Real-time signals must be queued if sent by sigqueue, or |
| @@ -802,19 +800,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 802 | pass on the info struct. */ | 800 | pass on the info struct. */ |
| 803 | 801 | ||
| 804 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && | 802 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && |
| 805 | ((unsigned long) info < 2 || | 803 | (is_si_special(info) || |
| 806 | info->si_code >= 0))); | 804 | info->si_code >= 0))); |
| 807 | if (q) { | 805 | if (q) { |
| 808 | list_add_tail(&q->list, &signals->list); | 806 | list_add_tail(&q->list, &signals->list); |
| 809 | switch ((unsigned long) info) { | 807 | switch ((unsigned long) info) { |
| 810 | case 0: | 808 | case (unsigned long) SEND_SIG_NOINFO: |
| 811 | q->info.si_signo = sig; | 809 | q->info.si_signo = sig; |
| 812 | q->info.si_errno = 0; | 810 | q->info.si_errno = 0; |
| 813 | q->info.si_code = SI_USER; | 811 | q->info.si_code = SI_USER; |
| 814 | q->info.si_pid = current->pid; | 812 | q->info.si_pid = current->pid; |
| 815 | q->info.si_uid = current->uid; | 813 | q->info.si_uid = current->uid; |
| 816 | break; | 814 | break; |
| 817 | case 1: | 815 | case (unsigned long) SEND_SIG_PRIV: |
| 818 | q->info.si_signo = sig; | 816 | q->info.si_signo = sig; |
| 819 | q->info.si_errno = 0; | 817 | q->info.si_errno = 0; |
| 820 | q->info.si_code = SI_KERNEL; | 818 | q->info.si_code = SI_KERNEL; |
| @@ -825,20 +823,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 825 | copy_siginfo(&q->info, info); | 823 | copy_siginfo(&q->info, info); |
| 826 | break; | 824 | break; |
| 827 | } | 825 | } |
| 828 | } else { | 826 | } else if (!is_si_special(info)) { |
| 829 | if (sig >= SIGRTMIN && info && (unsigned long)info != 1 | 827 | if (sig >= SIGRTMIN && info->si_code != SI_USER) |
| 830 | && info->si_code != SI_USER) | ||
| 831 | /* | 828 | /* |
| 832 | * Queue overflow, abort. We may abort if the signal was rt | 829 | * Queue overflow, abort. We may abort if the signal was rt |
| 833 | * and sent by user using something other than kill(). | 830 | * and sent by user using something other than kill(). |
| 834 | */ | 831 | */ |
| 835 | return -EAGAIN; | 832 | return -EAGAIN; |
| 836 | if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) | ||
| 837 | /* | ||
| 838 | * Set up a return to indicate that we dropped | ||
| 839 | * the signal. | ||
| 840 | */ | ||
| 841 | ret = info->si_sys_private; | ||
| 842 | } | 833 | } |
| 843 | 834 | ||
| 844 | out_set: | 835 | out_set: |
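The SEND_SIG_* conversion above replaces the old casts of 0, 1 and 2 with named sentinel pointers plus an is_si_special() test, so the special cases read as what they are instead of magic numbers. A self-contained userspace sketch of that idiom; the structure and the REQ_* constants below are illustrative, not the kernel definitions:

#include <stdio.h>

struct request_info {
        int code;
        int data;
};

#define REQ_NOINFO      ((struct request_info *) 0)
#define REQ_PRIV        ((struct request_info *) 1)
#define REQ_FORCED      ((struct request_info *) 2)

/* True for all three sentinels, false for any real structure. */
static int is_special(const struct request_info *info)
{
        return (unsigned long) info <= 2;
}

static void handle(const struct request_info *info)
{
        if (info == REQ_FORCED) {
                puts("forced: take the fast path, no queueing");
                return;
        }
        if (is_special(info)) {
                puts(info == REQ_PRIV ? "kernel-private request"
                                      : "plain request, no extra info");
                return;
        }
        printf("real info: code=%d data=%d\n", info->code, info->data);
}

int main(void)
{
        struct request_info r = { .code = 7, .data = 42 };

        handle(REQ_NOINFO);
        handle(REQ_PRIV);
        handle(REQ_FORCED);
        handle(&r);
        return 0;
}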
| @@ -859,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
| 859 | BUG(); | 850 | BUG(); |
| 860 | assert_spin_locked(&t->sighand->siglock); | 851 | assert_spin_locked(&t->sighand->siglock); |
| 861 | 852 | ||
| 862 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
| 863 | /* | ||
| 864 | * Set up a return to indicate that we dropped the signal. | ||
| 865 | */ | ||
| 866 | ret = info->si_sys_private; | ||
| 867 | |||
| 868 | /* Short-circuit ignored signals. */ | 853 | /* Short-circuit ignored signals. */ |
| 869 | if (sig_ignored(t, sig)) | 854 | if (sig_ignored(t, sig)) |
| 870 | goto out; | 855 | goto out; |
| @@ -894,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
| 894 | int ret; | 879 | int ret; |
| 895 | 880 | ||
| 896 | spin_lock_irqsave(&t->sighand->siglock, flags); | 881 | spin_lock_irqsave(&t->sighand->siglock, flags); |
| 897 | if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { | 882 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { |
| 898 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | 883 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; |
| 884 | } | ||
| 885 | if (sigismember(&t->blocked, sig)) { | ||
| 899 | sigdelset(&t->blocked, sig); | 886 | sigdelset(&t->blocked, sig); |
| 900 | recalc_sigpending_tsk(t); | ||
| 901 | } | 887 | } |
| 888 | recalc_sigpending_tsk(t); | ||
| 902 | ret = specific_send_sig_info(sig, info, t); | 889 | ret = specific_send_sig_info(sig, info, t); |
| 903 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 890 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
| 904 | 891 | ||
| @@ -908,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
| 908 | void | 895 | void |
| 909 | force_sig_specific(int sig, struct task_struct *t) | 896 | force_sig_specific(int sig, struct task_struct *t) |
| 910 | { | 897 | { |
| 911 | unsigned long int flags; | 898 | force_sig_info(sig, SEND_SIG_FORCED, t); |
| 912 | |||
| 913 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
| 914 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) | ||
| 915 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | ||
| 916 | sigdelset(&t->blocked, sig); | ||
| 917 | recalc_sigpending_tsk(t); | ||
| 918 | specific_send_sig_info(sig, (void *)2, t); | ||
| 919 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
| 920 | } | 899 | } |
| 921 | 900 | ||
| 922 | /* | 901 | /* |
| @@ -1051,12 +1030,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
| 1051 | assert_spin_locked(&p->sighand->siglock); | 1030 | assert_spin_locked(&p->sighand->siglock); |
| 1052 | handle_stop_signal(sig, p); | 1031 | handle_stop_signal(sig, p); |
| 1053 | 1032 | ||
| 1054 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
| 1055 | /* | ||
| 1056 | * Set up a return to indicate that we dropped the signal. | ||
| 1057 | */ | ||
| 1058 | ret = info->si_sys_private; | ||
| 1059 | |||
| 1060 | /* Short-circuit ignored signals. */ | 1033 | /* Short-circuit ignored signals. */ |
| 1061 | if (sig_ignored(p, sig)) | 1034 | if (sig_ignored(p, sig)) |
| 1062 | return ret; | 1035 | return ret; |
| @@ -1109,8 +1082,8 @@ void zap_other_threads(struct task_struct *p) | |||
| 1109 | if (t != p->group_leader) | 1082 | if (t != p->group_leader) |
| 1110 | t->exit_signal = -1; | 1083 | t->exit_signal = -1; |
| 1111 | 1084 | ||
| 1085 | /* SIGKILL will be handled before any pending SIGSTOP */ | ||
| 1112 | sigaddset(&t->pending.signal, SIGKILL); | 1086 | sigaddset(&t->pending.signal, SIGKILL); |
| 1113 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
| 1114 | signal_wake_up(t, 1); | 1087 | signal_wake_up(t, 1); |
| 1115 | } | 1088 | } |
| 1116 | } | 1089 | } |
| @@ -1286,10 +1259,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
| 1286 | return ret; | 1259 | return ret; |
| 1287 | } | 1260 | } |
| 1288 | 1261 | ||
| 1262 | #define __si_special(priv) \ | ||
| 1263 | ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) | ||
| 1264 | |||
| 1289 | int | 1265 | int |
| 1290 | send_sig(int sig, struct task_struct *p, int priv) | 1266 | send_sig(int sig, struct task_struct *p, int priv) |
| 1291 | { | 1267 | { |
| 1292 | return send_sig_info(sig, (void*)(long)(priv != 0), p); | 1268 | return send_sig_info(sig, __si_special(priv), p); |
| 1293 | } | 1269 | } |
| 1294 | 1270 | ||
| 1295 | /* | 1271 | /* |
| @@ -1309,7 +1285,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
| 1309 | void | 1285 | void |
| 1310 | force_sig(int sig, struct task_struct *p) | 1286 | force_sig(int sig, struct task_struct *p) |
| 1311 | { | 1287 | { |
| 1312 | force_sig_info(sig, (void*)1L, p); | 1288 | force_sig_info(sig, SEND_SIG_PRIV, p); |
| 1313 | } | 1289 | } |
| 1314 | 1290 | ||
| 1315 | /* | 1291 | /* |
| @@ -1334,13 +1310,13 @@ force_sigsegv(int sig, struct task_struct *p) | |||
| 1334 | int | 1310 | int |
| 1335 | kill_pg(pid_t pgrp, int sig, int priv) | 1311 | kill_pg(pid_t pgrp, int sig, int priv) |
| 1336 | { | 1312 | { |
| 1337 | return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); | 1313 | return kill_pg_info(sig, __si_special(priv), pgrp); |
| 1338 | } | 1314 | } |
| 1339 | 1315 | ||
| 1340 | int | 1316 | int |
| 1341 | kill_proc(pid_t pid, int sig, int priv) | 1317 | kill_proc(pid_t pid, int sig, int priv) |
| 1342 | { | 1318 | { |
| 1343 | return kill_proc_info(sig, (void *)(long)(priv != 0), pid); | 1319 | return kill_proc_info(sig, __si_special(priv), pid); |
| 1344 | } | 1320 | } |
| 1345 | 1321 | ||
| 1346 | /* | 1322 | /* |
| @@ -1371,11 +1347,12 @@ void sigqueue_free(struct sigqueue *q) | |||
| 1371 | * pending queue. | 1347 | * pending queue. |
| 1372 | */ | 1348 | */ |
| 1373 | if (unlikely(!list_empty(&q->list))) { | 1349 | if (unlikely(!list_empty(&q->list))) { |
| 1374 | read_lock(&tasklist_lock); | 1350 | spinlock_t *lock = ¤t->sighand->siglock; |
| 1375 | spin_lock_irqsave(q->lock, flags); | 1351 | read_lock(&tasklist_lock); |
| 1352 | spin_lock_irqsave(lock, flags); | ||
| 1376 | if (!list_empty(&q->list)) | 1353 | if (!list_empty(&q->list)) |
| 1377 | list_del_init(&q->list); | 1354 | list_del_init(&q->list); |
| 1378 | spin_unlock_irqrestore(q->lock, flags); | 1355 | spin_unlock_irqrestore(lock, flags); |
| 1379 | read_unlock(&tasklist_lock); | 1356 | read_unlock(&tasklist_lock); |
| 1380 | } | 1357 | } |
| 1381 | q->flags &= ~SIGQUEUE_PREALLOC; | 1358 | q->flags &= ~SIGQUEUE_PREALLOC; |
| @@ -1414,7 +1391,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1414 | goto out; | 1391 | goto out; |
| 1415 | } | 1392 | } |
| 1416 | 1393 | ||
| 1417 | q->lock = &p->sighand->siglock; | ||
| 1418 | list_add_tail(&q->list, &p->pending.list); | 1394 | list_add_tail(&q->list, &p->pending.list); |
| 1419 | sigaddset(&p->pending.signal, sig); | 1395 | sigaddset(&p->pending.signal, sig); |
| 1420 | if (!sigismember(&p->blocked, sig)) | 1396 | if (!sigismember(&p->blocked, sig)) |
| @@ -1462,7 +1438,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1462 | * We always use the shared queue for process-wide signals, | 1438 | * We always use the shared queue for process-wide signals, |
| 1463 | * to avoid several races. | 1439 | * to avoid several races. |
| 1464 | */ | 1440 | */ |
| 1465 | q->lock = &p->sighand->siglock; | ||
| 1466 | list_add_tail(&q->list, &p->signal->shared_pending.list); | 1441 | list_add_tail(&q->list, &p->signal->shared_pending.list); |
| 1467 | sigaddset(&p->signal->shared_pending.signal, sig); | 1442 | sigaddset(&p->signal->shared_pending.signal, sig); |
| 1468 | 1443 | ||
| @@ -1881,9 +1856,9 @@ relock: | |||
| 1881 | /* Let the debugger run. */ | 1856 | /* Let the debugger run. */ |
| 1882 | ptrace_stop(signr, signr, info); | 1857 | ptrace_stop(signr, signr, info); |
| 1883 | 1858 | ||
| 1884 | /* We're back. Did the debugger cancel the sig? */ | 1859 | /* We're back. Did the debugger cancel the sig or group_exit? */ |
| 1885 | signr = current->exit_code; | 1860 | signr = current->exit_code; |
| 1886 | if (signr == 0) | 1861 | if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) |
| 1887 | continue; | 1862 | continue; |
| 1888 | 1863 | ||
| 1889 | current->exit_code = 0; | 1864 | current->exit_code = 0; |
| @@ -2285,26 +2260,13 @@ sys_kill(int pid, int sig) | |||
| 2285 | return kill_something_info(sig, &info, pid); | 2260 | return kill_something_info(sig, &info, pid); |
| 2286 | } | 2261 | } |
| 2287 | 2262 | ||
| 2288 | /** | 2263 | static int do_tkill(int tgid, int pid, int sig) |
| 2289 | * sys_tgkill - send signal to one specific thread | ||
| 2290 | * @tgid: the thread group ID of the thread | ||
| 2291 | * @pid: the PID of the thread | ||
| 2292 | * @sig: signal to be sent | ||
| 2293 | * | ||
| 2294 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
| 2295 | * exists but it's not belonging to the target process anymore. This | ||
| 2296 | * method solves the problem of threads exiting and PIDs getting reused. | ||
| 2297 | */ | ||
| 2298 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
| 2299 | { | 2264 | { |
| 2300 | struct siginfo info; | ||
| 2301 | int error; | 2265 | int error; |
| 2266 | struct siginfo info; | ||
| 2302 | struct task_struct *p; | 2267 | struct task_struct *p; |
| 2303 | 2268 | ||
| 2304 | /* This is only valid for single tasks */ | 2269 | error = -ESRCH; |
| 2305 | if (pid <= 0 || tgid <= 0) | ||
| 2306 | return -EINVAL; | ||
| 2307 | |||
| 2308 | info.si_signo = sig; | 2270 | info.si_signo = sig; |
| 2309 | info.si_errno = 0; | 2271 | info.si_errno = 0; |
| 2310 | info.si_code = SI_TKILL; | 2272 | info.si_code = SI_TKILL; |
| @@ -2313,8 +2275,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
| 2313 | 2275 | ||
| 2314 | read_lock(&tasklist_lock); | 2276 | read_lock(&tasklist_lock); |
| 2315 | p = find_task_by_pid(pid); | 2277 | p = find_task_by_pid(pid); |
| 2316 | error = -ESRCH; | 2278 | if (p && (tgid <= 0 || p->tgid == tgid)) { |
| 2317 | if (p && (p->tgid == tgid)) { | ||
| 2318 | error = check_kill_permission(sig, &info, p); | 2279 | error = check_kill_permission(sig, &info, p); |
| 2319 | /* | 2280 | /* |
| 2320 | * The null signal is a permissions and process existence | 2281 | * The null signal is a permissions and process existence |
| @@ -2328,47 +2289,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
| 2328 | } | 2289 | } |
| 2329 | } | 2290 | } |
| 2330 | read_unlock(&tasklist_lock); | 2291 | read_unlock(&tasklist_lock); |
| 2292 | |||
| 2331 | return error; | 2293 | return error; |
| 2332 | } | 2294 | } |
| 2333 | 2295 | ||
| 2296 | /** | ||
| 2297 | * sys_tgkill - send signal to one specific thread | ||
| 2298 | * @tgid: the thread group ID of the thread | ||
| 2299 | * @pid: the PID of the thread | ||
| 2300 | * @sig: signal to be sent | ||
| 2301 | * | ||
| 2302 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
| 2303 | * exists but it's not belonging to the target process anymore. This | ||
| 2304 | * method solves the problem of threads exiting and PIDs getting reused. | ||
| 2305 | */ | ||
| 2306 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
| 2307 | { | ||
| 2308 | /* This is only valid for single tasks */ | ||
| 2309 | if (pid <= 0 || tgid <= 0) | ||
| 2310 | return -EINVAL; | ||
| 2311 | |||
| 2312 | return do_tkill(tgid, pid, sig); | ||
| 2313 | } | ||
| 2314 | |||
| 2334 | /* | 2315 | /* |
| 2335 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | 2316 | * Send a signal to only one task, even if it's a CLONE_THREAD task. |
| 2336 | */ | 2317 | */ |
| 2337 | asmlinkage long | 2318 | asmlinkage long |
| 2338 | sys_tkill(int pid, int sig) | 2319 | sys_tkill(int pid, int sig) |
| 2339 | { | 2320 | { |
| 2340 | struct siginfo info; | ||
| 2341 | int error; | ||
| 2342 | struct task_struct *p; | ||
| 2343 | |||
| 2344 | /* This is only valid for single tasks */ | 2321 | /* This is only valid for single tasks */ |
| 2345 | if (pid <= 0) | 2322 | if (pid <= 0) |
| 2346 | return -EINVAL; | 2323 | return -EINVAL; |
| 2347 | 2324 | ||
| 2348 | info.si_signo = sig; | 2325 | return do_tkill(0, pid, sig); |
| 2349 | info.si_errno = 0; | ||
| 2350 | info.si_code = SI_TKILL; | ||
| 2351 | info.si_pid = current->tgid; | ||
| 2352 | info.si_uid = current->uid; | ||
| 2353 | |||
| 2354 | read_lock(&tasklist_lock); | ||
| 2355 | p = find_task_by_pid(pid); | ||
| 2356 | error = -ESRCH; | ||
| 2357 | if (p) { | ||
| 2358 | error = check_kill_permission(sig, &info, p); | ||
| 2359 | /* | ||
| 2360 | * The null signal is a permissions and process existence | ||
| 2361 | * probe. No signal is actually delivered. | ||
| 2362 | */ | ||
| 2363 | if (!error && sig && p->sighand) { | ||
| 2364 | spin_lock_irq(&p->sighand->siglock); | ||
| 2365 | handle_stop_signal(sig, p); | ||
| 2366 | error = specific_send_sig_info(sig, &info, p); | ||
| 2367 | spin_unlock_irq(&p->sighand->siglock); | ||
| 2368 | } | ||
| 2369 | } | ||
| 2370 | read_unlock(&tasklist_lock); | ||
| 2371 | return error; | ||
| 2372 | } | 2326 | } |
| 2373 | 2327 | ||
| 2374 | asmlinkage long | 2328 | asmlinkage long |
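With sys_tgkill() and sys_tkill() folded into do_tkill(), the only remaining difference is the thread-group check, which do_tkill() skips when tgid <= 0. A hypothetical userspace probe to illustrate the semantics the helper preserves (the wrapper name is made up; only the raw syscall is real):

    #include <sys/syscall.h>
    #include <unistd.h>

    /* Hypothetical helper: fails with ESRCH if tid still exists but no longer
     * belongs to tgid, which is the PID-reuse case tgkill() guards against. */
    static int signal_thread(pid_t tgid, pid_t tid, int sig)
    {
            return syscall(SYS_tgkill, tgid, tid, sig);
    }

tkill() behaves the same way minus the tgid check, exactly as do_tkill(0, pid, sig) reads.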
diff --git a/kernel/time.c b/kernel/time.c index a3c2100470e1..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) | |||
| 338 | if (mtemp >= MINSEC) { | 338 | if (mtemp >= MINSEC) { |
| 339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - | 339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - |
| 340 | SHIFT_UPDATE); | 340 | SHIFT_UPDATE); |
| 341 | if (ltemp < 0) | 341 | time_freq += shift_right(ltemp, SHIFT_KH); |
| 342 | time_freq -= -ltemp >> SHIFT_KH; | ||
| 343 | else | ||
| 344 | time_freq += ltemp >> SHIFT_KH; | ||
| 345 | } else /* calibration interval too short (p. 12) */ | 342 | } else /* calibration interval too short (p. 12) */ |
| 346 | result = TIME_ERROR; | 343 | result = TIME_ERROR; |
| 347 | } else { /* PLL mode */ | 344 | } else { /* PLL mode */ |
| 348 | if (mtemp < MAXSEC) { | 345 | if (mtemp < MAXSEC) { |
| 349 | ltemp *= mtemp; | 346 | ltemp *= mtemp; |
| 350 | if (ltemp < 0) | 347 | time_freq += shift_right(ltemp,(time_constant + |
| 351 | time_freq -= -ltemp >> (time_constant + | ||
| 352 | time_constant + | ||
| 353 | SHIFT_KF - SHIFT_USEC); | ||
| 354 | else | ||
| 355 | time_freq += ltemp >> (time_constant + | ||
| 356 | time_constant + | 348 | time_constant + |
| 357 | SHIFT_KF - SHIFT_USEC); | 349 | SHIFT_KF - SHIFT_USEC)); |
| 358 | } else /* calibration interval too long (p. 12) */ | 350 | } else /* calibration interval too long (p. 12) */ |
| 359 | result = TIME_ERROR; | 351 | result = TIME_ERROR; |
| 360 | } | 352 | } |
| 361 | if (time_freq > time_tolerance) | 353 | time_freq = min(time_freq, time_tolerance); |
| 362 | time_freq = time_tolerance; | 354 | time_freq = max(time_freq, -time_tolerance); |
| 363 | else if (time_freq < -time_tolerance) | ||
| 364 | time_freq = -time_tolerance; | ||
| 365 | } /* STA_PLL || STA_PPSTIME */ | 355 | } /* STA_PLL || STA_PPSTIME */ |
| 366 | } /* txc->modes & ADJ_OFFSET */ | 356 | } /* txc->modes & ADJ_OFFSET */ |
| 367 | if (txc->modes & ADJ_TICK) { | 357 | if (txc->modes & ADJ_TICK) { |
| @@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | |||
| 384 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 374 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
| 385 | txc->offset = save_adjust; | 375 | txc->offset = save_adjust; |
| 386 | else { | 376 | else { |
| 387 | if (time_offset < 0) | 377 | txc->offset = shift_right(time_offset, SHIFT_UPDATE); |
| 388 | txc->offset = -(-time_offset >> SHIFT_UPDATE); | ||
| 389 | else | ||
| 390 | txc->offset = time_offset >> SHIFT_UPDATE; | ||
| 391 | } | 378 | } |
| 392 | txc->freq = time_freq + pps_freq; | 379 | txc->freq = time_freq + pps_freq; |
| 393 | txc->maxerror = time_maxerror; | 380 | txc->maxerror = time_maxerror; |
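Both time.c hunks above (and the second_overflow() rewrite further down) lean on shift_right() to fold the old sign-dependent branches into a single expression. A sketch of such a helper, assuming the usual sign-safe arithmetic shift (right-shifting a negative value is implementation-defined in C, hence the explicit negation):

    /* Sketch: right shift that is well defined for negative x as well. */
    #define shift_right(x, s) ({                            \
            __typeof__(x) __x = (x);                        \
            __typeof__(s) __s = (s);                        \
            __x < 0 ? -(-__x >> __s) : __x >> __s;          \
    })

With that in place, clamping time_freq to ±time_tolerance becomes a plain min()/max() pair instead of an if/else ladder.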
diff --git a/kernel/timer.c b/kernel/timer.c index 6a2e5f8dc725..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); | |||
| 46 | #define time_interpolator_update(x) | 46 | #define time_interpolator_update(x) |
| 47 | #endif | 47 | #endif |
| 48 | 48 | ||
| 49 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | ||
| 50 | |||
| 51 | EXPORT_SYMBOL(jiffies_64); | ||
| 52 | |||
| 49 | /* | 53 | /* |
| 50 | * per-CPU timer vector definitions: | 54 | * per-CPU timer vector definitions: |
| 51 | */ | 55 | */ |
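jiffies_64 is now defined and exported here in generic code. On 32-bit machines the 64-bit counter cannot be read in a single load, so code is expected to sample it through get_jiffies_64() rather than touch the variable directly; a short usage sketch (locking details assumed for this kernel era):

    /* Sketch: safe 64-bit read; 'jiffies' still aliases the low word. */
    u64 now = get_jiffies_64();
    unsigned long deadline = jiffies + msecs_to_jiffies(10);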
| @@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, | |||
| 91 | #endif | 95 | #endif |
| 92 | } | 96 | } |
| 93 | 97 | ||
| 94 | static void check_timer_failed(struct timer_list *timer) | ||
| 95 | { | ||
| 96 | static int whine_count; | ||
| 97 | if (whine_count < 16) { | ||
| 98 | whine_count++; | ||
| 99 | printk("Uninitialised timer!\n"); | ||
| 100 | printk("This is just a warning. Your computer is OK\n"); | ||
| 101 | printk("function=0x%p, data=0x%lx\n", | ||
| 102 | timer->function, timer->data); | ||
| 103 | dump_stack(); | ||
| 104 | } | ||
| 105 | /* | ||
| 106 | * Now fix it up | ||
| 107 | */ | ||
| 108 | timer->magic = TIMER_MAGIC; | ||
| 109 | } | ||
| 110 | |||
| 111 | static inline void check_timer(struct timer_list *timer) | ||
| 112 | { | ||
| 113 | if (timer->magic != TIMER_MAGIC) | ||
| 114 | check_timer_failed(timer); | ||
| 115 | } | ||
| 116 | |||
| 117 | |||
| 118 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 98 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) |
| 119 | { | 99 | { |
| 120 | unsigned long expires = timer->expires; | 100 | unsigned long expires = timer->expires; |
| @@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) | |||
| 177 | { | 157 | { |
| 178 | timer->entry.next = NULL; | 158 | timer->entry.next = NULL; |
| 179 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 159 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; |
| 180 | timer->magic = TIMER_MAGIC; | ||
| 181 | } | 160 | } |
| 182 | EXPORT_SYMBOL(init_timer); | 161 | EXPORT_SYMBOL(init_timer); |
| 183 | 162 | ||
| @@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
| 230 | int ret = 0; | 209 | int ret = 0; |
| 231 | 210 | ||
| 232 | BUG_ON(!timer->function); | 211 | BUG_ON(!timer->function); |
| 233 | check_timer(timer); | ||
| 234 | 212 | ||
| 235 | base = lock_timer_base(timer, &flags); | 213 | base = lock_timer_base(timer, &flags); |
| 236 | 214 | ||
| @@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 283 | unsigned long flags; | 261 | unsigned long flags; |
| 284 | 262 | ||
| 285 | BUG_ON(timer_pending(timer) || !timer->function); | 263 | BUG_ON(timer_pending(timer) || !timer->function); |
| 286 | |||
| 287 | check_timer(timer); | ||
| 288 | |||
| 289 | spin_lock_irqsave(&base->t_base.lock, flags); | 264 | spin_lock_irqsave(&base->t_base.lock, flags); |
| 290 | timer->base = &base->t_base; | 265 | timer->base = &base->t_base; |
| 291 | internal_add_timer(base, timer); | 266 | internal_add_timer(base, timer); |
| @@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
| 316 | { | 291 | { |
| 317 | BUG_ON(!timer->function); | 292 | BUG_ON(!timer->function); |
| 318 | 293 | ||
| 319 | check_timer(timer); | ||
| 320 | |||
| 321 | /* | 294 | /* |
| 322 | * This is a common optimization triggered by the | 295 | * This is a common optimization triggered by the |
| 323 | * networking code - if the timer is re-modified | 296 | * networking code - if the timer is re-modified |
| @@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) | |||
| 348 | unsigned long flags; | 321 | unsigned long flags; |
| 349 | int ret = 0; | 322 | int ret = 0; |
| 350 | 323 | ||
| 351 | check_timer(timer); | ||
| 352 | |||
| 353 | if (timer_pending(timer)) { | 324 | if (timer_pending(timer)) { |
| 354 | base = lock_timer_base(timer, &flags); | 325 | base = lock_timer_base(timer, &flags); |
| 355 | if (timer_pending(timer)) { | 326 | if (timer_pending(timer)) { |
| @@ -412,8 +383,6 @@ out: | |||
| 412 | */ | 383 | */ |
| 413 | int del_timer_sync(struct timer_list *timer) | 384 | int del_timer_sync(struct timer_list *timer) |
| 414 | { | 385 | { |
| 415 | check_timer(timer); | ||
| 416 | |||
| 417 | for (;;) { | 386 | for (;;) { |
| 418 | int ret = try_to_del_timer_sync(timer); | 387 | int ret = try_to_del_timer_sync(timer); |
| 419 | if (ret >= 0) | 388 | if (ret >= 0) |
| @@ -632,143 +601,118 @@ long time_next_adjust; | |||
| 632 | */ | 601 | */ |
| 633 | static void second_overflow(void) | 602 | static void second_overflow(void) |
| 634 | { | 603 | { |
| 635 | long ltemp; | 604 | long ltemp; |
| 636 | 605 | ||
| 637 | /* Bump the maxerror field */ | 606 | /* Bump the maxerror field */ |
| 638 | time_maxerror += time_tolerance >> SHIFT_USEC; | 607 | time_maxerror += time_tolerance >> SHIFT_USEC; |
| 639 | if ( time_maxerror > NTP_PHASE_LIMIT ) { | 608 | if (time_maxerror > NTP_PHASE_LIMIT) { |
| 640 | time_maxerror = NTP_PHASE_LIMIT; | 609 | time_maxerror = NTP_PHASE_LIMIT; |
| 641 | time_status |= STA_UNSYNC; | 610 | time_status |= STA_UNSYNC; |
| 642 | } | ||
| 643 | |||
| 644 | /* | ||
| 645 | * Leap second processing. If in leap-insert state at | ||
| 646 | * the end of the day, the system clock is set back one | ||
| 647 | * second; if in leap-delete state, the system clock is | ||
| 648 | * set ahead one second. The microtime() routine or | ||
| 649 | * external clock driver will insure that reported time | ||
| 650 | * is always monotonic. The ugly divides should be | ||
| 651 | * replaced. | ||
| 652 | */ | ||
| 653 | switch (time_state) { | ||
| 654 | |||
| 655 | case TIME_OK: | ||
| 656 | if (time_status & STA_INS) | ||
| 657 | time_state = TIME_INS; | ||
| 658 | else if (time_status & STA_DEL) | ||
| 659 | time_state = TIME_DEL; | ||
| 660 | break; | ||
| 661 | |||
| 662 | case TIME_INS: | ||
| 663 | if (xtime.tv_sec % 86400 == 0) { | ||
| 664 | xtime.tv_sec--; | ||
| 665 | wall_to_monotonic.tv_sec++; | ||
| 666 | /* The timer interpolator will make time change gradually instead | ||
| 667 | * of an immediate jump by one second. | ||
| 668 | */ | ||
| 669 | time_interpolator_update(-NSEC_PER_SEC); | ||
| 670 | time_state = TIME_OOP; | ||
| 671 | clock_was_set(); | ||
| 672 | printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); | ||
| 673 | } | 611 | } |
| 674 | break; | 612 | |
| 675 | 613 | /* | |
| 676 | case TIME_DEL: | 614 | * Leap second processing. If in leap-insert state at the end of the |
| 677 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 615 | * day, the system clock is set back one second; if in leap-delete |
| 678 | xtime.tv_sec++; | 616 | * state, the system clock is set ahead one second. The microtime() |
| 679 | wall_to_monotonic.tv_sec--; | 617 | * routine or external clock driver will insure that reported time is |
| 680 | /* Use of time interpolator for a gradual change of time */ | 618 | * always monotonic. The ugly divides should be replaced. |
| 681 | time_interpolator_update(NSEC_PER_SEC); | 619 | */ |
| 682 | time_state = TIME_WAIT; | 620 | switch (time_state) { |
| 683 | clock_was_set(); | 621 | case TIME_OK: |
| 684 | printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); | 622 | if (time_status & STA_INS) |
| 623 | time_state = TIME_INS; | ||
| 624 | else if (time_status & STA_DEL) | ||
| 625 | time_state = TIME_DEL; | ||
| 626 | break; | ||
| 627 | case TIME_INS: | ||
| 628 | if (xtime.tv_sec % 86400 == 0) { | ||
| 629 | xtime.tv_sec--; | ||
| 630 | wall_to_monotonic.tv_sec++; | ||
| 631 | /* | ||
| 632 | * The timer interpolator will make time change | ||
| 633 | * gradually instead of an immediate jump by one second | ||
| 634 | */ | ||
| 635 | time_interpolator_update(-NSEC_PER_SEC); | ||
| 636 | time_state = TIME_OOP; | ||
| 637 | clock_was_set(); | ||
| 638 | printk(KERN_NOTICE "Clock: inserting leap second " | ||
| 639 | "23:59:60 UTC\n"); | ||
| 640 | } | ||
| 641 | break; | ||
| 642 | case TIME_DEL: | ||
| 643 | if ((xtime.tv_sec + 1) % 86400 == 0) { | ||
| 644 | xtime.tv_sec++; | ||
| 645 | wall_to_monotonic.tv_sec--; | ||
| 646 | /* | ||
| 647 | * Use of time interpolator for a gradual change of | ||
| 648 | * time | ||
| 649 | */ | ||
| 650 | time_interpolator_update(NSEC_PER_SEC); | ||
| 651 | time_state = TIME_WAIT; | ||
| 652 | clock_was_set(); | ||
| 653 | printk(KERN_NOTICE "Clock: deleting leap second " | ||
| 654 | "23:59:59 UTC\n"); | ||
| 655 | } | ||
| 656 | break; | ||
| 657 | case TIME_OOP: | ||
| 658 | time_state = TIME_WAIT; | ||
| 659 | break; | ||
| 660 | case TIME_WAIT: | ||
| 661 | if (!(time_status & (STA_INS | STA_DEL))) | ||
| 662 | time_state = TIME_OK; | ||
| 685 | } | 663 | } |
| 686 | break; | 664 | |
| 687 | 665 | /* | |
| 688 | case TIME_OOP: | 666 | * Compute the phase adjustment for the next second. In PLL mode, the |
| 689 | time_state = TIME_WAIT; | 667 | * offset is reduced by a fixed factor times the time constant. In FLL |
| 690 | break; | 668 | * mode the offset is used directly. In either mode, the maximum phase |
| 691 | 669 | * adjustment for each second is clamped so as to spread the adjustment | |
| 692 | case TIME_WAIT: | 670 | * over not more than the number of seconds between updates. |
| 693 | if (!(time_status & (STA_INS | STA_DEL))) | 671 | */ |
| 694 | time_state = TIME_OK; | ||
| 695 | } | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Compute the phase adjustment for the next second. In | ||
| 699 | * PLL mode, the offset is reduced by a fixed factor | ||
| 700 | * times the time constant. In FLL mode the offset is | ||
| 701 | * used directly. In either mode, the maximum phase | ||
| 702 | * adjustment for each second is clamped so as to spread | ||
| 703 | * the adjustment over not more than the number of | ||
| 704 | * seconds between updates. | ||
| 705 | */ | ||
| 706 | if (time_offset < 0) { | ||
| 707 | ltemp = -time_offset; | ||
| 708 | if (!(time_status & STA_FLL)) | ||
| 709 | ltemp >>= SHIFT_KG + time_constant; | ||
| 710 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | ||
| 711 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | ||
| 712 | time_offset += ltemp; | ||
| 713 | time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | ||
| 714 | } else { | ||
| 715 | ltemp = time_offset; | 672 | ltemp = time_offset; |
| 716 | if (!(time_status & STA_FLL)) | 673 | if (!(time_status & STA_FLL)) |
| 717 | ltemp >>= SHIFT_KG + time_constant; | 674 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); |
| 718 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | 675 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); |
| 719 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | 676 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); |
| 720 | time_offset -= ltemp; | 677 | time_offset -= ltemp; |
| 721 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | 678 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); |
| 722 | } | 679 | |
| 723 | 680 | /* | |
| 724 | /* | 681 | * Compute the frequency estimate and additional phase adjustment due |
| 725 | * Compute the frequency estimate and additional phase | 682 | * to frequency error for the next second. When the PPS signal is |
| 726 | * adjustment due to frequency error for the next | 683 | * engaged, gnaw on the watchdog counter and update the frequency |
| 727 | * second. When the PPS signal is engaged, gnaw on the | 684 | * computed by the pll and the PPS signal. |
| 728 | * watchdog counter and update the frequency computed by | 685 | */ |
| 729 | * the pll and the PPS signal. | 686 | pps_valid++; |
| 730 | */ | 687 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ |
| 731 | pps_valid++; | 688 | pps_jitter = MAXTIME; |
| 732 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | 689 | pps_stabil = MAXFREQ; |
| 733 | pps_jitter = MAXTIME; | 690 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | |
| 734 | pps_stabil = MAXFREQ; | 691 | STA_PPSWANDER | STA_PPSERROR); |
| 735 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | 692 | } |
| 736 | STA_PPSWANDER | STA_PPSERROR); | 693 | ltemp = time_freq + pps_freq; |
| 737 | } | 694 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
| 738 | ltemp = time_freq + pps_freq; | ||
| 739 | if (ltemp < 0) | ||
| 740 | time_adj -= -ltemp >> | ||
| 741 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
| 742 | else | ||
| 743 | time_adj += ltemp >> | ||
| 744 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
| 745 | 695 | ||
| 746 | #if HZ == 100 | 696 | #if HZ == 100 |
| 747 | /* Compensate for (HZ==100) != (1 << SHIFT_HZ). | 697 | /* |
| 748 | * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) | 698 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to |
| 749 | */ | 699 | * get 128.125; => only 0.125% error (p. 14) |
| 750 | if (time_adj < 0) | 700 | */ |
| 751 | time_adj -= (-time_adj >> 2) + (-time_adj >> 5); | 701 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); |
| 752 | else | ||
| 753 | time_adj += (time_adj >> 2) + (time_adj >> 5); | ||
| 754 | #endif | 702 | #endif |
| 755 | #if HZ == 250 | 703 | #if HZ == 250 |
| 756 | /* Compensate for (HZ==250) != (1 << SHIFT_HZ). | 704 | /* |
| 757 | * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) | 705 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and |
| 758 | */ | 706 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) |
| 759 | if (time_adj < 0) | 707 | */ |
| 760 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | 708 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
| 761 | else | ||
| 762 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
| 763 | #endif | 709 | #endif |
| 764 | #if HZ == 1000 | 710 | #if HZ == 1000 |
| 765 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). | 711 | /* |
| 766 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 712 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and |
| 767 | */ | 713 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
| 768 | if (time_adj < 0) | 714 | */ |
| 769 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | 715 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
| 770 | else | ||
| 771 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
| 772 | #endif | 716 | #endif |
| 773 | } | 717 | } |
| 774 | 718 | ||
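The HZ-compensation blocks at the tail of second_overflow() lose their sign branches too. A worked example for the HZ == 100 case, to make the "25% and 3.125%" comment concrete (numbers picked purely for illustration):

    /* SHIFT_HZ treats HZ == 100 as 128, so time_adj needs an extra factor of
     * roughly 128/100 = 1.28.  Adding x/4 + x/32 yields x * 1.28125:
     *
     *      time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
     *
     * e.g. time_adj = 6400  ->  6400 + 1600 + 200 = 8200 ≈ 6400 * 1.28
     */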
| @@ -777,23 +721,20 @@ static void update_wall_time_one_tick(void) | |||
| 777 | { | 721 | { |
| 778 | long time_adjust_step, delta_nsec; | 722 | long time_adjust_step, delta_nsec; |
| 779 | 723 | ||
| 780 | if ( (time_adjust_step = time_adjust) != 0 ) { | 724 | if ((time_adjust_step = time_adjust) != 0 ) { |
| 781 | /* We are doing an adjtime thing. | 725 | /* |
| 782 | * | 726 | * We are doing an adjtime thing. Prepare time_adjust_step to |
| 783 | * Prepare time_adjust_step to be within bounds. | 727 | * be within bounds. Note that a positive time_adjust means we |
| 784 | * Note that a positive time_adjust means we want the clock | 728 | * want the clock to run faster. |
| 785 | * to run faster. | 729 | * |
| 786 | * | 730 | * Limit the amount of the step to be in the range |
| 787 | * Limit the amount of the step to be in the range | 731 | * -tickadj .. +tickadj |
| 788 | * -tickadj .. +tickadj | 732 | */ |
| 789 | */ | 733 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
| 790 | if (time_adjust > tickadj) | 734 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
| 791 | time_adjust_step = tickadj; | 735 | |
| 792 | else if (time_adjust < -tickadj) | 736 | /* Reduce by this step the amount of time left */ |
| 793 | time_adjust_step = -tickadj; | 737 | time_adjust -= time_adjust_step; |
| 794 | |||
| 795 | /* Reduce by this step the amount of time left */ | ||
| 796 | time_adjust -= time_adjust_step; | ||
| 797 | } | 738 | } |
| 798 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 739 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
| 799 | /* | 740 | /* |
| @@ -801,13 +742,8 @@ static void update_wall_time_one_tick(void) | |||
| 801 | * advance the tick more. | 742 | * advance the tick more. |
| 802 | */ | 743 | */ |
| 803 | time_phase += time_adj; | 744 | time_phase += time_adj; |
| 804 | if (time_phase <= -FINENSEC) { | 745 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { |
| 805 | long ltemp = -time_phase >> (SHIFT_SCALE - 10); | 746 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); |
| 806 | time_phase += ltemp << (SHIFT_SCALE - 10); | ||
| 807 | delta_nsec -= ltemp; | ||
| 808 | } | ||
| 809 | else if (time_phase >= FINENSEC) { | ||
| 810 | long ltemp = time_phase >> (SHIFT_SCALE - 10); | ||
| 811 | time_phase -= ltemp << (SHIFT_SCALE - 10); | 747 | time_phase -= ltemp << (SHIFT_SCALE - 10); |
| 812 | delta_nsec += ltemp; | 748 | delta_nsec += ltemp; |
| 813 | } | 749 | } |
| @@ -1137,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
| 1137 | if (timeout < 0) | 1073 | if (timeout < 0) |
| 1138 | { | 1074 | { |
| 1139 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1075 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
| 1140 | "value %lx from %p\n", timeout, | 1076 | "value %lx from %p\n", timeout, |
| 1141 | __builtin_return_address(0)); | 1077 | __builtin_return_address(0)); |
| 1142 | current->state = TASK_RUNNING; | 1078 | current->state = TASK_RUNNING; |
| 1143 | goto out; | 1079 | goto out; |
| 1144 | } | 1080 | } |
| @@ -1146,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
| 1146 | 1082 | ||
| 1147 | expire = timeout + jiffies; | 1083 | expire = timeout + jiffies; |
| 1148 | 1084 | ||
| 1149 | init_timer(&timer); | 1085 | setup_timer(&timer, process_timeout, (unsigned long)current); |
| 1150 | timer.expires = expire; | 1086 | __mod_timer(&timer, expire); |
| 1151 | timer.data = (unsigned long) current; | ||
| 1152 | timer.function = process_timeout; | ||
| 1153 | |||
| 1154 | add_timer(&timer); | ||
| 1155 | schedule(); | 1087 | schedule(); |
| 1156 | del_singleshot_timer_sync(&timer); | 1088 | del_singleshot_timer_sync(&timer); |
| 1157 | 1089 | ||
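schedule_timeout() now arms its timer with setup_timer() and __mod_timer() instead of open-coding the field assignments. A minimal sketch of what such a helper amounts to, assuming the inline introduced alongside this series:

    /* Sketch: initialise a timer and wire up its callback in one call. */
    static inline void setup_timer(struct timer_list *timer,
                                   void (*function)(unsigned long),
                                   unsigned long data)
    {
            timer->function = function;
            timer->data = data;
            init_timer(timer);
    }

With the TIMER_MAGIC poisoning gone (see the check_timer() removal above), going through init_timer()/setup_timer() is the only initialisation a timer gets before it is added.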
| @@ -1168,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout); | |||
| 1168 | */ | 1100 | */ |
| 1169 | signed long __sched schedule_timeout_interruptible(signed long timeout) | 1101 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
| 1170 | { | 1102 | { |
| 1171 | __set_current_state(TASK_INTERRUPTIBLE); | 1103 | __set_current_state(TASK_INTERRUPTIBLE); |
| 1172 | return schedule_timeout(timeout); | 1104 | return schedule_timeout(timeout); |
| 1173 | } | 1105 | } |
| 1174 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1106 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
| 1175 | 1107 | ||
| 1176 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1108 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
| 1177 | { | 1109 | { |
| 1178 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1110 | __set_current_state(TASK_UNINTERRUPTIBLE); |
| 1179 | return schedule_timeout(timeout); | 1111 | return schedule_timeout(timeout); |
| 1180 | } | 1112 | } |
| 1181 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1113 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
| 1182 | 1114 | ||
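The two wrappers differ only in the task state they set before delegating to schedule_timeout(). A typical call site, for illustration (msecs_to_jiffies() assumed available as usual):

    /* Sleep ~100 ms; returns the remaining jiffies if a signal cut it short. */
    signed long left = schedule_timeout_interruptible(msecs_to_jiffies(100));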
| @@ -1516,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) | |||
| 1516 | if (!time_interpolator) | 1448 | if (!time_interpolator) |
| 1517 | return; | 1449 | return; |
| 1518 | 1450 | ||
| 1519 | /* The interpolator compensates for late ticks by accumulating | 1451 | /* |
| 1520 | * the late time in time_interpolator->offset. A tick earlier than | 1452 | * The interpolator compensates for late ticks by accumulating the late |
| 1521 | * expected will lead to a reset of the offset and a corresponding | 1453 | * time in time_interpolator->offset. A tick earlier than expected will |
| 1522 | * jump of the clock forward. Again this only works if the | 1454 | * lead to a reset of the offset and a corresponding jump of the clock |
| 1523 | * interpolator clock is running slightly slower than the regular clock | 1455 | * forward. Again this only works if the interpolator clock is running |
| 1524 | * and the tuning logic insures that. | 1456 | * slightly slower than the regular clock and the tuning logic insures |
| 1525 | */ | 1457 | * that. |
| 1458 | */ | ||
| 1526 | 1459 | ||
| 1527 | counter = time_interpolator_get_counter(1); | 1460 | counter = time_interpolator_get_counter(1); |
| 1528 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | 1461 | offset = time_interpolator->offset + |
| 1462 | GET_TI_NSECS(counter, time_interpolator); | ||
| 1529 | 1463 | ||
| 1530 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1464 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
| 1531 | time_interpolator->offset = offset - delta_nsec; | 1465 | time_interpolator->offset = offset - delta_nsec; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..7cee222231bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -12,6 +12,8 @@ | |||
| 12 | * Andrew Morton <andrewm@uow.edu.au> | 12 | * Andrew Morton <andrewm@uow.edu.au> |
| 13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | 13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> |
| 14 | * Theodore Ts'o <tytso@mit.edu> | 14 | * Theodore Ts'o <tytso@mit.edu> |
| 15 | * | ||
| 16 | * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. | ||
| 15 | */ | 17 | */ |
| 16 | 18 | ||
| 17 | #include <linux/module.h> | 19 | #include <linux/module.h> |
| @@ -57,7 +59,7 @@ struct cpu_workqueue_struct { | |||
| 57 | * per-CPU workqueues: | 59 | * per-CPU workqueues: |
| 58 | */ | 60 | */ |
| 59 | struct workqueue_struct { | 61 | struct workqueue_struct { |
| 60 | struct cpu_workqueue_struct cpu_wq[NR_CPUS]; | 62 | struct cpu_workqueue_struct *cpu_wq; |
| 61 | const char *name; | 63 | const char *name; |
| 62 | struct list_head list; /* Empty if single thread */ | 64 | struct list_head list; /* Empty if single thread */ |
| 63 | }; | 65 | }; |
| @@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
| 102 | if (unlikely(is_single_threaded(wq))) | 104 | if (unlikely(is_single_threaded(wq))) |
| 103 | cpu = 0; | 105 | cpu = 0; |
| 104 | BUG_ON(!list_empty(&work->entry)); | 106 | BUG_ON(!list_empty(&work->entry)); |
| 105 | __queue_work(wq->cpu_wq + cpu, work); | 107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
| 106 | ret = 1; | 108 | ret = 1; |
| 107 | } | 109 | } |
| 108 | put_cpu(); | 110 | put_cpu(); |
| @@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
| 118 | if (unlikely(is_single_threaded(wq))) | 120 | if (unlikely(is_single_threaded(wq))) |
| 119 | cpu = 0; | 121 | cpu = 0; |
| 120 | 122 | ||
| 121 | __queue_work(wq->cpu_wq + cpu, work); | 123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
| 122 | } | 124 | } |
| 123 | 125 | ||
| 124 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 126 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
| @@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
| 265 | 267 | ||
| 266 | if (is_single_threaded(wq)) { | 268 | if (is_single_threaded(wq)) { |
| 267 | /* Always use cpu 0's area. */ | 269 | /* Always use cpu 0's area. */ |
| 268 | flush_cpu_workqueue(wq->cpu_wq + 0); | 270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); |
| 269 | } else { | 271 | } else { |
| 270 | int cpu; | 272 | int cpu; |
| 271 | 273 | ||
| 272 | lock_cpu_hotplug(); | 274 | lock_cpu_hotplug(); |
| 273 | for_each_online_cpu(cpu) | 275 | for_each_online_cpu(cpu) |
| 274 | flush_cpu_workqueue(wq->cpu_wq + cpu); | 276 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); |
| 275 | unlock_cpu_hotplug(); | 277 | unlock_cpu_hotplug(); |
| 276 | } | 278 | } |
| 277 | } | 279 | } |
| @@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
| 279 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 281 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
| 280 | int cpu) | 282 | int cpu) |
| 281 | { | 283 | { |
| 282 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 284 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
| 283 | struct task_struct *p; | 285 | struct task_struct *p; |
| 284 | 286 | ||
| 285 | spin_lock_init(&cwq->lock); | 287 | spin_lock_init(&cwq->lock); |
| @@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 312 | if (!wq) | 314 | if (!wq) |
| 313 | return NULL; | 315 | return NULL; |
| 314 | 316 | ||
| 317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | ||
| 315 | wq->name = name; | 318 | wq->name = name; |
| 316 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 319 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
| 317 | lock_cpu_hotplug(); | 320 | lock_cpu_hotplug(); |
| @@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | |||
| 353 | unsigned long flags; | 356 | unsigned long flags; |
| 354 | struct task_struct *p; | 357 | struct task_struct *p; |
| 355 | 358 | ||
| 356 | cwq = wq->cpu_wq + cpu; | 359 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
| 357 | spin_lock_irqsave(&cwq->lock, flags); | 360 | spin_lock_irqsave(&cwq->lock, flags); |
| 358 | p = cwq->thread; | 361 | p = cwq->thread; |
| 359 | cwq->thread = NULL; | 362 | cwq->thread = NULL; |
| @@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 380 | spin_unlock(&workqueue_lock); | 383 | spin_unlock(&workqueue_lock); |
| 381 | } | 384 | } |
| 382 | unlock_cpu_hotplug(); | 385 | unlock_cpu_hotplug(); |
| 386 | free_percpu(wq->cpu_wq); | ||
| 383 | kfree(wq); | 387 | kfree(wq); |
| 384 | } | 388 | } |
| 385 | 389 | ||
| @@ -458,7 +462,7 @@ int current_is_keventd(void) | |||
| 458 | 462 | ||
| 459 | BUG_ON(!keventd_wq); | 463 | BUG_ON(!keventd_wq); |
| 460 | 464 | ||
| 461 | cwq = keventd_wq->cpu_wq + cpu; | 465 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); |
| 462 | if (current == cwq->thread) | 466 | if (current == cwq->thread) |
| 463 | ret = 1; | 467 | ret = 1; |
| 464 | 468 | ||
| @@ -470,7 +474,7 @@ int current_is_keventd(void) | |||
| 470 | /* Take the work from this (downed) CPU. */ | 474 | /* Take the work from this (downed) CPU. */ |
| 471 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 475 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
| 472 | { | 476 | { |
| 473 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 477 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
| 474 | LIST_HEAD(list); | 478 | LIST_HEAD(list); |
| 475 | struct work_struct *work; | 479 | struct work_struct *work; |
| 476 | 480 | ||
| @@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
| 481 | printk("Taking work for %s\n", wq->name); | 485 | printk("Taking work for %s\n", wq->name); |
| 482 | work = list_entry(list.next,struct work_struct,entry); | 486 | work = list_entry(list.next,struct work_struct,entry); |
| 483 | list_del(&work->entry); | 487 | list_del(&work->entry); |
| 484 | __queue_work(wq->cpu_wq + smp_processor_id(), work); | 488 | __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); |
| 485 | } | 489 | } |
| 486 | spin_unlock_irq(&cwq->lock); | 490 | spin_unlock_irq(&cwq->lock); |
| 487 | } | 491 | } |
| @@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 508 | case CPU_ONLINE: | 512 | case CPU_ONLINE: |
| 509 | /* Kick off worker threads. */ | 513 | /* Kick off worker threads. */ |
| 510 | list_for_each_entry(wq, &workqueues, list) { | 514 | list_for_each_entry(wq, &workqueues, list) { |
| 511 | kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); | 515 | struct cpu_workqueue_struct *cwq; |
| 512 | wake_up_process(wq->cpu_wq[hotcpu].thread); | 516 | |
| 517 | cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); | ||
| 518 | kthread_bind(cwq->thread, hotcpu); | ||
| 519 | wake_up_process(cwq->thread); | ||
| 513 | } | 520 | } |
| 514 | break; | 521 | break; |
| 515 | 522 | ||
| 516 | case CPU_UP_CANCELED: | 523 | case CPU_UP_CANCELED: |
| 517 | list_for_each_entry(wq, &workqueues, list) { | 524 | list_for_each_entry(wq, &workqueues, list) { |
| 518 | /* Unbind so it can run. */ | 525 | /* Unbind so it can run. */ |
| 519 | kthread_bind(wq->cpu_wq[hotcpu].thread, | 526 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
| 520 | smp_processor_id()); | 527 | smp_processor_id()); |
| 521 | cleanup_workqueue_thread(wq, hotcpu); | 528 | cleanup_workqueue_thread(wq, hotcpu); |
| 522 | } | 529 | } |
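The workqueue conversion replaces the NR_CPUS-sized embedded array with a dynamically allocated per-CPU area, so every access now goes through per_cpu_ptr(). The allocate/use/free pattern, sketched outside the workqueue code (struct and function names here are illustrative):

    #include <linux/percpu.h>
    #include <linux/smp.h>
    #include <linux/errno.h>

    struct my_stats {
            unsigned long count;            /* illustrative payload */
    };

    static struct my_stats *my_stats;       /* per-CPU cookie, not a plain pointer */

    static int my_stats_init(void)
    {
            my_stats = alloc_percpu(struct my_stats);   /* one zeroed copy per CPU */
            return my_stats ? 0 : -ENOMEM;
    }

    static void my_stats_bump(void)
    {
            int cpu = get_cpu();            /* pin to a CPU, like queue_work() does */
            per_cpu_ptr(my_stats, cpu)->count++;
            put_cpu();
    }

    static void my_stats_exit(void)
    {
            free_percpu(my_stats);
    }

Sizing the area at runtime is what lets __create_workqueue() drop the static NR_CPUS array and lets destroy_workqueue() hand the memory back with free_percpu().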
