Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            |    2
-rw-r--r--  kernel/acct.c              |    2
-rw-r--r--  kernel/audit.c             |    6
-rw-r--r--  kernel/auditsc.c           |    2
-rw-r--r--  kernel/cpu.c               |    1
-rw-r--r--  kernel/cpuset.c            |  466
-rw-r--r--  kernel/exit.c              |   34
-rw-r--r--  kernel/fork.c              |   31
-rw-r--r--  kernel/futex.c             |    6
-rw-r--r--  kernel/kallsyms.c          |    1
-rw-r--r--  kernel/kexec.c             |   11
-rw-r--r--  kernel/kmod.c              |    6
-rw-r--r--  kernel/kprobes.c           |    1
-rw-r--r--  kernel/kthread.c           |   13
-rw-r--r--  kernel/params.c            |    1
-rw-r--r--  kernel/posix-cpu-timers.c  |   16
-rw-r--r--  kernel/posix-timers.c      |   19
-rw-r--r--  kernel/power/Makefile      |    2
-rw-r--r--  kernel/power/disk.c        |   22
-rw-r--r--  kernel/power/main.c        |    5
-rw-r--r--  kernel/power/power.h       |   17
-rw-r--r--  kernel/power/snapshot.c    |  435
-rw-r--r--  kernel/power/swsusp.c      |  569
-rw-r--r--  kernel/printk.c            |   78
-rw-r--r--  kernel/ptrace.c            |    7
-rw-r--r--  kernel/rcupdate.c          |   10
-rw-r--r--  kernel/rcutorture.c        |  492
-rw-r--r--  kernel/sched.c             |    3
-rw-r--r--  kernel/signal.c            |  150
-rw-r--r--  kernel/time.c              |   26
-rw-r--r--  kernel/timer.c             |  337
-rw-r--r--  kernel/workqueue.c         |   33
32 files changed, 1661 insertions, 1143 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ff4dc02ce170..4f5a1453093a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
@@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index b756f527497e..2e3f4a47e7d0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk)
 		if (delta == 0)
 			return;
 		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
 }
diff --git a/kernel/audit.c b/kernel/audit.c
index aefa73a8a586..0c56320d38dc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -133,7 +133,7 @@ struct audit_buffer {
 	struct list_head list;
 	struct sk_buff *skb; /* formatted skb ready to send */
 	struct audit_context *ctx; /* NULL or associated context */
-	int gfp_mask;
+	gfp_t gfp_mask;
 };
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
  * will be written at syscall exit. If there is no associated task, tsk
  * should be NULL. */
 
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask,
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 				     int type)
 {
 	struct audit_buffer *ab = NULL;
@@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record. This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end. It may be
  * called in any context. */
-void audit_log(struct audit_context *ctx, int gfp_mask, int type,
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	       const char *fmt, ...)
 {
 	struct audit_buffer *ab;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 88696f639aab..d8a68509e729 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
 	up_read(&mm->mmap_sem);
 }
 
-static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask)
+static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 {
 	int i;
 	struct audit_buffer *ab;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53d8263ae12e..3619e939182e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
 
 /* This protects CPUs going up and down... */
 DECLARE_MUTEX(cpucontrol);
+EXPORT_SYMBOL_GPL(cpucontrol);
 
 static struct notifier_block *cpu_chain;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 28176d083f7b..5a737ed9dac7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -32,6 +32,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/list.h>
+#include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -60,6 +61,9 @@ struct cpuset {
 	cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
 
+	/*
+	 * Count is atomic so can incr (fork) or decr (exit) without a lock.
+	 */
 	atomic_t count; /* count tasks using this cpuset */
 
 	/*
@@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount;
 static struct super_block *cpuset_sb = NULL;
 
 /*
- * cpuset_sem should be held by anyone who is depending on the children
- * or sibling lists of any cpuset, or performing non-atomic operations
- * on the flags or *_allowed values of a cpuset, such as raising the
- * CS_REMOVED flag bit iff it is not already raised, or reading and
- * conditionally modifying the *_allowed values. One kernel global
- * cpuset semaphore should be sufficient - these things don't change
- * that much.
- *
- * The code that modifies cpusets holds cpuset_sem across the entire
- * operation, from cpuset_common_file_write() down, single threading
- * all cpuset modifications (except for counter manipulations from
- * fork and exit) across the system. This presumes that cpuset
- * modifications are rare - better kept simple and safe, even if slow.
- *
- * The code that reads cpusets, such as in cpuset_common_file_read()
- * and below, only holds cpuset_sem across small pieces of code, such
- * as when reading out possibly multi-word cpumasks and nodemasks, as
- * the risks are less, and the desire for performance a little greater.
- * The proc_cpuset_show() routine needs to hold cpuset_sem to insure
- * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
- *
- * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
- * (usually) grab cpuset_sem. These are the two most performance
- * critical pieces of code here. The exception occurs on exit(),
- * when a task in a notify_on_release cpuset exits. Then cpuset_sem
+ * We have two global cpuset semaphores below. They can nest.
+ * It is ok to first take manage_sem, then nest callback_sem. We also
+ * require taking task_lock() when dereferencing a tasks cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both semaphores to modify cpusets. If a task
+ * holds manage_sem, then it blocks others wanting that semaphore,
+ * ensuring that it is the only task able to also acquire callback_sem
+ * and be able to modify cpusets. It can perform various checks on
+ * the cpuset structure first, knowing nothing will change. It can
+ * also allocate memory while just holding manage_sem. While it is
+ * performing these checks, various callback routines can briefly
+ * acquire callback_sem to query cpusets. Once it is ready to make
+ * the changes, it takes callback_sem, blocking everyone else.
+ *
+ * Calls to the kernel memory allocator can not be made while holding
+ * callback_sem, as that would risk double tripping on callback_sem
+ * from one of the callbacks into the cpuset code from within
+ * __alloc_pages().
+ *
+ * If a task is only holding callback_sem, then it has read-only
+ * access to cpusets.
+ *
+ * The task_struct fields mems_allowed and mems_generation may only
+ * be accessed in the context of that task, so require no locks.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding manage_sem or callback_sem can't rely
+ * on the count field not changing. However, if the count goes to
+ * zero, then only attach_task(), which holds both semaphores, can
+ * increment it again. Because a count of zero means that no tasks
+ * are currently attached, therefore there is no way a task attached
+ * to that cpuset can fork (the other way to increment the count).
+ * So code holding manage_sem or callback_sem can safely assume that
+ * if the count is zero, it will stay zero. Similarly, if a task
+ * holds manage_sem or callback_sem on a cpuset with zero count, it
+ * knows that the cpuset won't be removed, as cpuset_rmdir() needs
+ * both of those semaphores.
+ *
+ * A possible optimization to improve parallelism would be to make
+ * callback_sem a R/W semaphore (rwsem), allowing the callback routines
+ * to proceed in parallel, with read access, until the holder of
+ * manage_sem needed to take this rwsem for exclusive write access
+ * and modify some cpusets.
+ *
+ * The cpuset_common_file_write handler for operations that modify
+ * the cpuset hierarchy holds manage_sem across the entire operation,
+ * single threading all such cpuset modifications across the system.
+ *
+ * The cpuset_common_file_read() handlers only hold callback_sem across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ *
+ * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
+ * (usually) take either semaphore. These are the two most performance
+ * critical pieces of code here. The exception occurs on cpuset_exit(),
+ * when a task in a notify_on_release cpuset exits. Then manage_sem
  * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
  *
- * A cpuset can only be deleted if both its 'count' of using tasks is
- * zero, and its list of 'children' cpusets is empty. Since all tasks
- * in the system use _some_ cpuset, and since there is always at least
- * one task in the system (init, pid == 1), therefore, top_cpuset
- * always has either children cpusets and/or using tasks. So no need
- * for any special hack to ensure that top_cpuset cannot be deleted.
+ * A cpuset can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cpusets is empty. Since all
+ * tasks in the system use _some_ cpuset, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * always has either children cpusets and/or using tasks. So we don't
+ * need a special hack to ensure that top_cpuset cannot be deleted.
+ *
+ * The above "Tale of Two Semaphores" would be complete, but for:
+ *
+ *	The task_lock() exception
+ *
+ * The need for this exception arises from the action of attach_task(),
+ * which overwrites one tasks cpuset pointer with another. It does
+ * so using both semaphores, however there are several performance
+ * critical places that need to reference task->cpuset without the
+ * expense of grabbing a system global semaphore. Therefore except as
+ * noted below, when dereferencing or, as in attach_task(), modifying
+ * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
+ * (task->alloc_lock) already in the task_struct routinely used for
+ * such matters.
  */
 
-static DECLARE_MUTEX(cpuset_sem);
-static struct task_struct *cpuset_sem_owner;
-static int cpuset_sem_depth;
-
-/*
- * The global cpuset semaphore cpuset_sem can be needed by the
- * memory allocator to update a tasks mems_allowed (see the calls
- * to cpuset_update_current_mems_allowed()) or to walk up the
- * cpuset hierarchy to find a mem_exclusive cpuset see the calls
- * to cpuset_excl_nodes_overlap()).
- *
- * But if the memory allocation is being done by cpuset.c code, it
- * usually already holds cpuset_sem. Double tripping on a kernel
- * semaphore deadlocks the current task, and any other task that
- * subsequently tries to obtain the lock.
- *
- * Run all up's and down's on cpuset_sem through the following
- * wrappers, which will detect this nested locking, and avoid
- * deadlocking.
- */
-
-static inline void cpuset_down(struct semaphore *psem)
-{
-	if (cpuset_sem_owner != current) {
-		down(psem);
-		cpuset_sem_owner = current;
-	}
-	cpuset_sem_depth++;
-}
-
-static inline void cpuset_up(struct semaphore *psem)
-{
-	if (--cpuset_sem_depth == 0) {
-		cpuset_sem_owner = NULL;
-		up(psem);
-	}
-}
+static DECLARE_MUTEX(manage_sem);
+static DECLARE_MUTEX(callback_sem);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 }
 
 /*
- * Call with cpuset_sem held. Writes path of cpuset into buf.
+ * Call with manage_sem held. Writes path of cpuset into buf.
  * Returns 0 on success, -errno on error.
  */
 
@@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * status of the /sbin/cpuset_release_agent task, so no sense holding
  * our caller up for that.
  *
- * The simple act of forking that task might require more memory,
- * which might need cpuset_sem. So this routine must be called while
- * cpuset_sem is not held, to avoid a possible deadlock. See also
- * comments for check_for_release(), below.
+ * When we had only one cpuset semaphore, we had to call this
+ * without holding it, to avoid deadlock when call_usermodehelper()
+ * allocated memory. With two locks, we could now call this while
+ * holding manage_sem, but we still don't, so as to minimize
+ * the time manage_sem is held.
  */
 
 static void cpuset_release_agent(const char *pathbuf)
@@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf)
  * cs is notify_on_release() and now both the user count is zero and
  * the list of children is empty, prepare cpuset path in a kmalloc'd
  * buffer, to be returned via ppathbuf, so that the caller can invoke
- * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
- * Call here with cpuset_sem held.
+ * cpuset_release_agent() with it later on, once manage_sem is dropped.
+ * Call here with manage_sem held.
  *
  * This check_for_release() routine is responsible for kmalloc'ing
  * pathbuf. The above cpuset_release_agent() is responsible for
  * kfree'ing pathbuf. The caller of these routines is responsible
  * for providing a pathbuf pointer, initialized to NULL, then
- * calling check_for_release() with cpuset_sem held and the address
- * of the pathbuf pointer, then dropping cpuset_sem, then calling
+ * calling check_for_release() with manage_sem held and the address
+ * of the pathbuf pointer, then dropping manage_sem, then calling
  * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
@@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_map.
  *
- * Call with cpuset_sem held.
+ * Call with callback_sem held.
  */
 
 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
  * One way or another, we guarantee to return some non-empty subset
 * of node_online_map.
 *
- * Call with cpuset_sem held.
+ * Call with callback_sem held.
 */
 
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 }
 
 /*
- * Refresh current tasks mems_allowed and mems_generation from
- * current tasks cpuset. Call with cpuset_sem held.
+ * Refresh current tasks mems_allowed and mems_generation from current
+ * tasks cpuset.
  *
- * This routine is needed to update the per-task mems_allowed
- * data, within the tasks context, when it is trying to allocate
- * memory (in various mm/mempolicy.c routines) and notices
- * that some other task has been modifying its cpuset.
+ * Call without callback_sem or task_lock() held. May be called with
+ * or without manage_sem held. Will acquire task_lock() and might
+ * acquire callback_sem during call.
+ *
+ * The task_lock() is required to dereference current->cpuset safely.
+ * Without it, we could pick up the pointer value of current->cpuset
+ * in one instruction, and then attach_task could give us a different
+ * cpuset, and then the cpuset we had could be removed and freed,
+ * and then on our next instruction, we could dereference a no longer
+ * valid cpuset pointer to get its mems_generation field.
+ *
+ * This routine is needed to update the per-task mems_allowed data,
+ * within the tasks context, when it is trying to allocate memory
+ * (in various mm/mempolicy.c routines) and notices that some other
+ * task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
 {
-	struct cpuset *cs = current->cpuset;
+	int my_cpusets_mem_gen;
+
+	task_lock(current);
+	my_cpusets_mem_gen = current->cpuset->mems_generation;
+	task_unlock(current);
 
-	if (current->cpuset_mems_generation != cs->mems_generation) {
+	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
+		struct cpuset *cs;
+		nodemask_t oldmem = current->mems_allowed;
+
+		down(&callback_sem);
+		task_lock(current);
+		cs = current->cpuset;
 		guarantee_online_mems(cs, &current->mems_allowed);
 		current->cpuset_mems_generation = cs->mems_generation;
+		task_unlock(current);
+		up(&callback_sem);
+		if (!nodes_equal(oldmem, current->mems_allowed))
+			numa_policy_rebind(&oldmem, &current->mems_allowed);
 	}
 }
 
@@ -579,7 +620,7 @@ static void refresh_mems(void)
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set.
+ * are only set if the other's are set. Call holding manage_sem.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid? Presumes
- * cpuset_sem held.
+ * manage_sem held.
  *
  * 'cur' is the address of an actual, in-use cpuset. Operations
  * such as list traversal that depend on the actual address of the
@@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  * exclusive child cpusets
  * Build these two partitions by calling partition_sched_domains
  *
- * Call with cpuset_sem held. May nest a call to the
+ * Call with manage_sem held. May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
@@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur)
 	unlock_cpu_hotplug();
 }
 
+/*
+ * Call with manage_sem held. May take callback_sem during call.
+ */
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
@@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	down(&callback_sem);
 	cs->cpus_allowed = trialcs.cpus_allowed;
+	up(&callback_sem);
 	if (is_cpu_exclusive(cs) && !cpus_unchanged)
 		update_cpu_domains(cs);
 	return 0;
 }
 
+/*
+ * Call with manage_sem held. May take callback_sem during call.
+ */
+
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
@@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		return -ENOSPC;
 	retval = validate_change(cs, &trialcs);
 	if (retval == 0) {
+		down(&callback_sem);
 		cs->mems_allowed = trialcs.mems_allowed;
 		atomic_inc(&cpuset_mems_generation);
 		cs->mems_generation = atomic_read(&cpuset_mems_generation);
+		up(&callback_sem);
 	}
 	return retval;
 }
@@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
  * CS_NOTIFY_ON_RELEASE)
  * cs: the cpuset to update
  * buf: the buffer where we read the 0 or 1
+ *
+ * Call with manage_sem held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		return err;
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+	down(&callback_sem);
 	if (turning_on)
 		set_bit(bit, &cs->flags);
 	else
 		clear_bit(bit, &cs->flags);
+	up(&callback_sem);
 
 	if (cpu_exclusive_changed)
 		update_cpu_domains(cs);
 	return 0;
 }
 
+/*
+ * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
+ * writing the path of the old cpuset in 'ppathbuf' if it needs to be
+ * notified on release.
+ *
+ * Call holding manage_sem. May take callback_sem and task_lock of
+ * the task 'pid' during call.
+ */
+
 static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 {
 	pid_t pid;
@@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	read_lock(&tasklist_lock);
 
 	tsk = find_task_by_pid(pid);
-	if (!tsk) {
+	if (!tsk || tsk->flags & PF_EXITING) {
 		read_unlock(&tasklist_lock);
 		return -ESRCH;
 	}
@@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		get_task_struct(tsk);
 	}
 
+	down(&callback_sem);
+
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
 	if (!oldcs) {
 		task_unlock(tsk);
+		up(&callback_sem);
 		put_task_struct(tsk);
 		return -ESRCH;
 	}
@@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	guarantee_online_cpus(cs, &cpus);
 	set_cpus_allowed(tsk, cpus);
 
+	up(&callback_sem);
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
 		check_for_release(oldcs, ppathbuf);
@@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0; /* nul-terminate */
 
-	cpuset_down(&cpuset_sem);
+	down(&manage_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -901,7 +971,7 @@
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	cpuset_up(&cpuset_sem);
+	up(&manage_sem);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	cpuset_down(&cpuset_sem);
+	down(&callback_sem);
 	mask = cs->cpus_allowed;
-	cpuset_up(&cpuset_sem);
+	up(&callback_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	cpuset_down(&cpuset_sem);
+	down(&callback_sem);
 	mask = cs->mems_allowed;
-	cpuset_up(&cpuset_sem);
+	up(&callback_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -995,7 +1065,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 		goto out;
 	}
 	*s++ = '\n';
-	*s = '\0';
 
 	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
 out:
@@ -1048,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * cpuset_rename - Only allow simple rename of directories in place.
+ */
+static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (!S_ISDIR(old_dentry->d_inode->i_mode))
+		return -ENOTDIR;
+	if (new_dentry->d_inode)
+		return -EEXIST;
+	if (old_dir != new_dir)
+		return -EIO;
+	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
@@ -1060,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = {
 	.lookup = simple_lookup,
 	.mkdir = cpuset_mkdir,
 	.rmdir = cpuset_rmdir,
+	.rename = cpuset_rename,
 };
 
 static int cpuset_create_file(struct dentry *dentry, int mode)
@@ -1163,7 +1248,9 @@ struct ctr_struct {
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
- * Return actual number of pids loaded.
+ * Return actual number of pids loaded. No need to task_lock(p)
+ * when reading out p->cpuset, as we don't really care if it changes
+ * on the next cycle, and we are not going to try to dereference it.
  */
 static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
 {
@@ -1205,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
 	return cnt;
 }
 
+/*
+ * Handle an open on 'tasks' file. Prepare a buffer listing the
+ * process id's of tasks currently attached to the cpuset being opened.
+ *
+ * Does not require any specific cpuset semaphores, and does not take any.
+ */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1352,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	cpuset_down(&cpuset_sem);
+	down(&manage_sem);
+	refresh_mems();
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1366,25 +1460,27 @@ static long cpuset_create(...)
 
 	cs->parent = parent;
 
+	down(&callback_sem);
 	list_add(&cs->sibling, &cs->parent->children);
+	up(&callback_sem);
 
 	err = cpuset_create_dir(cs, name, mode);
 	if (err < 0)
 		goto err;
 
 	/*
-	 * Release cpuset_sem before cpuset_populate_dir() because it
+	 * Release manage_sem before cpuset_populate_dir() because it
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	cpuset_up(&cpuset_sem);
+	up(&manage_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	cpuset_up(&cpuset_sem);
+	up(&manage_sem);
 	kfree(cs);
 	return err;
 }
@@ -1406,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_sem already */
 
-	cpuset_down(&cpuset_sem);
+	down(&manage_sem);
+	refresh_mems();
 	if (atomic_read(&cs->count) > 0) {
-		cpuset_up(&cpuset_sem);
+		up(&manage_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		cpuset_up(&cpuset_sem);
+		up(&manage_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
+	down(&callback_sem);
 	set_bit(CS_REMOVED, &cs->flags);
 	if (is_cpu_exclusive(cs))
 		update_cpu_domains(cs);
 	list_del(&cs->sibling); /* delete my sibling from parent->children */
-	if (list_empty(&parent->children))
-		check_for_release(parent, &pathbuf);
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	cpuset_up(&cpuset_sem);
+	up(&callback_sem);
+	if (list_empty(&parent->children))
+		check_for_release(parent, &pathbuf);
+	up(&manage_sem);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1488,16 +1587,26 @@ void __init cpuset_init_smp(void)
  * cpuset_fork - attach newly forked task to its parents cpuset.
  * @tsk: pointer to task_struct of forking parent process.
  *
- * Description: By default, on fork, a task inherits its
- * parent's cpuset. The pointer to the shared cpuset is
- * automatically copied in fork.c by dup_task_struct().
- * This cpuset_fork() routine need only increment the usage
- * counter in that cpuset.
+ * Description: A task inherits its parent's cpuset at fork().
+ *
+ * A pointer to the shared cpuset was automatically copied in fork.c
+ * by dup_task_struct(). However, we ignore that copy, since it was
+ * not made under the protection of task_lock(), so might no longer be
+ * a valid cpuset pointer. attach_task() might have already changed
+ * current->cpuset, allowing the previously referenced cpuset to
+ * be removed and freed. Instead, we task_lock(current) and copy
+ * its present value of current->cpuset for our freshly forked child.
+ *
+ * At the point that cpuset_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
  **/
 
-void cpuset_fork(struct task_struct *tsk)
+void cpuset_fork(struct task_struct *child)
 {
-	atomic_inc(&tsk->cpuset->count);
+	task_lock(current);
+	child->cpuset = current->cpuset;
+	atomic_inc(&child->cpuset->count);
+	task_unlock(current);
 }
 
 /**
@@ -1506,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk)
  *
  * Description: Detach cpuset from @tsk and release it.
  *
- * Note that cpusets marked notify_on_release force every task
- * in them to take the global cpuset_sem semaphore when exiting.
- * This could impact scaling on very large systems. Be reluctant
- * to use notify_on_release cpusets where very high task exit
- * scaling is required on large systems.
+ * Note that cpusets marked notify_on_release force every task in
+ * them to take the global manage_sem semaphore when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cpusets where very high task exit scaling
+ * is required on large systems.
  *
- * Don't even think about derefencing 'cs' after the cpuset use
- * count goes to zero, except inside a critical section guarded
- * by the cpuset_sem semaphore. If you don't hold cpuset_sem,
- * then a zero cpuset use count is a license to any other task to
- * nuke the cpuset immediately.
+ * Don't even think about derefencing 'cs' after the cpuset use count
+ * goes to zero, except inside a critical section guarded by manage_sem
+ * or callback_sem. Otherwise a zero cpuset use count is a license to
+ * any other task to nuke the cpuset immediately, via cpuset_rmdir().
+ *
+ * This routine has to take manage_sem, not callback_sem, because
+ * it is holding that semaphore while calling check_for_release(),
+ * which calls kmalloc(), so can't be called holding callback__sem().
+ *
+ * We don't need to task_lock() this reference to tsk->cpuset,
+ * because tsk is already marked PF_EXITING, so attach_task() won't
+ * mess with it.
  **/
 
 void cpuset_exit(struct task_struct *tsk)
 {
 	struct cpuset *cs;
 
-	task_lock(tsk);
+	BUG_ON(!(tsk->flags & PF_EXITING));
+
 	cs = tsk->cpuset;
 	tsk->cpuset = NULL;
-	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		cpuset_down(&cpuset_sem);
+		down(&manage_sem);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		cpuset_up(&cpuset_sem);
+		up(&manage_sem);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -1555,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	cpuset_down(&cpuset_sem);
+	down(&callback_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	cpuset_up(&cpuset_sem);
+	up(&callback_sem);
 
 	return mask;
 }
@@ -1575,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void)
  * If the current tasks cpusets mems_allowed changed behind our backs,
  * update current->mems_allowed and mems_generation to the new value.
  * Do not call this routine if in_interrupt().
+ *
+ * Call without callback_sem or task_lock() held. May be called
+ * with or without manage_sem held. Unless exiting, it will acquire
+ * task_lock(). Also might acquire callback_sem during call to
+ * refresh_mems().
  */
 
 void cpuset_update_current_mems_allowed(void)
 {
-	struct cpuset *cs = current->cpuset;
+	struct cpuset *cs;
+	int need_to_refresh = 0;
 
+	task_lock(current);
+	cs = current->cpuset;
 	if (!cs)
-		return; /* task is exiting */
-	if (current->cpuset_mems_generation != cs->mems_generation) {
-		cpuset_down(&cpuset_sem);
+		goto done;
+	if (current->cpuset_mems_generation != cs->mems_generation)
+		need_to_refresh = 1;
+done:
+	task_unlock(current);
+	if (need_to_refresh)
 		refresh_mems();
-		cpuset_up(&cpuset_sem);
-	}
 }
 
 /**
@@ -1621,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 
 /*
  * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset. Call while holding cpuset_sem.
+ * ancestor to the specified cpuset. Call holding callback_sem.
  * If no ancestor is mem_exclusive (an unusual configuration), then
  * returns the root cpuset.
  */
@@ -1648,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest mem_exclusive ancestor cpuset.
  *
- * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
+ * Scanning up parent cpusets requires callback_sem. The __alloc_pages()
  * routine only calls here with __GFP_HARDWALL bit _not_ set if
 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
 * mems_allowed came up empty on the first pass over the zonelist.
 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the cpuset_sem semaphore.
+ * short of memory, might require taking the callback_sem semaphore.
 *
 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -1685,14 +1810,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 		return 0;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	cpuset_down(&cpuset_sem);
-	cs = current->cpuset;
-	if (!cs)
-		goto done; /* current task exiting */
-	cs = nearest_exclusive_ancestor(cs);
+	down(&callback_sem);
+
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return 1;
+	task_lock(current);
+	cs = nearest_exclusive_ancestor(current->cpuset);
+	task_unlock(current);
+
 	allowed = node_isset(node, cs->mems_allowed);
-done:
-	cpuset_up(&cpuset_sem);
+	up(&callback_sem);
 	return allowed;
 }
 
@@ -1705,7 +1832,7 @@ done:
  * determine if task @p's memory usage might impact the memory
 * available to the current task.
 *
- * Acquires cpuset_sem - not suitable for calling from a fast path.
+ * Acquires callback_sem - not suitable for calling from a fast path.
 **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1713,18 +1840,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
 	int overlap = 0; /* do cpusets overlap? */
 
-	cpuset_down(&cpuset_sem);
-	cs1 = current->cpuset;
-	if (!cs1)
-		goto done; /* current task exiting */
-	cs2 = p->cpuset;
-	if (!cs2)
-		goto done; /* task p is exiting */
-	cs1 = nearest_exclusive_ancestor(cs1);
-	cs2 = nearest_exclusive_ancestor(cs2);
+	down(&callback_sem);
+
+	task_lock(current);
+	if (current->flags & PF_EXITING) {
+		task_unlock(current);
+		goto done;
+	}
+	cs1 = nearest_exclusive_ancestor(current->cpuset);
+	task_unlock(current);
+
+	task_lock((struct task_struct *)p);
+	if (p->flags & PF_EXITING) {
+		task_unlock((struct task_struct *)p);
+		goto done;
+	}
+	cs2 = nearest_exclusive_ancestor(p->cpuset);
+	task_unlock((struct task_struct *)p);
+
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
done:
-	cpuset_up(&cpuset_sem);
+	up(&callback_sem);
 
 	return overlap;
 }
@@ -1733,6 +1869,10 @@ done:
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
+ *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
+ *    doesn't really matter if tsk->cpuset changes after we read it,
+ *    and we take manage_sem, keeping attach_task() from changing it
+ *    anyway.
 */
 
 static int proc_cpuset_show(struct seq_file *m, void *v)
@@ -1747,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	cpuset_down(&cpuset_sem);
-	task_lock(tsk);
+	down(&manage_sem);
 	cs = tsk->cpuset;
-	task_unlock(tsk);
 	if (!cs) {
 		retval = -EINVAL;
 		goto out;
@@ -1762,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
out:
-	cpuset_up(&cpuset_sem);
+	up(&manage_sem);
 	kfree(buf);
 	return retval;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b25b182d2be..537394b25e8d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 
 	if (p->pdeath_signal)
 		/* We already hold the tasklist_lock here. */
-		group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
 
 	/* Move the child from its dying parent to the new one. */
 	if (unlikely(traced)) {
@@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 		int pgrp = process_group(p);
 
 		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
-			__kill_pg_info(SIGHUP, (void *)1, pgrp);
-			__kill_pg_info(SIGCONT, (void *)1, pgrp);
+			__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+			__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 		}
 	}
 }
@@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk)
 	    (t->signal->session == tsk->signal->session) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
-		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
-		__kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+		__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
+		__kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk));
 	}
 
 	/* Let father know we died
@@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk)
 	/* If the process is dead, release it - nobody will wait for it */
 	if (state == EXIT_DEAD)
 		release_task(tsk);
-
-	/* PF_DEAD causes final put_task_struct after we schedule. */
-	preempt_disable();
-	tsk->flags |= PF_DEAD;
 }
 
 fastcall NORET_TYPE void do_exit(long code)
@@ -839,7 +835,10 @@ fastcall NORET_TYPE void do_exit(long code)
 			preempt_count());
 
 	acct_update_integrals(tsk);
-	update_mem_hiwater(tsk);
+	if (tsk->mm) {
+		update_hiwater_rss(tsk->mm);
+		update_hiwater_vm(tsk->mm);
+	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		del_timer_sync(&tsk->signal->real_timer);
@@ -870,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 
-	BUG_ON(!(current->flags & PF_DEAD));
+	/* PF_DEAD causes final put_task_struct after we schedule. */
+	preempt_disable();
+	BUG_ON(tsk->flags & PF_DEAD);
+	tsk->flags |= PF_DEAD;
+
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return". */
@@ -1380,6 +1383,15 @@ repeat:
 
 		switch (p->state) {
 		case TASK_TRACED:
+			/*
+			 * When we hit the race with PTRACE_ATTACH,
+			 * we will not report this child. But the
+			 * race means it has not yet been moved to
+			 * our ptrace_children list, so we need to
+			 * set the flag here to avoid a spurious ECHILD
+			 * when the race happens with the only child.
+			 */
+			flag = 1;
 			if (!my_ptrace_child(p))
 				continue;
 			/*FALLTHROUGH*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 280bd44ac441..8a069612eac3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
182} 182}
183 183
184#ifdef CONFIG_MMU 184#ifdef CONFIG_MMU
185static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) 185static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
186{ 186{
187 struct vm_area_struct * mpnt, *tmp, **pprev; 187 struct vm_area_struct *mpnt, *tmp, **pprev;
188 struct rb_node **rb_link, *rb_parent; 188 struct rb_node **rb_link, *rb_parent;
189 int retval; 189 int retval;
190 unsigned long charge; 190 unsigned long charge;
191 struct mempolicy *pol; 191 struct mempolicy *pol;
192 192
193 down_write(&oldmm->mmap_sem); 193 down_write(&oldmm->mmap_sem);
194 flush_cache_mm(current->mm); 194 flush_cache_mm(oldmm);
195 down_write(&mm->mmap_sem);
196
195 mm->locked_vm = 0; 197 mm->locked_vm = 0;
196 mm->mmap = NULL; 198 mm->mmap = NULL;
197 mm->mmap_cache = NULL; 199 mm->mmap_cache = NULL;
198 mm->free_area_cache = oldmm->mmap_base; 200 mm->free_area_cache = oldmm->mmap_base;
199 mm->cached_hole_size = ~0UL; 201 mm->cached_hole_size = ~0UL;
200 mm->map_count = 0; 202 mm->map_count = 0;
201 set_mm_counter(mm, rss, 0);
202 set_mm_counter(mm, anon_rss, 0);
203 cpus_clear(mm->cpu_vm_mask); 203 cpus_clear(mm->cpu_vm_mask);
204 mm->mm_rb = RB_ROOT; 204 mm->mm_rb = RB_ROOT;
205 rb_link = &mm->mm_rb.rb_node; 205 rb_link = &mm->mm_rb.rb_node;
206 rb_parent = NULL; 206 rb_parent = NULL;
207 pprev = &mm->mmap; 207 pprev = &mm->mmap;
208 208
209 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { 209 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
210 struct file *file; 210 struct file *file;
211 211
212 if (mpnt->vm_flags & VM_DONTCOPY) { 212 if (mpnt->vm_flags & VM_DONTCOPY) {
213 long pages = vma_pages(mpnt); 213 long pages = vma_pages(mpnt);
214 mm->total_vm -= pages; 214 mm->total_vm -= pages;
215 __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 215 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
216 -pages); 216 -pages);
217 continue; 217 continue;
218 } 218 }
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
253 } 253 }
254 254
255 /* 255 /*
256 * Link in the new vma and copy the page table entries: 256 * Link in the new vma and copy the page table entries.
257 * link in first so that swapoff can see swap entries.
258 * Note that, exceptionally, here the vma is inserted
259 * without holding mm->mmap_sem.
260 */ 257 */
261 spin_lock(&mm->page_table_lock);
262 *pprev = tmp; 258 *pprev = tmp;
263 pprev = &tmp->vm_next; 259 pprev = &tmp->vm_next;
264 260
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
267 rb_parent = &tmp->vm_rb; 263 rb_parent = &tmp->vm_rb;
268 264
269 mm->map_count++; 265 mm->map_count++;
270 retval = copy_page_range(mm, current->mm, tmp); 266 retval = copy_page_range(mm, oldmm, tmp);
271 spin_unlock(&mm->page_table_lock);
272 267
273 if (tmp->vm_ops && tmp->vm_ops->open) 268 if (tmp->vm_ops && tmp->vm_ops->open)
274 tmp->vm_ops->open(tmp); 269 tmp->vm_ops->open(tmp);
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
277 goto out; 272 goto out;
278 } 273 }
279 retval = 0; 274 retval = 0;
280
281out: 275out:
282 flush_tlb_mm(current->mm); 276 up_write(&mm->mmap_sem);
277 flush_tlb_mm(oldmm);
283 up_write(&oldmm->mmap_sem); 278 up_write(&oldmm->mmap_sem);
284 return retval; 279 return retval;
285fail_nomem_policy: 280fail_nomem_policy:
@@ -323,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
323 INIT_LIST_HEAD(&mm->mmlist); 318 INIT_LIST_HEAD(&mm->mmlist);
324 mm->core_waiters = 0; 319 mm->core_waiters = 0;
325 mm->nr_ptes = 0; 320 mm->nr_ptes = 0;
321 set_mm_counter(mm, file_rss, 0);
322 set_mm_counter(mm, anon_rss, 0);
326 spin_lock_init(&mm->page_table_lock); 323 spin_lock_init(&mm->page_table_lock);
327 rwlock_init(&mm->ioctx_list_lock); 324 rwlock_init(&mm->ioctx_list_lock);
328 mm->ioctx_list = NULL; 325 mm->ioctx_list = NULL;
@@ -499,7 +496,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
499 if (retval) 496 if (retval)
500 goto free_pt; 497 goto free_pt;
501 498
502 mm->hiwater_rss = get_mm_counter(mm,rss); 499 mm->hiwater_rss = get_mm_rss(mm);
503 mm->hiwater_vm = mm->total_vm; 500 mm->hiwater_vm = mm->total_vm;
504 501
505good_mm: 502good_mm:
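The dup_mmap() rework copies the parent's VMAs under both mmap_sems instead of the old page_table_lock dance. A rough sketch of the nesting it relies on (hedged: the child mm is brand new, so nothing else can be holding or waiting on its mmap_sem yet):

	down_write(&oldmm->mmap_sem);		/* parent first */
	down_write(&mm->mmap_sem);		/* child mm not yet visible to others */
	/* ... duplicate each vma and copy its page range from oldmm ... */
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);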
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b2..3b4d5ad44cc6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 /* 205 /*
206 * Do a quick atomic lookup first - this is the fastpath. 206 * Do a quick atomic lookup first - this is the fastpath.
207 */ 207 */
208 spin_lock(&current->mm->page_table_lock); 208 page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
209 page = follow_page(mm, uaddr, 0);
210 if (likely(page != NULL)) { 209 if (likely(page != NULL)) {
211 key->shared.pgoff = 210 key->shared.pgoff =
212 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
213 spin_unlock(&current->mm->page_table_lock); 212 put_page(page);
214 return 0; 213 return 0;
215 } 214 }
216 spin_unlock(&current->mm->page_table_lock);
217 215
218 /* 216 /*
219 * Do it the general way. 217 * Do it the general way.
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13bcec151b57..39277dd6bf90 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/err.h> 19#include <linux/err.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */
21#include <linux/mm.h> 22#include <linux/mm.h>
22 23
23#include <asm/sections.h> 24#include <asm/sections.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cdd4dcd8fb63..2c95848fbce8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p)
90static int kimage_is_destination_range(struct kimage *image, 90static int kimage_is_destination_range(struct kimage *image,
91 unsigned long start, unsigned long end); 91 unsigned long start, unsigned long end);
92static struct page *kimage_alloc_page(struct kimage *image, 92static struct page *kimage_alloc_page(struct kimage *image,
93 unsigned int gfp_mask, 93 gfp_t gfp_mask,
94 unsigned long dest); 94 unsigned long dest);
95 95
96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image,
326 return 0; 326 return 0;
327} 327}
328 328
329static struct page *kimage_alloc_pages(unsigned int gfp_mask, 329static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
330 unsigned int order)
331{ 330{
332 struct page *pages; 331 struct page *pages;
333 332
@@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask,
335 if (pages) { 334 if (pages) {
336 unsigned int count, i; 335 unsigned int count, i;
337 pages->mapping = NULL; 336 pages->mapping = NULL;
338 pages->private = order; 337 set_page_private(pages, order);
339 count = 1 << order; 338 count = 1 << order;
340 for (i = 0; i < count; i++) 339 for (i = 0; i < count; i++)
341 SetPageReserved(pages + i); 340 SetPageReserved(pages + i);
@@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page)
348{ 347{
349 unsigned int order, count, i; 348 unsigned int order, count, i;
350 349
351 order = page->private; 350 order = page_private(page);
352 count = 1 << order; 351 count = 1 << order;
353 for (i = 0; i < count; i++) 352 for (i = 0; i < count; i++)
354 ClearPageReserved(page + i); 353 ClearPageReserved(page + i);
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image,
654} 653}
655 654
656static struct page *kimage_alloc_page(struct kimage *image, 655static struct page *kimage_alloc_page(struct kimage *image,
657 unsigned int gfp_mask, 656 gfp_t gfp_mask,
658 unsigned long destination) 657 unsigned long destination)
659{ 658{
660 /* 659 /*
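The remaining kexec changes are mechanical: direct pokes at page->private become the page_private()/set_page_private() accessor pair. In miniature (hypothetical page pointer and order value):

	set_page_private(page, order);		/* stash the allocation order at alloc time */
	order = page_private(page);		/* read it back when freeing */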
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 44166e3bb8af..51a892063aaa 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -131,14 +131,14 @@ struct subprocess_info {
131static int ____call_usermodehelper(void *data) 131static int ____call_usermodehelper(void *data)
132{ 132{
133 struct subprocess_info *sub_info = data; 133 struct subprocess_info *sub_info = data;
134 struct key *old_session; 134 struct key *new_session, *old_session;
135 int retval; 135 int retval;
136 136
137 /* Unblock all signals and set the session keyring. */ 137 /* Unblock all signals and set the session keyring. */
138 key_get(sub_info->ring); 138 new_session = key_get(sub_info->ring);
139 flush_signals(current); 139 flush_signals(current);
140 spin_lock_irq(&current->sighand->siglock); 140 spin_lock_irq(&current->sighand->siglock);
141 old_session = __install_session_keyring(current, sub_info->ring); 141 old_session = __install_session_keyring(current, new_session);
142 flush_signal_handlers(current, 1); 142 flush_signal_handlers(current, 1);
143 sigemptyset(&current->blocked); 143 sigemptyset(&current->blocked);
144 recalc_sigpending(); 144 recalc_sigpending();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f3ea492ab44d..ce4915dd683a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
35#include <linux/spinlock.h> 35#include <linux/spinlock.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/slab.h>
38#include <linux/module.h> 39#include <linux/module.h>
39#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
40#include <asm-generic/sections.h> 41#include <asm-generic/sections.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f50f174e92da..e75950a1092c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind);
165 165
166int kthread_stop(struct task_struct *k) 166int kthread_stop(struct task_struct *k)
167{ 167{
168 return kthread_stop_sem(k, NULL);
169}
170EXPORT_SYMBOL(kthread_stop);
171
172int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{
168 int ret; 174 int ret;
169 175
170 down(&kthread_stop_lock); 176 down(&kthread_stop_lock);
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k)
178 184
179 /* Now set kthread_should_stop() to true, and wake it up. */ 185 /* Now set kthread_should_stop() to true, and wake it up. */
180 kthread_stop_info.k = k; 186 kthread_stop_info.k = k;
181 wake_up_process(k); 187 if (s)
188 up(s);
189 else
190 wake_up_process(k);
182 put_task_struct(k); 191 put_task_struct(k);
183 192
184 /* Once it dies, reset stop ptr, gather result and we're done. */ 193 /* Once it dies, reset stop ptr, gather result and we're done. */
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k)
189 198
190 return ret; 199 return ret;
191} 200}
192EXPORT_SYMBOL(kthread_stop); 201EXPORT_SYMBOL(kthread_stop_sem);
193 202
194static __init int helper_init(void) 203static __init int helper_init(void)
195{ 204{
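kthread_stop_sem() is for threads that sleep on a semaphore rather than in plain schedule(): they have to be nudged with up() instead of wake_up_process(). A hypothetical caller, with all names invented for illustration (error handling omitted):

	static DECLARE_MUTEX_LOCKED(work_sem);	/* semaphore, initially 0 */
	static struct task_struct *worker;

	static int worker_fn(void *unused)
	{
		while (!kthread_should_stop()) {
			if (down_interruptible(&work_sem))
				continue;
			if (kthread_should_stop())
				break;
			/* ... handle one queued unit of work ... */
		}
		return 0;
	}

	/* setup */
	worker = kthread_create(worker_fn, NULL, "workerd");
	wake_up_process(worker);

	/* teardown: ups work_sem so the thread can see kthread_should_stop() */
	kthread_stop_sem(worker, &work_sem);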
diff --git a/kernel/params.c b/kernel/params.c
index 1a8614bac5d5..47ba69547945 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/device.h> 24#include <linux/device.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bf374fceb39c..91a894264941 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1225 /* 1225 /*
1226 * The task was cleaned up already, no future firings. 1226 * The task was cleaned up already, no future firings.
1227 */ 1227 */
1228 return; 1228 goto out;
1229 1229
1230 /* 1230 /*
1231 * Fetch the current sample and update the timer's expiry time. 1231 * Fetch the current sample and update the timer's expiry time.
@@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1235 bump_cpu_timer(timer, now); 1235 bump_cpu_timer(timer, now);
1236 if (unlikely(p->exit_state)) { 1236 if (unlikely(p->exit_state)) {
1237 clear_dead_task(timer, now); 1237 clear_dead_task(timer, now);
1238 return; 1238 goto out;
1239 } 1239 }
1240 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1240 read_lock(&tasklist_lock); /* arm_timer needs it. */
1241 } else { 1241 } else {
@@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1248 put_task_struct(p); 1248 put_task_struct(p);
1249 timer->it.cpu.task = p = NULL; 1249 timer->it.cpu.task = p = NULL;
1250 timer->it.cpu.expires.sched = 0; 1250 timer->it.cpu.expires.sched = 0;
1251 read_unlock(&tasklist_lock); 1251 goto out_unlock;
1252 return;
1253 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1252 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1254 /* 1253 /*
1255 * We've noticed that the thread is dead, but 1254 * We've noticed that the thread is dead, but
@@ -1257,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1257 * drop our task ref. 1256 * drop our task ref.
1258 */ 1257 */
1259 clear_dead_task(timer, now); 1258 clear_dead_task(timer, now);
1260 read_unlock(&tasklist_lock); 1259 goto out_unlock;
1261 return;
1262 } 1260 }
1263 cpu_clock_sample_group(timer->it_clock, p, &now); 1261 cpu_clock_sample_group(timer->it_clock, p, &now);
1264 bump_cpu_timer(timer, now); 1262 bump_cpu_timer(timer, now);
@@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1270 */ 1268 */
1271 arm_timer(timer, now); 1269 arm_timer(timer, now);
1272 1270
1271out_unlock:
1273 read_unlock(&tasklist_lock); 1272 read_unlock(&tasklist_lock);
1273
1274out:
1275 timer->it_overrun_last = timer->it_overrun;
1276 timer->it_overrun = -1;
1277 ++timer->it_requeue_pending;
1274} 1278}
1275 1279
1276/* 1280/*
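The goto conversion is not just style: every early exit from posix_cpu_timer_schedule() now falls through the overrun bookkeeping that used to be reachable only on the success path. In shape (sketch; the condition name is a stand-in):

	if (task_already_reaped)
		goto out;			/* previously: bare return */
	/* ... sampling and arm_timer() ... */
out_unlock:
	read_unlock(&tasklist_lock);
out:
	timer->it_overrun_last = timer->it_overrun;
	timer->it_overrun = -1;
	++timer->it_requeue_pending;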
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dda3cda73c77..ea55c7a1cd75 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
1295 return error; 1295 return error;
1296} 1296}
1297 1297
1298static void nanosleep_wake_up(unsigned long __data)
1299{
1300 struct task_struct *p = (struct task_struct *) __data;
1301
1302 wake_up_process(p);
1303}
1304
1305/* 1298/*
1306 * The standard says that an absolute nanosleep call MUST wake up at 1299 * The standard says that an absolute nanosleep call MUST wake up at
1307 * the requested time in spite of clock settings. Here is what we do: 1300 * the requested time in spite of clock settings. Here is what we do:
@@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock,
1442 int flags, struct timespec *tsave) 1435 int flags, struct timespec *tsave)
1443{ 1436{
1444 struct timespec t, dum; 1437 struct timespec t, dum;
1445 struct timer_list new_timer;
1446 DECLARE_WAITQUEUE(abs_wqueue, current); 1438 DECLARE_WAITQUEUE(abs_wqueue, current);
1447 u64 rq_time = (u64)0; 1439 u64 rq_time = (u64)0;
1448 s64 left; 1440 s64 left;
@@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock,
1451 &current_thread_info()->restart_block; 1443 &current_thread_info()->restart_block;
1452 1444
1453 abs_wqueue.flags = 0; 1445 abs_wqueue.flags = 0;
1454 init_timer(&new_timer);
1455 new_timer.expires = 0;
1456 new_timer.data = (unsigned long) current;
1457 new_timer.function = nanosleep_wake_up;
1458 abs = flags & TIMER_ABSTIME; 1446 abs = flags & TIMER_ABSTIME;
1459 1447
1460 if (restart_block->fn == clock_nanosleep_restart) { 1448 if (restart_block->fn == clock_nanosleep_restart) {
@@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock,
1490 if (left < (s64)0) 1478 if (left < (s64)0)
1491 break; 1479 break;
1492 1480
1493 new_timer.expires = jiffies + left; 1481 schedule_timeout_interruptible(left);
1494 __set_current_state(TASK_INTERRUPTIBLE);
1495 add_timer(&new_timer);
1496
1497 schedule();
1498 1482
1499 del_timer_sync(&new_timer);
1500 left = rq_time - get_jiffies_64(); 1483 left = rq_time - get_jiffies_64();
1501 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); 1484 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
1502 1485
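The deleted nanosleep_wake_up() machinery was an open-coded version of what schedule_timeout_interruptible() already does. Side by side (sketch reconstructed from the removed lines):

	/* old, open-coded form */
	init_timer(&new_timer);
	new_timer.expires = jiffies + left;
	new_timer.data = (unsigned long)current;
	new_timer.function = nanosleep_wake_up;	/* just called wake_up_process() */
	__set_current_state(TASK_INTERRUPTIBLE);
	add_timer(&new_timer);
	schedule();
	del_timer_sync(&new_timer);

	/* new form: the helper sets the task state, arms the timer and schedules */
	schedule_timeout_interruptible(left);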
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 2f438d0eaa13..c71eb4579c07 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o pm.o
7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o 7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o
8 8
9obj-$(CONFIG_SUSPEND_SMP) += smp.o 9obj-$(CONFIG_SUSPEND_SMP) += smp.o
10 10
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 761956e813f5..027322a564f4 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,7 +30,6 @@ extern int swsusp_check(void);
30extern int swsusp_read(void); 30extern int swsusp_read(void);
31extern void swsusp_close(void); 31extern void swsusp_close(void);
32extern int swsusp_resume(void); 32extern int swsusp_resume(void);
33extern int swsusp_free(void);
34 33
35 34
36static int noresume = 0; 35static int noresume = 0;
@@ -93,10 +92,7 @@ static void free_some_memory(void)
93 printk("Freeing memory... "); 92 printk("Freeing memory... ");
94 while ((tmp = shrink_all_memory(10000))) { 93 while ((tmp = shrink_all_memory(10000))) {
95 pages += tmp; 94 pages += tmp;
96 printk("\b%c", p[i]); 95 printk("\b%c", p[i++ % 4]);
97 i++;
98 if (i > 3)
99 i = 0;
100 } 96 }
101 printk("\bdone (%li pages freed)\n", pages); 97 printk("\bdone (%li pages freed)\n", pages);
102} 98}
@@ -178,13 +174,12 @@ int pm_suspend_disk(void)
178 goto Done; 174 goto Done;
179 175
180 if (in_suspend) { 176 if (in_suspend) {
177 device_resume();
181 pr_debug("PM: writing image.\n"); 178 pr_debug("PM: writing image.\n");
182 error = swsusp_write(); 179 error = swsusp_write();
183 if (!error) 180 if (!error)
184 power_down(pm_disk_mode); 181 power_down(pm_disk_mode);
185 else { 182 else {
186 /* swsusp_write can not fail in device_resume,
187 no need to do second device_resume */
188 swsusp_free(); 183 swsusp_free();
189 unprepare_processes(); 184 unprepare_processes();
190 return error; 185 return error;
@@ -252,14 +247,17 @@ static int software_resume(void)
252 247
253 pr_debug("PM: Reading swsusp image.\n"); 248 pr_debug("PM: Reading swsusp image.\n");
254 249
255 if ((error = swsusp_read())) 250 if ((error = swsusp_read())) {
256 goto Cleanup; 251 swsusp_free();
252 goto Thaw;
253 }
257 254
258 pr_debug("PM: Preparing devices for restore.\n"); 255 pr_debug("PM: Preparing devices for restore.\n");
259 256
260 if ((error = device_suspend(PMSG_FREEZE))) { 257 if ((error = device_suspend(PMSG_FREEZE))) {
261 printk("Some devices failed to suspend\n"); 258 printk("Some devices failed to suspend\n");
262 goto Free; 259 swsusp_free();
260 goto Thaw;
263 } 261 }
264 262
265 mb(); 263 mb();
@@ -268,9 +266,7 @@ static int software_resume(void)
268 swsusp_resume(); 266 swsusp_resume();
269 pr_debug("PM: Restore failed, recovering.n"); 267 pr_debug("PM: Restore failed, recovering.n");
270 device_resume(); 268 device_resume();
271 Free: 269 Thaw:
272 swsusp_free();
273 Cleanup:
274 unprepare_processes(); 270 unprepare_processes();
275 Done: 271 Done:
276 /* For success case, the suspend path will release the lock */ 272 /* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 22bdc93cc038..18d7d693fbba 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state)
167{ 167{
168 int error; 168 int error;
169 169
170 if (pm_ops->valid && !pm_ops->valid(state))
171 return -ENODEV;
170 if (down_trylock(&pm_sem)) 172 if (down_trylock(&pm_sem))
171 return -EBUSY; 173 return -EBUSY;
172 174
@@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
236 char * s = buf; 238 char * s = buf;
237 239
238 for (i = 0; i < PM_SUSPEND_MAX; i++) { 240 for (i = 0; i < PM_SUSPEND_MAX; i++) {
239 if (pm_states[i]) 241 if (pm_states[i] && pm_ops && (!pm_ops->valid
242 ||(pm_ops->valid && pm_ops->valid(i))))
240 s += sprintf(s,"%s ",pm_states[i]); 243 s += sprintf(s,"%s ",pm_states[i]);
241 } 244 }
242 s += sprintf(s,"\n"); 245 s += sprintf(s,"\n");
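Both hunks assume a pm_ops->valid() callback with which a platform declares which suspend states it actually supports; state_show() then only advertises states that pass it. A hypothetical platform hook-up (names invented):

	static int myplat_pm_valid(suspend_state_t state)
	{
		return state == PM_SUSPEND_MEM;		/* only "mem" is implemented */
	}

	static int myplat_pm_enter(suspend_state_t state)
	{
		/* ... put the hardware to sleep ... */
		return 0;
	}

	static struct pm_ops myplat_pm_ops = {
		.valid	= myplat_pm_valid,
		.enter	= myplat_pm_enter,
	};

	pm_set_ops(&myplat_pm_ops);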
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6748de23e83c..d4fd96a135ab 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,3 +53,20 @@ extern void thaw_processes(void);
53 53
54extern int pm_prepare_console(void); 54extern int pm_prepare_console(void);
55extern void pm_restore_console(void); 55extern void pm_restore_console(void);
56
57
58/* References to section boundaries */
59extern const void __nosave_begin, __nosave_end;
60
61extern unsigned int nr_copy_pages;
62extern suspend_pagedir_t *pagedir_nosave;
63extern suspend_pagedir_t *pagedir_save;
64
65extern asmlinkage int swsusp_arch_suspend(void);
66extern asmlinkage int swsusp_arch_resume(void);
67
68extern int restore_highmem(void);
69extern struct pbe * alloc_pagedir(unsigned nr_pages);
70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
71extern void swsusp_free(void);
72extern int enough_swap(unsigned nr_pages);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
new file mode 100644
index 000000000000..42a628704398
--- /dev/null
+++ b/kernel/power/snapshot.c
@@ -0,0 +1,435 @@
1/*
2 * linux/kernel/power/snapshot.c
3 *
 4 * This file provides system snapshot/restore functionality.
5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 *
8 * This file is released under the GPLv2, and is based on swsusp.c.
9 *
10 */
11
12
13#include <linux/module.h>
14#include <linux/mm.h>
15#include <linux/suspend.h>
16#include <linux/smp_lock.h>
17#include <linux/delay.h>
18#include <linux/bitops.h>
19#include <linux/spinlock.h>
20#include <linux/kernel.h>
21#include <linux/pm.h>
22#include <linux/device.h>
23#include <linux/bootmem.h>
24#include <linux/syscalls.h>
25#include <linux/console.h>
26#include <linux/highmem.h>
27
28#include <asm/uaccess.h>
29#include <asm/mmu_context.h>
30#include <asm/pgtable.h>
31#include <asm/tlbflush.h>
32#include <asm/io.h>
33
34#include "power.h"
35
36#ifdef CONFIG_HIGHMEM
37struct highmem_page {
38 char *data;
39 struct page *page;
40 struct highmem_page *next;
41};
42
43static struct highmem_page *highmem_copy;
44
45static int save_highmem_zone(struct zone *zone)
46{
47 unsigned long zone_pfn;
48 mark_free_pages(zone);
49 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
50 struct page *page;
51 struct highmem_page *save;
52 void *kaddr;
53 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
54
55 if (!(pfn%1000))
56 printk(".");
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 /*
61 * This condition results from rvmalloc() sans vmalloc_32()
62 * and architectural memory reservations. This should be
63 * corrected eventually when the cases giving rise to this
64 * are better understood.
65 */
66 if (PageReserved(page)) {
67 printk("highmem reserved page?!\n");
68 continue;
69 }
70 BUG_ON(PageNosave(page));
71 if (PageNosaveFree(page))
72 continue;
73 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
74 if (!save)
75 return -ENOMEM;
76 save->next = highmem_copy;
77 save->page = page;
78 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
79 if (!save->data) {
80 kfree(save);
81 return -ENOMEM;
82 }
83 kaddr = kmap_atomic(page, KM_USER0);
84 memcpy(save->data, kaddr, PAGE_SIZE);
85 kunmap_atomic(kaddr, KM_USER0);
86 highmem_copy = save;
87 }
88 return 0;
89}
90
91
92static int save_highmem(void)
93{
94 struct zone *zone;
95 int res = 0;
96
97 pr_debug("swsusp: Saving Highmem\n");
98 for_each_zone (zone) {
99 if (is_highmem(zone))
100 res = save_highmem_zone(zone);
101 if (res)
102 return res;
103 }
104 return 0;
105}
106
107int restore_highmem(void)
108{
109 printk("swsusp: Restoring Highmem\n");
110 while (highmem_copy) {
111 struct highmem_page *save = highmem_copy;
112 void *kaddr;
113 highmem_copy = save->next;
114
115 kaddr = kmap_atomic(save->page, KM_USER0);
116 memcpy(kaddr, save->data, PAGE_SIZE);
117 kunmap_atomic(kaddr, KM_USER0);
118 free_page((long) save->data);
119 kfree(save);
120 }
121 return 0;
122}
123#else
124static int save_highmem(void) { return 0; }
125int restore_highmem(void) { return 0; }
126#endif /* CONFIG_HIGHMEM */
127
128
129static int pfn_is_nosave(unsigned long pfn)
130{
131 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
132 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
133 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
134}
135
136/**
137 * saveable - Determine whether a page should be cloned or not.
138 * @pfn: The page
139 *
140 * We save a page if it's Reserved, and not in the range of pages
141 * statically defined as 'unsaveable', or if it isn't reserved, and
142 * isn't part of a free chunk of pages.
143 */
144
145static int saveable(struct zone *zone, unsigned long *zone_pfn)
146{
147 unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
148 struct page *page;
149
150 if (!pfn_valid(pfn))
151 return 0;
152
153 page = pfn_to_page(pfn);
154 BUG_ON(PageReserved(page) && PageNosave(page));
155 if (PageNosave(page))
156 return 0;
157 if (PageReserved(page) && pfn_is_nosave(pfn)) {
158 pr_debug("[nosave pfn 0x%lx]", pfn);
159 return 0;
160 }
161 if (PageNosaveFree(page))
162 return 0;
163
164 return 1;
165}
166
167static unsigned count_data_pages(void)
168{
169 struct zone *zone;
170 unsigned long zone_pfn;
171 unsigned n;
172
173 n = 0;
174 for_each_zone (zone) {
175 if (is_highmem(zone))
176 continue;
177 mark_free_pages(zone);
178 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
179 n += saveable(zone, &zone_pfn);
180 }
181 return n;
182}
183
184static void copy_data_pages(struct pbe *pblist)
185{
186 struct zone *zone;
187 unsigned long zone_pfn;
188 struct pbe *pbe, *p;
189
190 pbe = pblist;
191 for_each_zone (zone) {
192 if (is_highmem(zone))
193 continue;
194 mark_free_pages(zone);
195 /* This is necessary for swsusp_free() */
196 for_each_pb_page (p, pblist)
197 SetPageNosaveFree(virt_to_page(p));
198 for_each_pbe (p, pblist)
199 SetPageNosaveFree(virt_to_page(p->address));
200 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
201 if (saveable(zone, &zone_pfn)) {
202 struct page *page;
203 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
204 BUG_ON(!pbe);
205 pbe->orig_address = (unsigned long)page_address(page);
206 /* copy_page is not usable for copying task structs. */
207 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
208 pbe = pbe->next;
209 }
210 }
211 }
212 BUG_ON(pbe);
213}
214
215
216/**
217 * free_pagedir - free pages allocated with alloc_pagedir()
218 */
219
220static void free_pagedir(struct pbe *pblist)
221{
222 struct pbe *pbe;
223
224 while (pblist) {
225 pbe = (pblist + PB_PAGE_SKIP)->next;
226 ClearPageNosave(virt_to_page(pblist));
227 ClearPageNosaveFree(virt_to_page(pblist));
228 free_page((unsigned long)pblist);
229 pblist = pbe;
230 }
231}
232
233/**
234 * fill_pb_page - Create a list of PBEs on a given memory page
235 */
236
237static inline void fill_pb_page(struct pbe *pbpage)
238{
239 struct pbe *p;
240
241 p = pbpage;
242 pbpage += PB_PAGE_SKIP;
243 do
244 p->next = p + 1;
245 while (++p < pbpage);
246}
247
248/**
249 * create_pbe_list - Create a list of PBEs on top of a given chain
250 * of memory pages allocated with alloc_pagedir()
251 */
252
253void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
254{
255 struct pbe *pbpage, *p;
256 unsigned num = PBES_PER_PAGE;
257
258 for_each_pb_page (pbpage, pblist) {
259 if (num >= nr_pages)
260 break;
261
262 fill_pb_page(pbpage);
263 num += PBES_PER_PAGE;
264 }
265 if (pbpage) {
266 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
267 p->next = p + 1;
268 p->next = NULL;
269 }
270 pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
271}
272
273static void *alloc_image_page(void)
274{
275 void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
276 if (res) {
277 SetPageNosave(virt_to_page(res));
278 SetPageNosaveFree(virt_to_page(res));
279 }
280 return res;
281}
282
283/**
284 * alloc_pagedir - Allocate the page directory.
285 *
286 * First, determine exactly how many pages we need and
287 * allocate them.
288 *
289 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
290 * struct pbe elements (pbes) and the last element in the page points
291 * to the next page.
292 *
293 * On each page we set up a list of struct_pbe elements.
294 */
295
296struct pbe *alloc_pagedir(unsigned nr_pages)
297{
298 unsigned num;
299 struct pbe *pblist, *pbe;
300
301 if (!nr_pages)
302 return NULL;
303
304 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
305 pblist = alloc_image_page();
306 /* FIXME: rewrite this ugly loop */
307 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
308 pbe = pbe->next, num += PBES_PER_PAGE) {
309 pbe += PB_PAGE_SKIP;
310 pbe->next = alloc_image_page();
311 }
312 if (!pbe) { /* get_zeroed_page() failed */
313 free_pagedir(pblist);
314 pblist = NULL;
315 }
316 return pblist;
317}
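So the pagedir ends up as a chain of pages, each holding PBES_PER_PAGE struct pbe slots, with the slot at PB_PAGE_SKIP pointing at the next page. The for_each_pbe()/for_each_pb_page() helpers used throughout hide that layout; conceptually a walk over every entry is just (sketch):

	struct pbe *p;

	for (p = pblist; p; p = p->next) {
		/* p->orig_address: where the data lived, p->address: its copy */
	}

for_each_pb_page() does the same thing a page at a time by following only the PB_PAGE_SKIP link.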
318
319/**
320 * Free pages we allocated for suspend. Suspend pages are allocated
321 * before atomic copy, so we need to free them after resume.
322 */
323
324void swsusp_free(void)
325{
326 struct zone *zone;
327 unsigned long zone_pfn;
328
329 for_each_zone(zone) {
330 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
331 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
332 struct page * page;
333 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
334 if (PageNosave(page) && PageNosaveFree(page)) {
335 ClearPageNosave(page);
336 ClearPageNosaveFree(page);
337 free_page((long) page_address(page));
338 }
339 }
340 }
341}
342
343
344/**
345 * enough_free_mem - Make sure we have enough free memory to snapshot.
346 *
347 * Returns TRUE or FALSE after checking the number of available
348 * free pages.
349 */
350
351static int enough_free_mem(unsigned nr_pages)
352{
353 pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
354 return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
355 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
356}
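The budget being checked is: one free page per data page for the copy, one pbe slot per data page (packed PBES_PER_PAGE to a page) for the pagedir, plus PAGES_FOR_IO of headroom so the writeout itself does not run dry. Spelled out (same arithmetic as above, symbolic values):

	unsigned needed = nr_pages			/* the page copies */
		+ (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE  /* pagedir pages */
		+ PAGES_FOR_IO;				/* block I/O headroom */
	/* the snapshot is refused unless nr_free_pages() exceeds needed */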
357
358
359static struct pbe *swsusp_alloc(unsigned nr_pages)
360{
361 struct pbe *pblist, *p;
362
363 if (!(pblist = alloc_pagedir(nr_pages))) {
364 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
365 return NULL;
366 }
367 create_pbe_list(pblist, nr_pages);
368
369 for_each_pbe (p, pblist) {
370 p->address = (unsigned long)alloc_image_page();
371 if (!p->address) {
372 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
373 swsusp_free();
374 return NULL;
375 }
376 }
377
378 return pblist;
379}
380
381asmlinkage int swsusp_save(void)
382{
383 unsigned nr_pages;
384
385 pr_debug("swsusp: critical section: \n");
386 if (save_highmem()) {
387 printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
388 restore_highmem();
389 return -ENOMEM;
390 }
391
392 drain_local_pages();
393 nr_pages = count_data_pages();
394 printk("swsusp: Need to copy %u pages\n", nr_pages);
395
396 pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
397 nr_pages,
398 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
399 PAGES_FOR_IO, nr_free_pages());
400
401 /* This is needed because of the fixed size of swsusp_info */
402 if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
403 return -ENOSPC;
404
405 if (!enough_free_mem(nr_pages)) {
406 printk(KERN_ERR "swsusp: Not enough free memory\n");
407 return -ENOMEM;
408 }
409
410 if (!enough_swap(nr_pages)) {
411 printk(KERN_ERR "swsusp: Not enough free swap\n");
412 return -ENOSPC;
413 }
414
415 pagedir_nosave = swsusp_alloc(nr_pages);
416 if (!pagedir_nosave)
417 return -ENOMEM;
418
419 /* While allocating the suspend pagedir, new cold pages may appear.
420 * Kill them.
421 */
422 drain_local_pages();
423 copy_data_pages(pagedir_nosave);
424
425 /*
426 * End of critical section. From now on, we can write to memory,
427 * but we should not touch disk. This specially means we must _not_
428 * touch swap space! Except we must write out our image of course.
429 */
430
431 nr_copy_pages = nr_pages;
432
433 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
434 return 0;
435}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d5c45676442..12db1d2ad61f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,10 @@
1/* 1/*
2 * linux/kernel/power/swsusp.c 2 * linux/kernel/power/swsusp.c
3 * 3 *
4 * This file is to realize architecture-independent 4 * This file provides code to write suspend image to swap and read it back.
5 * machine suspend feature using pretty near only high-level routines
6 * 5 *
7 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> 6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8 * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
9 * 8 *
10 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
11 * 10 *
@@ -47,11 +46,7 @@
47#include <linux/utsname.h> 46#include <linux/utsname.h>
48#include <linux/version.h> 47#include <linux/version.h>
49#include <linux/delay.h> 48#include <linux/delay.h>
50#include <linux/reboot.h>
51#include <linux/bitops.h> 49#include <linux/bitops.h>
52#include <linux/vt_kern.h>
53#include <linux/kbd_kern.h>
54#include <linux/keyboard.h>
55#include <linux/spinlock.h> 50#include <linux/spinlock.h>
56#include <linux/genhd.h> 51#include <linux/genhd.h>
57#include <linux/kernel.h> 52#include <linux/kernel.h>
@@ -63,10 +58,8 @@
63#include <linux/swapops.h> 58#include <linux/swapops.h>
64#include <linux/bootmem.h> 59#include <linux/bootmem.h>
65#include <linux/syscalls.h> 60#include <linux/syscalls.h>
66#include <linux/console.h>
67#include <linux/highmem.h> 61#include <linux/highmem.h>
68#include <linux/bio.h> 62#include <linux/bio.h>
69#include <linux/mount.h>
70 63
71#include <asm/uaccess.h> 64#include <asm/uaccess.h>
72#include <asm/mmu_context.h> 65#include <asm/mmu_context.h>
@@ -84,16 +77,10 @@
84#define MAXKEY 32 77#define MAXKEY 32
85#define MAXIV 32 78#define MAXIV 32
86 79
87/* References to section boundaries */
88extern const void __nosave_begin, __nosave_end;
89
90/* Variables to be preserved over suspend */
91static int nr_copy_pages_check;
92
93extern char resume_file[]; 80extern char resume_file[];
94 81
95/* Local variables that should not be affected by save */ 82/* Local variables that should not be affected by save */
96static unsigned int nr_copy_pages __nosavedata = 0; 83unsigned int nr_copy_pages __nosavedata = 0;
97 84
98/* Suspend pagedir is allocated before final copy, therefore it 85/* Suspend pagedir is allocated before final copy, therefore it
99 must be freed after resume 86 must be freed after resume
@@ -109,7 +96,7 @@ static unsigned int nr_copy_pages __nosavedata = 0;
109 MMU hardware. 96 MMU hardware.
110 */ 97 */
111suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; 98suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
112static suspend_pagedir_t *pagedir_save; 99suspend_pagedir_t *pagedir_save;
113 100
114#define SWSUSP_SIG "S1SUSPEND" 101#define SWSUSP_SIG "S1SUSPEND"
115 102
@@ -124,12 +111,6 @@ static struct swsusp_header {
124static struct swsusp_info swsusp_info; 111static struct swsusp_info swsusp_info;
125 112
126/* 113/*
127 * XXX: We try to keep some more pages free so that I/O operations succeed
128 * without paging. Might this be more?
129 */
130#define PAGES_FOR_IO 512
131
132/*
133 * Saving part... 114 * Saving part...
134 */ 115 */
135 116
@@ -552,346 +533,6 @@ static int write_suspend_image(void)
552 goto Done; 533 goto Done;
553} 534}
554 535
555
556#ifdef CONFIG_HIGHMEM
557struct highmem_page {
558 char *data;
559 struct page *page;
560 struct highmem_page *next;
561};
562
563static struct highmem_page *highmem_copy;
564
565static int save_highmem_zone(struct zone *zone)
566{
567 unsigned long zone_pfn;
568 mark_free_pages(zone);
569 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
570 struct page *page;
571 struct highmem_page *save;
572 void *kaddr;
573 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
574
575 if (!(pfn%1000))
576 printk(".");
577 if (!pfn_valid(pfn))
578 continue;
579 page = pfn_to_page(pfn);
580 /*
581 * This condition results from rvmalloc() sans vmalloc_32()
582 * and architectural memory reservations. This should be
583 * corrected eventually when the cases giving rise to this
584 * are better understood.
585 */
586 if (PageReserved(page)) {
587 printk("highmem reserved page?!\n");
588 continue;
589 }
590 BUG_ON(PageNosave(page));
591 if (PageNosaveFree(page))
592 continue;
593 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
594 if (!save)
595 return -ENOMEM;
596 save->next = highmem_copy;
597 save->page = page;
598 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
599 if (!save->data) {
600 kfree(save);
601 return -ENOMEM;
602 }
603 kaddr = kmap_atomic(page, KM_USER0);
604 memcpy(save->data, kaddr, PAGE_SIZE);
605 kunmap_atomic(kaddr, KM_USER0);
606 highmem_copy = save;
607 }
608 return 0;
609}
610#endif /* CONFIG_HIGHMEM */
611
612
613static int save_highmem(void)
614{
615#ifdef CONFIG_HIGHMEM
616 struct zone *zone;
617 int res = 0;
618
619 pr_debug("swsusp: Saving Highmem\n");
620 for_each_zone (zone) {
621 if (is_highmem(zone))
622 res = save_highmem_zone(zone);
623 if (res)
624 return res;
625 }
626#endif
627 return 0;
628}
629
630static int restore_highmem(void)
631{
632#ifdef CONFIG_HIGHMEM
633 printk("swsusp: Restoring Highmem\n");
634 while (highmem_copy) {
635 struct highmem_page *save = highmem_copy;
636 void *kaddr;
637 highmem_copy = save->next;
638
639 kaddr = kmap_atomic(save->page, KM_USER0);
640 memcpy(kaddr, save->data, PAGE_SIZE);
641 kunmap_atomic(kaddr, KM_USER0);
642 free_page((long) save->data);
643 kfree(save);
644 }
645#endif
646 return 0;
647}
648
649
650static int pfn_is_nosave(unsigned long pfn)
651{
652 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
653 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
654 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
655}
656
657/**
658 * saveable - Determine whether a page should be cloned or not.
659 * @pfn: The page
660 *
661 * We save a page if it's Reserved, and not in the range of pages
662 * statically defined as 'unsaveable', or if it isn't reserved, and
663 * isn't part of a free chunk of pages.
664 */
665
666static int saveable(struct zone * zone, unsigned long * zone_pfn)
667{
668 unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
669 struct page * page;
670
671 if (!pfn_valid(pfn))
672 return 0;
673
674 page = pfn_to_page(pfn);
675 BUG_ON(PageReserved(page) && PageNosave(page));
676 if (PageNosave(page))
677 return 0;
678 if (PageReserved(page) && pfn_is_nosave(pfn)) {
679 pr_debug("[nosave pfn 0x%lx]", pfn);
680 return 0;
681 }
682 if (PageNosaveFree(page))
683 return 0;
684
685 return 1;
686}
687
688static void count_data_pages(void)
689{
690 struct zone *zone;
691 unsigned long zone_pfn;
692
693 nr_copy_pages = 0;
694
695 for_each_zone (zone) {
696 if (is_highmem(zone))
697 continue;
698 mark_free_pages(zone);
699 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
700 nr_copy_pages += saveable(zone, &zone_pfn);
701 }
702}
703
704
705static void copy_data_pages(void)
706{
707 struct zone *zone;
708 unsigned long zone_pfn;
709 struct pbe * pbe = pagedir_nosave;
710
711 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
712 for_each_zone (zone) {
713 if (is_highmem(zone))
714 continue;
715 mark_free_pages(zone);
716 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
717 if (saveable(zone, &zone_pfn)) {
718 struct page * page;
719 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
720 BUG_ON(!pbe);
721 pbe->orig_address = (long) page_address(page);
722 /* copy_page is not usable for copying task structs. */
723 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
724 pbe = pbe->next;
725 }
726 }
727 }
728 BUG_ON(pbe);
729}
730
731
732/**
733 * calc_nr - Determine the number of pages needed for a pbe list.
734 */
735
736static int calc_nr(int nr_copy)
737{
738 return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
739}
740
741/**
742 * free_pagedir - free pages allocated with alloc_pagedir()
743 */
744
745static inline void free_pagedir(struct pbe *pblist)
746{
747 struct pbe *pbe;
748
749 while (pblist) {
750 pbe = (pblist + PB_PAGE_SKIP)->next;
751 free_page((unsigned long)pblist);
752 pblist = pbe;
753 }
754}
755
756/**
757 * fill_pb_page - Create a list of PBEs on a given memory page
758 */
759
760static inline void fill_pb_page(struct pbe *pbpage)
761{
762 struct pbe *p;
763
764 p = pbpage;
765 pbpage += PB_PAGE_SKIP;
766 do
767 p->next = p + 1;
768 while (++p < pbpage);
769}
770
771/**
772 * create_pbe_list - Create a list of PBEs on top of a given chain
773 * of memory pages allocated with alloc_pagedir()
774 */
775
776static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
777{
778 struct pbe *pbpage, *p;
779 unsigned num = PBES_PER_PAGE;
780
781 for_each_pb_page (pbpage, pblist) {
782 if (num >= nr_pages)
783 break;
784
785 fill_pb_page(pbpage);
786 num += PBES_PER_PAGE;
787 }
788 if (pbpage) {
789 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
790 p->next = p + 1;
791 p->next = NULL;
792 }
793 pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
794}
795
796/**
797 * alloc_pagedir - Allocate the page directory.
798 *
799 * First, determine exactly how many pages we need and
800 * allocate them.
801 *
802 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
803 * struct pbe elements (pbes) and the last element in the page points
804 * to the next page.
805 *
806 * On each page we set up a list of struct_pbe elements.
807 */
808
809static struct pbe * alloc_pagedir(unsigned nr_pages)
810{
811 unsigned num;
812 struct pbe *pblist, *pbe;
813
814 if (!nr_pages)
815 return NULL;
816
817 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
818 pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
819 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
820 pbe = pbe->next, num += PBES_PER_PAGE) {
821 pbe += PB_PAGE_SKIP;
822 pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
823 }
824 if (!pbe) { /* get_zeroed_page() failed */
825 free_pagedir(pblist);
826 pblist = NULL;
827 }
828 return pblist;
829}
830
831/**
832 * free_image_pages - Free pages allocated for snapshot
833 */
834
835static void free_image_pages(void)
836{
837 struct pbe * p;
838
839 for_each_pbe (p, pagedir_save) {
840 if (p->address) {
841 ClearPageNosave(virt_to_page(p->address));
842 free_page(p->address);
843 p->address = 0;
844 }
845 }
846}
847
848/**
849 * alloc_image_pages - Allocate pages for the snapshot.
850 */
851
852static int alloc_image_pages(void)
853{
854 struct pbe * p;
855
856 for_each_pbe (p, pagedir_save) {
857 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
858 if (!p->address)
859 return -ENOMEM;
860 SetPageNosave(virt_to_page(p->address));
861 }
862 return 0;
863}
864
865/* Free pages we allocated for suspend. Suspend pages are alocated
866 * before atomic copy, so we need to free them after resume.
867 */
868void swsusp_free(void)
869{
870 BUG_ON(PageNosave(virt_to_page(pagedir_save)));
871 BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
872 free_image_pages();
873 free_pagedir(pagedir_save);
874}
875
876
877/**
878 * enough_free_mem - Make sure we enough free memory to snapshot.
879 *
880 * Returns TRUE or FALSE after checking the number of available
881 * free pages.
882 */
883
884static int enough_free_mem(void)
885{
886 if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
887 pr_debug("swsusp: Not enough free pages: Have %d\n",
888 nr_free_pages());
889 return 0;
890 }
891 return 1;
892}
893
894
895/** 536/**
896 * enough_swap - Make sure we have enough swap to save the image. 537 * enough_swap - Make sure we have enough swap to save the image.
897 * 538 *
@@ -902,87 +543,14 @@ static int enough_free_mem(void)
902 * We should only consider resume_device. 543 * We should only consider resume_device.
903 */ 544 */
904 545
905static int enough_swap(void) 546int enough_swap(unsigned nr_pages)
906{ 547{
907 struct sysinfo i; 548 struct sysinfo i;
908 549
909 si_swapinfo(&i); 550 si_swapinfo(&i);
910 if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { 551 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
911 pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); 552 return i.freeswap > (nr_pages + PAGES_FOR_IO +
912 return 0; 553 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
913 }
914 return 1;
915}
916
917static int swsusp_alloc(void)
918{
919 int error;
920
921 pagedir_nosave = NULL;
922 nr_copy_pages = calc_nr(nr_copy_pages);
923 nr_copy_pages_check = nr_copy_pages;
924
925 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
926 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
927
928 if (!enough_free_mem())
929 return -ENOMEM;
930
931 if (!enough_swap())
932 return -ENOSPC;
933
934 if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
935 !!(nr_copy_pages % PBES_PER_PAGE))
936 return -ENOSPC;
937
938 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
939 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
940 return -ENOMEM;
941 }
942 create_pbe_list(pagedir_save, nr_copy_pages);
943 pagedir_nosave = pagedir_save;
944 if ((error = alloc_image_pages())) {
945 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
946 swsusp_free();
947 return error;
948 }
949
950 return 0;
951}
952
953static int suspend_prepare_image(void)
954{
955 int error;
956
957 pr_debug("swsusp: critical section: \n");
958 if (save_highmem()) {
959 printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
960 restore_highmem();
961 return -ENOMEM;
962 }
963
964 drain_local_pages();
965 count_data_pages();
966 printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
967
968 error = swsusp_alloc();
969 if (error)
970 return error;
971
972 /* During allocating of suspend pagedir, new cold pages may appear.
973 * Kill them.
974 */
975 drain_local_pages();
976 copy_data_pages();
977
978 /*
979 * End of critical section. From now on, we can write to memory,
980 * but we should not touch disk. This specially means we must _not_
981 * touch swap space! Except we must write out our image of course.
982 */
983
984 printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
985 return 0;
986} 554}
987 555
988 556
@@ -994,7 +562,7 @@ static int suspend_prepare_image(void)
994int swsusp_write(void) 562int swsusp_write(void)
995{ 563{
996 int error; 564 int error;
997 device_resume(); 565
998 lock_swapdevices(); 566 lock_swapdevices();
999 error = write_suspend_image(); 567 error = write_suspend_image();
1000 /* This will unlock ignored swap devices since writing is finished */ 568 /* This will unlock ignored swap devices since writing is finished */
@@ -1004,14 +572,6 @@ int swsusp_write(void)
1004} 572}
1005 573
1006 574
1007extern asmlinkage int swsusp_arch_suspend(void);
1008extern asmlinkage int swsusp_arch_resume(void);
1009
1010
1011asmlinkage int swsusp_save(void)
1012{
1013 return suspend_prepare_image();
1014}
1015 575
1016int swsusp_suspend(void) 576int swsusp_suspend(void)
1017{ 577{
@@ -1043,7 +603,6 @@ int swsusp_suspend(void)
1043 printk(KERN_ERR "Error %d suspending\n", error); 603 printk(KERN_ERR "Error %d suspending\n", error);
1044 /* Restore control flow magically appears here */ 604 /* Restore control flow magically appears here */
1045 restore_processor_state(); 605 restore_processor_state();
1046 BUG_ON (nr_copy_pages_check != nr_copy_pages);
1047 restore_highmem(); 606 restore_highmem();
1048 device_power_up(); 607 device_power_up();
1049 local_irq_enable(); 608 local_irq_enable();
@@ -1063,6 +622,11 @@ int swsusp_resume(void)
1063 * execution continues at place where swsusp_arch_suspend was called 622 * execution continues at place where swsusp_arch_suspend was called
1064 */ 623 */
1065 BUG_ON(!error); 624 BUG_ON(!error);
625 /* The only reason why swsusp_arch_resume() can fail is memory being
626 * very tight, so we have to free it as soon as we can to avoid
627 * subsequent failures
628 */
629 swsusp_free();
1066 restore_processor_state(); 630 restore_processor_state();
1067 restore_highmem(); 631 restore_highmem();
1068 touch_softlockup_watchdog(); 632 touch_softlockup_watchdog();
@@ -1078,54 +642,28 @@ int swsusp_resume(void)
1078 * 642 *
1079 * We don't know which pages are usable until we allocate them. 643 * We don't know which pages are usable until we allocate them.
1080 * 644 *
1081 * Allocated but unusable (ie eaten) memory pages are linked together 645 * Allocated but unusable (ie eaten) memory pages are marked so that
1082 * to create a list, so that we can free them easily 646 * swsusp_free() can release them
1083 *
1084 * We could have used a type other than (void *)
1085 * for this purpose, but ...
1086 */ 647 */
1087static void **eaten_memory = NULL;
1088
1089static inline void eat_page(void *page)
1090{
1091 void **c;
1092
1093 c = eaten_memory;
1094 eaten_memory = page;
1095 *eaten_memory = c;
1096}
1097 648
1098unsigned long get_usable_page(unsigned gfp_mask) 649unsigned long get_safe_page(gfp_t gfp_mask)
1099{ 650{
1100 unsigned long m; 651 unsigned long m;
1101 652
1102 m = get_zeroed_page(gfp_mask); 653 do {
1103 while (!PageNosaveFree(virt_to_page(m))) {
1104 eat_page((void *)m);
1105 m = get_zeroed_page(gfp_mask); 654 m = get_zeroed_page(gfp_mask);
1106 if (!m) 655 if (m && PageNosaveFree(virt_to_page(m)))
1107 break; 656 /* This is for swsusp_free() */
657 SetPageNosave(virt_to_page(m));
658 } while (m && PageNosaveFree(virt_to_page(m)));
659 if (m) {
660 /* This is for swsusp_free() */
661 SetPageNosave(virt_to_page(m));
662 SetPageNosaveFree(virt_to_page(m));
1108 } 663 }
1109 return m; 664 return m;
1110} 665}
1111 666
1112void free_eaten_memory(void)
1113{
1114 unsigned long m;
1115 void **c;
1116 int i = 0;
1117
1118 c = eaten_memory;
1119 while (c) {
1120 m = (unsigned long)c;
1121 c = *c;
1122 free_page(m);
1123 i++;
1124 }
1125 eaten_memory = NULL;
1126 pr_debug("swsusp: %d unused pages freed\n", i);
1127}
1128
1129/** 667/**
1130 * check_pagedir - We ensure here that pages that the PBEs point to 668 * check_pagedir - We ensure here that pages that the PBEs point to
1131 * won't collide with pages where we're going to restore from the loaded 669 * won't collide with pages where we're going to restore from the loaded
@@ -1143,7 +681,7 @@ static int check_pagedir(struct pbe *pblist)
1143 p->address = 0UL; 681 p->address = 0UL;
1144 682
1145 for_each_pbe (p, pblist) { 683 for_each_pbe (p, pblist) {
1146 p->address = get_usable_page(GFP_ATOMIC); 684 p->address = get_safe_page(GFP_ATOMIC);
1147 if (!p->address) 685 if (!p->address)
1148 return -ENOMEM; 686 return -ENOMEM;
1149 } 687 }
@@ -1162,7 +700,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1162 unsigned long zone_pfn; 700 unsigned long zone_pfn;
1163 struct pbe *pbpage, *tail, *p; 701 struct pbe *pbpage, *tail, *p;
1164 void *m; 702 void *m;
1165 int rel = 0, error = 0; 703 int rel = 0;
1166 704
1167 if (!pblist) /* a sanity check */ 705 if (!pblist) /* a sanity check */
1168 return NULL; 706 return NULL;
@@ -1170,41 +708,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1170 pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", 708 pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
1171 swsusp_info.pagedir_pages); 709 swsusp_info.pagedir_pages);
1172 710
1173 /* Set page flags */ 711 /* Clear page flags */
1174 712
1175 for_each_zone (zone) { 713 for_each_zone (zone) {
1176 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 714 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1177 SetPageNosaveFree(pfn_to_page(zone_pfn + 715 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
716 ClearPageNosaveFree(pfn_to_page(zone_pfn +
1178 zone->zone_start_pfn)); 717 zone->zone_start_pfn));
1179 } 718 }
1180 719
1181 /* Clear orig addresses */ 720 /* Mark orig addresses */
1182 721
1183 for_each_pbe (p, pblist) 722 for_each_pbe (p, pblist)
1184 ClearPageNosaveFree(virt_to_page(p->orig_address)); 723 SetPageNosaveFree(virt_to_page(p->orig_address));
1185 724
1186 tail = pblist + PB_PAGE_SKIP; 725 tail = pblist + PB_PAGE_SKIP;
1187 726
1188 /* Relocate colliding pages */ 727 /* Relocate colliding pages */
1189 728
1190 for_each_pb_page (pbpage, pblist) { 729 for_each_pb_page (pbpage, pblist) {
1191 if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { 730 if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
1192 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); 731 m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD);
1193 if (!m) { 732 if (!m)
1194 error = -ENOMEM; 733 return NULL;
1195 break;
1196 }
1197 memcpy(m, (void *)pbpage, PAGE_SIZE); 734 memcpy(m, (void *)pbpage, PAGE_SIZE);
1198 if (pbpage == pblist) 735 if (pbpage == pblist)
1199 pblist = (struct pbe *)m; 736 pblist = (struct pbe *)m;
1200 else 737 else
1201 tail->next = (struct pbe *)m; 738 tail->next = (struct pbe *)m;
1202
1203 eat_page((void *)pbpage);
1204 pbpage = (struct pbe *)m; 739 pbpage = (struct pbe *)m;
1205 740
1206 /* We have to link the PBEs again */ 741 /* We have to link the PBEs again */
1207
1208 for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) 742 for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
1209 if (p->next) /* needed to save the end */ 743 if (p->next) /* needed to save the end */
1210 p->next = p + 1; 744 p->next = p + 1;
@@ -1214,15 +748,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1214 tail = pbpage + PB_PAGE_SKIP; 748 tail = pbpage + PB_PAGE_SKIP;
1215 } 749 }
1216 750
1217 if (error) { 751 /* This is for swsusp_free() */
1218 printk("\nswsusp: Out of memory\n\n"); 752 for_each_pb_page (pbpage, pblist) {
1219 free_pagedir(pblist); 753 SetPageNosave(virt_to_page(pbpage));
1220 free_eaten_memory(); 754 SetPageNosaveFree(virt_to_page(pbpage));
1221 pblist = NULL; 755 }
1222 /* Is this even worth handling? It should never ever happen, and we 756
1223 have just lost user's state, anyway... */ 757 printk("swsusp: Relocated %d pages\n", rel);
1224 } else
1225 printk("swsusp: Relocated %d pages\n", rel);
1226 758
1227 return pblist; 759 return pblist;
1228} 760}
@@ -1440,9 +972,7 @@ static int read_pagedir(struct pbe *pblist)
1440 break; 972 break;
1441 } 973 }
1442 974
1443 if (error) 975 if (!error)
1444 free_pagedir(pblist);
1445 else
1446 BUG_ON(i != swsusp_info.pagedir_pages); 976 BUG_ON(i != swsusp_info.pagedir_pages);
1447 977
1448 return error; 978 return error;
@@ -1485,15 +1015,6 @@ static int read_suspend_image(void)
1485 if (!error) 1015 if (!error)
1486 error = data_read(pagedir_nosave); 1016 error = data_read(pagedir_nosave);
1487 1017
1488 if (error) { /* We fail cleanly */
1489 free_eaten_memory();
1490 for_each_pbe (p, pagedir_nosave)
1491 if (p->address) {
1492 free_page(p->address);
1493 p->address = 0UL;
1494 }
1495 free_pagedir(pagedir_nosave);
1496 }
1497 return error; 1018 return error;
1498} 1019}
1499 1020
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b8f0f9230a4..3cb9708209bc 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -10,7 +10,7 @@
10 * elsewhere, in preparation for a serial line console (someday). 10 * elsewhere, in preparation for a serial line console (someday).
11 * Ted Ts'o, 2/11/93. 11 * Ted Ts'o, 2/11/93.
12 * Modified for sysctl support, 1/8/97, Chris Horn. 12 * Modified for sysctl support, 1/8/97, Chris Horn.
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfreds@colorfullife.com 14 * manfreds@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au>
@@ -148,7 +148,7 @@ static int __init console_setup(char *str)
148 if (!strcmp(str, "ttyb")) 148 if (!strcmp(str, "ttyb"))
149 strcpy(name, "ttyS1"); 149 strcpy(name, "ttyS1");
150#endif 150#endif
151 for(s = name; *s; s++) 151 for (s = name; *s; s++)
152 if ((*s >= '0' && *s <= '9') || *s == ',') 152 if ((*s >= '0' && *s <= '9') || *s == ',')
153 break; 153 break;
154 idx = simple_strtoul(s, NULL, 10); 154 idx = simple_strtoul(s, NULL, 10);
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str)
169 size = roundup_pow_of_two(size); 169 size = roundup_pow_of_two(size);
170 if (size > log_buf_len) { 170 if (size > log_buf_len) {
171 unsigned long start, dest_idx, offset; 171 unsigned long start, dest_idx, offset;
172 char * new_log_buf; 172 char *new_log_buf;
173 173
174 new_log_buf = alloc_bootmem(size); 174 new_log_buf = alloc_bootmem(size);
175 if (!new_log_buf) { 175 if (!new_log_buf) {
176 printk("log_buf_len: allocation failed\n"); 176 printk(KERN_WARNING "log_buf_len: allocation failed\n");
177 goto out; 177 goto out;
178 } 178 }
179 179
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str)
193 log_end -= offset; 193 log_end -= offset;
194 spin_unlock_irqrestore(&logbuf_lock, flags); 194 spin_unlock_irqrestore(&logbuf_lock, flags);
195 195
196 printk("log_buf_len: %d\n", log_buf_len); 196 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
197 } 197 }
198out: 198out:
199
200 return 1; 199 return 1;
201} 200}
202 201
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup);
217 * 9 -- Return number of unread characters in the log buffer 216 * 9 -- Return number of unread characters in the log buffer
218 * 10 -- Return size of the log buffer 217 * 10 -- Return size of the log buffer
219 */ 218 */
220int do_syslog(int type, char __user * buf, int len) 219int do_syslog(int type, char __user *buf, int len)
221{ 220{
222 unsigned long i, j, limit, count; 221 unsigned long i, j, limit, count;
223 int do_clear = 0; 222 int do_clear = 0;
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len)
244 error = -EFAULT; 243 error = -EFAULT;
245 goto out; 244 goto out;
246 } 245 }
247 error = wait_event_interruptible(log_wait, (log_start - log_end)); 246 error = wait_event_interruptible(log_wait,
247 (log_start - log_end));
248 if (error) 248 if (error)
249 goto out; 249 goto out;
250 i = 0; 250 i = 0;
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len)
264 error = i; 264 error = i;
265 break; 265 break;
266 case 4: /* Read/clear last kernel messages */ 266 case 4: /* Read/clear last kernel messages */
267 do_clear = 1; 267 do_clear = 1;
268 /* FALL THRU */ 268 /* FALL THRU */
269 case 3: /* Read last kernel messages */ 269 case 3: /* Read last kernel messages */
270 error = -EINVAL; 270 error = -EINVAL;
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len)
288 limit = log_end; 288 limit = log_end;
289 /* 289 /*
290 * __put_user() could sleep, and while we sleep 290 * __put_user() could sleep, and while we sleep
291 * printk() could overwrite the messages 291 * printk() could overwrite the messages
292 * we try to copy to user space. Therefore 292 * we try to copy to user space. Therefore
293 * the messages are copied in reverse. <manfreds> 293 * the messages are copied in reverse. <manfreds>
294 */ 294 */
295 for(i = 0; i < count && !error; i++) { 295 for (i = 0; i < count && !error; i++) {
296 j = limit-1-i; 296 j = limit-1-i;
297 if (j + log_buf_len < log_end) 297 if (j + log_buf_len < log_end)
298 break; 298 break;
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len)
306 if (error) 306 if (error)
307 break; 307 break;
308 error = i; 308 error = i;
309 if(i != count) { 309 if (i != count) {
310 int offset = count-error; 310 int offset = count-error;
311 /* buffer overflow during copy, correct user buffer. */ 311 /* buffer overflow during copy, correct user buffer. */
312 for(i=0;i<error;i++) { 312 for (i = 0; i < error; i++) {
313 if (__get_user(c,&buf[i+offset]) || 313 if (__get_user(c,&buf[i+offset]) ||
314 __put_user(c,&buf[i])) { 314 __put_user(c,&buf[i])) {
315 error = -EFAULT; 315 error = -EFAULT;
@@ -351,7 +351,7 @@ out:
351 return error; 351 return error;
352} 352}
353 353
354asmlinkage long sys_syslog(int type, char __user * buf, int len) 354asmlinkage long sys_syslog(int type, char __user *buf, int len)
355{ 355{
356 return do_syslog(type, buf, len); 356 return do_syslog(type, buf, len);
357} 357}
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end)
404 cur_index = start; 404 cur_index = start;
405 start_print = start; 405 start_print = start;
406 while (cur_index != end) { 406 while (cur_index != end) {
407 if ( msg_level < 0 && 407 if (msg_level < 0 && ((end - cur_index) > 2) &&
408 ((end - cur_index) > 2) && 408 LOG_BUF(cur_index + 0) == '<' &&
409 LOG_BUF(cur_index + 0) == '<' && 409 LOG_BUF(cur_index + 1) >= '0' &&
410 LOG_BUF(cur_index + 1) >= '0' && 410 LOG_BUF(cur_index + 1) <= '7' &&
411 LOG_BUF(cur_index + 1) <= '7' && 411 LOG_BUF(cur_index + 2) == '>') {
412 LOG_BUF(cur_index + 2) == '>')
413 {
414 msg_level = LOG_BUF(cur_index + 1) - '0'; 412 msg_level = LOG_BUF(cur_index + 1) - '0';
415 cur_index += 3; 413 cur_index += 3;
416 start_print = cur_index; 414 start_print = cur_index;
417 } 415 }
418 while (cur_index != end) { 416 while (cur_index != end) {
419 char c = LOG_BUF(cur_index); 417 char c = LOG_BUF(cur_index);
420 cur_index++;
421 418
419 cur_index++;
422 if (c == '\n') { 420 if (c == '\n') {
423 if (msg_level < 0) { 421 if (msg_level < 0) {
424 /* 422 /*
@@ -461,7 +459,7 @@ static void zap_locks(void)
461 static unsigned long oops_timestamp; 459 static unsigned long oops_timestamp;
462 460
463 if (time_after_eq(jiffies, oops_timestamp) && 461 if (time_after_eq(jiffies, oops_timestamp) &&
464 !time_after(jiffies, oops_timestamp + 30*HZ)) 462 !time_after(jiffies, oops_timestamp + 30 * HZ))
465 return; 463 return;
466 464
467 oops_timestamp = jiffies; 465 oops_timestamp = jiffies;
@@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void)
495 493
496/* 494/*
497 * This is printk. It can be called from any context. We want it to work. 495 * This is printk. It can be called from any context. We want it to work.
498 * 496 *
499 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 497 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
500 * call the console drivers. If we fail to get the semaphore we place the output 498 * call the console drivers. If we fail to get the semaphore we place the output
501 * into the log buffer and return. The current holder of the console_sem will 499 * into the log buffer and return. The current holder of the console_sem will
@@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk);
639 637
640#else 638#else
641 639
642asmlinkage long sys_syslog(int type, char __user * buf, int len) 640asmlinkage long sys_syslog(int type, char __user *buf, int len)
643{ 641{
644 return 0; 642 return 0;
645} 643}
646 644
647int do_syslog(int type, char __user * buf, int len) { return 0; } 645int do_syslog(int type, char __user *buf, int len)
648static void call_console_drivers(unsigned long start, unsigned long end) {} 646{
647 return 0;
648}
649
650static void call_console_drivers(unsigned long start, unsigned long end)
651{
652}
649 653
650#endif 654#endif
651 655
@@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start);
851 * print any messages that were printed by the kernel before the 855 * print any messages that were printed by the kernel before the
852 * console driver was initialized. 856 * console driver was initialized.
853 */ 857 */
854void register_console(struct console * console) 858void register_console(struct console *console)
855{ 859{
856 int i; 860 int i;
857 unsigned long flags; 861 unsigned long flags;
858 862
859 if (preferred_console < 0) 863 if (preferred_console < 0)
@@ -878,7 +882,8 @@ void register_console(struct console * console)
878 * See if this console matches one we selected on 882 * See if this console matches one we selected on
879 * the command line. 883 * the command line.
880 */ 884 */
881 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { 885 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
886 i++) {
882 if (strcmp(console_cmdline[i].name, console->name) != 0) 887 if (strcmp(console_cmdline[i].name, console->name) != 0)
883 continue; 888 continue;
884 if (console->index >= 0 && 889 if (console->index >= 0 &&
@@ -933,9 +938,9 @@ void register_console(struct console * console)
933} 938}
934EXPORT_SYMBOL(register_console); 939EXPORT_SYMBOL(register_console);
935 940
936int unregister_console(struct console * console) 941int unregister_console(struct console *console)
937{ 942{
938 struct console *a,*b; 943 struct console *a, *b;
939 int res = 1; 944 int res = 1;
940 945
941 acquire_console_sem(); 946 acquire_console_sem();
@@ -949,10 +954,10 @@ int unregister_console(struct console * console)
949 b->next = a->next; 954 b->next = a->next;
950 res = 0; 955 res = 0;
951 break; 956 break;
952 } 957 }
953 } 958 }
954 } 959 }
955 960
956 /* If last console is removed, we re-enable picking the first 961 /* If last console is removed, we re-enable picking the first
957 * one that gets registered. Without that, pmac early boot console 962 * one that gets registered. Without that, pmac early boot console
958 * would prevent fbcon from taking over. 963 * would prevent fbcon from taking over.
@@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
994int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 999int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
995{ 1000{
996 static DEFINE_SPINLOCK(ratelimit_lock); 1001 static DEFINE_SPINLOCK(ratelimit_lock);
997 static unsigned long toks = 10*5*HZ; 1002 static unsigned long toks = 10 * 5 * HZ;
998 static unsigned long last_msg; 1003 static unsigned long last_msg;
999 static int missed; 1004 static int missed;
1000 unsigned long flags; 1005 unsigned long flags;
@@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1007 toks = ratelimit_burst * ratelimit_jiffies; 1012 toks = ratelimit_burst * ratelimit_jiffies;
1008 if (toks >= ratelimit_jiffies) { 1013 if (toks >= ratelimit_jiffies) {
1009 int lost = missed; 1014 int lost = missed;
1015
1010 missed = 0; 1016 missed = 0;
1011 toks -= ratelimit_jiffies; 1017 toks -= ratelimit_jiffies;
1012 spin_unlock_irqrestore(&ratelimit_lock, flags); 1018 spin_unlock_irqrestore(&ratelimit_lock, flags);
@@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1021EXPORT_SYMBOL(__printk_ratelimit); 1027EXPORT_SYMBOL(__printk_ratelimit);
1022 1028
1023/* minimum time in jiffies between messages */ 1029/* minimum time in jiffies between messages */
1024int printk_ratelimit_jiffies = 5*HZ; 1030int printk_ratelimit_jiffies = 5 * HZ;
1025 1031
1026/* number of messages we send before ratelimiting */ 1032/* number of messages we send before ratelimiting */
1027int printk_ratelimit_burst = 10; 1033int printk_ratelimit_burst = 10;
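
The __printk_ratelimit() hunk above is a token-bucket limiter: tokens accrue with elapsed jiffies, are capped at ratelimit_burst * ratelimit_jiffies, and each emitted message spends one interval's worth of tokens. Below is a minimal single-threaded user-space model of the same arithmetic (milliseconds stand in for jiffies and there is no spinlock); it is a sketch for illustration, not kernel code.

/* Stand-alone model of a token-bucket ratelimiter, following the same
 * arithmetic as the __printk_ratelimit() hunk above: tokens accumulate
 * with elapsed time, are capped at burst * interval, and each allowed
 * message costs one interval's worth of tokens. */
#include <stdio.h>
#include <time.h>

static long interval_ms = 5000;   /* minimum spacing between messages */
static int  burst       = 10;     /* messages allowed before limiting kicks in */

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

/* Returns 1 if the caller may emit a message, 0 if it is being limited. */
static int ratelimit(void)
{
	static long tokens = -1;
	static long last;
	static int missed;
	long now = now_ms();

	if (tokens < 0) {                       /* first call: full bucket */
		tokens = (long)burst * interval_ms;
		last = now;
	}
	tokens += now - last;                   /* refill with elapsed time */
	last = now;
	if (tokens > (long)burst * interval_ms)
		tokens = (long)burst * interval_ms;
	if (tokens >= interval_ms) {
		if (missed) {
			printf("ratelimit: %d messages suppressed\n", missed);
			missed = 0;
		}
		tokens -= interval_ms;
		return 1;
	}
	missed++;
	return 0;
}

int main(void)
{
	/* Emits the first 'burst' messages, then suppresses the rest. */
	for (int i = 0; i < 100; i++)
		if (ratelimit())
			printf("message %d\n", i);
	return 0;
}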
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 019e04ec065a..863eee8bff47 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child)
56 signal_wake_up(child, 1); 56 signal_wake_up(child, 1);
57 } 57 }
58 } 58 }
59 if (child->signal->flags & SIGNAL_GROUP_EXIT) {
60 sigaddset(&child->pending.signal, SIGKILL);
61 signal_wake_up(child, 1);
62 }
59 spin_unlock(&child->sighand->siglock); 63 spin_unlock(&child->sighand->siglock);
60} 64}
61 65
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child)
77 SET_LINKS(child); 81 SET_LINKS(child);
78 } 82 }
79 83
80 if (child->state == TASK_TRACED) 84 ptrace_untrace(child);
81 ptrace_untrace(child);
82} 85}
83 86
84/* 87/*
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2559d4b8f23f..c4d159a21e04 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -154,6 +154,15 @@ void fastcall call_rcu_bh(struct rcu_head *head,
154} 154}
155 155
156/* 156/*
157 * Return the number of RCU batches processed thus far. Useful
158 * for debug and statistics.
159 */
160long rcu_batches_completed(void)
161{
162 return rcu_ctrlblk.completed;
163}
164
165/*
157 * Invoke the completed RCU callbacks. They are expected to be in 166 * Invoke the completed RCU callbacks. They are expected to be in
158 * a per-cpu list. 167 * a per-cpu list.
159 */ 168 */
@@ -501,6 +510,7 @@ void synchronize_kernel(void)
501} 510}
502 511
503module_param(maxbatch, int, 0); 512module_param(maxbatch, int, 0);
513EXPORT_SYMBOL_GPL(rcu_batches_completed);
504EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ 514EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
505EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 515EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
506EXPORT_SYMBOL_GPL(synchronize_rcu); 516EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
new file mode 100644
index 000000000000..9b58f1eff3ca
--- /dev/null
+++ b/kernel/rcutorture.c
@@ -0,0 +1,492 @@
1/*
2 * Read-Copy Update /proc-based torture test facility
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2005
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 *
22 * See also: Documentation/RCU/torture.txt
23 */
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/module.h>
28#include <linux/kthread.h>
29#include <linux/err.h>
30#include <linux/spinlock.h>
31#include <linux/smp.h>
32#include <linux/rcupdate.h>
33#include <linux/interrupt.h>
34#include <linux/sched.h>
35#include <asm/atomic.h>
36#include <linux/bitops.h>
37#include <linux/module.h>
38#include <linux/completion.h>
39#include <linux/moduleparam.h>
40#include <linux/percpu.h>
41#include <linux/notifier.h>
42#include <linux/rcuref.h>
43#include <linux/cpu.h>
44#include <linux/random.h>
45#include <linux/delay.h>
46#include <linux/byteorder/swabb.h>
47#include <linux/stat.h>
48
49MODULE_LICENSE("GPL");
50
51static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */
52static int stat_interval = 0; /* Interval between stats, in seconds. */
53 /* Defaults to "only at end of test". */
54static int verbose = 0; /* Print more debug info. */
55
56MODULE_PARM(nreaders, "i");
57MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
58MODULE_PARM(stat_interval, "i");
59MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
60MODULE_PARM(verbose, "i");
61MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
62#define TORTURE_FLAG "rcutorture: "
63#define PRINTK_STRING(s) \
64 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
65#define VERBOSE_PRINTK_STRING(s) \
66 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
67#define VERBOSE_PRINTK_ERRSTRING(s) \
68 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
69
70static char printk_buf[4096];
71
72static int nrealreaders;
73static struct task_struct *writer_task;
74static struct task_struct **reader_tasks;
75static struct task_struct *stats_task;
76
77#define RCU_TORTURE_PIPE_LEN 10
78
79struct rcu_torture {
80 struct rcu_head rtort_rcu;
81 int rtort_pipe_count;
82 struct list_head rtort_free;
83};
84
85static int fullstop = 0; /* stop generating callbacks at test end. */
86static LIST_HEAD(rcu_torture_freelist);
87static struct rcu_torture *rcu_torture_current = NULL;
88static long rcu_torture_current_version = 0;
89static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
90static DEFINE_SPINLOCK(rcu_torture_lock);
91static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
92 { 0 };
93static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
94 { 0 };
95static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
96atomic_t n_rcu_torture_alloc;
97atomic_t n_rcu_torture_alloc_fail;
98atomic_t n_rcu_torture_free;
99
100/*
101 * Allocate an element from the rcu_tortures pool.
102 */
103struct rcu_torture *
104rcu_torture_alloc(void)
105{
106 struct list_head *p;
107
108 spin_lock(&rcu_torture_lock);
109 if (list_empty(&rcu_torture_freelist)) {
110 atomic_inc(&n_rcu_torture_alloc_fail);
111 spin_unlock(&rcu_torture_lock);
112 return NULL;
113 }
114 atomic_inc(&n_rcu_torture_alloc);
115 p = rcu_torture_freelist.next;
116 list_del_init(p);
117 spin_unlock(&rcu_torture_lock);
118 return container_of(p, struct rcu_torture, rtort_free);
119}
120
121/*
122 * Free an element to the rcu_tortures pool.
123 */
124static void
125rcu_torture_free(struct rcu_torture *p)
126{
127 atomic_inc(&n_rcu_torture_free);
128 spin_lock(&rcu_torture_lock);
129 list_add_tail(&p->rtort_free, &rcu_torture_freelist);
130 spin_unlock(&rcu_torture_lock);
131}
132
133static void
134rcu_torture_cb(struct rcu_head *p)
135{
136 int i;
137 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
138
139 if (fullstop) {
140 /* Test is ending, just drop callbacks on the floor. */
141 /* The next initialization will pick up the pieces. */
142 return;
143 }
144 i = rp->rtort_pipe_count;
145 if (i > RCU_TORTURE_PIPE_LEN)
146 i = RCU_TORTURE_PIPE_LEN;
147 atomic_inc(&rcu_torture_wcount[i]);
148 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN)
149 rcu_torture_free(rp);
150 else
151 call_rcu(p, rcu_torture_cb);
152}
153
154struct rcu_random_state {
155 unsigned long rrs_state;
156 unsigned long rrs_count;
157};
158
159#define RCU_RANDOM_MULT 39916801 /* prime */
160#define RCU_RANDOM_ADD 479001701 /* prime */
161#define RCU_RANDOM_REFRESH 10000
162
163#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
164
165/*
166 * Crude but fast random-number generator. Uses a linear congruential
167 * generator, with occasional help from get_random_bytes().
168 */
169static long
170rcu_random(struct rcu_random_state *rrsp)
171{
172 long refresh;
173
174 if (--rrsp->rrs_count < 0) {
175 get_random_bytes(&refresh, sizeof(refresh));
176 rrsp->rrs_state += refresh;
177 rrsp->rrs_count = RCU_RANDOM_REFRESH;
178 }
179 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
180 return swahw32(rrsp->rrs_state);
181}
182
183/*
184 * RCU torture writer kthread. Repeatedly substitutes a new structure
185 * for that pointed to by rcu_torture_current, freeing the old structure
186 * after a series of grace periods (the "pipeline").
187 */
188static int
189rcu_torture_writer(void *arg)
190{
191 int i;
192 long oldbatch = rcu_batches_completed();
193 struct rcu_torture *rp;
194 struct rcu_torture *old_rp;
195 static DEFINE_RCU_RANDOM(rand);
196
197 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
198 do {
199 schedule_timeout_uninterruptible(1);
200 if (rcu_batches_completed() == oldbatch)
201 continue;
202 if ((rp = rcu_torture_alloc()) == NULL)
203 continue;
204 rp->rtort_pipe_count = 0;
205 udelay(rcu_random(&rand) & 0x3ff);
206 old_rp = rcu_torture_current;
207 rcu_assign_pointer(rcu_torture_current, rp);
208 smp_wmb();
209 if (old_rp != NULL) {
210 i = old_rp->rtort_pipe_count;
211 if (i > RCU_TORTURE_PIPE_LEN)
212 i = RCU_TORTURE_PIPE_LEN;
213 atomic_inc(&rcu_torture_wcount[i]);
214 old_rp->rtort_pipe_count++;
215 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
216 }
217 rcu_torture_current_version++;
218 oldbatch = rcu_batches_completed();
219 } while (!kthread_should_stop() && !fullstop);
220 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
221 while (!kthread_should_stop())
222 schedule_timeout_uninterruptible(1);
223 return 0;
224}
225
226/*
227 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
228 * incrementing the corresponding element of the pipeline array. The
229 * counter in the element should never be greater than 1, otherwise, the
230 * RCU implementation is broken.
231 */
232static int
233rcu_torture_reader(void *arg)
234{
235 int completed;
236 DEFINE_RCU_RANDOM(rand);
237 struct rcu_torture *p;
238 int pipe_count;
239
240 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
241 do {
242 rcu_read_lock();
243 completed = rcu_batches_completed();
244 p = rcu_dereference(rcu_torture_current);
245 if (p == NULL) {
246 /* Wait for rcu_torture_writer to get underway */
247 rcu_read_unlock();
248 schedule_timeout_interruptible(HZ);
249 continue;
250 }
251 udelay(rcu_random(&rand) & 0x7f);
252 preempt_disable();
253 pipe_count = p->rtort_pipe_count;
254 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
255 /* Should not happen, but... */
256 pipe_count = RCU_TORTURE_PIPE_LEN;
257 }
258 ++__get_cpu_var(rcu_torture_count)[pipe_count];
259 completed = rcu_batches_completed() - completed;
260 if (completed > RCU_TORTURE_PIPE_LEN) {
261 /* Should not happen, but... */
262 completed = RCU_TORTURE_PIPE_LEN;
263 }
264 ++__get_cpu_var(rcu_torture_batch)[completed];
265 preempt_enable();
266 rcu_read_unlock();
267 schedule();
268 } while (!kthread_should_stop() && !fullstop);
269 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
270 while (!kthread_should_stop())
271 schedule_timeout_uninterruptible(1);
272 return 0;
273}
274
275/*
276 * Create an RCU-torture statistics message in the specified buffer.
277 */
278static int
279rcu_torture_printk(char *page)
280{
281 int cnt = 0;
282 int cpu;
283 int i;
284 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
285 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
286
287 for_each_cpu(cpu) {
288 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
289 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
290 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
291 }
292 }
293 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
294 if (pipesummary[i] != 0)
295 break;
296 }
297 cnt += sprintf(&page[cnt], "rcutorture: ");
298 cnt += sprintf(&page[cnt],
299 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d",
300 rcu_torture_current,
301 rcu_torture_current_version,
302 list_empty(&rcu_torture_freelist),
303 atomic_read(&n_rcu_torture_alloc),
304 atomic_read(&n_rcu_torture_alloc_fail),
305 atomic_read(&n_rcu_torture_free));
306 cnt += sprintf(&page[cnt], "\nrcutorture: ");
307 if (i > 1)
308 cnt += sprintf(&page[cnt], "!!! ");
309 cnt += sprintf(&page[cnt], "Reader Pipe: ");
310 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
311 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
312 cnt += sprintf(&page[cnt], "\nrcutorture: ");
313 cnt += sprintf(&page[cnt], "Reader Batch: ");
314 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
315 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
316 cnt += sprintf(&page[cnt], "\nrcutorture: ");
317 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
318 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
319 cnt += sprintf(&page[cnt], " %d",
320 atomic_read(&rcu_torture_wcount[i]));
321 }
322 cnt += sprintf(&page[cnt], "\n");
323 return cnt;
324}
325
326/*
327 * Print torture statistics. Caller must ensure that there is only
328 * one call to this function at a given time!!! This is normally
329 * accomplished by relying on the module system to only have one copy
330 * of the module loaded, and then by giving the rcu_torture_stats
331 * kthread full control (or the init/cleanup functions when rcu_torture_stats
332 * thread is not running).
333 */
334static void
335rcu_torture_stats_print(void)
336{
337 int cnt;
338
339 cnt = rcu_torture_printk(printk_buf);
340 printk(KERN_ALERT "%s", printk_buf);
341}
342
343/*
344 * Periodically prints torture statistics, if periodic statistics printing
345 * was specified via the stat_interval module parameter.
346 *
347 * No need to worry about fullstop here, since this one doesn't reference
348 * volatile state or register callbacks.
349 */
350static int
351rcu_torture_stats(void *arg)
352{
353 VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
354 do {
355 schedule_timeout_interruptible(stat_interval * HZ);
356 rcu_torture_stats_print();
357 } while (!kthread_should_stop());
358 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
359 return 0;
360}
361
362static void
363rcu_torture_cleanup(void)
364{
365 int i;
366
367 fullstop = 1;
368 if (writer_task != NULL) {
369 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
370 kthread_stop(writer_task);
371 }
372 writer_task = NULL;
373
374 if (reader_tasks != NULL) {
375 for (i = 0; i < nrealreaders; i++) {
376 if (reader_tasks[i] != NULL) {
377 VERBOSE_PRINTK_STRING(
378 "Stopping rcu_torture_reader task");
379 kthread_stop(reader_tasks[i]);
380 }
381 reader_tasks[i] = NULL;
382 }
383 kfree(reader_tasks);
384 reader_tasks = NULL;
385 }
386 rcu_torture_current = NULL;
387
388 if (stats_task != NULL) {
389 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
390 kthread_stop(stats_task);
391 }
392 stats_task = NULL;
393
394 /* Wait for all RCU callbacks to fire. */
395
396 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
397 synchronize_rcu();
398 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
399 PRINTK_STRING("--- End of test");
400}
401
402static int
403rcu_torture_init(void)
404{
405 int i;
406 int cpu;
407 int firsterr = 0;
408
409 /* Process args and tell the world that the torturer is on the job. */
410
411 if (nreaders >= 0)
412 nrealreaders = nreaders;
413 else
414 nrealreaders = 2 * num_online_cpus();
415 printk(KERN_ALERT TORTURE_FLAG
416 "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n",
417 nrealreaders, stat_interval, verbose);
418 fullstop = 0;
419
420 /* Set up the freelist. */
421
422 INIT_LIST_HEAD(&rcu_torture_freelist);
423 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
424 list_add_tail(&rcu_tortures[i].rtort_free,
425 &rcu_torture_freelist);
426 }
427
428 /* Initialize the statistics so that each run gets its own numbers. */
429
430 rcu_torture_current = NULL;
431 rcu_torture_current_version = 0;
432 atomic_set(&n_rcu_torture_alloc, 0);
433 atomic_set(&n_rcu_torture_alloc_fail, 0);
434 atomic_set(&n_rcu_torture_free, 0);
435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
436 atomic_set(&rcu_torture_wcount[i], 0);
437 for_each_cpu(cpu) {
438 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
439 per_cpu(rcu_torture_count, cpu)[i] = 0;
440 per_cpu(rcu_torture_batch, cpu)[i] = 0;
441 }
442 }
443
444 /* Start up the kthreads. */
445
446 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
447 writer_task = kthread_run(rcu_torture_writer, NULL,
448 "rcu_torture_writer");
449 if (IS_ERR(writer_task)) {
450 firsterr = PTR_ERR(writer_task);
451 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
452 writer_task = NULL;
453 goto unwind;
454 }
455 reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
456 GFP_KERNEL);
457 if (reader_tasks == NULL) {
458 VERBOSE_PRINTK_ERRSTRING("out of memory");
459 firsterr = -ENOMEM;
460 goto unwind;
461 }
462 for (i = 0; i < nrealreaders; i++) {
463 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
464 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
465 "rcu_torture_reader");
466 if (IS_ERR(reader_tasks[i])) {
467 firsterr = PTR_ERR(reader_tasks[i]);
468 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
469 reader_tasks[i] = NULL;
470 goto unwind;
471 }
472 }
473 if (stat_interval > 0) {
474 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
475 stats_task = kthread_run(rcu_torture_stats, NULL,
476 "rcu_torture_stats");
477 if (IS_ERR(stats_task)) {
478 firsterr = PTR_ERR(stats_task);
479 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
480 stats_task = NULL;
481 goto unwind;
482 }
483 }
484 return 0;
485
486unwind:
487 rcu_torture_cleanup();
488 return firsterr;
489}
490
491module_init(rcu_torture_init);
492module_exit(rcu_torture_cleanup);
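
The rcu_random() helper in the new file above is a plain linear congruential generator that folds in fresh entropy every RCU_RANDOM_REFRESH draws; swahw32() then swaps the 16-bit halves so callers that mask only the low bits (as rcu_torture_writer's udelay() does) see the better-mixed half. A stand-alone user-space sketch of that generator follows; getrandom(2) stands in for get_random_bytes() and an explicit halfword swap for swahw32(), so this is an approximation of the idea rather than the kernel code itself.

/* User-space sketch of the rcu_random() linear congruential generator:
 * state = state * MULT + ADD, with an occasional reseed from a stronger
 * source so the LCG cannot settle into a short cycle over many draws. */
#include <stdio.h>
#include <sys/random.h>

#define RCU_RANDOM_MULT    39916801   /* prime */
#define RCU_RANDOM_ADD     479001701  /* prime */
#define RCU_RANDOM_REFRESH 10000      /* draws between reseeds */

struct rcu_random_state {
	unsigned long state;
	long count;
};

static long rcu_random(struct rcu_random_state *rrsp)
{
	unsigned long refresh;

	if (--rrsp->count < 0) {
		/* Fold in some real entropy, as get_random_bytes() does
		 * in the kernel version. */
		if (getrandom(&refresh, sizeof(refresh), 0) == (ssize_t)sizeof(refresh))
			rrsp->state += refresh;
		rrsp->count = RCU_RANDOM_REFRESH;
	}
	rrsp->state = rrsp->state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;

	/* Swap the 16-bit halves of the low 32 bits, in the spirit of
	 * swahw32(), so the low bits callers mask are the well-mixed ones. */
	unsigned int x = (unsigned int)rrsp->state;
	return (long)((x >> 16) | (x << 16));
}

int main(void)
{
	struct rcu_random_state rand = { 0, 0 };

	for (int i = 0; i < 5; i++)
		printf("%ld\n", rcu_random(&rand) & 0x3ff);  /* same mask as the writer's udelay() */
	return 0;
}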
diff --git a/kernel/sched.c b/kernel/sched.c
index 1e5cafdf4e27..340dd238c16d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2511 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2511 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2512 /* Account for system time used */ 2512 /* Account for system time used */
2513 acct_update_integrals(p); 2513 acct_update_integrals(p);
2514 /* Update rss highwater mark */
2515 update_mem_hiwater(p);
2516} 2514}
2517 2515
2518/* 2516/*
@@ -3879,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map);
3879 3877
3880#ifndef CONFIG_SMP 3878#ifndef CONFIG_SMP
3881cpumask_t cpu_online_map = CPU_MASK_ALL; 3879cpumask_t cpu_online_map = CPU_MASK_ALL;
3882EXPORT_SYMBOL_GPL(cpu_online_map);
3883cpumask_t cpu_possible_map = CPU_MASK_ALL; 3880cpumask_t cpu_possible_map = CPU_MASK_ALL;
3884#endif 3881#endif
3885 3882
diff --git a/kernel/signal.c b/kernel/signal.c
index f2b96b08fb44..1bf3c39d6109 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
277 } else { 277 } else {
278 INIT_LIST_HEAD(&q->list); 278 INIT_LIST_HEAD(&q->list);
279 q->flags = 0; 279 q->flags = 0;
280 q->lock = NULL;
281 q->user = get_uid(t->user); 280 q->user = get_uid(t->user);
282 } 281 }
283 return(q); 282 return(q);
@@ -406,6 +405,8 @@ void __exit_signal(struct task_struct *tsk)
406 405
407void exit_signal(struct task_struct *tsk) 406void exit_signal(struct task_struct *tsk)
408{ 407{
408 atomic_dec(&tsk->signal->live);
409
409 write_lock_irq(&tasklist_lock); 410 write_lock_irq(&tasklist_lock);
410 __exit_signal(tsk); 411 __exit_signal(tsk);
411 write_unlock_irq(&tasklist_lock); 412 write_unlock_irq(&tasklist_lock);
@@ -650,8 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
650 if (!valid_signal(sig)) 651 if (!valid_signal(sig))
651 return error; 652 return error;
652 error = -EPERM; 653 error = -EPERM;
653 if ((!info || ((unsigned long)info != 1 && 654 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
654 (unsigned long)info != 2 && SI_FROMUSER(info)))
655 && ((sig != SIGCONT) || 655 && ((sig != SIGCONT) ||
656 (current->signal->session != t->signal->session)) 656 (current->signal->session != t->signal->session))
657 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 657 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
@@ -788,7 +788,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
788 * fast-pathed signals for kernel-internal things like SIGSTOP 788 * fast-pathed signals for kernel-internal things like SIGSTOP
789 * or SIGKILL. 789 * or SIGKILL.
790 */ 790 */
791 if ((unsigned long)info == 2) 791 if (info == SEND_SIG_FORCED)
792 goto out_set; 792 goto out_set;
793 793
794 /* Real-time signals must be queued if sent by sigqueue, or 794 /* Real-time signals must be queued if sent by sigqueue, or
@@ -800,19 +800,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
800 pass on the info struct. */ 800 pass on the info struct. */
801 801
802 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 802 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
803 ((unsigned long) info < 2 || 803 (is_si_special(info) ||
804 info->si_code >= 0))); 804 info->si_code >= 0)));
805 if (q) { 805 if (q) {
806 list_add_tail(&q->list, &signals->list); 806 list_add_tail(&q->list, &signals->list);
807 switch ((unsigned long) info) { 807 switch ((unsigned long) info) {
808 case 0: 808 case (unsigned long) SEND_SIG_NOINFO:
809 q->info.si_signo = sig; 809 q->info.si_signo = sig;
810 q->info.si_errno = 0; 810 q->info.si_errno = 0;
811 q->info.si_code = SI_USER; 811 q->info.si_code = SI_USER;
812 q->info.si_pid = current->pid; 812 q->info.si_pid = current->pid;
813 q->info.si_uid = current->uid; 813 q->info.si_uid = current->uid;
814 break; 814 break;
815 case 1: 815 case (unsigned long) SEND_SIG_PRIV:
816 q->info.si_signo = sig; 816 q->info.si_signo = sig;
817 q->info.si_errno = 0; 817 q->info.si_errno = 0;
818 q->info.si_code = SI_KERNEL; 818 q->info.si_code = SI_KERNEL;
@@ -823,20 +823,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
823 copy_siginfo(&q->info, info); 823 copy_siginfo(&q->info, info);
824 break; 824 break;
825 } 825 }
826 } else { 826 } else if (!is_si_special(info)) {
827 if (sig >= SIGRTMIN && info && (unsigned long)info != 1 827 if (sig >= SIGRTMIN && info->si_code != SI_USER)
828 && info->si_code != SI_USER)
829 /* 828 /*
830 * Queue overflow, abort. We may abort if the signal was rt 829 * Queue overflow, abort. We may abort if the signal was rt
831 * and sent by user using something other than kill(). 830 * and sent by user using something other than kill().
832 */ 831 */
833 return -EAGAIN; 832 return -EAGAIN;
834 if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
835 /*
836 * Set up a return to indicate that we dropped
837 * the signal.
838 */
839 ret = info->si_sys_private;
840 } 833 }
841 834
842out_set: 835out_set:
@@ -857,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
857 BUG(); 850 BUG();
858 assert_spin_locked(&t->sighand->siglock); 851 assert_spin_locked(&t->sighand->siglock);
859 852
860 if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
861 /*
862 * Set up a return to indicate that we dropped the signal.
863 */
864 ret = info->si_sys_private;
865
866 /* Short-circuit ignored signals. */ 853 /* Short-circuit ignored signals. */
867 if (sig_ignored(t, sig)) 854 if (sig_ignored(t, sig))
868 goto out; 855 goto out;
@@ -892,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
892 int ret; 879 int ret;
893 880
894 spin_lock_irqsave(&t->sighand->siglock, flags); 881 spin_lock_irqsave(&t->sighand->siglock, flags);
895 if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { 882 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
896 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; 883 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
884 }
885 if (sigismember(&t->blocked, sig)) {
897 sigdelset(&t->blocked, sig); 886 sigdelset(&t->blocked, sig);
898 recalc_sigpending_tsk(t);
899 } 887 }
888 recalc_sigpending_tsk(t);
900 ret = specific_send_sig_info(sig, info, t); 889 ret = specific_send_sig_info(sig, info, t);
901 spin_unlock_irqrestore(&t->sighand->siglock, flags); 890 spin_unlock_irqrestore(&t->sighand->siglock, flags);
902 891
@@ -906,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
906void 895void
907force_sig_specific(int sig, struct task_struct *t) 896force_sig_specific(int sig, struct task_struct *t)
908{ 897{
909 unsigned long int flags; 898 force_sig_info(sig, SEND_SIG_FORCED, t);
910
911 spin_lock_irqsave(&t->sighand->siglock, flags);
912 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
913 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
914 sigdelset(&t->blocked, sig);
915 recalc_sigpending_tsk(t);
916 specific_send_sig_info(sig, (void *)2, t);
917 spin_unlock_irqrestore(&t->sighand->siglock, flags);
918} 899}
919 900
920/* 901/*
@@ -1049,12 +1030,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1049 assert_spin_locked(&p->sighand->siglock); 1030 assert_spin_locked(&p->sighand->siglock);
1050 handle_stop_signal(sig, p); 1031 handle_stop_signal(sig, p);
1051 1032
1052 if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
1053 /*
1054 * Set up a return to indicate that we dropped the signal.
1055 */
1056 ret = info->si_sys_private;
1057
1058 /* Short-circuit ignored signals. */ 1033 /* Short-circuit ignored signals. */
1059 if (sig_ignored(p, sig)) 1034 if (sig_ignored(p, sig))
1060 return ret; 1035 return ret;
@@ -1107,8 +1082,8 @@ void zap_other_threads(struct task_struct *p)
1107 if (t != p->group_leader) 1082 if (t != p->group_leader)
1108 t->exit_signal = -1; 1083 t->exit_signal = -1;
1109 1084
1085 /* SIGKILL will be handled before any pending SIGSTOP */
1110 sigaddset(&t->pending.signal, SIGKILL); 1086 sigaddset(&t->pending.signal, SIGKILL);
1111 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
1112 signal_wake_up(t, 1); 1087 signal_wake_up(t, 1);
1113 } 1088 }
1114} 1089}
@@ -1284,10 +1259,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1284 return ret; 1259 return ret;
1285} 1260}
1286 1261
1262#define __si_special(priv) \
1263 ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
1264
1287int 1265int
1288send_sig(int sig, struct task_struct *p, int priv) 1266send_sig(int sig, struct task_struct *p, int priv)
1289{ 1267{
1290 return send_sig_info(sig, (void*)(long)(priv != 0), p); 1268 return send_sig_info(sig, __si_special(priv), p);
1291} 1269}
1292 1270
1293/* 1271/*
@@ -1307,7 +1285,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1307void 1285void
1308force_sig(int sig, struct task_struct *p) 1286force_sig(int sig, struct task_struct *p)
1309{ 1287{
1310 force_sig_info(sig, (void*)1L, p); 1288 force_sig_info(sig, SEND_SIG_PRIV, p);
1311} 1289}
1312 1290
1313/* 1291/*
@@ -1332,13 +1310,13 @@ force_sigsegv(int sig, struct task_struct *p)
1332int 1310int
1333kill_pg(pid_t pgrp, int sig, int priv) 1311kill_pg(pid_t pgrp, int sig, int priv)
1334{ 1312{
1335 return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); 1313 return kill_pg_info(sig, __si_special(priv), pgrp);
1336} 1314}
1337 1315
1338int 1316int
1339kill_proc(pid_t pid, int sig, int priv) 1317kill_proc(pid_t pid, int sig, int priv)
1340{ 1318{
1341 return kill_proc_info(sig, (void *)(long)(priv != 0), pid); 1319 return kill_proc_info(sig, __si_special(priv), pid);
1342} 1320}
1343 1321
1344/* 1322/*
@@ -1369,11 +1347,12 @@ void sigqueue_free(struct sigqueue *q)
1369 * pending queue. 1347 * pending queue.
1370 */ 1348 */
1371 if (unlikely(!list_empty(&q->list))) { 1349 if (unlikely(!list_empty(&q->list))) {
1372 read_lock(&tasklist_lock); 1350 spinlock_t *lock = &current->sighand->siglock;
1373 spin_lock_irqsave(q->lock, flags); 1351 read_lock(&tasklist_lock);
1352 spin_lock_irqsave(lock, flags);
1374 if (!list_empty(&q->list)) 1353 if (!list_empty(&q->list))
1375 list_del_init(&q->list); 1354 list_del_init(&q->list);
1376 spin_unlock_irqrestore(q->lock, flags); 1355 spin_unlock_irqrestore(lock, flags);
1377 read_unlock(&tasklist_lock); 1356 read_unlock(&tasklist_lock);
1378 } 1357 }
1379 q->flags &= ~SIGQUEUE_PREALLOC; 1358 q->flags &= ~SIGQUEUE_PREALLOC;
@@ -1412,7 +1391,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1412 goto out; 1391 goto out;
1413 } 1392 }
1414 1393
1415 q->lock = &p->sighand->siglock;
1416 list_add_tail(&q->list, &p->pending.list); 1394 list_add_tail(&q->list, &p->pending.list);
1417 sigaddset(&p->pending.signal, sig); 1395 sigaddset(&p->pending.signal, sig);
1418 if (!sigismember(&p->blocked, sig)) 1396 if (!sigismember(&p->blocked, sig))
@@ -1460,7 +1438,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1460 * We always use the shared queue for process-wide signals, 1438 * We always use the shared queue for process-wide signals,
1461 * to avoid several races. 1439 * to avoid several races.
1462 */ 1440 */
1463 q->lock = &p->sighand->siglock;
1464 list_add_tail(&q->list, &p->signal->shared_pending.list); 1441 list_add_tail(&q->list, &p->signal->shared_pending.list);
1465 sigaddset(&p->signal->shared_pending.signal, sig); 1442 sigaddset(&p->signal->shared_pending.signal, sig);
1466 1443
@@ -1879,9 +1856,9 @@ relock:
1879 /* Let the debugger run. */ 1856 /* Let the debugger run. */
1880 ptrace_stop(signr, signr, info); 1857 ptrace_stop(signr, signr, info);
1881 1858
1882 /* We're back. Did the debugger cancel the sig? */ 1859 /* We're back. Did the debugger cancel the sig or group_exit? */
1883 signr = current->exit_code; 1860 signr = current->exit_code;
1884 if (signr == 0) 1861 if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT)
1885 continue; 1862 continue;
1886 1863
1887 current->exit_code = 0; 1864 current->exit_code = 0;
@@ -2283,26 +2260,13 @@ sys_kill(int pid, int sig)
2283 return kill_something_info(sig, &info, pid); 2260 return kill_something_info(sig, &info, pid);
2284} 2261}
2285 2262
2286/** 2263static int do_tkill(int tgid, int pid, int sig)
2287 * sys_tgkill - send signal to one specific thread
2288 * @tgid: the thread group ID of the thread
2289 * @pid: the PID of the thread
2290 * @sig: signal to be sent
2291 *
2292 * This syscall also checks the tgid and returns -ESRCH even if the PID
2293 * exists but it's not belonging to the target process anymore. This
2294 * method solves the problem of threads exiting and PIDs getting reused.
2295 */
2296asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2297{ 2264{
2298 struct siginfo info;
2299 int error; 2265 int error;
2266 struct siginfo info;
2300 struct task_struct *p; 2267 struct task_struct *p;
2301 2268
2302 /* This is only valid for single tasks */ 2269 error = -ESRCH;
2303 if (pid <= 0 || tgid <= 0)
2304 return -EINVAL;
2305
2306 info.si_signo = sig; 2270 info.si_signo = sig;
2307 info.si_errno = 0; 2271 info.si_errno = 0;
2308 info.si_code = SI_TKILL; 2272 info.si_code = SI_TKILL;
@@ -2311,8 +2275,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2311 2275
2312 read_lock(&tasklist_lock); 2276 read_lock(&tasklist_lock);
2313 p = find_task_by_pid(pid); 2277 p = find_task_by_pid(pid);
2314 error = -ESRCH; 2278 if (p && (tgid <= 0 || p->tgid == tgid)) {
2315 if (p && (p->tgid == tgid)) {
2316 error = check_kill_permission(sig, &info, p); 2279 error = check_kill_permission(sig, &info, p);
2317 /* 2280 /*
2318 * The null signal is a permissions and process existence 2281 * The null signal is a permissions and process existence
@@ -2326,47 +2289,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2326 } 2289 }
2327 } 2290 }
2328 read_unlock(&tasklist_lock); 2291 read_unlock(&tasklist_lock);
2292
2329 return error; 2293 return error;
2330} 2294}
2331 2295
2296/**
2297 * sys_tgkill - send signal to one specific thread
2298 * @tgid: the thread group ID of the thread
2299 * @pid: the PID of the thread
2300 * @sig: signal to be sent
2301 *
2302 * This syscall also checks the tgid and returns -ESRCH even if the PID
2303 * exists but it's not belonging to the target process anymore. This
2304 * method solves the problem of threads exiting and PIDs getting reused.
2305 */
2306asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2307{
2308 /* This is only valid for single tasks */
2309 if (pid <= 0 || tgid <= 0)
2310 return -EINVAL;
2311
2312 return do_tkill(tgid, pid, sig);
2313}
2314
2332/* 2315/*
2333 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2316 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2334 */ 2317 */
2335asmlinkage long 2318asmlinkage long
2336sys_tkill(int pid, int sig) 2319sys_tkill(int pid, int sig)
2337{ 2320{
2338 struct siginfo info;
2339 int error;
2340 struct task_struct *p;
2341
2342 /* This is only valid for single tasks */ 2321 /* This is only valid for single tasks */
2343 if (pid <= 0) 2322 if (pid <= 0)
2344 return -EINVAL; 2323 return -EINVAL;
2345 2324
2346 info.si_signo = sig; 2325 return do_tkill(0, pid, sig);
2347 info.si_errno = 0;
2348 info.si_code = SI_TKILL;
2349 info.si_pid = current->tgid;
2350 info.si_uid = current->uid;
2351
2352 read_lock(&tasklist_lock);
2353 p = find_task_by_pid(pid);
2354 error = -ESRCH;
2355 if (p) {
2356 error = check_kill_permission(sig, &info, p);
2357 /*
2358 * The null signal is a permissions and process existence
2359 * probe. No signal is actually delivered.
2360 */
2361 if (!error && sig && p->sighand) {
2362 spin_lock_irq(&p->sighand->siglock);
2363 handle_stop_signal(sig, p);
2364 error = specific_send_sig_info(sig, &info, p);
2365 spin_unlock_irq(&p->sighand->siglock);
2366 }
2367 }
2368 read_unlock(&tasklist_lock);
2369 return error;
2370} 2326}
2371 2327
2372asmlinkage long 2328asmlinkage long
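
The signal.c hunks above fold sys_tkill() and sys_tgkill() into a shared do_tkill(); tgkill's extra tgid check is what lets user space signal one specific thread without racing against PID reuse. As a small illustration of the call from user space (assuming a Linux system that provides SYS_tgkill; the program simply signals itself):

/* Minimal illustration of tgkill(2): deliver a signal to one specific
 * thread, identified by (thread-group id, thread id), rather than to the
 * whole process as kill(2) would. */
#define _GNU_SOURCE
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>

static void handler(int sig)
{
	(void)sig;
	write(STDOUT_FILENO, "got SIGUSR1\n", 12);   /* async-signal-safe */
}

int main(void)
{
	pid_t tgid = getpid();
	pid_t tid  = syscall(SYS_gettid);   /* raw syscall; older glibc has no wrapper */

	signal(SIGUSR1, handler);

	/* The kernel checks that tid still belongs to tgid, so a recycled
	 * tid in some other process cannot be hit by mistake. */
	if (syscall(SYS_tgkill, tgid, tid, SIGUSR1) != 0)
		perror("tgkill");
	return 0;
}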
diff --git a/kernel/time.c b/kernel/time.c
index 40c2410ac99a..245d595a13cb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc)
338 if (mtemp >= MINSEC) { 338 if (mtemp >= MINSEC) {
339 ltemp = (time_offset / mtemp) << (SHIFT_USEC - 339 ltemp = (time_offset / mtemp) << (SHIFT_USEC -
340 SHIFT_UPDATE); 340 SHIFT_UPDATE);
341 if (ltemp < 0) 341 time_freq += shift_right(ltemp, SHIFT_KH);
342 time_freq -= -ltemp >> SHIFT_KH;
343 else
344 time_freq += ltemp >> SHIFT_KH;
345 } else /* calibration interval too short (p. 12) */ 342 } else /* calibration interval too short (p. 12) */
346 result = TIME_ERROR; 343 result = TIME_ERROR;
347 } else { /* PLL mode */ 344 } else { /* PLL mode */
348 if (mtemp < MAXSEC) { 345 if (mtemp < MAXSEC) {
349 ltemp *= mtemp; 346 ltemp *= mtemp;
350 if (ltemp < 0) 347 time_freq += shift_right(ltemp,(time_constant +
351 time_freq -= -ltemp >> (time_constant +
352 time_constant +
353 SHIFT_KF - SHIFT_USEC);
354 else
355 time_freq += ltemp >> (time_constant +
356 time_constant + 348 time_constant +
357 SHIFT_KF - SHIFT_USEC); 349 SHIFT_KF - SHIFT_USEC));
358 } else /* calibration interval too long (p. 12) */ 350 } else /* calibration interval too long (p. 12) */
359 result = TIME_ERROR; 351 result = TIME_ERROR;
360 } 352 }
361 if (time_freq > time_tolerance) 353 time_freq = min(time_freq, time_tolerance);
362 time_freq = time_tolerance; 354 time_freq = max(time_freq, -time_tolerance);
363 else if (time_freq < -time_tolerance)
364 time_freq = -time_tolerance;
365 } /* STA_PLL || STA_PPSTIME */ 355 } /* STA_PLL || STA_PPSTIME */
366 } /* txc->modes & ADJ_OFFSET */ 356 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK) { 357 if (txc->modes & ADJ_TICK) {
@@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
384 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 374 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
385 txc->offset = save_adjust; 375 txc->offset = save_adjust;
386 else { 376 else {
387 if (time_offset < 0) 377 txc->offset = shift_right(time_offset, SHIFT_UPDATE);
388 txc->offset = -(-time_offset >> SHIFT_UPDATE);
389 else
390 txc->offset = time_offset >> SHIFT_UPDATE;
391 } 378 }
392 txc->freq = time_freq + pps_freq; 379 txc->freq = time_freq + pps_freq;
393 txc->maxerror = time_maxerror; 380 txc->maxerror = time_maxerror;
@@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv)
532 clock_was_set(); 519 clock_was_set();
533 return 0; 520 return 0;
534} 521}
522EXPORT_SYMBOL(do_settimeofday);
535 523
536void do_gettimeofday (struct timeval *tv) 524void do_gettimeofday (struct timeval *tv)
537{ 525{
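
The time.c hunks above, and the timer.c hunks that follow, replace sign-tested ">>" sequences with a shift_right() helper plus min()/max() clamps. shift_right() itself is introduced elsewhere in this series; the sketch below is one plausible stand-alone rendering of its intended behaviour, an arithmetic right shift that rounds toward zero for negative values, exactly like the two-branch code it replaces (the macros here are illustrative, evaluate their arguments more than once, and are not the kernel definitions).

/* Stand-alone rendering of the shift_right() idiom used above: shift the
 * magnitude and restore the sign, matching the open-coded
 * "if (x < 0) a -= -x >> s; else a += x >> s;" pattern being replaced. */
#include <stdio.h>

#define shift_right(x, s) ((x) < 0 ? -(-(x) >> (s)) : (x) >> (s))
#define clamp_long(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

int main(void)
{
	long time_offset = -1001;
	long time_freq = 0;
	long tolerance = 500;

	/* Equivalent of: if (ltemp < 0) time_freq -= -ltemp >> 3;
	 *                else           time_freq += ltemp >> 3;       */
	time_freq += shift_right(time_offset, 3);   /* -125; plain >> would give -126 */

	/* Equivalent of the two-branch clamp the patch rewrites as min()/max(). */
	time_freq = clamp_long(time_freq, -tolerance, tolerance);

	printf("time_freq = %ld\n", time_freq);
	return 0;
}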
diff --git a/kernel/timer.c b/kernel/timer.c
index 3ba10fa35b60..fd74268d8663 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec);
46#define time_interpolator_update(x) 46#define time_interpolator_update(x)
47#endif 47#endif
48 48
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50
51EXPORT_SYMBOL(jiffies_64);
52
49/* 53/*
50 * per-CPU timer vector definitions: 54 * per-CPU timer vector definitions:
51 */ 55 */
@@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base,
91#endif 95#endif
92} 96}
93 97
94static void check_timer_failed(struct timer_list *timer)
95{
96 static int whine_count;
97 if (whine_count < 16) {
98 whine_count++;
99 printk("Uninitialised timer!\n");
100 printk("This is just a warning. Your computer is OK\n");
101 printk("function=0x%p, data=0x%lx\n",
102 timer->function, timer->data);
103 dump_stack();
104 }
105 /*
106 * Now fix it up
107 */
108 timer->magic = TIMER_MAGIC;
109}
110
111static inline void check_timer(struct timer_list *timer)
112{
113 if (timer->magic != TIMER_MAGIC)
114 check_timer_failed(timer);
115}
116
117
118static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 98static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
119{ 99{
120 unsigned long expires = timer->expires; 100 unsigned long expires = timer->expires;
@@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer)
177{ 157{
178 timer->entry.next = NULL; 158 timer->entry.next = NULL;
179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 159 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
180 timer->magic = TIMER_MAGIC;
181} 160}
182EXPORT_SYMBOL(init_timer); 161EXPORT_SYMBOL(init_timer);
183 162
@@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
230 int ret = 0; 209 int ret = 0;
231 210
232 BUG_ON(!timer->function); 211 BUG_ON(!timer->function);
233 check_timer(timer);
234 212
235 base = lock_timer_base(timer, &flags); 213 base = lock_timer_base(timer, &flags);
236 214
@@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
283 unsigned long flags; 261 unsigned long flags;
284 262
285 BUG_ON(timer_pending(timer) || !timer->function); 263 BUG_ON(timer_pending(timer) || !timer->function);
286
287 check_timer(timer);
288
289 spin_lock_irqsave(&base->t_base.lock, flags); 264 spin_lock_irqsave(&base->t_base.lock, flags);
290 timer->base = &base->t_base; 265 timer->base = &base->t_base;
291 internal_add_timer(base, timer); 266 internal_add_timer(base, timer);
@@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
316{ 291{
317 BUG_ON(!timer->function); 292 BUG_ON(!timer->function);
318 293
319 check_timer(timer);
320
321 /* 294 /*
322 * This is a common optimization triggered by the 295 * This is a common optimization triggered by the
323 * networking code - if the timer is re-modified 296 * networking code - if the timer is re-modified
@@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer)
348 unsigned long flags; 321 unsigned long flags;
349 int ret = 0; 322 int ret = 0;
350 323
351 check_timer(timer);
352
353 if (timer_pending(timer)) { 324 if (timer_pending(timer)) {
354 base = lock_timer_base(timer, &flags); 325 base = lock_timer_base(timer, &flags);
355 if (timer_pending(timer)) { 326 if (timer_pending(timer)) {
@@ -412,8 +383,6 @@ out:
412 */ 383 */
413int del_timer_sync(struct timer_list *timer) 384int del_timer_sync(struct timer_list *timer)
414{ 385{
415 check_timer(timer);
416
417 for (;;) { 386 for (;;) {
418 int ret = try_to_del_timer_sync(timer); 387 int ret = try_to_del_timer_sync(timer);
419 if (ret >= 0) 388 if (ret >= 0)
@@ -632,134 +601,118 @@ long time_next_adjust;
632 */ 601 */
633static void second_overflow(void) 602static void second_overflow(void)
634{ 603{
635 long ltemp; 604 long ltemp;
636 605
637 /* Bump the maxerror field */ 606 /* Bump the maxerror field */
638 time_maxerror += time_tolerance >> SHIFT_USEC; 607 time_maxerror += time_tolerance >> SHIFT_USEC;
639 if ( time_maxerror > NTP_PHASE_LIMIT ) { 608 if (time_maxerror > NTP_PHASE_LIMIT) {
640 time_maxerror = NTP_PHASE_LIMIT; 609 time_maxerror = NTP_PHASE_LIMIT;
641 time_status |= STA_UNSYNC; 610 time_status |= STA_UNSYNC;
642 }
643
644 /*
645 * Leap second processing. If in leap-insert state at
646 * the end of the day, the system clock is set back one
647 * second; if in leap-delete state, the system clock is
648 * set ahead one second. The microtime() routine or
649 * external clock driver will insure that reported time
650 * is always monotonic. The ugly divides should be
651 * replaced.
652 */
653 switch (time_state) {
654
655 case TIME_OK:
656 if (time_status & STA_INS)
657 time_state = TIME_INS;
658 else if (time_status & STA_DEL)
659 time_state = TIME_DEL;
660 break;
661
662 case TIME_INS:
663 if (xtime.tv_sec % 86400 == 0) {
664 xtime.tv_sec--;
665 wall_to_monotonic.tv_sec++;
666 /* The timer interpolator will make time change gradually instead
667 * of an immediate jump by one second.
668 */
669 time_interpolator_update(-NSEC_PER_SEC);
670 time_state = TIME_OOP;
671 clock_was_set();
672 printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
673 } 611 }
674 break; 612
675 613 /*
676 case TIME_DEL: 614 * Leap second processing. If in leap-insert state at the end of the
677 if ((xtime.tv_sec + 1) % 86400 == 0) { 615 * day, the system clock is set back one second; if in leap-delete
678 xtime.tv_sec++; 616 * state, the system clock is set ahead one second. The microtime()
679 wall_to_monotonic.tv_sec--; 617 * routine or external clock driver will insure that reported time is
680 /* Use of time interpolator for a gradual change of time */ 618 * always monotonic. The ugly divides should be replaced.
681 time_interpolator_update(NSEC_PER_SEC); 619 */
682 time_state = TIME_WAIT; 620 switch (time_state) {
683 clock_was_set(); 621 case TIME_OK:
684 printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); 622 if (time_status & STA_INS)
623 time_state = TIME_INS;
624 else if (time_status & STA_DEL)
625 time_state = TIME_DEL;
626 break;
627 case TIME_INS:
628 if (xtime.tv_sec % 86400 == 0) {
629 xtime.tv_sec--;
630 wall_to_monotonic.tv_sec++;
631 /*
632 * The timer interpolator will make time change
633 * gradually instead of an immediate jump by one second
634 */
635 time_interpolator_update(-NSEC_PER_SEC);
636 time_state = TIME_OOP;
637 clock_was_set();
638 printk(KERN_NOTICE "Clock: inserting leap second "
639 "23:59:60 UTC\n");
640 }
641 break;
642 case TIME_DEL:
643 if ((xtime.tv_sec + 1) % 86400 == 0) {
644 xtime.tv_sec++;
645 wall_to_monotonic.tv_sec--;
646 /*
647 * Use of time interpolator for a gradual change of
648 * time
649 */
650 time_interpolator_update(NSEC_PER_SEC);
651 time_state = TIME_WAIT;
652 clock_was_set();
653 printk(KERN_NOTICE "Clock: deleting leap second "
654 "23:59:59 UTC\n");
655 }
656 break;
657 case TIME_OOP:
658 time_state = TIME_WAIT;
659 break;
660 case TIME_WAIT:
661 if (!(time_status & (STA_INS | STA_DEL)))
662 time_state = TIME_OK;
685 } 663 }
686 break; 664
687 665 /*
688 case TIME_OOP: 666 * Compute the phase adjustment for the next second. In PLL mode, the
689 time_state = TIME_WAIT; 667 * offset is reduced by a fixed factor times the time constant. In FLL
690 break; 668 * mode the offset is used directly. In either mode, the maximum phase
691 669 * adjustment for each second is clamped so as to spread the adjustment
692 case TIME_WAIT: 670 * over not more than the number of seconds between updates.
693 if (!(time_status & (STA_INS | STA_DEL))) 671 */
694 time_state = TIME_OK;
695 }
696
697 /*
698 * Compute the phase adjustment for the next second. In
699 * PLL mode, the offset is reduced by a fixed factor
700 * times the time constant. In FLL mode the offset is
701 * used directly. In either mode, the maximum phase
702 * adjustment for each second is clamped so as to spread
703 * the adjustment over not more than the number of
704 * seconds between updates.
705 */
706 if (time_offset < 0) {
707 ltemp = -time_offset;
708 if (!(time_status & STA_FLL))
709 ltemp >>= SHIFT_KG + time_constant;
710 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
711 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
712 time_offset += ltemp;
713 time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
714 } else {
715 ltemp = time_offset; 672 ltemp = time_offset;
716 if (!(time_status & STA_FLL)) 673 if (!(time_status & STA_FLL))
717 ltemp >>= SHIFT_KG + time_constant; 674 ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
718 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) 675 ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
719 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; 676 ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
720 time_offset -= ltemp; 677 time_offset -= ltemp;
721 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 678 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
722 } 679
723 680 /*
724 /* 681 * Compute the frequency estimate and additional phase adjustment due
725 * Compute the frequency estimate and additional phase 682 * to frequency error for the next second. When the PPS signal is
726 * adjustment due to frequency error for the next 683 * engaged, gnaw on the watchdog counter and update the frequency
727 * second. When the PPS signal is engaged, gnaw on the 684 * computed by the pll and the PPS signal.
728 * watchdog counter and update the frequency computed by 685 */
729 * the pll and the PPS signal. 686 pps_valid++;
730 */ 687 if (pps_valid == PPS_VALID) { /* PPS signal lost */
731 pps_valid++; 688 pps_jitter = MAXTIME;
732 if (pps_valid == PPS_VALID) { /* PPS signal lost */ 689 pps_stabil = MAXFREQ;
733 pps_jitter = MAXTIME; 690 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
734 pps_stabil = MAXFREQ; 691 STA_PPSWANDER | STA_PPSERROR);
735 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | 692 }
736 STA_PPSWANDER | STA_PPSERROR); 693 ltemp = time_freq + pps_freq;
737 } 694 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
738 ltemp = time_freq + pps_freq;
739 if (ltemp < 0)
740 time_adj -= -ltemp >>
741 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
742 else
743 time_adj += ltemp >>
744 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
745 695
746#if HZ == 100 696#if HZ == 100
747 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). 697 /*
748 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) 698 * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
749 */ 699 * get 128.125; => only 0.125% error (p. 14)
750 if (time_adj < 0) 700 */
751 time_adj -= (-time_adj >> 2) + (-time_adj >> 5); 701 time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
752 else 702#endif
753 time_adj += (time_adj >> 2) + (time_adj >> 5); 703#if HZ == 250
704 /*
705 * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
706 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
707 */
708 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
754#endif 709#endif
755#if HZ == 1000 710#if HZ == 1000
756 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). 711 /*
757 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) 712 * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
758 */ 713 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
759 if (time_adj < 0) 714 */
760 time_adj -= (-time_adj >> 6) + (-time_adj >> 7); 715 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
761 else
762 time_adj += (time_adj >> 6) + (time_adj >> 7);
763#endif 716#endif
764} 717}
765 718
@@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void)
768{ 721{
769 long time_adjust_step, delta_nsec; 722 long time_adjust_step, delta_nsec;
770 723
771 if ( (time_adjust_step = time_adjust) != 0 ) { 724 if ((time_adjust_step = time_adjust) != 0 ) {
772 /* We are doing an adjtime thing. 725 /*
773 * 726 * We are doing an adjtime thing. Prepare time_adjust_step to
774 * Prepare time_adjust_step to be within bounds. 727 * be within bounds. Note that a positive time_adjust means we
775 * Note that a positive time_adjust means we want the clock 728 * want the clock to run faster.
776 * to run faster. 729 *
777 * 730 * Limit the amount of the step to be in the range
778 * Limit the amount of the step to be in the range 731 * -tickadj .. +tickadj
779 * -tickadj .. +tickadj 732 */
780 */ 733 time_adjust_step = min(time_adjust_step, (long)tickadj);
781 if (time_adjust > tickadj) 734 time_adjust_step = max(time_adjust_step, (long)-tickadj);
782 time_adjust_step = tickadj; 735
783 else if (time_adjust < -tickadj) 736 /* Reduce by this step the amount of time left */
784 time_adjust_step = -tickadj; 737 time_adjust -= time_adjust_step;
785
786 /* Reduce by this step the amount of time left */
787 time_adjust -= time_adjust_step;
788 } 738 }
789 delta_nsec = tick_nsec + time_adjust_step * 1000; 739 delta_nsec = tick_nsec + time_adjust_step * 1000;
790 /* 740 /*
@@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void)
792 * advance the tick more. 742 * advance the tick more.
793 */ 743 */
794 time_phase += time_adj; 744 time_phase += time_adj;
795 if (time_phase <= -FINENSEC) { 745 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
796 long ltemp = -time_phase >> (SHIFT_SCALE - 10); 746 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
797 time_phase += ltemp << (SHIFT_SCALE - 10);
798 delta_nsec -= ltemp;
799 }
800 else if (time_phase >= FINENSEC) {
801 long ltemp = time_phase >> (SHIFT_SCALE - 10);
802 time_phase -= ltemp << (SHIFT_SCALE - 10); 747 time_phase -= ltemp << (SHIFT_SCALE - 10);
803 delta_nsec += ltemp; 748 delta_nsec += ltemp;
804 } 749 }
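Two related simplifications appear in the hunks above: time_adjust_step is clamped to the range -tickadj .. +tickadj with a min()/max() pair instead of an if/else-if chain, and the two FINENSEC phase branches merge into one because shift_right() already handles a negative time_phase. A stand-alone sketch of the clamp, with min()/max() stubbed in and a made-up tickadj value:

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	long tickadj = 500;			/* hypothetical bound, usecs per tick */
	long requests[] = { 100, 25000, -25000 };
	int i;

	for (i = 0; i < 3; i++) {
		long step = requests[i];
		/* same effect as: if (step > tickadj) step = tickadj;
		 *                 else if (step < -tickadj) step = -tickadj; */
		step = min(step, tickadj);
		step = max(step, -tickadj);
		printf("adjtime request %6ld -> step %5ld\n", requests[i], step);
	}
	return 0;
}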
@@ -1128,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1128 if (timeout < 0) 1073 if (timeout < 0)
1129 { 1074 {
1130 printk(KERN_ERR "schedule_timeout: wrong timeout " 1075 printk(KERN_ERR "schedule_timeout: wrong timeout "
1131 "value %lx from %p\n", timeout, 1076 "value %lx from %p\n", timeout,
1132 __builtin_return_address(0)); 1077 __builtin_return_address(0));
1133 current->state = TASK_RUNNING; 1078 current->state = TASK_RUNNING;
1134 goto out; 1079 goto out;
1135 } 1080 }
@@ -1137,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1137 1082
1138 expire = timeout + jiffies; 1083 expire = timeout + jiffies;
1139 1084
1140 init_timer(&timer); 1085 setup_timer(&timer, process_timeout, (unsigned long)current);
1141 timer.expires = expire; 1086 __mod_timer(&timer, expire);
1142 timer.data = (unsigned long) current;
1143 timer.function = process_timeout;
1144
1145 add_timer(&timer);
1146 schedule(); 1087 schedule();
1147 del_singleshot_timer_sync(&timer); 1088 del_singleshot_timer_sync(&timer);
1148 1089
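The schedule_timeout() hunk swaps the open-coded init_timer()/add_timer() sequence for setup_timer() plus __mod_timer(); both forms are visible in the diff, so the change is purely a repackaging. Roughly, in the caller's terms (a sketch, not the header's literal definition):

	struct timer_list timer;

	/* new form, as used above */
	setup_timer(&timer, process_timeout, (unsigned long)current);
	__mod_timer(&timer, expire);

	/*
	 * ...which stands in for the removed sequence:
	 *
	 *	init_timer(&timer);
	 *	timer.function = process_timeout;
	 *	timer.data = (unsigned long)current;
	 *	timer.expires = expire;
	 *	add_timer(&timer);
	 */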
@@ -1159,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout);
1159 */ 1100 */
1160signed long __sched schedule_timeout_interruptible(signed long timeout) 1101signed long __sched schedule_timeout_interruptible(signed long timeout)
1161{ 1102{
1162 __set_current_state(TASK_INTERRUPTIBLE); 1103 __set_current_state(TASK_INTERRUPTIBLE);
1163 return schedule_timeout(timeout); 1104 return schedule_timeout(timeout);
1164} 1105}
1165EXPORT_SYMBOL(schedule_timeout_interruptible); 1106EXPORT_SYMBOL(schedule_timeout_interruptible);
1166 1107
1167signed long __sched schedule_timeout_uninterruptible(signed long timeout) 1108signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1168{ 1109{
1169 __set_current_state(TASK_UNINTERRUPTIBLE); 1110 __set_current_state(TASK_UNINTERRUPTIBLE);
1170 return schedule_timeout(timeout); 1111 return schedule_timeout(timeout);
1171} 1112}
1172EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1113EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1173 1114
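The two wrappers above only fold the __set_current_state() call into the helper, so callers that sleep for a fixed interval shrink to a single line. A hedged usage sketch (the HZ / 10 delay is an arbitrary example):

	/*
	 * before:
	 *	__set_current_state(TASK_INTERRUPTIBLE);
	 *	schedule_timeout(HZ / 10);
	 */

	/* after: one call sets the state and sleeps */
	schedule_timeout_interruptible(HZ / 10);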
@@ -1507,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec)
1507 if (!time_interpolator) 1448 if (!time_interpolator)
1508 return; 1449 return;
1509 1450
1510 /* The interpolator compensates for late ticks by accumulating 1451 /*
1511 * the late time in time_interpolator->offset. A tick earlier than 1452 * The interpolator compensates for late ticks by accumulating the late
1512 * expected will lead to a reset of the offset and a corresponding 1453 * time in time_interpolator->offset. A tick earlier than expected will
1513 * jump of the clock forward. Again this only works if the 1454 * lead to a reset of the offset and a corresponding jump of the clock
1514 * interpolator clock is running slightly slower than the regular clock 1455 * forward. Again this only works if the interpolator clock is running
1515 * and the tuning logic insures that. 1456 * slightly slower than the regular clock and the tuning logic insures
1516 */ 1457 * that.
1458 */
1517 1459
1518 counter = time_interpolator_get_counter(1); 1460 counter = time_interpolator_get_counter(1);
1519 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1461 offset = time_interpolator->offset +
1462 GET_TI_NSECS(counter, time_interpolator);
1520 1463
1521 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1464 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1522 time_interpolator->offset = offset - delta_nsec; 1465 time_interpolator->offset = offset - delta_nsec;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 91bacb13a7e2..7cee222231bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -12,6 +12,8 @@
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton <andrewm@uow.edu.au>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 *
16 * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
15 */ 17 */
16 18
17#include <linux/module.h> 19#include <linux/module.h>
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct {
57 * per-CPU workqueues: 59 * per-CPU workqueues:
58 */ 60 */
59struct workqueue_struct { 61struct workqueue_struct {
60 struct cpu_workqueue_struct cpu_wq[NR_CPUS]; 62 struct cpu_workqueue_struct *cpu_wq;
61 const char *name; 63 const char *name;
62 struct list_head list; /* Empty if single thread */ 64 struct list_head list; /* Empty if single thread */
63}; 65};
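This is the central change in workqueue.c: the NR_CPUS-sized embedded array becomes a pointer that __create_workqueue() fills with alloc_percpu() and every user dereferences through per_cpu_ptr(), so storage is sized to the CPUs that can actually exist. A sketch of the general pattern with a hypothetical payload struct (the workqueue-specific fields are left out):

#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* hypothetical per-CPU payload, standing in for cpu_workqueue_struct */
struct pcpu_payload {
	spinlock_t lock;
	unsigned long queued;
};

static struct pcpu_payload *payload;

static int payload_init(void)
{
	int cpu;

	payload = alloc_percpu(struct pcpu_payload);	/* was: payload[NR_CPUS] */
	if (!payload)
		return -ENOMEM;
	/* online CPUs only here; a hotplug callback covers the rest, as this file does */
	for_each_online_cpu(cpu)
		spin_lock_init(&per_cpu_ptr(payload, cpu)->lock);
	return 0;
}

static void payload_exit(void)
{
	free_percpu(payload);	/* must pair with alloc_percpu() */
}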
@@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
102 if (unlikely(is_single_threaded(wq))) 104 if (unlikely(is_single_threaded(wq)))
103 cpu = 0; 105 cpu = 0;
104 BUG_ON(!list_empty(&work->entry)); 106 BUG_ON(!list_empty(&work->entry));
105 __queue_work(wq->cpu_wq + cpu, work); 107 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
106 ret = 1; 108 ret = 1;
107 } 109 }
108 put_cpu(); 110 put_cpu();
@@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data)
118 if (unlikely(is_single_threaded(wq))) 120 if (unlikely(is_single_threaded(wq)))
119 cpu = 0; 121 cpu = 0;
120 122
121 __queue_work(wq->cpu_wq + cpu, work); 123 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
122} 124}
123 125
124int fastcall queue_delayed_work(struct workqueue_struct *wq, 126int fastcall queue_delayed_work(struct workqueue_struct *wq,
@@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
265 267
266 if (is_single_threaded(wq)) { 268 if (is_single_threaded(wq)) {
267 /* Always use cpu 0's area. */ 269 /* Always use cpu 0's area. */
268 flush_cpu_workqueue(wq->cpu_wq + 0); 270 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0));
269 } else { 271 } else {
270 int cpu; 272 int cpu;
271 273
272 lock_cpu_hotplug(); 274 lock_cpu_hotplug();
273 for_each_online_cpu(cpu) 275 for_each_online_cpu(cpu)
274 flush_cpu_workqueue(wq->cpu_wq + cpu); 276 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
275 unlock_cpu_hotplug(); 277 unlock_cpu_hotplug();
276 } 278 }
277} 279}
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
279static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 281static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
280 int cpu) 282 int cpu)
281{ 283{
282 struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; 284 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
283 struct task_struct *p; 285 struct task_struct *p;
284 286
285 spin_lock_init(&cwq->lock); 287 spin_lock_init(&cwq->lock);
@@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
312 if (!wq) 314 if (!wq)
313 return NULL; 315 return NULL;
314 316
317 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
315 wq->name = name; 318 wq->name = name;
316 /* We don't need the distraction of CPUs appearing and vanishing. */ 319 /* We don't need the distraction of CPUs appearing and vanishing. */
317 lock_cpu_hotplug(); 320 lock_cpu_hotplug();
@@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
353 unsigned long flags; 356 unsigned long flags;
354 struct task_struct *p; 357 struct task_struct *p;
355 358
356 cwq = wq->cpu_wq + cpu; 359 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
357 spin_lock_irqsave(&cwq->lock, flags); 360 spin_lock_irqsave(&cwq->lock, flags);
358 p = cwq->thread; 361 p = cwq->thread;
359 cwq->thread = NULL; 362 cwq->thread = NULL;
@@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
380 spin_unlock(&workqueue_lock); 383 spin_unlock(&workqueue_lock);
381 } 384 }
382 unlock_cpu_hotplug(); 385 unlock_cpu_hotplug();
386 free_percpu(wq->cpu_wq);
383 kfree(wq); 387 kfree(wq);
384} 388}
385 389
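With the per-CPU area allocated in __create_workqueue() and released in destroy_workqueue(), nothing changes for callers; creation and destruction still bracket the queue's lifetime. A hedged usage sketch, with a placeholder handler and queue name (DECLARE_WORK here uses the three-argument form of this kernel generation):

#include <linux/workqueue.h>
#include <linux/errno.h>

/* placeholder handler and work item, for illustration only */
static void example_fn(void *data)
{
	/* work body would go here */
}
static DECLARE_WORK(example_work, example_fn, NULL);

static int example_init(void)
{
	struct workqueue_struct *wq;

	wq = create_workqueue("example");	/* per-CPU areas allocated inside */
	if (!wq)
		return -ENOMEM;
	queue_work(wq, &example_work);
	flush_workqueue(wq);
	destroy_workqueue(wq);			/* free_percpu() now happens in here */
	return 0;
}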
@@ -458,7 +462,7 @@ int current_is_keventd(void)
458 462
459 BUG_ON(!keventd_wq); 463 BUG_ON(!keventd_wq);
460 464
461 cwq = keventd_wq->cpu_wq + cpu; 465 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
462 if (current == cwq->thread) 466 if (current == cwq->thread)
463 ret = 1; 467 ret = 1;
464 468
@@ -470,7 +474,7 @@ int current_is_keventd(void)
470/* Take the work from this (downed) CPU. */ 474/* Take the work from this (downed) CPU. */
471static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 475static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
472{ 476{
473 struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; 477 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
474 LIST_HEAD(list); 478 LIST_HEAD(list);
475 struct work_struct *work; 479 struct work_struct *work;
476 480
@@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
481 printk("Taking work for %s\n", wq->name); 485 printk("Taking work for %s\n", wq->name);
482 work = list_entry(list.next,struct work_struct,entry); 486 work = list_entry(list.next,struct work_struct,entry);
483 list_del(&work->entry); 487 list_del(&work->entry);
484 __queue_work(wq->cpu_wq + smp_processor_id(), work); 488 __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
485 } 489 }
486 spin_unlock_irq(&cwq->lock); 490 spin_unlock_irq(&cwq->lock);
487} 491}
@@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
508 case CPU_ONLINE: 512 case CPU_ONLINE:
509 /* Kick off worker threads. */ 513 /* Kick off worker threads. */
510 list_for_each_entry(wq, &workqueues, list) { 514 list_for_each_entry(wq, &workqueues, list) {
511 kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); 515 struct cpu_workqueue_struct *cwq;
512 wake_up_process(wq->cpu_wq[hotcpu].thread); 516
517 cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
518 kthread_bind(cwq->thread, hotcpu);
519 wake_up_process(cwq->thread);
513 } 520 }
514 break; 521 break;
515 522
516 case CPU_UP_CANCELED: 523 case CPU_UP_CANCELED:
517 list_for_each_entry(wq, &workqueues, list) { 524 list_for_each_entry(wq, &workqueues, list) {
518 /* Unbind so it can run. */ 525 /* Unbind so it can run. */
519 kthread_bind(wq->cpu_wq[hotcpu].thread, 526 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
520 smp_processor_id()); 527 smp_processor_id());
521 cleanup_workqueue_thread(wq, hotcpu); 528 cleanup_workqueue_thread(wq, hotcpu);
522 } 529 }