path: root/kernel
Diffstat (limited to 'kernel')

 -rw-r--r--  kernel/Makefile          3
 -rw-r--r--  kernel/acct.c           12
 -rw-r--r--  kernel/audit.c           2
 -rw-r--r--  kernel/compat.c         82
 -rw-r--r--  kernel/cpu.c            29
 -rw-r--r--  kernel/cpuset.c         69
 -rw-r--r--  kernel/exit.c          144
 -rw-r--r--  kernel/fork.c          159
 -rw-r--r--  kernel/futex.c         174
 -rw-r--r--  kernel/futex_compat.c  144
 -rw-r--r--  kernel/hrtimer.c       194
 -rw-r--r--  kernel/irq/manage.c      1
 -rw-r--r--  kernel/itimer.c         14
 -rw-r--r--  kernel/kmod.c            2
 -rw-r--r--  kernel/kprobes.c        10
 -rw-r--r--  kernel/module.c         23
 -rw-r--r--  kernel/panic.c           4
 -rw-r--r--  kernel/params.c          2
 -rw-r--r--  kernel/pid.c           250
 -rw-r--r--  kernel/posix-timers.c   67
 -rw-r--r--  kernel/power/process.c   3
 -rw-r--r--  kernel/power/swap.c      7
 -rw-r--r--  kernel/profile.c        53
 -rw-r--r--  kernel/ptrace.c          8
 -rw-r--r--  kernel/rcutorture.c      4
 -rw-r--r--  kernel/sched.c         247
 -rw-r--r--  kernel/signal.c        345
 -rw-r--r--  kernel/softlockup.c      2
 -rw-r--r--  kernel/sys.c           419
 -rw-r--r--  kernel/sys_ni.c          4
 -rw-r--r--  kernel/time.c            4
 -rw-r--r--  kernel/timer.c          92

32 files changed, 1558 insertions, 1015 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ff1c11dc12cf..58908f9d156a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51ef..b327f4d20104 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* calculate run_time in nsec*/
 	do_posix_clock_monotonic_gettime(&uptime);
 	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
-	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
-		       + current->start_time.tv_nsec;
+	run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+		       + current->group_leader->start_time.tv_nsec;
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+	jiffies = cputime_to_jiffies(cputime_add(current->utime,
 						 current->signal->utime));
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+	jiffies = cputime_to_jiffies(cputime_add(current->stime,
 						 current->signal->stime));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->group_leader->min_flt);
+				     current->min_flt);
 	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->group_leader->maj_flt);
+				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
 	ac.ac_exitcode = exitcode;
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b61..c8ccbd09048f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -578,7 +578,7 @@ static int __init audit_enable(char *str)
 	       audit_initialized ? "" : " (after initialization)");
 	if (audit_initialized)
 		audit_enabled = audit_default;
-	return 0;
+	return 1;
 }
 
 __setup("audit=", audit_enable);
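
The audit.c hunk above only flips the return value of the "audit=" boot-parameter handler from 0 to 1. By convention a __setup() handler returns 1 once it has consumed the option (returning 0 tells the early boot code the string was not handled, so it gets passed on to init). A minimal sketch of that convention follows; the "myfeature" parameter and its variable are made up for illustration and are not part of this patch:

#include <linux/init.h>
#include <linux/kernel.h>

static int myfeature_enabled;

/* Handler for a hypothetical "myfeature=" boot option. */
static int __init myfeature_setup(char *str)
{
	myfeature_enabled = simple_strtol(str, NULL, 0);
	return 1;	/* option consumed -- do not pass it on to init */
}
__setup("myfeature=", myfeature_setup);
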
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b6785..c1601a84f8d8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,10 +17,10 @@
 #include <linux/time.h>
 #include <linux/signal.h>
 #include <linux/sched.h>	/* for MAX_SCHEDULE_TIMEOUT */
-#include <linux/futex.h>	/* for FUTEX_WAIT */
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/security.h>
+#include <linux/timex.h>
 
 #include <asm/uaccess.h>
 
@@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
 	return ret;
 }
 
-#ifdef CONFIG_FUTEX
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
-		struct compat_timespec __user *utime, u32 __user *uaddr2,
-		int val3)
-{
-	struct timespec t;
-	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-	int val2 = 0;
-
-	if ((op == FUTEX_WAIT) && utime) {
-		if (get_compat_timespec(&t, utime))
-			return -EFAULT;
-		timeout = timespec_to_jiffies(&t) + 1;
-	}
-	if (op >= FUTEX_REQUEUE)
-		val2 = (int) (unsigned long) utime;
-
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
-}
-#endif
-
 asmlinkage long compat_sys_setrlimit(unsigned int resource,
 		struct compat_rlimit __user *rlim)
 {
@@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 	return -ERESTARTNOHAND;
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
+
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+{
+	struct timex txc;
+	int ret;
+
+	memset(&txc, 0, sizeof(struct timex));
+
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+			__get_user(txc.modes, &utp->modes) ||
+			__get_user(txc.offset, &utp->offset) ||
+			__get_user(txc.freq, &utp->freq) ||
+			__get_user(txc.maxerror, &utp->maxerror) ||
+			__get_user(txc.esterror, &utp->esterror) ||
+			__get_user(txc.status, &utp->status) ||
+			__get_user(txc.constant, &utp->constant) ||
+			__get_user(txc.precision, &utp->precision) ||
+			__get_user(txc.tolerance, &utp->tolerance) ||
+			__get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+			__get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+			__get_user(txc.tick, &utp->tick) ||
+			__get_user(txc.ppsfreq, &utp->ppsfreq) ||
+			__get_user(txc.jitter, &utp->jitter) ||
+			__get_user(txc.shift, &utp->shift) ||
+			__get_user(txc.stabil, &utp->stabil) ||
+			__get_user(txc.jitcnt, &utp->jitcnt) ||
+			__get_user(txc.calcnt, &utp->calcnt) ||
+			__get_user(txc.errcnt, &utp->errcnt) ||
+			__get_user(txc.stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	ret = do_adjtimex(&txc);
+
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+			__put_user(txc.modes, &utp->modes) ||
+			__put_user(txc.offset, &utp->offset) ||
+			__put_user(txc.freq, &utp->freq) ||
+			__put_user(txc.maxerror, &utp->maxerror) ||
+			__put_user(txc.esterror, &utp->esterror) ||
+			__put_user(txc.status, &utp->status) ||
+			__put_user(txc.constant, &utp->constant) ||
+			__put_user(txc.precision, &utp->precision) ||
+			__put_user(txc.tolerance, &utp->tolerance) ||
+			__put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+			__put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+			__put_user(txc.tick, &utp->tick) ||
+			__put_user(txc.ppsfreq, &utp->ppsfreq) ||
+			__put_user(txc.jitter, &utp->jitter) ||
+			__put_user(txc.shift, &utp->shift) ||
+			__put_user(txc.stabil, &utp->stabil) ||
+			__put_user(txc.jitcnt, &utp->jitcnt) ||
+			__put_user(txc.calcnt, &utp->calcnt) ||
+			__put_user(txc.errcnt, &utp->errcnt) ||
+			__put_user(txc.stbcnt, &utp->stbcnt))
+		ret = -EFAULT;
+
+	return ret;
+}
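
compat_sys_adjtimex() above copies a 32-bit struct compat_timex into the native struct timex field by field, runs do_adjtimex(), and copies the results back, so a 32-bit binary on a 64-bit kernel sees the usual adjtimex(2) behaviour. For reference, this is roughly what the call looks like from userspace through the ordinary glibc wrapper; the snippet is illustrative only and performs a pure read (modes = 0), so it changes nothing:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* no adjustment requested */
	int state = adjtimex(&tx);		/* returns TIME_OK, TIME_ERROR, ... */

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("clock state %d, freq offset %ld, max error %ld us\n",
	       state, tx.freq, tx.maxerror);
	return 0;
}
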
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8be22bd80933..fe2b8d0bfe4c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DECLARE_MUTEX(cpucontrol);
 
-static struct notifier_block *cpu_chain;
+static BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
-	int ret;
-
-	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
-		return ret;
-	ret = notifier_chain_register(&cpu_chain, nb);
-	unlock_cpu_hotplug();
-	return ret;
+	return blocking_notifier_chain_register(&cpu_chain, nb);
 }
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-	lock_cpu_hotplug();
-	notifier_chain_unregister(&cpu_chain, nb);
-	unlock_cpu_hotplug();
+	blocking_notifier_chain_unregister(&cpu_chain, nb);
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
 	if (IS_ERR(p)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+		if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
 				(void *)(long)cpu) == NOTIFY_BAD)
 			BUG();
 
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
 	put_cpu();
 
 	/* CPU is completely dead: tell everyone. Too late to complain. */
-	if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
-	    == NOTIFY_BAD)
+	if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+			(void *)(long)cpu) == NOTIFY_BAD)
 		BUG();
 
 	check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
 		goto out;
 	}
 
-	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
 				__FUNCTION__, cpu);
@@ -226,11 +218,12 @@ int __devinit cpu_up(unsigned int cpu)
 	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
-	notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+	blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
 
 out_notify:
 	if (ret != 0)
-		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+		blocking_notifier_call_chain(&cpu_chain,
+				CPU_UP_CANCELED, hcpu);
 out:
 	unlock_cpu_hotplug();
 	return ret;
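
With cpu_chain turned into a BLOCKING_NOTIFIER_HEAD, the chain does its own locking, which is why register_cpu_notifier()/unregister_cpu_notifier() no longer take the hotplug lock themselves. Callers of the API are unchanged; the sketch below shows the usual shape of a CPU-hotplug notifier of this era (the my_cpu_* names are made up for illustration, not part of this patch):

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

/* Hypothetical client: log CPUs coming and going. */
static int my_cpu_callback(struct notifier_block *nfb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		printk(KERN_INFO "cpu %u is now online\n", cpu);
		break;
	case CPU_DEAD:
		printk(KERN_INFO "cpu %u has gone down\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_notifier = {
	.notifier_call = my_cpu_callback,
};

/* From some init path:  register_cpu_notifier(&my_cpu_notifier); */
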
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd1284..72248d1b9e3f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
616 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
617 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
618 * 618 *
619 * Call without callback_mutex or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
620 * with or without manage_mutex held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
621 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
622 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
623 * is here, it is not simultaneously in the exit code NULL'ing its
624 * cpuset pointer. This routine also might acquire callback_mutex and
625 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
626 * 624 *
627 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
836} 834}
837 835
838/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarilly set tasks mems_allowed to target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
839 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
840 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
841 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
947 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
948 995
949 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
950 if (migrate) { 997 if (migrate)
951 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
952 MPOL_MF_MOVE_ALL);
953 }
954 mmput(mm); 999 mmput(mm);
955 } 1000 }
956 1001
@@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1185 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1186 if (mm) { 1231 if (mm) {
1187 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1188 mmput(mm); 1235 mmput(mm);
1189 } 1236 }
1190 1237
1191 if (is_memory_migrate(cs))
1192 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1193 put_task_struct(tsk); 1238 put_task_struct(tsk);
1194 synchronize_rcu(); 1239 synchronize_rcu();
1195 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e136e..6c2eeb8f6390 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,11 @@
29#include <linux/cpuset.h> 29#include <linux/cpuset.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/posix-timers.h>
32#include <linux/cn_proc.h> 33#include <linux/cn_proc.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/futex.h>
36#include <linux/compat.h>
34 37
35#include <asm/uaccess.h> 38#include <asm/uaccess.h>
36#include <asm/unistd.h> 39#include <asm/unistd.h>
@@ -48,15 +51,85 @@ static void __unhash_process(struct task_struct *p)
48{ 51{
49 nr_threads--; 52 nr_threads--;
50 detach_pid(p, PIDTYPE_PID); 53 detach_pid(p, PIDTYPE_PID);
51 detach_pid(p, PIDTYPE_TGID);
52 if (thread_group_leader(p)) { 54 if (thread_group_leader(p)) {
53 detach_pid(p, PIDTYPE_PGID); 55 detach_pid(p, PIDTYPE_PGID);
54 detach_pid(p, PIDTYPE_SID); 56 detach_pid(p, PIDTYPE_SID);
55 if (p->pid) 57
56 __get_cpu_var(process_counts)--; 58 list_del_init(&p->tasks);
59 __get_cpu_var(process_counts)--;
57 } 60 }
61 list_del_rcu(&p->thread_group);
62 remove_parent(p);
63}
64
65/*
66 * This function expects the tasklist_lock write-locked.
67 */
68static void __exit_signal(struct task_struct *tsk)
69{
70 struct signal_struct *sig = tsk->signal;
71 struct sighand_struct *sighand;
72
73 BUG_ON(!sig);
74 BUG_ON(!atomic_read(&sig->count));
75
76 rcu_read_lock();
77 sighand = rcu_dereference(tsk->sighand);
78 spin_lock(&sighand->siglock);
58 79
59 REMOVE_LINKS(p); 80 posix_cpu_timers_exit(tsk);
81 if (atomic_dec_and_test(&sig->count))
82 posix_cpu_timers_exit_group(tsk);
83 else {
84 /*
85 * If there is any task waiting for the group exit
86 * then notify it:
87 */
88 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
89 wake_up_process(sig->group_exit_task);
90 sig->group_exit_task = NULL;
91 }
92 if (tsk == sig->curr_target)
93 sig->curr_target = next_thread(tsk);
94 /*
95 * Accumulate here the counters for all threads but the
96 * group leader as they die, so they can be added into
97 * the process-wide totals when those are taken.
98 * The group leader stays around as a zombie as long
99 * as there are other threads. When it gets reaped,
100 * the exit.c code will add its counts into these totals.
101 * We won't ever get here for the group leader, since it
102 * will have been the last reference on the signal_struct.
103 */
104 sig->utime = cputime_add(sig->utime, tsk->utime);
105 sig->stime = cputime_add(sig->stime, tsk->stime);
106 sig->min_flt += tsk->min_flt;
107 sig->maj_flt += tsk->maj_flt;
108 sig->nvcsw += tsk->nvcsw;
109 sig->nivcsw += tsk->nivcsw;
110 sig->sched_time += tsk->sched_time;
111 sig = NULL; /* Marker for below. */
112 }
113
114 __unhash_process(tsk);
115
116 tsk->signal = NULL;
117 tsk->sighand = NULL;
118 spin_unlock(&sighand->siglock);
119 rcu_read_unlock();
120
121 __cleanup_sighand(sighand);
122 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
123 flush_sigqueue(&tsk->pending);
124 if (sig) {
125 flush_sigqueue(&sig->shared_pending);
126 __cleanup_signal(sig);
127 }
128}
129
130static void delayed_put_task_struct(struct rcu_head *rhp)
131{
132 put_task_struct(container_of(rhp, struct task_struct, rcu));
60} 133}
61 134
62void release_task(struct task_struct * p) 135void release_task(struct task_struct * p)
@@ -65,21 +138,14 @@ void release_task(struct task_struct * p)
65 task_t *leader; 138 task_t *leader;
66 struct dentry *proc_dentry; 139 struct dentry *proc_dentry;
67 140
68repeat: 141repeat:
69 atomic_dec(&p->user->processes); 142 atomic_dec(&p->user->processes);
70 spin_lock(&p->proc_lock); 143 spin_lock(&p->proc_lock);
71 proc_dentry = proc_pid_unhash(p); 144 proc_dentry = proc_pid_unhash(p);
72 write_lock_irq(&tasklist_lock); 145 write_lock_irq(&tasklist_lock);
73 if (unlikely(p->ptrace)) 146 ptrace_unlink(p);
74 __ptrace_unlink(p);
75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 147 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
76 __exit_signal(p); 148 __exit_signal(p);
77 /*
78 * Note that the fastpath in sys_times depends on __exit_signal having
79 * updated the counters before a task is removed from the tasklist of
80 * the process by __unhash_process.
81 */
82 __unhash_process(p);
83 149
84 /* 150 /*
85 * If we are the last non-leader member of the thread 151 * If we are the last non-leader member of the thread
@@ -107,28 +173,13 @@ repeat:
107 spin_unlock(&p->proc_lock); 173 spin_unlock(&p->proc_lock);
108 proc_pid_flush(proc_dentry); 174 proc_pid_flush(proc_dentry);
109 release_thread(p); 175 release_thread(p);
110 put_task_struct(p); 176 call_rcu(&p->rcu, delayed_put_task_struct);
111 177
112 p = leader; 178 p = leader;
113 if (unlikely(zap_leader)) 179 if (unlikely(zap_leader))
114 goto repeat; 180 goto repeat;
115} 181}
116 182
117/* we are using it only for SMP init */
118
119void unhash_process(struct task_struct *p)
120{
121 struct dentry *proc_dentry;
122
123 spin_lock(&p->proc_lock);
124 proc_dentry = proc_pid_unhash(p);
125 write_lock_irq(&tasklist_lock);
126 __unhash_process(p);
127 write_unlock_irq(&tasklist_lock);
128 spin_unlock(&p->proc_lock);
129 proc_pid_flush(proc_dentry);
130}
131
132/* 183/*
133 * This checks not only the pgrp, but falls back on the pid if no 184 * This checks not only the pgrp, but falls back on the pid if no
134 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 185 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +287,10 @@ static void reparent_to_init(void)
236 287
237 ptrace_unlink(current); 288 ptrace_unlink(current);
238 /* Reparent to init */ 289 /* Reparent to init */
239 REMOVE_LINKS(current); 290 remove_parent(current);
240 current->parent = child_reaper; 291 current->parent = child_reaper;
241 current->real_parent = child_reaper; 292 current->real_parent = child_reaper;
242 SET_LINKS(current); 293 add_parent(current);
243 294
244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 295 /* Set the exit signal to SIGCHLD so we signal init on exit */
245 current->exit_signal = SIGCHLD; 296 current->exit_signal = SIGCHLD;
@@ -536,13 +587,13 @@ static void exit_mm(struct task_struct * tsk)
536 mmput(mm); 587 mmput(mm);
537} 588}
538 589
539static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) 590static inline void choose_new_parent(task_t *p, task_t *reaper)
540{ 591{
541 /* 592 /*
542 * Make sure we're not reparenting to ourselves and that 593 * Make sure we're not reparenting to ourselves and that
543 * the parent is not a zombie. 594 * the parent is not a zombie.
544 */ 595 */
545 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); 596 BUG_ON(p == reaper || reaper->exit_state);
546 p->real_parent = reaper; 597 p->real_parent = reaper;
547} 598}
548 599
@@ -567,9 +618,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
567 * anyway, so let go of it. 618 * anyway, so let go of it.
568 */ 619 */
569 p->ptrace = 0; 620 p->ptrace = 0;
570 list_del_init(&p->sibling); 621 remove_parent(p);
571 p->parent = p->real_parent; 622 p->parent = p->real_parent;
572 list_add_tail(&p->sibling, &p->parent->children); 623 add_parent(p);
573 624
574 /* If we'd notified the old parent about this child's death, 625 /* If we'd notified the old parent about this child's death,
575 * also notify the new parent. 626 * also notify the new parent.
@@ -643,7 +694,7 @@ static void forget_original_parent(struct task_struct * father,
643 694
644 if (father == p->real_parent) { 695 if (father == p->real_parent) {
645 /* reparent with a reaper, real father it's us */ 696 /* reparent with a reaper, real father it's us */
646 choose_new_parent(p, reaper, child_reaper); 697 choose_new_parent(p, reaper);
647 reparent_thread(p, father, 0); 698 reparent_thread(p, father, 0);
648 } else { 699 } else {
649 /* reparent ptraced task to its real parent */ 700 /* reparent ptraced task to its real parent */
@@ -664,7 +715,7 @@ static void forget_original_parent(struct task_struct * father,
664 } 715 }
665 list_for_each_safe(_p, _n, &father->ptrace_children) { 716 list_for_each_safe(_p, _n, &father->ptrace_children) {
666 p = list_entry(_p,struct task_struct,ptrace_list); 717 p = list_entry(_p,struct task_struct,ptrace_list);
667 choose_new_parent(p, reaper, child_reaper); 718 choose_new_parent(p, reaper);
668 reparent_thread(p, father, 1); 719 reparent_thread(p, father, 1);
669 } 720 }
670} 721}
@@ -805,7 +856,7 @@ fastcall NORET_TYPE void do_exit(long code)
805 panic("Aiee, killing interrupt handler!"); 856 panic("Aiee, killing interrupt handler!");
806 if (unlikely(!tsk->pid)) 857 if (unlikely(!tsk->pid))
807 panic("Attempted to kill the idle task!"); 858 panic("Attempted to kill the idle task!");
808 if (unlikely(tsk->pid == 1)) 859 if (unlikely(tsk == child_reaper))
809 panic("Attempted to kill init!"); 860 panic("Attempted to kill init!");
810 861
811 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 862 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
@@ -852,6 +903,12 @@ fastcall NORET_TYPE void do_exit(long code)
852 exit_itimers(tsk->signal); 903 exit_itimers(tsk->signal);
853 acct_process(code); 904 acct_process(code);
854 } 905 }
906 if (unlikely(tsk->robust_list))
907 exit_robust_list(tsk);
908#ifdef CONFIG_COMPAT
909 if (unlikely(tsk->compat_robust_list))
910 compat_exit_robust_list(tsk);
911#endif
855 exit_mm(tsk); 912 exit_mm(tsk);
856 913
857 exit_sem(tsk); 914 exit_sem(tsk);
@@ -912,13 +969,6 @@ asmlinkage long sys_exit(int error_code)
912 do_exit((error_code&0xff)<<8); 969 do_exit((error_code&0xff)<<8);
913} 970}
914 971
915task_t fastcall *next_thread(const task_t *p)
916{
917 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
918}
919
920EXPORT_SYMBOL(next_thread);
921
922/* 972/*
923 * Take down every thread in the group. This is called by fatal signals 973 * Take down every thread in the group. This is called by fatal signals
924 * as well as by sys_exit_group (below). 974 * as well as by sys_exit_group (below).
@@ -933,7 +983,6 @@ do_group_exit(int exit_code)
933 else if (!thread_group_empty(current)) { 983 else if (!thread_group_empty(current)) {
934 struct signal_struct *const sig = current->signal; 984 struct signal_struct *const sig = current->signal;
935 struct sighand_struct *const sighand = current->sighand; 985 struct sighand_struct *const sighand = current->sighand;
936 read_lock(&tasklist_lock);
937 spin_lock_irq(&sighand->siglock); 986 spin_lock_irq(&sighand->siglock);
938 if (sig->flags & SIGNAL_GROUP_EXIT) 987 if (sig->flags & SIGNAL_GROUP_EXIT)
939 /* Another thread got here before we took the lock. */ 988 /* Another thread got here before we took the lock. */
@@ -943,7 +992,6 @@ do_group_exit(int exit_code)
943 zap_other_threads(current); 992 zap_other_threads(current);
944 } 993 }
945 spin_unlock_irq(&sighand->siglock); 994 spin_unlock_irq(&sighand->siglock);
946 read_unlock(&tasklist_lock);
947 } 995 }
948 996
949 do_exit(exit_code); 997 do_exit(exit_code);
@@ -1273,7 +1321,7 @@ bail_ref:
1273 1321
1274 /* move to end of parent's list to avoid starvation */ 1322 /* move to end of parent's list to avoid starvation */
1275 remove_parent(p); 1323 remove_parent(p);
1276 add_parent(p, p->parent); 1324 add_parent(p);
1277 1325
1278 write_unlock_irq(&tasklist_lock); 1326 write_unlock_irq(&tasklist_lock);
1279 1327
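
Earlier in this exit.c diff the hash-based next_thread() implementation is deleted; with every thread now linked on its group leader's thread_group list (see the fork.c changes below), thread iteration presumably becomes a plain RCU list walk. The following is a sketch of what such a helper would look like, stated as an assumption about the replacement rather than code taken from this patch:

/* Sketch only: a list-based next_thread(); the real replacement lives
 * outside this file and may differ in detail. */
static inline struct task_struct *next_thread(const struct task_struct *p)
{
	return list_entry(rcu_dereference(p->thread_group.next),
			  struct task_struct, thread_group);
}
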
diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aaa..3384eb89cb1c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
84#endif 84#endif
85 85
86/* SLAB cache for signal_struct structures (tsk->signal) */ 86/* SLAB cache for signal_struct structures (tsk->signal) */
87kmem_cache_t *signal_cachep; 87static kmem_cache_t *signal_cachep;
88 88
89/* SLAB cache for sighand_struct structures (tsk->sighand) */ 89/* SLAB cache for sighand_struct structures (tsk->sighand) */
90kmem_cache_t *sighand_cachep; 90kmem_cache_t *sighand_cachep;
@@ -108,10 +108,8 @@ void free_task(struct task_struct *tsk)
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
110 110
111void __put_task_struct_cb(struct rcu_head *rhp) 111void __put_task_struct(struct task_struct *tsk)
112{ 112{
113 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
114
115 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 113 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
116 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
117 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
@@ -126,6 +124,12 @@ void __put_task_struct_cb(struct rcu_head *rhp)
126 free_task(tsk); 124 free_task(tsk);
127} 125}
128 126
127void __put_task_struct_cb(struct rcu_head *rhp)
128{
129 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
130 __put_task_struct(tsk);
131}
132
129void __init fork_init(unsigned long mempages) 133void __init fork_init(unsigned long mempages)
130{ 134{
131#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 135#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -721,7 +725,7 @@ out_release:
721 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 725 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
722 free_fd_array(new_fdt->fd, new_fdt->max_fds); 726 free_fd_array(new_fdt->fd, new_fdt->max_fds);
723 kmem_cache_free(files_cachep, newf); 727 kmem_cache_free(files_cachep, newf);
724 goto out; 728 return NULL;
725} 729}
726 730
727static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 731static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -769,8 +773,7 @@ int unshare_files(void)
769 struct files_struct *files = current->files; 773 struct files_struct *files = current->files;
770 int rc; 774 int rc;
771 775
772 if(!files) 776 BUG_ON(!files);
773 BUG();
774 777
775 /* This can race but the race causes us to copy when we don't 778 /* This can race but the race causes us to copy when we don't
776 need to and drop the copy */ 779 need to and drop the copy */
@@ -787,14 +790,6 @@ int unshare_files(void)
787 790
788EXPORT_SYMBOL(unshare_files); 791EXPORT_SYMBOL(unshare_files);
789 792
790void sighand_free_cb(struct rcu_head *rhp)
791{
792 struct sighand_struct *sp;
793
794 sp = container_of(rhp, struct sighand_struct, rcu);
795 kmem_cache_free(sighand_cachep, sp);
796}
797
798static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 793static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
799{ 794{
800 struct sighand_struct *sig; 795 struct sighand_struct *sig;
@@ -807,12 +802,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
807 rcu_assign_pointer(tsk->sighand, sig); 802 rcu_assign_pointer(tsk->sighand, sig);
808 if (!sig) 803 if (!sig)
809 return -ENOMEM; 804 return -ENOMEM;
810 spin_lock_init(&sig->siglock);
811 atomic_set(&sig->count, 1); 805 atomic_set(&sig->count, 1);
812 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 806 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
813 return 0; 807 return 0;
814} 808}
815 809
810void __cleanup_sighand(struct sighand_struct *sighand)
811{
812 if (atomic_dec_and_test(&sighand->count))
813 kmem_cache_free(sighand_cachep, sighand);
814}
815
816static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 816static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
817{ 817{
818 struct signal_struct *sig; 818 struct signal_struct *sig;
@@ -848,7 +848,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
848 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 848 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
849 sig->it_real_incr.tv64 = 0; 849 sig->it_real_incr.tv64 = 0;
850 sig->real_timer.function = it_real_fn; 850 sig->real_timer.function = it_real_fn;
851 sig->real_timer.data = tsk; 851 sig->tsk = tsk;
852 852
853 sig->it_virt_expires = cputime_zero; 853 sig->it_virt_expires = cputime_zero;
854 sig->it_virt_incr = cputime_zero; 854 sig->it_virt_incr = cputime_zero;
@@ -882,6 +882,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
882 return 0; 882 return 0;
883} 883}
884 884
885void __cleanup_signal(struct signal_struct *sig)
886{
887 exit_thread_group_keys(sig);
888 kmem_cache_free(signal_cachep, sig);
889}
890
891static inline void cleanup_signal(struct task_struct *tsk)
892{
893 struct signal_struct *sig = tsk->signal;
894
895 atomic_dec(&sig->live);
896
897 if (atomic_dec_and_test(&sig->count))
898 __cleanup_signal(sig);
899}
900
885static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 901static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
886{ 902{
887 unsigned long new_flags = p->flags; 903 unsigned long new_flags = p->flags;
@@ -1062,7 +1078,10 @@ static task_t *copy_process(unsigned long clone_flags,
1062 * Clear TID on mm_release()? 1078 * Clear TID on mm_release()?
1063 */ 1079 */
1064 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1080 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1065 1081 p->robust_list = NULL;
1082#ifdef CONFIG_COMPAT
1083 p->compat_robust_list = NULL;
1084#endif
1066 /* 1085 /*
1067 * sigaltstack should be cleared when sharing the same VM 1086 * sigaltstack should be cleared when sharing the same VM
1068 */ 1087 */
@@ -1093,6 +1112,7 @@ static task_t *copy_process(unsigned long clone_flags,
1093 * We dont wake it up yet. 1112 * We dont wake it up yet.
1094 */ 1113 */
1095 p->group_leader = p; 1114 p->group_leader = p;
1115 INIT_LIST_HEAD(&p->thread_group);
1096 INIT_LIST_HEAD(&p->ptrace_children); 1116 INIT_LIST_HEAD(&p->ptrace_children);
1097 INIT_LIST_HEAD(&p->ptrace_list); 1117 INIT_LIST_HEAD(&p->ptrace_list);
1098 1118
@@ -1116,16 +1136,6 @@ static task_t *copy_process(unsigned long clone_flags,
1116 !cpu_online(task_cpu(p)))) 1136 !cpu_online(task_cpu(p))))
1117 set_task_cpu(p, smp_processor_id()); 1137 set_task_cpu(p, smp_processor_id());
1118 1138
1119 /*
1120 * Check for pending SIGKILL! The new thread should not be allowed
1121 * to slip out of an OOM kill. (or normal SIGKILL.)
1122 */
1123 if (sigismember(&current->pending.signal, SIGKILL)) {
1124 write_unlock_irq(&tasklist_lock);
1125 retval = -EINTR;
1126 goto bad_fork_cleanup_namespace;
1127 }
1128
1129 /* CLONE_PARENT re-uses the old parent */ 1139 /* CLONE_PARENT re-uses the old parent */
1130 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1140 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1131 p->real_parent = current->real_parent; 1141 p->real_parent = current->real_parent;
@@ -1134,6 +1144,23 @@ static task_t *copy_process(unsigned long clone_flags,
1134 p->parent = p->real_parent; 1144 p->parent = p->real_parent;
1135 1145
1136 spin_lock(&current->sighand->siglock); 1146 spin_lock(&current->sighand->siglock);
1147
1148 /*
1149 * Process group and session signals need to be delivered to just the
1150 * parent before the fork or both the parent and the child after the
1151 * fork. Restart if a signal comes in before we add the new process to
1152 * it's process group.
1153 * A fatal signal pending means that current will exit, so the new
1154 * thread can't slip out of an OOM kill (or normal SIGKILL).
1155 */
1156 recalc_sigpending();
1157 if (signal_pending(current)) {
1158 spin_unlock(&current->sighand->siglock);
1159 write_unlock_irq(&tasklist_lock);
1160 retval = -ERESTARTNOINTR;
1161 goto bad_fork_cleanup_namespace;
1162 }
1163
1137 if (clone_flags & CLONE_THREAD) { 1164 if (clone_flags & CLONE_THREAD) {
1138 /* 1165 /*
1139 * Important: if an exit-all has been started then 1166 * Important: if an exit-all has been started then
@@ -1146,17 +1173,9 @@ static task_t *copy_process(unsigned long clone_flags,
1146 retval = -EAGAIN; 1173 retval = -EAGAIN;
1147 goto bad_fork_cleanup_namespace; 1174 goto bad_fork_cleanup_namespace;
1148 } 1175 }
1149 p->group_leader = current->group_leader;
1150 1176
1151 if (current->signal->group_stop_count > 0) { 1177 p->group_leader = current->group_leader;
1152 /* 1178 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1153 * There is an all-stop in progress for the group.
1154 * We ourselves will stop as soon as we check signals.
1155 * Make the new thread part of that group stop too.
1156 */
1157 current->signal->group_stop_count++;
1158 set_tsk_thread_flag(p, TIF_SIGPENDING);
1159 }
1160 1179
1161 if (!cputime_eq(current->signal->it_virt_expires, 1180 if (!cputime_eq(current->signal->it_virt_expires,
1162 cputime_zero) || 1181 cputime_zero) ||
@@ -1179,23 +1198,25 @@ static task_t *copy_process(unsigned long clone_flags,
1179 */ 1198 */
1180 p->ioprio = current->ioprio; 1199 p->ioprio = current->ioprio;
1181 1200
1182 SET_LINKS(p); 1201 if (likely(p->pid)) {
1183 if (unlikely(p->ptrace & PT_PTRACED)) 1202 add_parent(p);
1184 __ptrace_link(p, current->parent); 1203 if (unlikely(p->ptrace & PT_PTRACED))
1185 1204 __ptrace_link(p, current->parent);
1186 if (thread_group_leader(p)) { 1205
1187 p->signal->tty = current->signal->tty; 1206 if (thread_group_leader(p)) {
1188 p->signal->pgrp = process_group(current); 1207 p->signal->tty = current->signal->tty;
1189 p->signal->session = current->signal->session; 1208 p->signal->pgrp = process_group(current);
1190 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1209 p->signal->session = current->signal->session;
1191 attach_pid(p, PIDTYPE_SID, p->signal->session); 1210 attach_pid(p, PIDTYPE_PGID, process_group(p));
1192 if (p->pid) 1211 attach_pid(p, PIDTYPE_SID, p->signal->session);
1212
1213 list_add_tail(&p->tasks, &init_task.tasks);
1193 __get_cpu_var(process_counts)++; 1214 __get_cpu_var(process_counts)++;
1215 }
1216 attach_pid(p, PIDTYPE_PID, p->pid);
1217 nr_threads++;
1194 } 1218 }
1195 attach_pid(p, PIDTYPE_TGID, p->tgid);
1196 attach_pid(p, PIDTYPE_PID, p->pid);
1197 1219
1198 nr_threads++;
1199 total_forks++; 1220 total_forks++;
1200 spin_unlock(&current->sighand->siglock); 1221 spin_unlock(&current->sighand->siglock);
1201 write_unlock_irq(&tasklist_lock); 1222 write_unlock_irq(&tasklist_lock);
@@ -1210,9 +1231,9 @@ bad_fork_cleanup_mm:
1210 if (p->mm) 1231 if (p->mm)
1211 mmput(p->mm); 1232 mmput(p->mm);
1212bad_fork_cleanup_signal: 1233bad_fork_cleanup_signal:
1213 exit_signal(p); 1234 cleanup_signal(p);
1214bad_fork_cleanup_sighand: 1235bad_fork_cleanup_sighand:
1215 exit_sighand(p); 1236 __cleanup_sighand(p->sighand);
1216bad_fork_cleanup_fs: 1237bad_fork_cleanup_fs:
1217 exit_fs(p); /* blocking */ 1238 exit_fs(p); /* blocking */
1218bad_fork_cleanup_files: 1239bad_fork_cleanup_files:
@@ -1259,7 +1280,7 @@ task_t * __devinit fork_idle(int cpu)
1259 if (!task) 1280 if (!task)
1260 return ERR_PTR(-ENOMEM); 1281 return ERR_PTR(-ENOMEM);
1261 init_idle(task, cpu); 1282 init_idle(task, cpu);
1262 unhash_process(task); 1283
1263 return task; 1284 return task;
1264} 1285}
1265 1286
@@ -1294,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
1294{ 1315{
1295 struct task_struct *p; 1316 struct task_struct *p;
1296 int trace = 0; 1317 int trace = 0;
1297 long pid = alloc_pidmap(); 1318 struct pid *pid = alloc_pid();
1319 long nr;
1298 1320
1299 if (pid < 0) 1321 if (!pid)
1300 return -EAGAIN; 1322 return -EAGAIN;
1323 nr = pid->nr;
1301 if (unlikely(current->ptrace)) { 1324 if (unlikely(current->ptrace)) {
1302 trace = fork_traceflag (clone_flags); 1325 trace = fork_traceflag (clone_flags);
1303 if (trace) 1326 if (trace)
1304 clone_flags |= CLONE_PTRACE; 1327 clone_flags |= CLONE_PTRACE;
1305 } 1328 }
1306 1329
1307 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1330 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1308 /* 1331 /*
1309 * Do this prior waking up the new thread - the thread pointer 1332 * Do this prior waking up the new thread - the thread pointer
1310 * might get invalid after that point, if the thread exits quickly. 1333 * might get invalid after that point, if the thread exits quickly.
@@ -1331,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
1331 p->state = TASK_STOPPED; 1354 p->state = TASK_STOPPED;
1332 1355
1333 if (unlikely (trace)) { 1356 if (unlikely (trace)) {
1334 current->ptrace_message = pid; 1357 current->ptrace_message = nr;
1335 ptrace_notify ((trace << 8) | SIGTRAP); 1358 ptrace_notify ((trace << 8) | SIGTRAP);
1336 } 1359 }
1337 1360
@@ -1341,21 +1364,31 @@ long do_fork(unsigned long clone_flags,
1341 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1364 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1342 } 1365 }
1343 } else { 1366 } else {
1344 free_pidmap(pid); 1367 free_pid(pid);
1345 pid = PTR_ERR(p); 1368 nr = PTR_ERR(p);
1346 } 1369 }
1347 return pid; 1370 return nr;
1348} 1371}
1349 1372
1350#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1373#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1351#define ARCH_MIN_MMSTRUCT_ALIGN 0 1374#define ARCH_MIN_MMSTRUCT_ALIGN 0
1352#endif 1375#endif
1353 1376
1377static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1378{
1379 struct sighand_struct *sighand = data;
1380
1381 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1382 SLAB_CTOR_CONSTRUCTOR)
1383 spin_lock_init(&sighand->siglock);
1384}
1385
1354void __init proc_caches_init(void) 1386void __init proc_caches_init(void)
1355{ 1387{
1356 sighand_cachep = kmem_cache_create("sighand_cache", 1388 sighand_cachep = kmem_cache_create("sighand_cache",
1357 sizeof(struct sighand_struct), 0, 1389 sizeof(struct sighand_struct), 0,
1358 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1390 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1391 sighand_ctor, NULL);
1359 signal_cachep = kmem_cache_create("signal_cache", 1392 signal_cachep = kmem_cache_create("signal_cache",
1360 sizeof(struct signal_struct), 0, 1393 sizeof(struct signal_struct), 0,
1361 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1394 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
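
The sighand_cachep change above is subtle: with SLAB_DESTROY_BY_RCU, the memory of a freed sighand_struct may be recycled for a new sighand_struct before an RCU grace period ends, so the constructor initializes siglock once per slab object rather than per allocation. Lock-free readers must cope with that reuse by taking the lock and then re-checking that the pointer still belongs to the task they started from. A sketch of that re-check pattern follows; the lock_sighand() name is invented for illustration and only mirrors how such lookups are typically written:

#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static struct sighand_struct *lock_sighand(struct task_struct *tsk,
					   unsigned long *flags)
{
	struct sighand_struct *sighand;

	for (;;) {
		rcu_read_lock();
		sighand = rcu_dereference(tsk->sighand);
		if (unlikely(!sighand)) {
			rcu_read_unlock();
			break;			/* task has no sighand any more */
		}
		spin_lock_irqsave(&sighand->siglock, *flags);
		if (likely(sighand == tsk->sighand)) {
			rcu_read_unlock();
			break;			/* still this task's sighand: safe to use */
		}
		/* The object was freed and reused for another task: retry. */
		spin_unlock_irqrestore(&sighand->siglock, *flags);
		rcu_read_unlock();
	}
	return sighand;
}
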
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f978032..5699c512057b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{
918 u32 uval;
919
920retry:
921 if (get_user(uval, uaddr))
922 return -1;
923
924 if ((uval & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval)
937 goto retry;
938
939 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1);
941 }
942 return 0;
943}
944
945/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters.
948 *
949 * We silently return on any sign of list-walking problem.
950 */
951void exit_robust_list(struct task_struct *curr)
952{
953 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT;
956 unsigned long futex_offset;
957
958 /*
959 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()):
961 */
962 if (get_user(entry, &head->list.next))
963 return;
964 /*
965 * Fetch the relative futex offset:
966 */
967 if (get_user(futex_offset, &head->futex_offset))
968 return;
969 /*
970 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists:
972 */
973 if (get_user(pending, &head->list_op_pending))
974 return;
975 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr);
977
978 while (entry != &head->list) {
979 /*
980 * A pending lock might already be on the list, so
981 * dont process it twice:
982 */
983 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset,
985 curr))
986 return;
987 /*
988 * Fetch the next entry in the list:
989 */
990 if (get_user(entry, &entry->next))
991 return;
992 /*
993 * Avoid excessively long or circular lists:
994 */
995 if (!--limit)
996 break;
997
998 cond_resched();
999 }
1000}
1001
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1003 unsigned long uaddr2, int val2, int val3)
834{ 1004{
@@ -869,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
869 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
870 int val2 = 0; 1040 int val2 = 0;
871 1041
872 if ((op == FUTEX_WAIT) && utime) { 1042 if (utime && (op == FUTEX_WAIT)) {
873 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1043 if (copy_from_user(&t, utime, sizeof(t)) != 0)
874 return -EFAULT; 1044 return -EFAULT;
1045 if (!timespec_valid(&t))
1046 return -EINVAL;
875 timeout = timespec_to_jiffies(&t) + 1; 1047 timeout = timespec_to_jiffies(&t) + 1;
876 } 1048 }
877 /* 1049 /*
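
The comment block added near the top of this futex.c diff describes the userspace half of the robust-futex protocol: each thread registers one robust_list_head and then links the futexes it holds into that list, with list_op_pending covering the window around acquisition. Registration itself is a single syscall per thread. The userspace sketch below is illustrative only, assumes the uapi definitions in <linux/futex.h>, and is not part of the patch; real programs normally get this from glibc/NPTL, which registers the list automatically for robust pthread mutexes:

#include <linux/futex.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* One robust-list head for this thread; an empty list points at itself. */
static struct robust_list_head head = {
	.list		 = { &head.list },
	.futex_offset	 = 0,		/* offset of the lock word inside each entry */
	.list_op_pending = NULL,
};

int main(void)
{
	if (syscall(SYS_set_robust_list, &head, sizeof(head)) != 0) {
		perror("set_robust_list");
		return 1;
	}
	printf("robust futex list registered for this thread\n");
	return 0;
}
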
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 000000000000..1ab6a0ea3d14
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,144 @@
1/*
2 * linux/kernel/futex_compat.c
3 *
4 * Futex compatibililty routines.
5 *
6 * Copyright 2006, Red Hat, Inc., Ingo Molnar
7 */
8
9#include <linux/linkage.h>
10#include <linux/compat.h>
11#include <linux/futex.h>
12
13#include <asm/uaccess.h>
14
15/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters.
18 *
19 * We silently return on any sign of list-walking problem.
20 */
21void compat_exit_robust_list(struct task_struct *curr)
22{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending;
25 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset;
28
29 /*
30 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()):
32 */
33 if (get_user(uentry, &head->list.next))
34 return;
35 entry = compat_ptr(uentry);
36 /*
37 * Fetch the relative futex offset:
38 */
39 if (get_user(futex_offset, &head->futex_offset))
40 return;
41 /*
42 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists:
44 */
45 if (get_user(upending, &head->list_op_pending))
46 return;
47 pending = compat_ptr(upending);
48 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr);
50
51 while (compat_ptr(uentry) != &head->list) {
52 /*
53 * A pending lock might already be on the list, so
54 * dont process it twice:
55 */
56 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset,
58 curr))
59 return;
60
61 /*
62 * Fetch the next entry in the list:
63 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next))
65 return;
66 entry = compat_ptr(uentry);
67 /*
68 * Avoid excessively long or circular lists:
69 */
70 if (!--limit)
71 break;
72
73 cond_resched();
74 }
75}
76
77asmlinkage long
78compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
79 compat_size_t len)
80{
81 if (unlikely(len != sizeof(*head)))
82 return -EINVAL;
83
84 current->compat_robust_list = head;
85
86 return 0;
87}
88
89asmlinkage long
90compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
91 compat_size_t __user *len_ptr)
92{
93 struct compat_robust_list_head *head;
94 unsigned long ret;
95
96 if (!pid)
97 head = current->compat_robust_list;
98 else {
99 struct task_struct *p;
100
101 ret = -ESRCH;
102 read_lock(&tasklist_lock);
103 p = find_task_by_pid(pid);
104 if (!p)
105 goto err_unlock;
106 ret = -EPERM;
107 if ((current->euid != p->euid) && (current->euid != p->uid) &&
108 !capable(CAP_SYS_PTRACE))
109 goto err_unlock;
110 head = p->compat_robust_list;
111 read_unlock(&tasklist_lock);
112 }
113
114 if (put_user(sizeof(*head), len_ptr))
115 return -EFAULT;
116 return put_user(ptr_to_compat(head), head_ptr);
117
118err_unlock:
119 read_unlock(&tasklist_lock);
120
121 return ret;
122}
123
124asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
125 struct compat_timespec __user *utime, u32 __user *uaddr2,
126 u32 val3)
127{
128 struct timespec t;
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0;
131
132 if (utime && (op == FUTEX_WAIT)) {
133 if (get_compat_timespec(&t, utime))
134 return -EFAULT;
135 if (!timespec_valid(&t))
136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1;
138 }
139 if (op >= FUTEX_REQUEUE)
140 val2 = (int) (unsigned long) utime;
141
142 return do_futex((unsigned long)uaddr, op, val, timeout,
143 (unsigned long)uaddr2, val2, val3);
144}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14bc9cfa6399..f181ff4dd32e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts)
123EXPORT_SYMBOL_GPL(ktime_get_ts); 123EXPORT_SYMBOL_GPL(ktime_get_ts);
124 124
125/* 125/*
126 * Get the coarse grained time at the softirq based on xtime and
127 * wall_to_monotonic.
128 */
129static void hrtimer_get_softirq_time(struct hrtimer_base *base)
130{
131 ktime_t xtim, tomono;
132 unsigned long seq;
133
134 do {
135 seq = read_seqbegin(&xtime_lock);
136 xtim = timespec_to_ktime(xtime);
137 tomono = timespec_to_ktime(wall_to_monotonic);
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 base[CLOCK_REALTIME].softirq_time = xtim;
142 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
143}
144
145/*
126 * Functions and macros which are different for UP/SMP systems are kept in a 146 * Functions and macros which are different for UP/SMP systems are kept in a
127 * single place 147 * single place
128 */ 148 */
@@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
246/* 266/*
247 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
248 */ 268 */
249static unsigned long ktime_divns(const ktime_t kt, nsec_t div) 269static unsigned long ktime_divns(const ktime_t kt, s64 div)
250{ 270{
251 u64 dclc, inc, dns; 271 u64 dclc, inc, dns;
252 int sft = 0; 272 int sft = 0;
@@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
281 * hrtimer_forward - forward the timer expiry 301 * hrtimer_forward - forward the timer expiry
282 * 302 *
283 * @timer: hrtimer to forward 303 * @timer: hrtimer to forward
304 * @now: forward past this time
284 * @interval: the interval to forward 305 * @interval: the interval to forward
285 * 306 *
286 * Forward the timer expiry so it will expire in the future. 307 * Forward the timer expiry so it will expire in the future.
287 * Returns the number of overruns. 308 * Returns the number of overruns.
288 */ 309 */
289unsigned long 310unsigned long
290hrtimer_forward(struct hrtimer *timer, ktime_t interval) 311hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
291{ 312{
292 unsigned long orun = 1; 313 unsigned long orun = 1;
293 ktime_t delta, now; 314 ktime_t delta;
294
295 now = timer->base->get_time();
296 315
297 delta = ktime_sub(now, timer->expires); 316 delta = ktime_sub(now, timer->expires);
298 317
@@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval)
303 interval.tv64 = timer->base->resolution.tv64; 322 interval.tv64 = timer->base->resolution.tv64;
304 323
305 if (unlikely(delta.tv64 >= interval.tv64)) { 324 if (unlikely(delta.tv64 >= interval.tv64)) {
306 nsec_t incr = ktime_to_ns(interval); 325 s64 incr = ktime_to_ns(interval);
307 326
308 orun = ktime_divns(delta, incr); 327 orun = ktime_divns(delta, incr);
309 timer->expires = ktime_add_ns(timer->expires, incr * orun); 328 timer->expires = ktime_add_ns(timer->expires, incr * orun);
@@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
355 rb_link_node(&timer->node, parent, link); 374 rb_link_node(&timer->node, parent, link);
356 rb_insert_color(&timer->node, &base->active); 375 rb_insert_color(&timer->node, &base->active);
357 376
358 timer->state = HRTIMER_PENDING;
359
360 if (!base->first || timer->expires.tv64 < 377 if (!base->first || timer->expires.tv64 <
361 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 378 rb_entry(base->first, struct hrtimer, node)->expires.tv64)
362 base->first = &timer->node; 379 base->first = &timer->node;
@@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
376 if (base->first == &timer->node) 393 if (base->first == &timer->node)
377 base->first = rb_next(&timer->node); 394 base->first = rb_next(&timer->node);
378 rb_erase(&timer->node, &base->active); 395 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE;
379} 397}
380 398
381/* 399/*
@@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
386{ 404{
387 if (hrtimer_active(timer)) { 405 if (hrtimer_active(timer)) {
388 __remove_hrtimer(timer, base); 406 __remove_hrtimer(timer, base);
389 timer->state = HRTIMER_INACTIVE;
390 return 1; 407 return 1;
391 } 408 }
392 return 0; 409 return 0;
@@ -560,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
560 clock_id = CLOCK_MONOTONIC; 577 clock_id = CLOCK_MONOTONIC;
561 578
562 timer->base = &bases[clock_id]; 579 timer->base = &bases[clock_id];
580 timer->node.rb_parent = HRTIMER_INACTIVE;
563} 581}
564 582
565/** 583/**
@@ -586,48 +604,38 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
586 */ 604 */
587static inline void run_hrtimer_queue(struct hrtimer_base *base) 605static inline void run_hrtimer_queue(struct hrtimer_base *base)
588{ 606{
589 ktime_t now = base->get_time();
590 struct rb_node *node; 607 struct rb_node *node;
591 608
609 if (!base->first)
610 return;
611
612 if (base->get_softirq_time)
613 base->softirq_time = base->get_softirq_time();
614
592 spin_lock_irq(&base->lock); 615 spin_lock_irq(&base->lock);
593 616
594 while ((node = base->first)) { 617 while ((node = base->first)) {
595 struct hrtimer *timer; 618 struct hrtimer *timer;
596 int (*fn)(void *); 619 int (*fn)(struct hrtimer *);
597 int restart; 620 int restart;
598 void *data;
599 621
600 timer = rb_entry(node, struct hrtimer, node); 622 timer = rb_entry(node, struct hrtimer, node);
601 if (now.tv64 <= timer->expires.tv64) 623 if (base->softirq_time.tv64 <= timer->expires.tv64)
602 break; 624 break;
603 625
604 fn = timer->function; 626 fn = timer->function;
605 data = timer->data;
606 set_curr_timer(base, timer); 627 set_curr_timer(base, timer);
607 timer->state = HRTIMER_RUNNING;
608 __remove_hrtimer(timer, base); 628 __remove_hrtimer(timer, base);
609 spin_unlock_irq(&base->lock); 629 spin_unlock_irq(&base->lock);
610 630
611 /* 631 restart = fn(timer);
612 * fn == NULL is special case for the simplest timer
613 * variant - wake up process and do not restart:
614 */
615 if (!fn) {
616 wake_up_process(data);
617 restart = HRTIMER_NORESTART;
618 } else
619 restart = fn(data);
620 632
621 spin_lock_irq(&base->lock); 633 spin_lock_irq(&base->lock);
622 634
623 /* Another CPU has added back the timer */ 635 if (restart != HRTIMER_NORESTART) {
624 if (timer->state != HRTIMER_RUNNING) 636 BUG_ON(hrtimer_active(timer));
625 continue;
626
627 if (restart == HRTIMER_RESTART)
628 enqueue_hrtimer(timer, base); 637 enqueue_hrtimer(timer, base);
629 else 638 }
630 timer->state = HRTIMER_EXPIRED;
631 } 639 }
632 set_curr_timer(base, NULL); 640 set_curr_timer(base, NULL);
633 spin_unlock_irq(&base->lock); 641 spin_unlock_irq(&base->lock);
@@ -641,6 +649,8 @@ void hrtimer_run_queues(void)
641 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 649 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
642 int i; 650 int i;
643 651
652 hrtimer_get_softirq_time(base);
653
644 for (i = 0; i < MAX_HRTIMER_BASES; i++) 654 for (i = 0; i < MAX_HRTIMER_BASES; i++)
645 run_hrtimer_queue(&base[i]); 655 run_hrtimer_queue(&base[i]);
646} 656}
@@ -648,80 +658,69 @@ void hrtimer_run_queues(void)
648/* 658/*
649 * Sleep related functions: 659 * Sleep related functions:
650 */ 660 */
651 661static int hrtimer_wakeup(struct hrtimer *timer)
652/**
653 * schedule_hrtimer - sleep until timeout
654 *
655 * @timer: hrtimer variable initialized with the correct clock base
656 * @mode: timeout value is abs/rel
657 *
658 * Make the current task sleep until @timeout is
659 * elapsed.
660 *
661 * You can set the task state as follows -
662 *
663 * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
664 * pass before the routine returns. The routine will return 0
665 *
666 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
667 * delivered to the current task. In this case the remaining time
668 * will be returned
669 *
670 * The current task state is guaranteed to be TASK_RUNNING when this
671 * routine returns.
672 */
673static ktime_t __sched
674schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
675{ 662{
676 /* fn stays NULL, meaning single-shot wakeup: */ 663 struct hrtimer_sleeper *t =
677 timer->data = current; 664 container_of(timer, struct hrtimer_sleeper, timer);
665 struct task_struct *task = t->task;
678 666
679 hrtimer_start(timer, timer->expires, mode); 667 t->task = NULL;
668 if (task)
669 wake_up_process(task);
680 670
681 schedule(); 671 return HRTIMER_NORESTART;
682 hrtimer_cancel(timer); 672}
683 673
684 /* Return the remaining time: */ 674void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
685 if (timer->state != HRTIMER_EXPIRED) 675{
686 return ktime_sub(timer->expires, timer->base->get_time()); 676 sl->timer.function = hrtimer_wakeup;
687 else 677 sl->task = task;
688 return (ktime_t) {.tv64 = 0 };
689} 678}
690 679
691static inline ktime_t __sched 680static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
692schedule_hrtimer_interruptible(struct hrtimer *timer,
693 const enum hrtimer_mode mode)
694{ 681{
695 set_current_state(TASK_INTERRUPTIBLE); 682 hrtimer_init_sleeper(t, current);
683
684 do {
685 set_current_state(TASK_INTERRUPTIBLE);
686 hrtimer_start(&t->timer, t->timer.expires, mode);
687
688 schedule();
689
690 hrtimer_cancel(&t->timer);
691 mode = HRTIMER_ABS;
696 692
697 return schedule_hrtimer(timer, mode); 693 } while (t->task && !signal_pending(current));
694
695 return t->task == NULL;
698} 696}
699 697
700static long __sched nanosleep_restart(struct restart_block *restart) 698static long __sched nanosleep_restart(struct restart_block *restart)
701{ 699{
700 struct hrtimer_sleeper t;
702 struct timespec __user *rmtp; 701 struct timespec __user *rmtp;
703 struct timespec tu; 702 struct timespec tu;
704 void *rfn_save = restart->fn; 703 ktime_t time;
705 struct hrtimer timer;
706 ktime_t rem;
707 704
708 restart->fn = do_no_restart_syscall; 705 restart->fn = do_no_restart_syscall;
709 706
710 hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); 707 hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
711 708 t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
712 timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
713 709
714 rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); 710 if (do_nanosleep(&t, HRTIMER_ABS))
715
716 if (rem.tv64 <= 0)
717 return 0; 711 return 0;
718 712
719 rmtp = (struct timespec __user *) restart->arg2; 713 rmtp = (struct timespec __user *) restart->arg2;
720 tu = ktime_to_timespec(rem); 714 if (rmtp) {
721 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 715 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
722 return -EFAULT; 716 if (time.tv64 <= 0)
717 return 0;
718 tu = ktime_to_timespec(time);
719 if (copy_to_user(rmtp, &tu, sizeof(tu)))
720 return -EFAULT;
721 }
723 722
724 restart->fn = rfn_save; 723 restart->fn = nanosleep_restart;
725 724
726 /* The other values in restart are already filled in */ 725 /* The other values in restart are already filled in */
727 return -ERESTART_RESTARTBLOCK; 726 return -ERESTART_RESTARTBLOCK;
@@ -731,33 +730,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
731 const enum hrtimer_mode mode, const clockid_t clockid) 730 const enum hrtimer_mode mode, const clockid_t clockid)
732{ 731{
733 struct restart_block *restart; 732 struct restart_block *restart;
734 struct hrtimer timer; 733 struct hrtimer_sleeper t;
735 struct timespec tu; 734 struct timespec tu;
736 ktime_t rem; 735 ktime_t rem;
737 736
738 hrtimer_init(&timer, clockid, mode); 737 hrtimer_init(&t.timer, clockid, mode);
739 738 t.timer.expires = timespec_to_ktime(*rqtp);
740 timer.expires = timespec_to_ktime(*rqtp); 739 if (do_nanosleep(&t, mode))
741
742 rem = schedule_hrtimer_interruptible(&timer, mode);
743 if (rem.tv64 <= 0)
744 return 0; 740 return 0;
745 741
746 /* Absolute timers do not update the rmtp value and restart: */ 742 /* Absolute timers do not update the rmtp value and restart: */
747 if (mode == HRTIMER_ABS) 743 if (mode == HRTIMER_ABS)
748 return -ERESTARTNOHAND; 744 return -ERESTARTNOHAND;
749 745
750 tu = ktime_to_timespec(rem); 746 if (rmtp) {
751 747 rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
752 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 748 if (rem.tv64 <= 0)
753 return -EFAULT; 749 return 0;
750 tu = ktime_to_timespec(rem);
751 if (copy_to_user(rmtp, &tu, sizeof(tu)))
752 return -EFAULT;
753 }
754 754
755 restart = &current_thread_info()->restart_block; 755 restart = &current_thread_info()->restart_block;
756 restart->fn = nanosleep_restart; 756 restart->fn = nanosleep_restart;
757 restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; 757 restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
758 restart->arg1 = timer.expires.tv64 >> 32; 758 restart->arg1 = t.timer.expires.tv64 >> 32;
759 restart->arg2 = (unsigned long) rmtp; 759 restart->arg2 = (unsigned long) rmtp;
760 restart->arg3 = (unsigned long) timer.base->index; 760 restart->arg3 = (unsigned long) t.timer.base->index;
761 761
762 return -ERESTART_RESTARTBLOCK; 762 return -ERESTART_RESTARTBLOCK;
763} 763}
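The hrtimer changes above switch the callback signature from int (*)(void *) plus a separate ->data pointer to int (*)(struct hrtimer *): users now embed the hrtimer in their own structure and recover their context with container_of(), and sleep-style waiters use struct hrtimer_sleeper via hrtimer_init_sleeper()/do_nanosleep() instead of the removed schedule_hrtimer(). A minimal sketch of a periodic user under the new signature, assuming only the post-patch API (my_ctx, my_timer_fn and the 1-second period are illustrative, not from the patch); the itimer and posix-timer conversions further down follow the same pattern.

#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct my_ctx {
	struct hrtimer	timer;
	ktime_t		period;
};

static int my_timer_fn(struct hrtimer *timer)
{
	/* no ->data field any more: recover the enclosing structure instead */
	struct my_ctx *ctx = container_of(timer, struct my_ctx, timer);

	/*
	 * Re-arm against the coarse per-softirq timestamp, the same way
	 * it_real_fn() does below, and ask to be queued again.
	 */
	hrtimer_forward(timer, timer->base->softirq_time, ctx->period);
	return HRTIMER_RESTART;
}

static void my_ctx_start(struct my_ctx *ctx)
{
	hrtimer_init(&ctx->timer, CLOCK_MONOTONIC, HRTIMER_REL);
	ctx->timer.function = my_timer_fn;
	ctx->period = ktime_set(1, 0);			/* 1 second */
	hrtimer_start(&ctx->timer, ctx->period, HRTIMER_REL);
}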
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6edfcef291e8..ac766ad573e8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -271,6 +271,7 @@ void free_irq(unsigned int irq, void *dev_id)
271 struct irqaction **p; 271 struct irqaction **p;
272 unsigned long flags; 272 unsigned long flags;
273 273
274 WARN_ON(in_interrupt());
274 if (irq >= NR_IRQS) 275 if (irq >= NR_IRQS)
275 return; 276 return;
276 277
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 680e6b70c872..204ed7939e75 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,16 +128,16 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(void *data) 131int it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct task_struct *tsk = (struct task_struct *) data; 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer);
134 135
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
136
137 if (tsk->signal->it_real_incr.tv64 != 0) {
138 hrtimer_forward(&tsk->signal->real_timer,
139 tsk->signal->it_real_incr);
140 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART; 141 return HRTIMER_RESTART;
142 } 142 }
143 return HRTIMER_NORESTART; 143 return HRTIMER_NORESTART;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063aaa..20a997c73c3d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
170 sa.sa.sa_handler = SIG_IGN; 170 sa.sa.sa_handler = SIG_IGN;
171 sa.sa.sa_flags = 0; 171 sa.sa.sa_flags = 0;
172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); 172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
173 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); 173 do_sigaction(SIGCHLD, &sa, NULL);
174 allow_signal(SIGCHLD); 174 allow_signal(SIGCHLD);
175 175
176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fb9f753ef60..1156eb0977d0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
323} 323}
324 324
325/* 325/*
326 * This function is called from exit_thread or flush_thread when task tk's 326 * This function is called from finish_task_switch when task tk becomes dead,
327 * stack is being recycled so that we can recycle any function-return probe 327 * so that we can recycle any function-return probe instances associated
328 * instances associated with this task. These left over instances represent 328 * with this task. These left over instances represent probed functions
329 * probed functions that have been called but will never return. 329 * that have been called but will never return.
330 */ 330 */
331void __kprobes kprobe_flush_task(struct task_struct *tk) 331void __kprobes kprobe_flush_task(struct task_struct *tk)
332{ 332{
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
336 unsigned long flags = 0; 336 unsigned long flags = 0;
337 337
338 spin_lock_irqsave(&kretprobe_lock, flags); 338 spin_lock_irqsave(&kretprobe_lock, flags);
339 head = kretprobe_inst_table_head(current); 339 head = kretprobe_inst_table_head(tk);
340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
341 if (ri->task == tk) 341 if (ri->task == tk)
342 recycle_rp_inst(ri); 342 recycle_rp_inst(ri);
diff --git a/kernel/module.c b/kernel/module.c
index ddfe45ac2fd1..d24deb0dbbc9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -64,26 +64,17 @@ static DEFINE_SPINLOCK(modlist_lock);
64static DEFINE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
65static LIST_HEAD(modules); 65static LIST_HEAD(modules);
66 66
67static DEFINE_MUTEX(notify_mutex); 67static BLOCKING_NOTIFIER_HEAD(module_notify_list);
68static struct notifier_block * module_notify_list;
69 68
70int register_module_notifier(struct notifier_block * nb) 69int register_module_notifier(struct notifier_block * nb)
71{ 70{
72 int err; 71 return blocking_notifier_chain_register(&module_notify_list, nb);
73 mutex_lock(&notify_mutex);
74 err = notifier_chain_register(&module_notify_list, nb);
75 mutex_unlock(&notify_mutex);
76 return err;
77} 72}
78EXPORT_SYMBOL(register_module_notifier); 73EXPORT_SYMBOL(register_module_notifier);
79 74
80int unregister_module_notifier(struct notifier_block * nb) 75int unregister_module_notifier(struct notifier_block * nb)
81{ 76{
82 int err; 77 return blocking_notifier_chain_unregister(&module_notify_list, nb);
83 mutex_lock(&notify_mutex);
84 err = notifier_chain_unregister(&module_notify_list, nb);
85 mutex_unlock(&notify_mutex);
86 return err;
87} 78}
88EXPORT_SYMBOL(unregister_module_notifier); 79EXPORT_SYMBOL(unregister_module_notifier);
89 80
@@ -136,7 +127,7 @@ extern const unsigned long __start___kcrctab_gpl_future[];
136#ifndef CONFIG_MODVERSIONS 127#ifndef CONFIG_MODVERSIONS
137#define symversion(base, idx) NULL 128#define symversion(base, idx) NULL
138#else 129#else
139#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 130#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
140#endif 131#endif
141 132
142/* lookup symbol in given range of kernel_symbols */ 133/* lookup symbol in given range of kernel_symbols */
@@ -1263,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
1263 || strcmp(license, "GPL v2") == 0 1254 || strcmp(license, "GPL v2") == 0
1264 || strcmp(license, "GPL and additional rights") == 0 1255 || strcmp(license, "GPL and additional rights") == 0
1265 || strcmp(license, "Dual BSD/GPL") == 0 1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1266 || strcmp(license, "Dual MPL/GPL") == 0); 1258 || strcmp(license, "Dual MPL/GPL") == 0);
1267} 1259}
1268 1260
@@ -1816,9 +1808,8 @@ sys_init_module(void __user *umod,
1816 /* Drop lock so they can recurse */ 1808 /* Drop lock so they can recurse */
1817 mutex_unlock(&module_mutex); 1809 mutex_unlock(&module_mutex);
1818 1810
1819 mutex_lock(&notify_mutex); 1811 blocking_notifier_call_chain(&module_notify_list,
1820 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1812 MODULE_STATE_COMING, mod);
1821 mutex_unlock(&notify_mutex);
1822 1813
1823 /* Start the module */ 1814 /* Start the module */
1824 if (mod->init != NULL) 1815 if (mod->init != NULL)
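module.c drops its private notify_mutex plus open-coded notifier list in favour of the new BLOCKING_NOTIFIER_HEAD() API; panic.c and profile.c below make the same conversion, using the ATOMIC_* variants where the chain must be callable from contexts that cannot sleep. A minimal sketch of a hypothetical subscriber to the module chain under the new API (my_module_event, my_nb and the init/exit functions are illustrative, not from the patch); on a blocking chain both registration and the callback may sleep.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int my_module_event(struct notifier_block *nb, unsigned long action,
			   void *data)
{
	struct module *mod = data;

	if (action == MODULE_STATE_COMING)
		printk(KERN_INFO "module %s is coming up\n", mod->name);
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call	= my_module_event,
};

static int __init my_init(void)
{
	return register_module_notifier(&my_nb);
}

static void __exit my_exit(void)
{
	unregister_module_notifier(&my_nb);
}

module_init(my_init);
module_exit(my_exit);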
diff --git a/kernel/panic.c b/kernel/panic.c
index acd95adddb93..f895c7c01d5b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(pause_on_oops_lock);
29int panic_timeout; 29int panic_timeout;
30EXPORT_SYMBOL(panic_timeout); 30EXPORT_SYMBOL(panic_timeout);
31 31
32struct notifier_block *panic_notifier_list; 32ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
33 33
34EXPORT_SYMBOL(panic_notifier_list); 34EXPORT_SYMBOL(panic_notifier_list);
35 35
@@ -97,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...)
97 smp_send_stop(); 97 smp_send_stop();
98#endif 98#endif
99 99
100 notifier_call_chain(&panic_notifier_list, 0, buf); 100 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
101 101
102 if (!panic_blink) 102 if (!panic_blink)
103 panic_blink = no_blink; 103 panic_blink = no_blink;
diff --git a/kernel/params.c b/kernel/params.c
index 9de637a5c8bc..af43ecdc8d9b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34static inline int dash2underscore(char c) 34static inline char dash2underscore(char c)
35{ 35{
36 if (c == '-') 36 if (c == '-')
37 return '_'; 37 return '_';
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc07246991..eeb836b65ca4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
@@ -146,105 +220,80 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
154 if (pid == NULL) {
155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162
163 return 0;
164}
165
166static fastcall int __detach_pid(task_t *task, enum pid_type type)
167{
168 struct pid *pid, *pid_next;
169 int nr = 0;
170
171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) {
173 225
174 if (list_empty(&pid->pid_list)) { 226 WARN_ON(!task->pid); /* to be removed soon */
175 nr = pid->nr; 227 WARN_ON(!nr); /* to be removed soon */
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 228
186 list_del_rcu(&pid->pid_list); 229 link = &task->pids[type];
187 pid->nr = 0; 230 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
188 232
189 return nr; 233 return 0;
190} 234}
191 235
192void fastcall detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
193{ 237{
194 int tmp, nr; 238 struct pid_link *link;
239 struct pid *pid;
240 int tmp;
195 241
196 nr = __detach_pid(task, type); 242 link = &task->pids[type];
197 if (!nr) 243 pid = link->pid;
198 return; 244
245 hlist_del_rcu(&link->node);
246 link->pid = NULL;
199 247
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
201 if (tmp != type && find_pid(tmp, nr)) 249 if (!hlist_empty(&pid->tasks[tmp]))
202 return; 250 return;
203 251
204 free_pidmap(nr); 252 free_pid(pid);
205} 253}
206 254
207task_t *find_task_by_pid_type(int type, int nr) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
208{ 256{
209 struct pid *pid; 257 struct task_struct *result = NULL;
210 258 if (pid) {
211 pid = find_pid(type, nr); 259 struct hlist_node *first;
212 if (!pid) 260 first = rcu_dereference(pid->tasks[type].first);
213 return NULL; 261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
214 266
215 return pid_task(&pid->pid_list, type); 267/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
216} 273}
217 274
218EXPORT_SYMBOL(find_task_by_pid_type); 275EXPORT_SYMBOL(find_task_by_pid_type);
219 276
220/* 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
221 * This function switches the PIDs if a non-leader thread calls 278{
222 * sys_execve() - this must be done without releasing the PID. 279 struct task_struct *result;
223 * (which a detach_pid() would eventually do.) 280 rcu_read_lock();
224 */ 281 result = pid_task(pid, type);
225void switch_exec_pids(task_t *leader, task_t *thread) 282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
286}
287
288struct pid *find_get_pid(pid_t nr)
226{ 289{
227 __detach_pid(leader, PIDTYPE_PID); 290 struct pid *pid;
228 __detach_pid(leader, PIDTYPE_TGID); 291
229 __detach_pid(leader, PIDTYPE_PGID); 292 rcu_read_lock();
230 __detach_pid(leader, PIDTYPE_SID); 293 pid = get_pid(find_pid(nr));
231 294 rcu_read_unlock();
232 __detach_pid(thread, PIDTYPE_PID); 295
233 __detach_pid(thread, PIDTYPE_TGID); 296 return pid;
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248} 297}
249 298
250/* 299/*
@@ -254,7 +303,7 @@ void switch_exec_pids(task_t *leader, task_t *thread)
254 */ 303 */
255void __init pidhash_init(void) 304void __init pidhash_init(void)
256{ 305{
257 int i, j, pidhash_size; 306 int i, pidhash_size;
258 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
259 308
260 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -263,30 +312,23 @@ void __init pidhash_init(void)
263 312
264 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
265 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
266 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
267 316
268 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
269 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
270 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
271 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
272 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
273 for (j = 0; j < pidhash_size; j++)
274 INIT_HLIST_HEAD(&pid_hash[i][j]);
275 }
276} 322}
277 323
278void __init pidmap_init(void) 324void __init pidmap_init(void)
279{ 325{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 326 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
327 /* Reserve PID 0. We never call free_pidmap(0) */
283 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
285 330
286 /* 331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
287 * Allocate PID 0, and hash it via all PID types: 332 __alignof__(struct pid),
288 */ 333 SLAB_PANIC, NULL, NULL);
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292} 334}
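pid.c now keeps a single hash of reference-counted struct pid objects (freed via RCU through delayed_put_pid()) instead of one hash per pid type, with tasks attached through pid->tasks[type]. Lookups go through find_pid() under RCU or tasklist_lock, pid_task() to reach an attached task, and the reference-taking wrappers find_get_pid()/get_pid_task(). A minimal sketch of resolving a numeric pid to a task reference with the new helpers (my_get_task is an illustrative name, not from the patch):

#include <linux/pid.h>
#include <linux/sched.h>

static struct task_struct *my_get_task(pid_t nr)
{
	struct pid *pid;
	struct task_struct *task;

	pid = find_get_pid(nr);			/* takes a struct pid reference */
	if (!pid)
		return NULL;

	task = get_pid_task(pid, PIDTYPE_PID);	/* NULL if nothing is attached */
	put_pid(pid);				/* drop our struct pid reference */

	return task;				/* caller must put_task_struct() */
}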
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9944379360b5..ac6dc8744429 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
145 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
146static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
147 147
148static int posix_timer_fn(void *data); 148static int posix_timer_fn(struct hrtimer *data);
149 149
150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
151 151
@@ -251,15 +251,18 @@ __initcall(init_posix_timers);
251 251
252static void schedule_next_timer(struct k_itimer *timr) 252static void schedule_next_timer(struct k_itimer *timr)
253{ 253{
254 struct hrtimer *timer = &timr->it.real.timer;
255
254 if (timr->it.real.interval.tv64 == 0) 256 if (timr->it.real.interval.tv64 == 0)
255 return; 257 return;
256 258
257 timr->it_overrun += hrtimer_forward(&timr->it.real.timer, 259 timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
258 timr->it.real.interval); 260 timr->it.real.interval);
261
259 timr->it_overrun_last = timr->it_overrun; 262 timr->it_overrun_last = timr->it_overrun;
260 timr->it_overrun = -1; 263 timr->it_overrun = -1;
261 ++timr->it_requeue_pending; 264 ++timr->it_requeue_pending;
262 hrtimer_restart(&timr->it.real.timer); 265 hrtimer_restart(timer);
263} 266}
264 267
265/* 268/*
@@ -331,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
331 334
332 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
333 */ 336 */
334static int posix_timer_fn(void *data) 337static int posix_timer_fn(struct hrtimer *timer)
335{ 338{
336 struct k_itimer *timr = data; 339 struct k_itimer *timr;
337 unsigned long flags; 340 unsigned long flags;
338 int si_private = 0; 341 int si_private = 0;
339 int ret = HRTIMER_NORESTART; 342 int ret = HRTIMER_NORESTART;
340 343
344 timr = container_of(timer, struct k_itimer, it.real.timer);
341 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
342 346
343 if (timr->it.real.interval.tv64 != 0) 347 if (timr->it.real.interval.tv64 != 0)
@@ -351,7 +355,8 @@ static int posix_timer_fn(void *data)
351 */ 355 */
352 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
353 timr->it_overrun += 357 timr->it_overrun +=
354 hrtimer_forward(&timr->it.real.timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time,
355 timr->it.real.interval); 360 timr->it.real.interval);
356 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
357 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
@@ -603,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
603static void 608static void
604common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 609common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
605{ 610{
606 ktime_t remaining; 611 ktime_t now, remaining, iv;
607 struct hrtimer *timer = &timr->it.real.timer; 612 struct hrtimer *timer = &timr->it.real.timer;
608 613
609 memset(cur_setting, 0, sizeof(struct itimerspec)); 614 memset(cur_setting, 0, sizeof(struct itimerspec));
610 remaining = hrtimer_get_remaining(timer);
611 615
612 /* Time left ? or timer pending */ 616 iv = timr->it.real.interval;
613 if (remaining.tv64 > 0 || hrtimer_active(timer)) 617
614 goto calci;
615 /* interval timer ? */ 618 /* interval timer ? */
616 if (timr->it.real.interval.tv64 == 0) 619 if (iv.tv64)
620 cur_setting->it_interval = ktime_to_timespec(iv);
621 else if (!hrtimer_active(timer) &&
622 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
617 return; 623 return;
624
625 now = timer->base->get_time();
626
618 /* 627 /*
619 * When a requeue is pending or this is a SIGEV_NONE timer 628 * When a requeue is pending or this is a SIGEV_NONE
620 * move the expiry time forward by intervals, so expiry is > 629 * timer move the expiry time forward by intervals, so
621 * now. 630 * expiry is > now.
622 */ 631 */
623 if (timr->it_requeue_pending & REQUEUE_PENDING || 632 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
624 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { 633 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
625 timr->it_overrun += 634 timr->it_overrun += hrtimer_forward(timer, now, iv);
626 hrtimer_forward(timer, timr->it.real.interval); 635
627 remaining = hrtimer_get_remaining(timer); 636 remaining = ktime_sub(timer->expires, now);
628 }
629 calci:
630 /* interval timer ? */
631 if (timr->it.real.interval.tv64 != 0)
632 cur_setting->it_interval =
633 ktime_to_timespec(timr->it.real.interval);
634 /* Return 0 only, when the timer is expired and not pending */ 637 /* Return 0 only, when the timer is expired and not pending */
635 if (remaining.tv64 <= 0) 638 if (remaining.tv64 <= 0) {
636 cur_setting->it_value.tv_nsec = 1; 639 /*
637 else 640 * A single shot SIGEV_NONE timer must return 0, when
641 * it is expired !
642 */
643 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
644 cur_setting->it_value.tv_nsec = 1;
645 } else
638 cur_setting->it_value = ktime_to_timespec(remaining); 646 cur_setting->it_value = ktime_to_timespec(remaining);
639} 647}
640 648
@@ -717,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
717 725
718 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 726 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
719 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 727 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
720 timr->it.real.timer.data = timr;
721 timr->it.real.timer.function = posix_timer_fn; 728 timr->it.real.timer.function = posix_timer_fn;
722 729
723 timer->expires = timespec_to_ktime(new_setting->it_value); 730 timer->expires = timespec_to_ktime(new_setting->it_value);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad77..b2a5f671d6cd 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
26 (p->flags & PF_NOFREEZE) || 26 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 27 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 28 (p->exit_state == EXIT_DEAD) ||
29 (p->state == TASK_STOPPED) || 29 (p->state == TASK_STOPPED))
30 (p->state == TASK_TRACED))
31 return 0; 30 return 0;
32 return 1; 31 return 1;
33} 32}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9177f3f73a6c..044b8e0c1025 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -454,10 +454,11 @@ static int load_image(struct swap_map_handle *handle,
454 nr_pages++; 454 nr_pages++;
455 } 455 }
456 } while (ret > 0); 456 } while (ret > 0);
457 if (!error) 457 if (!error) {
458 printk("\b\b\b\bdone\n"); 458 printk("\b\b\b\bdone\n");
459 if (!snapshot_image_loaded(snapshot)) 459 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA; 460 error = -ENODATA;
461 }
461 return error; 462 return error;
462} 463}
463 464
diff --git a/kernel/profile.c b/kernel/profile.c
index ad81f799a9b4..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -87,72 +87,52 @@ void __init profile_init(void)
87 87
88#ifdef CONFIG_PROFILING 88#ifdef CONFIG_PROFILING
89 89
90static DECLARE_RWSEM(profile_rwsem); 90static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
91static DEFINE_RWLOCK(handoff_lock); 91static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
92static struct notifier_block * task_exit_notifier; 92static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
93static struct notifier_block * task_free_notifier;
94static struct notifier_block * munmap_notifier;
95 93
96void profile_task_exit(struct task_struct * task) 94void profile_task_exit(struct task_struct * task)
97{ 95{
98 down_read(&profile_rwsem); 96 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
99 notifier_call_chain(&task_exit_notifier, 0, task);
100 up_read(&profile_rwsem);
101} 97}
102 98
103int profile_handoff_task(struct task_struct * task) 99int profile_handoff_task(struct task_struct * task)
104{ 100{
105 int ret; 101 int ret;
106 read_lock(&handoff_lock); 102 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
107 ret = notifier_call_chain(&task_free_notifier, 0, task);
108 read_unlock(&handoff_lock);
109 return (ret == NOTIFY_OK) ? 1 : 0; 103 return (ret == NOTIFY_OK) ? 1 : 0;
110} 104}
111 105
112void profile_munmap(unsigned long addr) 106void profile_munmap(unsigned long addr)
113{ 107{
114 down_read(&profile_rwsem); 108 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
115 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
116 up_read(&profile_rwsem);
117} 109}
118 110
119int task_handoff_register(struct notifier_block * n) 111int task_handoff_register(struct notifier_block * n)
120{ 112{
121 int err = -EINVAL; 113 return atomic_notifier_chain_register(&task_free_notifier, n);
122
123 write_lock(&handoff_lock);
124 err = notifier_chain_register(&task_free_notifier, n);
125 write_unlock(&handoff_lock);
126 return err;
127} 114}
128 115
129int task_handoff_unregister(struct notifier_block * n) 116int task_handoff_unregister(struct notifier_block * n)
130{ 117{
131 int err = -EINVAL; 118 return atomic_notifier_chain_unregister(&task_free_notifier, n);
132
133 write_lock(&handoff_lock);
134 err = notifier_chain_unregister(&task_free_notifier, n);
135 write_unlock(&handoff_lock);
136 return err;
137} 119}
138 120
139int profile_event_register(enum profile_type type, struct notifier_block * n) 121int profile_event_register(enum profile_type type, struct notifier_block * n)
140{ 122{
141 int err = -EINVAL; 123 int err = -EINVAL;
142 124
143 down_write(&profile_rwsem);
144
145 switch (type) { 125 switch (type) {
146 case PROFILE_TASK_EXIT: 126 case PROFILE_TASK_EXIT:
147 err = notifier_chain_register(&task_exit_notifier, n); 127 err = blocking_notifier_chain_register(
128 &task_exit_notifier, n);
148 break; 129 break;
149 case PROFILE_MUNMAP: 130 case PROFILE_MUNMAP:
150 err = notifier_chain_register(&munmap_notifier, n); 131 err = blocking_notifier_chain_register(
132 &munmap_notifier, n);
151 break; 133 break;
152 } 134 }
153 135
154 up_write(&profile_rwsem);
155
156 return err; 136 return err;
157} 137}
158 138
@@ -161,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
161{ 141{
162 int err = -EINVAL; 142 int err = -EINVAL;
163 143
164 down_write(&profile_rwsem);
165
166 switch (type) { 144 switch (type) {
167 case PROFILE_TASK_EXIT: 145 case PROFILE_TASK_EXIT:
168 err = notifier_chain_unregister(&task_exit_notifier, n); 146 err = blocking_notifier_chain_unregister(
147 &task_exit_notifier, n);
169 break; 148 break;
170 case PROFILE_MUNMAP: 149 case PROFILE_MUNMAP:
171 err = notifier_chain_unregister(&munmap_notifier, n); 150 err = blocking_notifier_chain_unregister(
151 &munmap_notifier, n);
172 break; 152 break;
173 } 153 }
174 154
175 up_write(&profile_rwsem);
176 return err; 155 return err;
177} 156}
178 157
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d95a72c9279d..86a7f6c60cb2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -35,9 +35,9 @@ void __ptrace_link(task_t *child, task_t *new_parent)
35 if (child->parent == new_parent) 35 if (child->parent == new_parent)
36 return; 36 return;
37 list_add(&child->ptrace_list, &child->parent->ptrace_children); 37 list_add(&child->ptrace_list, &child->parent->ptrace_children);
38 REMOVE_LINKS(child); 38 remove_parent(child);
39 child->parent = new_parent; 39 child->parent = new_parent;
40 SET_LINKS(child); 40 add_parent(child);
41} 41}
42 42
43/* 43/*
@@ -77,9 +77,9 @@ void __ptrace_unlink(task_t *child)
77 child->ptrace = 0; 77 child->ptrace = 0;
78 if (!list_empty(&child->ptrace_list)) { 78 if (!list_empty(&child->ptrace_list)) {
79 list_del_init(&child->ptrace_list); 79 list_del_init(&child->ptrace_list);
80 REMOVE_LINKS(child); 80 remove_parent(child);
81 child->parent = child->real_parent; 81 child->parent = child->real_parent;
82 SET_LINKS(child); 82 add_parent(child);
83 } 83 }
84 84
85 ptrace_untrace(child); 85 ptrace_untrace(child);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b4b362b5baf5..8154e7589d12 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
303 303
304 for_each_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -535,7 +535,7 @@ rcu_torture_init(void)
535 atomic_set(&n_rcu_torture_error, 0); 535 atomic_set(&n_rcu_torture_error, 0);
536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
537 atomic_set(&rcu_torture_wcount[i], 0); 537 atomic_set(&rcu_torture_wcount[i], 0);
538 for_each_cpu(cpu) { 538 for_each_possible_cpu(cpu) {
539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
540 per_cpu(rcu_torture_count, cpu)[i] = 0; 540 per_cpu(rcu_torture_count, cpu)[i] = 0;
541 per_cpu(rcu_torture_batch, cpu)[i] = 0; 541 per_cpu(rcu_torture_batch, cpu)[i] = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7ffaabd64f89..dd153d6f8a04 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
49#include <linux/syscalls.h> 49#include <linux/syscalls.h>
50#include <linux/times.h> 50#include <linux/times.h>
51#include <linux/acct.h> 51#include <linux/acct.h>
52#include <linux/kprobes.h>
52#include <asm/tlb.h> 53#include <asm/tlb.h>
53 54
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -144,7 +145,8 @@
144 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
145 146
146#define DELTA(p) \ 147#define DELTA(p) \
147 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
149 INTERACTIVE_DELTA)
148 150
149#define TASK_INTERACTIVE(p) \ 151#define TASK_INTERACTIVE(p) \
150 ((p)->prio <= (p)->static_prio - DELTA(p)) 152 ((p)->prio <= (p)->static_prio - DELTA(p))
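Assuming the values these constants normally have earlier in sched.c (MAX_BONUS = 10, INTERACTIVE_DELTA = 2) and SCALE(v1, v1_max, v2_max) = (v1) * (v2_max) / (v1_max), the old and new DELTA(p) compare as follows:

	nice  0:  old  (0 * 10) / 40 + 2             = 2
	          new  ((0 + 20) * 10) / 40 - 5 + 2  = 2
	nice -1:  old  (-1 * 10) / 40 + 2            = 0 + 2 = 2   (C truncates -0.25 toward zero)
	          new  ((-1 + 20) * 10) / 40 - 5 + 2 = 4 - 5 + 2 = 1

Zero and positive nice values are unaffected; only negative nice values change, because shifting TASK_NICE(p) into the non-negative range before the integer division makes the rounding behave like a floor instead of truncating toward zero.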
@@ -665,9 +667,13 @@ static int effective_prio(task_t *p)
665/* 667/*
666 * __activate_task - move a task to the runqueue. 668 * __activate_task - move a task to the runqueue.
667 */ 669 */
668static inline void __activate_task(task_t *p, runqueue_t *rq) 670static void __activate_task(task_t *p, runqueue_t *rq)
669{ 671{
670 enqueue_task(p, rq->active); 672 prio_array_t *target = rq->active;
673
674 if (batch_task(p))
675 target = rq->expired;
676 enqueue_task(p, target);
671 rq->nr_running++; 677 rq->nr_running++;
672} 678}
673 679
@@ -686,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
686 unsigned long long __sleep_time = now - p->timestamp; 692 unsigned long long __sleep_time = now - p->timestamp;
687 unsigned long sleep_time; 693 unsigned long sleep_time;
688 694
689 if (unlikely(p->policy == SCHED_BATCH)) 695 if (batch_task(p))
690 sleep_time = 0; 696 sleep_time = 0;
691 else { 697 else {
692 if (__sleep_time > NS_MAX_SLEEP_AVG) 698 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -698,21 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
698 if (likely(sleep_time > 0)) { 704 if (likely(sleep_time > 0)) {
699 /* 705 /*
700 * User tasks that sleep a long time are categorised as 706 * User tasks that sleep a long time are categorised as
701 * idle and will get just interactive status to stay active & 707 * idle. They will only have their sleep_avg increased to a
702 * prevent them suddenly becoming cpu hogs and starving 708 * level that makes them just interactive priority to stay
703 * other processes. 709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
704 */ 711 */
705 if (p->mm && p->activated != -1 && 712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
706 sleep_time > INTERACTIVE_SLEEP(p)) { 713 unsigned long ceiling;
707 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 714
708 DEF_TIMESLICE); 715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
716 DEF_TIMESLICE);
717 if (p->sleep_avg < ceiling)
718 p->sleep_avg = ceiling;
709 } else { 719 } else {
710 /* 720 /*
711 * Tasks waking from uninterruptible sleep are 721 * Tasks waking from uninterruptible sleep are
712 * limited in their sleep_avg rise as they 722 * limited in their sleep_avg rise as they
713 * are likely to be waiting on I/O 723 * are likely to be waiting on I/O
714 */ 724 */
715 if (p->activated == -1 && p->mm) { 725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
716 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
717 sleep_time = 0; 727 sleep_time = 0;
718 else if (p->sleep_avg + sleep_time >= 728 else if (p->sleep_avg + sleep_time >=
@@ -767,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
767 * This checks to make sure it's not an uninterruptible task 777 * This checks to make sure it's not an uninterruptible task
768 * that is now waking up. 778 * that is now waking up.
769 */ 779 */
770 if (!p->activated) { 780 if (p->sleep_type == SLEEP_NORMAL) {
771 /* 781 /*
772 * Tasks which were woken up by interrupts (ie. hw events) 782 * Tasks which were woken up by interrupts (ie. hw events)
773 * are most likely of interactive nature. So we give them 783 * are most likely of interactive nature. So we give them
@@ -776,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
776 * on a CPU, first time around: 786 * on a CPU, first time around:
777 */ 787 */
778 if (in_interrupt()) 788 if (in_interrupt())
779 p->activated = 2; 789 p->sleep_type = SLEEP_INTERRUPTED;
780 else { 790 else {
781 /* 791 /*
782 * Normal first-time wakeups get a credit too for 792 * Normal first-time wakeups get a credit too for
783 * on-runqueue time, but it will be weighted down: 793 * on-runqueue time, but it will be weighted down:
784 */ 794 */
785 p->activated = 1; 795 p->sleep_type = SLEEP_INTERACTIVE;
786 } 796 }
787 } 797 }
788 p->timestamp = now; 798 p->timestamp = now;
@@ -1270,19 +1280,19 @@ out_activate:
1270 * Tasks on involuntary sleep don't earn 1280 * Tasks on involuntary sleep don't earn
1271 * sleep_avg beyond just interactive state. 1281 * sleep_avg beyond just interactive state.
1272 */ 1282 */
1273 p->activated = -1; 1283 p->sleep_type = SLEEP_NONINTERACTIVE;
1274 } 1284 } else
1275 1285
1276 /* 1286 /*
1277 * Tasks that have marked their sleep as noninteractive get 1287 * Tasks that have marked their sleep as noninteractive get
1278 * woken up without updating their sleep average. (i.e. their 1288 * woken up with their sleep average not weighted in an
1279 * sleep is handled in a priority-neutral manner, no priority 1289 * interactive way.
1280 * boost and no penalty.)
1281 */ 1290 */
1282 if (old_state & TASK_NONINTERACTIVE) 1291 if (old_state & TASK_NONINTERACTIVE)
1283 __activate_task(p, rq); 1292 p->sleep_type = SLEEP_NONINTERACTIVE;
1284 else 1293
1285 activate_task(p, rq, cpu == this_cpu); 1294
1295 activate_task(p, rq, cpu == this_cpu);
1286 /* 1296 /*
1287 * Sync wakeups (i.e. those types of wakeups where the waker 1297 * Sync wakeups (i.e. those types of wakeups where the waker
1288 * has indicated that it will leave the CPU in short order) 1298 * has indicated that it will leave the CPU in short order)
@@ -1546,8 +1556,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1546 finish_lock_switch(rq, prev); 1556 finish_lock_switch(rq, prev);
1547 if (mm) 1557 if (mm)
1548 mmdrop(mm); 1558 mmdrop(mm);
1549 if (unlikely(prev_task_flags & PF_DEAD)) 1559 if (unlikely(prev_task_flags & PF_DEAD)) {
1560 /*
1561 * Remove function-return probe instances associated with this
1562 * task and put them back on the free list.
1563 */
1564 kprobe_flush_task(prev);
1550 put_task_struct(prev); 1565 put_task_struct(prev);
1566 }
1551} 1567}
1552 1568
1553/** 1569/**
@@ -1617,7 +1633,7 @@ unsigned long nr_uninterruptible(void)
1617{ 1633{
1618 unsigned long i, sum = 0; 1634 unsigned long i, sum = 0;
1619 1635
1620 for_each_cpu(i) 1636 for_each_possible_cpu(i)
1621 sum += cpu_rq(i)->nr_uninterruptible; 1637 sum += cpu_rq(i)->nr_uninterruptible;
1622 1638
1623 /* 1639 /*
@@ -1634,7 +1650,7 @@ unsigned long long nr_context_switches(void)
1634{ 1650{
1635 unsigned long long i, sum = 0; 1651 unsigned long long i, sum = 0;
1636 1652
1637 for_each_cpu(i) 1653 for_each_possible_cpu(i)
1638 sum += cpu_rq(i)->nr_switches; 1654 sum += cpu_rq(i)->nr_switches;
1639 1655
1640 return sum; 1656 return sum;
@@ -1644,12 +1660,27 @@ unsigned long nr_iowait(void)
1644{ 1660{
1645 unsigned long i, sum = 0; 1661 unsigned long i, sum = 0;
1646 1662
1647 for_each_cpu(i) 1663 for_each_possible_cpu(i)
1648 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1664 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1649 1665
1650 return sum; 1666 return sum;
1651} 1667}
1652 1668
1669unsigned long nr_active(void)
1670{
1671 unsigned long i, running = 0, uninterruptible = 0;
1672
1673 for_each_online_cpu(i) {
1674 running += cpu_rq(i)->nr_running;
1675 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1676 }
1677
1678 if (unlikely((long)uninterruptible < 0))
1679 uninterruptible = 0;
1680
1681 return running + uninterruptible;
1682}
1683
1653#ifdef CONFIG_SMP 1684#ifdef CONFIG_SMP
1654 1685
1655/* 1686/*
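nr_active() above sums nr_running and nr_uninterruptible over the online CPUs and clamps a negative total: the per-runqueue counters are read locklessly, and an individual nr_uninterruptible can be transiently negative when a sleeper is woken on a different CPU from the one where it went to sleep. A minimal sketch of how a load-average style consumer might sample it, assuming nr_active() is declared in sched.h and using the existing FIXED_1/EXP_1/CALC_LOAD fixed-point macros (my_avenrun and my_sample_load are illustrative names, not from the patch):

#include <linux/sched.h>

static unsigned long my_avenrun;		/* FSHIFT fixed-point average */

static void my_sample_load(void)
{
	unsigned long active = nr_active() * FIXED_1;

	CALC_LOAD(my_avenrun, EXP_1, active);	/* 1-minute decay constant */
}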
@@ -2852,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2852 2883
2853#endif 2884#endif
2854 2885
2886static inline int interactive_sleep(enum sleep_type sleep_type)
2887{
2888 return (sleep_type == SLEEP_INTERACTIVE ||
2889 sleep_type == SLEEP_INTERRUPTED);
2890}
2891
2855/* 2892/*
2856 * schedule() is the main scheduler function. 2893 * schedule() is the main scheduler function.
2857 */ 2894 */
@@ -2871,13 +2908,11 @@ asmlinkage void __sched schedule(void)
2871 * schedule() atomically, we ignore that path for now. 2908 * schedule() atomically, we ignore that path for now.
2872 * Otherwise, whine if we are scheduling when we should not be. 2909 * Otherwise, whine if we are scheduling when we should not be.
2873 */ 2910 */
2874 if (likely(!current->exit_state)) { 2911 if (unlikely(in_atomic() && !current->exit_state)) {
2875 if (unlikely(in_atomic())) { 2912 printk(KERN_ERR "BUG: scheduling while atomic: "
2876 printk(KERN_ERR "BUG: scheduling while atomic: " 2913 "%s/0x%08x/%d\n",
2877 "%s/0x%08x/%d\n", 2914 current->comm, preempt_count(), current->pid);
2878 current->comm, preempt_count(), current->pid); 2915 dump_stack();
2879 dump_stack();
2880 }
2881 } 2916 }
2882 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2917 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2883 2918
@@ -2977,12 +3012,12 @@ go_idle:
2977 queue = array->queue + idx; 3012 queue = array->queue + idx;
2978 next = list_entry(queue->next, task_t, run_list); 3013 next = list_entry(queue->next, task_t, run_list);
2979 3014
2980 if (!rt_task(next) && next->activated > 0) { 3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
2981 unsigned long long delta = now - next->timestamp; 3016 unsigned long long delta = now - next->timestamp;
2982 if (unlikely((long long)(now - next->timestamp) < 0)) 3017 if (unlikely((long long)(now - next->timestamp) < 0))
2983 delta = 0; 3018 delta = 0;
2984 3019
2985 if (next->activated == 1) 3020 if (next->sleep_type == SLEEP_INTERACTIVE)
2986 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3021 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2987 3022
2988 array = next->array; 3023 array = next->array;
@@ -2992,10 +3027,9 @@ go_idle:
2992 dequeue_task(next, array); 3027 dequeue_task(next, array);
2993 next->prio = new_prio; 3028 next->prio = new_prio;
2994 enqueue_task(next, array); 3029 enqueue_task(next, array);
2995 } else 3030 }
2996 requeue_task(next, array);
2997 } 3031 }
2998 next->activated = 0; 3032 next->sleep_type = SLEEP_NORMAL;
2999switch_tasks: 3033switch_tasks:
3000 if (next == rq->idle) 3034 if (next == rq->idle)
3001 schedstat_inc(rq, sched_goidle); 3035 schedstat_inc(rq, sched_goidle);
@@ -5568,11 +5602,31 @@ static int cpu_to_cpu_group(int cpu)
5568} 5602}
5569#endif 5603#endif
5570 5604
5605#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS];
5608#endif
5609
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5611static int cpu_to_core_group(int cpu)
5612{
5613 return first_cpu(cpu_sibling_map[cpu]);
5614}
5615#elif defined(CONFIG_SCHED_MC)
5616static int cpu_to_core_group(int cpu)
5617{
5618 return cpu;
5619}
5620#endif
5621
5571static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5622static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5572static struct sched_group sched_group_phys[NR_CPUS]; 5623static struct sched_group sched_group_phys[NR_CPUS];
5573static int cpu_to_phys_group(int cpu) 5624static int cpu_to_phys_group(int cpu)
5574{ 5625{
5575#ifdef CONFIG_SCHED_SMT 5626#if defined(CONFIG_SCHED_MC)
5627 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT)
5576 return first_cpu(cpu_sibling_map[cpu]); 5630 return first_cpu(cpu_sibling_map[cpu]);
5577#else 5631#else
5578 return cpu; 5632 return cpu;
@@ -5595,6 +5649,32 @@ static int cpu_to_allnodes_group(int cpu)
5595{ 5649{
5596 return cpu_to_node(cpu); 5650 return cpu_to_node(cpu);
5597} 5651}
5652static void init_numa_sched_groups_power(struct sched_group *group_head)
5653{
5654 struct sched_group *sg = group_head;
5655 int j;
5656
5657 if (!sg)
5658 return;
5659next_sg:
5660 for_each_cpu_mask(j, sg->cpumask) {
5661 struct sched_domain *sd;
5662
5663 sd = &per_cpu(phys_domains, j);
5664 if (j != first_cpu(sd->groups->cpumask)) {
5665 /*
5666 * Only add "power" once for each
5667 * physical package.
5668 */
5669 continue;
5670 }
5671
5672 sg->cpu_power += sd->groups->cpu_power;
5673 }
5674 sg = sg->next;
5675 if (sg != group_head)
5676 goto next_sg;
5677}
5598#endif 5678#endif
5599 5679
5600/* 5680/*
@@ -5670,6 +5750,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5670 sd->parent = p; 5750 sd->parent = p;
5671 sd->groups = &sched_group_phys[group]; 5751 sd->groups = &sched_group_phys[group];
5672 5752
5753#ifdef CONFIG_SCHED_MC
5754 p = sd;
5755 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i);
5757 *sd = SD_MC_INIT;
5758 sd->span = cpu_coregroup_map(i);
5759 cpus_and(sd->span, sd->span, *cpu_map);
5760 sd->parent = p;
5761 sd->groups = &sched_group_core[group];
5762#endif
5763
5673#ifdef CONFIG_SCHED_SMT 5764#ifdef CONFIG_SCHED_SMT
5674 p = sd; 5765 p = sd;
5675 sd = &per_cpu(cpu_domains, i); 5766 sd = &per_cpu(cpu_domains, i);
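
The net effect on each CPU's domain stack, sketched from the code above (the NUMA levels apply only when configured):

/*
 *	cpu_domains[i]	- SMT siblings		(CONFIG_SCHED_SMT)
 *	core_domains[i]	- cores of one package	(new with CONFIG_SCHED_MC)
 *	phys_domains[i]	- CPUs of one node
 *	node/allnodes	- NUMA levels		(CONFIG_NUMA)
 *
 * Each level's ->parent points one step up this list, which is why the
 * code saves the previously built domain in 'p' before filling in the
 * next one.
 */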
@@ -5695,6 +5786,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5695 } 5786 }
5696#endif 5787#endif
5697 5788
5789#ifdef CONFIG_SCHED_MC
5790 /* Set up multi-core groups */
5791 for_each_cpu_mask(i, *cpu_map) {
5792 cpumask_t this_core_map = cpu_coregroup_map(i);
5793 cpus_and(this_core_map, this_core_map, *cpu_map);
5794 if (i != first_cpu(this_core_map))
5795 continue;
5796 init_sched_build_groups(sched_group_core, this_core_map,
5797 &cpu_to_core_group);
5798 }
5799#endif
5800
5801
5698 /* Set up physical groups */ 5802 /* Set up physical groups */
5699 for (i = 0; i < MAX_NUMNODES; i++) { 5803 for (i = 0; i < MAX_NUMNODES; i++) {
5700 cpumask_t nodemask = node_to_cpumask(i); 5804 cpumask_t nodemask = node_to_cpumask(i);
@@ -5791,51 +5895,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
5791 power = SCHED_LOAD_SCALE; 5895 power = SCHED_LOAD_SCALE;
5792 sd->groups->cpu_power = power; 5896 sd->groups->cpu_power = power;
5793#endif 5897#endif
5898#ifdef CONFIG_SCHED_MC
5899 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power;
5903
5904 sd = &per_cpu(phys_domains, i);
5794 5905
5906 /*
5907 * This has to be < 2 * SCHED_LOAD_SCALE
5908 * Let's keep it SCHED_LOAD_SCALE, so that
5909 * while calculating NUMA group's cpu_power
5910 * we can simply do
5911 * numa_group->cpu_power += phys_group->cpu_power;
5912 *
5913 * See "only add power once for each physical pkg"
5914 * comment below
5915 */
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else
5795 sd = &per_cpu(phys_domains, i); 5918 sd = &per_cpu(phys_domains, i);
5796 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5797 (cpus_weight(sd->groups->cpumask)-1) / 10; 5920 (cpus_weight(sd->groups->cpumask)-1) / 10;
5798 sd->groups->cpu_power = power; 5921 sd->groups->cpu_power = power;
5799
5800#ifdef CONFIG_NUMA
5801 sd = &per_cpu(allnodes_domains, i);
5802 if (sd->groups) {
5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5804 (cpus_weight(sd->groups->cpumask)-1) / 10;
5805 sd->groups->cpu_power = power;
5806 }
5807#endif 5922#endif
5808 } 5923 }
5809 5924
5810#ifdef CONFIG_NUMA 5925#ifdef CONFIG_NUMA
5811 for (i = 0; i < MAX_NUMNODES; i++) { 5926 for (i = 0; i < MAX_NUMNODES; i++)
5812 struct sched_group *sg = sched_group_nodes[i]; 5927 init_numa_sched_groups_power(sched_group_nodes[i]);
5813 int j;
5814
5815 if (sg == NULL)
5816 continue;
5817next_sg:
5818 for_each_cpu_mask(j, sg->cpumask) {
5819 struct sched_domain *sd;
5820 int power;
5821
5822 sd = &per_cpu(phys_domains, j);
5823 if (j != first_cpu(sd->groups->cpumask)) {
5824 /*
5825 * Only add "power" once for each
5826 * physical package.
5827 */
5828 continue;
5829 }
5830 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5831 (cpus_weight(sd->groups->cpumask)-1) / 10;
5832 5928
5833 sg->cpu_power += power; 5929 init_numa_sched_groups_power(sched_group_allnodes);
5834 }
5835 sg = sg->next;
5836 if (sg != sched_group_nodes[i])
5837 goto next_sg;
5838 }
5839#endif 5930#endif
5840 5931
5841 /* Attach the domains */ 5932 /* Attach the domains */
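
A worked example of how the per-level cpu_power values now compose (topology assumed purely for illustration: 2 threads/core, 2 cores/package, 2 packages/node):

/*
 * With CONFIG_SCHED_MC enabled:
 *	core group:  SCHED_LOAD_SCALE + (2 - 1) * SCHED_LOAD_SCALE / 10
 *	phys group:  SCHED_LOAD_SCALE		(pinned, see comment above)
 *	node group:  2 * SCHED_LOAD_SCALE	(one phys share per package,
 *						 added by init_numa_sched_groups_power())
 *
 * Without CONFIG_SCHED_MC the phys group keeps the old formula,
 *	SCHED_LOAD_SCALE + (cpus_in_package - 1) * SCHED_LOAD_SCALE / 10,
 * and the node group sums that value once per package instead.
 */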
@@ -5843,6 +5934,8 @@ next_sg:
5843 struct sched_domain *sd; 5934 struct sched_domain *sd;
5844#ifdef CONFIG_SCHED_SMT 5935#ifdef CONFIG_SCHED_SMT
5845 sd = &per_cpu(cpu_domains, i); 5936 sd = &per_cpu(cpu_domains, i);
5937#elif defined(CONFIG_SCHED_MC)
5938 sd = &per_cpu(core_domains, i);
5846#else 5939#else
5847 sd = &per_cpu(phys_domains, i); 5940 sd = &per_cpu(phys_domains, i);
5848#endif 5941#endif
@@ -6015,7 +6108,7 @@ void __init sched_init(void)
6015 runqueue_t *rq; 6108 runqueue_t *rq;
6016 int i, j, k; 6109 int i, j, k;
6017 6110
6018 for_each_cpu(i) { 6111 for_each_possible_cpu(i) {
6019 prio_array_t *array; 6112 prio_array_t *array;
6020 6113
6021 rq = cpu_rq(i); 6114 rq = cpu_rq(i);
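
For reference, the renamed iterator is used exactly like the old one; a minimal sketch (the per-CPU variable is hypothetical):

	static DEFINE_PER_CPU(unsigned long, example_counter);
	int cpu;

	for_each_possible_cpu(cpu)		/* was: for_each_cpu(cpu) */
		per_cpu(example_counter, cpu) = 0;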
diff --git a/kernel/signal.c b/kernel/signal.c
index 75f7341b0c39..92025b108791 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <linux/signal.h> 25#include <linux/signal.h>
27#include <linux/audit.h> 26#include <linux/audit.h>
28#include <linux/capability.h> 27#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
147#define sig_kernel_stop(sig) \ 146#define sig_kernel_stop(sig) \
148 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) 147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
149 148
149#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
150
150#define sig_user_defined(t, signr) \ 151#define sig_user_defined(t, signr) \
151 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ 152 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
152 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) 153 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
292 kmem_cache_free(sigqueue_cachep, q); 293 kmem_cache_free(sigqueue_cachep, q);
293} 294}
294 295
295static void flush_sigqueue(struct sigpending *queue) 296void flush_sigqueue(struct sigpending *queue)
296{ 297{
297 struct sigqueue *q; 298 struct sigqueue *q;
298 299
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
307/* 308/*
308 * Flush all pending signals for a task. 309 * Flush all pending signals for a task.
309 */ 310 */
310 311void flush_signals(struct task_struct *t)
311void
312flush_signals(struct task_struct *t)
313{ 312{
314 unsigned long flags; 313 unsigned long flags;
315 314
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
321} 320}
322 321
323/* 322/*
324 * This function expects the tasklist_lock write-locked.
325 */
326void __exit_sighand(struct task_struct *tsk)
327{
328 struct sighand_struct * sighand = tsk->sighand;
329
330 /* Ok, we're done with the signal handlers */
331 tsk->sighand = NULL;
332 if (atomic_dec_and_test(&sighand->count))
333 sighand_free(sighand);
334}
335
336void exit_sighand(struct task_struct *tsk)
337{
338 write_lock_irq(&tasklist_lock);
339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
347 write_unlock_irq(&tasklist_lock);
348}
349
350/*
351 * This function expects the tasklist_lock write-locked.
352 */
353void __exit_signal(struct task_struct *tsk)
354{
355 struct signal_struct * sig = tsk->signal;
356 struct sighand_struct * sighand;
357
358 if (!sig)
359 BUG();
360 if (!atomic_read(&sig->count))
361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
364 spin_lock(&sighand->siglock);
365 posix_cpu_timers_exit(tsk);
366 if (atomic_dec_and_test(&sig->count)) {
367 posix_cpu_timers_exit_group(tsk);
368 tsk->signal = NULL;
369 __exit_sighand(tsk);
370 spin_unlock(&sighand->siglock);
371 flush_sigqueue(&sig->shared_pending);
372 } else {
373 /*
374 * If there is any task waiting for the group exit
375 * then notify it:
376 */
377 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
378 wake_up_process(sig->group_exit_task);
379 sig->group_exit_task = NULL;
380 }
381 if (tsk == sig->curr_target)
382 sig->curr_target = next_thread(tsk);
383 tsk->signal = NULL;
384 /*
385 * Accumulate here the counters for all threads but the
386 * group leader as they die, so they can be added into
387 * the process-wide totals when those are taken.
388 * The group leader stays around as a zombie as long
389 * as there are other threads. When it gets reaped,
390 * the exit.c code will add its counts into these totals.
391 * We won't ever get here for the group leader, since it
392 * will have been the last reference on the signal_struct.
393 */
394 sig->utime = cputime_add(sig->utime, tsk->utime);
395 sig->stime = cputime_add(sig->stime, tsk->stime);
396 sig->min_flt += tsk->min_flt;
397 sig->maj_flt += tsk->maj_flt;
398 sig->nvcsw += tsk->nvcsw;
399 sig->nivcsw += tsk->nivcsw;
400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
402 spin_unlock(&sighand->siglock);
403 sig = NULL; /* Marker for below. */
404 }
405 rcu_read_unlock();
406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
407 flush_sigqueue(&tsk->pending);
408 if (sig) {
409 /*
410 * We are cleaning up the signal_struct here.
411 */
412 exit_thread_group_keys(sig);
413 kmem_cache_free(signal_cachep, sig);
414 }
415}
416
417void exit_signal(struct task_struct *tsk)
418{
419 atomic_dec(&tsk->signal->live);
420
421 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock);
424}
425
426/*
427 * Flush all handlers for a task. 323 * Flush all handlers for a task.
428 */ 324 */
429 325
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
695} 591}
696 592
697/* forward decl */ 593/* forward decl */
698static void do_notify_parent_cldstop(struct task_struct *tsk, 594static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
699 int to_self,
700 int why);
701 595
702/* 596/*
703 * Handle magic process-wide effects of stop/continue signals. 597 * Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
747 p->signal->group_stop_count = 0; 641 p->signal->group_stop_count = 0;
748 p->signal->flags = SIGNAL_STOP_CONTINUED; 642 p->signal->flags = SIGNAL_STOP_CONTINUED;
749 spin_unlock(&p->sighand->siglock); 643 spin_unlock(&p->sighand->siglock);
750 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); 644 do_notify_parent_cldstop(p, CLD_STOPPED);
751 spin_lock(&p->sighand->siglock); 645 spin_lock(&p->sighand->siglock);
752 } 646 }
753 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 647 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
788 p->signal->flags = SIGNAL_STOP_CONTINUED; 682 p->signal->flags = SIGNAL_STOP_CONTINUED;
789 p->signal->group_exit_code = 0; 683 p->signal->group_exit_code = 0;
790 spin_unlock(&p->sighand->siglock); 684 spin_unlock(&p->sighand->siglock);
791 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); 685 do_notify_parent_cldstop(p, CLD_CONTINUED);
792 spin_lock(&p->sighand->siglock); 686 spin_lock(&p->sighand->siglock);
793 } else { 687 } else {
794 /* 688 /*
@@ -1120,27 +1014,37 @@ void zap_other_threads(struct task_struct *p)
1120/* 1014/*
1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1015 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1122 */ 1016 */
1017struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1018{
1019 struct sighand_struct *sighand;
1020
1021 for (;;) {
1022 sighand = rcu_dereference(tsk->sighand);
1023 if (unlikely(sighand == NULL))
1024 break;
1025
1026 spin_lock_irqsave(&sighand->siglock, *flags);
1027 if (likely(sighand == tsk->sighand))
1028 break;
1029 spin_unlock_irqrestore(&sighand->siglock, *flags);
1030 }
1031
1032 return sighand;
1033}
1034
1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1035int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1124{ 1036{
1125 unsigned long flags; 1037 unsigned long flags;
1126 struct sighand_struct *sp;
1127 int ret; 1038 int ret;
1128 1039
1129retry:
1130 ret = check_kill_permission(sig, info, p); 1040 ret = check_kill_permission(sig, info, p);
1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) { 1041
1132 spin_lock_irqsave(&sp->siglock, flags); 1042 if (!ret && sig) {
1133 if (p->sighand != sp) { 1043 ret = -ESRCH;
1134 spin_unlock_irqrestore(&sp->siglock, flags); 1044 if (lock_task_sighand(p, &flags)) {
1135 goto retry; 1045 ret = __group_send_sig_info(sig, info, p);
1136 } 1046 unlock_task_sighand(p, &flags);
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 } 1047 }
1142 ret = __group_send_sig_info(sig, info, p);
1143 spin_unlock_irqrestore(&sp->siglock, flags);
1144 } 1048 }
1145 1049
1146 return ret; 1050 return ret;
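
The new helper gives signal senders one pattern for pinning a task's sighand_struct against exec()/exit races; a minimal usage sketch modeled on the rewritten group_send_sig_info() above (unlock_task_sighand() is the matching release used there):

	unsigned long flags;
	int ret = -ESRCH;

	if (lock_task_sighand(p, &flags)) {
		/* p->sighand is pinned and ->siglock is held here */
		ret = __group_send_sig_info(sig, info, p);
		unlock_task_sighand(p, &flags);
	}
	/* ret stays -ESRCH if the task already went through __exit_signal() */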
@@ -1189,7 +1093,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1189 struct task_struct *p; 1093 struct task_struct *p;
1190 1094
1191 rcu_read_lock(); 1095 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { 1096 if (unlikely(sig_needs_tasklist(sig))) {
1193 read_lock(&tasklist_lock); 1097 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1; 1098 acquired_tasklist_lock = 1;
1195 } 1099 }
@@ -1405,12 +1309,10 @@ void sigqueue_free(struct sigqueue *q)
1405 __sigqueue_free(q); 1309 __sigqueue_free(q);
1406} 1310}
1407 1311
1408int 1312int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1409send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1410{ 1313{
1411 unsigned long flags; 1314 unsigned long flags;
1412 int ret = 0; 1315 int ret = 0;
1413 struct sighand_struct *sh;
1414 1316
1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1317 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1416 1318
@@ -1424,48 +1326,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1424 */ 1326 */
1425 rcu_read_lock(); 1327 rcu_read_lock();
1426 1328
1427 if (unlikely(p->flags & PF_EXITING)) { 1329 if (!likely(lock_task_sighand(p, &flags))) {
1428 ret = -1; 1330 ret = -1;
1429 goto out_err; 1331 goto out_err;
1430 } 1332 }
1431 1333
1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1461
1462 if (unlikely(!list_empty(&q->list))) { 1334 if (unlikely(!list_empty(&q->list))) {
1463 /* 1335 /*
1464 * If an SI_TIMER entry is already queued, just increment 1336
1465 * the overrun count. 1337 * the overrun count.
1466 */ 1338 */
1467 if (q->info.si_code != SI_TIMER) 1339 BUG_ON(q->info.si_code != SI_TIMER);
1468 BUG();
1469 q->info.si_overrun++; 1340 q->info.si_overrun++;
1470 goto out; 1341 goto out;
1471 } 1342 }
@@ -1481,7 +1352,7 @@ retry:
1481 signal_wake_up(p, sig == SIGKILL); 1352 signal_wake_up(p, sig == SIGKILL);
1482 1353
1483out: 1354out:
1484 spin_unlock_irqrestore(&sh->siglock, flags); 1355 unlock_task_sighand(p, &flags);
1485out_err: 1356out_err:
1486 rcu_read_unlock(); 1357 rcu_read_unlock();
1487 1358
@@ -1613,14 +1484,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1613 spin_unlock_irqrestore(&psig->siglock, flags); 1484 spin_unlock_irqrestore(&psig->siglock, flags);
1614} 1485}
1615 1486
1616static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) 1487static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1617{ 1488{
1618 struct siginfo info; 1489 struct siginfo info;
1619 unsigned long flags; 1490 unsigned long flags;
1620 struct task_struct *parent; 1491 struct task_struct *parent;
1621 struct sighand_struct *sighand; 1492 struct sighand_struct *sighand;
1622 1493
1623 if (to_self) 1494 if (tsk->ptrace & PT_PTRACED)
1624 parent = tsk->parent; 1495 parent = tsk->parent;
1625 else { 1496 else {
1626 tsk = tsk->group_leader; 1497 tsk = tsk->group_leader;
@@ -1689,13 +1560,14 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1689 /* Let the debugger run. */ 1560 /* Let the debugger run. */
1690 set_current_state(TASK_TRACED); 1561 set_current_state(TASK_TRACED);
1691 spin_unlock_irq(&current->sighand->siglock); 1562 spin_unlock_irq(&current->sighand->siglock);
1563 try_to_freeze();
1692 read_lock(&tasklist_lock); 1564 read_lock(&tasklist_lock);
1693 if (likely(current->ptrace & PT_PTRACED) && 1565 if (likely(current->ptrace & PT_PTRACED) &&
1694 likely(current->parent != current->real_parent || 1566 likely(current->parent != current->real_parent ||
1695 !(current->ptrace & PT_ATTACHED)) && 1567 !(current->ptrace & PT_ATTACHED)) &&
1696 (likely(current->parent->signal != current->signal) || 1568 (likely(current->parent->signal != current->signal) ||
1697 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1569 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1698 do_notify_parent_cldstop(current, 1, CLD_TRAPPED); 1570 do_notify_parent_cldstop(current, CLD_TRAPPED);
1699 read_unlock(&tasklist_lock); 1571 read_unlock(&tasklist_lock);
1700 schedule(); 1572 schedule();
1701 } else { 1573 } else {
@@ -1744,25 +1616,17 @@ void ptrace_notify(int exit_code)
1744static void 1616static void
1745finish_stop(int stop_count) 1617finish_stop(int stop_count)
1746{ 1618{
1747 int to_self;
1748
1749 /* 1619 /*
1750 * If there are no other threads in the group, or if there is 1620 * If there are no other threads in the group, or if there is
1751 * a group stop in progress and we are the last to stop, 1621 * a group stop in progress and we are the last to stop,
1752 * report to the parent. When ptraced, every thread reports itself. 1622 * report to the parent. When ptraced, every thread reports itself.
1753 */ 1623 */
1754 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) 1624 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
1755 to_self = 1; 1625 read_lock(&tasklist_lock);
1756 else if (stop_count == 0) 1626 do_notify_parent_cldstop(current, CLD_STOPPED);
1757 to_self = 0; 1627 read_unlock(&tasklist_lock);
1758 else 1628 }
1759 goto out;
1760
1761 read_lock(&tasklist_lock);
1762 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1763 read_unlock(&tasklist_lock);
1764 1629
1765out:
1766 schedule(); 1630 schedule();
1767 /* 1631 /*
1768 * Now we don't run again until continued. 1632 * Now we don't run again until continued.
@@ -1776,12 +1640,10 @@ out:
1776 * Returns nonzero if we've actually stopped and released the siglock. 1640 * Returns nonzero if we've actually stopped and released the siglock.
1777 * Returns zero if we didn't stop and still hold the siglock. 1641 * Returns zero if we didn't stop and still hold the siglock.
1778 */ 1642 */
1779static int 1643static int do_signal_stop(int signr)
1780do_signal_stop(int signr)
1781{ 1644{
1782 struct signal_struct *sig = current->signal; 1645 struct signal_struct *sig = current->signal;
1783 struct sighand_struct *sighand = current->sighand; 1646 int stop_count;
1784 int stop_count = -1;
1785 1647
1786 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) 1648 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1787 return 0; 1649 return 0;
@@ -1791,86 +1653,37 @@ do_signal_stop(int signr)
1791 * There is a group stop in progress. We don't need to 1653 * There is a group stop in progress. We don't need to
1792 * start another one. 1654 * start another one.
1793 */ 1655 */
1794 signr = sig->group_exit_code;
1795 stop_count = --sig->group_stop_count; 1656 stop_count = --sig->group_stop_count;
1796 current->exit_code = signr; 1657 } else {
1797 set_current_state(TASK_STOPPED);
1798 if (stop_count == 0)
1799 sig->flags = SIGNAL_STOP_STOPPED;
1800 spin_unlock_irq(&sighand->siglock);
1801 }
1802 else if (thread_group_empty(current)) {
1803 /*
1804 * Lock must be held through transition to stopped state.
1805 */
1806 current->exit_code = current->signal->group_exit_code = signr;
1807 set_current_state(TASK_STOPPED);
1808 sig->flags = SIGNAL_STOP_STOPPED;
1809 spin_unlock_irq(&sighand->siglock);
1810 }
1811 else {
1812 /* 1658 /*
1813 * There is no group stop already in progress. 1659 * There is no group stop already in progress.
1814 * We must initiate one now, but that requires 1660 * We must initiate one now.
1815 * dropping siglock to get both the tasklist lock
1816 * and siglock again in the proper order. Note that
1817 * this allows an intervening SIGCONT to be posted.
1818 * We need to check for that and bail out if necessary.
1819 */ 1661 */
1820 struct task_struct *t; 1662 struct task_struct *t;
1821 1663
1822 spin_unlock_irq(&sighand->siglock); 1664 sig->group_exit_code = signr;
1823
1824 /* signals can be posted during this window */
1825 1665
1826 read_lock(&tasklist_lock); 1666 stop_count = 0;
1827 spin_lock_irq(&sighand->siglock); 1667 for (t = next_thread(current); t != current; t = next_thread(t))
1828
1829 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
1830 /* 1668 /*
1831 * Another stop or continue happened while we 1669 * Setting state to TASK_STOPPED for a group
1832 * didn't have the lock. We can just swallow this 1670 * stop is always done with the siglock held,
1833 * signal now. If we raced with a SIGCONT, that 1671 * so this check has no races.
1834 * should have just cleared it now. If we raced
1835 * with another processor delivering a stop signal,
1836 * then the SIGCONT that wakes us up should clear it.
1837 */ 1672 */
1838 read_unlock(&tasklist_lock); 1673 if (!t->exit_state &&
1839 return 0; 1674 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1840 } 1675 stop_count++;
1841 1676 signal_wake_up(t, 0);
1842 if (sig->group_stop_count == 0) { 1677 }
1843 sig->group_exit_code = signr; 1678 sig->group_stop_count = stop_count;
1844 stop_count = 0;
1845 for (t = next_thread(current); t != current;
1846 t = next_thread(t))
1847 /*
1848 * Setting state to TASK_STOPPED for a group
1849 * stop is always done with the siglock held,
1850 * so this check has no races.
1851 */
1852 if (!t->exit_state &&
1853 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1854 stop_count++;
1855 signal_wake_up(t, 0);
1856 }
1857 sig->group_stop_count = stop_count;
1858 }
1859 else {
1860 /* A race with another thread while unlocked. */
1861 signr = sig->group_exit_code;
1862 stop_count = --sig->group_stop_count;
1863 }
1864
1865 current->exit_code = signr;
1866 set_current_state(TASK_STOPPED);
1867 if (stop_count == 0)
1868 sig->flags = SIGNAL_STOP_STOPPED;
1869
1870 spin_unlock_irq(&sighand->siglock);
1871 read_unlock(&tasklist_lock);
1872 } 1679 }
1873 1680
1681 if (stop_count == 0)
1682 sig->flags = SIGNAL_STOP_STOPPED;
1683 current->exit_code = sig->group_exit_code;
1684 __set_current_state(TASK_STOPPED);
1685
1686 spin_unlock_irq(&current->sighand->siglock);
1874 finish_stop(stop_count); 1687 finish_stop(stop_count);
1875 return 1; 1688 return 1;
1876} 1689}
@@ -1990,7 +1803,7 @@ relock:
1990 continue; 1803 continue;
1991 1804
1992 /* Init gets no signals it doesn't want. */ 1805 /* Init gets no signals it doesn't want. */
1993 if (current->pid == 1) 1806 if (current == child_reaper)
1994 continue; 1807 continue;
1995 1808
1996 if (sig_kernel_stop(signr)) { 1809 if (sig_kernel_stop(signr)) {
@@ -2430,8 +2243,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2430 return kill_proc_info(sig, &info, pid); 2243 return kill_proc_info(sig, &info, pid);
2431} 2244}
2432 2245
2433int 2246int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2434do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2435{ 2247{
2436 struct k_sigaction *k; 2248 struct k_sigaction *k;
2437 sigset_t mask; 2249 sigset_t mask;
@@ -2457,6 +2269,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2457 if (act) { 2269 if (act) {
2458 sigdelsetmask(&act->sa.sa_mask, 2270 sigdelsetmask(&act->sa.sa_mask,
2459 sigmask(SIGKILL) | sigmask(SIGSTOP)); 2271 sigmask(SIGKILL) | sigmask(SIGSTOP));
2272 *k = *act;
2460 /* 2273 /*
2461 * POSIX 3.3.1.3: 2274 * POSIX 3.3.1.3:
2462 * "Setting a signal action to SIG_IGN for a signal that is 2275 * "Setting a signal action to SIG_IGN for a signal that is
@@ -2469,19 +2282,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2469 * be discarded, whether or not it is blocked" 2282 * be discarded, whether or not it is blocked"
2470 */ 2283 */
2471 if (act->sa.sa_handler == SIG_IGN || 2284 if (act->sa.sa_handler == SIG_IGN ||
2472 (act->sa.sa_handler == SIG_DFL && 2285 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2473 sig_kernel_ignore(sig))) {
2474 /*
2475 * This is a fairly rare case, so we only take the
2476 * tasklist_lock once we're sure we'll need it.
2477 * Now we must do this little unlock and relock
2478 * dance to maintain the lock hierarchy.
2479 */
2480 struct task_struct *t = current; 2286 struct task_struct *t = current;
2481 spin_unlock_irq(&t->sighand->siglock);
2482 read_lock(&tasklist_lock);
2483 spin_lock_irq(&t->sighand->siglock);
2484 *k = *act;
2485 sigemptyset(&mask); 2287 sigemptyset(&mask);
2486 sigaddset(&mask, sig); 2288 sigaddset(&mask, sig);
2487 rm_from_queue_full(&mask, &t->signal->shared_pending); 2289 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2490,12 +2292,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2490 recalc_sigpending_tsk(t); 2292 recalc_sigpending_tsk(t);
2491 t = next_thread(t); 2293 t = next_thread(t);
2492 } while (t != current); 2294 } while (t != current);
2493 spin_unlock_irq(&current->sighand->siglock);
2494 read_unlock(&tasklist_lock);
2495 return 0;
2496 } 2295 }
2497
2498 *k = *act;
2499 } 2296 }
2500 2297
2501 spin_unlock_irq(&current->sighand->siglock); 2298 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9b3d5847ed8..ced91e1ff564 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -152,5 +152,5 @@ __init void spawn_softlockup_task(void)
152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
153 register_cpu_notifier(&cpu_nfb); 153 register_cpu_notifier(&cpu_nfb);
154 154
155 notifier_chain_register(&panic_notifier_list, &panic_block); 155 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
156} 156}
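
For callers the conversion is mostly mechanical; a sketch of registering on the now-atomic panic chain (callback and block names are hypothetical):

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int example_panic_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	/* atomic chain: called under RCU, must not sleep */
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_block = {
	.notifier_call	= example_panic_event,
};

static int __init example_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &example_panic_block);
	return 0;
}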
diff --git a/kernel/sys.c b/kernel/sys.c
index 38bc73ede2ba..0b6ec0e7936f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
95 * and the like. 95 * and the like.
96 */ 96 */
97 97
98static struct notifier_block *reboot_notifier_list; 98static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
99static DEFINE_RWLOCK(notifier_lock); 99
100/*
101 * Notifier chain core routines. The exported routines below
102 * are layered on top of these, with appropriate locking added.
103 */
104
105static int notifier_chain_register(struct notifier_block **nl,
106 struct notifier_block *n)
107{
108 while ((*nl) != NULL) {
109 if (n->priority > (*nl)->priority)
110 break;
111 nl = &((*nl)->next);
112 }
113 n->next = *nl;
114 rcu_assign_pointer(*nl, n);
115 return 0;
116}
117
118static int notifier_chain_unregister(struct notifier_block **nl,
119 struct notifier_block *n)
120{
121 while ((*nl) != NULL) {
122 if ((*nl) == n) {
123 rcu_assign_pointer(*nl, n->next);
124 return 0;
125 }
126 nl = &((*nl)->next);
127 }
128 return -ENOENT;
129}
130
131static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v)
133{
134 int ret = NOTIFY_DONE;
135 struct notifier_block *nb;
136
137 nb = rcu_dereference(*nl);
138 while (nb) {
139 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break;
142 nb = rcu_dereference(nb->next);
143 }
144 return ret;
145}
146
147/*
148 * Atomic notifier chain routines. Registration and unregistration
149 * use a spinlock, and call_chain is synchronized by RCU (no locks).
150 */
100 151
101/** 152/**
102 * notifier_chain_register - Add notifier to a notifier chain 153 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
103 * @list: Pointer to root list pointer 154 * @nh: Pointer to head of the atomic notifier chain
104 * @n: New entry in notifier chain 155 * @n: New entry in notifier chain
105 * 156 *
106 * Adds a notifier to a notifier chain. 157 * Adds a notifier to an atomic notifier chain.
107 * 158 *
108 * Currently always returns zero. 159 * Currently always returns zero.
109 */ 160 */
161
162int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
163 struct notifier_block *n)
164{
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&nh->lock, flags);
169 ret = notifier_chain_register(&nh->head, n);
170 spin_unlock_irqrestore(&nh->lock, flags);
171 return ret;
172}
173
174EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
175
176/**
177 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
178 * @nh: Pointer to head of the atomic notifier chain
179 * @n: Entry to remove from notifier chain
180 *
181 * Removes a notifier from an atomic notifier chain.
182 *
183 * Returns zero on success or %-ENOENT on failure.
184 */
185int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
186 struct notifier_block *n)
187{
188 unsigned long flags;
189 int ret;
190
191 spin_lock_irqsave(&nh->lock, flags);
192 ret = notifier_chain_unregister(&nh->head, n);
193 spin_unlock_irqrestore(&nh->lock, flags);
194 synchronize_rcu();
195 return ret;
196}
197
198EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
199
200/**
201 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
202 * @nh: Pointer to head of the atomic notifier chain
203 * @val: Value passed unmodified to notifier function
204 * @v: Pointer passed unmodified to notifier function
205 *
206 * Calls each function in a notifier chain in turn. The functions
207 * run in an atomic context, so they must not block.
208 * This routine uses RCU to synchronize with changes to the chain.
209 *
210 * If the return value of the notifier can be and'ed
211 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
212 * will return immediately, with the return value of
213 * the notifier function which halted execution.
214 * Otherwise the return value is the return value
215 * of the last notifier function called.
216 */
110 217
111int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 218int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
219 unsigned long val, void *v)
112{ 220{
113 write_lock(&notifier_lock); 221 int ret;
114 while(*list) 222
115 { 223 rcu_read_lock();
116 if(n->priority > (*list)->priority) 224 ret = notifier_call_chain(&nh->head, val, v);
117 break; 225 rcu_read_unlock();
118 list= &((*list)->next); 226 return ret;
119 }
120 n->next = *list;
121 *list=n;
122 write_unlock(&notifier_lock);
123 return 0;
124} 227}
125 228
126EXPORT_SYMBOL(notifier_chain_register); 229EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
230
231/*
232 * Blocking notifier chain routines. All access to the chain is
233 * synchronized by an rwsem.
234 */
127 235
128/** 236/**
129 * notifier_chain_unregister - Remove notifier from a notifier chain 237 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
130 * @nl: Pointer to root list pointer 238 * @nh: Pointer to head of the blocking notifier chain
131 * @n: New entry in notifier chain 239 * @n: New entry in notifier chain
132 * 240 *
133 * Removes a notifier from a notifier chain. 241 * Adds a notifier to a blocking notifier chain.
242 * Must be called in process context.
134 * 243 *
135 * Returns zero on success, or %-ENOENT on failure. 244 * Currently always returns zero.
136 */ 245 */
137 246
138int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 247int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
248 struct notifier_block *n)
139{ 249{
140 write_lock(&notifier_lock); 250 int ret;
141 while((*nl)!=NULL) 251
142 { 252 /*
143 if((*nl)==n) 253 * This code gets used during boot-up, when task switching is
144 { 254 * not yet working and interrupts must remain disabled. At
145 *nl=n->next; 255 * such times we must not call down_write().
146 write_unlock(&notifier_lock); 256 */
147 return 0; 257 if (unlikely(system_state == SYSTEM_BOOTING))
148 } 258 return notifier_chain_register(&nh->head, n);
149 nl=&((*nl)->next); 259
150 } 260 down_write(&nh->rwsem);
151 write_unlock(&notifier_lock); 261 ret = notifier_chain_register(&nh->head, n);
152 return -ENOENT; 262 up_write(&nh->rwsem);
263 return ret;
153} 264}
154 265
155EXPORT_SYMBOL(notifier_chain_unregister); 266EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
156 267
157/** 268/**
158 * notifier_call_chain - Call functions in a notifier chain 269 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
159 * @n: Pointer to root pointer of notifier chain 270 * @nh: Pointer to head of the blocking notifier chain
271 * @n: Entry to remove from notifier chain
272 *
273 * Removes a notifier from a blocking notifier chain.
274 * Must be called from process context.
275 *
276 * Returns zero on success or %-ENOENT on failure.
277 */
278int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
279 struct notifier_block *n)
280{
281 int ret;
282
283 /*
284 * This code gets used during boot-up, when task switching is
285 * not yet working and interrupts must remain disabled. At
286 * such times we must not call down_write().
287 */
288 if (unlikely(system_state == SYSTEM_BOOTING))
289 return notifier_chain_unregister(&nh->head, n);
290
291 down_write(&nh->rwsem);
292 ret = notifier_chain_unregister(&nh->head, n);
293 up_write(&nh->rwsem);
294 return ret;
295}
296
297EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
298
299/**
300 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
301 * @nh: Pointer to head of the blocking notifier chain
160 * @val: Value passed unmodified to notifier function 302 * @val: Value passed unmodified to notifier function
161 * @v: Pointer passed unmodified to notifier function 303 * @v: Pointer passed unmodified to notifier function
162 * 304 *
163 * Calls each function in a notifier chain in turn. 305 * Calls each function in a notifier chain in turn. The functions
306 * run in a process context, so they are allowed to block.
164 * 307 *
165 * If the return value of the notifier can be and'd 308 * If the return value of the notifier can be and'ed
166 * with %NOTIFY_STOP_MASK, then notifier_call_chain 309 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
167 * will return immediately, with the return value of 310 * will return immediately, with the return value of
168 * the notifier function which halted execution. 311 * the notifier function which halted execution.
169 * Otherwise, the return value is the return value 312 * Otherwise the return value is the return value
170 * of the last notifier function called. 313 * of the last notifier function called.
171 */ 314 */
172 315
173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 316int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
317 unsigned long val, void *v)
174{ 318{
175 int ret=NOTIFY_DONE; 319 int ret;
176 struct notifier_block *nb = *n;
177 320
178 while(nb) 321 down_read(&nh->rwsem);
179 { 322 ret = notifier_call_chain(&nh->head, val, v);
180 ret=nb->notifier_call(nb,val,v); 323 up_read(&nh->rwsem);
181 if(ret&NOTIFY_STOP_MASK)
182 {
183 return ret;
184 }
185 nb=nb->next;
186 }
187 return ret; 324 return ret;
188} 325}
189 326
190EXPORT_SYMBOL(notifier_call_chain); 327EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
328
329/*
330 * Raw notifier chain routines. There is no protection;
331 * the caller must provide it. Use at your own risk!
332 */
333
334/**
335 * raw_notifier_chain_register - Add notifier to a raw notifier chain
336 * @nh: Pointer to head of the raw notifier chain
337 * @n: New entry in notifier chain
338 *
339 * Adds a notifier to a raw notifier chain.
340 * All locking must be provided by the caller.
341 *
342 * Currently always returns zero.
343 */
344
345int raw_notifier_chain_register(struct raw_notifier_head *nh,
346 struct notifier_block *n)
347{
348 return notifier_chain_register(&nh->head, n);
349}
350
351EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
352
353/**
354 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
355 * @nh: Pointer to head of the raw notifier chain
356 * @n: Entry to remove from notifier chain
357 *
358 * Removes a notifier from a raw notifier chain.
359 * All locking must be provided by the caller.
360 *
361 * Returns zero on success or %-ENOENT on failure.
362 */
363int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
364 struct notifier_block *n)
365{
366 return notifier_chain_unregister(&nh->head, n);
367}
368
369EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
370
371/**
372 * raw_notifier_call_chain - Call functions in a raw notifier chain
373 * @nh: Pointer to head of the raw notifier chain
374 * @val: Value passed unmodified to notifier function
375 * @v: Pointer passed unmodified to notifier function
376 *
377 * Calls each function in a notifier chain in turn. The functions
378 * run in an undefined context.
379 * All locking must be provided by the caller.
380 *
381 * If the return value of the notifier can be and'ed
382 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
383 * will return immediately, with the return value of
384 * the notifier function which halted execution.
385 * Otherwise the return value is the return value
386 * of the last notifier function called.
387 */
388
389int raw_notifier_call_chain(struct raw_notifier_head *nh,
390 unsigned long val, void *v)
391{
392 return notifier_call_chain(&nh->head, val, v);
393}
394
395EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
191 396
192/** 397/**
193 * register_reboot_notifier - Register function to be called at reboot time 398 * register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
196 * Registers a function with the list of functions 401 * Registers a function with the list of functions
197 * to be called at reboot time. 402 * to be called at reboot time.
198 * 403 *
199 * Currently always returns zero, as notifier_chain_register 404 * Currently always returns zero, as blocking_notifier_chain_register
200 * always returns zero. 405 * always returns zero.
201 */ 406 */
202 407
203int register_reboot_notifier(struct notifier_block * nb) 408int register_reboot_notifier(struct notifier_block * nb)
204{ 409{
205 return notifier_chain_register(&reboot_notifier_list, nb); 410 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
206} 411}
207 412
208EXPORT_SYMBOL(register_reboot_notifier); 413EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,7 +424,7 @@ EXPORT_SYMBOL(register_reboot_notifier);
219 424
220int unregister_reboot_notifier(struct notifier_block * nb) 425int unregister_reboot_notifier(struct notifier_block * nb)
221{ 426{
222 return notifier_chain_unregister(&reboot_notifier_list, nb); 427 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
223} 428}
224 429
225EXPORT_SYMBOL(unregister_reboot_notifier); 430EXPORT_SYMBOL(unregister_reboot_notifier);
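
Reboot notifiers keep their external interface and are now backed by the blocking chain; a sketch of a typical user (names hypothetical):

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int example_reboot_event(struct notifier_block *nb,
				unsigned long event, void *unused)
{
	/* blocking chain: process context, sleeping is allowed */
	if (event == SYS_RESTART || event == SYS_HALT || event == SYS_POWER_OFF)
		;	/* quiesce hypothetical hardware here */
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_block = {
	.notifier_call	= example_reboot_event,
};

static int __init example_reboot_init(void)
{
	return register_reboot_notifier(&example_reboot_block);
}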
@@ -380,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
380 585
381void kernel_restart_prepare(char *cmd) 586void kernel_restart_prepare(char *cmd)
382{ 587{
383 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
384 system_state = SYSTEM_RESTART; 589 system_state = SYSTEM_RESTART;
385 device_shutdown(); 590 device_shutdown();
386} 591}
@@ -430,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
430 635
431void kernel_shutdown_prepare(enum system_states state) 636void kernel_shutdown_prepare(enum system_states state)
432{ 637{
433 notifier_call_chain(&reboot_notifier_list, 638 blocking_notifier_call_chain(&reboot_notifier_list,
434 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 639 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
435 system_state = state; 640 system_state = state;
436 device_shutdown(); 641 device_shutdown();
@@ -997,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
997 */ 1202 */
998 if (tbuf) { 1203 if (tbuf) {
999 struct tms tmp; 1204 struct tms tmp;
1205 struct task_struct *tsk = current;
1206 struct task_struct *t;
1000 cputime_t utime, stime, cutime, cstime; 1207 cputime_t utime, stime, cutime, cstime;
1001 1208
1002#ifdef CONFIG_SMP 1209 spin_lock_irq(&tsk->sighand->siglock);
1003 if (thread_group_empty(current)) { 1210 utime = tsk->signal->utime;
1004 /* 1211 stime = tsk->signal->stime;
1005 * Single thread case without the use of any locks. 1212 t = tsk;
1006 * 1213 do {
1007 * We may race with release_task if two threads are 1214 utime = cputime_add(utime, t->utime);
1008 * executing. However, release task first adds up the 1215 stime = cputime_add(stime, t->stime);
1009 * counters (__exit_signal) before removing the task 1216 t = next_thread(t);
1010 * from the process tasklist (__unhash_process). 1217 } while (t != tsk);
1011 * __exit_signal also acquires and releases the
1012 * siglock which results in the proper memory ordering
1013 * so that the list modifications are always visible
1014 * after the counters have been updated.
1015 *
1016 * If the counters have been updated by the second thread
1017 * but the thread has not yet been removed from the list
1018 * then the other branch will be executing which will
1019 * block on tasklist_lock until the exit handling of the
1020 * other task is finished.
1021 *
1022 * This also implies that the sighand->siglock cannot
1023 * be held by another processor. So we can also
1024 * skip acquiring that lock.
1025 */
1026 utime = cputime_add(current->signal->utime, current->utime);
1027 stime = cputime_add(current->signal->utime, current->stime);
1028 cutime = current->signal->cutime;
1029 cstime = current->signal->cstime;
1030 } else
1031#endif
1032 {
1033
1034 /* Process with multiple threads */
1035 struct task_struct *tsk = current;
1036 struct task_struct *t;
1037 1218
1038 read_lock(&tasklist_lock); 1219 cutime = tsk->signal->cutime;
1039 utime = tsk->signal->utime; 1220 cstime = tsk->signal->cstime;
1040 stime = tsk->signal->stime; 1221 spin_unlock_irq(&tsk->sighand->siglock);
1041 t = tsk;
1042 do {
1043 utime = cputime_add(utime, t->utime);
1044 stime = cputime_add(stime, t->stime);
1045 t = next_thread(t);
1046 } while (t != tsk);
1047 1222
1048 /*
1049 * While we have tasklist_lock read-locked, no dying thread
1050 * can be updating current->signal->[us]time. Instead,
1051 * we got their counts included in the live thread loop.
1052 * However, another thread can come in right now and
1053 * do a wait call that updates current->signal->c[us]time.
1054 * To make sure we always see that pair updated atomically,
1055 * we take the siglock around fetching them.
1056 */
1057 spin_lock_irq(&tsk->sighand->siglock);
1058 cutime = tsk->signal->cutime;
1059 cstime = tsk->signal->cstime;
1060 spin_unlock_irq(&tsk->sighand->siglock);
1061 read_unlock(&tasklist_lock);
1062 }
1063 tmp.tms_utime = cputime_to_clock_t(utime); 1223 tmp.tms_utime = cputime_to_clock_t(utime);
1064 tmp.tms_stime = cputime_to_clock_t(stime); 1224 tmp.tms_stime = cputime_to_clock_t(stime);
1065 tmp.tms_cutime = cputime_to_clock_t(cutime); 1225 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1212,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
1212asmlinkage long sys_setsid(void) 1372asmlinkage long sys_setsid(void)
1213{ 1373{
1214 struct task_struct *group_leader = current->group_leader; 1374 struct task_struct *group_leader = current->group_leader;
1215 struct pid *pid; 1375 pid_t session;
1216 int err = -EPERM; 1376 int err = -EPERM;
1217 1377
1218 mutex_lock(&tty_mutex); 1378 mutex_lock(&tty_mutex);
1219 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1220 1380
1221 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 /* Fail if I am already a session leader */
1222 if (pid) 1382 if (group_leader->signal->leader)
1383 goto out;
1384
1385 session = group_leader->pid;
1386 /* Fail if a process group id already exists that equals the
1387 * proposed session id.
1388 *
1389 * Don't check if session id == 1, because kernel threads use this
1390 * session id; the check would always fail there and stop init from
1391 * successfully calling setsid.
1392 */
1393 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
1223 goto out; 1394 goto out;
1224 1395
1225 group_leader->signal->leader = 1; 1396 group_leader->signal->leader = 1;
1226 __set_special_pids(group_leader->pid, group_leader->pid); 1397 __set_special_pids(session, session);
1227 group_leader->signal->tty = NULL; 1398 group_leader->signal->tty = NULL;
1228 group_leader->signal->tty_old_pgrp = 0; 1399 group_leader->signal->tty_old_pgrp = 0;
1229 err = process_group(group_leader); 1400 err = process_group(group_leader);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6b1..d82864c4a617 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
diff --git a/kernel/time.c b/kernel/time.c
index e00a97b77241..ff8e7019c4c4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -610,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
610 * 610 *
611 * Returns the timespec representation of the nsec parameter. 611 * Returns the timespec representation of the nsec parameter.
612 */ 612 */
613struct timespec ns_to_timespec(const nsec_t nsec) 613struct timespec ns_to_timespec(const s64 nsec)
614{ 614{
615 struct timespec ts; 615 struct timespec ts;
616 616
@@ -630,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
630 * 630 *
631 * Returns the timeval representation of the nsec parameter. 631 * Returns the timeval representation of the nsec parameter.
632 */ 632 */
633struct timeval ns_to_timeval(const nsec_t nsec) 633struct timeval ns_to_timeval(const s64 nsec)
634{ 634{
635 struct timespec ts = ns_to_timespec(nsec); 635 struct timespec ts = ns_to_timespec(nsec);
636 struct timeval tv; 636 struct timeval tv;
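
Usage is unchanged apart from the parameter type; a quick sketch:

	struct timespec ts = ns_to_timespec(1500000000LL);	/* { .tv_sec = 1, .tv_nsec = 500000000 } */
	struct timeval  tv = ns_to_timeval(1500000000LL);	/* { .tv_sec = 1, .tv_usec = 500000 } */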
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187cb..6b812c04737b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -87,13 +82,14 @@ struct tvec_t_base_s {
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); 84static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
90static tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
91 87
92static inline void set_running_timer(tvec_base_t *base, 88static inline void set_running_timer(tvec_base_t *base,
93 struct timer_list *timer) 89 struct timer_list *timer)
94{ 90{
95#ifdef CONFIG_SMP 91#ifdef CONFIG_SMP
96 base->t_base.running_timer = timer; 92 base->running_timer = timer;
97#endif 93#endif
98} 94}
99 95
@@ -139,15 +135,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
139 list_add_tail(&timer->entry, vec); 135 list_add_tail(&timer->entry, vec);
140} 136}
141 137
142typedef struct timer_base_s timer_base_t;
143/*
144 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
145 * at compile time, and we need timer->base to lock the timer.
146 */
147timer_base_t __init_timer_base
148 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
149EXPORT_SYMBOL(__init_timer_base);
150
151/*** 138/***
152 * init_timer - initialize a timer. 139 * init_timer - initialize a timer.
153 * @timer: the timer to be initialized 140 * @timer: the timer to be initialized
@@ -158,7 +145,7 @@ EXPORT_SYMBOL(__init_timer_base);
158void fastcall init_timer(struct timer_list *timer) 145void fastcall init_timer(struct timer_list *timer)
159{ 146{
160 timer->entry.next = NULL; 147 timer->entry.next = NULL;
161 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; 148 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
162} 149}
163EXPORT_SYMBOL(init_timer); 150EXPORT_SYMBOL(init_timer);
164 151
@@ -174,7 +161,7 @@ static inline void detach_timer(struct timer_list *timer,
174} 161}
175 162
176/* 163/*
177 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 164 * We are using hashed locking: holding per_cpu(tvec_bases).lock
178 * means that all timers which are tied to this base via timer->base are 165 * means that all timers which are tied to this base via timer->base are
179 * locked, and the base itself is locked too. 166 * locked, and the base itself is locked too.
180 * 167 *
@@ -185,10 +172,10 @@ static inline void detach_timer(struct timer_list *timer,
185 * possible to set timer->base = NULL and drop the lock: the timer remains 172 * possible to set timer->base = NULL and drop the lock: the timer remains
186 * locked. 173 * locked.
187 */ 174 */
188static timer_base_t *lock_timer_base(struct timer_list *timer, 175static tvec_base_t *lock_timer_base(struct timer_list *timer,
189 unsigned long *flags) 176 unsigned long *flags)
190{ 177{
191 timer_base_t *base; 178 tvec_base_t *base;
192 179
193 for (;;) { 180 for (;;) {
194 base = timer->base; 181 base = timer->base;
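
Only the signature change is visible in this hunk; the retry pattern the comment above describes looks roughly like this (abbreviated sketch, not the exact body):

static tvec_base_t *lock_timer_base(struct timer_list *timer,
				    unsigned long *flags)
{
	tvec_base_t *base;

	for (;;) {
		base = timer->base;
		if (likely(base != NULL)) {
			spin_lock_irqsave(&base->lock, *flags);
			if (likely(base == timer->base))
				return base;	/* still ours: the timer is now locked */
			/* raced with a migration that installed a new base */
			spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}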
@@ -205,8 +192,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
205 192
206int __mod_timer(struct timer_list *timer, unsigned long expires) 193int __mod_timer(struct timer_list *timer, unsigned long expires)
207{ 194{
208 timer_base_t *base; 195 tvec_base_t *base, *new_base;
209 tvec_base_t *new_base;
210 unsigned long flags; 196 unsigned long flags;
211 int ret = 0; 197 int ret = 0;
212 198
@@ -221,7 +207,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
221 207
222 new_base = __get_cpu_var(tvec_bases); 208 new_base = __get_cpu_var(tvec_bases);
223 209
224 if (base != &new_base->t_base) { 210 if (base != new_base) {
225 /* 211 /*
226 * We are trying to schedule the timer on the local CPU. 212 * We are trying to schedule the timer on the local CPU.
227 * However we can't change timer's base while it is running, 213 * However we can't change timer's base while it is running,
@@ -229,21 +215,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
229 * handler yet has not finished. This also guarantees that 215 * handler yet has not finished. This also guarantees that
230 * the timer is serialized wrt itself. 216 * the timer is serialized wrt itself.
231 */ 217 */
232 if (unlikely(base->running_timer == timer)) { 218 if (likely(base->running_timer != timer)) {
233 /* The timer remains on a former base */
234 new_base = container_of(base, tvec_base_t, t_base);
235 } else {
236 /* See the comment in lock_timer_base() */ 219 /* See the comment in lock_timer_base() */
237 timer->base = NULL; 220 timer->base = NULL;
238 spin_unlock(&base->lock); 221 spin_unlock(&base->lock);
239 spin_lock(&new_base->t_base.lock); 222 base = new_base;
240 timer->base = &new_base->t_base; 223 spin_lock(&base->lock);
224 timer->base = base;
241 } 225 }
242 } 226 }
243 227
244 timer->expires = expires; 228 timer->expires = expires;
245 internal_add_timer(new_base, timer); 229 internal_add_timer(base, timer);
246 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 230 spin_unlock_irqrestore(&base->lock, flags);
247 231
248 return ret; 232 return ret;
249} 233}
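
Timer users are unaffected by the base rework; a minimal driver-style sketch against the same API (names hypothetical):

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list example_timer;

static void example_timeout(unsigned long data)
{
	/* runs with example_timer.base pointing at the owning tvec_base_t */
}

static void example_arm(void)
{
	init_timer(&example_timer);
	example_timer.function = example_timeout;
	example_timer.data = 0;
	mod_timer(&example_timer, jiffies + HZ);	/* fire in about one second */
}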
@@ -263,10 +247,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
263 unsigned long flags; 247 unsigned long flags;
264 248
265 BUG_ON(timer_pending(timer) || !timer->function); 249 BUG_ON(timer_pending(timer) || !timer->function);
266 spin_lock_irqsave(&base->t_base.lock, flags); 250 spin_lock_irqsave(&base->lock, flags);
267 timer->base = &base->t_base; 251 timer->base = base;
268 internal_add_timer(base, timer); 252 internal_add_timer(base, timer);
269 spin_unlock_irqrestore(&base->t_base.lock, flags); 253 spin_unlock_irqrestore(&base->lock, flags);
270} 254}
271 255
272 256
@@ -319,7 +303,7 @@ EXPORT_SYMBOL(mod_timer);
319 */ 303 */
320int del_timer(struct timer_list *timer) 304int del_timer(struct timer_list *timer)
321{ 305{
322 timer_base_t *base; 306 tvec_base_t *base;
323 unsigned long flags; 307 unsigned long flags;
324 int ret = 0; 308 int ret = 0;
325 309
@@ -346,7 +330,7 @@ EXPORT_SYMBOL(del_timer);
346 */ 330 */
347int try_to_del_timer_sync(struct timer_list *timer) 331int try_to_del_timer_sync(struct timer_list *timer)
348{ 332{
349 timer_base_t *base; 333 tvec_base_t *base;
350 unsigned long flags; 334 unsigned long flags;
351 int ret = -1; 335 int ret = -1;
352 336
@@ -410,7 +394,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
410 struct timer_list *tmp; 394 struct timer_list *tmp;
411 395
412 tmp = list_entry(curr, struct timer_list, entry); 396 tmp = list_entry(curr, struct timer_list, entry);
413 BUG_ON(tmp->base != &base->t_base); 397 BUG_ON(tmp->base != base);
414 curr = curr->next; 398 curr = curr->next;
415 internal_add_timer(base, tmp); 399 internal_add_timer(base, tmp);
416 } 400 }
@@ -432,7 +416,7 @@ static inline void __run_timers(tvec_base_t *base)
432{ 416{
433 struct timer_list *timer; 417 struct timer_list *timer;
434 418
435 spin_lock_irq(&base->t_base.lock); 419 spin_lock_irq(&base->lock);
436 while (time_after_eq(jiffies, base->timer_jiffies)) { 420 while (time_after_eq(jiffies, base->timer_jiffies)) {
437 struct list_head work_list = LIST_HEAD_INIT(work_list); 421 struct list_head work_list = LIST_HEAD_INIT(work_list);
438 struct list_head *head = &work_list; 422 struct list_head *head = &work_list;
@@ -458,7 +442,7 @@ static inline void __run_timers(tvec_base_t *base)
458 442
459 set_running_timer(base, timer); 443 set_running_timer(base, timer);
460 detach_timer(timer, 1); 444 detach_timer(timer, 1);
461 spin_unlock_irq(&base->t_base.lock); 445 spin_unlock_irq(&base->lock);
462 { 446 {
463 int preempt_count = preempt_count(); 447 int preempt_count = preempt_count();
464 fn(data); 448 fn(data);
@@ -471,11 +455,11 @@ static inline void __run_timers(tvec_base_t *base)
471 BUG(); 455 BUG();
472 } 456 }
473 } 457 }
474 spin_lock_irq(&base->t_base.lock); 458 spin_lock_irq(&base->lock);
475 } 459 }
476 } 460 }
477 set_running_timer(base, NULL); 461 set_running_timer(base, NULL);
478 spin_unlock_irq(&base->t_base.lock); 462 spin_unlock_irq(&base->lock);
479} 463}
480 464
481#ifdef CONFIG_NO_IDLE_HZ 465#ifdef CONFIG_NO_IDLE_HZ
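
What stays constant through the rename in __run_timers() is the important part: the base lock is dropped around fn(data) so a handler may itself add, modify or delete timers, running_timer is set so del_timer_sync() can recognise a live handler, and the saved preempt_count() is compared afterwards to catch handlers that return with the preempt count unbalanced. The locking shape, reduced to a self-contained userspace sketch (types and names are illustrative only):

#include <pthread.h>
#include <stddef.h>

struct expired {
        void (*fn)(unsigned long);
        unsigned long data;
        struct expired *next;
};

struct wheel_base {
        pthread_mutex_t lock;
        struct expired *running;        /* models base->running_timer */
};

/*
 * Dispatch a list of already-expired entries.  As in __run_timers(),
 * the base lock is dropped around each callback so the handler may
 * re-arm or delete timers, and 'running' is published so a synchronous
 * delete can tell the handler is still executing.
 */
static void run_expired(struct wheel_base *b, struct expired *work)
{
        pthread_mutex_lock(&b->lock);
        while (work) {
                struct expired *e = work;

                work = e->next;                 /* detach_timer() equivalent */
                b->running = e;
                pthread_mutex_unlock(&b->lock);
                e->fn(e->data);                 /* callback runs unlocked */
                pthread_mutex_lock(&b->lock);
        }
        b->running = NULL;
        pthread_mutex_unlock(&b->lock);
}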
@@ -506,7 +490,7 @@ unsigned long next_timer_interrupt(void)
506 hr_expires += jiffies; 490 hr_expires += jiffies;
507 491
508 base = __get_cpu_var(tvec_bases); 492 base = __get_cpu_var(tvec_bases);
509 spin_lock(&base->t_base.lock); 493 spin_lock(&base->lock);
510 expires = base->timer_jiffies + (LONG_MAX >> 1); 494 expires = base->timer_jiffies + (LONG_MAX >> 1);
511 list = NULL; 495 list = NULL;
512 496
@@ -554,7 +538,7 @@ found:
554 expires = nte->expires; 538 expires = nte->expires;
555 } 539 }
556 } 540 }
557 spin_unlock(&base->t_base.lock); 541 spin_unlock(&base->lock);
558 542
559 if (time_before(hr_expires, expires)) 543 if (time_before(hr_expires, expires))
560 return hr_expires; 544 return hr_expires;
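
next_timer_interrupt() exists for CONFIG_NO_IDLE_HZ: before a long idle sleep the architecture code asks when the earliest pending event is due, taking the sooner of the wheel's next expiry and hr_expires from the hrtimer side, so the next interrupt can be programmed for that point instead of firing every tick. All of the scanning above relies on the wrap-safe jiffies comparisons; their essence is a signed subtraction, roughly as below (the real macros in <linux/jiffies.h> also typecheck their arguments; the my_ prefix marks these as illustrative copies):

/* wrap-safe comparisons of unsigned jiffies-style counters */
#define my_time_after(a, b)     ((long)(b) - (long)(a) < 0)
#define my_time_before(a, b)    my_time_after(b, a)
#define my_time_after_eq(a, b)  ((long)(a) - (long)(b) >= 0)

/* e.g. "has this timer expired?" stays correct across a jiffies wrap: */
/*      if (my_time_after_eq(jiffies, timer->expires)) ...             */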
@@ -841,7 +825,7 @@ void update_process_times(int user_tick)
841 */ 825 */
842static unsigned long count_active_tasks(void) 826static unsigned long count_active_tasks(void)
843{ 827{
844 return (nr_running() + nr_uninterruptible()) * FIXED_1; 828 return nr_active() * FIXED_1;
845} 829}
846 830
847/* 831/*
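
Switching count_active_tasks() to nr_active() is a scheduler-side cleanup: the running plus uninterruptible sum is now gathered per runqueue in one pass rather than from two separate global counters. The load-average math it feeds is unchanged: a fixed-point exponential moving average, load = (load * exp + n * (FIXED_1 - exp)) >> FSHIFT, evaluated every LOAD_FREQ ticks for the 1, 5 and 15 minute windows. A sketch using the kernel's usual constants (reproduced here for illustration, not taken from this patch):

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point          */
#define EXP_1   1884                    /* 1/exp(5s/1min)  << FSHIFT   */
#define EXP_5   2014                    /* 1/exp(5s/5min)  << FSHIFT   */
#define EXP_15  2037                    /* 1/exp(5s/15min) << FSHIFT   */

/* one CALC_LOAD() step; 'active' is count_active_tasks(),
 * i.e. nr_active() scaled by FIXED_1 */
static unsigned long calc_load_step(unsigned long load, unsigned long exp,
                                    unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}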
@@ -1262,7 +1246,7 @@ static int __devinit init_timers_cpu(int cpu)
1262 } 1246 }
1263 per_cpu(tvec_bases, cpu) = base; 1247 per_cpu(tvec_bases, cpu) = base;
1264 } 1248 }
1265 spin_lock_init(&base->t_base.lock); 1249 spin_lock_init(&base->lock);
1266 for (j = 0; j < TVN_SIZE; j++) { 1250 for (j = 0; j < TVN_SIZE; j++) {
1267 INIT_LIST_HEAD(base->tv5.vec + j); 1251 INIT_LIST_HEAD(base->tv5.vec + j);
1268 INIT_LIST_HEAD(base->tv4.vec + j); 1252 INIT_LIST_HEAD(base->tv4.vec + j);
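
init_timers_cpu() now initialises the spinlock directly in tvec_base_t and then every list head of the five vectors, 256 + 4 * 64 buckets per CPU with the default geometry. After the t_base removal the structure is roughly shaped as follows (a sketch: allocation details and CONFIG_BASE_SMALL sizing glossed over):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/timer.h>

#define TVN_SIZE 64
#define TVR_SIZE 256

typedef struct tvec_s {
        struct list_head vec[TVN_SIZE];
} tvec_t;

typedef struct tvec_root_s {
        struct list_head vec[TVR_SIZE];
} tvec_root_t;

struct tvec_t_base_s {
        spinlock_t lock;                        /* was t_base.lock          */
        struct timer_list *running_timer;       /* was t_base.running_timer */
        unsigned long timer_jiffies;            /* wheel's notion of "now"  */
        tvec_root_t tv1;                        /* next 256 jiffies         */
        tvec_t tv2, tv3, tv4, tv5;              /* progressively coarser    */
} ____cacheline_aligned_in_smp;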
@@ -1284,7 +1268,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1284 while (!list_empty(head)) { 1268 while (!list_empty(head)) {
1285 timer = list_entry(head->next, struct timer_list, entry); 1269 timer = list_entry(head->next, struct timer_list, entry);
1286 detach_timer(timer, 0); 1270 detach_timer(timer, 0);
1287 timer->base = &new_base->t_base; 1271 timer->base = new_base;
1288 internal_add_timer(new_base, timer); 1272 internal_add_timer(new_base, timer);
1289 } 1273 }
1290} 1274}
@@ -1300,11 +1284,11 @@ static void __devinit migrate_timers(int cpu)
1300 new_base = get_cpu_var(tvec_bases); 1284 new_base = get_cpu_var(tvec_bases);
1301 1285
1302 local_irq_disable(); 1286 local_irq_disable();
1303 spin_lock(&new_base->t_base.lock); 1287 spin_lock(&new_base->lock);
1304 spin_lock(&old_base->t_base.lock); 1288 spin_lock(&old_base->lock);
1289
1290 BUG_ON(old_base->running_timer);
1305 1291
1306 if (old_base->t_base.running_timer)
1307 BUG();
1308 for (i = 0; i < TVR_SIZE; i++) 1292 for (i = 0; i < TVR_SIZE; i++)
1309 migrate_timer_list(new_base, old_base->tv1.vec + i); 1293 migrate_timer_list(new_base, old_base->tv1.vec + i);
1310 for (i = 0; i < TVN_SIZE; i++) { 1294 for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,8 +1298,8 @@ static void __devinit migrate_timers(int cpu)
1314 migrate_timer_list(new_base, old_base->tv5.vec + i); 1298 migrate_timer_list(new_base, old_base->tv5.vec + i);
1315 } 1299 }
1316 1300
1317 spin_unlock(&old_base->t_base.lock); 1301 spin_unlock(&old_base->lock);
1318 spin_unlock(&new_base->t_base.lock); 1302 spin_unlock(&new_base->lock);
1319 local_irq_enable(); 1303 local_irq_enable();
1320 put_cpu_var(tvec_bases); 1304 put_cpu_var(tvec_bases);
1321} 1305}
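
migrate_timers() runs from the CPU-hotplug path once the dead CPU can no longer be executing its timer softirq, which is why the open-coded if/BUG() pair could collapse into BUG_ON(old_base->running_timer). With interrupts off it takes the live base's lock and then the dead one's, and migrate_timer_list() detaches every queued timer, repoints timer->base and re-hashes it onto the live CPU's wheel via internal_add_timer(). The same idea in the userspace modelling used earlier (illustrative types, a single flat list instead of five vectors, no hotplug notifier machinery):

#include <pthread.h>
#include <assert.h>
#include <stddef.h>

struct node { struct node *next; };

struct simple_base {
        pthread_mutex_t lock;
        struct node *pending;           /* stands in for tv1..tv5  */
        const struct node *running;     /* models running_timer    */
};

/* Move everything queued on 'dead' over to 'live'. */
static void migrate(struct simple_base *live, struct simple_base *dead)
{
        pthread_mutex_lock(&live->lock);
        pthread_mutex_lock(&dead->lock);

        /* the source CPU is gone, so nothing can be mid-callback there */
        assert(dead->running == NULL);

        while (dead->pending) {
                struct node *n = dead->pending;

                dead->pending = n->next;        /* detach_timer() equivalent */
                n->next = live->pending;        /* internal_add_timer() would */
                live->pending = n;              /* re-hash by expiry instead  */
        }

        pthread_mutex_unlock(&dead->lock);
        pthread_mutex_unlock(&live->lock);
}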