aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c12
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/cpuset.c69
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fork.c28
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/futex_compat.c4
-rw-r--r--kernel/hrtimer.c49
-rw-r--r--kernel/module.c1
-rw-r--r--kernel/pid.c212
-rw-r--r--kernel/power/process.c3
-rw-r--r--kernel/sched.c84
-rw-r--r--kernel/signal.c1
-rw-r--r--kernel/sys.c19
-rw-r--r--kernel/timer.c92
15 files changed, 376 insertions, 211 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51ef..b327f4d20104 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
449 /* calculate run_time in nsec*/ 449 /* calculate run_time in nsec*/
450 do_posix_clock_monotonic_gettime(&uptime); 450 do_posix_clock_monotonic_gettime(&uptime);
451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
452 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 452 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
453 + current->start_time.tv_nsec; 453 + current->group_leader->start_time.tv_nsec;
454 /* convert nsec -> AHZ */ 454 /* convert nsec -> AHZ */
455 elapsed = nsec_to_AHZ(run_time); 455 elapsed = nsec_to_AHZ(run_time);
456#if ACCT_VERSION==3 456#if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 469#endif
470 do_div(elapsed, AHZ); 470 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 471 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, 472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime)); 473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); 474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, 475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime)); 476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); 477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 478 /* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt + 524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->group_leader->min_flt); 525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt + 526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->group_leader->maj_flt); 527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 528 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode; 529 ac.ac_exitcode = exitcode;
530 530
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b61..c8ccbd09048f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -578,7 +578,7 @@ static int __init audit_enable(char *str)
578 audit_initialized ? "" : " (after initialization)"); 578 audit_initialized ? "" : " (after initialization)");
579 if (audit_initialized) 579 if (audit_initialized)
580 audit_enabled = audit_default; 580 audit_enabled = audit_default;
581 return 0; 581 return 1;
582} 582}
583 583
584__setup("audit=", audit_enable); 584__setup("audit=", audit_enable);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd1284..72248d1b9e3f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
616 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
617 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
618 * 618 *
619 * Call without callback_mutex or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
620 * with or without manage_mutex held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
621 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
622 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
623 * is here, it is not simultaneously in the exit code NULL'ing its
624 * cpuset pointer. This routine also might acquire callback_mutex and
625 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
626 * 624 *
627 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
836} 834}
837 835
838/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarilly set tasks mems_allowed to target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
839 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
840 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
841 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
947 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
948 995
949 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
950 if (migrate) { 997 if (migrate)
951 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
952 MPOL_MF_MOVE_ALL);
953 }
954 mmput(mm); 999 mmput(mm);
955 } 1000 }
956 1001
@@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1185 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1186 if (mm) { 1231 if (mm) {
1187 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1188 mmput(mm); 1235 mmput(mm);
1189 } 1236 }
1190 1237
1191 if (is_memory_migrate(cs))
1192 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1193 put_task_struct(tsk); 1238 put_task_struct(tsk);
1194 synchronize_rcu(); 1239 synchronize_rcu();
1195 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
diff --git a/kernel/exit.c b/kernel/exit.c
index bc0ec674d3f4..6c2eeb8f6390 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -127,6 +127,11 @@ static void __exit_signal(struct task_struct *tsk)
127 } 127 }
128} 128}
129 129
130static void delayed_put_task_struct(struct rcu_head *rhp)
131{
132 put_task_struct(container_of(rhp, struct task_struct, rcu));
133}
134
130void release_task(struct task_struct * p) 135void release_task(struct task_struct * p)
131{ 136{
132 int zap_leader; 137 int zap_leader;
@@ -168,7 +173,7 @@ repeat:
168 spin_unlock(&p->proc_lock); 173 spin_unlock(&p->proc_lock);
169 proc_pid_flush(proc_dentry); 174 proc_pid_flush(proc_dentry);
170 release_thread(p); 175 release_thread(p);
171 put_task_struct(p); 176 call_rcu(&p->rcu, delayed_put_task_struct);
172 177
173 p = leader; 178 p = leader;
174 if (unlikely(zap_leader)) 179 if (unlikely(zap_leader))
diff --git a/kernel/fork.c b/kernel/fork.c
index b3f7a1bb5e55..3384eb89cb1c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -108,10 +108,8 @@ void free_task(struct task_struct *tsk)
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
110 110
111void __put_task_struct_cb(struct rcu_head *rhp) 111void __put_task_struct(struct task_struct *tsk)
112{ 112{
113 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
114
115 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 113 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
116 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
117 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
@@ -126,6 +124,12 @@ void __put_task_struct_cb(struct rcu_head *rhp)
126 free_task(tsk); 124 free_task(tsk);
127} 125}
128 126
127void __put_task_struct_cb(struct rcu_head *rhp)
128{
129 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
130 __put_task_struct(tsk);
131}
132
129void __init fork_init(unsigned long mempages) 133void __init fork_init(unsigned long mempages)
130{ 134{
131#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 135#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -721,7 +725,7 @@ out_release:
721 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 725 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
722 free_fd_array(new_fdt->fd, new_fdt->max_fds); 726 free_fd_array(new_fdt->fd, new_fdt->max_fds);
723 kmem_cache_free(files_cachep, newf); 727 kmem_cache_free(files_cachep, newf);
724 goto out; 728 return NULL;
725} 729}
726 730
727static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 731static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -1311,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
1311{ 1315{
1312 struct task_struct *p; 1316 struct task_struct *p;
1313 int trace = 0; 1317 int trace = 0;
1314 long pid = alloc_pidmap(); 1318 struct pid *pid = alloc_pid();
1319 long nr;
1315 1320
1316 if (pid < 0) 1321 if (!pid)
1317 return -EAGAIN; 1322 return -EAGAIN;
1323 nr = pid->nr;
1318 if (unlikely(current->ptrace)) { 1324 if (unlikely(current->ptrace)) {
1319 trace = fork_traceflag (clone_flags); 1325 trace = fork_traceflag (clone_flags);
1320 if (trace) 1326 if (trace)
1321 clone_flags |= CLONE_PTRACE; 1327 clone_flags |= CLONE_PTRACE;
1322 } 1328 }
1323 1329
1324 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1330 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1325 /* 1331 /*
1326 * Do this prior waking up the new thread - the thread pointer 1332 * Do this prior waking up the new thread - the thread pointer
1327 * might get invalid after that point, if the thread exits quickly. 1333 * might get invalid after that point, if the thread exits quickly.
@@ -1348,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
1348 p->state = TASK_STOPPED; 1354 p->state = TASK_STOPPED;
1349 1355
1350 if (unlikely (trace)) { 1356 if (unlikely (trace)) {
1351 current->ptrace_message = pid; 1357 current->ptrace_message = nr;
1352 ptrace_notify ((trace << 8) | SIGTRAP); 1358 ptrace_notify ((trace << 8) | SIGTRAP);
1353 } 1359 }
1354 1360
@@ -1358,10 +1364,10 @@ long do_fork(unsigned long clone_flags,
1358 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1364 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1359 } 1365 }
1360 } else { 1366 } else {
1361 free_pidmap(pid); 1367 free_pid(pid);
1362 pid = PTR_ERR(p); 1368 nr = PTR_ERR(p);
1363 } 1369 }
1364 return pid; 1370 return nr;
1365} 1371}
1366 1372
1367#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1373#ifndef ARCH_MIN_MMSTRUCT_ALIGN
diff --git a/kernel/futex.c b/kernel/futex.c
index 9c9b2b6b22dd..5699c512057b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1039,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1040 int val2 = 0;
1041 1041
1042 if ((op == FUTEX_WAIT) && utime) { 1042 if (utime && (op == FUTEX_WAIT)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1043 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1044 return -EFAULT;
1045 if (!timespec_valid(&t))
1046 return -EINVAL;
1045 timeout = timespec_to_jiffies(&t) + 1; 1047 timeout = timespec_to_jiffies(&t) + 1;
1046 } 1048 }
1047 /* 1049 /*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 54274fc85321..1ab6a0ea3d14 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,9 +129,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if ((op == FUTEX_WAIT) && utime) { 132 if (utime && (op == FUTEX_WAIT)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t))
136 return -EINVAL;
135 timeout = timespec_to_jiffies(&t) + 1; 137 timeout = timespec_to_jiffies(&t) + 1;
136 } 138 }
137 if (op >= FUTEX_REQUEUE) 139 if (op >= FUTEX_REQUEUE)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0237a556eb1f..f181ff4dd32e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -606,6 +606,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
606{ 606{
607 struct rb_node *node; 607 struct rb_node *node;
608 608
609 if (!base->first)
610 return;
611
609 if (base->get_softirq_time) 612 if (base->get_softirq_time)
610 base->softirq_time = base->get_softirq_time(); 613 base->softirq_time = base->get_softirq_time();
611 614
@@ -655,29 +658,28 @@ void hrtimer_run_queues(void)
655/* 658/*
656 * Sleep related functions: 659 * Sleep related functions:
657 */ 660 */
658 661static int hrtimer_wakeup(struct hrtimer *timer)
659struct sleep_hrtimer {
660 struct hrtimer timer;
661 struct task_struct *task;
662 int expired;
663};
664
665static int nanosleep_wakeup(struct hrtimer *timer)
666{ 662{
667 struct sleep_hrtimer *t = 663 struct hrtimer_sleeper *t =
668 container_of(timer, struct sleep_hrtimer, timer); 664 container_of(timer, struct hrtimer_sleeper, timer);
665 struct task_struct *task = t->task;
669 666
670 t->expired = 1; 667 t->task = NULL;
671 wake_up_process(t->task); 668 if (task)
669 wake_up_process(task);
672 670
673 return HRTIMER_NORESTART; 671 return HRTIMER_NORESTART;
674} 672}
675 673
676static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) 674void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
677{ 675{
678 t->timer.function = nanosleep_wakeup; 676 sl->timer.function = hrtimer_wakeup;
679 t->task = current; 677 sl->task = task;
680 t->expired = 0; 678}
679
680static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
681{
682 hrtimer_init_sleeper(t, current);
681 683
682 do { 684 do {
683 set_current_state(TASK_INTERRUPTIBLE); 685 set_current_state(TASK_INTERRUPTIBLE);
@@ -685,18 +687,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
685 687
686 schedule(); 688 schedule();
687 689
688 if (unlikely(!t->expired)) { 690 hrtimer_cancel(&t->timer);
689 hrtimer_cancel(&t->timer); 691 mode = HRTIMER_ABS;
690 mode = HRTIMER_ABS; 692
691 } 693 } while (t->task && !signal_pending(current));
692 } while (!t->expired && !signal_pending(current));
693 694
694 return t->expired; 695 return t->task == NULL;
695} 696}
696 697
697static long __sched nanosleep_restart(struct restart_block *restart) 698static long __sched nanosleep_restart(struct restart_block *restart)
698{ 699{
699 struct sleep_hrtimer t; 700 struct hrtimer_sleeper t;
700 struct timespec __user *rmtp; 701 struct timespec __user *rmtp;
701 struct timespec tu; 702 struct timespec tu;
702 ktime_t time; 703 ktime_t time;
@@ -729,7 +730,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
729 const enum hrtimer_mode mode, const clockid_t clockid) 730 const enum hrtimer_mode mode, const clockid_t clockid)
730{ 731{
731 struct restart_block *restart; 732 struct restart_block *restart;
732 struct sleep_hrtimer t; 733 struct hrtimer_sleeper t;
733 struct timespec tu; 734 struct timespec tu;
734 ktime_t rem; 735 ktime_t rem;
735 736
diff --git a/kernel/module.c b/kernel/module.c
index bd088a7c1499..d24deb0dbbc9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1254,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
1254 || strcmp(license, "GPL v2") == 0 1254 || strcmp(license, "GPL v2") == 0
1255 || strcmp(license, "GPL and additional rights") == 0 1255 || strcmp(license, "GPL and additional rights") == 0
1256 || strcmp(license, "Dual BSD/GPL") == 0 1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1257 || strcmp(license, "Dual MPL/GPL") == 0); 1258 || strcmp(license, "Dual MPL/GPL") == 0);
1258} 1259}
1259 1260
diff --git a/kernel/pid.c b/kernel/pid.c
index a9f2dfd006d2..eeb836b65ca4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
@@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type]; 225
152 pid = find_pid(type, nr); 226 WARN_ON(!task->pid); /* to be removed soon */
153 task_pid->nr = nr; 227 WARN_ON(!nr); /* to be removed soon */
154 if (pid == NULL) { 228
155 INIT_LIST_HEAD(&task_pid->pid_list); 229 link = &task->pids[type];
156 hlist_add_head_rcu(&task_pid->pid_chain, 230 link->pid = pid = find_pid(nr);
157 &pid_hash[type][pid_hashfn(nr)]); 231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162 232
163 return 0; 233 return 0;
164} 234}
165 235
166static fastcall int __detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
167{ 237{
168 struct pid *pid, *pid_next; 238 struct pid_link *link;
169 int nr = 0; 239 struct pid *pid;
240 int tmp;
170 241
171 pid = &task->pids[type]; 242 link = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) { 243 pid = link->pid;
173 244
174 if (list_empty(&pid->pid_list)) { 245 hlist_del_rcu(&link->node);
175 nr = pid->nr; 246 link->pid = NULL;
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 247
186 list_del_rcu(&pid->pid_list); 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
187 pid->nr = 0; 249 if (!hlist_empty(&pid->tasks[tmp]))
250 return;
188 251
189 return nr; 252 free_pid(pid);
190} 253}
191 254
192void fastcall detach_pid(task_t *task, enum pid_type type) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
193{ 256{
194 int tmp, nr; 257 struct task_struct *result = NULL;
258 if (pid) {
259 struct hlist_node *first;
260 first = rcu_dereference(pid->tasks[type].first);
261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
195 266
196 nr = __detach_pid(task, type); 267/*
197 if (!nr) 268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
198 return; 269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
273}
199 274
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 275EXPORT_SYMBOL(find_task_by_pid_type);
201 if (tmp != type && find_pid(tmp, nr))
202 return;
203 276
204 free_pidmap(nr); 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
278{
279 struct task_struct *result;
280 rcu_read_lock();
281 result = pid_task(pid, type);
282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
205} 286}
206 287
207task_t *find_task_by_pid_type(int type, int nr) 288struct pid *find_get_pid(pid_t nr)
208{ 289{
209 struct pid *pid; 290 struct pid *pid;
210 291
211 pid = find_pid(type, nr); 292 rcu_read_lock();
212 if (!pid) 293 pid = get_pid(find_pid(nr));
213 return NULL; 294 rcu_read_unlock();
214 295
215 return pid_task(&pid->pid_list, type); 296 return pid;
216} 297}
217 298
218EXPORT_SYMBOL(find_task_by_pid_type);
219
220/* 299/*
221 * The pid hash table is scaled according to the amount of memory in the 300 * The pid hash table is scaled according to the amount of memory in the
222 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 301 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type);
224 */ 303 */
225void __init pidhash_init(void) 304void __init pidhash_init(void)
226{ 305{
227 int i, j, pidhash_size; 306 int i, pidhash_size;
228 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
229 308
230 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -233,16 +312,13 @@ void __init pidhash_init(void)
233 312
234 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
235 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
236 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
237 316
238 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
239 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
240 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
241 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
242 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
243 for (j = 0; j < pidhash_size; j++)
244 INIT_HLIST_HEAD(&pid_hash[i][j]);
245 }
246} 322}
247 323
248void __init pidmap_init(void) 324void __init pidmap_init(void)
@@ -251,4 +327,8 @@ void __init pidmap_init(void)
251 /* Reserve PID 0. We never call free_pidmap(0) */ 327 /* Reserve PID 0. We never call free_pidmap(0) */
252 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
253 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
330
331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
332 __alignof__(struct pid),
333 SLAB_PANIC, NULL, NULL);
254} 334}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad77..b2a5f671d6cd 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
26 (p->flags & PF_NOFREEZE) || 26 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 27 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 28 (p->exit_state == EXIT_DEAD) ||
29 (p->state == TASK_STOPPED) || 29 (p->state == TASK_STOPPED))
30 (p->state == TASK_TRACED))
31 return 0; 30 return 0;
32 return 1; 31 return 1;
33} 32}
diff --git a/kernel/sched.c b/kernel/sched.c
index a9ecac398bb9..dd153d6f8a04 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -667,9 +667,13 @@ static int effective_prio(task_t *p)
667/* 667/*
668 * __activate_task - move a task to the runqueue. 668 * __activate_task - move a task to the runqueue.
669 */ 669 */
670static inline void __activate_task(task_t *p, runqueue_t *rq) 670static void __activate_task(task_t *p, runqueue_t *rq)
671{ 671{
672 enqueue_task(p, rq->active); 672 prio_array_t *target = rq->active;
673
674 if (batch_task(p))
675 target = rq->expired;
676 enqueue_task(p, target);
673 rq->nr_running++; 677 rq->nr_running++;
674} 678}
675 679
@@ -688,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
688 unsigned long long __sleep_time = now - p->timestamp; 692 unsigned long long __sleep_time = now - p->timestamp;
689 unsigned long sleep_time; 693 unsigned long sleep_time;
690 694
691 if (unlikely(p->policy == SCHED_BATCH)) 695 if (batch_task(p))
692 sleep_time = 0; 696 sleep_time = 0;
693 else { 697 else {
694 if (__sleep_time > NS_MAX_SLEEP_AVG) 698 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -700,21 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
700 if (likely(sleep_time > 0)) { 704 if (likely(sleep_time > 0)) {
701 /* 705 /*
702 * User tasks that sleep a long time are categorised as 706 * User tasks that sleep a long time are categorised as
703 * idle and will get just interactive status to stay active & 707 * idle. They will only have their sleep_avg increased to a
704 * prevent them suddenly becoming cpu hogs and starving 708 * level that makes them just interactive priority to stay
705 * other processes. 709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
706 */ 711 */
707 if (p->mm && p->activated != -1 && 712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
708 sleep_time > INTERACTIVE_SLEEP(p)) { 713 unsigned long ceiling;
709 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 714
710 DEF_TIMESLICE); 715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
716 DEF_TIMESLICE);
717 if (p->sleep_avg < ceiling)
718 p->sleep_avg = ceiling;
711 } else { 719 } else {
712 /* 720 /*
713 * Tasks waking from uninterruptible sleep are 721 * Tasks waking from uninterruptible sleep are
714 * limited in their sleep_avg rise as they 722 * limited in their sleep_avg rise as they
715 * are likely to be waiting on I/O 723 * are likely to be waiting on I/O
716 */ 724 */
717 if (p->activated == -1 && p->mm) { 725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
718 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
719 sleep_time = 0; 727 sleep_time = 0;
720 else if (p->sleep_avg + sleep_time >= 728 else if (p->sleep_avg + sleep_time >=
@@ -769,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
769 * This checks to make sure it's not an uninterruptible task 777 * This checks to make sure it's not an uninterruptible task
770 * that is now waking up. 778 * that is now waking up.
771 */ 779 */
772 if (!p->activated) { 780 if (p->sleep_type == SLEEP_NORMAL) {
773 /* 781 /*
774 * Tasks which were woken up by interrupts (ie. hw events) 782 * Tasks which were woken up by interrupts (ie. hw events)
775 * are most likely of interactive nature. So we give them 783 * are most likely of interactive nature. So we give them
@@ -778,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
778 * on a CPU, first time around: 786 * on a CPU, first time around:
779 */ 787 */
780 if (in_interrupt()) 788 if (in_interrupt())
781 p->activated = 2; 789 p->sleep_type = SLEEP_INTERRUPTED;
782 else { 790 else {
783 /* 791 /*
784 * Normal first-time wakeups get a credit too for 792 * Normal first-time wakeups get a credit too for
785 * on-runqueue time, but it will be weighted down: 793 * on-runqueue time, but it will be weighted down:
786 */ 794 */
787 p->activated = 1; 795 p->sleep_type = SLEEP_INTERACTIVE;
788 } 796 }
789 } 797 }
790 p->timestamp = now; 798 p->timestamp = now;
@@ -1272,19 +1280,19 @@ out_activate:
1272 * Tasks on involuntary sleep don't earn 1280 * Tasks on involuntary sleep don't earn
1273 * sleep_avg beyond just interactive state. 1281 * sleep_avg beyond just interactive state.
1274 */ 1282 */
1275 p->activated = -1; 1283 p->sleep_type = SLEEP_NONINTERACTIVE;
1276 } 1284 } else
1277 1285
1278 /* 1286 /*
1279 * Tasks that have marked their sleep as noninteractive get 1287 * Tasks that have marked their sleep as noninteractive get
1280 * woken up without updating their sleep average. (i.e. their 1288 * woken up with their sleep average not weighted in an
1281 * sleep is handled in a priority-neutral manner, no priority 1289 * interactive way.
1282 * boost and no penalty.)
1283 */ 1290 */
1284 if (old_state & TASK_NONINTERACTIVE) 1291 if (old_state & TASK_NONINTERACTIVE)
1285 __activate_task(p, rq); 1292 p->sleep_type = SLEEP_NONINTERACTIVE;
1286 else 1293
1287 activate_task(p, rq, cpu == this_cpu); 1294
1295 activate_task(p, rq, cpu == this_cpu);
1288 /* 1296 /*
1289 * Sync wakeups (i.e. those types of wakeups where the waker 1297 * Sync wakeups (i.e. those types of wakeups where the waker
1290 * has indicated that it will leave the CPU in short order) 1298 * has indicated that it will leave the CPU in short order)
@@ -1658,6 +1666,21 @@ unsigned long nr_iowait(void)
1658 return sum; 1666 return sum;
1659} 1667}
1660 1668
1669unsigned long nr_active(void)
1670{
1671 unsigned long i, running = 0, uninterruptible = 0;
1672
1673 for_each_online_cpu(i) {
1674 running += cpu_rq(i)->nr_running;
1675 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1676 }
1677
1678 if (unlikely((long)uninterruptible < 0))
1679 uninterruptible = 0;
1680
1681 return running + uninterruptible;
1682}
1683
1661#ifdef CONFIG_SMP 1684#ifdef CONFIG_SMP
1662 1685
1663/* 1686/*
@@ -2860,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2860 2883
2861#endif 2884#endif
2862 2885
2886static inline int interactive_sleep(enum sleep_type sleep_type)
2887{
2888 return (sleep_type == SLEEP_INTERACTIVE ||
2889 sleep_type == SLEEP_INTERRUPTED);
2890}
2891
2863/* 2892/*
2864 * schedule() is the main scheduler function. 2893 * schedule() is the main scheduler function.
2865 */ 2894 */
@@ -2983,12 +3012,12 @@ go_idle:
2983 queue = array->queue + idx; 3012 queue = array->queue + idx;
2984 next = list_entry(queue->next, task_t, run_list); 3013 next = list_entry(queue->next, task_t, run_list);
2985 3014
2986 if (!rt_task(next) && next->activated > 0) { 3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
2987 unsigned long long delta = now - next->timestamp; 3016 unsigned long long delta = now - next->timestamp;
2988 if (unlikely((long long)(now - next->timestamp) < 0)) 3017 if (unlikely((long long)(now - next->timestamp) < 0))
2989 delta = 0; 3018 delta = 0;
2990 3019
2991 if (next->activated == 1) 3020 if (next->sleep_type == SLEEP_INTERACTIVE)
2992 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3021 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2993 3022
2994 array = next->array; 3023 array = next->array;
@@ -2998,10 +3027,9 @@ go_idle:
2998 dequeue_task(next, array); 3027 dequeue_task(next, array);
2999 next->prio = new_prio; 3028 next->prio = new_prio;
3000 enqueue_task(next, array); 3029 enqueue_task(next, array);
3001 } else 3030 }
3002 requeue_task(next, array);
3003 } 3031 }
3004 next->activated = 0; 3032 next->sleep_type = SLEEP_NORMAL;
3005switch_tasks: 3033switch_tasks:
3006 if (next == rq->idle) 3034 if (next == rq->idle)
3007 schedstat_inc(rq, sched_goidle); 3035 schedstat_inc(rq, sched_goidle);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4922928d91f6..92025b108791 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1560,6 +1560,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1560 /* Let the debugger run. */ 1560 /* Let the debugger run. */
1561 set_current_state(TASK_TRACED); 1561 set_current_state(TASK_TRACED);
1562 spin_unlock_irq(&current->sighand->siglock); 1562 spin_unlock_irq(&current->sighand->siglock);
1563 try_to_freeze();
1563 read_lock(&tasklist_lock); 1564 read_lock(&tasklist_lock);
1564 if (likely(current->ptrace & PT_PTRACED) && 1565 if (likely(current->ptrace & PT_PTRACED) &&
1565 likely(current->parent != current->real_parent || 1566 likely(current->parent != current->real_parent ||
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ef7f6054c28..0b6ec0e7936f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1372,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
1372asmlinkage long sys_setsid(void) 1372asmlinkage long sys_setsid(void)
1373{ 1373{
1374 struct task_struct *group_leader = current->group_leader; 1374 struct task_struct *group_leader = current->group_leader;
1375 struct pid *pid; 1375 pid_t session;
1376 int err = -EPERM; 1376 int err = -EPERM;
1377 1377
1378 mutex_lock(&tty_mutex); 1378 mutex_lock(&tty_mutex);
1379 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1380 1380
1381 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 /* Fail if I am already a session leader */
1382 if (pid) 1382 if (group_leader->signal->leader)
1383 goto out;
1384
1385 session = group_leader->pid;
1386 /* Fail if a process group id already exists that equals the
1387 * proposed session id.
1388 *
1389 * Don't check if session id == 1 because kernel threads use this
1390 * session id and so the check will always fail and make it so
1391 * init cannot successfully call setsid.
1392 */
1393 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
1383 goto out; 1394 goto out;
1384 1395
1385 group_leader->signal->leader = 1; 1396 group_leader->signal->leader = 1;
1386 __set_special_pids(group_leader->pid, group_leader->pid); 1397 __set_special_pids(session, session);
1387 group_leader->signal->tty = NULL; 1398 group_leader->signal->tty = NULL;
1388 group_leader->signal->tty_old_pgrp = 0; 1399 group_leader->signal->tty_old_pgrp = 0;
1389 err = process_group(group_leader); 1400 err = process_group(group_leader);
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187cb..6b812c04737b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -87,13 +82,14 @@ struct tvec_t_base_s {
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); 84static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
90static tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
91 87
92static inline void set_running_timer(tvec_base_t *base, 88static inline void set_running_timer(tvec_base_t *base,
93 struct timer_list *timer) 89 struct timer_list *timer)
94{ 90{
95#ifdef CONFIG_SMP 91#ifdef CONFIG_SMP
96 base->t_base.running_timer = timer; 92 base->running_timer = timer;
97#endif 93#endif
98} 94}
99 95
@@ -139,15 +135,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
139 list_add_tail(&timer->entry, vec); 135 list_add_tail(&timer->entry, vec);
140} 136}
141 137
142typedef struct timer_base_s timer_base_t;
143/*
144 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
145 * at compile time, and we need timer->base to lock the timer.
146 */
147timer_base_t __init_timer_base
148 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
149EXPORT_SYMBOL(__init_timer_base);
150
151/*** 138/***
152 * init_timer - initialize a timer. 139 * init_timer - initialize a timer.
153 * @timer: the timer to be initialized 140 * @timer: the timer to be initialized
@@ -158,7 +145,7 @@ EXPORT_SYMBOL(__init_timer_base);
158void fastcall init_timer(struct timer_list *timer) 145void fastcall init_timer(struct timer_list *timer)
159{ 146{
160 timer->entry.next = NULL; 147 timer->entry.next = NULL;
161 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; 148 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
162} 149}
163EXPORT_SYMBOL(init_timer); 150EXPORT_SYMBOL(init_timer);
164 151
@@ -174,7 +161,7 @@ static inline void detach_timer(struct timer_list *timer,
174} 161}
175 162
176/* 163/*
177 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 164 * We are using hashed locking: holding per_cpu(tvec_bases).lock
178 * means that all timers which are tied to this base via timer->base are 165 * means that all timers which are tied to this base via timer->base are
179 * locked, and the base itself is locked too. 166 * locked, and the base itself is locked too.
180 * 167 *
@@ -185,10 +172,10 @@ static inline void detach_timer(struct timer_list *timer,
185 * possible to set timer->base = NULL and drop the lock: the timer remains 172 * possible to set timer->base = NULL and drop the lock: the timer remains
186 * locked. 173 * locked.
187 */ 174 */
188static timer_base_t *lock_timer_base(struct timer_list *timer, 175static tvec_base_t *lock_timer_base(struct timer_list *timer,
189 unsigned long *flags) 176 unsigned long *flags)
190{ 177{
191 timer_base_t *base; 178 tvec_base_t *base;
192 179
193 for (;;) { 180 for (;;) {
194 base = timer->base; 181 base = timer->base;
@@ -205,8 +192,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
205 192
206int __mod_timer(struct timer_list *timer, unsigned long expires) 193int __mod_timer(struct timer_list *timer, unsigned long expires)
207{ 194{
208 timer_base_t *base; 195 tvec_base_t *base, *new_base;
209 tvec_base_t *new_base;
210 unsigned long flags; 196 unsigned long flags;
211 int ret = 0; 197 int ret = 0;
212 198
@@ -221,7 +207,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
221 207
222 new_base = __get_cpu_var(tvec_bases); 208 new_base = __get_cpu_var(tvec_bases);
223 209
224 if (base != &new_base->t_base) { 210 if (base != new_base) {
225 /* 211 /*
226 * We are trying to schedule the timer on the local CPU. 212 * We are trying to schedule the timer on the local CPU.
227 * However we can't change timer's base while it is running, 213 * However we can't change timer's base while it is running,
@@ -229,21 +215,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
229 * handler yet has not finished. This also guarantees that 215 * handler yet has not finished. This also guarantees that
230 * the timer is serialized wrt itself. 216 * the timer is serialized wrt itself.
231 */ 217 */
232 if (unlikely(base->running_timer == timer)) { 218 if (likely(base->running_timer != timer)) {
233 /* The timer remains on a former base */
234 new_base = container_of(base, tvec_base_t, t_base);
235 } else {
236 /* See the comment in lock_timer_base() */ 219 /* See the comment in lock_timer_base() */
237 timer->base = NULL; 220 timer->base = NULL;
238 spin_unlock(&base->lock); 221 spin_unlock(&base->lock);
239 spin_lock(&new_base->t_base.lock); 222 base = new_base;
240 timer->base = &new_base->t_base; 223 spin_lock(&base->lock);
224 timer->base = base;
241 } 225 }
242 } 226 }
243 227
244 timer->expires = expires; 228 timer->expires = expires;
245 internal_add_timer(new_base, timer); 229 internal_add_timer(base, timer);
246 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 230 spin_unlock_irqrestore(&base->lock, flags);
247 231
248 return ret; 232 return ret;
249} 233}
@@ -263,10 +247,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
263 unsigned long flags; 247 unsigned long flags;
264 248
265 BUG_ON(timer_pending(timer) || !timer->function); 249 BUG_ON(timer_pending(timer) || !timer->function);
266 spin_lock_irqsave(&base->t_base.lock, flags); 250 spin_lock_irqsave(&base->lock, flags);
267 timer->base = &base->t_base; 251 timer->base = base;
268 internal_add_timer(base, timer); 252 internal_add_timer(base, timer);
269 spin_unlock_irqrestore(&base->t_base.lock, flags); 253 spin_unlock_irqrestore(&base->lock, flags);
270} 254}
271 255
272 256
@@ -319,7 +303,7 @@ EXPORT_SYMBOL(mod_timer);
319 */ 303 */
320int del_timer(struct timer_list *timer) 304int del_timer(struct timer_list *timer)
321{ 305{
322 timer_base_t *base; 306 tvec_base_t *base;
323 unsigned long flags; 307 unsigned long flags;
324 int ret = 0; 308 int ret = 0;
325 309
@@ -346,7 +330,7 @@ EXPORT_SYMBOL(del_timer);
346 */ 330 */
347int try_to_del_timer_sync(struct timer_list *timer) 331int try_to_del_timer_sync(struct timer_list *timer)
348{ 332{
349 timer_base_t *base; 333 tvec_base_t *base;
350 unsigned long flags; 334 unsigned long flags;
351 int ret = -1; 335 int ret = -1;
352 336
@@ -410,7 +394,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
410 struct timer_list *tmp; 394 struct timer_list *tmp;
411 395
412 tmp = list_entry(curr, struct timer_list, entry); 396 tmp = list_entry(curr, struct timer_list, entry);
413 BUG_ON(tmp->base != &base->t_base); 397 BUG_ON(tmp->base != base);
414 curr = curr->next; 398 curr = curr->next;
415 internal_add_timer(base, tmp); 399 internal_add_timer(base, tmp);
416 } 400 }
@@ -432,7 +416,7 @@ static inline void __run_timers(tvec_base_t *base)
432{ 416{
433 struct timer_list *timer; 417 struct timer_list *timer;
434 418
435 spin_lock_irq(&base->t_base.lock); 419 spin_lock_irq(&base->lock);
436 while (time_after_eq(jiffies, base->timer_jiffies)) { 420 while (time_after_eq(jiffies, base->timer_jiffies)) {
437 struct list_head work_list = LIST_HEAD_INIT(work_list); 421 struct list_head work_list = LIST_HEAD_INIT(work_list);
438 struct list_head *head = &work_list; 422 struct list_head *head = &work_list;
@@ -458,7 +442,7 @@ static inline void __run_timers(tvec_base_t *base)
458 442
459 set_running_timer(base, timer); 443 set_running_timer(base, timer);
460 detach_timer(timer, 1); 444 detach_timer(timer, 1);
461 spin_unlock_irq(&base->t_base.lock); 445 spin_unlock_irq(&base->lock);
462 { 446 {
463 int preempt_count = preempt_count(); 447 int preempt_count = preempt_count();
464 fn(data); 448 fn(data);
@@ -471,11 +455,11 @@ static inline void __run_timers(tvec_base_t *base)
471 BUG(); 455 BUG();
472 } 456 }
473 } 457 }
474 spin_lock_irq(&base->t_base.lock); 458 spin_lock_irq(&base->lock);
475 } 459 }
476 } 460 }
477 set_running_timer(base, NULL); 461 set_running_timer(base, NULL);
478 spin_unlock_irq(&base->t_base.lock); 462 spin_unlock_irq(&base->lock);
479} 463}
480 464
481#ifdef CONFIG_NO_IDLE_HZ 465#ifdef CONFIG_NO_IDLE_HZ
@@ -506,7 +490,7 @@ unsigned long next_timer_interrupt(void)
506 hr_expires += jiffies; 490 hr_expires += jiffies;
507 491
508 base = __get_cpu_var(tvec_bases); 492 base = __get_cpu_var(tvec_bases);
509 spin_lock(&base->t_base.lock); 493 spin_lock(&base->lock);
510 expires = base->timer_jiffies + (LONG_MAX >> 1); 494 expires = base->timer_jiffies + (LONG_MAX >> 1);
511 list = NULL; 495 list = NULL;
512 496
@@ -554,7 +538,7 @@ found:
554 expires = nte->expires; 538 expires = nte->expires;
555 } 539 }
556 } 540 }
557 spin_unlock(&base->t_base.lock); 541 spin_unlock(&base->lock);
558 542
559 if (time_before(hr_expires, expires)) 543 if (time_before(hr_expires, expires))
560 return hr_expires; 544 return hr_expires;
@@ -841,7 +825,7 @@ void update_process_times(int user_tick)
841 */ 825 */
842static unsigned long count_active_tasks(void) 826static unsigned long count_active_tasks(void)
843{ 827{
844 return (nr_running() + nr_uninterruptible()) * FIXED_1; 828 return nr_active() * FIXED_1;
845} 829}
846 830
847/* 831/*
@@ -1262,7 +1246,7 @@ static int __devinit init_timers_cpu(int cpu)
1262 } 1246 }
1263 per_cpu(tvec_bases, cpu) = base; 1247 per_cpu(tvec_bases, cpu) = base;
1264 } 1248 }
1265 spin_lock_init(&base->t_base.lock); 1249 spin_lock_init(&base->lock);
1266 for (j = 0; j < TVN_SIZE; j++) { 1250 for (j = 0; j < TVN_SIZE; j++) {
1267 INIT_LIST_HEAD(base->tv5.vec + j); 1251 INIT_LIST_HEAD(base->tv5.vec + j);
1268 INIT_LIST_HEAD(base->tv4.vec + j); 1252 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1284,7 +1268,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1284 while (!list_empty(head)) { 1268 while (!list_empty(head)) {
1285 timer = list_entry(head->next, struct timer_list, entry); 1269 timer = list_entry(head->next, struct timer_list, entry);
1286 detach_timer(timer, 0); 1270 detach_timer(timer, 0);
1287 timer->base = &new_base->t_base; 1271 timer->base = new_base;
1288 internal_add_timer(new_base, timer); 1272 internal_add_timer(new_base, timer);
1289 } 1273 }
1290} 1274}
@@ -1300,11 +1284,11 @@ static void __devinit migrate_timers(int cpu)
1300 new_base = get_cpu_var(tvec_bases); 1284 new_base = get_cpu_var(tvec_bases);
1301 1285
1302 local_irq_disable(); 1286 local_irq_disable();
1303 spin_lock(&new_base->t_base.lock); 1287 spin_lock(&new_base->lock);
1304 spin_lock(&old_base->t_base.lock); 1288 spin_lock(&old_base->lock);
1289
1290 BUG_ON(old_base->running_timer);
1305 1291
1306 if (old_base->t_base.running_timer)
1307 BUG();
1308 for (i = 0; i < TVR_SIZE; i++) 1292 for (i = 0; i < TVR_SIZE; i++)
1309 migrate_timer_list(new_base, old_base->tv1.vec + i); 1293 migrate_timer_list(new_base, old_base->tv1.vec + i);
1310 for (i = 0; i < TVN_SIZE; i++) { 1294 for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,8 +1298,8 @@ static void __devinit migrate_timers(int cpu)
1314 migrate_timer_list(new_base, old_base->tv5.vec + i); 1298 migrate_timer_list(new_base, old_base->tv5.vec + i);
1315 } 1299 }
1316 1300
1317 spin_unlock(&old_base->t_base.lock); 1301 spin_unlock(&old_base->lock);
1318 spin_unlock(&new_base->t_base.lock); 1302 spin_unlock(&new_base->lock);
1319 local_irq_enable(); 1303 local_irq_enable();
1320 put_cpu_var(tvec_bases); 1304 put_cpu_var(tvec_bases);
1321} 1305}