Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c              |  26
-rw-r--r--  kernel/auditsc.c           |   6
-rw-r--r--  kernel/cpuset.c            |  22
-rw-r--r--  kernel/exit.c              |  75
-rw-r--r--  kernel/fork.c              |  81
-rw-r--r--  kernel/futex.c             |  10
-rw-r--r--  kernel/irq/proc.c          |   3
-rw-r--r--  kernel/kallsyms.c          |  16
-rw-r--r--  kernel/kexec.c             |   1
-rw-r--r--  kernel/kmod.c              |   2
-rw-r--r--  kernel/mutex.c             |   9
-rw-r--r--  kernel/nsproxy.c           |  42
-rw-r--r--  kernel/pid.c               |  75
-rw-r--r--  kernel/relay.c             |   4
-rw-r--r--  kernel/sched.c             | 511
-rw-r--r--  kernel/signal.c            |  13
-rw-r--r--  kernel/sys.c               |  23
-rw-r--r--  kernel/sysctl.c            | 387
-rw-r--r--  kernel/time/clocksource.c  |   8
-rw-r--r--  kernel/timer.c             | 148
-rw-r--r--  kernel/tsacct.c            |   9
-rw-r--r--  kernel/workqueue.c         |  21
22 files changed, 884 insertions(+), 608 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index dc12db8600..70d0d88e55 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
118 spin_unlock(&acct_globals.lock); 118 spin_unlock(&acct_globals.lock);
119 119
120 /* May block */ 120 /* May block */
121 if (vfs_statfs(file->f_dentry, &sbuf)) 121 if (vfs_statfs(file->f_path.dentry, &sbuf))
122 return res; 122 return res;
123 suspend = sbuf.f_blocks * SUSPEND; 123 suspend = sbuf.f_blocks * SUSPEND;
124 resume = sbuf.f_blocks * RESUME; 124 resume = sbuf.f_blocks * RESUME;
@@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file)
194 add_timer(&acct_globals.timer); 194 add_timer(&acct_globals.timer);
195 } 195 }
196 if (old_acct) { 196 if (old_acct) {
197 mnt_unpin(old_acct->f_vfsmnt); 197 mnt_unpin(old_acct->f_path.mnt);
198 spin_unlock(&acct_globals.lock); 198 spin_unlock(&acct_globals.lock);
199 do_acct_process(old_acct); 199 do_acct_process(old_acct);
200 filp_close(old_acct, NULL); 200 filp_close(old_acct, NULL);
@@ -212,7 +212,7 @@ static int acct_on(char *name)
212 if (IS_ERR(file)) 212 if (IS_ERR(file))
213 return PTR_ERR(file); 213 return PTR_ERR(file);
214 214
215 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 215 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
216 filp_close(file, NULL); 216 filp_close(file, NULL);
217 return -EACCES; 217 return -EACCES;
218 } 218 }
@@ -229,11 +229,11 @@ static int acct_on(char *name)
229 } 229 }
230 230
231 spin_lock(&acct_globals.lock); 231 spin_lock(&acct_globals.lock);
232 mnt_pin(file->f_vfsmnt); 232 mnt_pin(file->f_path.mnt);
233 acct_file_reopen(file); 233 acct_file_reopen(file);
234 spin_unlock(&acct_globals.lock); 234 spin_unlock(&acct_globals.lock);
235 235
236 mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ 236 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
237 237
238 return 0; 238 return 0;
239} 239}
@@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
283void acct_auto_close_mnt(struct vfsmount *m) 283void acct_auto_close_mnt(struct vfsmount *m)
284{ 284{
285 spin_lock(&acct_globals.lock); 285 spin_lock(&acct_globals.lock);
286 if (acct_globals.file && acct_globals.file->f_vfsmnt == m) 286 if (acct_globals.file && acct_globals.file->f_path.mnt == m)
287 acct_file_reopen(NULL); 287 acct_file_reopen(NULL);
288 spin_unlock(&acct_globals.lock); 288 spin_unlock(&acct_globals.lock);
289} 289}
@@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
299{ 299{
300 spin_lock(&acct_globals.lock); 300 spin_lock(&acct_globals.lock);
301 if (acct_globals.file && 301 if (acct_globals.file &&
302 acct_globals.file->f_vfsmnt->mnt_sb == sb) { 302 acct_globals.file->f_path.mnt->mnt_sb == sb) {
303 acct_file_reopen(NULL); 303 acct_file_reopen(NULL);
304 } 304 }
305 spin_unlock(&acct_globals.lock); 305 spin_unlock(&acct_globals.lock);
@@ -428,6 +428,7 @@ static void do_acct_process(struct file *file)
428 u64 elapsed; 428 u64 elapsed;
429 u64 run_time; 429 u64 run_time;
430 struct timespec uptime; 430 struct timespec uptime;
431 struct tty_struct *tty;
431 432
432 /* 433 /*
433 * First check to see if there is enough free_space to continue 434 * First check to see if there is enough free_space to continue
@@ -484,16 +485,9 @@ static void do_acct_process(struct file *file)
484 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->parent->tgid;
485#endif 486#endif
486 487
487 mutex_lock(&tty_mutex);
488 /* FIXME: Whoever is responsible for current->signal locking needs
489 to use the same locking all over the kernel and document it */
490 read_lock(&tasklist_lock);
491 ac.ac_tty = current->signal->tty ?
492 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
493 read_unlock(&tasklist_lock);
494 mutex_unlock(&tty_mutex);
495
496 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
489 tty = current->signal->tty;
490 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
497 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 491 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
498 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 492 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
499 ac.ac_flag = pacct->ac_flag; 493 ac.ac_flag = pacct->ac_flag;
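
The recurring edit in this file (and in auditsc.c, cpuset.c, fork.c, futex.c and relay.c further down) is the f_path conversion: struct file carries an embedded struct path, so the former f_dentry and f_vfsmnt fields are reached as f_path.dentry and f_path.mnt. A minimal sketch of the before/after accessors; the example_* helpers are invented for illustration and are not kernel APIs.

#include <linux/fs.h>

/* Minimal sketch of the accessor conversion applied throughout this
 * series; the example_* names are illustrative, not kernel APIs. */
static inline struct dentry *example_file_dentry(struct file *filp)
{
	return filp->f_path.dentry;		/* was: filp->f_dentry */
}

static inline struct vfsmount *example_file_mnt(struct file *filp)
{
	return filp->f_path.mnt;		/* was: filp->f_vfsmnt */
}

static inline struct inode *example_file_inode(struct file *filp)
{
	return filp->f_path.dentry->d_inode;	/* was: filp->f_dentry->d_inode */
}
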
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 40722e26de..298897559c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
781 if ((vma->vm_flags & VM_EXECUTABLE) && 781 if ((vma->vm_flags & VM_EXECUTABLE) &&
782 vma->vm_file) { 782 vma->vm_file) {
783 audit_log_d_path(ab, "exe=", 783 audit_log_d_path(ab, "exe=",
784 vma->vm_file->f_dentry, 784 vma->vm_file->f_path.dentry,
785 vma->vm_file->f_vfsmnt); 785 vma->vm_file->f_path.mnt);
786 break; 786 break;
787 } 787 }
788 vma = vma->vm_next; 788 vma = vma->vm_next;
@@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
826 context->return_code); 826 context->return_code);
827 827
828 mutex_lock(&tty_mutex); 828 mutex_lock(&tty_mutex);
829 read_lock(&tasklist_lock);
829 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 830 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
830 tty = tsk->signal->tty->name; 831 tty = tsk->signal->tty->name;
831 else 832 else
832 tty = "(none)"; 833 tty = "(none)";
834 read_unlock(&tasklist_lock);
833 audit_log_format(ab, 835 audit_log_format(ab,
834 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 836 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
835 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 837 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0a6b4d89f9..2c3b443147 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
413 * 413 *
414 * 414 *
415 * When reading/writing to a file: 415 * When reading/writing to a file:
416 * - the cpuset to use in file->f_dentry->d_parent->d_fsdata 416 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
417 * - the 'cftype' of the file is file->f_dentry->d_fsdata 417 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
418 */ 418 */
419 419
420struct cftype { 420struct cftype {
@@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
1284 const char __user *userbuf, 1284 const char __user *userbuf,
1285 size_t nbytes, loff_t *unused_ppos) 1285 size_t nbytes, loff_t *unused_ppos)
1286{ 1286{
1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1287 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1288 struct cftype *cft = __d_cft(file->f_dentry); 1288 struct cftype *cft = __d_cft(file->f_path.dentry);
1289 cpuset_filetype_t type = cft->private; 1289 cpuset_filetype_t type = cft->private;
1290 char *buffer; 1290 char *buffer;
1291 char *pathbuf = NULL; 1291 char *pathbuf = NULL;
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1367 size_t nbytes, loff_t *ppos) 1367 size_t nbytes, loff_t *ppos)
1368{ 1368{
1369 ssize_t retval = 0; 1369 ssize_t retval = 0;
1370 struct cftype *cft = __d_cft(file->f_dentry); 1370 struct cftype *cft = __d_cft(file->f_path.dentry);
1371 if (!cft) 1371 if (!cft)
1372 return -ENODEV; 1372 return -ENODEV;
1373 1373
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1418 size_t nbytes, loff_t *ppos) 1418 size_t nbytes, loff_t *ppos)
1419{ 1419{
1420 struct cftype *cft = __d_cft(file->f_dentry); 1420 struct cftype *cft = __d_cft(file->f_path.dentry);
1421 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1421 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1422 cpuset_filetype_t type = cft->private; 1422 cpuset_filetype_t type = cft->private;
1423 char *page; 1423 char *page;
1424 ssize_t retval = 0; 1424 ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
1476 loff_t *ppos) 1476 loff_t *ppos)
1477{ 1477{
1478 ssize_t retval = 0; 1478 ssize_t retval = 0;
1479 struct cftype *cft = __d_cft(file->f_dentry); 1479 struct cftype *cft = __d_cft(file->f_path.dentry);
1480 if (!cft) 1480 if (!cft)
1481 return -ENODEV; 1481 return -ENODEV;
1482 1482
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1498 if (err) 1498 if (err)
1499 return err; 1499 return err;
1500 1500
1501 cft = __d_cft(file->f_dentry); 1501 cft = __d_cft(file->f_path.dentry);
1502 if (!cft) 1502 if (!cft)
1503 return -ENODEV; 1503 return -ENODEV;
1504 if (cft->open) 1504 if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1511 1511
1512static int cpuset_file_release(struct inode *inode, struct file *file) 1512static int cpuset_file_release(struct inode *inode, struct file *file)
1513{ 1513{
1514 struct cftype *cft = __d_cft(file->f_dentry); 1514 struct cftype *cft = __d_cft(file->f_path.dentry);
1515 if (cft->release) 1515 if (cft->release)
1516 return cft->release(inode, file); 1516 return cft->release(inode, file);
1517 return 0; 1517 return 0;
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1700 */ 1700 */
1701static int cpuset_tasks_open(struct inode *unused, struct file *file) 1701static int cpuset_tasks_open(struct inode *unused, struct file *file)
1702{ 1702{
1703 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1703 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1704 struct ctr_struct *ctr; 1704 struct ctr_struct *ctr;
1705 pid_t *pidarray; 1705 pid_t *pidarray;
1706 int npids; 1706 int npids;
diff --git a/kernel/exit.c b/kernel/exit.c
index 4e3f919edc..122fadb972 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/tty.h> 15#include <linux/tty.h>
16#include <linux/namespace.h> 16#include <linux/mnt_namespace.h>
17#include <linux/key.h> 17#include <linux/key.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -22,6 +22,7 @@
22#include <linux/file.h> 22#include <linux/file.h>
23#include <linux/binfmts.h> 23#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h>
25#include <linux/ptrace.h> 26#include <linux/ptrace.h>
26#include <linux/profile.h> 27#include <linux/profile.h>
27#include <linux/mount.h> 28#include <linux/mount.h>
@@ -48,7 +49,6 @@
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49 50
50extern void sem_exit (void); 51extern void sem_exit (void);
51extern struct task_struct *child_reaper;
52 52
53static void exit_mm(struct task_struct * tsk); 53static void exit_mm(struct task_struct * tsk);
54 54
@@ -189,21 +189,18 @@ repeat:
189int session_of_pgrp(int pgrp) 189int session_of_pgrp(int pgrp)
190{ 190{
191 struct task_struct *p; 191 struct task_struct *p;
192 int sid = -1; 192 int sid = 0;
193 193
194 read_lock(&tasklist_lock); 194 read_lock(&tasklist_lock);
195 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 195
196 if (p->signal->session > 0) { 196 p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
197 sid = p->signal->session; 197 if (p == NULL)
198 goto out; 198 p = find_task_by_pid(pgrp);
199 } 199 if (p != NULL)
200 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 200 sid = process_session(p);
201 p = find_task_by_pid(pgrp); 201
202 if (p)
203 sid = p->signal->session;
204out:
205 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
206 203
207 return sid; 204 return sid;
208} 205}
209 206
@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
225 || p->exit_state 222 || p->exit_state
226 || is_init(p->real_parent)) 223 || is_init(p->real_parent))
227 continue; 224 continue;
228 if (process_group(p->real_parent) != pgrp 225 if (process_group(p->real_parent) != pgrp &&
229 && p->real_parent->signal->session == p->signal->session) { 226 process_session(p->real_parent) == process_session(p)) {
230 ret = 0; 227 ret = 0;
231 break; 228 break;
232 } 229 }
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
260} 257}
261 258
262/** 259/**
263 * reparent_to_init - Reparent the calling kernel thread to the init task. 260 * reparent_to_init - Reparent the calling kernel thread to the init task
261 * of the pid space that the thread belongs to.
264 * 262 *
265 * If a kernel thread is launched as a result of a system call, or if 263 * If a kernel thread is launched as a result of a system call, or if
266 * it ever exits, it should generally reparent itself to init so that 264 * it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
278 ptrace_unlink(current); 276 ptrace_unlink(current);
279 /* Reparent to init */ 277 /* Reparent to init */
280 remove_parent(current); 278 remove_parent(current);
281 current->parent = child_reaper; 279 current->parent = child_reaper(current);
282 current->real_parent = child_reaper; 280 current->real_parent = child_reaper(current);
283 add_parent(current); 281 add_parent(current);
284 282
285 /* Set the exit signal to SIGCHLD so we signal init on exit */ 283 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
302{ 300{
303 struct task_struct *curr = current->group_leader; 301 struct task_struct *curr = current->group_leader;
304 302
305 if (curr->signal->session != session) { 303 if (process_session(curr) != session) {
306 detach_pid(curr, PIDTYPE_SID); 304 detach_pid(curr, PIDTYPE_SID);
307 curr->signal->session = session; 305 set_signal_session(curr->signal, session);
308 attach_pid(curr, PIDTYPE_SID, session); 306 attach_pid(curr, PIDTYPE_SID, session);
309 } 307 }
310 if (process_group(curr) != pgrp) { 308 if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
314 } 312 }
315} 313}
316 314
317void set_special_pids(pid_t session, pid_t pgrp) 315static void set_special_pids(pid_t session, pid_t pgrp)
318{ 316{
319 write_lock_irq(&tasklist_lock); 317 write_lock_irq(&tasklist_lock);
320 __set_special_pids(session, pgrp); 318 __set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
384 exit_mm(current); 382 exit_mm(current);
385 383
386 set_special_pids(1, 1); 384 set_special_pids(1, 1);
387 mutex_lock(&tty_mutex); 385 proc_clear_tty(current);
388 current->signal->tty = NULL;
389 mutex_unlock(&tty_mutex);
390 386
391 /* Block and flush all signals */ 387 /* Block and flush all signals */
392 sigfillset(&blocked); 388 sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
429 for (;;) { 425 for (;;) {
430 unsigned long set; 426 unsigned long set;
431 i = j * __NFDBITS; 427 i = j * __NFDBITS;
432 if (i >= fdt->max_fdset || i >= fdt->max_fds) 428 if (i >= fdt->max_fds)
433 break; 429 break;
434 set = fdt->open_fds->fds_bits[j++]; 430 set = fdt->open_fds->fds_bits[j++];
435 while (set) { 431 while (set) {
@@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
470 * you can free files immediately. 466 * you can free files immediately.
471 */ 467 */
472 fdt = files_fdtable(files); 468 fdt = files_fdtable(files);
473 if (fdt == &files->fdtab) 469 if (fdt != &files->fdtab)
474 fdt->free_files = files;
475 else
476 kmem_cache_free(files_cachep, files); 470 kmem_cache_free(files_cachep, files);
477 free_fdtable(fdt); 471 call_rcu(&fdt->rcu, free_fdtable_rcu);
478 } 472 }
479} 473}
480 474
@@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
649 * outside, so the child pgrp is now orphaned. 643 * outside, so the child pgrp is now orphaned.
650 */ 644 */
651 if ((process_group(p) != process_group(father)) && 645 if ((process_group(p) != process_group(father)) &&
652 (p->signal->session == father->signal->session)) { 646 (process_session(p) == process_session(father))) {
653 int pgrp = process_group(p); 647 int pgrp = process_group(p);
654 648
655 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { 649 if (will_become_orphaned_pgrp(pgrp, NULL) &&
650 has_stopped_jobs(pgrp)) {
656 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); 651 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
657 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); 652 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
658 } 653 }
@@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
663 * When we die, we re-parent all our children. 658 * When we die, we re-parent all our children.
664 * Try to give them to another thread in our thread 659 * Try to give them to another thread in our thread
665 * group, and if no such member exists, give it to 660 * group, and if no such member exists, give it to
666 * the global child reaper process (ie "init") 661 * the child reaper process (ie "init") in our pid
662 * space.
667 */ 663 */
668static void 664static void
669forget_original_parent(struct task_struct *father, struct list_head *to_release) 665forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
674 do { 670 do {
675 reaper = next_thread(reaper); 671 reaper = next_thread(reaper);
676 if (reaper == father) { 672 if (reaper == father) {
677 reaper = child_reaper; 673 reaper = child_reaper(father);
678 break; 674 break;
679 } 675 }
680 } while (reaper->exit_state); 676 } while (reaper->exit_state);
@@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
786 t = tsk->real_parent; 782 t = tsk->real_parent;
787 783
788 if ((process_group(t) != process_group(tsk)) && 784 if ((process_group(t) != process_group(tsk)) &&
789 (t->signal->session == tsk->signal->session) && 785 (process_session(t) == process_session(tsk)) &&
790 will_become_orphaned_pgrp(process_group(tsk), tsk) && 786 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
791 has_stopped_jobs(process_group(tsk))) { 787 has_stopped_jobs(process_group(tsk))) {
792 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); 788 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -860,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
860 panic("Aiee, killing interrupt handler!"); 856 panic("Aiee, killing interrupt handler!");
861 if (unlikely(!tsk->pid)) 857 if (unlikely(!tsk->pid))
862 panic("Attempted to kill the idle task!"); 858 panic("Attempted to kill the idle task!");
863 if (unlikely(tsk == child_reaper)) 859 if (unlikely(tsk == child_reaper(tsk))) {
864 panic("Attempted to kill init!"); 860 if (tsk->nsproxy->pid_ns != &init_pid_ns)
861 tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
862 else
863 panic("Attempted to kill init!");
864 }
865
865 866
866 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 867 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
867 current->ptrace_message = code; 868 current->ptrace_message = code;
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f2e31ba33..d16c566eb6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/namespace.h> 21#include <linux/mnt_namespace.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/mempolicy.h> 23#include <linux/mempolicy.h>
24#include <linux/sem.h> 24#include <linux/sem.h>
@@ -36,6 +36,7 @@
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/jiffies.h> 37#include <linux/jiffies.h>
38#include <linux/futex.h> 38#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/ptrace.h> 41#include <linux/ptrace.h>
41#include <linux/mount.h> 42#include <linux/mount.h>
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
252 anon_vma_link(tmp); 253 anon_vma_link(tmp);
253 file = tmp->vm_file; 254 file = tmp->vm_file;
254 if (file) { 255 if (file) {
255 struct inode *inode = file->f_dentry->d_inode; 256 struct inode *inode = file->f_path.dentry->d_inode;
256 get_file(file); 257 get_file(file);
257 if (tmp->vm_flags & VM_DENYWRITE) 258 if (tmp->vm_flags & VM_DENYWRITE)
258 atomic_dec(&inode->i_writecount); 259 atomic_dec(&inode->i_writecount);
@@ -613,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
613 614
614static int count_open_files(struct fdtable *fdt) 615static int count_open_files(struct fdtable *fdt)
615{ 616{
616 int size = fdt->max_fdset; 617 int size = fdt->max_fds;
617 int i; 618 int i;
618 619
619 /* Find the last open fd */ 620 /* Find the last open fd */
@@ -640,12 +641,10 @@ static struct files_struct *alloc_files(void)
640 newf->next_fd = 0; 641 newf->next_fd = 0;
641 fdt = &newf->fdtab; 642 fdt = &newf->fdtab;
642 fdt->max_fds = NR_OPEN_DEFAULT; 643 fdt->max_fds = NR_OPEN_DEFAULT;
643 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
644 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 644 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
645 fdt->open_fds = (fd_set *)&newf->open_fds_init; 645 fdt->open_fds = (fd_set *)&newf->open_fds_init;
646 fdt->fd = &newf->fd_array[0]; 646 fdt->fd = &newf->fd_array[0];
647 INIT_RCU_HEAD(&fdt->rcu); 647 INIT_RCU_HEAD(&fdt->rcu);
648 fdt->free_files = NULL;
649 fdt->next = NULL; 648 fdt->next = NULL;
650 rcu_assign_pointer(newf->fdt, fdt); 649 rcu_assign_pointer(newf->fdt, fdt);
651out: 650out:
@@ -661,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
661{ 660{
662 struct files_struct *newf; 661 struct files_struct *newf;
663 struct file **old_fds, **new_fds; 662 struct file **old_fds, **new_fds;
664 int open_files, size, i, expand; 663 int open_files, size, i;
665 struct fdtable *old_fdt, *new_fdt; 664 struct fdtable *old_fdt, *new_fdt;
666 665
667 *errorp = -ENOMEM; 666 *errorp = -ENOMEM;
@@ -672,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
672 spin_lock(&oldf->file_lock); 671 spin_lock(&oldf->file_lock);
673 old_fdt = files_fdtable(oldf); 672 old_fdt = files_fdtable(oldf);
674 new_fdt = files_fdtable(newf); 673 new_fdt = files_fdtable(newf);
675 size = old_fdt->max_fdset;
676 open_files = count_open_files(old_fdt); 674 open_files = count_open_files(old_fdt);
677 expand = 0;
678 675
679 /* 676 /*
680 * Check whether we need to allocate a larger fd array or fd set. 677 * Check whether we need to allocate a larger fd array and fd set.
681 * Note: we're not a clone task, so the open count won't change. 678 * Note: we're not a clone task, so the open count won't change.
682 */ 679 */
683 if (open_files > new_fdt->max_fdset) {
684 new_fdt->max_fdset = 0;
685 expand = 1;
686 }
687 if (open_files > new_fdt->max_fds) { 680 if (open_files > new_fdt->max_fds) {
688 new_fdt->max_fds = 0; 681 new_fdt->max_fds = 0;
689 expand = 1;
690 }
691
692 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
693 if (expand) {
694 spin_unlock(&oldf->file_lock); 682 spin_unlock(&oldf->file_lock);
695 spin_lock(&newf->file_lock); 683 spin_lock(&newf->file_lock);
696 *errorp = expand_files(newf, open_files-1); 684 *errorp = expand_files(newf, open_files-1);
@@ -710,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
710 old_fds = old_fdt->fd; 698 old_fds = old_fdt->fd;
711 new_fds = new_fdt->fd; 699 new_fds = new_fdt->fd;
712 700
713 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); 701 memcpy(new_fdt->open_fds->fds_bits,
714 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); 702 old_fdt->open_fds->fds_bits, open_files/8);
703 memcpy(new_fdt->close_on_exec->fds_bits,
704 old_fdt->close_on_exec->fds_bits, open_files/8);
715 705
716 for (i = open_files; i != 0; i--) { 706 for (i = open_files; i != 0; i--) {
717 struct file *f = *old_fds++; 707 struct file *f = *old_fds++;
@@ -736,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
736 /* This is long word aligned thus could use a optimized version */ 726 /* This is long word aligned thus could use a optimized version */
737 memset(new_fds, 0, size); 727 memset(new_fds, 0, size);
738 728
739 if (new_fdt->max_fdset > open_files) { 729 if (new_fdt->max_fds > open_files) {
740 int left = (new_fdt->max_fdset-open_files)/8; 730 int left = (new_fdt->max_fds-open_files)/8;
741 int start = open_files / (8 * sizeof(unsigned long)); 731 int start = open_files / (8 * sizeof(unsigned long));
742 732
743 memset(&new_fdt->open_fds->fds_bits[start], 0, left); 733 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
744 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); 734 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
745 } 735 }
746 736
747out:
748 return newf; 737 return newf;
749 738
750out_release: 739out_release:
751 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
752 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
753 free_fd_array(new_fdt->fd, new_fdt->max_fds);
754 kmem_cache_free(files_cachep, newf); 740 kmem_cache_free(files_cachep, newf);
741out:
755 return NULL; 742 return NULL;
756} 743}
757 744
@@ -1055,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1055 p->wchar = 0; /* I/O counter: bytes written */ 1042 p->wchar = 0; /* I/O counter: bytes written */
1056 p->syscr = 0; /* I/O counter: read syscalls */ 1043 p->syscr = 0; /* I/O counter: read syscalls */
1057 p->syscw = 0; /* I/O counter: write syscalls */ 1044 p->syscw = 0; /* I/O counter: write syscalls */
1045 task_io_accounting_init(p);
1058 acct_clear_integrals(p); 1046 acct_clear_integrals(p);
1059 1047
1060 p->it_virt_expires = cputime_zero; 1048 p->it_virt_expires = cputime_zero;
@@ -1259,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1259 if (thread_group_leader(p)) { 1247 if (thread_group_leader(p)) {
1260 p->signal->tty = current->signal->tty; 1248 p->signal->tty = current->signal->tty;
1261 p->signal->pgrp = process_group(current); 1249 p->signal->pgrp = process_group(current);
1262 p->signal->session = current->signal->session; 1250 set_signal_session(p->signal, process_session(current));
1263 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1251 attach_pid(p, PIDTYPE_PGID, process_group(p));
1264 attach_pid(p, PIDTYPE_SID, p->signal->session); 1252 attach_pid(p, PIDTYPE_SID, process_session(p));
1265 1253
1266 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1267 __get_cpu_var(process_counts)++; 1255 __get_cpu_var(process_counts)++;
@@ -1525,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1525} 1513}
1526 1514
1527/* 1515/*
1528 * Unshare the namespace structure if it is being shared 1516 * Unshare the mnt_namespace structure if it is being shared
1529 */ 1517 */
1530static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) 1518static int unshare_mnt_namespace(unsigned long unshare_flags,
1519 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1531{ 1520{
1532 struct namespace *ns = current->nsproxy->namespace; 1521 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1533 1522
1534 if ((unshare_flags & CLONE_NEWNS) && ns) { 1523 if ((unshare_flags & CLONE_NEWNS) && ns) {
1535 if (!capable(CAP_SYS_ADMIN)) 1524 if (!capable(CAP_SYS_ADMIN))
1536 return -EPERM; 1525 return -EPERM;
1537 1526
1538 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); 1527 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1539 if (!*new_nsp) 1528 if (!*new_nsp)
1540 return -ENOMEM; 1529 return -ENOMEM;
1541 } 1530 }
@@ -1544,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
1544} 1533}
1545 1534
1546/* 1535/*
1547 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not 1536 * Unsharing of sighand is not supported yet
1548 * supported yet
1549 */ 1537 */
1550static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1538static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1551{ 1539{
1552 struct sighand_struct *sigh = current->sighand; 1540 struct sighand_struct *sigh = current->sighand;
1553 1541
1554 if ((unshare_flags & CLONE_SIGHAND) && 1542 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1555 (sigh && atomic_read(&sigh->count) > 1))
1556 return -EINVAL; 1543 return -EINVAL;
1557 else 1544 else
1558 return 0; 1545 return 0;
@@ -1625,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1625{ 1612{
1626 int err = 0; 1613 int err = 0;
1627 struct fs_struct *fs, *new_fs = NULL; 1614 struct fs_struct *fs, *new_fs = NULL;
1628 struct namespace *ns, *new_ns = NULL; 1615 struct mnt_namespace *ns, *new_ns = NULL;
1629 struct sighand_struct *sigh, *new_sigh = NULL; 1616 struct sighand_struct *new_sigh = NULL;
1630 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1617 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1631 struct files_struct *fd, *new_fd = NULL; 1618 struct files_struct *fd, *new_fd = NULL;
1632 struct sem_undo_list *new_ulist = NULL; 1619 struct sem_undo_list *new_ulist = NULL;
@@ -1647,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1647 goto bad_unshare_out; 1634 goto bad_unshare_out;
1648 if ((err = unshare_fs(unshare_flags, &new_fs))) 1635 if ((err = unshare_fs(unshare_flags, &new_fs)))
1649 goto bad_unshare_cleanup_thread; 1636 goto bad_unshare_cleanup_thread;
1650 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) 1637 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1651 goto bad_unshare_cleanup_fs; 1638 goto bad_unshare_cleanup_fs;
1652 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1639 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1653 goto bad_unshare_cleanup_ns; 1640 goto bad_unshare_cleanup_ns;
@@ -1671,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1671 } 1658 }
1672 } 1659 }
1673 1660
1674 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || 1661 if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
1675 new_uts || new_ipc) { 1662 new_uts || new_ipc) {
1676 1663
1677 task_lock(current); 1664 task_lock(current);
@@ -1688,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1688 } 1675 }
1689 1676
1690 if (new_ns) { 1677 if (new_ns) {
1691 ns = current->nsproxy->namespace; 1678 ns = current->nsproxy->mnt_ns;
1692 current->nsproxy->namespace = new_ns; 1679 current->nsproxy->mnt_ns = new_ns;
1693 new_ns = ns; 1680 new_ns = ns;
1694 } 1681 }
1695 1682
1696 if (new_sigh) {
1697 sigh = current->sighand;
1698 rcu_assign_pointer(current->sighand, new_sigh);
1699 new_sigh = sigh;
1700 }
1701
1702 if (new_mm) { 1683 if (new_mm) {
1703 mm = current->mm; 1684 mm = current->mm;
1704 active_mm = current->active_mm; 1685 active_mm = current->active_mm;
@@ -1756,7 +1737,7 @@ bad_unshare_cleanup_sigh:
1756 1737
1757bad_unshare_cleanup_ns: 1738bad_unshare_cleanup_ns:
1758 if (new_ns) 1739 if (new_ns)
1759 put_namespace(new_ns); 1740 put_mnt_ns(new_ns);
1760 1741
1761bad_unshare_cleanup_fs: 1742bad_unshare_cleanup_fs:
1762 if (new_fs) 1743 if (new_fs)
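
With max_fdset gone, the open-fds bitmap and the fd array copied in dup_fd() are both bounded by max_fds, so the rounded last-open-fd value from count_open_files() is the only size the copy has to honour. A user-space sketch mirroring that rounding (the example_* names are invented):

#include <stdio.h>

#define EXAMPLE_BITS_PER_LONG	(8 * (int)sizeof(long))

/* Mirror of count_open_files(): find the last word of the open-fds
 * bitmap that has a bit set, then round up to a whole word of fds. */
static int example_count_open_files(const unsigned long *open_fds, int max_fds)
{
	int i;

	for (i = max_fds / EXAMPLE_BITS_PER_LONG; i > 0; ) {
		if (open_fds[--i])
			break;
	}
	return (i + 1) * EXAMPLE_BITS_PER_LONG;
}

int main(void)
{
	unsigned long bits[2] = { 0x7UL, 0UL };	/* fds 0, 1, 2 open */

	printf("%d\n", example_count_open_files(bits, 128));	/* 64 on LP64 */
	return 0;
}
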
diff --git a/kernel/futex.c b/kernel/futex.c
index 95989a3b41..5a737de857 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
166/* 166/*
167 * Get parameters which are the keys for a futex. 167 * Get parameters which are the keys for a futex.
168 * 168 *
169 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, 169 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
170 * offset_within_page). For private mappings, it's (uaddr, current->mm). 170 * offset_within_page). For private mappings, it's (uaddr, current->mm).
171 * We can usually work out the index without swapping in the page. 171 * We can usually work out the index without swapping in the page.
172 * 172 *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
223 /* 223 /*
224 * Linear file mappings are also simple. 224 * Linear file mappings are also simple.
225 */ 225 */
226 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
1528 goto out; 1528 goto out;
1529 } 1529 }
1530 filp->f_op = &futex_fops; 1530 filp->f_op = &futex_fops;
1531 filp->f_vfsmnt = mntget(futex_mnt); 1531 filp->f_path.mnt = mntget(futex_mnt);
1532 filp->f_dentry = dget(futex_mnt->mnt_root); 1532 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1533 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 1533 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1534 1534
1535 if (signal) { 1535 if (signal) {
1536 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); 1536 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a35266700..61f5c717a8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
54 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
55 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
56 56
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status))
58 return -EIO; 59 return -EIO;
59 60
60 err = cpumask_parse_user(buffer, count, new_value); 61 err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab63cfc429..6f294ff4f9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -31,14 +31,14 @@
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/* These will be re-linked against their real values during the second link stage */
34extern unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); 35extern const unsigned long kallsyms_num_syms __attribute__((weak));
36extern u8 kallsyms_names[] __attribute__((weak)); 36extern const u8 kallsyms_names[] __attribute__((weak));
37 37
38extern u8 kallsyms_token_table[] __attribute__((weak)); 38extern const u8 kallsyms_token_table[] __attribute__((weak));
39extern u16 kallsyms_token_index[] __attribute__((weak)); 39extern const u16 kallsyms_token_index[] __attribute__((weak));
40 40
41extern unsigned long kallsyms_markers[] __attribute__((weak)); 41extern const unsigned long kallsyms_markers[] __attribute__((weak));
42 42
43static inline int is_kernel_inittext(unsigned long addr) 43static inline int is_kernel_inittext(unsigned long addr)
44{ 44{
@@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
84static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 84static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
85{ 85{
86 int len, skipped_first = 0; 86 int len, skipped_first = 0;
87 u8 *tptr, *data; 87 const u8 *tptr, *data;
88 88
89 /* get the compressed symbol length from the first symbol byte */ 89 /* get the compressed symbol length from the first symbol byte */
90 data = &kallsyms_names[off]; 90 data = &kallsyms_names[off];
@@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
132 * kallsyms array */ 132 * kallsyms array */
133static unsigned int get_symbol_offset(unsigned long pos) 133static unsigned int get_symbol_offset(unsigned long pos)
134{ 134{
135 u8 *name; 135 const u8 *name;
136 int i; 136 int i;
137 137
138 /* use the closest marker we have. We have markers every 256 positions, 138 /* use the closest marker we have. We have markers every 256 positions,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index afbbbe981b..2a59c8a01a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -852,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
852 memset(ptr + uchunk, 0, mchunk - uchunk); 852 memset(ptr + uchunk, 0, mchunk - uchunk);
853 } 853 }
854 result = copy_from_user(ptr, buf, uchunk); 854 result = copy_from_user(ptr, buf, uchunk);
855 kexec_flush_icache_page(page);
855 kunmap(page); 856 kunmap(page);
856 if (result) { 857 if (result) {
857 result = (result < 0) ? result : -EIO; 858 result = (result < 0) ? result : -EIO;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8d2bea09a4..3a7379aa31 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/namespace.h> 28#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/workqueue.h> 31#include <linux/workqueue.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a4..e7cbbb8276 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
206} 206}
207 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested); 208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209
210int __sched
211mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
212{
213 might_sleep();
214 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
215}
216
217EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
209#endif 218#endif
210 219
211/* 220/*
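
mutex_lock_interruptible_nested() added above pairs the lockdep subclass annotation of mutex_lock_nested() with the interruptible sleep of mutex_lock_interruptible(), so it returns non-zero when a signal cuts the wait short. A hedged usage sketch; example_lock_pair and the subclass constant are invented for illustration:

#include <linux/mutex.h>

#define EXAMPLE_SUBCLASS_CHILD	1	/* illustrative lockdep subclass */

static int example_lock_pair(struct mutex *parent, struct mutex *child)
{
	int err;

	mutex_lock(parent);
	/* nested acquisition annotated for lockdep, sleep is interruptible */
	err = mutex_lock_interruptible_nested(child, EXAMPLE_SUBCLASS_CHILD);
	if (err) {			/* a signal arrived while waiting */
		mutex_unlock(parent);
		return err;
	}
	/* ... work under both locks ... */
	mutex_unlock(child);
	mutex_unlock(parent);
	return 0;
}
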
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb733..e2ce748e96 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
17#include <linux/version.h> 17#include <linux/version.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h>
22 23
23struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
24 25
@@ -45,8 +46,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
45 struct nsproxy *ns; 46 struct nsproxy *ns;
46 47
47 ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); 48 ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
48 if (ns) 49 if (ns) {
49 atomic_set(&ns->count, 1); 50 atomic_set(&ns->count, 1);
51 ns->id = -1;
52 }
50 return ns; 53 return ns;
51} 54}
52 55
@@ -60,12 +63,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
60 struct nsproxy *ns = clone_namespaces(orig); 63 struct nsproxy *ns = clone_namespaces(orig);
61 64
62 if (ns) { 65 if (ns) {
63 if (ns->namespace) 66 if (ns->mnt_ns)
64 get_namespace(ns->namespace); 67 get_mnt_ns(ns->mnt_ns);
65 if (ns->uts_ns) 68 if (ns->uts_ns)
66 get_uts_ns(ns->uts_ns); 69 get_uts_ns(ns->uts_ns);
67 if (ns->ipc_ns) 70 if (ns->ipc_ns)
68 get_ipc_ns(ns->ipc_ns); 71 get_ipc_ns(ns->ipc_ns);
72 if (ns->pid_ns)
73 get_pid_ns(ns->pid_ns);
69 } 74 }
70 75
71 return ns; 76 return ns;
@@ -97,7 +102,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
97 102
98 tsk->nsproxy = new_ns; 103 tsk->nsproxy = new_ns;
99 104
100 err = copy_namespace(flags, tsk); 105 err = copy_mnt_ns(flags, tsk);
101 if (err) 106 if (err)
102 goto out_ns; 107 goto out_ns;
103 108
@@ -109,16 +114,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
109 if (err) 114 if (err)
110 goto out_ipc; 115 goto out_ipc;
111 116
117 err = copy_pid_ns(flags, tsk);
118 if (err)
119 goto out_pid;
120
112out: 121out:
113 put_nsproxy(old_ns); 122 put_nsproxy(old_ns);
114 return err; 123 return err;
115 124
125out_pid:
126 if (new_ns->ipc_ns)
127 put_ipc_ns(new_ns->ipc_ns);
116out_ipc: 128out_ipc:
117 if (new_ns->uts_ns) 129 if (new_ns->uts_ns)
118 put_uts_ns(new_ns->uts_ns); 130 put_uts_ns(new_ns->uts_ns);
119out_uts: 131out_uts:
120 if (new_ns->namespace) 132 if (new_ns->mnt_ns)
121 put_namespace(new_ns->namespace); 133 put_mnt_ns(new_ns->mnt_ns);
122out_ns: 134out_ns:
123 tsk->nsproxy = old_ns; 135 tsk->nsproxy = old_ns;
124 kfree(new_ns); 136 kfree(new_ns);
@@ -127,11 +139,13 @@ out_ns:
127 139
128void free_nsproxy(struct nsproxy *ns) 140void free_nsproxy(struct nsproxy *ns)
129{ 141{
130 if (ns->namespace) 142 if (ns->mnt_ns)
131 put_namespace(ns->namespace); 143 put_mnt_ns(ns->mnt_ns);
132 if (ns->uts_ns) 144 if (ns->uts_ns)
133 put_uts_ns(ns->uts_ns); 145 put_uts_ns(ns->uts_ns);
134 if (ns->ipc_ns) 146 if (ns->ipc_ns)
135 put_ipc_ns(ns->ipc_ns); 147 put_ipc_ns(ns->ipc_ns);
136 kfree(ns); 148 if (ns->pid_ns)
149 put_pid_ns(ns->pid_ns);
150 kfree(ns);
137} 151}
diff --git a/kernel/pid.c b/kernel/pid.c
index a48879b0b9..2efe9d8d36 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,7 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/pspace.h> 29#include <linux/pid_namespace.h>
30 30
31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
32static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
43#define BITS_PER_PAGE (PAGE_SIZE*8) 43#define BITS_PER_PAGE (PAGE_SIZE*8)
44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
45 45
46static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) 46static inline int mk_pid(struct pid_namespace *pid_ns,
47 struct pidmap *map, int off)
47{ 48{
48 return (map - pspace->pidmap)*BITS_PER_PAGE + off; 49 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
49} 50}
50 51
51#define find_next_offset(map, off) \ 52#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
57 * value does not cause lots of bitmaps to be allocated, but 58 * value does not cause lots of bitmaps to be allocated, but
58 * the scheme scales to up to 4 million PIDs, runtime. 59 * the scheme scales to up to 4 million PIDs, runtime.
59 */ 60 */
60struct pspace init_pspace = { 61struct pid_namespace init_pid_ns = {
62 .kref = {
63 .refcount = ATOMIC_INIT(2),
64 },
61 .pidmap = { 65 .pidmap = {
62 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 66 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
63 }, 67 },
64 .last_pid = 0 68 .last_pid = 0,
69 .child_reaper = &init_task
65}; 70};
66 71
67/* 72/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
80 85
81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 86static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
82 87
83static fastcall void free_pidmap(struct pspace *pspace, int pid) 88static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
84{ 89{
85 struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; 90 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
86 int offset = pid & BITS_PER_PAGE_MASK; 91 int offset = pid & BITS_PER_PAGE_MASK;
87 92
88 clear_bit(offset, map->page); 93 clear_bit(offset, map->page);
89 atomic_inc(&map->nr_free); 94 atomic_inc(&map->nr_free);
90} 95}
91 96
92static int alloc_pidmap(struct pspace *pspace) 97static int alloc_pidmap(struct pid_namespace *pid_ns)
93{ 98{
94 int i, offset, max_scan, pid, last = pspace->last_pid; 99 int i, offset, max_scan, pid, last = pid_ns->last_pid;
95 struct pidmap *map; 100 struct pidmap *map;
96 101
97 pid = last + 1; 102 pid = last + 1;
98 if (pid >= pid_max) 103 if (pid >= pid_max)
99 pid = RESERVED_PIDS; 104 pid = RESERVED_PIDS;
100 offset = pid & BITS_PER_PAGE_MASK; 105 offset = pid & BITS_PER_PAGE_MASK;
101 map = &pspace->pidmap[pid/BITS_PER_PAGE]; 106 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
102 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 107 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
103 for (i = 0; i <= max_scan; ++i) { 108 for (i = 0; i <= max_scan; ++i) {
104 if (unlikely(!map->page)) { 109 if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
120 do { 125 do {
121 if (!test_and_set_bit(offset, map->page)) { 126 if (!test_and_set_bit(offset, map->page)) {
122 atomic_dec(&map->nr_free); 127 atomic_dec(&map->nr_free);
123 pspace->last_pid = pid; 128 pid_ns->last_pid = pid;
124 return pid; 129 return pid;
125 } 130 }
126 offset = find_next_offset(map, offset); 131 offset = find_next_offset(map, offset);
127 pid = mk_pid(pspace, map, offset); 132 pid = mk_pid(pid_ns, map, offset);
128 /* 133 /*
129 * find_next_offset() found a bit, the pid from it 134 * find_next_offset() found a bit, the pid from it
130 * is in-bounds, and if we fell back to the last 135 * is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
135 (i != max_scan || pid < last || 140 (i != max_scan || pid < last ||
136 !((last+1) & BITS_PER_PAGE_MASK))); 141 !((last+1) & BITS_PER_PAGE_MASK)));
137 } 142 }
138 if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 143 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
139 ++map; 144 ++map;
140 offset = 0; 145 offset = 0;
141 } else { 146 } else {
142 map = &pspace->pidmap[0]; 147 map = &pid_ns->pidmap[0];
143 offset = RESERVED_PIDS; 148 offset = RESERVED_PIDS;
144 if (unlikely(last == offset)) 149 if (unlikely(last == offset))
145 break; 150 break;
146 } 151 }
147 pid = mk_pid(pspace, map, offset); 152 pid = mk_pid(pid_ns, map, offset);
148 } 153 }
149 return -1; 154 return -1;
150} 155}
151 156
152static int next_pidmap(struct pspace *pspace, int last) 157static int next_pidmap(struct pid_namespace *pid_ns, int last)
153{ 158{
154 int offset; 159 int offset;
155 struct pidmap *map, *end; 160 struct pidmap *map, *end;
156 161
157 offset = (last + 1) & BITS_PER_PAGE_MASK; 162 offset = (last + 1) & BITS_PER_PAGE_MASK;
158 map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; 163 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
159 end = &pspace->pidmap[PIDMAP_ENTRIES]; 164 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
160 for (; map < end; map++, offset = 0) { 165 for (; map < end; map++, offset = 0) {
161 if (unlikely(!map->page)) 166 if (unlikely(!map->page))
162 continue; 167 continue;
163 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 168 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
164 if (offset < BITS_PER_PAGE) 169 if (offset < BITS_PER_PAGE)
165 return mk_pid(pspace, map, offset); 170 return mk_pid(pid_ns, map, offset);
166 } 171 }
167 return -1; 172 return -1;
168} 173}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
192 hlist_del_rcu(&pid->pid_chain); 197 hlist_del_rcu(&pid->pid_chain);
193 spin_unlock_irqrestore(&pidmap_lock, flags); 198 spin_unlock_irqrestore(&pidmap_lock, flags);
194 199
195 free_pidmap(&init_pspace, pid->nr); 200 free_pidmap(current->nsproxy->pid_ns, pid->nr);
196 call_rcu(&pid->rcu, delayed_put_pid); 201 call_rcu(&pid->rcu, delayed_put_pid);
197} 202}
198 203
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
206 if (!pid) 211 if (!pid)
207 goto out; 212 goto out;
208 213
209 nr = alloc_pidmap(&init_pspace); 214 nr = alloc_pidmap(current->nsproxy->pid_ns);
210 if (nr < 0) 215 if (nr < 0)
211 goto out_free; 216 goto out_free;
212 217
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
348 pid = find_pid(nr); 353 pid = find_pid(nr);
349 if (pid) 354 if (pid)
350 break; 355 break;
351 nr = next_pidmap(&init_pspace, nr); 356 nr = next_pidmap(current->nsproxy->pid_ns, nr);
352 } while (nr > 0); 357 } while (nr > 0);
353 358
354 return pid; 359 return pid;
355} 360}
356EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
357 362
363int copy_pid_ns(int flags, struct task_struct *tsk)
364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns);
372 return err;
373}
374
375void free_pid_ns(struct kref *kref)
376{
377 struct pid_namespace *ns;
378
379 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns);
381}
382
358/* 383/*
359 * The pid hash table is scaled according to the amount of memory in the 384 * The pid hash table is scaled according to the amount of memory in the
360 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 385 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
382 407
383void __init pidmap_init(void) 408void __init pidmap_init(void)
384{ 409{
385 init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 410 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
386 /* Reserve PID 0. We never call free_pidmap(0) */ 411 /* Reserve PID 0. We never call free_pidmap(0) */
387 set_bit(0, init_pspace.pidmap[0].page); 412 set_bit(0, init_pid_ns.pidmap[0].page);
388 atomic_dec(&init_pspace.pidmap[0].nr_free); 413 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
389 414
390 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 415 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
391 __alignof__(struct pid), 416 __alignof__(struct pid),
diff --git a/kernel/relay.c b/kernel/relay.c
index 75a3a9a7ef..818e514729 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
959 if (!desc->count) 959 if (!desc->count)
960 return 0; 960 return 0;
961 961
962 mutex_lock(&filp->f_dentry->d_inode->i_mutex); 962 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
963 do { 963 do {
964 if (!relay_file_read_avail(buf, *ppos)) 964 if (!relay_file_read_avail(buf, *ppos))
965 break; 965 break;
@@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
979 *ppos = relay_file_read_end_pos(buf, read_start, ret); 979 *ppos = relay_file_read_end_pos(buf, read_start, ret);
980 } 980 }
981 } while (desc->count && ret); 981 } while (desc->count && ret);
982 mutex_unlock(&filp->f_dentry->d_inode->i_mutex); 982 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
983 983
984 return desc->written; 984 return desc->written;
985} 985}
diff --git a/kernel/sched.c b/kernel/sched.c
index f385eff468..8a0afb97af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,8 +225,10 @@ struct rq {
225 unsigned long nr_uninterruptible; 225 unsigned long nr_uninterruptible;
226 226
227 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
228 unsigned long long timestamp_last_tick; 228 /* Cached timestamp set by update_cpu_clock() */
229 unsigned long long most_recent_timestamp;
229 struct task_struct *curr, *idle; 230 struct task_struct *curr, *idle;
231 unsigned long next_balance;
230 struct mm_struct *prev_mm; 232 struct mm_struct *prev_mm;
231 struct prio_array *active, *expired, arrays[2]; 233 struct prio_array *active, *expired, arrays[2];
232 int best_expired_prio; 234 int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
426 * bump this up when changing the output format or the meaning of an existing 428 * bump this up when changing the output format or the meaning of an existing
427 * format, so that tools can adapt (or abort) 429 * format, so that tools can adapt (or abort)
428 */ 430 */
429#define SCHEDSTAT_VERSION 12 431#define SCHEDSTAT_VERSION 14
430 432
431static int show_schedstat(struct seq_file *seq, void *v) 433static int show_schedstat(struct seq_file *seq, void *v)
432{ 434{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
464 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 466 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
465 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 467 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
466 itype++) { 468 itype++) {
467 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 469 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
470 "%lu",
468 sd->lb_cnt[itype], 471 sd->lb_cnt[itype],
469 sd->lb_balanced[itype], 472 sd->lb_balanced[itype],
470 sd->lb_failed[itype], 473 sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
474 sd->lb_nobusyq[itype], 477 sd->lb_nobusyq[itype],
475 sd->lb_nobusyg[itype]); 478 sd->lb_nobusyg[itype]);
476 } 479 }
477 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 480 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
481 " %lu %lu %lu\n",
478 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 482 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
479 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 483 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
480 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 484 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
481 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 485 sd->ttwu_wake_remote, sd->ttwu_move_affine,
486 sd->ttwu_move_balance);
482 } 487 }
483 preempt_enable(); 488 preempt_enable();
484#endif 489#endif
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547#endif 552#endif
548 553
549/* 554/*
550 * rq_lock - lock a given runqueue and disable interrupts. 555 * this_rq_lock - lock this runqueue and disable interrupts.
551 */ 556 */
552static inline struct rq *this_rq_lock(void) 557static inline struct rq *this_rq_lock(void)
553 __acquires(rq->lock) 558 __acquires(rq->lock)
@@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
938{ 943{
939 unsigned long long now; 944 unsigned long long now;
940 945
946 if (rt_task(p))
947 goto out;
948
941 now = sched_clock(); 949 now = sched_clock();
942#ifdef CONFIG_SMP 950#ifdef CONFIG_SMP
943 if (!local) { 951 if (!local) {
944 /* Compensate for drifting sched_clock */ 952 /* Compensate for drifting sched_clock */
945 struct rq *this_rq = this_rq(); 953 struct rq *this_rq = this_rq();
946 now = (now - this_rq->timestamp_last_tick) 954 now = (now - this_rq->most_recent_timestamp)
947 + rq->timestamp_last_tick; 955 + rq->most_recent_timestamp;
948 } 956 }
949#endif 957#endif
950 958
@@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
959 (now - p->timestamp) >> 20); 967 (now - p->timestamp) >> 20);
960 } 968 }
961 969
962 if (!rt_task(p)) 970 p->prio = recalc_task_prio(p, now);
963 p->prio = recalc_task_prio(p, now);
964 971
965 /* 972 /*
966 * This checks to make sure it's not an uninterruptible task 973 * This checks to make sure it's not an uninterruptible task
@@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
985 } 992 }
986 } 993 }
987 p->timestamp = now; 994 p->timestamp = now;
988 995out:
989 __activate_task(p, rq); 996 __activate_task(p, rq);
990} 997}
991 998
@@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1450 1457
1451 if (this_sd->flags & SD_WAKE_AFFINE) { 1458 if (this_sd->flags & SD_WAKE_AFFINE) {
1452 unsigned long tl = this_load; 1459 unsigned long tl = this_load;
1453 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); 1460 unsigned long tl_per_task;
1461
1462 tl_per_task = cpu_avg_load_per_task(this_cpu);
1454 1463
1455 /* 1464 /*
1456 * If sync wakeup then subtract the (maximum possible) 1465 * If sync wakeup then subtract the (maximum possible)
@@ -1688,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1688 * Not the local CPU - must adjust timestamp. This should 1697 * Not the local CPU - must adjust timestamp. This should
1689 * get optimised away in the !CONFIG_SMP case. 1698 * get optimised away in the !CONFIG_SMP case.
1690 */ 1699 */
1691 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1700 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1692 + rq->timestamp_last_tick; 1701 + rq->most_recent_timestamp;
1693 __activate_task(p, rq); 1702 __activate_task(p, rq);
1694 if (TASK_PREEMPTS_CURR(p, rq)) 1703 if (TASK_PREEMPTS_CURR(p, rq))
1695 resched_task(rq->curr); 1704 resched_task(rq->curr);
@@ -1952,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1952 __acquires(rq1->lock) 1961 __acquires(rq1->lock)
1953 __acquires(rq2->lock) 1962 __acquires(rq2->lock)
1954{ 1963{
1964 BUG_ON(!irqs_disabled());
1955 if (rq1 == rq2) { 1965 if (rq1 == rq2) {
1956 spin_lock(&rq1->lock); 1966 spin_lock(&rq1->lock);
1957 __acquire(rq2->lock); /* Fake it out ;) */ 1967 __acquire(rq2->lock); /* Fake it out ;) */
@@ -1991,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1991 __acquires(busiest->lock) 2001 __acquires(busiest->lock)
1992 __acquires(this_rq->lock) 2002 __acquires(this_rq->lock)
1993{ 2003{
2004 if (unlikely(!irqs_disabled())) {
2005 /* printk() doesn't work good under rq->lock */
2006 spin_unlock(&this_rq->lock);
2007 BUG_ON(1);
2008 }
1994 if (unlikely(!spin_trylock(&busiest->lock))) { 2009 if (unlikely(!spin_trylock(&busiest->lock))) {
1995 if (busiest < this_rq) { 2010 if (busiest < this_rq) {
1996 spin_unlock(&this_rq->lock); 2011 spin_unlock(&this_rq->lock);
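Both double_rq_lock() and the slow path above take two runqueue locks in a fixed order (by address), so two CPUs balancing against each other cannot deadlock, and both now insist on interrupts being disabled. A minimal userspace analogue of the ordering rule (pthread mutexes standing in for runqueue locks; not the kernel code):

#include <pthread.h>

/* always take the lower-addressed lock first; a single global order
 * makes an AB/BA deadlock between two lockers impossible */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
	} else if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}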
@@ -2061,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2061 set_task_cpu(p, this_cpu); 2076 set_task_cpu(p, this_cpu);
2062 inc_nr_running(p, this_rq); 2077 inc_nr_running(p, this_rq);
2063 enqueue_task(p, this_array); 2078 enqueue_task(p, this_array);
2064 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2079 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2065 + this_rq->timestamp_last_tick; 2080 + this_rq->most_recent_timestamp;
2066 /* 2081 /*
2067 * Note that idle threads have a prio of MAX_PRIO, for this test 2082 * Note that idle threads have a prio of MAX_PRIO, for this test
2068 * to be always true for them. 2083 * to be always true for them.
@@ -2098,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2098 * 2) too many balance attempts have failed. 2113 * 2) too many balance attempts have failed.
2099 */ 2114 */
2100 2115
2101 if (sd->nr_balance_failed > sd->cache_nice_tries) 2116 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2117#ifdef CONFIG_SCHEDSTATS
2118 if (task_hot(p, rq->most_recent_timestamp, sd))
2119 schedstat_inc(sd, lb_hot_gained[idle]);
2120#endif
2102 return 1; 2121 return 1;
2122 }
2103 2123
2104 if (task_hot(p, rq->timestamp_last_tick, sd)) 2124 if (task_hot(p, rq->most_recent_timestamp, sd))
2105 return 0; 2125 return 0;
2106 return 1; 2126 return 1;
2107} 2127}
@@ -2199,11 +2219,6 @@ skip_queue:
2199 goto skip_bitmap; 2219 goto skip_bitmap;
2200 } 2220 }
2201 2221
2202#ifdef CONFIG_SCHEDSTATS
2203 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
2204 schedstat_inc(sd, lb_hot_gained[idle]);
2205#endif
2206
2207 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2222 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2208 pulled++; 2223 pulled++;
2209 rem_load_move -= tmp->load_weight; 2224 rem_load_move -= tmp->load_weight;
@@ -2241,7 +2256,7 @@ out:
2241static struct sched_group * 2256static struct sched_group *
2242find_busiest_group(struct sched_domain *sd, int this_cpu, 2257find_busiest_group(struct sched_domain *sd, int this_cpu,
2243 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2258 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2244 cpumask_t *cpus) 2259 cpumask_t *cpus, int *balance)
2245{ 2260{
2246 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2261 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2247 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2262 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2270,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2270 unsigned long load, group_capacity; 2285 unsigned long load, group_capacity;
2271 int local_group; 2286 int local_group;
2272 int i; 2287 int i;
2288 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2273 unsigned long sum_nr_running, sum_weighted_load; 2289 unsigned long sum_nr_running, sum_weighted_load;
2274 2290
2275 local_group = cpu_isset(this_cpu, group->cpumask); 2291 local_group = cpu_isset(this_cpu, group->cpumask);
2276 2292
2293 if (local_group)
2294 balance_cpu = first_cpu(group->cpumask);
2295
2277 /* Tally up the load of all CPUs in the group */ 2296 /* Tally up the load of all CPUs in the group */
2278 sum_weighted_load = sum_nr_running = avg_load = 0; 2297 sum_weighted_load = sum_nr_running = avg_load = 0;
2279 2298
@@ -2289,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2289 *sd_idle = 0; 2308 *sd_idle = 0;
2290 2309
2291 /* Bias balancing toward cpus of our domain */ 2310 /* Bias balancing toward cpus of our domain */
2292 if (local_group) 2311 if (local_group) {
2312 if (idle_cpu(i) && !first_idle_cpu) {
2313 first_idle_cpu = 1;
2314 balance_cpu = i;
2315 }
2316
2293 load = target_load(i, load_idx); 2317 load = target_load(i, load_idx);
2294 else 2318 } else
2295 load = source_load(i, load_idx); 2319 load = source_load(i, load_idx);
2296 2320
2297 avg_load += load; 2321 avg_load += load;
@@ -2299,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2299 sum_weighted_load += rq->raw_weighted_load; 2323 sum_weighted_load += rq->raw_weighted_load;
2300 } 2324 }
2301 2325
2326 /*
2327 * First idle cpu or the first cpu(busiest) in this sched group
2328 * is eligible for doing load balancing at this and above
2329 * domains.
2330 */
2331 if (local_group && balance_cpu != this_cpu && balance) {
2332 *balance = 0;
2333 goto ret;
2334 }
2335
2302 total_load += avg_load; 2336 total_load += avg_load;
2303 total_pwr += group->cpu_power; 2337 total_pwr += group->cpu_power;
2304 2338
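The new balance_cpu logic elects exactly one CPU per local group to do the balancing at this level: the first idle CPU if there is one, otherwise the group's first CPU; every other member returns with *balance cleared. A userspace model of the election (assumed data layout, not the kernel structures):

#include <stdbool.h>

struct cpu_state { int id; bool idle; };

static int pick_balance_cpu(const struct cpu_state *grp, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (grp[i].idle)
			return grp[i].id;	/* first idle CPU wins */
	return grp[0].id;			/* else the group's first CPU */
}

static bool should_balance(const struct cpu_state *grp, int n, int this_cpu)
{
	return pick_balance_cpu(grp, n) == this_cpu;
}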
@@ -2458,18 +2492,21 @@ small_imbalance:
2458 pwr_now /= SCHED_LOAD_SCALE; 2492 pwr_now /= SCHED_LOAD_SCALE;
2459 2493
2460 /* Amount of load we'd subtract */ 2494 /* Amount of load we'd subtract */
2461 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; 2495 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2496 busiest->cpu_power;
2462 if (max_load > tmp) 2497 if (max_load > tmp)
2463 pwr_move += busiest->cpu_power * 2498 pwr_move += busiest->cpu_power *
2464 min(busiest_load_per_task, max_load - tmp); 2499 min(busiest_load_per_task, max_load - tmp);
2465 2500
2466 /* Amount of load we'd add */ 2501 /* Amount of load we'd add */
2467 if (max_load*busiest->cpu_power < 2502 if (max_load * busiest->cpu_power <
2468 busiest_load_per_task*SCHED_LOAD_SCALE) 2503 busiest_load_per_task * SCHED_LOAD_SCALE)
2469 tmp = max_load*busiest->cpu_power/this->cpu_power; 2504 tmp = max_load * busiest->cpu_power / this->cpu_power;
2470 else 2505 else
2471 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; 2506 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2472 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); 2507 this->cpu_power;
2508 pwr_move += this->cpu_power *
2509 min(this_load_per_task, this_load + tmp);
2473 pwr_move /= SCHED_LOAD_SCALE; 2510 pwr_move /= SCHED_LOAD_SCALE;
2474 2511
2475 /* Move if we gain throughput */ 2512 /* Move if we gain throughput */
@@ -2490,8 +2527,8 @@ out_balanced:
2490 *imbalance = min_load_per_task; 2527 *imbalance = min_load_per_task;
2491 return group_min; 2528 return group_min;
2492 } 2529 }
2493ret:
2494#endif 2530#endif
2531ret:
2495 *imbalance = 0; 2532 *imbalance = 0;
2496 return NULL; 2533 return NULL;
2497} 2534}
@@ -2540,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2540/* 2577/*
2541 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2578 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2542 * tasks if there is an imbalance. 2579 * tasks if there is an imbalance.
2543 *
2544 * Called with this_rq unlocked.
2545 */ 2580 */
2546static int load_balance(int this_cpu, struct rq *this_rq, 2581static int load_balance(int this_cpu, struct rq *this_rq,
2547 struct sched_domain *sd, enum idle_type idle) 2582 struct sched_domain *sd, enum idle_type idle,
2583 int *balance)
2548{ 2584{
2549 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2585 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2550 struct sched_group *group; 2586 struct sched_group *group;
2551 unsigned long imbalance; 2587 unsigned long imbalance;
2552 struct rq *busiest; 2588 struct rq *busiest;
2553 cpumask_t cpus = CPU_MASK_ALL; 2589 cpumask_t cpus = CPU_MASK_ALL;
2590 unsigned long flags;
2554 2591
2555 /* 2592 /*
2556 * When power savings policy is enabled for the parent domain, idle 2593 * When power savings policy is enabled for the parent domain, idle
@@ -2566,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2566 2603
2567redo: 2604redo:
2568 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2605 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2569 &cpus); 2606 &cpus, balance);
2607
2608 if (*balance == 0)
2609 goto out_balanced;
2610
2570 if (!group) { 2611 if (!group) {
2571 schedstat_inc(sd, lb_nobusyg[idle]); 2612 schedstat_inc(sd, lb_nobusyg[idle]);
2572 goto out_balanced; 2613 goto out_balanced;
@@ -2590,11 +2631,13 @@ redo:
2590 * still unbalanced. nr_moved simply stays zero, so it is 2631 * still unbalanced. nr_moved simply stays zero, so it is
2591 * correctly treated as an imbalance. 2632 * correctly treated as an imbalance.
2592 */ 2633 */
2634 local_irq_save(flags);
2593 double_rq_lock(this_rq, busiest); 2635 double_rq_lock(this_rq, busiest);
2594 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2636 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2595 minus_1_or_zero(busiest->nr_running), 2637 minus_1_or_zero(busiest->nr_running),
2596 imbalance, sd, idle, &all_pinned); 2638 imbalance, sd, idle, &all_pinned);
2597 double_rq_unlock(this_rq, busiest); 2639 double_rq_unlock(this_rq, busiest);
2640 local_irq_restore(flags);
2598 2641
2599 /* All tasks on this runqueue were pinned by CPU affinity */ 2642 /* All tasks on this runqueue were pinned by CPU affinity */
2600 if (unlikely(all_pinned)) { 2643 if (unlikely(all_pinned)) {
@@ -2611,13 +2654,13 @@ redo:
2611 2654
2612 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2655 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2613 2656
2614 spin_lock(&busiest->lock); 2657 spin_lock_irqsave(&busiest->lock, flags);
2615 2658
2616 /* don't kick the migration_thread, if the curr 2659 /* don't kick the migration_thread, if the curr
2617 * task on busiest cpu can't be moved to this_cpu 2660 * task on busiest cpu can't be moved to this_cpu
2618 */ 2661 */
2619 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2662 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2620 spin_unlock(&busiest->lock); 2663 spin_unlock_irqrestore(&busiest->lock, flags);
2621 all_pinned = 1; 2664 all_pinned = 1;
2622 goto out_one_pinned; 2665 goto out_one_pinned;
2623 } 2666 }
@@ -2627,7 +2670,7 @@ redo:
2627 busiest->push_cpu = this_cpu; 2670 busiest->push_cpu = this_cpu;
2628 active_balance = 1; 2671 active_balance = 1;
2629 } 2672 }
2630 spin_unlock(&busiest->lock); 2673 spin_unlock_irqrestore(&busiest->lock, flags);
2631 if (active_balance) 2674 if (active_balance)
2632 wake_up_process(busiest->migration_thread); 2675 wake_up_process(busiest->migration_thread);
2633 2676
@@ -2706,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2706 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2749 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2707redo: 2750redo:
2708 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2751 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2709 &sd_idle, &cpus); 2752 &sd_idle, &cpus, NULL);
2710 if (!group) { 2753 if (!group) {
2711 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2754 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2712 goto out_balanced; 2755 goto out_balanced;
@@ -2766,14 +2809,28 @@ out_balanced:
2766static void idle_balance(int this_cpu, struct rq *this_rq) 2809static void idle_balance(int this_cpu, struct rq *this_rq)
2767{ 2810{
2768 struct sched_domain *sd; 2811 struct sched_domain *sd;
2812 int pulled_task = 0;
2813 unsigned long next_balance = jiffies + 60 * HZ;
2769 2814
2770 for_each_domain(this_cpu, sd) { 2815 for_each_domain(this_cpu, sd) {
2771 if (sd->flags & SD_BALANCE_NEWIDLE) { 2816 if (sd->flags & SD_BALANCE_NEWIDLE) {
2772 /* If we've pulled tasks over stop searching: */ 2817 /* If we've pulled tasks over stop searching: */
2773 if (load_balance_newidle(this_cpu, this_rq, sd)) 2818 pulled_task = load_balance_newidle(this_cpu,
2819 this_rq, sd);
2820 if (time_after(next_balance,
2821 sd->last_balance + sd->balance_interval))
2822 next_balance = sd->last_balance
2823 + sd->balance_interval;
2824 if (pulled_task)
2774 break; 2825 break;
2775 } 2826 }
2776 } 2827 }
2828 if (!pulled_task)
2829 /*
2830 * We are going idle. next_balance may be set based on
2831 * a busy processor. So reset next_balance.
2832 */
2833 this_rq->next_balance = next_balance;
2777} 2834}
2778 2835
2779/* 2836/*
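idle_balance() now remembers the earliest pending balance time across the domains it walked, so a CPU that goes idle still knows when rebalancing is next due. A simplified userspace model (it ignores jiffies wraparound, which the kernel handles with time_after(), and assumes HZ=100 for the 60-second default):

struct domain { unsigned long last_balance, balance_interval; };

static unsigned long earliest_next_balance(const struct domain *d, int n,
					   unsigned long now)
{
	unsigned long next = now + 60 * 100;	/* default: far in the future */
	int i;

	for (i = 0; i < n; i++)
		if (d[i].last_balance + d[i].balance_interval < next)
			next = d[i].last_balance + d[i].balance_interval;
	return next;
}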
@@ -2826,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2826 spin_unlock(&target_rq->lock); 2883 spin_unlock(&target_rq->lock);
2827} 2884}
2828 2885
2829/* 2886static void update_load(struct rq *this_rq)
2830 * rebalance_tick will get called every timer tick, on every CPU.
2831 *
2832 * It checks each scheduling domain to see if it is due to be balanced,
2833 * and initiates a balancing operation if so.
2834 *
2835 * Balancing parameters are set up in arch_init_sched_domains.
2836 */
2837
2838/* Don't have all balancing operations going off at once: */
2839static inline unsigned long cpu_offset(int cpu)
2840{ 2887{
2841 return jiffies + cpu * HZ / NR_CPUS; 2888 unsigned long this_load;
2842}
2843
2844static void
2845rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2846{
2847 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2848 struct sched_domain *sd;
2849 int i, scale; 2889 int i, scale;
2850 2890
2851 this_load = this_rq->raw_weighted_load; 2891 this_load = this_rq->raw_weighted_load;
@@ -2865,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2865 new_load += scale-1; 2905 new_load += scale-1;
2866 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2906 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2867 } 2907 }
2908}
2909
2910/*
2911 * run_rebalance_domains is triggered when needed from the scheduler tick.
2912 *
2913 * It checks each scheduling domain to see if it is due to be balanced,
2914 * and initiates a balancing operation if so.
2915 *
2916 * Balancing parameters are set up in arch_init_sched_domains.
2917 */
2918static DEFINE_SPINLOCK(balancing);
2919
2920static void run_rebalance_domains(struct softirq_action *h)
2921{
2922 int this_cpu = smp_processor_id(), balance = 1;
2923 struct rq *this_rq = cpu_rq(this_cpu);
2924 unsigned long interval;
2925 struct sched_domain *sd;
2926 /*
2927 * We are idle if there are no processes running. This
2928 * is valid even if we are the idle process (SMT).
2929 */
2930 enum idle_type idle = !this_rq->nr_running ?
2931 SCHED_IDLE : NOT_IDLE;
2932 /* Earliest time when we have to call run_rebalance_domains again */
2933 unsigned long next_balance = jiffies + 60*HZ;
2868 2934
2869 for_each_domain(this_cpu, sd) { 2935 for_each_domain(this_cpu, sd) {
2870 if (!(sd->flags & SD_LOAD_BALANCE)) 2936 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2879,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2879 if (unlikely(!interval)) 2945 if (unlikely(!interval))
2880 interval = 1; 2946 interval = 1;
2881 2947
2882 if (j - sd->last_balance >= interval) { 2948 if (sd->flags & SD_SERIALIZE) {
2883 if (load_balance(this_cpu, this_rq, sd, idle)) { 2949 if (!spin_trylock(&balancing))
2950 goto out;
2951 }
2952
2953 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2954 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2884 /* 2955 /*
2885 * We've pulled tasks over so either we're no 2956 * We've pulled tasks over so either we're no
2886 * longer idle, or one of our SMT siblings is 2957 * longer idle, or one of our SMT siblings is
@@ -2888,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2888 */ 2959 */
2889 idle = NOT_IDLE; 2960 idle = NOT_IDLE;
2890 } 2961 }
2891 sd->last_balance += interval; 2962 sd->last_balance = jiffies;
2892 } 2963 }
2964 if (sd->flags & SD_SERIALIZE)
2965 spin_unlock(&balancing);
2966out:
2967 if (time_after(next_balance, sd->last_balance + interval))
2968 next_balance = sd->last_balance + interval;
2969
2970 /*
2971 * Stop the load balance at this level. There is another
2972 * CPU in our sched group which is doing load balancing more
2973 * actively.
2974 */
2975 if (!balance)
2976 break;
2893 } 2977 }
2978 this_rq->next_balance = next_balance;
2894} 2979}
2895#else 2980#else
2896/* 2981/*
2897 * on UP we do not need to balance between CPUs: 2982 * on UP we do not need to balance between CPUs:
2898 */ 2983 */
2899static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2900{
2901}
2902static inline void idle_balance(int cpu, struct rq *rq) 2984static inline void idle_balance(int cpu, struct rq *rq)
2903{ 2985{
2904} 2986}
2905#endif 2987#endif
2906 2988
2907static inline int wake_priority_sleeper(struct rq *rq) 2989static inline void wake_priority_sleeper(struct rq *rq)
2908{ 2990{
2909 int ret = 0;
2910
2911#ifdef CONFIG_SCHED_SMT 2991#ifdef CONFIG_SCHED_SMT
2992 if (!rq->nr_running)
2993 return;
2994
2912 spin_lock(&rq->lock); 2995 spin_lock(&rq->lock);
2913 /* 2996 /*
2914 * If an SMT sibling task has been put to sleep for priority 2997 * If an SMT sibling task has been put to sleep for priority
2915 * reasons reschedule the idle task to see if it can now run. 2998 * reasons reschedule the idle task to see if it can now run.
2916 */ 2999 */
2917 if (rq->nr_running) { 3000 if (rq->nr_running)
2918 resched_task(rq->idle); 3001 resched_task(rq->idle);
2919 ret = 1;
2920 }
2921 spin_unlock(&rq->lock); 3002 spin_unlock(&rq->lock);
2922#endif 3003#endif
2923 return ret;
2924} 3004}
2925 3005
2926DEFINE_PER_CPU(struct kernel_stat, kstat); 3006DEFINE_PER_CPU(struct kernel_stat, kstat);
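Domains flagged SD_SERIALIZE are balanced by at most one CPU at a time: the balancer takes a global spinlock with a trylock and simply skips the domain when someone else already holds it, so nothing blocks inside the softirq. A minimal userspace analogue (pthread mutex in place of the "balancing" spinlock):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t balancing = PTHREAD_MUTEX_INITIALIZER;

/* returns false when another CPU is already balancing a serialized domain */
static bool balance_serialized(void (*do_balance)(void))
{
	if (pthread_mutex_trylock(&balancing) != 0)
		return false;	/* skip this round, do not wait */
	do_balance();
	pthread_mutex_unlock(&balancing);
	return true;
}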
@@ -2934,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2934static inline void 3014static inline void
2935update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3015update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2936{ 3016{
2937 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); 3017 p->sched_time += now - p->last_ran;
3018 p->last_ran = rq->most_recent_timestamp = now;
2938} 3019}
2939 3020
2940/* 3021/*
@@ -2947,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
2947 unsigned long flags; 3028 unsigned long flags;
2948 3029
2949 local_irq_save(flags); 3030 local_irq_save(flags);
2950 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); 3031 ns = p->sched_time + sched_clock() - p->last_ran;
2951 ns = p->sched_time + sched_clock() - ns;
2952 local_irq_restore(flags); 3032 local_irq_restore(flags);
2953 3033
2954 return ns; 3034 return ns;
@@ -3048,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3048 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3128 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3049} 3129}
3050 3130
3051/* 3131static void task_running_tick(struct rq *rq, struct task_struct *p)
3052 * This function gets called by the timer code, with HZ frequency.
3053 * We call it with interrupts disabled.
3054 *
3055 * It also gets called by the fork code, when changing the parent's
3056 * timeslices.
3057 */
3058void scheduler_tick(void)
3059{ 3132{
3060 unsigned long long now = sched_clock();
3061 struct task_struct *p = current;
3062 int cpu = smp_processor_id();
3063 struct rq *rq = cpu_rq(cpu);
3064
3065 update_cpu_clock(p, rq, now);
3066
3067 rq->timestamp_last_tick = now;
3068
3069 if (p == rq->idle) {
3070 if (wake_priority_sleeper(rq))
3071 goto out;
3072 rebalance_tick(cpu, rq, SCHED_IDLE);
3073 return;
3074 }
3075
3076 /* Task might have expired already, but not scheduled off yet */
3077 if (p->array != rq->active) { 3133 if (p->array != rq->active) {
3134 /* Task has expired but was not scheduled yet */
3078 set_tsk_need_resched(p); 3135 set_tsk_need_resched(p);
3079 goto out; 3136 return;
3080 } 3137 }
3081 spin_lock(&rq->lock); 3138 spin_lock(&rq->lock);
3082 /* 3139 /*
@@ -3144,8 +3201,34 @@ void scheduler_tick(void)
3144 } 3201 }
3145out_unlock: 3202out_unlock:
3146 spin_unlock(&rq->lock); 3203 spin_unlock(&rq->lock);
3147out: 3204}
3148 rebalance_tick(cpu, rq, NOT_IDLE); 3205
3206/*
3207 * This function gets called by the timer code, with HZ frequency.
3208 * We call it with interrupts disabled.
3209 *
3210 * It also gets called by the fork code, when changing the parent's
3211 * timeslices.
3212 */
3213void scheduler_tick(void)
3214{
3215 unsigned long long now = sched_clock();
3216 struct task_struct *p = current;
3217 int cpu = smp_processor_id();
3218 struct rq *rq = cpu_rq(cpu);
3219
3220 update_cpu_clock(p, rq, now);
3221
3222 if (p == rq->idle)
3223 /* Task on the idle queue */
3224 wake_priority_sleeper(rq);
3225 else
3226 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP
3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ);
3231#endif
3149} 3232}
3150 3233
3151#ifdef CONFIG_SCHED_SMT 3234#ifdef CONFIG_SCHED_SMT
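scheduler_tick() now only raises SCHED_SOFTIRQ once the runqueue's next_balance deadline has passed, using the wrap-safe jiffies comparison. A sketch of why that comparison survives jiffies wraparound (userspace model of the idea behind time_after_eq(), not the kernel macro):

#include <stdbool.h>

/* the unsigned subtraction wraps, and interpreting the difference as
 * signed keeps the test correct as long as the two values are within
 * half the counter range of each other */
static bool deadline_passed(unsigned long now, unsigned long deadline)
{
	return (long)(now - deadline) >= 0;
}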
@@ -3291,7 +3374,8 @@ void fastcall add_preempt_count(int val)
3291 /* 3374 /*
3292 * Spinlock count overflowing soon? 3375 * Spinlock count overflowing soon?
3293 */ 3376 */
3294 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3377 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3378 PREEMPT_MASK - 10);
3295} 3379}
3296EXPORT_SYMBOL(add_preempt_count); 3380EXPORT_SYMBOL(add_preempt_count);
3297 3381
@@ -4990,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4990 * afterwards, and pretending it was a local activate. 5074 * afterwards, and pretending it was a local activate.
4991 * This way is cleaner and logically correct. 5075 * This way is cleaner and logically correct.
4992 */ 5076 */
4993 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 5077 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4994 + rq_dest->timestamp_last_tick; 5078 + rq_dest->most_recent_timestamp;
4995 deactivate_task(p, rq_src); 5079 deactivate_task(p, rq_src);
4996 __activate_task(p, rq_dest); 5080 __activate_task(p, rq_dest);
4997 if (TASK_PREEMPTS_CURR(p, rq_dest)) 5081 if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5067,7 +5151,10 @@ wait_to_die:
5067} 5151}
5068 5152
5069#ifdef CONFIG_HOTPLUG_CPU 5153#ifdef CONFIG_HOTPLUG_CPU
5070/* Figure out where task on dead CPU should go, use force if neccessary. */ 5154/*
5155 * Figure out where task on dead CPU should go, use force if neccessary.
5156 * NOTE: interrupts should be disabled by the caller
5157 */
5071static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5158static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5072{ 5159{
5073 unsigned long flags; 5160 unsigned long flags;
@@ -5187,6 +5274,7 @@ void idle_task_exit(void)
5187 mmdrop(mm); 5274 mmdrop(mm);
5188} 5275}
5189 5276
5277/* called under rq->lock with disabled interrupts */
5190static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5278static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5191{ 5279{
5192 struct rq *rq = cpu_rq(dead_cpu); 5280 struct rq *rq = cpu_rq(dead_cpu);
@@ -5203,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5203 * Drop lock around migration; if someone else moves it, 5291 * Drop lock around migration; if someone else moves it,
5204 * that's OK. No task can be added to this CPU, so iteration is 5292 * that's OK. No task can be added to this CPU, so iteration is
5205 * fine. 5293 * fine.
5294 * NOTE: interrupts should be left disabled --dev@
5206 */ 5295 */
5207 spin_unlock_irq(&rq->lock); 5296 spin_unlock(&rq->lock);
5208 move_task_off_dead_cpu(dead_cpu, p); 5297 move_task_off_dead_cpu(dead_cpu, p);
5209 spin_lock_irq(&rq->lock); 5298 spin_lock(&rq->lock);
5210 5299
5211 put_task_struct(p); 5300 put_task_struct(p);
5212} 5301}
@@ -5359,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5359 if (!(sd->flags & SD_LOAD_BALANCE)) { 5448 if (!(sd->flags & SD_LOAD_BALANCE)) {
5360 printk("does not load-balance\n"); 5449 printk("does not load-balance\n");
5361 if (sd->parent) 5450 if (sd->parent)
5362 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 5451 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5452 " has parent");
5363 break; 5453 break;
5364 } 5454 }
5365 5455
5366 printk("span %s\n", str); 5456 printk("span %s\n", str);
5367 5457
5368 if (!cpu_isset(cpu, sd->span)) 5458 if (!cpu_isset(cpu, sd->span))
5369 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 5459 printk(KERN_ERR "ERROR: domain->span does not contain "
5460 "CPU%d\n", cpu);
5370 if (!cpu_isset(cpu, group->cpumask)) 5461 if (!cpu_isset(cpu, group->cpumask))
5371 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 5462 printk(KERN_ERR "ERROR: domain->groups does not contain"
5463 " CPU%d\n", cpu);
5372 5464
5373 printk(KERN_DEBUG); 5465 printk(KERN_DEBUG);
5374 for (i = 0; i < level + 2; i++) 5466 for (i = 0; i < level + 2; i++)
@@ -5383,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5383 5475
5384 if (!group->cpu_power) { 5476 if (!group->cpu_power) {
5385 printk("\n"); 5477 printk("\n");
5386 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 5478 printk(KERN_ERR "ERROR: domain->cpu_power not "
5479 "set\n");
5387 } 5480 }
5388 5481
5389 if (!cpus_weight(group->cpumask)) { 5482 if (!cpus_weight(group->cpumask)) {
@@ -5406,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5406 printk("\n"); 5499 printk("\n");
5407 5500
5408 if (!cpus_equal(sd->span, groupmask)) 5501 if (!cpus_equal(sd->span, groupmask))
5409 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5502 printk(KERN_ERR "ERROR: groups don't span "
5503 "domain->span\n");
5410 5504
5411 level++; 5505 level++;
5412 sd = sd->parent; 5506 sd = sd->parent;
5507 if (!sd)
5508 continue;
5413 5509
5414 if (sd) { 5510 if (!cpus_subset(groupmask, sd->span))
5415 if (!cpus_subset(groupmask, sd->span)) 5511 printk(KERN_ERR "ERROR: parent span is not a superset "
5416 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 5512 "of domain->span\n");
5417 }
5418 5513
5419 } while (sd); 5514 } while (sd);
5420} 5515}
@@ -5528,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str)
5528__setup ("isolcpus=", isolated_cpu_setup); 5623__setup ("isolcpus=", isolated_cpu_setup);
5529 5624
5530/* 5625/*
5531 * init_sched_build_groups takes an array of groups, the cpumask we wish 5626 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5532 * to span, and a pointer to a function which identifies what group a CPU 5627 * to a function which identifies what group(along with sched group) a CPU
5533 * belongs to. The return value of group_fn must be a valid index into the 5628 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5534 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 5629 * (due to the fact that we keep track of groups covered with a cpumask_t).
5535 * keep track of groups covered with a cpumask_t).
5536 * 5630 *
5537 * init_sched_build_groups will build a circular linked list of the groups 5631 * init_sched_build_groups will build a circular linked list of the groups
5538 * covered by the given span, and will set each group's ->cpumask correctly, 5632 * covered by the given span, and will set each group's ->cpumask correctly,
5539 * and ->cpu_power to 0. 5633 * and ->cpu_power to 0.
5540 */ 5634 */
5541static void 5635static void
5542init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5636init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5543 const cpumask_t *cpu_map, 5637 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5544 int (*group_fn)(int cpu, const cpumask_t *cpu_map)) 5638 struct sched_group **sg))
5545{ 5639{
5546 struct sched_group *first = NULL, *last = NULL; 5640 struct sched_group *first = NULL, *last = NULL;
5547 cpumask_t covered = CPU_MASK_NONE; 5641 cpumask_t covered = CPU_MASK_NONE;
5548 int i; 5642 int i;
5549 5643
5550 for_each_cpu_mask(i, span) { 5644 for_each_cpu_mask(i, span) {
5551 int group = group_fn(i, cpu_map); 5645 struct sched_group *sg;
5552 struct sched_group *sg = &groups[group]; 5646 int group = group_fn(i, cpu_map, &sg);
5553 int j; 5647 int j;
5554 5648
5555 if (cpu_isset(i, covered)) 5649 if (cpu_isset(i, covered))
@@ -5559,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5559 sg->cpu_power = 0; 5653 sg->cpu_power = 0;
5560 5654
5561 for_each_cpu_mask(j, span) { 5655 for_each_cpu_mask(j, span) {
5562 if (group_fn(j, cpu_map) != group) 5656 if (group_fn(j, cpu_map, NULL) != group)
5563 continue; 5657 continue;
5564 5658
5565 cpu_set(j, covered); 5659 cpu_set(j, covered);
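The group_fn callback now returns its group index and, when the caller asks for it, also hands back the matching sched_group through an out-pointer, so callers that only need the index (as in the inner loop above) pass NULL. A generic sketch of that contract (illustrative types, not the scheduler's):

#include <stddef.h>

struct item { int id; };

static struct item table[4];

/* return the index; fill *out only if the caller provided somewhere
 * to put the object */
static int lookup(int key, struct item **out)
{
	int idx = key % 4;

	if (out)
		*out = &table[idx];
	return idx;
}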
@@ -5733,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size);
5733 */ 5827 */
5734static void touch_cache(void *__cache, unsigned long __size) 5828static void touch_cache(void *__cache, unsigned long __size)
5735{ 5829{
5736 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5830 unsigned long size = __size / sizeof(long);
5737 chunk2 = 2*size/3; 5831 unsigned long chunk1 = size / 3;
5832 unsigned long chunk2 = 2 * size / 3;
5738 unsigned long *cache = __cache; 5833 unsigned long *cache = __cache;
5739 int i; 5834 int i;
5740 5835
@@ -5843,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5843 */ 5938 */
5844 measure_one(cache, size, cpu1, cpu2); 5939 measure_one(cache, size, cpu1, cpu2);
5845 for (i = 0; i < ITERATIONS; i++) 5940 for (i = 0; i < ITERATIONS; i++)
5846 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5941 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5847 5942
5848 measure_one(cache, size, cpu2, cpu1); 5943 measure_one(cache, size, cpu2, cpu1);
5849 for (i = 0; i < ITERATIONS; i++) 5944 for (i = 0; i < ITERATIONS; i++)
5850 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5945 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5851 5946
5852 /* 5947 /*
5853 * (We measure the non-migrating [cached] cost on both 5948 * (We measure the non-migrating [cached] cost on both
@@ -5857,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5857 5952
5858 measure_one(cache, size, cpu1, cpu1); 5953 measure_one(cache, size, cpu1, cpu1);
5859 for (i = 0; i < ITERATIONS; i++) 5954 for (i = 0; i < ITERATIONS; i++)
5860 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5955 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5861 5956
5862 measure_one(cache, size, cpu2, cpu2); 5957 measure_one(cache, size, cpu2, cpu2);
5863 for (i = 0; i < ITERATIONS; i++) 5958 for (i = 0; i < ITERATIONS; i++)
5864 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5959 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5865 5960
5866 /* 5961 /*
5867 * Get the per-iteration migration cost: 5962 * Get the per-iteration migration cost:
5868 */ 5963 */
5869 do_div(cost1, 2*ITERATIONS); 5964 do_div(cost1, 2 * ITERATIONS);
5870 do_div(cost2, 2*ITERATIONS); 5965 do_div(cost2, 2 * ITERATIONS);
5871 5966
5872 return cost1 - cost2; 5967 return cost1 - cost2;
5873} 5968}
@@ -5905,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5905 */ 6000 */
5906 cache = vmalloc(max_size); 6001 cache = vmalloc(max_size);
5907 if (!cache) { 6002 if (!cache) {
5908 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 6003 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5909 return 1000000; /* return 1 msec on very small boxen */ 6004 return 1000000; /* return 1 msec on very small boxen */
5910 } 6005 }
5911 6006
@@ -5930,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5930 avg_fluct = (avg_fluct + fluct)/2; 6025 avg_fluct = (avg_fluct + fluct)/2;
5931 6026
5932 if (migration_debug) 6027 if (migration_debug)
5933 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 6028 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6029 "(%8Ld %8Ld)\n",
5934 cpu1, cpu2, size, 6030 cpu1, cpu2, size,
5935 (long)cost / 1000000, 6031 (long)cost / 1000000,
5936 ((long)cost / 100000) % 10, 6032 ((long)cost / 100000) % 10,
@@ -6025,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
6025 -1 6121 -1
6026#endif 6122#endif
6027 ); 6123 );
6028 if (system_state == SYSTEM_BOOTING) { 6124 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6029 if (num_online_cpus() > 1) { 6125 printk("migration_cost=");
6030 printk("migration_cost="); 6126 for (distance = 0; distance <= max_distance; distance++) {
6031 for (distance = 0; distance <= max_distance; distance++) { 6127 if (distance)
6032 if (distance) 6128 printk(",");
6033 printk(","); 6129 printk("%ld", (long)migration_cost[distance] / 1000);
6034 printk("%ld", (long)migration_cost[distance] / 1000);
6035 }
6036 printk("\n");
6037 } 6130 }
6131 printk("\n");
6038 } 6132 }
6039 j1 = jiffies; 6133 j1 = jiffies;
6040 if (migration_debug) 6134 if (migration_debug)
6041 printk("migration: %ld seconds\n", (j1-j0)/HZ); 6135 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6042 6136
6043 /* 6137 /*
6044 * Move back to the original CPU. NUMA-Q gets confused 6138 * Move back to the original CPU. NUMA-Q gets confused
@@ -6135,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6135 */ 6229 */
6136#ifdef CONFIG_SCHED_SMT 6230#ifdef CONFIG_SCHED_SMT
6137static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6231static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6138static struct sched_group sched_group_cpus[NR_CPUS]; 6232static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6139 6233
6140static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) 6234static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6235 struct sched_group **sg)
6141{ 6236{
6237 if (sg)
6238 *sg = &per_cpu(sched_group_cpus, cpu);
6142 return cpu; 6239 return cpu;
6143} 6240}
6144#endif 6241#endif
@@ -6148,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6148 */ 6245 */
6149#ifdef CONFIG_SCHED_MC 6246#ifdef CONFIG_SCHED_MC
6150static DEFINE_PER_CPU(struct sched_domain, core_domains); 6247static DEFINE_PER_CPU(struct sched_domain, core_domains);
6151static struct sched_group sched_group_core[NR_CPUS]; 6248static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6152#endif 6249#endif
6153 6250
6154#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6251#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6155static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6252static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6253 struct sched_group **sg)
6156{ 6254{
6255 int group;
6157 cpumask_t mask = cpu_sibling_map[cpu]; 6256 cpumask_t mask = cpu_sibling_map[cpu];
6158 cpus_and(mask, mask, *cpu_map); 6257 cpus_and(mask, mask, *cpu_map);
6159 return first_cpu(mask); 6258 group = first_cpu(mask);
6259 if (sg)
6260 *sg = &per_cpu(sched_group_core, group);
6261 return group;
6160} 6262}
6161#elif defined(CONFIG_SCHED_MC) 6263#elif defined(CONFIG_SCHED_MC)
6162static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6264static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6265 struct sched_group **sg)
6163{ 6266{
6267 if (sg)
6268 *sg = &per_cpu(sched_group_core, cpu);
6164 return cpu; 6269 return cpu;
6165} 6270}
6166#endif 6271#endif
6167 6272
6168static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6273static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6169static struct sched_group sched_group_phys[NR_CPUS]; 6274static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6170 6275
6171static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) 6276static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6277 struct sched_group **sg)
6172{ 6278{
6279 int group;
6173#ifdef CONFIG_SCHED_MC 6280#ifdef CONFIG_SCHED_MC
6174 cpumask_t mask = cpu_coregroup_map(cpu); 6281 cpumask_t mask = cpu_coregroup_map(cpu);
6175 cpus_and(mask, mask, *cpu_map); 6282 cpus_and(mask, mask, *cpu_map);
6176 return first_cpu(mask); 6283 group = first_cpu(mask);
6177#elif defined(CONFIG_SCHED_SMT) 6284#elif defined(CONFIG_SCHED_SMT)
6178 cpumask_t mask = cpu_sibling_map[cpu]; 6285 cpumask_t mask = cpu_sibling_map[cpu];
6179 cpus_and(mask, mask, *cpu_map); 6286 cpus_and(mask, mask, *cpu_map);
6180 return first_cpu(mask); 6287 group = first_cpu(mask);
6181#else 6288#else
6182 return cpu; 6289 group = cpu;
6183#endif 6290#endif
6291 if (sg)
6292 *sg = &per_cpu(sched_group_phys, group);
6293 return group;
6184} 6294}
6185 6295
6186#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
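The statically sized sched_group arrays become per-CPU variables, and each cpu_to_*_group() helper now resolves both the group index and the matching per-CPU object. A userspace stand-in for the per-CPU lookup (a plain array indexed by CPU number; only a model of the idea, not the DEFINE_PER_CPU machinery):

#define NCPUS 8

struct group_model { unsigned int cpu_power; };

static struct group_model group_cpus[NCPUS];

static struct group_model *per_cpu_group(int cpu)
{
	return &group_cpus[cpu];
}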
@@ -6193,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
6193static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6303static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6194 6304
6195static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6305static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6196static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6306static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6197 6307
6198static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) 6308static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6309 struct sched_group **sg)
6199{ 6310{
6200 return cpu_to_node(cpu); 6311 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6312 int group;
6313
6314 cpus_and(nodemask, nodemask, *cpu_map);
6315 group = first_cpu(nodemask);
6316
6317 if (sg)
6318 *sg = &per_cpu(sched_group_allnodes, group);
6319 return group;
6201} 6320}
6321
6202static void init_numa_sched_groups_power(struct sched_group *group_head) 6322static void init_numa_sched_groups_power(struct sched_group *group_head)
6203{ 6323{
6204 struct sched_group *sg = group_head; 6324 struct sched_group *sg = group_head;
@@ -6234,16 +6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6234 int cpu, i; 6354 int cpu, i;
6235 6355
6236 for_each_cpu_mask(cpu, *cpu_map) { 6356 for_each_cpu_mask(cpu, *cpu_map) {
6237 struct sched_group *sched_group_allnodes
6238 = sched_group_allnodes_bycpu[cpu];
6239 struct sched_group **sched_group_nodes 6357 struct sched_group **sched_group_nodes
6240 = sched_group_nodes_bycpu[cpu]; 6358 = sched_group_nodes_bycpu[cpu];
6241 6359
6242 if (sched_group_allnodes) {
6243 kfree(sched_group_allnodes);
6244 sched_group_allnodes_bycpu[cpu] = NULL;
6245 }
6246
6247 if (!sched_group_nodes) 6360 if (!sched_group_nodes)
6248 continue; 6361 continue;
6249 6362
@@ -6337,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6337 struct sched_domain *sd; 6450 struct sched_domain *sd;
6338#ifdef CONFIG_NUMA 6451#ifdef CONFIG_NUMA
6339 struct sched_group **sched_group_nodes = NULL; 6452 struct sched_group **sched_group_nodes = NULL;
6340 struct sched_group *sched_group_allnodes = NULL; 6453 int sd_allnodes = 0;
6341 6454
6342 /* 6455 /*
6343 * Allocate the per-node list of sched groups 6456 * Allocate the per-node list of sched groups
@@ -6355,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6355 * Set up domains for cpus specified by the cpu_map. 6468 * Set up domains for cpus specified by the cpu_map.
6356 */ 6469 */
6357 for_each_cpu_mask(i, *cpu_map) { 6470 for_each_cpu_mask(i, *cpu_map) {
6358 int group;
6359 struct sched_domain *sd = NULL, *p; 6471 struct sched_domain *sd = NULL, *p;
6360 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6472 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6361 6473
@@ -6364,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6364#ifdef CONFIG_NUMA 6476#ifdef CONFIG_NUMA
6365 if (cpus_weight(*cpu_map) 6477 if (cpus_weight(*cpu_map)
6366 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6478 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6367 if (!sched_group_allnodes) {
6368 sched_group_allnodes
6369 = kmalloc_node(sizeof(struct sched_group)
6370 * MAX_NUMNODES,
6371 GFP_KERNEL,
6372 cpu_to_node(i));
6373 if (!sched_group_allnodes) {
6374 printk(KERN_WARNING
6375 "Can not alloc allnodes sched group\n");
6376 goto error;
6377 }
6378 sched_group_allnodes_bycpu[i]
6379 = sched_group_allnodes;
6380 }
6381 sd = &per_cpu(allnodes_domains, i); 6479 sd = &per_cpu(allnodes_domains, i);
6382 *sd = SD_ALLNODES_INIT; 6480 *sd = SD_ALLNODES_INIT;
6383 sd->span = *cpu_map; 6481 sd->span = *cpu_map;
6384 group = cpu_to_allnodes_group(i, cpu_map); 6482 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6385 sd->groups = &sched_group_allnodes[group];
6386 p = sd; 6483 p = sd;
6484 sd_allnodes = 1;
6387 } else 6485 } else
6388 p = NULL; 6486 p = NULL;
6389 6487
@@ -6398,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6398 6496
6399 p = sd; 6497 p = sd;
6400 sd = &per_cpu(phys_domains, i); 6498 sd = &per_cpu(phys_domains, i);
6401 group = cpu_to_phys_group(i, cpu_map);
6402 *sd = SD_CPU_INIT; 6499 *sd = SD_CPU_INIT;
6403 sd->span = nodemask; 6500 sd->span = nodemask;
6404 sd->parent = p; 6501 sd->parent = p;
6405 if (p) 6502 if (p)
6406 p->child = sd; 6503 p->child = sd;
6407 sd->groups = &sched_group_phys[group]; 6504 cpu_to_phys_group(i, cpu_map, &sd->groups);
6408 6505
6409#ifdef CONFIG_SCHED_MC 6506#ifdef CONFIG_SCHED_MC
6410 p = sd; 6507 p = sd;
6411 sd = &per_cpu(core_domains, i); 6508 sd = &per_cpu(core_domains, i);
6412 group = cpu_to_core_group(i, cpu_map);
6413 *sd = SD_MC_INIT; 6509 *sd = SD_MC_INIT;
6414 sd->span = cpu_coregroup_map(i); 6510 sd->span = cpu_coregroup_map(i);
6415 cpus_and(sd->span, sd->span, *cpu_map); 6511 cpus_and(sd->span, sd->span, *cpu_map);
6416 sd->parent = p; 6512 sd->parent = p;
6417 p->child = sd; 6513 p->child = sd;
6418 sd->groups = &sched_group_core[group]; 6514 cpu_to_core_group(i, cpu_map, &sd->groups);
6419#endif 6515#endif
6420 6516
6421#ifdef CONFIG_SCHED_SMT 6517#ifdef CONFIG_SCHED_SMT
6422 p = sd; 6518 p = sd;
6423 sd = &per_cpu(cpu_domains, i); 6519 sd = &per_cpu(cpu_domains, i);
6424 group = cpu_to_cpu_group(i, cpu_map);
6425 *sd = SD_SIBLING_INIT; 6520 *sd = SD_SIBLING_INIT;
6426 sd->span = cpu_sibling_map[i]; 6521 sd->span = cpu_sibling_map[i];
6427 cpus_and(sd->span, sd->span, *cpu_map); 6522 cpus_and(sd->span, sd->span, *cpu_map);
6428 sd->parent = p; 6523 sd->parent = p;
6429 p->child = sd; 6524 p->child = sd;
6430 sd->groups = &sched_group_cpus[group]; 6525 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6431#endif 6526#endif
6432 } 6527 }
6433 6528
@@ -6439,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6439 if (i != first_cpu(this_sibling_map)) 6534 if (i != first_cpu(this_sibling_map))
6440 continue; 6535 continue;
6441 6536
6442 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6537 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6443 cpu_map, &cpu_to_cpu_group);
6444 } 6538 }
6445#endif 6539#endif
6446 6540
@@ -6451,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6451 cpus_and(this_core_map, this_core_map, *cpu_map); 6545 cpus_and(this_core_map, this_core_map, *cpu_map);
6452 if (i != first_cpu(this_core_map)) 6546 if (i != first_cpu(this_core_map))
6453 continue; 6547 continue;
6454 init_sched_build_groups(sched_group_core, this_core_map, 6548 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6455 cpu_map, &cpu_to_core_group);
6456 } 6549 }
6457#endif 6550#endif
6458 6551
@@ -6465,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6465 if (cpus_empty(nodemask)) 6558 if (cpus_empty(nodemask))
6466 continue; 6559 continue;
6467 6560
6468 init_sched_build_groups(sched_group_phys, nodemask, 6561 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6469 cpu_map, &cpu_to_phys_group);
6470 } 6562 }
6471 6563
6472#ifdef CONFIG_NUMA 6564#ifdef CONFIG_NUMA
6473 /* Set up node groups */ 6565 /* Set up node groups */
6474 if (sched_group_allnodes) 6566 if (sd_allnodes)
6475 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6567 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6476 cpu_map, &cpu_to_allnodes_group);
6477 6568
6478 for (i = 0; i < MAX_NUMNODES; i++) { 6569 for (i = 0; i < MAX_NUMNODES; i++) {
6479 /* Set up node groups */ 6570 /* Set up node groups */
@@ -6565,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6565 for (i = 0; i < MAX_NUMNODES; i++) 6656 for (i = 0; i < MAX_NUMNODES; i++)
6566 init_numa_sched_groups_power(sched_group_nodes[i]); 6657 init_numa_sched_groups_power(sched_group_nodes[i]);
6567 6658
6568 if (sched_group_allnodes) { 6659 if (sd_allnodes) {
6569 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); 6660 struct sched_group *sg;
6570 struct sched_group *sg = &sched_group_allnodes[group];
6571 6661
6662 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6572 init_numa_sched_groups_power(sg); 6663 init_numa_sched_groups_power(sg);
6573 } 6664 }
6574#endif 6665#endif
@@ -6847,6 +6938,10 @@ void __init sched_init(void)
6847 6938
6848 set_load_weight(&init_task); 6939 set_load_weight(&init_task);
6849 6940
6941#ifdef CONFIG_SMP
6942 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6943#endif
6944
6850#ifdef CONFIG_RT_MUTEXES 6945#ifdef CONFIG_RT_MUTEXES
6851 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6946 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6852#endif 6947#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index ec81defde3..1921ffdc5e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,9 @@
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <linux/pid_namespace.h>
28#include <linux/nsproxy.h>
29
27#include <asm/param.h> 30#include <asm/param.h>
28#include <asm/uaccess.h> 31#include <asm/uaccess.h>
29#include <asm/unistd.h> 32#include <asm/unistd.h>
@@ -583,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
583 error = -EPERM; 586 error = -EPERM;
584 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 587 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
585 && ((sig != SIGCONT) || 588 && ((sig != SIGCONT) ||
586 (current->signal->session != t->signal->session)) 589 (process_session(current) != process_session(t)))
587 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 590 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
588 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 591 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
589 && !capable(CAP_KILL)) 592 && !capable(CAP_KILL))
@@ -1877,8 +1880,12 @@ relock:
1877 if (sig_kernel_ignore(signr)) /* Default is nothing. */ 1880 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1878 continue; 1881 continue;
1879 1882
1880 /* Init gets no signals it doesn't want. */ 1883 /*
1881 if (current == child_reaper) 1884 * Init of a pid space gets no signals it doesn't want from
1885 * within that pid space. It can of course get signals from
1886 * its parent pid space.
1887 */
1888 if (current == child_reaper(current))
1882 continue; 1889 continue;
1883 1890
1884 if (sig_kernel_stop(signr)) { 1891 if (sig_kernel_stop(signr)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index a0c1a29a50..c7675c1bfd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1381 1381
1382 if (p->real_parent == group_leader) { 1382 if (p->real_parent == group_leader) {
1383 err = -EPERM; 1383 err = -EPERM;
1384 if (p->signal->session != group_leader->signal->session) 1384 if (process_session(p) != process_session(group_leader))
1385 goto out; 1385 goto out;
1386 err = -EACCES; 1386 err = -EACCES;
1387 if (p->did_exec) 1387 if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1397 goto out; 1397 goto out;
1398 1398
1399 if (pgid != pid) { 1399 if (pgid != pid) {
1400 struct task_struct *p; 1400 struct task_struct *g =
1401 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1401 1402
1402 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1403 if (!g || process_session(g) != process_session(group_leader))
1403 if (p->signal->session == group_leader->signal->session) 1404 goto out;
1404 goto ok_pgid;
1405 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1406 goto out;
1407 } 1405 }
1408 1406
1409ok_pgid:
1410 err = security_task_setpgid(p, pgid); 1407 err = security_task_setpgid(p, pgid);
1411 if (err) 1408 if (err)
1412 goto out; 1409 goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
1459asmlinkage long sys_getsid(pid_t pid) 1456asmlinkage long sys_getsid(pid_t pid)
1460{ 1457{
1461 if (!pid) 1458 if (!pid)
1462 return current->signal->session; 1459 return process_session(current);
1463 else { 1460 else {
1464 int retval; 1461 int retval;
1465 struct task_struct *p; 1462 struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
1471 if (p) { 1468 if (p) {
1472 retval = security_task_getsid(p); 1469 retval = security_task_getsid(p);
1473 if (!retval) 1470 if (!retval)
1474 retval = p->signal->session; 1471 retval = process_session(p);
1475 } 1472 }
1476 read_unlock(&tasklist_lock); 1473 read_unlock(&tasklist_lock);
1477 return retval; 1474 return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
1484 pid_t session; 1481 pid_t session;
1485 int err = -EPERM; 1482 int err = -EPERM;
1486 1483
1487 mutex_lock(&tty_mutex);
1488 write_lock_irq(&tasklist_lock); 1484 write_lock_irq(&tasklist_lock);
1489 1485
1490 /* Fail if I am already a session leader */ 1486 /* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
1504 1500
1505 group_leader->signal->leader = 1; 1501 group_leader->signal->leader = 1;
1506 __set_special_pids(session, session); 1502 __set_special_pids(session, session);
1503
1504 spin_lock(&group_leader->sighand->siglock);
1507 group_leader->signal->tty = NULL; 1505 group_leader->signal->tty = NULL;
1508 group_leader->signal->tty_old_pgrp = 0; 1506 group_leader->signal->tty_old_pgrp = 0;
1507 spin_unlock(&group_leader->sighand->siglock);
1508
1509 err = process_group(group_leader); 1509 err = process_group(group_leader);
1510out: 1510out:
1511 write_unlock_irq(&tasklist_lock); 1511 write_unlock_irq(&tasklist_lock);
1512 mutex_unlock(&tty_mutex);
1513 return err; 1512 return err;
1514} 1513}
1515 1514
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e9f00fd6d..130c5ec9ee 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,7 +92,9 @@ extern char modprobe_path[];
92extern int sg_big_buff; 92extern int sg_big_buff;
93#endif 93#endif
94#ifdef CONFIG_SYSVIPC 94#ifdef CONFIG_SYSVIPC
95static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 95static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
96 void __user *buffer, size_t *lenp, loff_t *ppos);
97static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
96 void __user *buffer, size_t *lenp, loff_t *ppos); 98 void __user *buffer, size_t *lenp, loff_t *ppos);
97#endif 99#endif
98 100
@@ -131,12 +133,22 @@ extern int max_lock_depth;
131 133
132#ifdef CONFIG_SYSCTL_SYSCALL 134#ifdef CONFIG_SYSCTL_SYSCALL
133static int parse_table(int __user *, int, void __user *, size_t __user *, 135static int parse_table(int __user *, int, void __user *, size_t __user *,
134 void __user *, size_t, ctl_table *, void **); 136 void __user *, size_t, ctl_table *);
135#endif 137#endif
136 138
137static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 139static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
138 void __user *buffer, size_t *lenp, loff_t *ppos); 140 void __user *buffer, size_t *lenp, loff_t *ppos);
139 141
142static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
143 void __user *oldval, size_t __user *oldlenp,
144 void __user *newval, size_t newlen);
145
146#ifdef CONFIG_SYSVIPC
147static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
148 void __user *oldval, size_t __user *oldlenp,
149 void __user *newval, size_t newlen);
150#endif
151
140#ifdef CONFIG_PROC_SYSCTL 152#ifdef CONFIG_PROC_SYSCTL
141static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 153static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
142 void __user *buffer, size_t *lenp, loff_t *ppos); 154 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -163,6 +175,40 @@ extern ctl_table inotify_table[];
163int sysctl_legacy_va_layout; 175int sysctl_legacy_va_layout;
164#endif 176#endif
165 177
178static void *get_uts(ctl_table *table, int write)
179{
180 char *which = table->data;
181#ifdef CONFIG_UTS_NS
182 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
183 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
184#endif
185 if (!write)
186 down_read(&uts_sem);
187 else
188 down_write(&uts_sem);
189 return which;
190}
191
192static void put_uts(ctl_table *table, int write, void *which)
193{
194 if (!write)
195 up_read(&uts_sem);
196 else
197 up_write(&uts_sem);
198}
199
200#ifdef CONFIG_SYSVIPC
201static void *get_ipc(ctl_table *table, int write)
202{
203 char *which = table->data;
204 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
205 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
206 return which;
207}
208#else
209#define get_ipc(T,W) ((T)->data)
210#endif
211
166/* /proc declarations: */ 212/* /proc declarations: */
167 213
168#ifdef CONFIG_PROC_SYSCTL 214#ifdef CONFIG_PROC_SYSCTL
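get_uts() (and the CONFIG_SYSVIPC get_ipc()) rebase the table->data pointer, which points into the init namespace, onto the calling task's own namespace by preserving the field's byte offset. A userspace sketch of that pointer arithmetic (made-up struct, not the kernel's uts/ipc types):

#include <stdio.h>
#include <stddef.h>

struct ns { char hostname[16]; char domainname[16]; };

static struct ns init_ns = { "init-host", "init-domain" };

/* keep the field's byte offset, swap the base: the same trick the
 * get_uts()/get_ipc() helpers use to go from the init-namespace
 * template to the caller's namespace */
static void *rebase(void *field_in_init, struct ns *cur)
{
	size_t off = (char *)field_in_init - (char *)&init_ns;

	return (char *)cur + off;
}

int main(void)
{
	struct ns mine = { "my-host", "my-domain" };

	printf("%s\n", (char *)rebase(init_ns.domainname, &mine));
	return 0;
}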
@@ -229,7 +275,6 @@ static ctl_table root_table[] = {
229}; 275};
230 276
231static ctl_table kern_table[] = { 277static ctl_table kern_table[] = {
232#ifndef CONFIG_UTS_NS
233 { 278 {
234 .ctl_name = KERN_OSTYPE, 279 .ctl_name = KERN_OSTYPE,
235 .procname = "ostype", 280 .procname = "ostype",
@@ -237,7 +282,7 @@ static ctl_table kern_table[] = {
237 .maxlen = sizeof(init_uts_ns.name.sysname), 282 .maxlen = sizeof(init_uts_ns.name.sysname),
238 .mode = 0444, 283 .mode = 0444,
239 .proc_handler = &proc_do_uts_string, 284 .proc_handler = &proc_do_uts_string,
240 .strategy = &sysctl_string, 285 .strategy = &sysctl_uts_string,
241 }, 286 },
242 { 287 {
243 .ctl_name = KERN_OSRELEASE, 288 .ctl_name = KERN_OSRELEASE,
@@ -246,7 +291,7 @@ static ctl_table kern_table[] = {
246 .maxlen = sizeof(init_uts_ns.name.release), 291 .maxlen = sizeof(init_uts_ns.name.release),
247 .mode = 0444, 292 .mode = 0444,
248 .proc_handler = &proc_do_uts_string, 293 .proc_handler = &proc_do_uts_string,
249 .strategy = &sysctl_string, 294 .strategy = &sysctl_uts_string,
250 }, 295 },
251 { 296 {
252 .ctl_name = KERN_VERSION, 297 .ctl_name = KERN_VERSION,
@@ -255,7 +300,7 @@ static ctl_table kern_table[] = {
255 .maxlen = sizeof(init_uts_ns.name.version), 300 .maxlen = sizeof(init_uts_ns.name.version),
256 .mode = 0444, 301 .mode = 0444,
257 .proc_handler = &proc_do_uts_string, 302 .proc_handler = &proc_do_uts_string,
258 .strategy = &sysctl_string, 303 .strategy = &sysctl_uts_string,
259 }, 304 },
260 { 305 {
261 .ctl_name = KERN_NODENAME, 306 .ctl_name = KERN_NODENAME,
@@ -264,7 +309,7 @@ static ctl_table kern_table[] = {
264 .maxlen = sizeof(init_uts_ns.name.nodename), 309 .maxlen = sizeof(init_uts_ns.name.nodename),
265 .mode = 0644, 310 .mode = 0644,
266 .proc_handler = &proc_do_uts_string, 311 .proc_handler = &proc_do_uts_string,
267 .strategy = &sysctl_string, 312 .strategy = &sysctl_uts_string,
268 }, 313 },
269 { 314 {
270 .ctl_name = KERN_DOMAINNAME, 315 .ctl_name = KERN_DOMAINNAME,
@@ -273,56 +318,8 @@ static ctl_table kern_table[] = {
273 .maxlen = sizeof(init_uts_ns.name.domainname), 318 .maxlen = sizeof(init_uts_ns.name.domainname),
274 .mode = 0644, 319 .mode = 0644,
275 .proc_handler = &proc_do_uts_string, 320 .proc_handler = &proc_do_uts_string,
276 .strategy = &sysctl_string, 321 .strategy = &sysctl_uts_string,
277 },
278#else /* !CONFIG_UTS_NS */
279 {
280 .ctl_name = KERN_OSTYPE,
281 .procname = "ostype",
282 .data = NULL,
283 /* could maybe use __NEW_UTS_LEN here? */
284 .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
285 .mode = 0444,
286 .proc_handler = &proc_do_uts_string,
287 .strategy = &sysctl_string,
288 },
289 {
290 .ctl_name = KERN_OSRELEASE,
291 .procname = "osrelease",
292 .data = NULL,
293 .maxlen = FIELD_SIZEOF(struct new_utsname, release),
294 .mode = 0444,
295 .proc_handler = &proc_do_uts_string,
296 .strategy = &sysctl_string,
297 },
298 {
299 .ctl_name = KERN_VERSION,
300 .procname = "version",
301 .data = NULL,
302 .maxlen = FIELD_SIZEOF(struct new_utsname, version),
303 .mode = 0444,
304 .proc_handler = &proc_do_uts_string,
305 .strategy = &sysctl_string,
306 },
307 {
308 .ctl_name = KERN_NODENAME,
309 .procname = "hostname",
310 .data = NULL,
311 .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
312 .mode = 0644,
313 .proc_handler = &proc_do_uts_string,
314 .strategy = &sysctl_string,
315 },
316 {
317 .ctl_name = KERN_DOMAINNAME,
318 .procname = "domainname",
319 .data = NULL,
320 .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
321 .mode = 0644,
322 .proc_handler = &proc_do_uts_string,
323 .strategy = &sysctl_string,
324 }, 322 },
325#endif /* !CONFIG_UTS_NS */
326 { 323 {
327 .ctl_name = KERN_PANIC, 324 .ctl_name = KERN_PANIC,
328 .procname = "panic", 325 .procname = "panic",
@@ -481,58 +478,65 @@ static ctl_table kern_table[] = {
481 { 478 {
482 .ctl_name = KERN_SHMMAX, 479 .ctl_name = KERN_SHMMAX,
483 .procname = "shmmax", 480 .procname = "shmmax",
484 .data = NULL, 481 .data = &init_ipc_ns.shm_ctlmax,
485 .maxlen = sizeof (size_t), 482 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
486 .mode = 0644, 483 .mode = 0644,
487 .proc_handler = &proc_do_ipc_string, 484 .proc_handler = &proc_ipc_doulongvec_minmax,
485 .strategy = sysctl_ipc_data,
488 }, 486 },
489 { 487 {
490 .ctl_name = KERN_SHMALL, 488 .ctl_name = KERN_SHMALL,
491 .procname = "shmall", 489 .procname = "shmall",
492 .data = NULL, 490 .data = &init_ipc_ns.shm_ctlall,
493 .maxlen = sizeof (size_t), 491 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
494 .mode = 0644, 492 .mode = 0644,
495 .proc_handler = &proc_do_ipc_string, 493 .proc_handler = &proc_ipc_doulongvec_minmax,
494 .strategy = sysctl_ipc_data,
496 }, 495 },
497 { 496 {
498 .ctl_name = KERN_SHMMNI, 497 .ctl_name = KERN_SHMMNI,
499 .procname = "shmmni", 498 .procname = "shmmni",
500 .data = NULL, 499 .data = &init_ipc_ns.shm_ctlmni,
501 .maxlen = sizeof (int), 500 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
502 .mode = 0644, 501 .mode = 0644,
503 .proc_handler = &proc_do_ipc_string, 502 .proc_handler = &proc_ipc_dointvec,
503 .strategy = sysctl_ipc_data,
504 }, 504 },
505 { 505 {
506 .ctl_name = KERN_MSGMAX, 506 .ctl_name = KERN_MSGMAX,
507 .procname = "msgmax", 507 .procname = "msgmax",
508 .data = NULL, 508 .data = &init_ipc_ns.msg_ctlmax,
509 .maxlen = sizeof (int), 509 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
510 .mode = 0644, 510 .mode = 0644,
511 .proc_handler = &proc_do_ipc_string, 511 .proc_handler = &proc_ipc_dointvec,
512 .strategy = sysctl_ipc_data,
512 }, 513 },
513 { 514 {
514 .ctl_name = KERN_MSGMNI, 515 .ctl_name = KERN_MSGMNI,
515 .procname = "msgmni", 516 .procname = "msgmni",
516 .data = NULL, 517 .data = &init_ipc_ns.msg_ctlmni,
517 .maxlen = sizeof (int), 518 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
518 .mode = 0644, 519 .mode = 0644,
519 .proc_handler = &proc_do_ipc_string, 520 .proc_handler = &proc_ipc_dointvec,
521 .strategy = sysctl_ipc_data,
520 }, 522 },
521 { 523 {
522 .ctl_name = KERN_MSGMNB, 524 .ctl_name = KERN_MSGMNB,
523 .procname = "msgmnb", 525 .procname = "msgmnb",
524 .data = NULL, 526 .data = &init_ipc_ns.msg_ctlmnb,
525 .maxlen = sizeof (int), 527 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
526 .mode = 0644, 528 .mode = 0644,
527 .proc_handler = &proc_do_ipc_string, 529 .proc_handler = &proc_ipc_dointvec,
530 .strategy = sysctl_ipc_data,
528 }, 531 },
529 { 532 {
530 .ctl_name = KERN_SEM, 533 .ctl_name = KERN_SEM,
531 .procname = "sem", 534 .procname = "sem",
532 .data = NULL, 535 .data = &init_ipc_ns.sem_ctls,
533 .maxlen = 4*sizeof (int), 536 .maxlen = 4*sizeof (int),
534 .mode = 0644, 537 .mode = 0644,
535 .proc_handler = &proc_do_ipc_string, 538 .proc_handler = &proc_ipc_dointvec,
539 .strategy = sysctl_ipc_data,
536 }, 540 },
537#endif 541#endif
538#ifdef CONFIG_MAGIC_SYSRQ 542#ifdef CONFIG_MAGIC_SYSRQ
@@ -1239,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1239 do { 1243 do {
1240 struct ctl_table_header *head = 1244 struct ctl_table_header *head =
1241 list_entry(tmp, struct ctl_table_header, ctl_entry); 1245 list_entry(tmp, struct ctl_table_header, ctl_entry);
1242 void *context = NULL;
1243 1246
1244 if (!use_table(head)) 1247 if (!use_table(head))
1245 continue; 1248 continue;
@@ -1247,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1247 spin_unlock(&sysctl_lock); 1250 spin_unlock(&sysctl_lock);
1248 1251
1249 error = parse_table(name, nlen, oldval, oldlenp, 1252 error = parse_table(name, nlen, oldval, oldlenp,
1250 newval, newlen, head->ctl_table, 1253 newval, newlen, head->ctl_table);
1251 &context);
1252 kfree(context);
1253 1254
1254 spin_lock(&sysctl_lock); 1255 spin_lock(&sysctl_lock);
1255 unuse_table(head); 1256 unuse_table(head);
@@ -1305,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1305static int parse_table(int __user *name, int nlen, 1306static int parse_table(int __user *name, int nlen,
1306 void __user *oldval, size_t __user *oldlenp, 1307 void __user *oldval, size_t __user *oldlenp,
1307 void __user *newval, size_t newlen, 1308 void __user *newval, size_t newlen,
1308 ctl_table *table, void **context) 1309 ctl_table *table)
1309{ 1310{
1310 int n; 1311 int n;
1311repeat: 1312repeat:
@@ -1325,7 +1326,7 @@ repeat:
1325 error = table->strategy( 1326 error = table->strategy(
1326 table, name, nlen, 1327 table, name, nlen,
1327 oldval, oldlenp, 1328 oldval, oldlenp,
1328 newval, newlen, context); 1329 newval, newlen);
1329 if (error) 1330 if (error)
1330 return error; 1331 return error;
1331 } 1332 }
@@ -1336,7 +1337,7 @@ repeat:
1336 } 1337 }
1337 error = do_sysctl_strategy(table, name, nlen, 1338 error = do_sysctl_strategy(table, name, nlen,
1338 oldval, oldlenp, 1339 oldval, oldlenp,
1339 newval, newlen, context); 1340 newval, newlen);
1340 return error; 1341 return error;
1341 } 1342 }
1342 } 1343 }
@@ -1347,7 +1348,7 @@ repeat:
1347int do_sysctl_strategy (ctl_table *table, 1348int do_sysctl_strategy (ctl_table *table,
1348 int __user *name, int nlen, 1349 int __user *name, int nlen,
1349 void __user *oldval, size_t __user *oldlenp, 1350 void __user *oldval, size_t __user *oldlenp,
1350 void __user *newval, size_t newlen, void **context) 1351 void __user *newval, size_t newlen)
1351{ 1352{
1352 int op = 0, rc; 1353 int op = 0, rc;
1353 size_t len; 1354 size_t len;
@@ -1361,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table,
1361 1362
1362 if (table->strategy) { 1363 if (table->strategy) {
1363 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1364 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1364 newval, newlen, context); 1365 newval, newlen);
1365 if (rc < 0) 1366 if (rc < 0)
1366 return rc; 1367 return rc;
1367 if (rc > 0) 1368 if (rc > 0)
@@ -1614,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1614 size_t count, loff_t *ppos) 1615 size_t count, loff_t *ppos)
1615{ 1616{
1616 int op; 1617 int op;
1617 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); 1618 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1618 struct ctl_table *table; 1619 struct ctl_table *table;
1619 size_t res; 1620 size_t res;
1620 ssize_t error = -ENOTDIR; 1621 ssize_t error = -ENOTDIR;
@@ -1753,66 +1754,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1753 * Special case of dostring for the UTS structure. This has locks 1754 * Special case of dostring for the UTS structure. This has locks
1754 * to observe. Should this be in kernel/sys.c ???? 1755 * to observe. Should this be in kernel/sys.c ????
1755 */ 1756 */
1756
1757#ifndef CONFIG_UTS_NS
1758static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1759 void __user *buffer, size_t *lenp, loff_t *ppos)
1760{
1761 int r;
1762 1757
1763 if (!write) {
1764 down_read(&uts_sem);
1765 r=proc_dostring(table,0,filp,buffer,lenp, ppos);
1766 up_read(&uts_sem);
1767 } else {
1768 down_write(&uts_sem);
1769 r=proc_dostring(table,1,filp,buffer,lenp, ppos);
1770 up_write(&uts_sem);
1771 }
1772 return r;
1773}
1774#else /* !CONFIG_UTS_NS */
1775static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 1758static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1776 void __user *buffer, size_t *lenp, loff_t *ppos) 1759 void __user *buffer, size_t *lenp, loff_t *ppos)
1777{ 1760{
1778 int r; 1761 int r;
1779 struct uts_namespace* uts_ns = current->nsproxy->uts_ns; 1762 void *which;
1780 char* which; 1763 which = get_uts(table, write);
1781 1764 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1782 switch (table->ctl_name) { 1765 put_uts(table, write, which);
1783 case KERN_OSTYPE:
1784 which = uts_ns->name.sysname;
1785 break;
1786 case KERN_NODENAME:
1787 which = uts_ns->name.nodename;
1788 break;
1789 case KERN_OSRELEASE:
1790 which = uts_ns->name.release;
1791 break;
1792 case KERN_VERSION:
1793 which = uts_ns->name.version;
1794 break;
1795 case KERN_DOMAINNAME:
1796 which = uts_ns->name.domainname;
1797 break;
1798 default:
1799 r = -EINVAL;
1800 goto out;
1801 }
1802
1803 if (!write) {
1804 down_read(&uts_sem);
1805 r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
1806 up_read(&uts_sem);
1807 } else {
1808 down_write(&uts_sem);
1809 r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
1810 up_write(&uts_sem);
1811 }
1812 out:
1813 return r; 1766 return r;
1814} 1767}
1815#endif /* !CONFIG_UTS_NS */
1816 1768
1817static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1769static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1818 int *valp, 1770 int *valp,
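The consolidated proc_do_uts_string() above funnels both the namespaced and non-namespaced cases through get_uts()/put_uts(), which take uts_sem for read on a read and for write on a write. The same select-the-lock-by-write-flag shape, modelled in plain C with a POSIX rwlock (names are illustrative only):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER;
static char hostname[65] = "localhost";

/* Readers share the lock; a writer excludes everyone else. */
static char *get_buf(int write)
{
	if (write)
		pthread_rwlock_wrlock(&sem);
	else
		pthread_rwlock_rdlock(&sem);
	return hostname;
}

static void put_buf(void)
{
	pthread_rwlock_unlock(&sem);
}

int main(void)
{
	char *p = get_buf(1);                          /* "write" path */
	strncpy(p, "newname", sizeof(hostname) - 1);
	put_buf();

	p = get_buf(0);                                /* "read" path */
	printf("%s\n", p);
	put_buf();
	return 0;
}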
@@ -1976,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
1976 1928
1977#define OP_SET 0 1929#define OP_SET 0
1978#define OP_AND 1 1930#define OP_AND 1
1979#define OP_OR 2
1980#define OP_MAX 3
1981#define OP_MIN 4
1982 1931
1983static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, 1932static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1984 int *valp, 1933 int *valp,
@@ -1990,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1990 switch(op) { 1939 switch(op) {
1991 case OP_SET: *valp = val; break; 1940 case OP_SET: *valp = val; break;
1992 case OP_AND: *valp &= val; break; 1941 case OP_AND: *valp &= val; break;
1993 case OP_OR: *valp |= val; break;
1994 case OP_MAX: if(*valp < val)
1995 *valp = val;
1996 break;
1997 case OP_MIN: if(*valp > val)
1998 *valp = val;
1999 break;
2000 } 1942 }
2001 } else { 1943 } else {
2002 int val = *valp; 1944 int val = *valp;
@@ -2391,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2391} 2333}
2392 2334
2393#ifdef CONFIG_SYSVIPC 2335#ifdef CONFIG_SYSVIPC
2394static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 2336static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2395 void __user *buffer, size_t *lenp, loff_t *ppos) 2337 void __user *buffer, size_t *lenp, loff_t *ppos)
2396{ 2338{
2397 void *data; 2339 void *which;
2398 struct ipc_namespace *ns; 2340 which = get_ipc(table, write);
2399 2341 return __do_proc_dointvec(which, table, write, filp, buffer,
2400 ns = current->nsproxy->ipc_ns;
2401
2402 switch (table->ctl_name) {
2403 case KERN_SHMMAX:
2404 data = &ns->shm_ctlmax;
2405 goto proc_minmax;
2406 case KERN_SHMALL:
2407 data = &ns->shm_ctlall;
2408 goto proc_minmax;
2409 case KERN_SHMMNI:
2410 data = &ns->shm_ctlmni;
2411 break;
2412 case KERN_MSGMAX:
2413 data = &ns->msg_ctlmax;
2414 break;
2415 case KERN_MSGMNI:
2416 data = &ns->msg_ctlmni;
2417 break;
2418 case KERN_MSGMNB:
2419 data = &ns->msg_ctlmnb;
2420 break;
2421 case KERN_SEM:
2422 data = &ns->sem_ctls;
2423 break;
2424 default:
2425 return -EINVAL;
2426 }
2427
2428 return __do_proc_dointvec(data, table, write, filp, buffer,
2429 lenp, ppos, NULL, NULL); 2342 lenp, ppos, NULL, NULL);
2430proc_minmax: 2343}
2431 return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, 2344
2345static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2346 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2347{
2348 void *which;
2349 which = get_ipc(table, write);
2350 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2432 lenp, ppos, 1l, 1l); 2351 lenp, ppos, 1l, 1l);
2433} 2352}
2353
2434#endif 2354#endif
2435 2355
2436static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2356static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2475,6 +2395,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2475{ 2395{
2476 return -ENOSYS; 2396 return -ENOSYS;
2477} 2397}
2398static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2399 void __user *buffer, size_t *lenp, loff_t *ppos)
2400{
2401 return -ENOSYS;
2402}
2403static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2404 struct file *filp, void __user *buffer,
2405 size_t *lenp, loff_t *ppos)
2406{
2407 return -ENOSYS;
2408}
2478#endif 2409#endif
2479 2410
2480int proc_dointvec(ctl_table *table, int write, struct file *filp, 2411int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2539,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2539/* The generic string strategy routine: */ 2470/* The generic string strategy routine: */
2540int sysctl_string(ctl_table *table, int __user *name, int nlen, 2471int sysctl_string(ctl_table *table, int __user *name, int nlen,
2541 void __user *oldval, size_t __user *oldlenp, 2472 void __user *oldval, size_t __user *oldlenp,
2542 void __user *newval, size_t newlen, void **context) 2473 void __user *newval, size_t newlen)
2543{ 2474{
2544 if (!table->data || !table->maxlen) 2475 if (!table->data || !table->maxlen)
2545 return -ENOTDIR; 2476 return -ENOTDIR;
@@ -2585,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2585 */ 2516 */
2586int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2517int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2587 void __user *oldval, size_t __user *oldlenp, 2518 void __user *oldval, size_t __user *oldlenp,
2588 void __user *newval, size_t newlen, void **context) 2519 void __user *newval, size_t newlen)
2589{ 2520{
2590 2521
2591 if (newval && newlen) { 2522 if (newval && newlen) {
@@ -2621,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2621/* Strategy function to convert jiffies to seconds */ 2552/* Strategy function to convert jiffies to seconds */
2622int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2553int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2623 void __user *oldval, size_t __user *oldlenp, 2554 void __user *oldval, size_t __user *oldlenp,
2624 void __user *newval, size_t newlen, void **context) 2555 void __user *newval, size_t newlen)
2625{ 2556{
2626 if (oldval) { 2557 if (oldval) {
2627 size_t olen; 2558 size_t olen;
@@ -2649,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2649/* Strategy function to convert jiffies to seconds */ 2580/* Strategy function to convert jiffies to seconds */
2650int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2581int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2651 void __user *oldval, size_t __user *oldlenp, 2582 void __user *oldval, size_t __user *oldlenp,
2652 void __user *newval, size_t newlen, void **context) 2583 void __user *newval, size_t newlen)
2653{ 2584{
2654 if (oldval) { 2585 if (oldval) {
2655 size_t olen; 2586 size_t olen;
@@ -2674,6 +2605,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2674 return 1; 2605 return 1;
2675} 2606}
2676 2607
2608
2609/* The generic string strategy routine: */
2610static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2611 void __user *oldval, size_t __user *oldlenp,
2612 void __user *newval, size_t newlen)
2613{
2614 struct ctl_table uts_table;
2615 int r, write;
2616 write = newval && newlen;
2617 memcpy(&uts_table, table, sizeof(uts_table));
2618 uts_table.data = get_uts(table, write);
2619 r = sysctl_string(&uts_table, name, nlen,
2620 oldval, oldlenp, newval, newlen);
2621 put_uts(table, write, uts_table.data);
2622 return r;
2623}
2624
2625#ifdef CONFIG_SYSVIPC
2626/* The generic sysctl ipc data routine. */
2627static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2628 void __user *oldval, size_t __user *oldlenp,
2629 void __user *newval, size_t newlen)
2630{
2631 size_t len;
2632 void *data;
2633
2634 	/* Bail out if there is no variable to act on */
2635 if (!table->data || !table->maxlen)
2636 return -ENOTDIR;
2637
2638 data = get_ipc(table, 1);
2639 if (!data)
2640 return -ENOTDIR;
2641
2642 if (oldval && oldlenp) {
2643 if (get_user(len, oldlenp))
2644 return -EFAULT;
2645 if (len) {
2646 if (len > table->maxlen)
2647 len = table->maxlen;
2648 if (copy_to_user(oldval, data, len))
2649 return -EFAULT;
2650 if (put_user(len, oldlenp))
2651 return -EFAULT;
2652 }
2653 }
2654
2655 if (newval && newlen) {
2656 if (newlen > table->maxlen)
2657 newlen = table->maxlen;
2658
2659 if (copy_from_user(data, newval, newlen))
2660 return -EFAULT;
2661 }
2662 return 1;
2663}
2664#endif
2665
2677#else /* CONFIG_SYSCTL_SYSCALL */ 2666#else /* CONFIG_SYSCTL_SYSCALL */
2678 2667
2679 2668
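sysctl_ipc_data() above follows the usual contract of a binary-sysctl strategy routine: if the caller supplied oldval/oldlenp, copy out at most min(len, maxlen) bytes and write back the length; if it supplied newval/newlen, copy in at most maxlen bytes; return 1 to tell do_sysctl_strategy() the request has been handled. A userspace model of that flow, with memcpy standing in for copy_to_user()/copy_from_user() (illustrative sketch, not kernel code):

#include <stdio.h>
#include <string.h>

/* Model of one sysctl variable and its table entry. */
static unsigned long shmmax = 33554432;
static const size_t maxlen = sizeof(shmmax);

/* Returns 1 when handled, mirroring the strategy convention. */
static int strategy(void *oldval, size_t *oldlenp,
		    const void *newval, size_t newlen)
{
	if (oldval && oldlenp) {
		size_t len = *oldlenp;
		if (len > maxlen)
			len = maxlen;
		memcpy(oldval, &shmmax, len);     /* copy_to_user() in the kernel */
		*oldlenp = len;
	}
	if (newval && newlen) {
		if (newlen > maxlen)
			newlen = maxlen;
		memcpy(&shmmax, newval, newlen);  /* copy_from_user() in the kernel */
	}
	return 1;
}

int main(void)
{
	unsigned long old = 0, new = 67108864;
	size_t oldlen = sizeof(old);

	strategy(&old, &oldlen, &new, sizeof(new));
	printf("old=%lu now=%lu\n", old, shmmax);
	return 0;
}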
@@ -2712,32 +2701,44 @@ out:
2712 2701
2713int sysctl_string(ctl_table *table, int __user *name, int nlen, 2702int sysctl_string(ctl_table *table, int __user *name, int nlen,
2714 void __user *oldval, size_t __user *oldlenp, 2703 void __user *oldval, size_t __user *oldlenp,
2715 void __user *newval, size_t newlen, void **context) 2704 void __user *newval, size_t newlen)
2716{ 2705{
2717 return -ENOSYS; 2706 return -ENOSYS;
2718} 2707}
2719 2708
2720int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2709int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2721 void __user *oldval, size_t __user *oldlenp, 2710 void __user *oldval, size_t __user *oldlenp,
2722 void __user *newval, size_t newlen, void **context) 2711 void __user *newval, size_t newlen)
2723{ 2712{
2724 return -ENOSYS; 2713 return -ENOSYS;
2725} 2714}
2726 2715
2727int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2716int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2728 void __user *oldval, size_t __user *oldlenp, 2717 void __user *oldval, size_t __user *oldlenp,
2729 void __user *newval, size_t newlen, void **context) 2718 void __user *newval, size_t newlen)
2730{ 2719{
2731 return -ENOSYS; 2720 return -ENOSYS;
2732} 2721}
2733 2722
2734int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2723int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2735 void __user *oldval, size_t __user *oldlenp, 2724 void __user *oldval, size_t __user *oldlenp,
2736 void __user *newval, size_t newlen, void **context) 2725 void __user *newval, size_t newlen)
2737{ 2726{
2738 return -ENOSYS; 2727 return -ENOSYS;
2739} 2728}
2740 2729
2730static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2731 void __user *oldval, size_t __user *oldlenp,
2732 void __user *newval, size_t newlen)
2733{
2734 return -ENOSYS;
2735}
2736static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2737 void __user *oldval, size_t __user *oldlenp,
2738 void __user *newval, size_t newlen)
2739{
2740 return -ENOSYS;
2741}
2741#endif /* CONFIG_SYSCTL_SYSCALL */ 2742#endif /* CONFIG_SYSCTL_SYSCALL */
2742 2743
2743/* 2744/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939b..22504afc0d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
156 /* check if clocksource is already registered */ 156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) { 157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. " 158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name); 159 "Already registered!", c->name);
160 ret = -EBUSY; 160 ret = -EBUSY;
161 } else { 161 } else {
162 /* register it */ 162 /* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
186} 186}
187EXPORT_SYMBOL(clocksource_reselect); 187EXPORT_SYMBOL(clocksource_reselect);
188 188
189#ifdef CONFIG_SYSFS
189/** 190/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource 191 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused 192 * @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
275 * Sysfs setup bits: 276 * Sysfs setup bits:
276 */ 277 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 278static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource); 279 sysfs_override_clocksource);
279 280
280static SYSDEV_ATTR(available_clocksource, 0600, 281static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL); 282 sysfs_show_available_clocksources, NULL);
282 283
283static struct sysdev_class clocksource_sysclass = { 284static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"), 285 set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
307} 308}
308 309
309device_initcall(init_clocksource_sysfs); 310device_initcall(init_clocksource_sysfs);
311#endif /* CONFIG_SYSFS */
310 312
311/** 313/**
312 * boot_override_clocksource - boot clock override 314 * boot_override_clocksource - boot clock override
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffe..0256ab443d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
80EXPORT_SYMBOL(boot_tvec_bases); 80EXPORT_SYMBOL(boot_tvec_bases);
81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
82 82
83/**
84 * __round_jiffies - function to round jiffies to a full second
85 * @j: the time in (absolute) jiffies that should be rounded
86 * @cpu: the processor number on which the timeout will happen
87 *
88 * __round_jiffies rounds an absolute time in the future (in jiffies)
89 * up or down to (approximately) full seconds. This is useful for timers
90 * for which the exact time they fire does not matter too much, as long as
91 * they fire approximately every X seconds.
92 *
93 * By rounding these timers to whole seconds, all such timers will fire
94 * at the same time, rather than at various times spread out. The goal
95 * of this is to have the CPU wake up less, which saves power.
96 *
97 * The exact rounding is skewed for each processor to avoid all
98 * processors firing at the exact same time, which could lead
99 * to lock contention or spurious cache line bouncing.
100 *
101 * The return value is the rounded version of the "j" parameter.
102 */
103unsigned long __round_jiffies(unsigned long j, int cpu)
104{
105 int rem;
106 unsigned long original = j;
107
108 /*
109 * We don't want all cpus firing their timers at once hitting the
110 * same lock or cachelines, so we skew each extra cpu with an extra
111 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
112 * already did this.
113 * The skew is done by adding 3*cpunr, then round, then subtract this
114 * extra offset again.
115 */
116 j += cpu * 3;
117
118 rem = j % HZ;
119
120 /*
121 * If the target jiffie is just after a whole second (which can happen
122 * due to delays of the timer irq, long irq off times etc etc) then
123 * we should round down to the whole second, not up. Use 1/4th second
124 * as cutoff for this rounding as an extreme upper bound for this.
125 */
126 if (rem < HZ/4) /* round down */
127 j = j - rem;
128 else /* round up */
129 j = j - rem + HZ;
130
131 /* now that we have rounded, subtract the extra skew again */
132 j -= cpu * 3;
133
134 if (j <= jiffies) /* rounding ate our timeout entirely; */
135 return original;
136 return j;
137}
138EXPORT_SYMBOL_GPL(__round_jiffies);
139
140/**
141 * __round_jiffies_relative - function to round jiffies to a full second
142 * @j: the time in (relative) jiffies that should be rounded
143 * @cpu: the processor number on which the timeout will happen
144 *
145 * __round_jiffies_relative rounds a time delta in the future (in jiffies)
146 * up or down to (approximately) full seconds. This is useful for timers
147 * for which the exact time they fire does not matter too much, as long as
148 * they fire approximately every X seconds.
149 *
150 * By rounding these timers to whole seconds, all such timers will fire
151 * at the same time, rather than at various times spread out. The goal
152 * of this is to have the CPU wake up less, which saves power.
153 *
154 * The exact rounding is skewed for each processor to avoid all
155 * processors firing at the exact same time, which could lead
156 * to lock contention or spurious cache line bouncing.
157 *
158 * The return value is the rounded version of the "j" parameter.
159 */
160unsigned long __round_jiffies_relative(unsigned long j, int cpu)
161{
162 /*
163 * In theory the following code can skip a jiffy in case jiffies
164 * increments right between the addition and the later subtraction.
165 * However since the entire point of this function is to use approximate
166 * timeouts, it's entirely ok to not handle that.
167 */
168 return __round_jiffies(j + jiffies, cpu) - jiffies;
169}
170EXPORT_SYMBOL_GPL(__round_jiffies_relative);
171
172/**
173 * round_jiffies - function to round jiffies to a full second
174 * @j: the time in (absolute) jiffies that should be rounded
175 *
176 * round_jiffies rounds an absolute time in the future (in jiffies)
177 * up or down to (approximately) full seconds. This is useful for timers
178 * for which the exact time they fire does not matter too much, as long as
179 * they fire approximately every X seconds.
180 *
181 * By rounding these timers to whole seconds, all such timers will fire
182 * at the same time, rather than at various times spread out. The goal
183 * of this is to have the CPU wake up less, which saves power.
184 *
185 * The return value is the rounded version of the "j" parameter.
186 */
187unsigned long round_jiffies(unsigned long j)
188{
189 return __round_jiffies(j, raw_smp_processor_id());
190}
191EXPORT_SYMBOL_GPL(round_jiffies);
192
193/**
194 * round_jiffies_relative - function to round jiffies to a full second
195 * @j: the time in (relative) jiffies that should be rounded
196 *
197 * round_jiffies_relative rounds a time delta in the future (in jiffies)
198 * up or down to (approximately) full seconds. This is useful for timers
199 * for which the exact time they fire does not matter too much, as long as
200 * they fire approximately every X seconds.
201 *
202 * By rounding these timers to whole seconds, all such timers will fire
203 * at the same time, rather than at various times spread out. The goal
204 * of this is to have the CPU wake up less, which saves power.
205 *
206 * The return value is the rounded version of the "j" parameter.
207 */
208unsigned long round_jiffies_relative(unsigned long j)
209{
210 return __round_jiffies_relative(j, raw_smp_processor_id());
211}
212EXPORT_SYMBOL_GPL(round_jiffies_relative);
213
214
83static inline void set_running_timer(tvec_base_t *base, 215static inline void set_running_timer(tvec_base_t *base,
84 struct timer_list *timer) 216 struct timer_list *timer)
85{ 217{
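The new round_jiffies() helpers are meant for timers that only need roughly per-second resolution, e.g. arming a timer with round_jiffies(jiffies + 5 * HZ) so that many such timers expire on the same tick and the CPU wakes up less often. A standalone model of the rounding rule itself, using the same HZ/4 cutoff and 3-jiffy per-CPU skew as the patch (the userspace harness and values are illustrative):

#include <stdio.h>

#define HZ 1000
static unsigned long jiffies = 100400;          /* pretend "now" */

static unsigned long my_round_jiffies(unsigned long j, int cpu)
{
	unsigned long original = j;
	int rem;

	j += cpu * 3;                           /* per-cpu skew, as in the patch */
	rem = j % HZ;
	if (rem < HZ / 4)                       /* just past a second: round down */
		j -= rem;
	else                                    /* otherwise round up */
		j = j - rem + HZ;
	j -= cpu * 3;                           /* undo the skew */

	return j <= jiffies ? original : j;     /* never round into the past */
}

int main(void)
{
	/* 101300 is well inside a second, so it rounds up to 102000. */
	printf("%lu\n", my_round_jiffies(101300, 0));
	/* 101200 is just past the 101000 boundary, so it rounds down to 101000. */
	printf("%lu\n", my_round_jiffies(101200, 0));
	return 0;
}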
@@ -714,7 +846,7 @@ static int change_clocksource(void)
714 clock = new; 846 clock = new;
715 clock->cycle_last = now; 847 clock->cycle_last = now;
716 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 848 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
717 clock->name); 849 clock->name);
718 return 1; 850 return 1;
719 } else if (clock->update_callback) { 851 } else if (clock->update_callback) {
720 return clock->update_callback(); 852 return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
722 return 0; 854 return 0;
723} 855}
724#else 856#else
725#define change_clocksource() (0) 857static inline int change_clocksource(void)
858{
859 return 0;
860}
726#endif 861#endif
727 862
728/** 863/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
820 * If the error is already larger, we look ahead even further 955 * If the error is already larger, we look ahead even further
821 * to compensate for late or lost adjustments. 956 * to compensate for late or lost adjustments.
822 */ 957 */
823static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) 958static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
959 s64 *offset)
824{ 960{
825 s64 tick_error, i; 961 s64 tick_error, i;
826 u32 look_ahead, adj; 962 u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
844 * Now calculate the error in (1 << look_ahead) ticks, but first 980 * Now calculate the error in (1 << look_ahead) ticks, but first
845 * remove the single look ahead already included in the error. 981 * remove the single look ahead already included in the error.
846 */ 982 */
847 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 983 tick_error = current_tick_length() >>
984 (TICK_LENGTH_SHIFT - clock->shift + 1);
848 tick_error -= clock->xtime_interval >> 1; 985 tick_error -= clock->xtime_interval >> 1;
849 error = ((error - tick_error) >> look_ahead) + tick_error; 986 error = ((error - tick_error) >> look_ahead) + tick_error;
850 987
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
896 clock->mult += adj; 1033 clock->mult += adj;
897 clock->xtime_interval += interval; 1034 clock->xtime_interval += interval;
898 clock->xtime_nsec -= offset; 1035 clock->xtime_nsec -= offset;
899 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1036 clock->error -= (interval - offset) <<
1037 (TICK_LENGTH_SHIFT - clock->shift);
900} 1038}
901 1039
902/** 1040/**
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 96f77013d3..baacc36914 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
96 stats->write_char = p->wchar; 96 stats->write_char = p->wchar;
97 stats->read_syscalls = p->syscr; 97 stats->read_syscalls = p->syscr;
98 stats->write_syscalls = p->syscw; 98 stats->write_syscalls = p->syscw;
99#ifdef CONFIG_TASK_IO_ACCOUNTING
100 stats->read_bytes = p->ioac.read_bytes;
101 stats->write_bytes = p->ioac.write_bytes;
102 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
103#else
104 stats->read_bytes = 0;
105 stats->write_bytes = 0;
106 stats->cancelled_write_bytes = 0;
107#endif
99} 108}
100#undef KB 109#undef KB
101#undef MB 110#undef MB
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6b186750e9..db49886bfa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
85 return list_empty(&wq->list); 85 return list_empty(&wq->list);
86} 86}
87 87
88/*
89 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set
91 */
88static inline void set_wq_data(struct work_struct *work, void *wq) 92static inline void set_wq_data(struct work_struct *work, void *wq)
89{ 93{
90 unsigned long new, old, res; 94 unsigned long new;
95
96 BUG_ON(!work_pending(work));
91 97
92 /* assume the pending flag is already set and that the task has already
93 * been queued on this workqueue */
94 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); 98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
95 res = work->management; 99 new |= work->management & WORK_STRUCT_FLAG_MASK;
96 if (res != new) { 100 work->management = new;
97 do {
98 old = res;
99 new = (unsigned long) wq;
100 new |= (old & WORK_STRUCT_FLAG_MASK);
101 res = cmpxchg(&work->management, old, new);
102 } while (res != old);
103 }
104} 101}
105 102
106static inline void *get_wq_data(struct work_struct *work) 103static inline void *get_wq_data(struct work_struct *work)