Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |   5
-rw-r--r--  kernel/acct.c            |  30
-rw-r--r--  kernel/cpuset.c          |   8
-rw-r--r--  kernel/dma.c             |  10
-rw-r--r--  kernel/exit.c            |  13
-rw-r--r--  kernel/fork.c            |  82
-rw-r--r--  kernel/futex.c           |   2
-rw-r--r--  kernel/kallsyms.c        | 124
-rw-r--r--  kernel/kmod.c            |  62
-rw-r--r--  kernel/kprobes.c         |  53
-rw-r--r--  kernel/latency.c         | 279
-rw-r--r--  kernel/lockdep.c         |   6
-rw-r--r--  kernel/module.c          |  40
-rw-r--r--  kernel/nsproxy.c         | 139
-rw-r--r--  kernel/panic.c           |   1
-rw-r--r--  kernel/pid.c             | 111
-rw-r--r--  kernel/power/snapshot.c  |  10
-rw-r--r--  kernel/resource.c        |  83
-rw-r--r--  kernel/sched.c           | 326
-rw-r--r--  kernel/signal.c          |  65
-rw-r--r--  kernel/spinlock.c        |   4
-rw-r--r--  kernel/sys.c             | 110
-rw-r--r--  kernel/sys_ni.c          |   5
-rw-r--r--  kernel/sysctl.c          | 363
-rw-r--r--  kernel/taskstats.c       |  10
-rw-r--r--  kernel/time.c            | 173
-rw-r--r--  kernel/time/Makefile     |   2
-rw-r--r--  kernel/time/ntp.c        | 350
-rw-r--r--  kernel/timer.c           | 230
-rw-r--r--  kernel/tsacct.c          | 124
-rw-r--r--  kernel/utsname.c         |  95
31 files changed, 2034 insertions(+), 881 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66c1a..d948ca12ac 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o 11 hrtimer.o rwsem.o latency.o nsproxy.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/ 14obj-y += time/
@@ -48,8 +48,9 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
48obj-$(CONFIG_SECCOMP) += seccomp.o 48obj-$(CONFIG_SECCOMP) += seccomp.o
49obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 49obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
50obj-$(CONFIG_RELAY) += relay.o 50obj-$(CONFIG_RELAY) += relay.o
51obj-$(CONFIG_UTS_NS) += utsname.o
51obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
52obj-$(CONFIG_TASKSTATS) += taskstats.o 53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
53 54
54ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 55ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
55# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 56# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index f4330acead..0aad5ca36a 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -602,33 +602,3 @@ void acct_process(void)
602 do_acct_process(file); 602 do_acct_process(file);
603 fput(file); 603 fput(file);
604} 604}
605
606
607/**
608 * acct_update_integrals - update mm integral fields in task_struct
609 * @tsk: task_struct for accounting
610 */
611void acct_update_integrals(struct task_struct *tsk)
612{
613 if (likely(tsk->mm)) {
614 long delta =
615 cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
616
617 if (delta == 0)
618 return;
619 tsk->acct_stimexpd = tsk->stime;
620 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
621 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
622 }
623}
624
625/**
626 * acct_clear_integrals - clear the mm integral fields in task_struct
627 * @tsk: task_struct whose accounting fields are cleared
628 */
629void acct_clear_integrals(struct task_struct *tsk)
630{
631 tsk->acct_stimexpd = 0;
632 tsk->acct_rss_mem1 = 0;
633 tsk->acct_vm_mem1 = 0;
634}
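The two helpers removed here are not dropped from the kernel; per the diffstat they move to the new kernel/tsacct.c. The integral they maintain is a running sum over system time; a worked instance with made-up numbers:

	/*
	 * Suppose a task holds get_mm_rss(mm) == 1000 resident pages and
	 * mm->total_vm == 4000 pages, and tsk->stime advances by 50 jiffies
	 * between two calls to acct_update_integrals().  Then:
	 *
	 *	delta          = 50
	 *	acct_rss_mem1 += 50 * 1000	(resident set, in page-jiffies)
	 *	acct_vm_mem1  += 50 * 4000	(virtual size, in page-jiffies)
	 */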
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c3c400cce..9d850ae13b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
377 inode->i_op = &simple_dir_inode_operations; 377 inode->i_op = &simple_dir_inode_operations;
378 inode->i_fop = &simple_dir_operations; 378 inode->i_fop = &simple_dir_operations;
379 /* directories start off with i_nlink == 2 (for "." entry) */ 379 /* directories start off with i_nlink == 2 (for "." entry) */
380 inode->i_nlink++; 380 inc_nlink(inode);
381 } else { 381 } else {
382 return -ENOMEM; 382 return -ENOMEM;
383 } 383 }
@@ -1565,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
1565 inode->i_fop = &simple_dir_operations; 1565 inode->i_fop = &simple_dir_operations;
1566 1566
1567 /* start off with i_nlink == 2 (for "." entry) */ 1567 /* start off with i_nlink == 2 (for "." entry) */
1568 inode->i_nlink++; 1568 inc_nlink(inode);
1569 } else if (S_ISREG(mode)) { 1569 } else if (S_ISREG(mode)) {
1570 inode->i_size = 0; 1570 inode->i_size = 0;
1571 inode->i_fop = &cpuset_file_operations; 1571 inode->i_fop = &cpuset_file_operations;
@@ -1598,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
1598 error = cpuset_create_file(dentry, S_IFDIR | mode); 1598 error = cpuset_create_file(dentry, S_IFDIR | mode);
1599 if (!error) { 1599 if (!error) {
1600 dentry->d_fsdata = cs; 1600 dentry->d_fsdata = cs;
1601 parent->d_inode->i_nlink++; 1601 inc_nlink(parent->d_inode);
1602 cs->dentry = dentry; 1602 cs->dentry = dentry;
1603 } 1603 }
1604 dput(dentry); 1604 dput(dentry);
@@ -2033,7 +2033,7 @@ int __init cpuset_init(void)
2033 } 2033 }
2034 root = cpuset_mount->mnt_sb->s_root; 2034 root = cpuset_mount->mnt_sb->s_root;
2035 root->d_fsdata = &top_cpuset; 2035 root->d_fsdata = &top_cpuset;
2036 root->d_inode->i_nlink++; 2036 inc_nlink(root->d_inode);
2037 top_cpuset.dentry = root; 2037 top_cpuset.dentry = root;
2038 root->d_inode->i_op = &cpuset_dir_inode_operations; 2038 root->d_inode->i_op = &cpuset_dir_inode_operations;
2039 number_of_cpusets = 1; 2039 number_of_cpusets = 1;
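The four hunks above are a mechanical conversion from open-coded link-count bumps to the new inc_nlink() helper. At this point in the tree the helper is, as far as these call sites care, just a wrapper (sketch from memory of include/linux/fs.h, not part of this diff):

static inline void inc_nlink(struct inode *inode)
{
	inode->i_nlink++;
}

Funnelling every i_nlink modification through one accessor makes it possible to add sanity checks later without touching each call site again.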
diff --git a/kernel/dma.c b/kernel/dma.c
index aef0a45b78..2020644c93 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
62}; 62};
63 63
64 64
65/**
66 * request_dma - request and reserve a system DMA channel
67 * @dmanr: DMA channel number
68 * @device_id: reserving device ID string, used in /proc/dma
69 */
65int request_dma(unsigned int dmanr, const char * device_id) 70int request_dma(unsigned int dmanr, const char * device_id)
66{ 71{
67 if (dmanr >= MAX_DMA_CHANNELS) 72 if (dmanr >= MAX_DMA_CHANNELS)
@@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id)
76 return 0; 81 return 0;
77} /* request_dma */ 82} /* request_dma */
78 83
79 84/**
85 * free_dma - free a reserved system DMA channel
86 * @dmanr: DMA channel number
87 */
80void free_dma(unsigned int dmanr) 88void free_dma(unsigned int dmanr)
81{ 89{
82 if (dmanr >= MAX_DMA_CHANNELS) { 90 if (dmanr >= MAX_DMA_CHANNELS) {
diff --git a/kernel/exit.c b/kernel/exit.c
index 2e4c13cba9..f250a5e3e2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -18,8 +18,10 @@
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/acct.h> 20#include <linux/acct.h>
21#include <linux/tsacct_kern.h>
21#include <linux/file.h> 22#include <linux/file.h>
22#include <linux/binfmts.h> 23#include <linux/binfmts.h>
24#include <linux/nsproxy.h>
23#include <linux/ptrace.h> 25#include <linux/ptrace.h>
24#include <linux/profile.h> 26#include <linux/profile.h>
25#include <linux/mount.h> 27#include <linux/mount.h>
@@ -38,6 +40,7 @@
38#include <linux/pipe_fs_i.h> 40#include <linux/pipe_fs_i.h>
39#include <linux/audit.h> /* for audit_free() */ 41#include <linux/audit.h> /* for audit_free() */
40#include <linux/resource.h> 42#include <linux/resource.h>
43#include <linux/blkdev.h>
41 44
42#include <asm/uaccess.h> 45#include <asm/uaccess.h>
43#include <asm/unistd.h> 46#include <asm/unistd.h>
@@ -395,9 +398,11 @@ void daemonize(const char *name, ...)
395 fs = init_task.fs; 398 fs = init_task.fs;
396 current->fs = fs; 399 current->fs = fs;
397 atomic_inc(&fs->count); 400 atomic_inc(&fs->count);
398 exit_namespace(current); 401
399 current->namespace = init_task.namespace; 402 exit_task_namespaces(current);
400 get_namespace(current->namespace); 403 current->nsproxy = init_task.nsproxy;
404 get_task_namespaces(current);
405
401 exit_files(current); 406 exit_files(current);
402 current->files = init_task.files; 407 current->files = init_task.files;
403 atomic_inc(&current->files->count); 408 atomic_inc(&current->files->count);
@@ -915,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code)
915 exit_sem(tsk); 920 exit_sem(tsk);
916 __exit_files(tsk); 921 __exit_files(tsk);
917 __exit_fs(tsk); 922 __exit_fs(tsk);
918 exit_namespace(tsk);
919 exit_thread(); 923 exit_thread();
920 cpuset_exit(tsk); 924 cpuset_exit(tsk);
921 exit_keys(tsk); 925 exit_keys(tsk);
@@ -930,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code)
930 tsk->exit_code = code; 934 tsk->exit_code = code;
931 proc_exit_connector(tsk); 935 proc_exit_connector(tsk);
932 exit_notify(tsk); 936 exit_notify(tsk);
937 exit_task_namespaces(tsk);
933#ifdef CONFIG_NUMA 938#ifdef CONFIG_NUMA
934 mpol_free(tsk->mempolicy); 939 mpol_free(tsk->mempolicy);
935 tsk->mempolicy = NULL; 940 tsk->mempolicy = NULL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1c999f3e0b..7dc6140baa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@
27#include <linux/binfmts.h> 27#include <linux/binfmts.h>
28#include <linux/mman.h> 28#include <linux/mman.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/nsproxy.h>
30#include <linux/capability.h> 31#include <linux/capability.h>
31#include <linux/cpu.h> 32#include <linux/cpu.h>
32#include <linux/cpuset.h> 33#include <linux/cpuset.h>
@@ -42,6 +43,7 @@
42#include <linux/profile.h> 43#include <linux/profile.h>
43#include <linux/rmap.h> 44#include <linux/rmap.h>
44#include <linux/acct.h> 45#include <linux/acct.h>
46#include <linux/tsacct_kern.h>
45#include <linux/cn_proc.h> 47#include <linux/cn_proc.h>
46#include <linux/delayacct.h> 48#include <linux/delayacct.h>
47#include <linux/taskstats_kern.h> 49#include <linux/taskstats_kern.h>
@@ -1115,11 +1117,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1115 goto bad_fork_cleanup_signal; 1117 goto bad_fork_cleanup_signal;
1116 if ((retval = copy_keys(clone_flags, p))) 1118 if ((retval = copy_keys(clone_flags, p)))
1117 goto bad_fork_cleanup_mm; 1119 goto bad_fork_cleanup_mm;
1118 if ((retval = copy_namespace(clone_flags, p))) 1120 if ((retval = copy_namespaces(clone_flags, p)))
1119 goto bad_fork_cleanup_keys; 1121 goto bad_fork_cleanup_keys;
1120 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1122 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1121 if (retval) 1123 if (retval)
1122 goto bad_fork_cleanup_namespace; 1124 goto bad_fork_cleanup_namespaces;
1123 1125
1124 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1126 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1125 /* 1127 /*
@@ -1211,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1211 spin_unlock(&current->sighand->siglock); 1213 spin_unlock(&current->sighand->siglock);
1212 write_unlock_irq(&tasklist_lock); 1214 write_unlock_irq(&tasklist_lock);
1213 retval = -ERESTARTNOINTR; 1215 retval = -ERESTARTNOINTR;
1214 goto bad_fork_cleanup_namespace; 1216 goto bad_fork_cleanup_namespaces;
1215 } 1217 }
1216 1218
1217 if (clone_flags & CLONE_THREAD) { 1219 if (clone_flags & CLONE_THREAD) {
@@ -1259,8 +1261,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1259 proc_fork_connector(p); 1261 proc_fork_connector(p);
1260 return p; 1262 return p;
1261 1263
1262bad_fork_cleanup_namespace: 1264bad_fork_cleanup_namespaces:
1263 exit_namespace(p); 1265 exit_task_namespaces(p);
1264bad_fork_cleanup_keys: 1266bad_fork_cleanup_keys:
1265 exit_keys(p); 1267 exit_keys(p);
1266bad_fork_cleanup_mm: 1268bad_fork_cleanup_mm:
@@ -1513,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1513 */ 1515 */
1514static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) 1516static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
1515{ 1517{
1516 struct namespace *ns = current->namespace; 1518 struct namespace *ns = current->nsproxy->namespace;
1517 1519
1518 if ((unshare_flags & CLONE_NEWNS) && 1520 if ((unshare_flags & CLONE_NEWNS) && ns) {
1519 (ns && atomic_read(&ns->count) > 1)) {
1520 if (!capable(CAP_SYS_ADMIN)) 1521 if (!capable(CAP_SYS_ADMIN))
1521 return -EPERM; 1522 return -EPERM;
1522 1523
@@ -1588,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
1588 return 0; 1589 return 0;
1589} 1590}
1590 1591
1592#ifndef CONFIG_IPC_NS
1593static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
1594{
1595 if (flags & CLONE_NEWIPC)
1596 return -EINVAL;
1597
1598 return 0;
1599}
1600#endif
1601
1591/* 1602/*
1592 * unshare allows a process to 'unshare' part of the process 1603 * unshare allows a process to 'unshare' part of the process
1593 * context which was originally shared using clone. copy_* 1604 * context which was originally shared using clone. copy_*
@@ -1605,13 +1616,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1605 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1616 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1606 struct files_struct *fd, *new_fd = NULL; 1617 struct files_struct *fd, *new_fd = NULL;
1607 struct sem_undo_list *new_ulist = NULL; 1618 struct sem_undo_list *new_ulist = NULL;
1619 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
1620 struct uts_namespace *uts, *new_uts = NULL;
1621 struct ipc_namespace *ipc, *new_ipc = NULL;
1608 1622
1609 check_unshare_flags(&unshare_flags); 1623 check_unshare_flags(&unshare_flags);
1610 1624
1611 /* Return -EINVAL for all unsupported flags */ 1625 /* Return -EINVAL for all unsupported flags */
1612 err = -EINVAL; 1626 err = -EINVAL;
1613 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1627 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1614 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) 1628 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1629 CLONE_NEWUTS|CLONE_NEWIPC))
1615 goto bad_unshare_out; 1630 goto bad_unshare_out;
1616 1631
1617 if ((err = unshare_thread(unshare_flags))) 1632 if ((err = unshare_thread(unshare_flags)))
@@ -1628,11 +1643,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1628 goto bad_unshare_cleanup_vm; 1643 goto bad_unshare_cleanup_vm;
1629 if ((err = unshare_semundo(unshare_flags, &new_ulist))) 1644 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1630 goto bad_unshare_cleanup_fd; 1645 goto bad_unshare_cleanup_fd;
1646 if ((err = unshare_utsname(unshare_flags, &new_uts)))
1647 goto bad_unshare_cleanup_semundo;
1648 if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
1649 goto bad_unshare_cleanup_uts;
1650
1651 if (new_ns || new_uts || new_ipc) {
1652 old_nsproxy = current->nsproxy;
1653 new_nsproxy = dup_namespaces(old_nsproxy);
1654 if (!new_nsproxy) {
1655 err = -ENOMEM;
1656 goto bad_unshare_cleanup_ipc;
1657 }
1658 }
1631 1659
1632 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { 1660 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
1661 new_uts || new_ipc) {
1633 1662
1634 task_lock(current); 1663 task_lock(current);
1635 1664
1665 if (new_nsproxy) {
1666 current->nsproxy = new_nsproxy;
1667 new_nsproxy = old_nsproxy;
1668 }
1669
1636 if (new_fs) { 1670 if (new_fs) {
1637 fs = current->fs; 1671 fs = current->fs;
1638 current->fs = new_fs; 1672 current->fs = new_fs;
@@ -1640,8 +1674,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1640 } 1674 }
1641 1675
1642 if (new_ns) { 1676 if (new_ns) {
1643 ns = current->namespace; 1677 ns = current->nsproxy->namespace;
1644 current->namespace = new_ns; 1678 current->nsproxy->namespace = new_ns;
1645 new_ns = ns; 1679 new_ns = ns;
1646 } 1680 }
1647 1681
@@ -1666,9 +1700,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1666 new_fd = fd; 1700 new_fd = fd;
1667 } 1701 }
1668 1702
1703 if (new_uts) {
1704 uts = current->nsproxy->uts_ns;
1705 current->nsproxy->uts_ns = new_uts;
1706 new_uts = uts;
1707 }
1708
1709 if (new_ipc) {
1710 ipc = current->nsproxy->ipc_ns;
1711 current->nsproxy->ipc_ns = new_ipc;
1712 new_ipc = ipc;
1713 }
1714
1669 task_unlock(current); 1715 task_unlock(current);
1670 } 1716 }
1671 1717
1718 if (new_nsproxy)
1719 put_nsproxy(new_nsproxy);
1720
1721bad_unshare_cleanup_ipc:
1722 if (new_ipc)
1723 put_ipc_ns(new_ipc);
1724
1725bad_unshare_cleanup_uts:
1726 if (new_uts)
1727 put_uts_ns(new_uts);
1728
1729bad_unshare_cleanup_semundo:
1672bad_unshare_cleanup_fd: 1730bad_unshare_cleanup_fd:
1673 if (new_fd) 1731 if (new_fd)
1674 put_files_struct(new_fd); 1732 put_files_struct(new_fd);
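sys_unshare() now accepts CLONE_NEWUTS and CLONE_NEWIPC: when any namespace is being unshared it duplicates the nsproxy first, swaps it into current under task_lock(), and only then exchanges the individual namespace pointers. From userspace the new flags are used the same way as the existing ones; a minimal illustration (not part of this patch, and the flag value is assumed from the series' header changes):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS	0x04000000	/* assumed value; check your headers */
#endif

int main(void)
{
	char buf[65];

	/* Requires CAP_SYS_ADMIN and a kernel built with CONFIG_UTS_NS. */
	if (unshare(CLONE_NEWUTS) == -1) {
		perror("unshare(CLONE_NEWUTS)");
		return 1;
	}

	/* This change is private to the new UTS namespace. */
	sethostname("sandbox", strlen("sandbox"));
	gethostname(buf, sizeof(buf));
	printf("hostname here: %s\n", buf);
	return 0;
}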
diff --git a/kernel/futex.c b/kernel/futex.c
index 4b6770e980..4aaf91951a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1527 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 1527 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
1528 1528
1529 if (signal) { 1529 if (signal) {
1530 err = f_setown(filp, current->pid, 1); 1530 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
1531 if (err < 0) { 1531 if (err < 0) {
1532 goto error; 1532 goto error;
1533 } 1533 }
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab16a5a4cf..eeac3e313b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -69,6 +69,15 @@ static inline int is_kernel(unsigned long addr)
69 return in_gate_area_no_task(addr); 69 return in_gate_area_no_task(addr);
70} 70}
71 71
72static int is_ksym_addr(unsigned long addr)
73{
74 if (all_var)
75 return is_kernel(addr);
76
77 return is_kernel_text(addr) || is_kernel_inittext(addr) ||
78 is_kernel_extratext(addr);
79}
80
72/* expand a compressed symbol data into the resulting uncompressed string, 81/* expand a compressed symbol data into the resulting uncompressed string,
73 given the offset to where the symbol is in the compressed stream */ 82 given the offset to where the symbol is in the compressed stream */
74static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 83static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
@@ -154,7 +163,73 @@ unsigned long kallsyms_lookup_name(const char *name)
154 } 163 }
155 return module_kallsyms_lookup_name(name); 164 return module_kallsyms_lookup_name(name);
156} 165}
157EXPORT_SYMBOL_GPL(kallsyms_lookup_name); 166
167static unsigned long get_symbol_pos(unsigned long addr,
168 unsigned long *symbolsize,
169 unsigned long *offset)
170{
171 unsigned long symbol_start = 0, symbol_end = 0;
172 unsigned long i, low, high, mid;
173
174	/* This kernel should never have been booted. */
175 BUG_ON(!kallsyms_addresses);
176
177 /* do a binary search on the sorted kallsyms_addresses array */
178 low = 0;
179 high = kallsyms_num_syms;
180
181 while (high - low > 1) {
182 mid = (low + high) / 2;
183 if (kallsyms_addresses[mid] <= addr)
184 low = mid;
185 else
186 high = mid;
187 }
188
189 /*
190 * search for the first aliased symbol. Aliased
191 * symbols are symbols with the same address
192 */
193 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
194 --low;
195
196 symbol_start = kallsyms_addresses[low];
197
198 /* Search for next non-aliased symbol */
199 for (i = low + 1; i < kallsyms_num_syms; i++) {
200 if (kallsyms_addresses[i] > symbol_start) {
201 symbol_end = kallsyms_addresses[i];
202 break;
203 }
204 }
205
206 /* if we found no next symbol, we use the end of the section */
207 if (!symbol_end) {
208 if (is_kernel_inittext(addr))
209 symbol_end = (unsigned long)_einittext;
210 else if (all_var)
211 symbol_end = (unsigned long)_end;
212 else
213 symbol_end = (unsigned long)_etext;
214 }
215
216 *symbolsize = symbol_end - symbol_start;
217 *offset = addr - symbol_start;
218
219 return low;
220}
221
222/*
223 * Lookup an address but don't bother to find any names.
224 */
225int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
226 unsigned long *offset)
227{
228 if (is_ksym_addr(addr))
229 return !!get_symbol_pos(addr, symbolsize, offset);
230
231 return !!module_address_lookup(addr, symbolsize, offset, NULL);
232}
158 233
159/* 234/*
160 * Lookup an address 235 * Lookup an address
@@ -168,57 +243,18 @@ const char *kallsyms_lookup(unsigned long addr,
168 unsigned long *offset, 243 unsigned long *offset,
169 char **modname, char *namebuf) 244 char **modname, char *namebuf)
170{ 245{
171 unsigned long i, low, high, mid;
172 const char *msym; 246 const char *msym;
173 247
174 /* This kernel should never had been booted. */
175 BUG_ON(!kallsyms_addresses);
176
177 namebuf[KSYM_NAME_LEN] = 0; 248 namebuf[KSYM_NAME_LEN] = 0;
178 namebuf[0] = 0; 249 namebuf[0] = 0;
179 250
180 if ((all_var && is_kernel(addr)) || 251 if (is_ksym_addr(addr)) {
181 (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || 252 unsigned long pos;
182 is_kernel_extratext(addr)))) {
183 unsigned long symbol_end = 0;
184
185 /* do a binary search on the sorted kallsyms_addresses array */
186 low = 0;
187 high = kallsyms_num_syms;
188
189 while (high-low > 1) {
190 mid = (low + high) / 2;
191 if (kallsyms_addresses[mid] <= addr) low = mid;
192 else high = mid;
193 }
194
195 /* search for the first aliased symbol. Aliased symbols are
196 symbols with the same address */
197 while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
198 --low;
199 253
254 pos = get_symbol_pos(addr, symbolsize, offset);
200 /* Grab name */ 255 /* Grab name */
201 kallsyms_expand_symbol(get_symbol_offset(low), namebuf); 256 kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
202
203 /* Search for next non-aliased symbol */
204 for (i = low + 1; i < kallsyms_num_syms; i++) {
205 if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
206 symbol_end = kallsyms_addresses[i];
207 break;
208 }
209 }
210
211 /* if we found no next symbol, we use the end of the section */
212 if (!symbol_end) {
213 if (is_kernel_inittext(addr))
214 symbol_end = (unsigned long)_einittext;
215 else
216 symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
217 }
218
219 *symbolsize = symbol_end - kallsyms_addresses[low];
220 *modname = NULL; 257 *modname = NULL;
221 *offset = addr - kallsyms_addresses[low];
222 return namebuf; 258 return namebuf;
223 } 259 }
224 260
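The lookup logic is now split: get_symbol_pos() does the binary search over kallsyms_addresses[] plus the alias and size fixups, kallsyms_lookup() keeps the name expansion, and the new kallsyms_lookup_size_offset() answers only how big the containing symbol is and how far into it an address lies, without touching names. A hedged sketch of a caller (the printk is illustrative, not from this patch):

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void report_extent(unsigned long addr)
{
	unsigned long size, offset;

	/* Nonzero when addr lies inside a known kernel or module symbol. */
	if (kallsyms_lookup_size_offset(addr, &size, &offset))
		printk(KERN_DEBUG "0x%lx is %lu bytes into a %lu-byte symbol\n",
		       addr, offset, size);
	else
		printk(KERN_DEBUG "0x%lx: no kallsyms entry\n", addr);
}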
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 842f8015d7..bb4e29d924 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -18,8 +18,6 @@
18 call_usermodehelper wait flag, and remove exec_usermodehelper. 18 call_usermodehelper wait flag, and remove exec_usermodehelper.
19 Rusty Russell <rusty@rustcorp.com.au> Jan 2003 19 Rusty Russell <rusty@rustcorp.com.au> Jan 2003
20*/ 20*/
21#define __KERNEL_SYSCALLS__
22
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/sched.h> 22#include <linux/sched.h>
25#include <linux/syscalls.h> 23#include <linux/syscalls.h>
@@ -35,6 +33,7 @@
35#include <linux/mount.h> 33#include <linux/mount.h>
36#include <linux/kernel.h> 34#include <linux/kernel.h>
37#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/resource.h>
38#include <asm/uaccess.h> 37#include <asm/uaccess.h>
39 38
40extern int max_threads; 39extern int max_threads;
@@ -122,6 +121,7 @@ struct subprocess_info {
122 struct key *ring; 121 struct key *ring;
123 int wait; 122 int wait;
124 int retval; 123 int retval;
124 struct file *stdin;
125}; 125};
126 126
127/* 127/*
@@ -145,12 +145,30 @@ static int ____call_usermodehelper(void *data)
145 145
146 key_put(old_session); 146 key_put(old_session);
147 147
148 /* Install input pipe when needed */
149 if (sub_info->stdin) {
150 struct files_struct *f = current->files;
151 struct fdtable *fdt;
152 /* no races because files should be private here */
153 sys_close(0);
154 fd_install(0, sub_info->stdin);
155 spin_lock(&f->file_lock);
156 fdt = files_fdtable(f);
157 FD_SET(0, fdt->open_fds);
158 FD_CLR(0, fdt->close_on_exec);
159 spin_unlock(&f->file_lock);
160
161 /* and disallow core files too */
162 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
163 }
164
148 /* We can run anywhere, unlike our parent keventd(). */ 165 /* We can run anywhere, unlike our parent keventd(). */
149 set_cpus_allowed(current, CPU_MASK_ALL); 166 set_cpus_allowed(current, CPU_MASK_ALL);
150 167
151 retval = -EPERM; 168 retval = -EPERM;
152 if (current->fs->root) 169 if (current->fs->root)
153 retval = execve(sub_info->path, sub_info->argv,sub_info->envp); 170 retval = kernel_execve(sub_info->path,
171 sub_info->argv, sub_info->envp);
154 172
155 /* Exec failed? */ 173 /* Exec failed? */
156 sub_info->retval = retval; 174 sub_info->retval = retval;
@@ -268,6 +286,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
268} 286}
269EXPORT_SYMBOL(call_usermodehelper_keys); 287EXPORT_SYMBOL(call_usermodehelper_keys);
270 288
289int call_usermodehelper_pipe(char *path, char **argv, char **envp,
290 struct file **filp)
291{
292 DECLARE_COMPLETION(done);
293 struct subprocess_info sub_info = {
294 .complete = &done,
295 .path = path,
296 .argv = argv,
297 .envp = envp,
298 .retval = 0,
299 };
300 struct file *f;
301 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
302
303 if (!khelper_wq)
304 return -EBUSY;
305
306 if (path[0] == '\0')
307 return 0;
308
309 f = create_write_pipe();
310 if (!f)
311 return -ENOMEM;
312 *filp = f;
313
314 f = create_read_pipe(f);
315 if (!f) {
316 free_write_pipe(*filp);
317 return -ENOMEM;
318 }
319 sub_info.stdin = f;
320
321 queue_work(khelper_wq, &work);
322 wait_for_completion(&done);
323 return sub_info.retval;
324}
325EXPORT_SYMBOL(call_usermodehelper_pipe);
326
271void __init usermodehelper_init(void) 327void __init usermodehelper_init(void)
272{ 328{
273 khelper_wq = create_singlethread_workqueue("khelper"); 329 khelper_wq = create_singlethread_workqueue("khelper");
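call_usermodehelper_pipe() builds on the existing helper machinery: it creates a pipe, hands the read side to the helper as fd 0 (installed in ____call_usermodehelper() above), and returns the write side to the caller through *filp. A rough sketch of the calling pattern with a hypothetical helper path (the intended in-tree user of this interface is the pipe-to-program core dump support):

#include <linux/kmod.h>
#include <linux/fs.h>

static int run_helper_with_stdin(void)
{
	struct file *out;
	char *argv[] = { "/sbin/example-helper", NULL };	/* hypothetical */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	int err;

	err = call_usermodehelper_pipe(argv[0], argv, envp, &out);
	if (err)
		return err;

	/*
	 * The helper is now running with the pipe as its stdin; the caller
	 * streams data into 'out' and closes it when finished, which is how
	 * a piped core dump would feed the core image to the helper.
	 */
	return filp_close(out, NULL);
}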
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3f57dfdc8f..610c837ad9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h>
40#include <asm-generic/sections.h> 41#include <asm-generic/sections.h>
41#include <asm/cacheflush.h> 42#include <asm/cacheflush.h>
42#include <asm/errno.h> 43#include <asm/errno.h>
@@ -45,6 +46,16 @@
45#define KPROBE_HASH_BITS 6 46#define KPROBE_HASH_BITS 6
46#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 47#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
47 48
49
50/*
51 * Some oddball architectures like 64bit powerpc have function descriptors
52 * so this must be overridable.
53 */
54#ifndef kprobe_lookup_name
55#define kprobe_lookup_name(name, addr) \
56 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
57#endif
58
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 59static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 60static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count; 61static atomic_t kprobe_count;
@@ -308,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri)
308} 319}
309 320
310/* Called with kretprobe_lock held */ 321/* Called with kretprobe_lock held */
311void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) 322void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
323 struct hlist_head *head)
312{ 324{
313 /* remove rp inst off the rprobe_inst_table */ 325 /* remove rp inst off the rprobe_inst_table */
314 hlist_del(&ri->hlist); 326 hlist_del(&ri->hlist);
@@ -320,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
320 hlist_add_head(&ri->uflist, &ri->rp->free_instances); 332 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
321 } else 333 } else
322 /* Unregistering */ 334 /* Unregistering */
323 kfree(ri); 335 hlist_add_head(&ri->hlist, head);
324} 336}
325 337
326struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 338struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
@@ -336,18 +348,24 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
336 */ 348 */
337void __kprobes kprobe_flush_task(struct task_struct *tk) 349void __kprobes kprobe_flush_task(struct task_struct *tk)
338{ 350{
339 struct kretprobe_instance *ri; 351 struct kretprobe_instance *ri;
340 struct hlist_head *head; 352 struct hlist_head *head, empty_rp;
341 struct hlist_node *node, *tmp; 353 struct hlist_node *node, *tmp;
342 unsigned long flags = 0; 354 unsigned long flags = 0;
343 355
356 INIT_HLIST_HEAD(&empty_rp);
344 spin_lock_irqsave(&kretprobe_lock, flags); 357 spin_lock_irqsave(&kretprobe_lock, flags);
345 head = kretprobe_inst_table_head(tk); 358 head = kretprobe_inst_table_head(tk);
346 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 359 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
347 if (ri->task == tk) 360 if (ri->task == tk)
348 recycle_rp_inst(ri); 361 recycle_rp_inst(ri, &empty_rp);
349 } 362 }
350 spin_unlock_irqrestore(&kretprobe_lock, flags); 363 spin_unlock_irqrestore(&kretprobe_lock, flags);
364
365 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
366 hlist_del(&ri->hlist);
367 kfree(ri);
368 }
351} 369}
352 370
353static inline void free_rp_inst(struct kretprobe *rp) 371static inline void free_rp_inst(struct kretprobe *rp)
@@ -447,6 +465,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
447 struct kprobe *old_p; 465 struct kprobe *old_p;
448 struct module *probed_mod; 466 struct module *probed_mod;
449 467
468 /*
469 * If we have a symbol_name argument look it up,
470 * and add it to the address. That way the addr
471 * field can either be global or relative to a symbol.
472 */
473 if (p->symbol_name) {
474 if (p->addr)
475 return -EINVAL;
476 kprobe_lookup_name(p->symbol_name, p->addr);
477 }
478
479 if (!p->addr)
480 return -EINVAL;
481 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
482
450 if ((!kernel_text_address((unsigned long) p->addr)) || 483 if ((!kernel_text_address((unsigned long) p->addr)) ||
451 in_kprobes_functions((unsigned long) p->addr)) 484 in_kprobes_functions((unsigned long) p->addr))
452 return -EINVAL; 485 return -EINVAL;
@@ -488,7 +521,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
488 (ARCH_INACTIVE_KPROBE_COUNT + 1)) 521 (ARCH_INACTIVE_KPROBE_COUNT + 1))
489 register_page_fault_notifier(&kprobe_page_fault_nb); 522 register_page_fault_notifier(&kprobe_page_fault_nb);
490 523
491 arch_arm_kprobe(p); 524 arch_arm_kprobe(p);
492 525
493out: 526out:
494 mutex_unlock(&kprobe_mutex); 527 mutex_unlock(&kprobe_mutex);
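With the symbol_name/offset handling added to __register_kprobe(), a probe can be specified by name instead of a precomputed address; kprobe_lookup_name() defaults to kallsyms_lookup_name() but can be overridden on architectures with function descriptors (64-bit powerpc). A minimal module-style sketch of the new usage (probe target and message are illustrative):

#include <linux/module.h>
#include <linux/kprobes.h>

static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_DEBUG "hit %s\n", p->symbol_name);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* resolved at register time */
	.offset		= 0,		/* probe the function entry */
	.pre_handler	= pre_handler,
};

static int __init kp_init(void)
{
	/* Setting both .addr and .symbol_name is rejected with -EINVAL. */
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");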
diff --git a/kernel/latency.c b/kernel/latency.c
new file mode 100644
index 0000000000..258f2555ab
--- /dev/null
+++ b/kernel/latency.c
@@ -0,0 +1,279 @@
1/*
2 * latency.c: Explicit system-wide latency-expectation infrastructure
3 *
4 * The purpose of this infrastructure is to allow device drivers to set
5 * the latency constraints they have and to collect and summarize these
6 * expectations globally. The accumulated result can then be used by
7 * power management and similar users to make decisions that have
8 * tradeoffs with a latency component.
9 *
10 * An example user of this are the x86 C-states; each higher C state saves
11 * more power, but has a higher exit latency. For the idle loop power
12 * code to make a good decision which C-state to use, information about
13 * acceptable latencies is required.
14 *
15 * An example announcer of latency is an audio driver that knows it
16 * will get an interrupt when the hardware has 200 usec of samples
17 * left in the DMA buffer; in that case the driver can set a latency
18 * constraint of, say, 150 usec.
19 *
20 * Multiple drivers can each announce their maximum accepted latency;
21 * to keep these apart, a string-based identifier is used.
22 *
23 *
24 * (C) Copyright 2006 Intel Corporation
25 * Author: Arjan van de Ven <arjan@linux.intel.com>
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation; version 2
30 * of the License.
31 */
32
33#include <linux/latency.h>
34#include <linux/list.h>
35#include <linux/spinlock.h>
36#include <linux/slab.h>
37#include <linux/module.h>
38#include <linux/notifier.h>
39#include <asm/atomic.h>
40
41struct latency_info {
42 struct list_head list;
43 int usecs;
44 char *identifier;
45};
46
47/*
48 * locking rule: all modifications to current_max_latency and
49 * latency_list need to be done while holding the latency_lock.
50 * latency_lock needs to be taken _irqsave.
51 */
52static atomic_t current_max_latency;
53static DEFINE_SPINLOCK(latency_lock);
54
55static LIST_HEAD(latency_list);
56static BLOCKING_NOTIFIER_HEAD(latency_notifier);
57
58/*
59 * This function returns the maximum latency allowed, which
60 * happens to be the minimum of all maximum latencies on the
61 * list.
62 */
63static int __find_max_latency(void)
64{
65 int min = INFINITE_LATENCY;
66 struct latency_info *info;
67
68 list_for_each_entry(info, &latency_list, list) {
69 if (info->usecs < min)
70 min = info->usecs;
71 }
72 return min;
73}
74
75/**
76 * set_acceptable_latency - sets the maximum latency acceptable
77 * @identifier: string that identifies this driver
78 * @usecs: maximum acceptable latency for this driver
79 *
80 * This function informs the kernel that this device (driver)
81 * can accept at most @usecs of latency. This setting is used for
82 * power management and similar tradeoffs.
83 *
84 * This function sleeps and can only be called from process
85 * context.
86 * Calling this function with an existing identifier is valid
87 * and will cause the existing latency setting to be changed.
88 */
89void set_acceptable_latency(char *identifier, int usecs)
90{
91 struct latency_info *info, *iter;
92 unsigned long flags;
93 int found_old = 0;
94
95 info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
96 if (!info)
97 return;
98 info->usecs = usecs;
99 info->identifier = kstrdup(identifier, GFP_KERNEL);
100 if (!info->identifier)
101 goto free_info;
102
103 spin_lock_irqsave(&latency_lock, flags);
104 list_for_each_entry(iter, &latency_list, list) {
105 if (strcmp(iter->identifier, identifier)==0) {
106 found_old = 1;
107 iter->usecs = usecs;
108 break;
109 }
110 }
111 if (!found_old)
112 list_add(&info->list, &latency_list);
113
114 if (usecs < atomic_read(&current_max_latency))
115 atomic_set(&current_max_latency, usecs);
116
117 spin_unlock_irqrestore(&latency_lock, flags);
118
119 blocking_notifier_call_chain(&latency_notifier,
120 atomic_read(&current_max_latency), NULL);
121
122 /*
123 * if we inserted the new one, we're done; otherwise there was
124 * an existing one so we need to free the redundant data
125 */
126 if (!found_old)
127 return;
128
129 kfree(info->identifier);
130free_info:
131 kfree(info);
132}
133EXPORT_SYMBOL_GPL(set_acceptable_latency);
134
135/**
136 * modify_acceptable_latency - changes the maximum latency acceptable
137 * @identifier: string that identifies this driver
138 * @usecs: maximum acceptable latency for this driver
139 *
140 * This function informs the kernel that this device (driver)
141 * can accept at most @usecs of latency. This setting is used for
142 * power management and similar tradeoffs.
143 *
144 * This function does not sleep and can be called in any context.
145 * Attempts to use a non-existing identifier are silently ignored.
146 *
147 * Due to the atomic nature of this function, the modified latency
148 * value will only be used for future decisions; past decisions
149 * can still lead to longer latencies in the near future.
150 */
151void modify_acceptable_latency(char *identifier, int usecs)
152{
153 struct latency_info *iter;
154 unsigned long flags;
155
156 spin_lock_irqsave(&latency_lock, flags);
157 list_for_each_entry(iter, &latency_list, list) {
158 if (strcmp(iter->identifier, identifier) == 0) {
159 iter->usecs = usecs;
160 break;
161 }
162 }
163 if (usecs < atomic_read(&current_max_latency))
164 atomic_set(&current_max_latency, usecs);
165 spin_unlock_irqrestore(&latency_lock, flags);
166}
167EXPORT_SYMBOL_GPL(modify_acceptable_latency);
168
169/**
170 * remove_acceptable_latency - removes the maximum latency acceptable
171 * @identifier: string that identifies this driver
172 *
173 * This function removes a previously set maximum latency setting
174 * for the driver and frees up any resources associated with the
175 * bookkeeping needed for this.
176 *
177 * This function does not sleep and can be called in any context.
178 * Attempts to use a non-existing identifier are silently ignored.
179 */
180void remove_acceptable_latency(char *identifier)
181{
182 unsigned long flags;
183 int newmax = 0;
184 struct latency_info *iter, *temp;
185
186 spin_lock_irqsave(&latency_lock, flags);
187
188 list_for_each_entry_safe(iter, temp, &latency_list, list) {
189 if (strcmp(iter->identifier, identifier) == 0) {
190 list_del(&iter->list);
191 newmax = iter->usecs;
192 kfree(iter->identifier);
193 kfree(iter);
194 break;
195 }
196 }
197
198 /* If we just deleted the system wide value, we need to
199 * recalculate with a full search
200 */
201 if (newmax == atomic_read(&current_max_latency)) {
202 newmax = __find_max_latency();
203 atomic_set(&current_max_latency, newmax);
204 }
205 spin_unlock_irqrestore(&latency_lock, flags);
206}
207EXPORT_SYMBOL_GPL(remove_acceptable_latency);
208
209/**
210 * system_latency_constraint - queries the system wide latency maximum
211 *
212 * This function returns the system wide maximum latency in
213 * microseconds.
214 *
215 * This function does not sleep and can be called in any context.
216 */
217int system_latency_constraint(void)
218{
219 return atomic_read(&current_max_latency);
220}
221EXPORT_SYMBOL_GPL(system_latency_constraint);
222
223/**
224 * synchronize_acceptable_latency - recalculates all latency decisions
225 *
226 * This function will cause a callback to various kernel pieces that
227 * will make those pieces rethink their latency decisions. This implies
228 * that if there are overlong latencies in hardware state already, those
229 * latencies get taken right now. When this call completes no overlong
230 * latency decisions should be active anymore.
231 *
232 * Typical usecase of this is after a modify_acceptable_latency() call,
233 * which in itself is non-blocking and non-synchronizing.
234 *
235 * This function blocks and should not be called with locks held.
236 */
237
238void synchronize_acceptable_latency(void)
239{
240 blocking_notifier_call_chain(&latency_notifier,
241 atomic_read(&current_max_latency), NULL);
242}
243EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
244
245/*
246 * Latency notifier: this notifier gets called when a non-atomic new
247 * latency value gets set. The expectation of the caller of the
248 * non-atomic set is that when the call returns, future latencies
249 * are within bounds, so the functions on the notifier list are
250 * expected to take the overlong latencies immediately, inside the
251 * callback, and not make an overlong latency decision anymore.
252 *
253 * The callback gets called when the new latency value is made
254 * active so system_latency_constraint() returns the new latency.
255 */
256int register_latency_notifier(struct notifier_block * nb)
257{
258 return blocking_notifier_chain_register(&latency_notifier, nb);
259}
260EXPORT_SYMBOL_GPL(register_latency_notifier);
261
262int unregister_latency_notifier(struct notifier_block * nb)
263{
264 return blocking_notifier_chain_unregister(&latency_notifier, nb);
265}
266EXPORT_SYMBOL_GPL(unregister_latency_notifier);
267
268static __init int latency_init(void)
269{
270 atomic_set(&current_max_latency, INFINITE_LATENCY);
271 /*
272 * we don't want by default to have longer latencies than 2 ticks,
273 * since that would cause lost ticks
274 */
275 set_acceptable_latency("kernel", 2*1000000/HZ);
276 return 0;
277}
278
279module_init(latency_init);
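Taken together, the API is meant to be used like this from a driver: establish a constraint from process context, retune it from atomic context as conditions change, and remove it when the device falls idle. A hedged sketch matching the audio example from the header comment (identifier and numbers are made up):

#include <linux/latency.h>

static char example_id[] = "example-audio";

static int example_start_stream(void)
{
	/* Process context only: this path allocates and may call the notifier chain. */
	set_acceptable_latency(example_id, 150);	/* usec */
	return 0;
}

static void example_buffer_refilled(void)
{
	/*
	 * Safe in atomic context: modify_acceptable_latency() neither sleeps
	 * nor allocates, so it can be called from the interrupt handler that
	 * just refilled the DMA buffer.
	 */
	modify_acceptable_latency(example_id, 500);
}

static void example_stop_stream(void)
{
	remove_acceptable_latency(example_id);
}

Consumers such as the C-state selection code read the combined result with system_latency_constraint(), or subscribe with register_latency_notifier() to be told when the bound changes.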
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e596525669..4c05534610 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -518,9 +518,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
518 518
519static void print_kernel_version(void) 519static void print_kernel_version(void)
520{ 520{
521 printk("%s %.*s\n", system_utsname.release, 521 printk("%s %.*s\n", init_utsname()->release,
522 (int)strcspn(system_utsname.version, " "), 522 (int)strcspn(init_utsname()->version, " "),
523 system_utsname.version); 523 init_utsname()->version);
524} 524}
525 525
526/* 526/*
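This is one of many call sites converted from the old global system_utsname to the UTS-namespace accessors introduced by this series (kernel/power/snapshot.c below gets the same treatment). The accessors are roughly as follows (reconstructed, not shown in this diff):

/* include/linux/utsname.h, approximate shape */
static inline struct new_utsname *utsname(void)
{
	return &current->nsproxy->uts_ns->name;	/* the current task's view */
}

static inline struct new_utsname *init_utsname(void)
{
	return &init_uts_ns.name;		/* the initial namespace's view */
}

Code that reports about the kernel itself (lockdep, the swsusp image header) uses init_utsname(); code acting on behalf of a task uses utsname().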
diff --git a/kernel/module.c b/kernel/module.c
index 05625d5dc7..7f60e782de 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -851,6 +851,7 @@ static int check_version(Elf_Shdr *sechdrs,
851 printk("%s: no version for \"%s\" found: kernel tainted.\n", 851 printk("%s: no version for \"%s\" found: kernel tainted.\n",
852 mod->name, symname); 852 mod->name, symname);
853 add_taint(TAINT_FORCED_MODULE); 853 add_taint(TAINT_FORCED_MODULE);
854 mod->taints |= TAINT_FORCED_MODULE;
854 } 855 }
855 return 1; 856 return 1;
856} 857}
@@ -1339,6 +1340,7 @@ static void set_license(struct module *mod, const char *license)
1339 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1340 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1340 mod->name, license); 1341 mod->name, license);
1341 add_taint(TAINT_PROPRIETARY_MODULE); 1342 add_taint(TAINT_PROPRIETARY_MODULE);
1343 mod->taints |= TAINT_PROPRIETARY_MODULE;
1342 } 1344 }
1343} 1345}
1344 1346
@@ -1618,6 +1620,7 @@ static struct module *load_module(void __user *umod,
1618 /* This is allowed: modprobe --force will invalidate it. */ 1620 /* This is allowed: modprobe --force will invalidate it. */
1619 if (!modmagic) { 1621 if (!modmagic) {
1620 add_taint(TAINT_FORCED_MODULE); 1622 add_taint(TAINT_FORCED_MODULE);
1623 mod->taints |= TAINT_FORCED_MODULE;
1621 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1624 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1622 mod->name); 1625 mod->name);
1623 } else if (!same_magic(modmagic, vermagic)) { 1626 } else if (!same_magic(modmagic, vermagic)) {
@@ -1711,10 +1714,14 @@ static struct module *load_module(void __user *umod,
1711 /* Set up license info based on the info section */ 1714 /* Set up license info based on the info section */
1712 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1715 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1713 1716
1714 if (strcmp(mod->name, "ndiswrapper") == 0) 1717 if (strcmp(mod->name, "ndiswrapper") == 0) {
1715 add_taint(TAINT_PROPRIETARY_MODULE); 1718 add_taint(TAINT_PROPRIETARY_MODULE);
1716 if (strcmp(mod->name, "driverloader") == 0) 1719 mod->taints |= TAINT_PROPRIETARY_MODULE;
1720 }
1721 if (strcmp(mod->name, "driverloader") == 0) {
1717 add_taint(TAINT_PROPRIETARY_MODULE); 1722 add_taint(TAINT_PROPRIETARY_MODULE);
1723 mod->taints |= TAINT_PROPRIETARY_MODULE;
1724 }
1718 1725
1719 /* Set up MODINFO_ATTR fields */ 1726 /* Set up MODINFO_ATTR fields */
1720 setup_modinfo(mod, sechdrs, infoindex); 1727 setup_modinfo(mod, sechdrs, infoindex);
@@ -1760,6 +1767,7 @@ static struct module *load_module(void __user *umod,
1760 printk(KERN_WARNING "%s: No versions for exported symbols." 1767 printk(KERN_WARNING "%s: No versions for exported symbols."
1761 " Tainting kernel.\n", mod->name); 1768 " Tainting kernel.\n", mod->name);
1762 add_taint(TAINT_FORCED_MODULE); 1769 add_taint(TAINT_FORCED_MODULE);
1770 mod->taints |= TAINT_FORCED_MODULE;
1763 } 1771 }
1764#endif 1772#endif
1765 1773
@@ -2032,7 +2040,8 @@ const char *module_address_lookup(unsigned long addr,
2032 list_for_each_entry(mod, &modules, list) { 2040 list_for_each_entry(mod, &modules, list) {
2033 if (within(addr, mod->module_init, mod->init_size) 2041 if (within(addr, mod->module_init, mod->init_size)
2034 || within(addr, mod->module_core, mod->core_size)) { 2042 || within(addr, mod->module_core, mod->core_size)) {
2035 *modname = mod->name; 2043 if (modname)
2044 *modname = mod->name;
2036 return get_ksymbol(mod, addr, size, offset); 2045 return get_ksymbol(mod, addr, size, offset);
2037 } 2046 }
2038 } 2047 }
@@ -2226,14 +2235,37 @@ struct module *module_text_address(unsigned long addr)
2226 return mod; 2235 return mod;
2227} 2236}
2228 2237
2238static char *taint_flags(unsigned int taints, char *buf)
2239{
2240 *buf = '\0';
2241 if (taints) {
2242 int bx;
2243
2244 buf[0] = '(';
2245 bx = 1;
2246 if (taints & TAINT_PROPRIETARY_MODULE)
2247 buf[bx++] = 'P';
2248 if (taints & TAINT_FORCED_MODULE)
2249 buf[bx++] = 'F';
2250 /*
2251 * TAINT_FORCED_RMMOD: could be added.
2252 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2253 * apply to modules.
2254 */
2255 buf[bx] = ')';
2256 }
2257 return buf;
2258}
2259
2229/* Don't grab lock, we're oopsing. */ 2260/* Don't grab lock, we're oopsing. */
2230void print_modules(void) 2261void print_modules(void)
2231{ 2262{
2232 struct module *mod; 2263 struct module *mod;
2264 char buf[8];
2233 2265
2234 printk("Modules linked in:"); 2266 printk("Modules linked in:");
2235 list_for_each_entry(mod, &modules, list) 2267 list_for_each_entry(mod, &modules, list)
2236 printk(" %s", mod->name); 2268 printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
2237 printk("\n"); 2269 printk("\n");
2238} 2270}
2239 2271
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
new file mode 100644
index 0000000000..6ebdb82a0c
--- /dev/null
+++ b/kernel/nsproxy.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2006 IBM Corporation
3 *
4 * Author: Serge Hallyn <serue@us.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 *
11 * Jun 2006 - namespaces support
12 * OpenVZ, SWsoft Inc.
13 * Pavel Emelianov <xemul@openvz.org>
14 */
15
16#include <linux/module.h>
17#include <linux/version.h>
18#include <linux/nsproxy.h>
19#include <linux/init_task.h>
20#include <linux/namespace.h>
21#include <linux/utsname.h>
22
23struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
24
25static inline void get_nsproxy(struct nsproxy *ns)
26{
27 atomic_inc(&ns->count);
28}
29
30void get_task_namespaces(struct task_struct *tsk)
31{
32 struct nsproxy *ns = tsk->nsproxy;
33 if (ns) {
34 get_nsproxy(ns);
35 }
36}
37
38/*
39 * creates a copy of "orig" with refcount 1.
40 * This does not grab references to the contained namespaces,
41 * so that needs to be done by dup_namespaces.
42 */
43static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
44{
45 struct nsproxy *ns;
46
47 ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL);
48 if (ns) {
49 memcpy(ns, orig, sizeof(struct nsproxy));
50 atomic_set(&ns->count, 1);
51 }
52 return ns;
53}
54
55/*
56 * copies the nsproxy, setting refcount to 1, and grabbing a
57 * reference to all contained namespaces. Called from
58 * sys_unshare()
59 */
60struct nsproxy *dup_namespaces(struct nsproxy *orig)
61{
62 struct nsproxy *ns = clone_namespaces(orig);
63
64 if (ns) {
65 if (ns->namespace)
66 get_namespace(ns->namespace);
67 if (ns->uts_ns)
68 get_uts_ns(ns->uts_ns);
69 if (ns->ipc_ns)
70 get_ipc_ns(ns->ipc_ns);
71 }
72
73 return ns;
74}
75
76/*
77 * called from clone. This now handles copy for nsproxy and all
78 * namespaces therein.
79 */
80int copy_namespaces(int flags, struct task_struct *tsk)
81{
82 struct nsproxy *old_ns = tsk->nsproxy;
83 struct nsproxy *new_ns;
84 int err = 0;
85
86 if (!old_ns)
87 return 0;
88
89 get_nsproxy(old_ns);
90
91 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
92 return 0;
93
94 new_ns = clone_namespaces(old_ns);
95 if (!new_ns) {
96 err = -ENOMEM;
97 goto out;
98 }
99
100 tsk->nsproxy = new_ns;
101
102 err = copy_namespace(flags, tsk);
103 if (err)
104 goto out_ns;
105
106 err = copy_utsname(flags, tsk);
107 if (err)
108 goto out_uts;
109
110 err = copy_ipcs(flags, tsk);
111 if (err)
112 goto out_ipc;
113
114out:
115 put_nsproxy(old_ns);
116 return err;
117
118out_ipc:
119 if (new_ns->uts_ns)
120 put_uts_ns(new_ns->uts_ns);
121out_uts:
122 if (new_ns->namespace)
123 put_namespace(new_ns->namespace);
124out_ns:
125 tsk->nsproxy = old_ns;
126 kfree(new_ns);
127 goto out;
128}
129
130void free_nsproxy(struct nsproxy *ns)
131{
132 if (ns->namespace)
133 put_namespace(ns->namespace);
134 if (ns->uts_ns)
135 put_uts_ns(ns->uts_ns);
136 if (ns->ipc_ns)
137 put_ipc_ns(ns->ipc_ns);
138 kfree(ns);
139}
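copy_namespaces() takes a reference on the parent's nsproxy and only clones it when one of CLONE_NEWNS, CLONE_NEWUTS or CLONE_NEWIPC is set, so the common fork path stays a single atomic_inc(). The structure being reference-counted, as far as this file exercises it, looks like this (any further fields in linux/nsproxy.h are omitted):

struct nsproxy {
	atomic_t		count;		/* tasks pointing at this proxy */
	struct namespace	*namespace;	/* mount namespace */
	struct uts_namespace	*uts_ns;
	struct ipc_namespace	*ipc_ns;
};

Keeping the namespace pointers behind one refcounted proxy means a plain clone() copies a single pointer, while unshare() or clone() with a CLONE_NEW* flag pays for a fresh nsproxy plus only the namespaces actually being replaced.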
diff --git a/kernel/panic.c b/kernel/panic.c
index 6ceb664fb5..525e365f72 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,7 +21,6 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22 22
23int panic_on_oops; 23int panic_on_oops;
24int panic_on_unrecovered_nmi;
25int tainted; 24int tainted;
26static int pause_on_oops; 25static int pause_on_oops;
27static int pause_on_oops_flag; 26static int pause_on_oops_flag;
diff --git a/kernel/pid.c b/kernel/pid.c
index 8387e8c681..b914392085 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/pspace.h>
29 30
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
@@ -33,17 +34,20 @@ static int pidhash_shift;
33static kmem_cache_t *pid_cachep; 34static kmem_cache_t *pid_cachep;
34 35
35int pid_max = PID_MAX_DEFAULT; 36int pid_max = PID_MAX_DEFAULT;
36int last_pid;
37 37
38#define RESERVED_PIDS 300 38#define RESERVED_PIDS 300
39 39
40int pid_max_min = RESERVED_PIDS + 1; 40int pid_max_min = RESERVED_PIDS + 1;
41int pid_max_max = PID_MAX_LIMIT; 41int pid_max_max = PID_MAX_LIMIT;
42 42
43#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
44#define BITS_PER_PAGE (PAGE_SIZE*8) 43#define BITS_PER_PAGE (PAGE_SIZE*8)
45#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
46#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) 45
46static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
47{
48 return (map - pspace->pidmap)*BITS_PER_PAGE + off;
49}
50
47#define find_next_offset(map, off) \ 51#define find_next_offset(map, off) \
48 find_next_zero_bit((map)->page, BITS_PER_PAGE, off) 52 find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
49 53
@@ -53,13 +57,12 @@ int pid_max_max = PID_MAX_LIMIT;
53 * value does not cause lots of bitmaps to be allocated, but 57 * value does not cause lots of bitmaps to be allocated, but
54 * the scheme scales to up to 4 million PIDs, runtime. 58 * the scheme scales to up to 4 million PIDs, runtime.
55 */ 59 */
56typedef struct pidmap { 60struct pspace init_pspace = {
57 atomic_t nr_free; 61 .pidmap = {
58 void *page; 62 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
59} pidmap_t; 63 },
60 64 .last_pid = 0
61static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 65};
62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
63 66
64/* 67/*
65 * Note: disable interrupts while the pidmap_lock is held as an 68 * Note: disable interrupts while the pidmap_lock is held as an
@@ -74,40 +77,41 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
74 * irq handlers that take it we can leave the interrupts enabled. 77 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen. 78 * For now it is easier to be safe than to prove it can't happen.
76 */ 79 */
80
77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
78 82
79static fastcall void free_pidmap(int pid) 83static fastcall void free_pidmap(struct pspace *pspace, int pid)
80{ 84{
81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 85 struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE;
82 int offset = pid & BITS_PER_PAGE_MASK; 86 int offset = pid & BITS_PER_PAGE_MASK;
83 87
84 clear_bit(offset, map->page); 88 clear_bit(offset, map->page);
85 atomic_inc(&map->nr_free); 89 atomic_inc(&map->nr_free);
86} 90}
87 91
88static int alloc_pidmap(void) 92static int alloc_pidmap(struct pspace *pspace)
89{ 93{
90 int i, offset, max_scan, pid, last = last_pid; 94 int i, offset, max_scan, pid, last = pspace->last_pid;
91 pidmap_t *map; 95 struct pidmap *map;
92 96
93 pid = last + 1; 97 pid = last + 1;
94 if (pid >= pid_max) 98 if (pid >= pid_max)
95 pid = RESERVED_PIDS; 99 pid = RESERVED_PIDS;
96 offset = pid & BITS_PER_PAGE_MASK; 100 offset = pid & BITS_PER_PAGE_MASK;
97 map = &pidmap_array[pid/BITS_PER_PAGE]; 101 map = &pspace->pidmap[pid/BITS_PER_PAGE];
98 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 102 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
99 for (i = 0; i <= max_scan; ++i) { 103 for (i = 0; i <= max_scan; ++i) {
100 if (unlikely(!map->page)) { 104 if (unlikely(!map->page)) {
101 unsigned long page = get_zeroed_page(GFP_KERNEL); 105 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
102 /* 106 /*
103 * Free the page if someone raced with us 107 * Free the page if someone raced with us
104 * installing it: 108 * installing it:
105 */ 109 */
106 spin_lock_irq(&pidmap_lock); 110 spin_lock_irq(&pidmap_lock);
107 if (map->page) 111 if (map->page)
108 free_page(page); 112 kfree(page);
109 else 113 else
110 map->page = (void *)page; 114 map->page = page;
111 spin_unlock_irq(&pidmap_lock); 115 spin_unlock_irq(&pidmap_lock);
112 if (unlikely(!map->page)) 116 if (unlikely(!map->page))
113 break; 117 break;
@@ -116,11 +120,11 @@ static int alloc_pidmap(void)
116 do { 120 do {
117 if (!test_and_set_bit(offset, map->page)) { 121 if (!test_and_set_bit(offset, map->page)) {
118 atomic_dec(&map->nr_free); 122 atomic_dec(&map->nr_free);
119 last_pid = pid; 123 pspace->last_pid = pid;
120 return pid; 124 return pid;
121 } 125 }
122 offset = find_next_offset(map, offset); 126 offset = find_next_offset(map, offset);
123 pid = mk_pid(map, offset); 127 pid = mk_pid(pspace, map, offset);
124 /* 128 /*
125 * find_next_offset() found a bit, the pid from it 129 * find_next_offset() found a bit, the pid from it
126 * is in-bounds, and if we fell back to the last 130 * is in-bounds, and if we fell back to the last
@@ -131,16 +135,34 @@ static int alloc_pidmap(void)
131 (i != max_scan || pid < last || 135 (i != max_scan || pid < last ||
132 !((last+1) & BITS_PER_PAGE_MASK))); 136 !((last+1) & BITS_PER_PAGE_MASK)));
133 } 137 }
134 if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { 138 if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
135 ++map; 139 ++map;
136 offset = 0; 140 offset = 0;
137 } else { 141 } else {
138 map = &pidmap_array[0]; 142 map = &pspace->pidmap[0];
139 offset = RESERVED_PIDS; 143 offset = RESERVED_PIDS;
140 if (unlikely(last == offset)) 144 if (unlikely(last == offset))
141 break; 145 break;
142 } 146 }
143 pid = mk_pid(map, offset); 147 pid = mk_pid(pspace, map, offset);
148 }
149 return -1;
150}
151
152static int next_pidmap(struct pspace *pspace, int last)
153{
154 int offset;
155 struct pidmap *map, *end;
156
157 offset = (last + 1) & BITS_PER_PAGE_MASK;
158 map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE];
159 end = &pspace->pidmap[PIDMAP_ENTRIES];
160 for (; map < end; map++, offset = 0) {
161 if (unlikely(!map->page))
162 continue;
163 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
164 if (offset < BITS_PER_PAGE)
165 return mk_pid(pspace, map, offset);
144 } 166 }
145 return -1; 167 return -1;
146} 168}
@@ -153,6 +175,7 @@ fastcall void put_pid(struct pid *pid)
153 atomic_dec_and_test(&pid->count)) 175 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid); 176 kmem_cache_free(pid_cachep, pid);
155} 177}
178EXPORT_SYMBOL_GPL(put_pid);
156 179
157static void delayed_put_pid(struct rcu_head *rhp) 180static void delayed_put_pid(struct rcu_head *rhp)
158{ 181{
@@ -169,7 +192,7 @@ fastcall void free_pid(struct pid *pid)
169 hlist_del_rcu(&pid->pid_chain); 192 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags); 193 spin_unlock_irqrestore(&pidmap_lock, flags);
171 194
172 free_pidmap(pid->nr); 195 free_pidmap(&init_pspace, pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid); 196 call_rcu(&pid->rcu, delayed_put_pid);
174} 197}
175 198
@@ -183,7 +206,7 @@ struct pid *alloc_pid(void)
183 if (!pid) 206 if (!pid)
184 goto out; 207 goto out;
185 208
186 nr = alloc_pidmap(); 209 nr = alloc_pidmap(&init_pspace);
187 if (nr < 0) 210 if (nr < 0)
188 goto out_free; 211 goto out_free;
189 212
@@ -217,6 +240,7 @@ struct pid * fastcall find_pid(int nr)
217 } 240 }
218 return NULL; 241 return NULL;
219} 242}
243EXPORT_SYMBOL_GPL(find_pid);
220 244
221int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) 245int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
222{ 246{
@@ -280,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr)
280 304
281EXPORT_SYMBOL(find_task_by_pid_type); 305EXPORT_SYMBOL(find_task_by_pid_type);
282 306
307struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
308{
309 struct pid *pid;
310 rcu_read_lock();
311 pid = get_pid(task->pids[type].pid);
312 rcu_read_unlock();
313 return pid;
314}
315
283struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) 316struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
284{ 317{
285 struct task_struct *result; 318 struct task_struct *result;
@@ -303,6 +336,26 @@ struct pid *find_get_pid(pid_t nr)
303} 336}
304 337
305/* 338/*
 339 * Used by proc to find the first pid that is greater than or equal to nr.
 340 *
 341 * If there is a pid at nr, this function is exactly the same as find_pid.
342 */
343struct pid *find_ge_pid(int nr)
344{
345 struct pid *pid;
346
347 do {
348 pid = find_pid(nr);
349 if (pid)
350 break;
351 nr = next_pidmap(&init_pspace, nr);
352 } while (nr > 0);
353
354 return pid;
355}
356EXPORT_SYMBOL_GPL(find_get_pid);
357
358/*
306 * The pid hash table is scaled according to the amount of memory in the 359 * The pid hash table is scaled according to the amount of memory in the
307 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 360 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
308 * more. 361 * more.
@@ -329,10 +382,10 @@ void __init pidhash_init(void)
329 382
330void __init pidmap_init(void) 383void __init pidmap_init(void)
331{ 384{
332 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 385 init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
333 /* Reserve PID 0. We never call free_pidmap(0) */ 386 /* Reserve PID 0. We never call free_pidmap(0) */
334 set_bit(0, pidmap_array->page); 387 set_bit(0, init_pspace.pidmap[0].page);
335 atomic_dec(&pidmap_array->nr_free); 388 atomic_dec(&init_pspace.pidmap[0].nr_free);
336 389
337 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 390 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
338 __alignof__(struct pid), 391 __alignof__(struct pid),
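
The next_pidmap()/find_ge_pid() pair added above gives /proc a way to resume a pid scan from an arbitrary number instead of re-walking the task list. A minimal sketch of such a scan, assuming struct pid still exposes ->nr as the rest of this patch does (illustrative only, locking omitted):

	struct pid *pid;
	int nr = 1;

	/* visit every allocated pid in ascending order */
	while ((pid = find_ge_pid(nr)) != NULL) {
		/* ... inspect or report this pid ... */
		nr = pid->nr + 1;	/* resume the scan just past it */
	}
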
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1b84313cba..99f9b7d177 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info)
906 memset(info, 0, sizeof(struct swsusp_info)); 906 memset(info, 0, sizeof(struct swsusp_info));
907 info->version_code = LINUX_VERSION_CODE; 907 info->version_code = LINUX_VERSION_CODE;
908 info->num_physpages = num_physpages; 908 info->num_physpages = num_physpages;
909 memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); 909 memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
910 info->cpus = num_online_cpus(); 910 info->cpus = num_online_cpus();
911 info->image_pages = nr_copy_pages; 911 info->image_pages = nr_copy_pages;
912 info->pages = nr_copy_pages + nr_meta_pages + 1; 912 info->pages = nr_copy_pages + nr_meta_pages + 1;
@@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info)
1050 reason = "kernel version"; 1050 reason = "kernel version";
1051 if (info->num_physpages != num_physpages) 1051 if (info->num_physpages != num_physpages)
1052 reason = "memory size"; 1052 reason = "memory size";
1053 if (strcmp(info->uts.sysname,system_utsname.sysname)) 1053 if (strcmp(info->uts.sysname,init_utsname()->sysname))
1054 reason = "system type"; 1054 reason = "system type";
1055 if (strcmp(info->uts.release,system_utsname.release)) 1055 if (strcmp(info->uts.release,init_utsname()->release))
1056 reason = "kernel release"; 1056 reason = "kernel release";
1057 if (strcmp(info->uts.version,system_utsname.version)) 1057 if (strcmp(info->uts.version,init_utsname()->version))
1058 reason = "version"; 1058 reason = "version";
1059 if (strcmp(info->uts.machine,system_utsname.machine)) 1059 if (strcmp(info->uts.machine,init_utsname()->machine))
1060 reason = "machine"; 1060 reason = "machine";
1061 if (reason) { 1061 if (reason) {
1062 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); 1062 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
diff --git a/kernel/resource.c b/kernel/resource.c
index 9db38a1a75..6de60c1214 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -193,6 +193,13 @@ static int __release_resource(struct resource *old)
193 return -EINVAL; 193 return -EINVAL;
194} 194}
195 195
196/**
197 * request_resource - request and reserve an I/O or memory resource
198 * @root: root resource descriptor
199 * @new: resource descriptor desired by caller
200 *
201 * Returns 0 for success, negative error code on error.
202 */
196int request_resource(struct resource *root, struct resource *new) 203int request_resource(struct resource *root, struct resource *new)
197{ 204{
198 struct resource *conflict; 205 struct resource *conflict;
@@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new)
205 212
206EXPORT_SYMBOL(request_resource); 213EXPORT_SYMBOL(request_resource);
207 214
215/**
216 * ____request_resource - reserve a resource, with resource conflict returned
217 * @root: root resource descriptor
218 * @new: resource descriptor desired by caller
219 *
220 * Returns:
221 * On success, NULL is returned.
222 * On error, a pointer to the conflicting resource is returned.
223 */
208struct resource *____request_resource(struct resource *root, struct resource *new) 224struct resource *____request_resource(struct resource *root, struct resource *new)
209{ 225{
210 struct resource *conflict; 226 struct resource *conflict;
@@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne
217 233
218EXPORT_SYMBOL(____request_resource); 234EXPORT_SYMBOL(____request_resource);
219 235
236/**
237 * release_resource - release a previously reserved resource
238 * @old: resource pointer
239 */
220int release_resource(struct resource *old) 240int release_resource(struct resource *old)
221{ 241{
222 int retval; 242 int retval;
@@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new,
315 return -EBUSY; 335 return -EBUSY;
316} 336}
317 337
318/* 338/**
319 * Allocate empty slot in the resource tree given range and alignment. 339 * allocate_resource - allocate empty slot in the resource tree given range & alignment
340 * @root: root resource descriptor
341 * @new: resource descriptor desired by caller
342 * @size: requested resource region size
 343 * @min: minimum boundary to allocate within (lowest acceptable start address)
 344 * @max: maximum boundary to allocate within (highest acceptable end address)
345 * @align: alignment requested, in bytes
346 * @alignf: alignment function, optional, called if not NULL
347 * @alignf_data: arbitrary data to pass to the @alignf function
320 */ 348 */
321int allocate_resource(struct resource *root, struct resource *new, 349int allocate_resource(struct resource *root, struct resource *new,
322 resource_size_t size, resource_size_t min, 350 resource_size_t size, resource_size_t min,
@@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new)
407 return result; 435 return result;
408} 436}
409 437
410/* 438/**
439 * adjust_resource - modify a resource's start and size
440 * @res: resource to modify
441 * @start: new start value
442 * @size: new size
443 *
411 * Given an existing resource, change its start and size to match the 444 * Given an existing resource, change its start and size to match the
412 * arguments. Returns -EBUSY if it can't fit. Existing children of 445 * arguments. Returns 0 on success, -EBUSY if it can't fit.
413 * the resource are assumed to be immutable. 446 * Existing children of the resource are assumed to be immutable.
414 */ 447 */
415int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) 448int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
416{ 449{
@@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource);
456 * Note how this, unlike the above, knows about 489 * Note how this, unlike the above, knows about
457 * the IO flag meanings (busy etc). 490 * the IO flag meanings (busy etc).
458 * 491 *
459 * Request-region creates a new busy region. 492 * request_region creates a new busy region.
460 * 493 *
461 * Check-region returns non-zero if the area is already busy 494 * check_region returns non-zero if the area is already busy.
462 * 495 *
463 * Release-region releases a matching busy region. 496 * release_region releases a matching busy region.
497 */
498
499/**
500 * __request_region - create a new busy resource region
501 * @parent: parent resource descriptor
502 * @start: resource start address
503 * @n: resource region size
504 * @name: reserving caller's ID string
464 */ 505 */
465struct resource * __request_region(struct resource *parent, 506struct resource * __request_region(struct resource *parent,
466 resource_size_t start, resource_size_t n, 507 resource_size_t start, resource_size_t n,
@@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent,
497 } 538 }
498 return res; 539 return res;
499} 540}
500
501EXPORT_SYMBOL(__request_region); 541EXPORT_SYMBOL(__request_region);
502 542
543/**
544 * __check_region - check if a resource region is busy or free
545 * @parent: parent resource descriptor
546 * @start: resource start address
547 * @n: resource region size
548 *
549 * Returns 0 if the region is free at the moment it is checked,
550 * returns %-EBUSY if the region is busy.
551 *
552 * NOTE:
553 * This function is deprecated because its use is racy.
554 * Even if it returns 0, a subsequent call to request_region()
555 * may fail because another driver etc. just allocated the region.
556 * Do NOT use it. It will be removed from the kernel.
557 */
503int __check_region(struct resource *parent, resource_size_t start, 558int __check_region(struct resource *parent, resource_size_t start,
504 resource_size_t n) 559 resource_size_t n)
505{ 560{
@@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start,
513 kfree(res); 568 kfree(res);
514 return 0; 569 return 0;
515} 570}
516
517EXPORT_SYMBOL(__check_region); 571EXPORT_SYMBOL(__check_region);
518 572
573/**
574 * __release_region - release a previously reserved resource region
575 * @parent: parent resource descriptor
576 * @start: resource start address
577 * @n: resource region size
578 *
579 * The described resource region must match a currently busy region.
580 */
519void __release_region(struct resource *parent, resource_size_t start, 581void __release_region(struct resource *parent, resource_size_t start,
520 resource_size_t n) 582 resource_size_t n)
521{ 583{
@@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start,
553 "<%016llx-%016llx>\n", (unsigned long long)start, 615 "<%016llx-%016llx>\n", (unsigned long long)start,
554 (unsigned long long)end); 616 (unsigned long long)end);
555} 617}
556
557EXPORT_SYMBOL(__release_region); 618EXPORT_SYMBOL(__release_region);
558 619
559/* 620/*
diff --git a/kernel/sched.c b/kernel/sched.c
index 74f169ac07..53608a59d6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,7 +49,7 @@
49#include <linux/seq_file.h> 49#include <linux/seq_file.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/times.h> 51#include <linux/times.h>
52#include <linux/acct.h> 52#include <linux/tsacct_kern.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h> 54#include <linux/delayacct.h>
55#include <asm/tlb.h> 55#include <asm/tlb.h>
@@ -1232,7 +1232,7 @@ nextgroup:
1232} 1232}
1233 1233
1234/* 1234/*
1235 * find_idlest_queue - find the idlest runqueue among the cpus in group. 1235 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1236 */ 1236 */
1237static int 1237static int
1238find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1238find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag)
1286 while (sd) { 1286 while (sd) {
1287 cpumask_t span; 1287 cpumask_t span;
1288 struct sched_group *group; 1288 struct sched_group *group;
1289 int new_cpu; 1289 int new_cpu, weight;
1290 int weight; 1290
1291 if (!(sd->flags & flag)) {
1292 sd = sd->child;
1293 continue;
1294 }
1291 1295
1292 span = sd->span; 1296 span = sd->span;
1293 group = find_idlest_group(sd, t, cpu); 1297 group = find_idlest_group(sd, t, cpu);
1294 if (!group) 1298 if (!group) {
1295 goto nextlevel; 1299 sd = sd->child;
1300 continue;
1301 }
1296 1302
1297 new_cpu = find_idlest_cpu(group, t, cpu); 1303 new_cpu = find_idlest_cpu(group, t, cpu);
1298 if (new_cpu == -1 || new_cpu == cpu) 1304 if (new_cpu == -1 || new_cpu == cpu) {
1299 goto nextlevel; 1305 /* Now try balancing at a lower domain level of cpu */
1306 sd = sd->child;
1307 continue;
1308 }
1300 1309
1301 /* Now try balancing at a lower domain level */ 1310 /* Now try balancing at a lower domain level of new_cpu */
1302 cpu = new_cpu; 1311 cpu = new_cpu;
1303nextlevel:
1304 sd = NULL; 1312 sd = NULL;
1305 weight = cpus_weight(span); 1313 weight = cpus_weight(span);
1306 for_each_domain(cpu, tmp) { 1314 for_each_domain(cpu, tmp) {
@@ -2533,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2533 struct rq *busiest; 2541 struct rq *busiest;
2534 cpumask_t cpus = CPU_MASK_ALL; 2542 cpumask_t cpus = CPU_MASK_ALL;
2535 2543
2544 /*
2545 * When power savings policy is enabled for the parent domain, idle
2546 * sibling can pick up load irrespective of busy siblings. In this case,
2547 * let the state of idle sibling percolate up as IDLE, instead of
2548 * portraying it as NOT_IDLE.
2549 */
2536 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2550 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2537 !sched_smt_power_savings) 2551 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2538 sd_idle = 1; 2552 sd_idle = 1;
2539 2553
2540 schedstat_inc(sd, lb_cnt[idle]); 2554 schedstat_inc(sd, lb_cnt[idle]);
@@ -2630,7 +2644,7 @@ redo:
2630 } 2644 }
2631 2645
2632 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2646 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2633 !sched_smt_power_savings) 2647 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2634 return -1; 2648 return -1;
2635 return nr_moved; 2649 return nr_moved;
2636 2650
@@ -2646,7 +2660,7 @@ out_one_pinned:
2646 sd->balance_interval *= 2; 2660 sd->balance_interval *= 2;
2647 2661
2648 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2662 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2649 !sched_smt_power_savings) 2663 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2650 return -1; 2664 return -1;
2651 return 0; 2665 return 0;
2652} 2666}
@@ -2668,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2668 int sd_idle = 0; 2682 int sd_idle = 0;
2669 cpumask_t cpus = CPU_MASK_ALL; 2683 cpumask_t cpus = CPU_MASK_ALL;
2670 2684
2671 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) 2685 /*
2686 * When power savings policy is enabled for the parent domain, idle
2687 * sibling can pick up load irrespective of busy siblings. In this case,
2688 * let the state of idle sibling percolate up as IDLE, instead of
2689 * portraying it as NOT_IDLE.
2690 */
2691 if (sd->flags & SD_SHARE_CPUPOWER &&
2692 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2672 sd_idle = 1; 2693 sd_idle = 1;
2673 2694
2674 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2695 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2709,7 +2730,8 @@ redo:
2709 2730
2710 if (!nr_moved) { 2731 if (!nr_moved) {
2711 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2732 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2712 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2733 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2734 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2713 return -1; 2735 return -1;
2714 } else 2736 } else
2715 sd->nr_balance_failed = 0; 2737 sd->nr_balance_failed = 0;
@@ -2719,7 +2741,7 @@ redo:
2719out_balanced: 2741out_balanced:
2720 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2742 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2743 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722 !sched_smt_power_savings) 2744 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2723 return -1; 2745 return -1;
2724 sd->nr_balance_failed = 0; 2746 sd->nr_balance_failed = 0;
2725 2747
@@ -4384,7 +4406,10 @@ EXPORT_SYMBOL(cpu_present_map);
4384 4406
4385#ifndef CONFIG_SMP 4407#ifndef CONFIG_SMP
4386cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 4408cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4409EXPORT_SYMBOL(cpu_online_map);
4410
4387cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 4411cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4412EXPORT_SYMBOL(cpu_possible_map);
4388#endif 4413#endif
4389 4414
4390long sched_getaffinity(pid_t pid, cpumask_t *mask) 4415long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4814,7 +4839,7 @@ void show_state(void)
4814 * NOTE: this function does not set the idle thread's NEED_RESCHED 4839 * NOTE: this function does not set the idle thread's NEED_RESCHED
4815 * flag, to make booting more robust. 4840 * flag, to make booting more robust.
4816 */ 4841 */
4817void __devinit init_idle(struct task_struct *idle, int cpu) 4842void __cpuinit init_idle(struct task_struct *idle, int cpu)
4818{ 4843{
4819 struct rq *rq = cpu_rq(cpu); 4844 struct rq *rq = cpu_rq(cpu);
4820 unsigned long flags; 4845 unsigned long flags;
@@ -5389,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
5389 if (sd->flags & (SD_LOAD_BALANCE | 5414 if (sd->flags & (SD_LOAD_BALANCE |
5390 SD_BALANCE_NEWIDLE | 5415 SD_BALANCE_NEWIDLE |
5391 SD_BALANCE_FORK | 5416 SD_BALANCE_FORK |
5392 SD_BALANCE_EXEC)) { 5417 SD_BALANCE_EXEC |
5418 SD_SHARE_CPUPOWER |
5419 SD_SHARE_PKG_RESOURCES)) {
5393 if (sd->groups != sd->groups->next) 5420 if (sd->groups != sd->groups->next)
5394 return 0; 5421 return 0;
5395 } 5422 }
@@ -5423,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5423 pflags &= ~(SD_LOAD_BALANCE | 5450 pflags &= ~(SD_LOAD_BALANCE |
5424 SD_BALANCE_NEWIDLE | 5451 SD_BALANCE_NEWIDLE |
5425 SD_BALANCE_FORK | 5452 SD_BALANCE_FORK |
5426 SD_BALANCE_EXEC); 5453 SD_BALANCE_EXEC |
5454 SD_SHARE_CPUPOWER |
5455 SD_SHARE_PKG_RESOURCES);
5427 } 5456 }
5428 if (~cflags & pflags) 5457 if (~cflags & pflags)
5429 return 0; 5458 return 0;
@@ -5445,12 +5474,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5445 struct sched_domain *parent = tmp->parent; 5474 struct sched_domain *parent = tmp->parent;
5446 if (!parent) 5475 if (!parent)
5447 break; 5476 break;
5448 if (sd_parent_degenerate(tmp, parent)) 5477 if (sd_parent_degenerate(tmp, parent)) {
5449 tmp->parent = parent->parent; 5478 tmp->parent = parent->parent;
5479 if (parent->parent)
5480 parent->parent->child = tmp;
5481 }
5450 } 5482 }
5451 5483
5452 if (sd && sd_degenerate(sd)) 5484 if (sd && sd_degenerate(sd)) {
5453 sd = sd->parent; 5485 sd = sd->parent;
5486 if (sd)
5487 sd->child = NULL;
5488 }
5454 5489
5455 sched_domain_debug(sd, cpu); 5490 sched_domain_debug(sd, cpu);
5456 5491
@@ -5458,7 +5493,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5458} 5493}
5459 5494
5460/* cpus with isolated domains */ 5495/* cpus with isolated domains */
5461static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 5496static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
5462 5497
5463/* Setup the mask of cpus configured for isolated domains */ 5498/* Setup the mask of cpus configured for isolated domains */
5464static int __init isolated_cpu_setup(char *str) 5499static int __init isolated_cpu_setup(char *str)
@@ -5486,15 +5521,17 @@ __setup ("isolcpus=", isolated_cpu_setup);
5486 * covered by the given span, and will set each group's ->cpumask correctly, 5521 * covered by the given span, and will set each group's ->cpumask correctly,
5487 * and ->cpu_power to 0. 5522 * and ->cpu_power to 0.
5488 */ 5523 */
5489static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5524static void
5490 int (*group_fn)(int cpu)) 5525init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5526 const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map))
5491{ 5528{
5492 struct sched_group *first = NULL, *last = NULL; 5529 struct sched_group *first = NULL, *last = NULL;
5493 cpumask_t covered = CPU_MASK_NONE; 5530 cpumask_t covered = CPU_MASK_NONE;
5494 int i; 5531 int i;
5495 5532
5496 for_each_cpu_mask(i, span) { 5533 for_each_cpu_mask(i, span) {
5497 int group = group_fn(i); 5534 int group = group_fn(i, cpu_map);
5498 struct sched_group *sg = &groups[group]; 5535 struct sched_group *sg = &groups[group];
5499 int j; 5536 int j;
5500 5537
@@ -5505,7 +5542,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5505 sg->cpu_power = 0; 5542 sg->cpu_power = 0;
5506 5543
5507 for_each_cpu_mask(j, span) { 5544 for_each_cpu_mask(j, span) {
5508 if (group_fn(j) != group) 5545 if (group_fn(j, cpu_map) != group)
5509 continue; 5546 continue;
5510 5547
5511 cpu_set(j, covered); 5548 cpu_set(j, covered);
@@ -5972,13 +6009,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
5972#endif 6009#endif
5973 ); 6010 );
5974 if (system_state == SYSTEM_BOOTING) { 6011 if (system_state == SYSTEM_BOOTING) {
5975 printk("migration_cost="); 6012 if (num_online_cpus() > 1) {
5976 for (distance = 0; distance <= max_distance; distance++) { 6013 printk("migration_cost=");
5977 if (distance) 6014 for (distance = 0; distance <= max_distance; distance++) {
5978 printk(","); 6015 if (distance)
5979 printk("%ld", (long)migration_cost[distance] / 1000); 6016 printk(",");
6017 printk("%ld", (long)migration_cost[distance] / 1000);
6018 }
6019 printk("\n");
5980 } 6020 }
5981 printk("\n");
5982 } 6021 }
5983 j1 = jiffies; 6022 j1 = jiffies;
5984 if (migration_debug) 6023 if (migration_debug)
@@ -6081,7 +6120,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6081static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6120static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6082static struct sched_group sched_group_cpus[NR_CPUS]; 6121static struct sched_group sched_group_cpus[NR_CPUS];
6083 6122
6084static int cpu_to_cpu_group(int cpu) 6123static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6085{ 6124{
6086 return cpu; 6125 return cpu;
6087} 6126}
@@ -6092,31 +6131,36 @@ static int cpu_to_cpu_group(int cpu)
6092 */ 6131 */
6093#ifdef CONFIG_SCHED_MC 6132#ifdef CONFIG_SCHED_MC
6094static DEFINE_PER_CPU(struct sched_domain, core_domains); 6133static DEFINE_PER_CPU(struct sched_domain, core_domains);
6095static struct sched_group *sched_group_core_bycpu[NR_CPUS]; 6134static struct sched_group sched_group_core[NR_CPUS];
6096#endif 6135#endif
6097 6136
6098#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6137#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6099static int cpu_to_core_group(int cpu) 6138static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
6100{ 6139{
6101 return first_cpu(cpu_sibling_map[cpu]); 6140 cpumask_t mask = cpu_sibling_map[cpu];
6141 cpus_and(mask, mask, *cpu_map);
6142 return first_cpu(mask);
6102} 6143}
6103#elif defined(CONFIG_SCHED_MC) 6144#elif defined(CONFIG_SCHED_MC)
6104static int cpu_to_core_group(int cpu) 6145static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
6105{ 6146{
6106 return cpu; 6147 return cpu;
6107} 6148}
6108#endif 6149#endif
6109 6150
6110static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6151static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6111static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; 6152static struct sched_group sched_group_phys[NR_CPUS];
6112 6153
6113static int cpu_to_phys_group(int cpu) 6154static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
6114{ 6155{
6115#ifdef CONFIG_SCHED_MC 6156#ifdef CONFIG_SCHED_MC
6116 cpumask_t mask = cpu_coregroup_map(cpu); 6157 cpumask_t mask = cpu_coregroup_map(cpu);
6158 cpus_and(mask, mask, *cpu_map);
6117 return first_cpu(mask); 6159 return first_cpu(mask);
6118#elif defined(CONFIG_SCHED_SMT) 6160#elif defined(CONFIG_SCHED_SMT)
6119 return first_cpu(cpu_sibling_map[cpu]); 6161 cpumask_t mask = cpu_sibling_map[cpu];
6162 cpus_and(mask, mask, *cpu_map);
6163 return first_cpu(mask);
6120#else 6164#else
6121 return cpu; 6165 return cpu;
6122#endif 6166#endif
@@ -6134,7 +6178,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6134static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6178static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6135static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6179static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
6136 6180
6137static int cpu_to_allnodes_group(int cpu) 6181static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
6138{ 6182{
6139 return cpu_to_node(cpu); 6183 return cpu_to_node(cpu);
6140} 6184}
@@ -6166,12 +6210,11 @@ next_sg:
6166} 6210}
6167#endif 6211#endif
6168 6212
6213#ifdef CONFIG_NUMA
6169/* Free memory allocated for various sched_group structures */ 6214/* Free memory allocated for various sched_group structures */
6170static void free_sched_groups(const cpumask_t *cpu_map) 6215static void free_sched_groups(const cpumask_t *cpu_map)
6171{ 6216{
6172 int cpu; 6217 int cpu, i;
6173#ifdef CONFIG_NUMA
6174 int i;
6175 6218
6176 for_each_cpu_mask(cpu, *cpu_map) { 6219 for_each_cpu_mask(cpu, *cpu_map) {
6177 struct sched_group *sched_group_allnodes 6220 struct sched_group *sched_group_allnodes
@@ -6208,19 +6251,63 @@ next_sg:
6208 kfree(sched_group_nodes); 6251 kfree(sched_group_nodes);
6209 sched_group_nodes_bycpu[cpu] = NULL; 6252 sched_group_nodes_bycpu[cpu] = NULL;
6210 } 6253 }
6254}
6255#else
6256static void free_sched_groups(const cpumask_t *cpu_map)
6257{
6258}
6211#endif 6259#endif
6212 for_each_cpu_mask(cpu, *cpu_map) { 6260
6213 if (sched_group_phys_bycpu[cpu]) { 6261/*
6214 kfree(sched_group_phys_bycpu[cpu]); 6262 * Initialize sched groups cpu_power.
6215 sched_group_phys_bycpu[cpu] = NULL; 6263 *
6216 } 6264 * cpu_power indicates the capacity of sched group, which is used while
6217#ifdef CONFIG_SCHED_MC 6265 * distributing the load between different sched groups in a sched domain.
6218 if (sched_group_core_bycpu[cpu]) { 6266 * Typically cpu_power for all the groups in a sched domain will be same unless
6219 kfree(sched_group_core_bycpu[cpu]); 6267 * there are asymmetries in the topology. If there are asymmetries, group
6220 sched_group_core_bycpu[cpu] = NULL; 6268 * having more cpu_power will pickup more load compared to the group having
6221 } 6269 * less cpu_power.
6222#endif 6270 *
6271 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
6272 * the maximum number of tasks a group can handle in the presence of other idle
6273 * or lightly loaded groups in the same sched domain.
6274 */
6275static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6276{
6277 struct sched_domain *child;
6278 struct sched_group *group;
6279
6280 WARN_ON(!sd || !sd->groups);
6281
6282 if (cpu != first_cpu(sd->groups->cpumask))
6283 return;
6284
6285 child = sd->child;
6286
6287 /*
6288 * For perf policy, if the groups in child domain share resources
6289 * (for example cores sharing some portions of the cache hierarchy
6290 * or SMT), then set this domain groups cpu_power such that each group
6291 * can handle only one task, when there are other idle groups in the
6292 * same sched domain.
6293 */
6294 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6295 (child->flags &
6296 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6297 sd->groups->cpu_power = SCHED_LOAD_SCALE;
6298 return;
6223 } 6299 }
6300
6301 sd->groups->cpu_power = 0;
6302
6303 /*
 6304 * add cpu_power of each child group to this group's cpu_power
6305 */
6306 group = child->groups;
6307 do {
6308 sd->groups->cpu_power += group->cpu_power;
6309 group = group->next;
6310 } while (group != child->groups);
6224} 6311}
6225 6312
6226/* 6313/*
@@ -6230,10 +6317,7 @@ next_sg:
6230static int build_sched_domains(const cpumask_t *cpu_map) 6317static int build_sched_domains(const cpumask_t *cpu_map)
6231{ 6318{
6232 int i; 6319 int i;
6233 struct sched_group *sched_group_phys = NULL; 6320 struct sched_domain *sd;
6234#ifdef CONFIG_SCHED_MC
6235 struct sched_group *sched_group_core = NULL;
6236#endif
6237#ifdef CONFIG_NUMA 6321#ifdef CONFIG_NUMA
6238 struct sched_group **sched_group_nodes = NULL; 6322 struct sched_group **sched_group_nodes = NULL;
6239 struct sched_group *sched_group_allnodes = NULL; 6323 struct sched_group *sched_group_allnodes = NULL;
@@ -6265,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6265 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6349 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6266 if (!sched_group_allnodes) { 6350 if (!sched_group_allnodes) {
6267 sched_group_allnodes 6351 sched_group_allnodes
6268 = kmalloc(sizeof(struct sched_group) 6352 = kmalloc_node(sizeof(struct sched_group)
6269 * MAX_NUMNODES, 6353 * MAX_NUMNODES,
6270 GFP_KERNEL); 6354 GFP_KERNEL,
6355 cpu_to_node(i));
6271 if (!sched_group_allnodes) { 6356 if (!sched_group_allnodes) {
6272 printk(KERN_WARNING 6357 printk(KERN_WARNING
6273 "Can not alloc allnodes sched group\n"); 6358 "Can not alloc allnodes sched group\n");
@@ -6279,7 +6364,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6279 sd = &per_cpu(allnodes_domains, i); 6364 sd = &per_cpu(allnodes_domains, i);
6280 *sd = SD_ALLNODES_INIT; 6365 *sd = SD_ALLNODES_INIT;
6281 sd->span = *cpu_map; 6366 sd->span = *cpu_map;
6282 group = cpu_to_allnodes_group(i); 6367 group = cpu_to_allnodes_group(i, cpu_map);
6283 sd->groups = &sched_group_allnodes[group]; 6368 sd->groups = &sched_group_allnodes[group];
6284 p = sd; 6369 p = sd;
6285 } else 6370 } else
@@ -6289,60 +6374,42 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6289 *sd = SD_NODE_INIT; 6374 *sd = SD_NODE_INIT;
6290 sd->span = sched_domain_node_span(cpu_to_node(i)); 6375 sd->span = sched_domain_node_span(cpu_to_node(i));
6291 sd->parent = p; 6376 sd->parent = p;
6377 if (p)
6378 p->child = sd;
6292 cpus_and(sd->span, sd->span, *cpu_map); 6379 cpus_and(sd->span, sd->span, *cpu_map);
6293#endif 6380#endif
6294 6381
6295 if (!sched_group_phys) {
6296 sched_group_phys
6297 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6298 GFP_KERNEL);
6299 if (!sched_group_phys) {
6300 printk (KERN_WARNING "Can not alloc phys sched"
6301 "group\n");
6302 goto error;
6303 }
6304 sched_group_phys_bycpu[i] = sched_group_phys;
6305 }
6306
6307 p = sd; 6382 p = sd;
6308 sd = &per_cpu(phys_domains, i); 6383 sd = &per_cpu(phys_domains, i);
6309 group = cpu_to_phys_group(i); 6384 group = cpu_to_phys_group(i, cpu_map);
6310 *sd = SD_CPU_INIT; 6385 *sd = SD_CPU_INIT;
6311 sd->span = nodemask; 6386 sd->span = nodemask;
6312 sd->parent = p; 6387 sd->parent = p;
6388 if (p)
6389 p->child = sd;
6313 sd->groups = &sched_group_phys[group]; 6390 sd->groups = &sched_group_phys[group];
6314 6391
6315#ifdef CONFIG_SCHED_MC 6392#ifdef CONFIG_SCHED_MC
6316 if (!sched_group_core) {
6317 sched_group_core
6318 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6319 GFP_KERNEL);
6320 if (!sched_group_core) {
6321 printk (KERN_WARNING "Can not alloc core sched"
6322 "group\n");
6323 goto error;
6324 }
6325 sched_group_core_bycpu[i] = sched_group_core;
6326 }
6327
6328 p = sd; 6393 p = sd;
6329 sd = &per_cpu(core_domains, i); 6394 sd = &per_cpu(core_domains, i);
6330 group = cpu_to_core_group(i); 6395 group = cpu_to_core_group(i, cpu_map);
6331 *sd = SD_MC_INIT; 6396 *sd = SD_MC_INIT;
6332 sd->span = cpu_coregroup_map(i); 6397 sd->span = cpu_coregroup_map(i);
6333 cpus_and(sd->span, sd->span, *cpu_map); 6398 cpus_and(sd->span, sd->span, *cpu_map);
6334 sd->parent = p; 6399 sd->parent = p;
6400 p->child = sd;
6335 sd->groups = &sched_group_core[group]; 6401 sd->groups = &sched_group_core[group];
6336#endif 6402#endif
6337 6403
6338#ifdef CONFIG_SCHED_SMT 6404#ifdef CONFIG_SCHED_SMT
6339 p = sd; 6405 p = sd;
6340 sd = &per_cpu(cpu_domains, i); 6406 sd = &per_cpu(cpu_domains, i);
6341 group = cpu_to_cpu_group(i); 6407 group = cpu_to_cpu_group(i, cpu_map);
6342 *sd = SD_SIBLING_INIT; 6408 *sd = SD_SIBLING_INIT;
6343 sd->span = cpu_sibling_map[i]; 6409 sd->span = cpu_sibling_map[i];
6344 cpus_and(sd->span, sd->span, *cpu_map); 6410 cpus_and(sd->span, sd->span, *cpu_map);
6345 sd->parent = p; 6411 sd->parent = p;
6412 p->child = sd;
6346 sd->groups = &sched_group_cpus[group]; 6413 sd->groups = &sched_group_cpus[group];
6347#endif 6414#endif
6348 } 6415 }
@@ -6356,7 +6423,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6356 continue; 6423 continue;
6357 6424
6358 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6425 init_sched_build_groups(sched_group_cpus, this_sibling_map,
6359 &cpu_to_cpu_group); 6426 cpu_map, &cpu_to_cpu_group);
6360 } 6427 }
6361#endif 6428#endif
6362 6429
@@ -6368,7 +6435,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6368 if (i != first_cpu(this_core_map)) 6435 if (i != first_cpu(this_core_map))
6369 continue; 6436 continue;
6370 init_sched_build_groups(sched_group_core, this_core_map, 6437 init_sched_build_groups(sched_group_core, this_core_map,
6371 &cpu_to_core_group); 6438 cpu_map, &cpu_to_core_group);
6372 } 6439 }
6373#endif 6440#endif
6374 6441
@@ -6382,14 +6449,14 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6382 continue; 6449 continue;
6383 6450
6384 init_sched_build_groups(sched_group_phys, nodemask, 6451 init_sched_build_groups(sched_group_phys, nodemask,
6385 &cpu_to_phys_group); 6452 cpu_map, &cpu_to_phys_group);
6386 } 6453 }
6387 6454
6388#ifdef CONFIG_NUMA 6455#ifdef CONFIG_NUMA
6389 /* Set up node groups */ 6456 /* Set up node groups */
6390 if (sched_group_allnodes) 6457 if (sched_group_allnodes)
6391 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6458 init_sched_build_groups(sched_group_allnodes, *cpu_map,
6392 &cpu_to_allnodes_group); 6459 cpu_map, &cpu_to_allnodes_group);
6393 6460
6394 for (i = 0; i < MAX_NUMNODES; i++) { 6461 for (i = 0; i < MAX_NUMNODES; i++) {
6395 /* Set up node groups */ 6462 /* Set up node groups */
@@ -6461,72 +6528,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6461 /* Calculate CPU power for physical packages and nodes */ 6528 /* Calculate CPU power for physical packages and nodes */
6462#ifdef CONFIG_SCHED_SMT 6529#ifdef CONFIG_SCHED_SMT
6463 for_each_cpu_mask(i, *cpu_map) { 6530 for_each_cpu_mask(i, *cpu_map) {
6464 struct sched_domain *sd;
6465 sd = &per_cpu(cpu_domains, i); 6531 sd = &per_cpu(cpu_domains, i);
6466 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6532 init_sched_groups_power(i, sd);
6467 } 6533 }
6468#endif 6534#endif
6469#ifdef CONFIG_SCHED_MC 6535#ifdef CONFIG_SCHED_MC
6470 for_each_cpu_mask(i, *cpu_map) { 6536 for_each_cpu_mask(i, *cpu_map) {
6471 int power;
6472 struct sched_domain *sd;
6473 sd = &per_cpu(core_domains, i); 6537 sd = &per_cpu(core_domains, i);
6474 if (sched_smt_power_savings) 6538 init_sched_groups_power(i, sd);
6475 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6476 else
6477 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
6478 * SCHED_LOAD_SCALE / 10;
6479 sd->groups->cpu_power = power;
6480 } 6539 }
6481#endif 6540#endif
6482 6541
6483 for_each_cpu_mask(i, *cpu_map) { 6542 for_each_cpu_mask(i, *cpu_map) {
6484 struct sched_domain *sd;
6485#ifdef CONFIG_SCHED_MC
6486 sd = &per_cpu(phys_domains, i); 6543 sd = &per_cpu(phys_domains, i);
6487 if (i != first_cpu(sd->groups->cpumask)) 6544 init_sched_groups_power(i, sd);
6488 continue;
6489
6490 sd->groups->cpu_power = 0;
6491 if (sched_mc_power_savings || sched_smt_power_savings) {
6492 int j;
6493
6494 for_each_cpu_mask(j, sd->groups->cpumask) {
6495 struct sched_domain *sd1;
6496 sd1 = &per_cpu(core_domains, j);
6497 /*
6498 * for each core we will add once
6499 * to the group in physical domain
6500 */
6501 if (j != first_cpu(sd1->groups->cpumask))
6502 continue;
6503
6504 if (sched_smt_power_savings)
6505 sd->groups->cpu_power += sd1->groups->cpu_power;
6506 else
6507 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6508 }
6509 } else
6510 /*
6511 * This has to be < 2 * SCHED_LOAD_SCALE
6512 * Lets keep it SCHED_LOAD_SCALE, so that
6513 * while calculating NUMA group's cpu_power
6514 * we can simply do
6515 * numa_group->cpu_power += phys_group->cpu_power;
6516 *
6517 * See "only add power once for each physical pkg"
6518 * comment below
6519 */
6520 sd->groups->cpu_power = SCHED_LOAD_SCALE;
6521#else
6522 int power;
6523 sd = &per_cpu(phys_domains, i);
6524 if (sched_smt_power_savings)
6525 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6526 else
6527 power = SCHED_LOAD_SCALE;
6528 sd->groups->cpu_power = power;
6529#endif
6530 } 6545 }
6531 6546
6532#ifdef CONFIG_NUMA 6547#ifdef CONFIG_NUMA
@@ -6534,7 +6549,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6534 init_numa_sched_groups_power(sched_group_nodes[i]); 6549 init_numa_sched_groups_power(sched_group_nodes[i]);
6535 6550
6536 if (sched_group_allnodes) { 6551 if (sched_group_allnodes) {
6537 int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); 6552 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
6538 struct sched_group *sg = &sched_group_allnodes[group]; 6553 struct sched_group *sg = &sched_group_allnodes[group];
6539 6554
6540 init_numa_sched_groups_power(sg); 6555 init_numa_sched_groups_power(sg);
@@ -6560,9 +6575,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6560 6575
6561 return 0; 6576 return 0;
6562 6577
6578#ifdef CONFIG_NUMA
6563error: 6579error:
6564 free_sched_groups(cpu_map); 6580 free_sched_groups(cpu_map);
6565 return -ENOMEM; 6581 return -ENOMEM;
6582#endif
6566} 6583}
6567/* 6584/*
6568 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6585 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
@@ -6744,11 +6761,20 @@ static int update_sched_domains(struct notifier_block *nfb,
6744 6761
6745void __init sched_init_smp(void) 6762void __init sched_init_smp(void)
6746{ 6763{
6764 cpumask_t non_isolated_cpus;
6765
6747 lock_cpu_hotplug(); 6766 lock_cpu_hotplug();
6748 arch_init_sched_domains(&cpu_online_map); 6767 arch_init_sched_domains(&cpu_online_map);
6768 cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
6769 if (cpus_empty(non_isolated_cpus))
6770 cpu_set(smp_processor_id(), non_isolated_cpus);
6749 unlock_cpu_hotplug(); 6771 unlock_cpu_hotplug();
6750 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6772 /* XXX: Theoretical race here - CPU may be hotplugged now */
6751 hotcpu_notifier(update_sched_domains, 0); 6773 hotcpu_notifier(update_sched_domains, 0);
6774
6775 /* Move init over to a non-isolated CPU */
6776 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6777 BUG();
6752} 6778}
6753#else 6779#else
6754void __init sched_init_smp(void) 6780void __init sched_init_smp(void)
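
init_sched_groups_power() above collapses the old per-config cpu_power arithmetic into one bottom-up rule: under the default performance policy, a group whose child domain shares CPU power or package resources is pinned to SCHED_LOAD_SCALE, and only under the power-savings policy does a group sum the cpu_power of its child groups. A worked example with invented numbers, assuming SCHED_LOAD_SCALE is 128:

	/*
	 * One physical package, two cores, each core an SMT pair.
	 *
	 * Performance policy: the MC child domain has SD_SHARE_PKG_RESOURCES,
	 * so the package-level group is clamped:
	 *	cpu_power = SCHED_LOAD_SCALE = 128
	 *
	 * Power-savings policy (SD_POWERSAVINGS_BALANCE set on the parent):
	 * the package-level group sums its two core-level child groups:
	 *	cpu_power = 128 + 128 = 256
	 */
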
diff --git a/kernel/signal.c b/kernel/signal.c
index fb5da6d19f..7ed8d5304b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1055,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1055} 1055}
1056 1056
1057/* 1057/*
1058 * kill_pg_info() sends a signal to a process group: this is what the tty 1058 * kill_pgrp_info() sends a signal to a process group: this is what the tty
1059 * control characters do (^C, ^Z etc) 1059 * control characters do (^C, ^Z etc)
1060 */ 1060 */
1061 1061
1062int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) 1062int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1063{ 1063{
1064 struct task_struct *p = NULL; 1064 struct task_struct *p = NULL;
1065 int retval, success; 1065 int retval, success;
1066 1066
1067 if (pgrp <= 0)
1068 return -EINVAL;
1069
1070 success = 0; 1067 success = 0;
1071 retval = -ESRCH; 1068 retval = -ESRCH;
1072 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 1069 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
1073 int err = group_send_sig_info(sig, info, p); 1070 int err = group_send_sig_info(sig, info, p);
1074 success |= !err; 1071 success |= !err;
1075 retval = err; 1072 retval = err;
1076 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 1073 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
1077 return success ? 0 : retval; 1074 return success ? 0 : retval;
1078} 1075}
1079 1076
1077int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1078{
1079 int retval;
1080
1081 read_lock(&tasklist_lock);
1082 retval = __kill_pgrp_info(sig, info, pgrp);
1083 read_unlock(&tasklist_lock);
1084
1085 return retval;
1086}
1087
1088int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1089{
1090 if (pgrp <= 0)
1091 return -EINVAL;
1092
1093 return __kill_pgrp_info(sig, info, find_pid(pgrp));
1094}
1095
1080int 1096int
1081kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) 1097kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1082{ 1098{
@@ -1089,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1089 return retval; 1105 return retval;
1090} 1106}
1091 1107
1092int 1108int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1093kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1094{ 1109{
1095 int error; 1110 int error;
1096 int acquired_tasklist_lock = 0; 1111 int acquired_tasklist_lock = 0;
@@ -1101,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1101 read_lock(&tasklist_lock); 1116 read_lock(&tasklist_lock);
1102 acquired_tasklist_lock = 1; 1117 acquired_tasklist_lock = 1;
1103 } 1118 }
1104 p = find_task_by_pid(pid); 1119 p = pid_task(pid, PIDTYPE_PID);
1105 error = -ESRCH; 1120 error = -ESRCH;
1106 if (p) 1121 if (p)
1107 error = group_send_sig_info(sig, info, p); 1122 error = group_send_sig_info(sig, info, p);
@@ -1111,8 +1126,18 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1111 return error; 1126 return error;
1112} 1127}
1113 1128
1114/* like kill_proc_info(), but doesn't use uid/euid of "current" */ 1129int
1115int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, 1130kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1131{
1132 int error;
1133 rcu_read_lock();
1134 error = kill_pid_info(sig, info, find_pid(pid));
1135 rcu_read_unlock();
1136 return error;
1137}
1138
1139/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1140int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1116 uid_t uid, uid_t euid, u32 secid) 1141 uid_t uid, uid_t euid, u32 secid)
1117{ 1142{
1118 int ret = -EINVAL; 1143 int ret = -EINVAL;
@@ -1122,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1122 return ret; 1147 return ret;
1123 1148
1124 read_lock(&tasklist_lock); 1149 read_lock(&tasklist_lock);
1125 p = find_task_by_pid(pid); 1150 p = pid_task(pid, PIDTYPE_PID);
1126 if (!p) { 1151 if (!p) {
1127 ret = -ESRCH; 1152 ret = -ESRCH;
1128 goto out_unlock; 1153 goto out_unlock;
@@ -1146,7 +1171,7 @@ out_unlock:
1146 read_unlock(&tasklist_lock); 1171 read_unlock(&tasklist_lock);
1147 return ret; 1172 return ret;
1148} 1173}
1149EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); 1174EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1150 1175
1151/* 1176/*
1152 * kill_something_info() interprets pid in interesting ways just like kill(2). 1177 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1264,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p)
1264 return 0; 1289 return 0;
1265} 1290}
1266 1291
1292int kill_pgrp(struct pid *pid, int sig, int priv)
1293{
1294 return kill_pgrp_info(sig, __si_special(priv), pid);
1295}
1296EXPORT_SYMBOL(kill_pgrp);
1297
1298int kill_pid(struct pid *pid, int sig, int priv)
1299{
1300 return kill_pid_info(sig, __si_special(priv), pid);
1301}
1302EXPORT_SYMBOL(kill_pid);
1303
1267int 1304int
1268kill_pg(pid_t pgrp, int sig, int priv) 1305kill_pg(pid_t pgrp, int sig, int priv)
1269{ 1306{
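
kill_pgrp() and kill_pid() give callers a struct pid based entry point next to the older pid_t helpers, which pairs naturally with the reference-counted lookups added in kernel/pid.c above. A sketch of the intended usage (pgrp_nr is a made-up variable; error handling is minimal):

	struct pid *pgrp = find_get_pid(pgrp_nr);	/* takes a reference */

	if (pgrp) {
		kill_pgrp(pgrp, SIGHUP, 1);	/* priv=1: send as a kernel-internal signal */
		put_pid(pgrp);			/* drop the reference */
	}
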
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d48143eafb..476c374151 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -215,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \
215 if (!(lock)->break_lock) \ 215 if (!(lock)->break_lock) \
216 (lock)->break_lock = 1; \ 216 (lock)->break_lock = 1; \
217 while (!op##_can_lock(lock) && (lock)->break_lock) \ 217 while (!op##_can_lock(lock) && (lock)->break_lock) \
218 cpu_relax(); \ 218 _raw_##op##_relax(&lock->raw_lock); \
219 } \ 219 } \
220 (lock)->break_lock = 0; \ 220 (lock)->break_lock = 0; \
221} \ 221} \
@@ -237,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
237 if (!(lock)->break_lock) \ 237 if (!(lock)->break_lock) \
238 (lock)->break_lock = 1; \ 238 (lock)->break_lock = 1; \
239 while (!op##_can_lock(lock) && (lock)->break_lock) \ 239 while (!op##_can_lock(lock) && (lock)->break_lock) \
240 cpu_relax(); \ 240 _raw_##op##_relax(&lock->raw_lock); \
241 } \ 241 } \
242 (lock)->break_lock = 0; \ 242 (lock)->break_lock = 0; \
243 return flags; \ 243 return flags; \
diff --git a/kernel/sys.c b/kernel/sys.c
index 8647061c08..2314867ae3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid);
92 */ 92 */
93 93
94int C_A_D = 1; 94int C_A_D = 1;
95int cad_pid = 1; 95struct pid *cad_pid;
96EXPORT_SYMBOL(cad_pid);
96 97
97/* 98/*
98 * Notifier list for kernel code which wants to be called 99 * Notifier list for kernel code which wants to be called
@@ -221,7 +222,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
221 * of the last notifier function called. 222 * of the last notifier function called.
222 */ 223 */
223 224
224int atomic_notifier_call_chain(struct atomic_notifier_head *nh, 225int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
225 unsigned long val, void *v) 226 unsigned long val, void *v)
226{ 227{
227 int ret; 228 int ret;
@@ -607,11 +608,10 @@ static void kernel_restart_prepare(char *cmd)
607void kernel_restart(char *cmd) 608void kernel_restart(char *cmd)
608{ 609{
609 kernel_restart_prepare(cmd); 610 kernel_restart_prepare(cmd);
610 if (!cmd) { 611 if (!cmd)
611 printk(KERN_EMERG "Restarting system.\n"); 612 printk(KERN_EMERG "Restarting system.\n");
612 } else { 613 else
613 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 614 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
614 }
615 machine_restart(cmd); 615 machine_restart(cmd);
616} 616}
617EXPORT_SYMBOL_GPL(kernel_restart); 617EXPORT_SYMBOL_GPL(kernel_restart);
@@ -627,9 +627,8 @@ static void kernel_kexec(void)
627#ifdef CONFIG_KEXEC 627#ifdef CONFIG_KEXEC
628 struct kimage *image; 628 struct kimage *image;
629 image = xchg(&kexec_image, NULL); 629 image = xchg(&kexec_image, NULL);
630 if (!image) { 630 if (!image)
631 return; 631 return;
632 }
633 kernel_restart_prepare(NULL); 632 kernel_restart_prepare(NULL);
634 printk(KERN_EMERG "Starting new kernel\n"); 633 printk(KERN_EMERG "Starting new kernel\n");
635 machine_shutdown(); 634 machine_shutdown();
@@ -775,10 +774,9 @@ void ctrl_alt_del(void)
775 if (C_A_D) 774 if (C_A_D)
776 schedule_work(&cad_work); 775 schedule_work(&cad_work);
777 else 776 else
778 kill_proc(cad_pid, SIGINT, 1); 777 kill_cad_pid(SIGINT, 1);
779} 778}
780 779
781
782/* 780/*
783 * Unprivileged users may change the real gid to the effective gid 781 * Unprivileged users may change the real gid to the effective gid
784 * or vice versa. (BSD-style) 782 * or vice versa. (BSD-style)
@@ -823,12 +821,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
823 (current->sgid == egid) || 821 (current->sgid == egid) ||
824 capable(CAP_SETGID)) 822 capable(CAP_SETGID))
825 new_egid = egid; 823 new_egid = egid;
826 else { 824 else
827 return -EPERM; 825 return -EPERM;
828 }
829 } 826 }
830 if (new_egid != old_egid) 827 if (new_egid != old_egid) {
831 {
832 current->mm->dumpable = suid_dumpable; 828 current->mm->dumpable = suid_dumpable;
833 smp_wmb(); 829 smp_wmb();
834 } 830 }
@@ -857,19 +853,14 @@ asmlinkage long sys_setgid(gid_t gid)
857 if (retval) 853 if (retval)
858 return retval; 854 return retval;
859 855
860 if (capable(CAP_SETGID)) 856 if (capable(CAP_SETGID)) {
861 { 857 if (old_egid != gid) {
862 if(old_egid != gid)
863 {
864 current->mm->dumpable = suid_dumpable; 858 current->mm->dumpable = suid_dumpable;
865 smp_wmb(); 859 smp_wmb();
866 } 860 }
867 current->gid = current->egid = current->sgid = current->fsgid = gid; 861 current->gid = current->egid = current->sgid = current->fsgid = gid;
868 } 862 } else if ((gid == current->gid) || (gid == current->sgid)) {
869 else if ((gid == current->gid) || (gid == current->sgid)) 863 if (old_egid != gid) {
870 {
871 if(old_egid != gid)
872 {
873 current->mm->dumpable = suid_dumpable; 864 current->mm->dumpable = suid_dumpable;
874 smp_wmb(); 865 smp_wmb();
875 } 866 }
@@ -900,8 +891,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
900 891
901 switch_uid(new_user); 892 switch_uid(new_user);
902 893
903 if(dumpclear) 894 if (dumpclear) {
904 {
905 current->mm->dumpable = suid_dumpable; 895 current->mm->dumpable = suid_dumpable;
906 smp_wmb(); 896 smp_wmb();
907 } 897 }
@@ -957,8 +947,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
957 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) 947 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
958 return -EAGAIN; 948 return -EAGAIN;
959 949
960 if (new_euid != old_euid) 950 if (new_euid != old_euid) {
961 {
962 current->mm->dumpable = suid_dumpable; 951 current->mm->dumpable = suid_dumpable;
963 smp_wmb(); 952 smp_wmb();
964 } 953 }
@@ -1008,8 +997,7 @@ asmlinkage long sys_setuid(uid_t uid)
1008 } else if ((uid != current->uid) && (uid != new_suid)) 997 } else if ((uid != current->uid) && (uid != new_suid))
1009 return -EPERM; 998 return -EPERM;
1010 999
1011 if (old_euid != uid) 1000 if (old_euid != uid) {
1012 {
1013 current->mm->dumpable = suid_dumpable; 1001 current->mm->dumpable = suid_dumpable;
1014 smp_wmb(); 1002 smp_wmb();
1015 } 1003 }
@@ -1054,8 +1042,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
1054 return -EAGAIN; 1042 return -EAGAIN;
1055 } 1043 }
1056 if (euid != (uid_t) -1) { 1044 if (euid != (uid_t) -1) {
1057 if (euid != current->euid) 1045 if (euid != current->euid) {
1058 {
1059 current->mm->dumpable = suid_dumpable; 1046 current->mm->dumpable = suid_dumpable;
1060 smp_wmb(); 1047 smp_wmb();
1061 } 1048 }
@@ -1105,8 +1092,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
1105 return -EPERM; 1092 return -EPERM;
1106 } 1093 }
1107 if (egid != (gid_t) -1) { 1094 if (egid != (gid_t) -1) {
1108 if (egid != current->egid) 1095 if (egid != current->egid) {
1109 {
1110 current->mm->dumpable = suid_dumpable; 1096 current->mm->dumpable = suid_dumpable;
1111 smp_wmb(); 1097 smp_wmb();
1112 } 1098 }
@@ -1151,10 +1137,8 @@ asmlinkage long sys_setfsuid(uid_t uid)
1151 1137
1152 if (uid == current->uid || uid == current->euid || 1138 if (uid == current->uid || uid == current->euid ||
1153 uid == current->suid || uid == current->fsuid || 1139 uid == current->suid || uid == current->fsuid ||
1154 capable(CAP_SETUID)) 1140 capable(CAP_SETUID)) {
1155 { 1141 if (uid != old_fsuid) {
1156 if (uid != old_fsuid)
1157 {
1158 current->mm->dumpable = suid_dumpable; 1142 current->mm->dumpable = suid_dumpable;
1159 smp_wmb(); 1143 smp_wmb();
1160 } 1144 }
@@ -1182,10 +1166,8 @@ asmlinkage long sys_setfsgid(gid_t gid)
1182 1166
1183 if (gid == current->gid || gid == current->egid || 1167 if (gid == current->gid || gid == current->egid ||
1184 gid == current->sgid || gid == current->fsgid || 1168 gid == current->sgid || gid == current->fsgid ||
1185 capable(CAP_SETGID)) 1169 capable(CAP_SETGID)) {
1186 { 1170 if (gid != old_fsgid) {
1187 if (gid != old_fsgid)
1188 {
1189 current->mm->dumpable = suid_dumpable; 1171 current->mm->dumpable = suid_dumpable;
1190 smp_wmb(); 1172 smp_wmb();
1191 } 1173 }
@@ -1321,9 +1303,9 @@ out:
1321 1303
1322asmlinkage long sys_getpgid(pid_t pid) 1304asmlinkage long sys_getpgid(pid_t pid)
1323{ 1305{
1324 if (!pid) { 1306 if (!pid)
1325 return process_group(current); 1307 return process_group(current);
1326 } else { 1308 else {
1327 int retval; 1309 int retval;
1328 struct task_struct *p; 1310 struct task_struct *p;
1329 1311
@@ -1353,9 +1335,9 @@ asmlinkage long sys_getpgrp(void)
1353 1335
1354asmlinkage long sys_getsid(pid_t pid) 1336asmlinkage long sys_getsid(pid_t pid)
1355{ 1337{
1356 if (!pid) { 1338 if (!pid)
1357 return current->signal->session; 1339 return current->signal->session;
1358 } else { 1340 else {
1359 int retval; 1341 int retval;
1360 struct task_struct *p; 1342 struct task_struct *p;
1361 1343
@@ -1363,7 +1345,7 @@ asmlinkage long sys_getsid(pid_t pid)
1363 p = find_task_by_pid(pid); 1345 p = find_task_by_pid(pid);
1364 1346
1365 retval = -ESRCH; 1347 retval = -ESRCH;
1366 if(p) { 1348 if (p) {
1367 retval = security_task_getsid(p); 1349 retval = security_task_getsid(p);
1368 if (!retval) 1350 if (!retval)
1369 retval = p->signal->session; 1351 retval = p->signal->session;
@@ -1431,9 +1413,9 @@ struct group_info *groups_alloc(int gidsetsize)
1431 group_info->nblocks = nblocks; 1413 group_info->nblocks = nblocks;
1432 atomic_set(&group_info->usage, 1); 1414 atomic_set(&group_info->usage, 1);
1433 1415
1434 if (gidsetsize <= NGROUPS_SMALL) { 1416 if (gidsetsize <= NGROUPS_SMALL)
1435 group_info->blocks[0] = group_info->small_block; 1417 group_info->blocks[0] = group_info->small_block;
1436 } else { 1418 else {
1437 for (i = 0; i < nblocks; i++) { 1419 for (i = 0; i < nblocks; i++) {
1438 gid_t *b; 1420 gid_t *b;
1439 b = (void *)__get_free_page(GFP_USER); 1421 b = (void *)__get_free_page(GFP_USER);
@@ -1489,7 +1471,7 @@ static int groups_to_user(gid_t __user *grouplist,
1489/* fill a group_info from a user-space array - it must be allocated already */ 1471/* fill a group_info from a user-space array - it must be allocated already */
1490static int groups_from_user(struct group_info *group_info, 1472static int groups_from_user(struct group_info *group_info,
1491 gid_t __user *grouplist) 1473 gid_t __user *grouplist)
1492 { 1474{
1493 int i; 1475 int i;
1494 int count = group_info->ngroups; 1476 int count = group_info->ngroups;
1495 1477
@@ -1647,9 +1629,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1647int in_group_p(gid_t grp) 1629int in_group_p(gid_t grp)
1648{ 1630{
1649 int retval = 1; 1631 int retval = 1;
1650 if (grp != current->fsgid) { 1632 if (grp != current->fsgid)
1651 retval = groups_search(current->group_info, grp); 1633 retval = groups_search(current->group_info, grp);
1652 }
1653 return retval; 1634 return retval;
1654} 1635}
1655 1636
@@ -1658,9 +1639,8 @@ EXPORT_SYMBOL(in_group_p);
1658int in_egroup_p(gid_t grp) 1639int in_egroup_p(gid_t grp)
1659{ 1640{
1660 int retval = 1; 1641 int retval = 1;
1661 if (grp != current->egid) { 1642 if (grp != current->egid)
1662 retval = groups_search(current->group_info, grp); 1643 retval = groups_search(current->group_info, grp);
1663 }
1664 return retval; 1644 return retval;
1665} 1645}
1666 1646
@@ -1675,7 +1655,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name)
1675 int errno = 0; 1655 int errno = 0;
1676 1656
1677 down_read(&uts_sem); 1657 down_read(&uts_sem);
1678 if (copy_to_user(name,&system_utsname,sizeof *name)) 1658 if (copy_to_user(name, utsname(), sizeof *name))
1679 errno = -EFAULT; 1659 errno = -EFAULT;
1680 up_read(&uts_sem); 1660 up_read(&uts_sem);
1681 return errno; 1661 return errno;
@@ -1693,8 +1673,8 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1693 down_write(&uts_sem); 1673 down_write(&uts_sem);
1694 errno = -EFAULT; 1674 errno = -EFAULT;
1695 if (!copy_from_user(tmp, name, len)) { 1675 if (!copy_from_user(tmp, name, len)) {
1696 memcpy(system_utsname.nodename, tmp, len); 1676 memcpy(utsname()->nodename, tmp, len);
1697 system_utsname.nodename[len] = 0; 1677 utsname()->nodename[len] = 0;
1698 errno = 0; 1678 errno = 0;
1699 } 1679 }
1700 up_write(&uts_sem); 1680 up_write(&uts_sem);
@@ -1710,11 +1690,11 @@ asmlinkage long sys_gethostname(char __user *name, int len)
1710 if (len < 0) 1690 if (len < 0)
1711 return -EINVAL; 1691 return -EINVAL;
1712 down_read(&uts_sem); 1692 down_read(&uts_sem);
1713 i = 1 + strlen(system_utsname.nodename); 1693 i = 1 + strlen(utsname()->nodename);
1714 if (i > len) 1694 if (i > len)
1715 i = len; 1695 i = len;
1716 errno = 0; 1696 errno = 0;
1717 if (copy_to_user(name, system_utsname.nodename, i)) 1697 if (copy_to_user(name, utsname()->nodename, i))
1718 errno = -EFAULT; 1698 errno = -EFAULT;
1719 up_read(&uts_sem); 1699 up_read(&uts_sem);
1720 return errno; 1700 return errno;
@@ -1739,8 +1719,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1739 down_write(&uts_sem); 1719 down_write(&uts_sem);
1740 errno = -EFAULT; 1720 errno = -EFAULT;
1741 if (!copy_from_user(tmp, name, len)) { 1721 if (!copy_from_user(tmp, name, len)) {
1742 memcpy(system_utsname.domainname, tmp, len); 1722 memcpy(utsname()->domainname, tmp, len);
1743 system_utsname.domainname[len] = 0; 1723 utsname()->domainname[len] = 0;
1744 errno = 0; 1724 errno = 0;
1745 } 1725 }
1746 up_write(&uts_sem); 1726 up_write(&uts_sem);
@@ -1775,9 +1755,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1775 task_lock(current->group_leader); 1755 task_lock(current->group_leader);
1776 x = current->signal->rlim[resource]; 1756 x = current->signal->rlim[resource];
1777 task_unlock(current->group_leader); 1757 task_unlock(current->group_leader);
1778 if(x.rlim_cur > 0x7FFFFFFF) 1758 if (x.rlim_cur > 0x7FFFFFFF)
1779 x.rlim_cur = 0x7FFFFFFF; 1759 x.rlim_cur = 0x7FFFFFFF;
1780 if(x.rlim_max > 0x7FFFFFFF) 1760 if (x.rlim_max > 0x7FFFFFFF)
1781 x.rlim_max = 0x7FFFFFFF; 1761 x.rlim_max = 0x7FFFFFFF;
1782 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1762 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
1783} 1763}
@@ -2083,12 +2063,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2083 * padding 2063 * padding
2084 */ 2064 */
2085 unsigned long t0, t1; 2065 unsigned long t0, t1;
2086 get_user(t0, &cache->t0); 2066 get_user(t0, &cache->blob[0]);
2087 get_user(t1, &cache->t1); 2067 get_user(t1, &cache->blob[1]);
2088 t0++; 2068 t0++;
2089 t1++; 2069 t1++;
2090 put_user(t0, &cache->t0); 2070 put_user(t0, &cache->blob[0]);
2091 put_user(t1, &cache->t1); 2071 put_user(t1, &cache->blob[1]);
2092 } 2072 }
2093 return err ? -EFAULT : 0; 2073 return err ? -EFAULT : 0;
2094} 2074}
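The kernel/sys.c hunks above swap every direct use of the system_utsname global for the utsname() accessor, so uname data is resolved through the calling task's UTS namespace while readers and writers still serialize on uts_sem. A minimal sketch of such an accessor, assuming the current->nsproxy->uts_ns layout this series introduces (not quoted from the patch itself):

    /* Sketch only: per-task view of the uname strings, assuming the
     * nsproxy/uts_ns layout added by this series. */
    static inline struct new_utsname *utsname(void)
    {
        return &current->nsproxy->uts_ns->name;
    }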
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bece67..7a3b2e75f0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -134,3 +134,8 @@ cond_syscall(sys_madvise);
134cond_syscall(sys_mremap); 134cond_syscall(sys_mremap);
135cond_syscall(sys_remap_file_pages); 135cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages); 136cond_syscall(compat_sys_move_pages);
137
138/* block-layer dependent */
139cond_syscall(sys_bdflush);
140cond_syscall(sys_ioprio_set);
141cond_syscall(sys_ioprio_get);
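The new sys_ni.c entries mark sys_bdflush and the ioprio syscalls as conditional: when the block layer is configured out, the unresolved symbols fall back to sys_ni_syscall and the calls simply return -ENOSYS. Roughly how cond_syscall() achieves that, sketched here as an assumption since the exact macro text is architecture- and version-dependent:

    /* Sketch: cond_syscall() is approximately a weak alias onto the
     * shared "not implemented" stub. */
    #define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")

    asmlinkage long sys_ni_syscall(void)
    {
        return -ENOSYS;
    }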
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9535a38399..8020fb273c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
52extern int proc_nr_files(ctl_table *table, int write, struct file *filp, 52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
53 void __user *buffer, size_t *lenp, loff_t *ppos); 53 void __user *buffer, size_t *lenp, loff_t *ppos);
54 54
55#ifdef CONFIG_X86
56#include <asm/nmi.h>
57#endif
58
55#if defined(CONFIG_SYSCTL) 59#if defined(CONFIG_SYSCTL)
56 60
57/* External variables not in a header file. */ 61/* External variables not in a header file. */
@@ -64,7 +68,6 @@ extern int sysrq_enabled;
64extern int core_uses_pid; 68extern int core_uses_pid;
65extern int suid_dumpable; 69extern int suid_dumpable;
66extern char core_pattern[]; 70extern char core_pattern[];
67extern int cad_pid;
68extern int pid_max; 71extern int pid_max;
69extern int min_free_kbytes; 72extern int min_free_kbytes;
70extern int printk_ratelimit_jiffies; 73extern int printk_ratelimit_jiffies;
@@ -74,13 +77,6 @@ extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
75extern int compat_log; 78extern int compat_log;
76 79
77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
78int unknown_nmi_panic;
79int nmi_watchdog_enabled;
80extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
81 void __user *, size_t *, loff_t *);
82#endif
83
84/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 80/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
85static int maxolduid = 65535; 81static int maxolduid = 65535;
86static int minolduid; 82static int minolduid;
@@ -95,13 +91,8 @@ extern char modprobe_path[];
95extern int sg_big_buff; 91extern int sg_big_buff;
96#endif 92#endif
97#ifdef CONFIG_SYSVIPC 93#ifdef CONFIG_SYSVIPC
98extern size_t shm_ctlmax; 94static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
99extern size_t shm_ctlall; 95 void __user *buffer, size_t *lenp, loff_t *ppos);
100extern int shm_ctlmni;
101extern int msg_ctlmax;
102extern int msg_ctlmnb;
103extern int msg_ctlmni;
104extern int sem_ctls[];
105#endif 96#endif
106 97
107#ifdef __sparc__ 98#ifdef __sparc__
@@ -142,7 +133,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
142 void __user *, size_t, ctl_table *, void **); 133 void __user *, size_t, ctl_table *, void **);
143#endif 134#endif
144 135
145static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 136static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos);
138
139static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
146 void __user *buffer, size_t *lenp, loff_t *ppos); 140 void __user *buffer, size_t *lenp, loff_t *ppos);
147 141
148static ctl_table root_table[]; 142static ctl_table root_table[];
@@ -232,51 +226,100 @@ static ctl_table root_table[] = {
232}; 226};
233 227
234static ctl_table kern_table[] = { 228static ctl_table kern_table[] = {
229#ifndef CONFIG_UTS_NS
230 {
231 .ctl_name = KERN_OSTYPE,
232 .procname = "ostype",
233 .data = init_uts_ns.name.sysname,
234 .maxlen = sizeof(init_uts_ns.name.sysname),
235 .mode = 0444,
236 .proc_handler = &proc_do_uts_string,
237 .strategy = &sysctl_string,
238 },
239 {
240 .ctl_name = KERN_OSRELEASE,
241 .procname = "osrelease",
242 .data = init_uts_ns.name.release,
243 .maxlen = sizeof(init_uts_ns.name.release),
244 .mode = 0444,
245 .proc_handler = &proc_do_uts_string,
246 .strategy = &sysctl_string,
247 },
248 {
249 .ctl_name = KERN_VERSION,
250 .procname = "version",
251 .data = init_uts_ns.name.version,
252 .maxlen = sizeof(init_uts_ns.name.version),
253 .mode = 0444,
254 .proc_handler = &proc_do_uts_string,
255 .strategy = &sysctl_string,
256 },
257 {
258 .ctl_name = KERN_NODENAME,
259 .procname = "hostname",
260 .data = init_uts_ns.name.nodename,
261 .maxlen = sizeof(init_uts_ns.name.nodename),
262 .mode = 0644,
263 .proc_handler = &proc_do_uts_string,
264 .strategy = &sysctl_string,
265 },
266 {
267 .ctl_name = KERN_DOMAINNAME,
268 .procname = "domainname",
269 .data = init_uts_ns.name.domainname,
270 .maxlen = sizeof(init_uts_ns.name.domainname),
271 .mode = 0644,
272 .proc_handler = &proc_do_uts_string,
273 .strategy = &sysctl_string,
274 },
275#else /* !CONFIG_UTS_NS */
235 { 276 {
236 .ctl_name = KERN_OSTYPE, 277 .ctl_name = KERN_OSTYPE,
237 .procname = "ostype", 278 .procname = "ostype",
238 .data = system_utsname.sysname, 279 .data = NULL,
239 .maxlen = sizeof(system_utsname.sysname), 280 /* could maybe use __NEW_UTS_LEN here? */
281 .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
240 .mode = 0444, 282 .mode = 0444,
241 .proc_handler = &proc_doutsstring, 283 .proc_handler = &proc_do_uts_string,
242 .strategy = &sysctl_string, 284 .strategy = &sysctl_string,
243 }, 285 },
244 { 286 {
245 .ctl_name = KERN_OSRELEASE, 287 .ctl_name = KERN_OSRELEASE,
246 .procname = "osrelease", 288 .procname = "osrelease",
247 .data = system_utsname.release, 289 .data = NULL,
248 .maxlen = sizeof(system_utsname.release), 290 .maxlen = FIELD_SIZEOF(struct new_utsname, release),
249 .mode = 0444, 291 .mode = 0444,
250 .proc_handler = &proc_doutsstring, 292 .proc_handler = &proc_do_uts_string,
251 .strategy = &sysctl_string, 293 .strategy = &sysctl_string,
252 }, 294 },
253 { 295 {
254 .ctl_name = KERN_VERSION, 296 .ctl_name = KERN_VERSION,
255 .procname = "version", 297 .procname = "version",
256 .data = system_utsname.version, 298 .data = NULL,
257 .maxlen = sizeof(system_utsname.version), 299 .maxlen = FIELD_SIZEOF(struct new_utsname, version),
258 .mode = 0444, 300 .mode = 0444,
259 .proc_handler = &proc_doutsstring, 301 .proc_handler = &proc_do_uts_string,
260 .strategy = &sysctl_string, 302 .strategy = &sysctl_string,
261 }, 303 },
262 { 304 {
263 .ctl_name = KERN_NODENAME, 305 .ctl_name = KERN_NODENAME,
264 .procname = "hostname", 306 .procname = "hostname",
265 .data = system_utsname.nodename, 307 .data = NULL,
266 .maxlen = sizeof(system_utsname.nodename), 308 .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
267 .mode = 0644, 309 .mode = 0644,
268 .proc_handler = &proc_doutsstring, 310 .proc_handler = &proc_do_uts_string,
269 .strategy = &sysctl_string, 311 .strategy = &sysctl_string,
270 }, 312 },
271 { 313 {
272 .ctl_name = KERN_DOMAINNAME, 314 .ctl_name = KERN_DOMAINNAME,
273 .procname = "domainname", 315 .procname = "domainname",
274 .data = system_utsname.domainname, 316 .data = NULL,
275 .maxlen = sizeof(system_utsname.domainname), 317 .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
276 .mode = 0644, 318 .mode = 0644,
277 .proc_handler = &proc_doutsstring, 319 .proc_handler = &proc_do_uts_string,
278 .strategy = &sysctl_string, 320 .strategy = &sysctl_string,
279 }, 321 },
322#endif /* !CONFIG_UTS_NS */
280 { 323 {
281 .ctl_name = KERN_PANIC, 324 .ctl_name = KERN_PANIC,
282 .procname = "panic", 325 .procname = "panic",
@@ -297,7 +340,7 @@ static ctl_table kern_table[] = {
297 .ctl_name = KERN_CORE_PATTERN, 340 .ctl_name = KERN_CORE_PATTERN,
298 .procname = "core_pattern", 341 .procname = "core_pattern",
299 .data = core_pattern, 342 .data = core_pattern,
300 .maxlen = 64, 343 .maxlen = 128,
301 .mode = 0644, 344 .mode = 0644,
302 .proc_handler = &proc_dostring, 345 .proc_handler = &proc_dostring,
303 .strategy = &sysctl_string, 346 .strategy = &sysctl_string,
@@ -435,58 +478,58 @@ static ctl_table kern_table[] = {
435 { 478 {
436 .ctl_name = KERN_SHMMAX, 479 .ctl_name = KERN_SHMMAX,
437 .procname = "shmmax", 480 .procname = "shmmax",
438 .data = &shm_ctlmax, 481 .data = NULL,
439 .maxlen = sizeof (size_t), 482 .maxlen = sizeof (size_t),
440 .mode = 0644, 483 .mode = 0644,
441 .proc_handler = &proc_doulongvec_minmax, 484 .proc_handler = &proc_do_ipc_string,
442 }, 485 },
443 { 486 {
444 .ctl_name = KERN_SHMALL, 487 .ctl_name = KERN_SHMALL,
445 .procname = "shmall", 488 .procname = "shmall",
446 .data = &shm_ctlall, 489 .data = NULL,
447 .maxlen = sizeof (size_t), 490 .maxlen = sizeof (size_t),
448 .mode = 0644, 491 .mode = 0644,
449 .proc_handler = &proc_doulongvec_minmax, 492 .proc_handler = &proc_do_ipc_string,
450 }, 493 },
451 { 494 {
452 .ctl_name = KERN_SHMMNI, 495 .ctl_name = KERN_SHMMNI,
453 .procname = "shmmni", 496 .procname = "shmmni",
454 .data = &shm_ctlmni, 497 .data = NULL,
455 .maxlen = sizeof (int), 498 .maxlen = sizeof (int),
456 .mode = 0644, 499 .mode = 0644,
457 .proc_handler = &proc_dointvec, 500 .proc_handler = &proc_do_ipc_string,
458 }, 501 },
459 { 502 {
460 .ctl_name = KERN_MSGMAX, 503 .ctl_name = KERN_MSGMAX,
461 .procname = "msgmax", 504 .procname = "msgmax",
462 .data = &msg_ctlmax, 505 .data = NULL,
463 .maxlen = sizeof (int), 506 .maxlen = sizeof (int),
464 .mode = 0644, 507 .mode = 0644,
465 .proc_handler = &proc_dointvec, 508 .proc_handler = &proc_do_ipc_string,
466 }, 509 },
467 { 510 {
468 .ctl_name = KERN_MSGMNI, 511 .ctl_name = KERN_MSGMNI,
469 .procname = "msgmni", 512 .procname = "msgmni",
470 .data = &msg_ctlmni, 513 .data = NULL,
471 .maxlen = sizeof (int), 514 .maxlen = sizeof (int),
472 .mode = 0644, 515 .mode = 0644,
473 .proc_handler = &proc_dointvec, 516 .proc_handler = &proc_do_ipc_string,
474 }, 517 },
475 { 518 {
476 .ctl_name = KERN_MSGMNB, 519 .ctl_name = KERN_MSGMNB,
477 .procname = "msgmnb", 520 .procname = "msgmnb",
478 .data = &msg_ctlmnb, 521 .data = NULL,
479 .maxlen = sizeof (int), 522 .maxlen = sizeof (int),
480 .mode = 0644, 523 .mode = 0644,
481 .proc_handler = &proc_dointvec, 524 .proc_handler = &proc_do_ipc_string,
482 }, 525 },
483 { 526 {
484 .ctl_name = KERN_SEM, 527 .ctl_name = KERN_SEM,
485 .procname = "sem", 528 .procname = "sem",
486 .data = &sem_ctls, 529 .data = NULL,
487 .maxlen = 4*sizeof (int), 530 .maxlen = 4*sizeof (int),
488 .mode = 0644, 531 .mode = 0644,
489 .proc_handler = &proc_dointvec, 532 .proc_handler = &proc_do_ipc_string,
490 }, 533 },
491#endif 534#endif
492#ifdef CONFIG_MAGIC_SYSRQ 535#ifdef CONFIG_MAGIC_SYSRQ
@@ -502,10 +545,10 @@ static ctl_table kern_table[] = {
502 { 545 {
503 .ctl_name = KERN_CADPID, 546 .ctl_name = KERN_CADPID,
504 .procname = "cad_pid", 547 .procname = "cad_pid",
505 .data = &cad_pid, 548 .data = NULL,
506 .maxlen = sizeof (int), 549 .maxlen = sizeof (int),
507 .mode = 0600, 550 .mode = 0600,
508 .proc_handler = &proc_dointvec, 551 .proc_handler = &proc_do_cad_pid,
509 }, 552 },
510 { 553 {
511 .ctl_name = KERN_MAX_THREADS, 554 .ctl_name = KERN_MAX_THREADS,
@@ -1627,32 +1670,15 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf,
1627 return do_rw_proc(1, file, (char __user *) buf, count, ppos); 1670 return do_rw_proc(1, file, (char __user *) buf, count, ppos);
1628} 1671}
1629 1672
1630/** 1673static int _proc_do_string(void* data, int maxlen, int write,
1631 * proc_dostring - read a string sysctl 1674 struct file *filp, void __user *buffer,
1632 * @table: the sysctl table 1675 size_t *lenp, loff_t *ppos)
1633 * @write: %TRUE if this is a write to the sysctl file
1634 * @filp: the file structure
1635 * @buffer: the user buffer
1636 * @lenp: the size of the user buffer
1637 * @ppos: file position
1638 *
1639 * Reads/writes a string from/to the user buffer. If the kernel
1640 * buffer provided is not large enough to hold the string, the
1641 * string is truncated. The copied string is %NULL-terminated.
1642 * If the string is being read by the user process, it is copied
1643 * and a newline '\n' is added. It is truncated if the buffer is
1644 * not large enough.
1645 *
1646 * Returns 0 on success.
1647 */
1648int proc_dostring(ctl_table *table, int write, struct file *filp,
1649 void __user *buffer, size_t *lenp, loff_t *ppos)
1650{ 1676{
1651 size_t len; 1677 size_t len;
1652 char __user *p; 1678 char __user *p;
1653 char c; 1679 char c;
1654 1680
1655 if (!table->data || !table->maxlen || !*lenp || 1681 if (!data || !maxlen || !*lenp ||
1656 (*ppos && !write)) { 1682 (*ppos && !write)) {
1657 *lenp = 0; 1683 *lenp = 0;
1658 return 0; 1684 return 0;
@@ -1668,20 +1694,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1668 break; 1694 break;
1669 len++; 1695 len++;
1670 } 1696 }
1671 if (len >= table->maxlen) 1697 if (len >= maxlen)
1672 len = table->maxlen-1; 1698 len = maxlen-1;
1673 if(copy_from_user(table->data, buffer, len)) 1699 if(copy_from_user(data, buffer, len))
1674 return -EFAULT; 1700 return -EFAULT;
1675 ((char *) table->data)[len] = 0; 1701 ((char *) data)[len] = 0;
1676 *ppos += *lenp; 1702 *ppos += *lenp;
1677 } else { 1703 } else {
1678 len = strlen(table->data); 1704 len = strlen(data);
1679 if (len > table->maxlen) 1705 if (len > maxlen)
1680 len = table->maxlen; 1706 len = maxlen;
1681 if (len > *lenp) 1707 if (len > *lenp)
1682 len = *lenp; 1708 len = *lenp;
1683 if (len) 1709 if (len)
1684 if(copy_to_user(buffer, table->data, len)) 1710 if(copy_to_user(buffer, data, len))
1685 return -EFAULT; 1711 return -EFAULT;
1686 if (len < *lenp) { 1712 if (len < *lenp) {
1687 if(put_user('\n', ((char __user *) buffer) + len)) 1713 if(put_user('\n', ((char __user *) buffer) + len))
@@ -1694,12 +1720,38 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1694 return 0; 1720 return 0;
1695} 1721}
1696 1722
1723/**
1724 * proc_dostring - read a string sysctl
1725 * @table: the sysctl table
1726 * @write: %TRUE if this is a write to the sysctl file
1727 * @filp: the file structure
1728 * @buffer: the user buffer
1729 * @lenp: the size of the user buffer
1730 * @ppos: file position
1731 *
1732 * Reads/writes a string from/to the user buffer. If the kernel
1733 * buffer provided is not large enough to hold the string, the
1734 * string is truncated. The copied string is %NULL-terminated.
1735 * If the string is being read by the user process, it is copied
1736 * and a newline '\n' is added. It is truncated if the buffer is
1737 * not large enough.
1738 *
1739 * Returns 0 on success.
1740 */
1741int proc_dostring(ctl_table *table, int write, struct file *filp,
1742 void __user *buffer, size_t *lenp, loff_t *ppos)
1743{
1744 return _proc_do_string(table->data, table->maxlen, write, filp,
1745 buffer, lenp, ppos);
1746}
1747
1697/* 1748/*
1698 * Special case of dostring for the UTS structure. This has locks 1749 * Special case of dostring for the UTS structure. This has locks
1699 * to observe. Should this be in kernel/sys.c ???? 1750 * to observe. Should this be in kernel/sys.c ????
1700 */ 1751 */
1701 1752
1702static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 1753#ifndef CONFIG_UTS_NS
1754static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1703 void __user *buffer, size_t *lenp, loff_t *ppos) 1755 void __user *buffer, size_t *lenp, loff_t *ppos)
1704{ 1756{
1705 int r; 1757 int r;
@@ -1715,6 +1767,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
1715 } 1767 }
1716 return r; 1768 return r;
1717} 1769}
1770#else /* !CONFIG_UTS_NS */
1771static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1772 void __user *buffer, size_t *lenp, loff_t *ppos)
1773{
1774 int r;
1775 struct uts_namespace* uts_ns = current->nsproxy->uts_ns;
1776 char* which;
1777
1778 switch (table->ctl_name) {
1779 case KERN_OSTYPE:
1780 which = uts_ns->name.sysname;
1781 break;
1782 case KERN_NODENAME:
1783 which = uts_ns->name.nodename;
1784 break;
1785 case KERN_OSRELEASE:
1786 which = uts_ns->name.release;
1787 break;
1788 case KERN_VERSION:
1789 which = uts_ns->name.version;
1790 break;
1791 case KERN_DOMAINNAME:
1792 which = uts_ns->name.domainname;
1793 break;
1794 default:
1795 r = -EINVAL;
1796 goto out;
1797 }
1798
1799 if (!write) {
1800 down_read(&uts_sem);
1801 r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
1802 up_read(&uts_sem);
1803 } else {
1804 down_write(&uts_sem);
1805 r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
1806 up_write(&uts_sem);
1807 }
1808 out:
1809 return r;
1810}
1811#endif /* !CONFIG_UTS_NS */
1718 1812
1719static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1813static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1720 int *valp, 1814 int *valp,
@@ -1735,8 +1829,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1735 return 0; 1829 return 0;
1736} 1830}
1737 1831
1738static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, 1832static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1739 void __user *buffer, size_t *lenp, loff_t *ppos, 1833 int write, struct file *filp, void __user *buffer,
1834 size_t *lenp, loff_t *ppos,
1740 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 1835 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
1741 int write, void *data), 1836 int write, void *data),
1742 void *data) 1837 void *data)
@@ -1749,13 +1844,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
1749 char buf[TMPBUFLEN], *p; 1844 char buf[TMPBUFLEN], *p;
1750 char __user *s = buffer; 1845 char __user *s = buffer;
1751 1846
1752 if (!table->data || !table->maxlen || !*lenp || 1847 if (!tbl_data || !table->maxlen || !*lenp ||
1753 (*ppos && !write)) { 1848 (*ppos && !write)) {
1754 *lenp = 0; 1849 *lenp = 0;
1755 return 0; 1850 return 0;
1756 } 1851 }
1757 1852
1758 i = (int *) table->data; 1853 i = (int *) tbl_data;
1759 vleft = table->maxlen / sizeof(*i); 1854 vleft = table->maxlen / sizeof(*i);
1760 left = *lenp; 1855 left = *lenp;
1761 1856
@@ -1844,6 +1939,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
1844#undef TMPBUFLEN 1939#undef TMPBUFLEN
1845} 1940}
1846 1941
1942static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
1943 void __user *buffer, size_t *lenp, loff_t *ppos,
1944 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
1945 int write, void *data),
1946 void *data)
1947{
1948 return __do_proc_dointvec(table->data, table, write, filp,
1949 buffer, lenp, ppos, conv, data);
1950}
1951
1847/** 1952/**
1848 * proc_dointvec - read a vector of integers 1953 * proc_dointvec - read a vector of integers
1849 * @table: the sysctl table 1954 * @table: the sysctl table
@@ -1977,7 +2082,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
1977 do_proc_dointvec_minmax_conv, &param); 2082 do_proc_dointvec_minmax_conv, &param);
1978} 2083}
1979 2084
1980static int do_proc_doulongvec_minmax(ctl_table *table, int write, 2085static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
1981 struct file *filp, 2086 struct file *filp,
1982 void __user *buffer, 2087 void __user *buffer,
1983 size_t *lenp, loff_t *ppos, 2088 size_t *lenp, loff_t *ppos,
@@ -1991,13 +2096,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
1991 char buf[TMPBUFLEN], *p; 2096 char buf[TMPBUFLEN], *p;
1992 char __user *s = buffer; 2097 char __user *s = buffer;
1993 2098
1994 if (!table->data || !table->maxlen || !*lenp || 2099 if (!data || !table->maxlen || !*lenp ||
1995 (*ppos && !write)) { 2100 (*ppos && !write)) {
1996 *lenp = 0; 2101 *lenp = 0;
1997 return 0; 2102 return 0;
1998 } 2103 }
1999 2104
2000 i = (unsigned long *) table->data; 2105 i = (unsigned long *) data;
2001 min = (unsigned long *) table->extra1; 2106 min = (unsigned long *) table->extra1;
2002 max = (unsigned long *) table->extra2; 2107 max = (unsigned long *) table->extra2;
2003 vleft = table->maxlen / sizeof(unsigned long); 2108 vleft = table->maxlen / sizeof(unsigned long);
@@ -2082,6 +2187,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
2082#undef TMPBUFLEN 2187#undef TMPBUFLEN
2083} 2188}
2084 2189
2190static int do_proc_doulongvec_minmax(ctl_table *table, int write,
2191 struct file *filp,
2192 void __user *buffer,
2193 size_t *lenp, loff_t *ppos,
2194 unsigned long convmul,
2195 unsigned long convdiv)
2196{
2197 return __do_proc_doulongvec_minmax(table->data, table, write,
2198 filp, buffer, lenp, ppos, convmul, convdiv);
2199}
2200
2085/** 2201/**
2086 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2202 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2087 * @table: the sysctl table 2203 * @table: the sysctl table
@@ -2270,6 +2386,71 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2270 do_proc_dointvec_ms_jiffies_conv, NULL); 2386 do_proc_dointvec_ms_jiffies_conv, NULL);
2271} 2387}
2272 2388
2389#ifdef CONFIG_SYSVIPC
2390static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2391 void __user *buffer, size_t *lenp, loff_t *ppos)
2392{
2393 void *data;
2394 struct ipc_namespace *ns;
2395
2396 ns = current->nsproxy->ipc_ns;
2397
2398 switch (table->ctl_name) {
2399 case KERN_SHMMAX:
2400 data = &ns->shm_ctlmax;
2401 goto proc_minmax;
2402 case KERN_SHMALL:
2403 data = &ns->shm_ctlall;
2404 goto proc_minmax;
2405 case KERN_SHMMNI:
2406 data = &ns->shm_ctlmni;
2407 break;
2408 case KERN_MSGMAX:
2409 data = &ns->msg_ctlmax;
2410 break;
2411 case KERN_MSGMNI:
2412 data = &ns->msg_ctlmni;
2413 break;
2414 case KERN_MSGMNB:
2415 data = &ns->msg_ctlmnb;
2416 break;
2417 case KERN_SEM:
2418 data = &ns->sem_ctls;
2419 break;
2420 default:
2421 return -EINVAL;
2422 }
2423
2424 return __do_proc_dointvec(data, table, write, filp, buffer,
2425 lenp, ppos, NULL, NULL);
2426proc_minmax:
2427 return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
2428 lenp, ppos, 1l, 1l);
2429}
2430#endif
2431
2432static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
2433 void __user *buffer, size_t *lenp, loff_t *ppos)
2434{
2435 struct pid *new_pid;
2436 pid_t tmp;
2437 int r;
2438
2439 tmp = pid_nr(cad_pid);
2440
2441 r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
2442 lenp, ppos, NULL, NULL);
2443 if (r || !write)
2444 return r;
2445
2446 new_pid = find_get_pid(tmp);
2447 if (!new_pid)
2448 return -ESRCH;
2449
2450 put_pid(xchg(&cad_pid, new_pid));
2451 return 0;
2452}
2453
2273#else /* CONFIG_PROC_FS */ 2454#else /* CONFIG_PROC_FS */
2274 2455
2275int proc_dostring(ctl_table *table, int write, struct file *filp, 2456int proc_dostring(ctl_table *table, int write, struct file *filp,
@@ -2278,12 +2459,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
2278 return -ENOSYS; 2459 return -ENOSYS;
2279} 2460}
2280 2461
2281static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 2462static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
2282 void __user *buffer, size_t *lenp, loff_t *ppos) 2463 void __user *buffer, size_t *lenp, loff_t *ppos)
2283{ 2464{
2284 return -ENOSYS; 2465 return -ENOSYS;
2285} 2466}
2286 2467
2468#ifdef CONFIG_SYSVIPC
2469static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2470 void __user *buffer, size_t *lenp, loff_t *ppos)
2471{
2472 return -ENOSYS;
2473}
2474#endif
2475
2287int proc_dointvec(ctl_table *table, int write, struct file *filp, 2476int proc_dointvec(ctl_table *table, int write, struct file *filp,
2288 void __user *buffer, size_t *lenp, loff_t *ppos) 2477 void __user *buffer, size_t *lenp, loff_t *ppos)
2289{ 2478{
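With CONFIG_UTS_NS and CONFIG_SYSVIPC namespaces in play, the kern_table entries above can no longer point .data at a single global, so they leave .data NULL and let the handler look the object up in current->nsproxy on each access; _proc_do_string(), __do_proc_dointvec() and __do_proc_doulongvec_minmax() are split out precisely so a handler can hand them that per-namespace pointer. A condensed sketch of the pattern (the ctl_name switch and uts_sem locking of the real proc_do_uts_string() are trimmed here):

    /* Sketch of the dispatch pattern: resolve the per-namespace buffer,
     * then reuse the generic string worker on it. */
    static int example_uts_handler(ctl_table *table, int write, struct file *filp,
                                   void __user *buffer, size_t *lenp, loff_t *ppos)
    {
        char *which = current->nsproxy->uts_ns->name.nodename;  /* KERN_NODENAME case */

        return _proc_do_string(which, table->maxlen, write, filp,
                               buffer, lenp, ppos);
    }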
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2ed4040d0d..5d6a8c54ee 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,7 +18,9 @@
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h> 20#include <linux/taskstats_kern.h>
21#include <linux/tsacct_kern.h>
21#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/tsacct_kern.h>
22#include <linux/cpumask.h> 24#include <linux/cpumask.h>
23#include <linux/percpu.h> 25#include <linux/percpu.h>
24#include <net/genetlink.h> 26#include <net/genetlink.h>
@@ -75,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
75 /* 77 /*
76 * If new attributes are added, please revisit this allocation 78 * If new attributes are added, please revisit this allocation
77 */ 79 */
78 skb = nlmsg_new(size, GFP_KERNEL); 80 skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
79 if (!skb) 81 if (!skb)
80 return -ENOMEM; 82 return -ENOMEM;
81 83
@@ -198,7 +200,13 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
198 */ 200 */
199 201
200 delayacct_add_tsk(stats, tsk); 202 delayacct_add_tsk(stats, tsk);
203
204 /* fill in basic acct fields */
201 stats->version = TASKSTATS_VERSION; 205 stats->version = TASKSTATS_VERSION;
206 bacct_add_tsk(stats, tsk);
207
208 /* fill in extended acct fields */
209 xacct_add_tsk(stats, tsk);
202 210
203 /* Define err: label here if needed */ 211 /* Define err: label here if needed */
204 put_task_struct(tsk); 212 put_task_struct(tsk);
diff --git a/kernel/time.c b/kernel/time.c
index 5bd4897476..0e017bff4c 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
203} 203}
204 204
205/* we call this to notify the arch when the clock is being
206 * controlled. If no such arch routine, do nothing.
207 */
208void __attribute__ ((weak)) notify_arch_cmos_timer(void)
209{
210 return;
211}
212
213/* adjtimex mainly allows reading (and writing, if superuser) of
214 * kernel time-keeping variables. used by xntpd.
215 */
216int do_adjtimex(struct timex *txc)
217{
218 long ltemp, mtemp, save_adjust;
219 int result;
220
221 /* In order to modify anything, you gotta be super-user! */
222 if (txc->modes && !capable(CAP_SYS_TIME))
223 return -EPERM;
224
225 /* Now we validate the data before disabling interrupts */
226
227 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
228 /* singleshot must not be used with any other mode bits */
229 if (txc->modes != ADJ_OFFSET_SINGLESHOT)
230 return -EINVAL;
231
232 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
233 /* adjustment Offset limited to +- .512 seconds */
234 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
235 return -EINVAL;
236
237 /* if the quartz is off by more than 10% something is VERY wrong ! */
238 if (txc->modes & ADJ_TICK)
239 if (txc->tick < 900000/USER_HZ ||
240 txc->tick > 1100000/USER_HZ)
241 return -EINVAL;
242
243 write_seqlock_irq(&xtime_lock);
244 result = time_state; /* mostly `TIME_OK' */
245
246 /* Save for later - semantics of adjtime is to return old value */
247 save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
248
249#if 0 /* STA_CLOCKERR is never set yet */
250 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
251#endif
252 /* If there are input parameters, then process them */
253 if (txc->modes)
254 {
255 if (txc->modes & ADJ_STATUS) /* only set allowed bits */
256 time_status = (txc->status & ~STA_RONLY) |
257 (time_status & STA_RONLY);
258
259 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
260 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
261 result = -EINVAL;
262 goto leave;
263 }
264 time_freq = txc->freq;
265 }
266
267 if (txc->modes & ADJ_MAXERROR) {
268 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
269 result = -EINVAL;
270 goto leave;
271 }
272 time_maxerror = txc->maxerror;
273 }
274
275 if (txc->modes & ADJ_ESTERROR) {
276 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
277 result = -EINVAL;
278 goto leave;
279 }
280 time_esterror = txc->esterror;
281 }
282
283 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
284 if (txc->constant < 0) { /* NTP v4 uses values > 6 */
285 result = -EINVAL;
286 goto leave;
287 }
288 time_constant = txc->constant;
289 }
290
291 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
292 if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
293 /* adjtime() is independent from ntp_adjtime() */
294 if ((time_next_adjust = txc->offset) == 0)
295 time_adjust = 0;
296 }
297 else if (time_status & STA_PLL) {
298 ltemp = txc->offset;
299
300 /*
301 * Scale the phase adjustment and
302 * clamp to the operating range.
303 */
304 if (ltemp > MAXPHASE)
305 time_offset = MAXPHASE << SHIFT_UPDATE;
306 else if (ltemp < -MAXPHASE)
307 time_offset = -(MAXPHASE << SHIFT_UPDATE);
308 else
309 time_offset = ltemp << SHIFT_UPDATE;
310
311 /*
312 * Select whether the frequency is to be controlled
313 * and in which mode (PLL or FLL). Clamp to the operating
314 * range. Ugly multiply/divide should be replaced someday.
315 */
316
317 if (time_status & STA_FREQHOLD || time_reftime == 0)
318 time_reftime = xtime.tv_sec;
319 mtemp = xtime.tv_sec - time_reftime;
320 time_reftime = xtime.tv_sec;
321 if (time_status & STA_FLL) {
322 if (mtemp >= MINSEC) {
323 ltemp = (time_offset / mtemp) << (SHIFT_USEC -
324 SHIFT_UPDATE);
325 time_freq += shift_right(ltemp, SHIFT_KH);
326 } else /* calibration interval too short (p. 12) */
327 result = TIME_ERROR;
328 } else { /* PLL mode */
329 if (mtemp < MAXSEC) {
330 ltemp *= mtemp;
331 time_freq += shift_right(ltemp,(time_constant +
332 time_constant +
333 SHIFT_KF - SHIFT_USEC));
334 } else /* calibration interval too long (p. 12) */
335 result = TIME_ERROR;
336 }
337 time_freq = min(time_freq, time_tolerance);
338 time_freq = max(time_freq, -time_tolerance);
339 } /* STA_PLL */
340 } /* txc->modes & ADJ_OFFSET */
341 if (txc->modes & ADJ_TICK) {
342 tick_usec = txc->tick;
343 tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
344 }
345 } /* txc->modes */
346leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
347 result = TIME_ERROR;
348
349 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
350 txc->offset = save_adjust;
351 else {
352 txc->offset = shift_right(time_offset, SHIFT_UPDATE);
353 }
354 txc->freq = time_freq;
355 txc->maxerror = time_maxerror;
356 txc->esterror = time_esterror;
357 txc->status = time_status;
358 txc->constant = time_constant;
359 txc->precision = time_precision;
360 txc->tolerance = time_tolerance;
361 txc->tick = tick_usec;
362
363 /* PPS is not implemented, so these are zero */
364 txc->ppsfreq = 0;
365 txc->jitter = 0;
366 txc->shift = 0;
367 txc->stabil = 0;
368 txc->jitcnt = 0;
369 txc->calcnt = 0;
370 txc->errcnt = 0;
371 txc->stbcnt = 0;
372 write_sequnlock_irq(&xtime_lock);
373 do_gettimeofday(&txc->time);
374 notify_arch_cmos_timer();
375 return(result);
376}
377
378asmlinkage long sys_adjtimex(struct timex __user *txc_p) 205asmlinkage long sys_adjtimex(struct timex __user *txc_p)
379{ 206{
380 struct timex txc; /* Local copy of parameter */ 207 struct timex txc; /* Local copy of parameter */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e1dfd8e86c..61a3907d16 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1 @@
obj-y += clocksource.o jiffies.o obj-y += ntp.o clocksource.o jiffies.o
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 0000000000..47195fa0ec
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,350 @@
1/*
2 * linux/kernel/time/ntp.c
3 *
4 * NTP state machine interfaces and logic.
5 *
6 * This code was mainly moved from kernel/timer.c and kernel/time.c
7 * Please see those files for relevant copyright info and historical
8 * changelogs.
9 */
10
11#include <linux/mm.h>
12#include <linux/time.h>
13#include <linux/timex.h>
14
15#include <asm/div64.h>
16#include <asm/timex.h>
17
18/*
19 * Timekeeping variables
20 */
21unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
22unsigned long tick_nsec; /* ACTHZ period (nsec) */
23static u64 tick_length, tick_length_base;
24
25#define MAX_TICKADJ 500 /* microsecs */
26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
27 TICK_LENGTH_SHIFT) / HZ)
28
29/*
30 * phase-lock loop variables
31 */
32/* TIME_ERROR prevents overwriting the CMOS clock */
33static int time_state = TIME_OK; /* clock synchronization status */
34int time_status = STA_UNSYNC; /* clock status bits */
35static long time_offset; /* time adjustment (ns) */
36static long time_constant = 2; /* pll time constant */
37long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
38long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
39long time_freq; /* frequency offset (scaled ppm)*/
40static long time_reftime; /* time at last adjustment (s) */
41long time_adjust;
42
43#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE)
44#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
45 (s64)CLOCK_TICK_RATE)
46
47static void ntp_update_frequency(void)
48{
49 tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
50 tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
51 tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52
53 do_div(tick_length_base, HZ);
54
55 tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
56}
57
58/**
59 * ntp_clear - Clears the NTP state variables
60 *
61 * Must be called while holding a write on the xtime_lock
62 */
63void ntp_clear(void)
64{
65 time_adjust = 0; /* stop active adjtime() */
66 time_status |= STA_UNSYNC;
67 time_maxerror = NTP_PHASE_LIMIT;
68 time_esterror = NTP_PHASE_LIMIT;
69
70 ntp_update_frequency();
71
72 tick_length = tick_length_base;
73 time_offset = 0;
74}
75
76/*
77 * this routine handles the overflow of the microsecond field
78 *
79 * The tricky bits of code to handle the accurate clock support
80 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
81 * They were originally developed for SUN and DEC kernels.
82 * All the kudos should go to Dave for this stuff.
83 */
84void second_overflow(void)
85{
86 long time_adj;
87
88 /* Bump the maxerror field */
89 time_maxerror += MAXFREQ >> SHIFT_USEC;
90 if (time_maxerror > NTP_PHASE_LIMIT) {
91 time_maxerror = NTP_PHASE_LIMIT;
92 time_status |= STA_UNSYNC;
93 }
94
95 /*
96 * Leap second processing. If in leap-insert state at the end of the
97 * day, the system clock is set back one second; if in leap-delete
98 * state, the system clock is set ahead one second. The microtime()
99 * routine or external clock driver will insure that reported time is
100 * always monotonic. The ugly divides should be replaced.
101 */
102 switch (time_state) {
103 case TIME_OK:
104 if (time_status & STA_INS)
105 time_state = TIME_INS;
106 else if (time_status & STA_DEL)
107 time_state = TIME_DEL;
108 break;
109 case TIME_INS:
110 if (xtime.tv_sec % 86400 == 0) {
111 xtime.tv_sec--;
112 wall_to_monotonic.tv_sec++;
113 /*
114 * The timer interpolator will make time change
115 * gradually instead of an immediate jump by one second
116 */
117 time_interpolator_update(-NSEC_PER_SEC);
118 time_state = TIME_OOP;
119 clock_was_set();
120 printk(KERN_NOTICE "Clock: inserting leap second "
121 "23:59:60 UTC\n");
122 }
123 break;
124 case TIME_DEL:
125 if ((xtime.tv_sec + 1) % 86400 == 0) {
126 xtime.tv_sec++;
127 wall_to_monotonic.tv_sec--;
128 /*
129 * Use of time interpolator for a gradual change of
130 * time
131 */
132 time_interpolator_update(NSEC_PER_SEC);
133 time_state = TIME_WAIT;
134 clock_was_set();
135 printk(KERN_NOTICE "Clock: deleting leap second "
136 "23:59:59 UTC\n");
137 }
138 break;
139 case TIME_OOP:
140 time_state = TIME_WAIT;
141 break;
142 case TIME_WAIT:
143 if (!(time_status & (STA_INS | STA_DEL)))
144 time_state = TIME_OK;
145 }
146
147 /*
148 * Compute the phase adjustment for the next second. The offset is
149 * reduced by a fixed factor times the time constant.
150 */
151 tick_length = tick_length_base;
152 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
153 time_offset -= time_adj;
154 tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
155
156 if (unlikely(time_adjust)) {
157 if (time_adjust > MAX_TICKADJ) {
158 time_adjust -= MAX_TICKADJ;
159 tick_length += MAX_TICKADJ_SCALED;
160 } else if (time_adjust < -MAX_TICKADJ) {
161 time_adjust += MAX_TICKADJ;
162 tick_length -= MAX_TICKADJ_SCALED;
163 } else {
164 time_adjust = 0;
165 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
166 HZ) << TICK_LENGTH_SHIFT;
167 }
168 }
169}
170
171/*
172 * Return how long ticks are at the moment, that is, how much time
173 * update_wall_time_one_tick will add to xtime next time we call it
174 * (assuming no calls to do_adjtimex in the meantime).
175 * The return value is in fixed-point nanoseconds shifted by the
176 * specified number of bits to the right of the binary point.
177 * This function has no side-effects.
178 */
179u64 current_tick_length(void)
180{
181 return tick_length;
182}
183
184
185void __attribute__ ((weak)) notify_arch_cmos_timer(void)
186{
187 return;
188}
189
190/* adjtimex mainly allows reading (and writing, if superuser) of
191 * kernel time-keeping variables. used by xntpd.
192 */
193int do_adjtimex(struct timex *txc)
194{
195 long ltemp, mtemp, save_adjust;
196 s64 freq_adj, temp64;
197 int result;
198
199 /* In order to modify anything, you gotta be super-user! */
200 if (txc->modes && !capable(CAP_SYS_TIME))
201 return -EPERM;
202
203 /* Now we validate the data before disabling interrupts */
204
205 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
206 /* singleshot must not be used with any other mode bits */
207 if (txc->modes != ADJ_OFFSET_SINGLESHOT)
208 return -EINVAL;
209
210 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
211 /* adjustment Offset limited to +- .512 seconds */
212 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
213 return -EINVAL;
214
215 /* if the quartz is off by more than 10% something is VERY wrong ! */
216 if (txc->modes & ADJ_TICK)
217 if (txc->tick < 900000/USER_HZ ||
218 txc->tick > 1100000/USER_HZ)
219 return -EINVAL;
220
221 write_seqlock_irq(&xtime_lock);
222 result = time_state; /* mostly `TIME_OK' */
223
224 /* Save for later - semantics of adjtime is to return old value */
225 save_adjust = time_adjust;
226
227#if 0 /* STA_CLOCKERR is never set yet */
228 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
229#endif
230 /* If there are input parameters, then process them */
231 if (txc->modes)
232 {
233 if (txc->modes & ADJ_STATUS) /* only set allowed bits */
234 time_status = (txc->status & ~STA_RONLY) |
235 (time_status & STA_RONLY);
236
237 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
238 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
239 result = -EINVAL;
240 goto leave;
241 }
242 time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
243 }
244
245 if (txc->modes & ADJ_MAXERROR) {
246 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
247 result = -EINVAL;
248 goto leave;
249 }
250 time_maxerror = txc->maxerror;
251 }
252
253 if (txc->modes & ADJ_ESTERROR) {
254 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
255 result = -EINVAL;
256 goto leave;
257 }
258 time_esterror = txc->esterror;
259 }
260
261 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
262 if (txc->constant < 0) { /* NTP v4 uses values > 6 */
263 result = -EINVAL;
264 goto leave;
265 }
266 time_constant = min(txc->constant + 4, (long)MAXTC);
267 }
268
269 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
270 if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
271 /* adjtime() is independent from ntp_adjtime() */
272 time_adjust = txc->offset;
273 }
274 else if (time_status & STA_PLL) {
275 ltemp = txc->offset * NSEC_PER_USEC;
276
277 /*
278 * Scale the phase adjustment and
279 * clamp to the operating range.
280 */
281 time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC);
282 time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC);
283
284 /*
285 * Select whether the frequency is to be controlled
286 * and in which mode (PLL or FLL). Clamp to the operating
287 * range. Ugly multiply/divide should be replaced someday.
288 */
289
290 if (time_status & STA_FREQHOLD || time_reftime == 0)
291 time_reftime = xtime.tv_sec;
292 mtemp = xtime.tv_sec - time_reftime;
293 time_reftime = xtime.tv_sec;
294
295 freq_adj = (s64)time_offset * mtemp;
296 freq_adj = shift_right(freq_adj, time_constant * 2 +
297 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
298 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
299 temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL);
300 if (time_offset < 0) {
301 temp64 = -temp64;
302 do_div(temp64, mtemp);
303 freq_adj -= temp64;
304 } else {
305 do_div(temp64, mtemp);
306 freq_adj += temp64;
307 }
308 }
309 freq_adj += time_freq;
310 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
311 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
312 time_offset = (time_offset / HZ) << SHIFT_UPDATE;
313 } /* STA_PLL */
314 } /* txc->modes & ADJ_OFFSET */
315 if (txc->modes & ADJ_TICK)
316 tick_usec = txc->tick;
317
318 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
319 ntp_update_frequency();
320 } /* txc->modes */
321leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
322 result = TIME_ERROR;
323
324 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
325 txc->offset = save_adjust;
326 else
327 txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
328 txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
329 txc->maxerror = time_maxerror;
330 txc->esterror = time_esterror;
331 txc->status = time_status;
332 txc->constant = time_constant;
333 txc->precision = 1;
334 txc->tolerance = MAXFREQ;
335 txc->tick = tick_usec;
336
337 /* PPS is not implemented, so these are zero */
338 txc->ppsfreq = 0;
339 txc->jitter = 0;
340 txc->shift = 0;
341 txc->stabil = 0;
342 txc->jitcnt = 0;
343 txc->calcnt = 0;
344 txc->errcnt = 0;
345 txc->stbcnt = 0;
346 write_sequnlock_irq(&xtime_lock);
347 do_gettimeofday(&txc->time);
348 notify_arch_cmos_timer();
349 return(result);
350}
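tick_length_base in the new ntp.c is fixed-point: nanoseconds per tick shifted left by TICK_LENGTH_SHIFT, rebuilt by ntp_update_frequency() from tick_usec, CLOCK_TICK_ADJUST and time_freq. As an illustrative example with numbers chosen here rather than taken from the patch (USER_HZ=100 so tick_usec=10000, HZ=250, time_freq=0, CLOCK_TICK_ADJUST ignored):

    /* Illustrative arithmetic only, mirroring ntp_update_frequency(): */
    u64 base;
    base = (u64)(10000 * NSEC_PER_USEC * 100) << TICK_LENGTH_SHIFT; /* 1e9 ns, scaled */
    do_div(base, 250);                    /* HZ=250 -> 4,000,000 ns << TICK_LENGTH_SHIFT */
    /* tick_nsec = base >> TICK_LENGTH_SHIFT = 4,000,000 ns, i.e. 4 ms per tick */

second_overflow() then nudges tick_length away from this base each second by the scaled phase term (time_adj) and any outstanding adjtime() adjustment.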
diff --git a/kernel/timer.c b/kernel/timer.c
index 4f55622b0d..c1c7fbcffe 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -41,12 +41,6 @@
41#include <asm/timex.h> 41#include <asm/timex.h>
42#include <asm/io.h> 42#include <asm/io.h>
43 43
44#ifdef CONFIG_TIME_INTERPOLATION
45static void time_interpolator_update(long delta_nsec);
46#else
47#define time_interpolator_update(x)
48#endif
49
50u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 44u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
51 45
52EXPORT_SYMBOL(jiffies_64); 46EXPORT_SYMBOL(jiffies_64);
@@ -568,12 +562,6 @@ found:
568 562
569/******************************************************************/ 563/******************************************************************/
570 564
571/*
572 * Timekeeping variables
573 */
574unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
575unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
576
577/* 565/*
578 * The current time 566 * The current time
579 * wall_to_monotonic is what we need to add to xtime (or xtime corrected 567 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
@@ -587,209 +575,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
587 575
588EXPORT_SYMBOL(xtime); 576EXPORT_SYMBOL(xtime);
589 577
590/* Don't completely fail for HZ > 500. */
591int tickadj = 500/HZ ? : 1; /* microsecs */
592
593
594/*
595 * phase-lock loop variables
596 */
597/* TIME_ERROR prevents overwriting the CMOS clock */
598int time_state = TIME_OK; /* clock synchronization status */
599int time_status = STA_UNSYNC; /* clock status bits */
600long time_offset; /* time adjustment (us) */
601long time_constant = 2; /* pll time constant */
602long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
603long time_precision = 1; /* clock precision (us) */
604long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
605long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
606long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
607 /* frequency offset (scaled ppm)*/
608static long time_adj; /* tick adjust (scaled 1 / HZ) */
609long time_reftime; /* time at last adjustment (s) */
610long time_adjust;
611long time_next_adjust;
612
613/*
614 * this routine handles the overflow of the microsecond field
615 *
616 * The tricky bits of code to handle the accurate clock support
617 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
618 * They were originally developed for SUN and DEC kernels.
619 * All the kudos should go to Dave for this stuff.
620 *
621 */
622static void second_overflow(void)
623{
624 long ltemp;
625
626 /* Bump the maxerror field */
627 time_maxerror += time_tolerance >> SHIFT_USEC;
628 if (time_maxerror > NTP_PHASE_LIMIT) {
629 time_maxerror = NTP_PHASE_LIMIT;
630 time_status |= STA_UNSYNC;
631 }
632
633 /*
634 * Leap second processing. If in leap-insert state at the end of the
635 * day, the system clock is set back one second; if in leap-delete
636 * state, the system clock is set ahead one second. The microtime()
637 * routine or external clock driver will insure that reported time is
638 * always monotonic. The ugly divides should be replaced.
639 */
640 switch (time_state) {
641 case TIME_OK:
642 if (time_status & STA_INS)
643 time_state = TIME_INS;
644 else if (time_status & STA_DEL)
645 time_state = TIME_DEL;
646 break;
647 case TIME_INS:
648 if (xtime.tv_sec % 86400 == 0) {
649 xtime.tv_sec--;
650 wall_to_monotonic.tv_sec++;
651 /*
652 * The timer interpolator will make time change
653 * gradually instead of an immediate jump by one second
654 */
655 time_interpolator_update(-NSEC_PER_SEC);
656 time_state = TIME_OOP;
657 clock_was_set();
658 printk(KERN_NOTICE "Clock: inserting leap second "
659 "23:59:60 UTC\n");
660 }
661 break;
662 case TIME_DEL:
663 if ((xtime.tv_sec + 1) % 86400 == 0) {
664 xtime.tv_sec++;
665 wall_to_monotonic.tv_sec--;
666 /*
667 * Use of time interpolator for a gradual change of
668 * time
669 */
670 time_interpolator_update(NSEC_PER_SEC);
671 time_state = TIME_WAIT;
672 clock_was_set();
673 printk(KERN_NOTICE "Clock: deleting leap second "
674 "23:59:59 UTC\n");
675 }
676 break;
677 case TIME_OOP:
678 time_state = TIME_WAIT;
679 break;
680 case TIME_WAIT:
681 if (!(time_status & (STA_INS | STA_DEL)))
682 time_state = TIME_OK;
683 }
684
685 /*
686 * Compute the phase adjustment for the next second. In PLL mode, the
687 * offset is reduced by a fixed factor times the time constant. In FLL
688 * mode the offset is used directly. In either mode, the maximum phase
689 * adjustment for each second is clamped so as to spread the adjustment
690 * over not more than the number of seconds between updates.
691 */
692 ltemp = time_offset;
693 if (!(time_status & STA_FLL))
694 ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
695 ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
696 ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
697 time_offset -= ltemp;
698 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
699
700 /*
701 * Compute the frequency estimate and additional phase adjustment due
702 * to frequency error for the next second.
703 */
704 ltemp = time_freq;
705 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
706
707#if HZ == 100
708 /*
709 * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
710 * get 128.125; => only 0.125% error (p. 14)
711 */
712 time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
713#endif
714#if HZ == 250
715 /*
716 * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
717 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
718 */
719 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
720#endif
721#if HZ == 1000
722 /*
723 * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
724 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
725 */
726 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
727#endif
728}
729
730/*
731 * Returns how many microseconds we need to add to xtime this tick
732 * in doing an adjustment requested with adjtime.
733 */
734static long adjtime_adjustment(void)
735{
736 long time_adjust_step;
737
738 time_adjust_step = time_adjust;
739 if (time_adjust_step) {
740 /*
741 * We are doing an adjtime thing. Prepare time_adjust_step to
742 * be within bounds. Note that a positive time_adjust means we
743 * want the clock to run faster.
744 *
745 * Limit the amount of the step to be in the range
746 * -tickadj .. +tickadj
747 */
748 time_adjust_step = min(time_adjust_step, (long)tickadj);
749 time_adjust_step = max(time_adjust_step, (long)-tickadj);
750 }
751 return time_adjust_step;
752}
753
754/* in the NTP reference this is called "hardclock()" */
755static void update_ntp_one_tick(void)
756{
757 long time_adjust_step;
758
759 time_adjust_step = adjtime_adjustment();
760 if (time_adjust_step)
761 /* Reduce by this step the amount of time left */
762 time_adjust -= time_adjust_step;
763
764 /* Changes by adjtime() do not take effect till next tick. */
765 if (time_next_adjust != 0) {
766 time_adjust = time_next_adjust;
767 time_next_adjust = 0;
768 }
769}
770
771/*
772 * Return how long ticks are at the moment, that is, how much time
773 * update_wall_time_one_tick will add to xtime next time we call it
774 * (assuming no calls to do_adjtimex in the meantime).
775 * The return value is in fixed-point nanoseconds shifted by the
776 * specified number of bits to the right of the binary point.
777 * This function has no side-effects.
778 */
779u64 current_tick_length(void)
780{
781 long delta_nsec;
782 u64 ret;
783
784 /* calculate the finest interval NTP will allow.
785 * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
786 */
787 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
788 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
789 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
790
791 return ret;
792}
793 578
794/* XXX - all of this timekeeping code should be later moved to time.c */ 579/* XXX - all of this timekeeping code should be later moved to time.c */
795#include <linux/clocksource.h> 580#include <linux/clocksource.h>
@@ -966,10 +751,13 @@ void __init timekeeping_init(void)
966 unsigned long flags; 751 unsigned long flags;
967 752
968 write_seqlock_irqsave(&xtime_lock, flags); 753 write_seqlock_irqsave(&xtime_lock, flags);
754
755 ntp_clear();
756
969 clock = clocksource_get_next(); 757 clock = clocksource_get_next();
970 clocksource_calculate_interval(clock, tick_nsec); 758 clocksource_calculate_interval(clock, tick_nsec);
971 clock->cycle_last = clocksource_read(clock); 759 clock->cycle_last = clocksource_read(clock);
972 ntp_clear(); 760
973 write_sequnlock_irqrestore(&xtime_lock, flags); 761 write_sequnlock_irqrestore(&xtime_lock, flags);
974} 762}
975 763
@@ -980,7 +768,7 @@ static int timekeeping_suspended;
980 * @dev: unused 768 * @dev: unused
981 * 769 *
982 * This is for the generic clocksource timekeeping. 770 * This is for the generic clocksource timekeeping.
983 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are 771 * xtime/wall_to_monotonic/jiffies/etc are
984 * still managed by arch specific suspend/resume code. 772 * still managed by arch specific suspend/resume code.
985 */ 773 */
986static int timekeeping_resume(struct sys_device *dev) 774static int timekeeping_resume(struct sys_device *dev)
@@ -1149,8 +937,6 @@ static void update_wall_time(void)
1149 /* interpolator bits */ 937 /* interpolator bits */
1150 time_interpolator_update(clock->xtime_interval 938 time_interpolator_update(clock->xtime_interval
1151 >> clock->shift); 939 >> clock->shift);
1152 /* increment the NTP state machine */
1153 update_ntp_one_tick();
1154 940
1155 /* accumulate error between NTP and clock interval */ 941 /* accumulate error between NTP and clock interval */
1156 clock->error += current_tick_length(); 942 clock->error += current_tick_length();
@@ -1230,9 +1016,6 @@ static inline void calc_load(unsigned long ticks)
1230 } 1016 }
1231} 1017}
1232 1018
1233/* jiffies at the most recent update of wall time */
1234unsigned long wall_jiffies = INITIAL_JIFFIES;
1235
1236/* 1019/*
1237 * This read-write spinlock protects us from races in SMP while 1020 * This read-write spinlock protects us from races in SMP while
1238 * playing with xtime and avenrun. 1021 * playing with xtime and avenrun.
@@ -1270,7 +1053,6 @@ void run_local_timers(void)
1270 */ 1053 */
1271static inline void update_times(unsigned long ticks) 1054static inline void update_times(unsigned long ticks)
1272{ 1055{
1273 wall_jiffies += ticks;
1274 update_wall_time(); 1056 update_wall_time();
1275 calc_load(ticks); 1057 calc_load(ticks);
1276} 1058}
@@ -1775,7 +1557,7 @@ unsigned long time_interpolator_get_offset(void)
1775#define INTERPOLATOR_ADJUST 65536 1557#define INTERPOLATOR_ADJUST 65536
1776#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST 1558#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
1777 1559
1778static void time_interpolator_update(long delta_nsec) 1560void time_interpolator_update(long delta_nsec)
1779{ 1561{
1780 u64 counter; 1562 u64 counter;
1781 unsigned long offset; 1563 unsigned long offset;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 0000000000..db443221ba
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,124 @@
1/*
2 * tsacct.c - System accounting over taskstats interface
3 *
4 * Copyright (C) Jay Lan, <jlan@sgi.com>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/tsacct_kern.h>
22#include <linux/acct.h>
23#include <linux/jiffies.h>
24
25
26#define USEC_PER_TICK (USEC_PER_SEC/HZ)
27/*
28 * fill in basic accounting fields
29 */
30void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
31{
32 struct timespec uptime, ts;
33 s64 ac_etime;
34
35 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
36
37 /* calculate task elapsed time in timespec */
38 do_posix_clock_monotonic_gettime(&uptime);
39 ts = timespec_sub(uptime, current->group_leader->start_time);
40 /* rebase elapsed time to usec */
41 ac_etime = timespec_to_ns(&ts);
42 do_div(ac_etime, NSEC_PER_USEC);
43 stats->ac_etime = ac_etime;
44 stats->ac_btime = xtime.tv_sec - ts.tv_sec;
45 if (thread_group_leader(tsk)) {
46 stats->ac_exitcode = tsk->exit_code;
47 if (tsk->flags & PF_FORKNOEXEC)
48 stats->ac_flag |= AFORK;
49 }
50 if (tsk->flags & PF_SUPERPRIV)
51 stats->ac_flag |= ASU;
52 if (tsk->flags & PF_DUMPCORE)
53 stats->ac_flag |= ACORE;
54 if (tsk->flags & PF_SIGNALED)
55 stats->ac_flag |= AXSIG;
56 stats->ac_nice = task_nice(tsk);
57 stats->ac_sched = tsk->policy;
58 stats->ac_uid = tsk->uid;
59 stats->ac_gid = tsk->gid;
60 stats->ac_pid = tsk->pid;
61 stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0;
62 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
63 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
64 stats->ac_minflt = tsk->min_flt;
65 stats->ac_majflt = tsk->maj_flt;
66
67 strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
68}
69
70
71#ifdef CONFIG_TASK_XACCT
72
73#define KB 1024
74#define MB (1024*KB)
75/*
76 * fill in extended accounting fields
77 */
78void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
79{
80 /* convert pages-jiffies to Mbyte-usec */
81 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
82 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
83 if (p->mm) {
84 /* adjust to KB unit */
85 stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB;
86 stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB;
87 }
88 stats->read_char = p->rchar;
89 stats->write_char = p->wchar;
90 stats->read_syscalls = p->syscr;
91 stats->write_syscalls = p->syscw;
92}
93#undef KB
94#undef MB
95
96/**
97 * acct_update_integrals - update mm integral fields in task_struct
98 * @tsk: task_struct for accounting
99 */
100void acct_update_integrals(struct task_struct *tsk)
101{
102 if (likely(tsk->mm)) {
103 long delta = cputime_to_jiffies(
104 cputime_sub(tsk->stime, tsk->acct_stimexpd));
105
106 if (delta == 0)
107 return;
108 tsk->acct_stimexpd = tsk->stime;
109 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
110 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
111 }
112}
113
114/**
115 * acct_clear_integrals - clear the mm integral fields in task_struct
116 * @tsk: task_struct whose accounting fields are cleared
117 */
118void acct_clear_integrals(struct task_struct *tsk)
119{
120 tsk->acct_stimexpd = 0;
121 tsk->acct_rss_mem1 = 0;
122 tsk->acct_vm_mem1 = 0;
123}
124#endif
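
A worked example of the unit flow in acct_update_integrals() and xacct_add_tsk() above: the integral accumulates RSS-in-pages times system-time deltas in jiffies, and is later rebased to MB-usec. The numbers, the HZ=1000 assumption, the 4 KiB page size, and the variable names are made up for illustration; this is an ordinary userspace program, not kernel code.

#include <stdio.h>

int main(void)
{
	/* assumptions for illustration: HZ = 1000, PAGE_SIZE = 4 KiB */
	unsigned long hz = 1000, page_size = 4096;
	unsigned long long rss_integral = 0;	/* stands in for acct_rss_mem1 */

	/* acct_update_integrals(): accumulate RSS (pages) * delta (jiffies) */
	rss_integral += 2000ULL * 50;		/* 2000 pages for 50 jiffies */

	/* xacct_add_tsk(): rebase page-jiffies to MB-usec */
	unsigned long long coremem =
		rss_integral * (1000000UL / hz)	/* jiffies_to_usecs() */
		* page_size / (1024 * 1024);	/* pages to MB */

	printf("coremem = %llu MB-usec\n", coremem);	/* prints 390625 */
	return 0;
}
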
diff --git a/kernel/utsname.c b/kernel/utsname.c
new file mode 100644
index 0000000000..c859164a69
--- /dev/null
+++ b/kernel/utsname.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (C) 2004 IBM Corporation
3 *
4 * Author: Serge Hallyn <serue@us.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */
11
12#include <linux/module.h>
13#include <linux/uts.h>
14#include <linux/utsname.h>
15#include <linux/version.h>
16
17/*
18 * Clone a new ns copying an original utsname, setting refcount to 1
19 * @old_ns: namespace to clone
20 * Return NULL on error (failure to kmalloc), new ns otherwise
21 */
22static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
23{
24 struct uts_namespace *ns;
25
26 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
27 if (ns) {
28 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
29 kref_init(&ns->kref);
30 }
31 return ns;
32}
33
34/*
35 * unshare the current process' utsname namespace.
36 * called only in sys_unshare()
37 */
38int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
39{
40 if (unshare_flags & CLONE_NEWUTS) {
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
45 if (!*new_uts)
46 return -ENOMEM;
47 }
48
49 return 0;
50}
51
52/*
53 * Copy task tsk's utsname namespace, or clone it if flags
54 * specifies CLONE_NEWUTS. In the latter case, changes to the
55 * utsname of this process won't be seen by the parent, and
56 * vice versa.
57 */
58int copy_utsname(int flags, struct task_struct *tsk)
59{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns;
62 int err = 0;
63
64 if (!old_ns)
65 return 0;
66
67 get_uts_ns(old_ns);
68
69 if (!(flags & CLONE_NEWUTS))
70 return 0;
71
72 if (!capable(CAP_SYS_ADMIN)) {
73 err = -EPERM;
74 goto out;
75 }
76
77 new_ns = clone_uts_ns(old_ns);
78 if (!new_ns) {
79 err = -ENOMEM;
80 goto out;
81 }
82 tsk->nsproxy->uts_ns = new_ns;
83
84out:
85 put_uts_ns(old_ns);
86 return err;
87}
88
89void free_uts_ns(struct kref *kref)
90{
91 struct uts_namespace *ns;
92
93 ns = container_of(kref, struct uts_namespace, kref);
94 kfree(ns);
95}
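
free_uts_ns() above is the kref release callback; the get_uts_ns()/put_uts_ns() helpers called from copy_utsname() live outside this file. A sketch of how such helpers would wrap kref, assuming they follow the usual pattern (the real definitions belong in include/linux/utsname.h and may differ):

#include <linux/kref.h>
#include <linux/utsname.h>

/* Assumed sketch, not taken from this patch. */
static inline void get_uts_ns(struct uts_namespace *ns)
{
	kref_get(&ns->kref);
}

static inline void put_uts_ns(struct uts_namespace *ns)
{
	/* dropping the last reference frees the namespace via free_uts_ns() */
	kref_put(&ns->kref, free_uts_ns);
}
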