author     Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>  2008-02-17 21:51:42 -0500
committer  Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>  2008-02-17 21:51:42 -0500
commit     c58310bf4933986513020fa90b4190c7492995ae (patch)
tree       143f2c7578d02ebef5db8fc57ae69e951ae0e2ee /kernel
parent     269cdfaf769f5cd831284cc831790c7c5038040f (diff)
parent     1309d4e68497184d2fd87e892ddf14076c2bda98 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 18
-rw-r--r--  kernel/audit.c | 12
-rw-r--r--  kernel/audit_tree.c | 28
-rw-r--r--  kernel/auditfilter.c | 15
-rw-r--r--  kernel/auditsc.c | 28
-rw-r--r--  kernel/capability.c | 113
-rw-r--r--  kernel/cgroup.c | 318
-rw-r--r--  kernel/compat.c | 43
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/cpuset.c | 413
-rw-r--r--  kernel/exit.c | 361
-rw-r--r--  kernel/fork.c | 65
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/hrtimer.c | 112
-rw-r--r--  kernel/irq/chip.c | 46
-rw-r--r--  kernel/irq/handle.c | 4
-rw-r--r--  kernel/itimer.c | 2
-rw-r--r--  kernel/kallsyms.c | 11
-rw-r--r--  kernel/kexec.c | 18
-rw-r--r--  kernel/kmod.c | 5
-rw-r--r--  kernel/kprobes.c | 9
-rw-r--r--  kernel/latency.c | 280
-rw-r--r--  kernel/marker.c | 677
-rw-r--r--  kernel/module.c | 38
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex.c | 29
-rw-r--r--  kernel/notifier.c | 1
-rw-r--r--  kernel/nsproxy.c | 1
-rw-r--r--  kernel/panic.c | 5
-rw-r--r--  kernel/params.c | 22
-rw-r--r--  kernel/pid.c | 209
-rw-r--r--  kernel/pid_namespace.c | 197
-rw-r--r--  kernel/pm_qos_params.c | 425
-rw-r--r--  kernel/posix-cpu-timers.c | 8
-rw-r--r--  kernel/posix-timers.c | 36
-rw-r--r--  kernel/power/Kconfig | 9
-rw-r--r--  kernel/power/disk.c | 4
-rw-r--r--  kernel/power/snapshot.c | 4
-rw-r--r--  kernel/printk.c | 52
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 22
-rw-r--r--  kernel/rcupdate.c | 5
-rw-r--r--  kernel/relay.c | 24
-rw-r--r--  kernel/res_counter.c | 134
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex-debug.c | 12
-rw-r--r--  kernel/rtmutex.c | 5
-rw-r--r--  kernel/rtmutex_common.h | 2
-rw-r--r--  kernel/sched.c | 510
-rw-r--r--  kernel/sched_rt.c | 102
-rw-r--r--  kernel/signal.c | 254
-rw-r--r--  kernel/softirq.c | 8
-rw-r--r--  kernel/srcu.c | 3
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 94
-rw-r--r--  kernel/sys_ni.c | 7
-rw-r--r--  kernel/sysctl.c | 131
-rw-r--r--  kernel/sysctl_check.c | 158
-rw-r--r--  kernel/test_kprobes.c | 16
-rw-r--r--  kernel/time.c | 46
-rw-r--r--  kernel/time/clockevents.c | 13
-rw-r--r--  kernel/time/clocksource.c | 20
-rw-r--r--  kernel/time/ntp.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 6
-rw-r--r--  kernel/timeconst.pl | 402
-rw-r--r--  kernel/timer.c | 18
-rw-r--r--  kernel/user.c | 60
-rw-r--r--  kernel/user_namespace.c | 13
-rw-r--r--  kernel/wait.c | 26
-rw-r--r--  kernel/workqueue.c | 12
73 files changed, 3711 insertions, 2034 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index f2ab70073bd4..ab4f1090f437 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -3,3 +3,4 @@
 #
 config_data.h
 config_data.gz
+timeconst.h
diff --git a/kernel/Makefile b/kernel/Makefile
index db9af707ff5b..6c584c55a6e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,12 +4,12 @@
 
 obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
	    exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
+	    sysctl.o capability.o ptrace.o timer.o user.o \
	    signal.o sys.o kmod.o workqueue.o pid.o \
	    rcupdate.o extable.o params.o posix-timers.o \
	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-	    utsname.o notifier.o ksysfs.o
+	    hrtimer.o rwsem.o nsproxy.o srcu.o \
+	    notifier.o ksysfs.o pm_qos_params.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -42,7 +42,11 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_USER_NS) += user_namespace.o
+obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -87,3 +91,11 @@ quiet_cmd_ikconfiggz = IKCFG $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
	$(call if_changed,ikconfiggz)
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_timeconst = TIMEC $@
+      cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+targets += timeconst.h
+$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
+	$(call if_changed,timeconst)
diff --git a/kernel/audit.c b/kernel/audit.c
index c8555b180213..2eeea9a14240 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1312,26 +1312,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct dentry *dentry, struct vfsmount *vfsmnt)
+		      struct path *path)
 {
-	char *p, *path;
+	char *p, *pathname;
 
 	if (prefix)
 		audit_log_format(ab, " %s", prefix);
 
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
-	path = kmalloc(PATH_MAX+11, ab->gfp_mask);
-	if (!path) {
+	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+	if (!pathname) {
 		audit_log_format(ab, "<no memory>");
 		return;
 	}
-	p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+	p = d_path(path, pathname, PATH_MAX+11);
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here? */
 		audit_log_format(ab, "<too long>");
 	} else
 		audit_log_untrustedstring(ab, p);
-	kfree(path);
+	kfree(pathname);
 }
 
 /**
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f4fcf58f20f8..9ef5e0aacc3c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -549,8 +549,8 @@ void audit_trim_trees(void)
 		if (err)
 			goto skip_it;
 
-		root_mnt = collect_mounts(nd.mnt, nd.dentry);
-		path_release(&nd);
+		root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+		path_put(&nd.path);
 		if (!root_mnt)
 			goto skip_it;
 
@@ -583,17 +583,17 @@ skip_it:
 static int is_under(struct vfsmount *mnt, struct dentry *dentry,
 		    struct nameidata *nd)
 {
-	if (mnt != nd->mnt) {
+	if (mnt != nd->path.mnt) {
 		for (;;) {
 			if (mnt->mnt_parent == mnt)
 				return 0;
-			if (mnt->mnt_parent == nd->mnt)
+			if (mnt->mnt_parent == nd->path.mnt)
 				break;
 			mnt = mnt->mnt_parent;
 		}
 		dentry = mnt->mnt_mountpoint;
 	}
-	return is_subdir(dentry, nd->dentry);
+	return is_subdir(dentry, nd->path.dentry);
 }
 
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	err = path_lookup(tree->pathname, 0, &nd);
 	if (err)
 		goto Err;
-	mnt = collect_mounts(nd.mnt, nd.dentry);
-	path_release(&nd);
+	mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+	path_put(&nd.path);
 	if (!mnt) {
 		err = -ENOMEM;
 		goto Err;
@@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new)
 	err = path_lookup(new, 0, &nd);
 	if (err)
 		return err;
-	tagged = collect_mounts(nd.mnt, nd.dentry);
-	path_release(&nd);
+	tagged = collect_mounts(nd.path.mnt, nd.path.dentry);
+	path_put(&nd.path);
 	if (!tagged)
 		return -ENOMEM;
 
@@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new)
 		drop_collected_mounts(tagged);
 		return err;
 	}
-	mnt = mntget(nd.mnt);
-	dentry = dget(nd.dentry);
-	path_release(&nd);
+	mnt = mntget(nd.path.mnt);
+	dentry = dget(nd.path.dentry);
+	path_put(&nd.path);
 
 	if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
 		follow_up(&mnt, &dentry);
@@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new)
 		spin_lock(&vfsmount_lock);
 		if (!is_under(mnt, dentry, &nd)) {
 			spin_unlock(&vfsmount_lock);
-			path_release(&nd);
+			path_put(&nd.path);
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
 			continue;
 		}
 		spin_unlock(&vfsmount_lock);
-		path_release(&nd);
+		path_put(&nd.path);
 
 		list_for_each_entry(p, &list, mnt_list) {
 			failed = tag_chunk(p->mnt_root->d_inode, tree);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6f19fd477aac..2f2914b7cc30 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 	inotify_init_watch(&parent->wdata);
 	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
 	get_inotify_watch(&parent->wdata);
-	wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
-			       AUDIT_IN_WATCH);
+	wd = inotify_add_watch(audit_ih, &parent->wdata,
+			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
 	if (wd < 0) {
 		audit_free_parent(&parent->wdata);
 		return ERR_PTR(wd);
@@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp,
 static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
 {
 	if (ndp) {
-		path_release(ndp);
+		path_put(&ndp->path);
 		kfree(ndp);
 	}
 	if (ndw) {
-		path_release(ndw);
+		path_put(&ndw->path);
 		kfree(ndw);
 	}
 }
@@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
 
 	/* update watch filter fields */
 	if (ndw) {
-		watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
-		watch->ino = ndw->dentry->d_inode->i_ino;
+		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+		watch->ino = ndw->path.dentry->d_inode->i_ino;
 	}
 
 	/* The audit_filter_mutex must not be held during inotify calls because
@@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
 	 */
 	mutex_unlock(&audit_filter_mutex);
 
-	if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
+	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+			       &i_watch) < 0) {
 		parent = audit_init_parent(ndp);
 		if (IS_ERR(parent)) {
 			/* caller expects mutex locked */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c06ecf38d7b..ac6d9b23b018 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -208,8 +208,7 @@ struct audit_context {
 	int name_count;
 	struct audit_names names[AUDIT_NAMES];
 	char * filterkey; /* key for rule that triggered record */
-	struct dentry * pwd;
-	struct vfsmount * pwdmnt;
+	struct path pwd;
 	struct audit_context *previous; /* For nested syscalls */
 	struct audit_aux_data *aux;
 	struct audit_aux_data *aux_pids;
@@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context)
 		__putname(context->names[i].name);
 	}
 	context->name_count = 0;
-	if (context->pwd)
-		dput(context->pwd);
-	if (context->pwdmnt)
-		mntput(context->pwdmnt);
-	context->pwd = NULL;
-	context->pwdmnt = NULL;
+	path_put(&context->pwd);
+	context->pwd.dentry = NULL;
+	context->pwd.mnt = NULL;
 }
 
 static inline void audit_free_aux(struct audit_context *context)
@@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
 		if ((vma->vm_flags & VM_EXECUTABLE) &&
 		    vma->vm_file) {
 			audit_log_d_path(ab, "exe=",
-					 vma->vm_file->f_path.dentry,
-					 vma->vm_file->f_path.mnt);
+					 &vma->vm_file->f_path);
 			break;
 		}
 		vma = vma->vm_next;
@@ -1341,10 +1336,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			  context->target_sid, context->target_comm))
 			call_panic = 1;
 
-	if (context->pwd && context->pwdmnt) {
+	if (context->pwd.dentry && context->pwd.mnt) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 		if (ab) {
-			audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+			audit_log_d_path(ab, "cwd=", &context->pwd);
 			audit_log_end(ab);
 		}
 	}
@@ -1367,8 +1362,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		case 0:
 			/* name was specified as a relative path and the
 			 * directory component is the cwd */
-			audit_log_d_path(ab, " name=", context->pwd,
-					 context->pwdmnt);
+			audit_log_d_path(ab, " name=", &context->pwd);
 			break;
 		default:
 			/* log the name's directory component */
@@ -1695,10 +1689,10 @@ void __audit_getname(const char *name)
 	context->names[context->name_count].ino = (unsigned long)-1;
 	context->names[context->name_count].osid = 0;
 	++context->name_count;
-	if (!context->pwd) {
+	if (!context->pwd.dentry) {
 		read_lock(&current->fs->lock);
-		context->pwd = dget(current->fs->pwd);
-		context->pwdmnt = mntget(current->fs->pwdmnt);
+		context->pwd = current->fs->pwd;
+		path_get(&current->fs->pwd);
 		read_unlock(&current->fs->lock);
 	}
 
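The audit hunks above are all part of the same conversion: call sites that used to carry a (dentry, vfsmount) pair now carry a single struct path, and the matching dget()/mntget() and dput()/mntput() pairs collapse into path_get()/path_put(). A condensed sketch of the resulting pattern, pieced together from the hunks above rather than meant to compile on its own:

    /* sketch only: condensed from the audit hunks above (2.6.25-era APIs) */
    struct path cwd;

    read_lock(&current->fs->lock);
    cwd = current->fs->pwd;          /* copy the {mnt, dentry} pair ... */
    path_get(&current->fs->pwd);     /* ... and take one reference on both halves */
    read_unlock(&current->fs->lock);

    audit_log_d_path(ab, "cwd=", &cwd);   /* new signature takes a struct path * */

    path_put(&cwd);                  /* drops the dentry and vfsmount refs together */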
diff --git a/kernel/capability.c b/kernel/capability.c
index efbd9cdce132..39e8193b41ea 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,6 +22,37 @@
 static DEFINE_SPINLOCK(task_capability_lock);
 
 /*
+ * Leveraged for setting/resetting capabilities
+ */
+
+const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+const kernel_cap_t __cap_full_set = CAP_FULL_SET;
+const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
+
+EXPORT_SYMBOL(__cap_empty_set);
+EXPORT_SYMBOL(__cap_full_set);
+EXPORT_SYMBOL(__cap_init_eff_set);
+
+/*
+ * More recent versions of libcap are available from:
+ *
+ * http://www.kernel.org/pub/linux/libs/security/linux-privs/
+ */
+
+static void warn_legacy_capability_use(void)
+{
+	static int warned;
+	if (!warned) {
+		char name[sizeof(current->comm)];
+
+		printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
+		       " (legacy support in use)\n",
+		       get_task_comm(name, current));
+		warned = 1;
+	}
+}
+
+/*
  * For sys_getproccap() and sys_setproccap(), any of the three
  * capability set pointers may be NULL -- indicating that that set is
  * uninteresting and/or not to be changed.
@@ -42,12 +73,21 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	pid_t pid;
 	__u32 version;
 	struct task_struct *target;
-	struct __user_cap_data_struct data;
+	unsigned tocopy;
+	kernel_cap_t pE, pI, pP;
 
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
-	if (version != _LINUX_CAPABILITY_VERSION) {
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		tocopy = _LINUX_CAPABILITY_U32S_2;
+		break;
+	default:
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
@@ -71,14 +111,47 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	} else
 		target = current;
 
-	ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
+	ret = security_capget(target, &pE, &pI, &pP);
 
 out:
 	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
-	if (!ret && copy_to_user(dataptr, &data, sizeof data))
-		return -EFAULT;
+	if (!ret) {
+		struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+		unsigned i;
+
+		for (i = 0; i < tocopy; i++) {
+			kdata[i].effective = pE.cap[i];
+			kdata[i].permitted = pP.cap[i];
+			kdata[i].inheritable = pI.cap[i];
+		}
+
+		/*
+		 * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S,
+		 * we silently drop the upper capabilities here. This
+		 * has the effect of making older libcap
+		 * implementations implicitly drop upper capability
+		 * bits when they perform a: capget/modify/capset
+		 * sequence.
+		 *
+		 * This behavior is considered fail-safe
+		 * behavior. Upgrading the application to a newer
+		 * version of libcap will enable access to the newer
+		 * capabilities.
+		 *
+		 * An alternative would be to return an error here
+		 * (-ERANGE), but that causes legacy applications to
+		 * unexpectidly fail; the capget/modify/capset aborts
+		 * before modification is attempted and the application
+		 * fails.
+		 */
+
+		if (copy_to_user(dataptr, kdata, tocopy
+				 * sizeof(struct __user_cap_data_struct))) {
+			return -EFAULT;
+		}
+	}
 
 	return ret;
 }
@@ -167,6 +240,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
  */
 asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 {
+	struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
 	__u32 version;
 	struct task_struct *target;
@@ -176,7 +251,15 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
-	if (version != _LINUX_CAPABILITY_VERSION) {
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		tocopy = _LINUX_CAPABILITY_U32S_2;
+		break;
+	default:
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
@@ -188,10 +271,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
 		return -EPERM;
 
-	if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
-	    copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
-	    copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
+	if (copy_from_user(&kdata, data, tocopy
+			   * sizeof(struct __user_cap_data_struct))) {
 		return -EFAULT;
+	}
+
+	for (i = 0; i < tocopy; i++) {
+		effective.cap[i] = kdata[i].effective;
+		permitted.cap[i] = kdata[i].permitted;
+		inheritable.cap[i] = kdata[i].inheritable;
+	}
+	while (i < _LINUX_CAPABILITY_U32S) {
+		effective.cap[i] = 0;
+		permitted.cap[i] = 0;
+		inheritable.cap[i] = 0;
+		i++;
+	}
 
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
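For reference, a minimal userspace sketch (not part of the patch) of what the new 64-bit capability ABI above looks like from the caller's side: a _LINUX_CAPABILITY_VERSION_2 caller exchanges an array of two __user_cap_data_struct words per set, which is what the tocopy loops marshal, while a legacy VERSION_1 caller supplies only one word and so silently loses the upper bits, as the long comment in sys_capget() explains. It assumes <linux/capability.h> and the raw capget(2) syscall; error handling is minimal.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/capability.h>

    int main(void)
    {
            struct __user_cap_header_struct hdr;
            struct __user_cap_data_struct data[2];   /* two 32-bit words per set */

            memset(&hdr, 0, sizeof(hdr));
            hdr.version = _LINUX_CAPABILITY_VERSION_2;   /* 64-bit aware caller */
            hdr.pid = 0;                                 /* current task */

            if (syscall(SYS_capget, &hdr, data) != 0) {
                    /* an older kernel rewrites hdr.version and fails with EINVAL,
                     * mirroring the "default:" branch in sys_capget() above */
                    perror("capget");
                    return 1;
            }
            printf("effective: low=%#x high=%#x\n",
                   data[0].effective, data[1].effective);
            return 0;
    }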
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a3c23936d43..4766bb65e4d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -141,7 +141,7 @@ enum {
 	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
-inline int cgroup_is_releasable(const struct cgroup *cgrp)
+static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
 	const int bits =
 		(1 << CGRP_RELEASABLE) |
@@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp)
 	return (cgrp->flags & bits) == bits;
 }
 
-inline int notify_on_release(const struct cgroup *cgrp)
+static int notify_on_release(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding cgroup_mutex can't rely on the count
  * field not changing. However, if the count goes to zero, then only
- * attach_task() can increment it again. Because a count of zero
+ * cgroup_attach_task() can increment it again. Because a count of zero
  * means that no tasks are currently attached, therefore there is no
  * way a task attached to that cgroup can fork (the other way to
  * increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
  * The task_lock() exception
  *
  * The need for this exception arises from the action of
- * attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
  * another. It does so using cgroup_mutexe, however there are
  * several performance critical places that need to reference
  * task->cgroup without the expense of grabbing a system global
  * mutex. Therefore except as noted below, when dereferencing or, as
- * in attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cgroup pointer by attach_task()
+ * update of a tasks cgroup pointer by cgroup_attach_task()
 */
 
 /**
@@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 	return inode;
 }
 
+/*
+ * Call subsys's pre_destroy handler.
+ * This is called before css refcnt check.
+ */
+
+static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+{
+	struct cgroup_subsys *ss;
+	for_each_subsys(cgrp->root, ss)
+		if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+			ss->pre_destroy(ss, cgrp);
+	return;
+}
+
+
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
+		struct cgroup_subsys *ss;
 		BUG_ON(!(cgroup_is_removed(cgrp)));
 		/* It's possible for external users to be holding css
 		 * reference counts on a cgroup; css_put() needs to
@@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 * queue the cgroup to be handled by the release
 		 * agent */
 		synchronize_rcu();
+
+		mutex_lock(&cgroup_mutex);
+		/*
+		 * Release the subsystem state objects.
+		 */
+		for_each_subsys(cgrp->root, ss) {
+			if (cgrp->subsys[ss->subsys_id])
+				ss->destroy(ss, cgrp);
+		}
+
+		cgrp->root->number_of_cgroups--;
+		mutex_unlock(&cgroup_mutex);
+
+		/* Drop the active superblock reference that we took when we
+		 * created the cgroup */
+		deactivate_super(cgrp->root->sb);
+
 		kfree(cgrp);
 	}
 	iput(inode);
@@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
  * Call holding cgroup_mutex. May take task_lock of
  * the task 'pid' during call.
  */
-static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
 	struct cgroup_subsys *ss;
@@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
 			retval = ss->can_attach(ss, cgrp, tsk);
-			if (retval) {
+			if (retval)
 				return retval;
-			}
 		}
 	}
 
@@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * based on its final set of cgroups
 	 */
 	newcg = find_css_set(cg, cgrp);
-	if (!newcg) {
+	if (!newcg)
 		return -ENOMEM;
-	}
 
 	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
@@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
-		if (ss->attach) {
+		if (ss->attach)
 			ss->attach(ss, cgrp, oldcgrp, tsk);
-		}
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
@@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 
 	if (pid) {
 		rcu_read_lock();
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk || tsk->flags & PF_EXITING) {
 			rcu_read_unlock();
 			return -ESRCH;
@@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 		get_task_struct(tsk);
 	}
 
-	ret = attach_task(cgrp, tsk);
+	ret = cgroup_attach_task(cgrp, tsk);
 	put_task_struct(tsk);
 	return ret;
 }
@@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 		goto out1;
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
+	strstrip(buffer);	/* strip -just- trailing whitespace */
 
 	mutex_lock(&cgroup_mutex);
 
+	/*
+	 * This was already checked for in cgroup_file_write(), but
+	 * check again now we're holding cgroup_mutex.
+	 */
 	if (cgroup_is_removed(cgrp)) {
 		retval = -ENODEV;
 		goto out2;
@@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 		break;
 	case FILE_RELEASE_AGENT:
-	{
-		struct cgroupfs_root *root = cgrp->root;
-		/* Strip trailing newline */
-		if (nbytes && (buffer[nbytes-1] == '\n')) {
-			buffer[nbytes-1] = 0;
-		}
-		if (nbytes < sizeof(root->release_agent_path)) {
-			/* We never write anything other than '\0'
-			 * into the last char of release_agent_path,
-			 * so it always remains a NUL-terminated
-			 * string */
-			strncpy(root->release_agent_path, buffer, nbytes);
-			root->release_agent_path[nbytes] = 0;
-		} else {
-			retval = -ENOSPC;
-		}
+		BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+		strcpy(cgrp->root->release_agent_path, buffer);
 		break;
-	}
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (!cft)
+	if (!cft || cgroup_is_removed(cgrp))
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (!cft)
+	if (!cft || cgroup_is_removed(cgrp))
 		return -ENODEV;
 
 	if (cft->read)
@@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
 	it->task = cg->tasks.next;
 }
 
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+void cgroup_enable_task_cg_lists(void)
+{
+	struct task_struct *p, *g;
+	write_lock(&css_set_lock);
+	use_task_css_set_links = 1;
+	do_each_thread(g, p) {
+		task_lock(p);
+		if (list_empty(&p->cg_list))
+			list_add(&p->cg_list, &p->cgroups->tasks);
+		task_unlock(p);
+	} while_each_thread(g, p);
+	write_unlock(&css_set_lock);
+}
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
 	/*
@@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	 * we need to enable the list linking each css_set to its
 	 * tasks, and fix up all existing tasks.
 	 */
-	if (!use_task_css_set_links) {
-		struct task_struct *p, *g;
-		write_lock(&css_set_lock);
-		use_task_css_set_links = 1;
-		do_each_thread(g, p) {
-			task_lock(p);
-			if (list_empty(&p->cg_list))
-				list_add(&p->cg_list, &p->cgroups->tasks);
-			task_unlock(p);
-		} while_each_thread(g, p);
-		write_unlock(&css_set_lock);
-	}
+	if (!use_task_css_set_links)
+		cgroup_enable_task_cg_lists();
+
 	read_lock(&css_set_lock);
 	it->cg_link = &cgrp->css_sets;
 	cgroup_advance_iter(cgrp, it);
@@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
 	read_unlock(&css_set_lock);
 }
 
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively) simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *t1 = p1;
+	struct task_struct *t2 = p2;
+	return started_after_time(t1, &t2->start_time, t2);
+}
+
+/**
+ * cgroup_scan_tasks - iterate though all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+	int retval, i;
+	struct cgroup_iter it;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap tmp_heap;
+	struct ptr_heap *heap;
+	struct timespec latest_time = { 0, 0 };
+
+	if (scan->heap) {
+		/* The caller supplied our heap and pre-allocated its memory */
+		heap = scan->heap;
+		heap->gt = &started_after;
+	} else {
+		/* We need to allocate our own heap memory */
+		heap = &tmp_heap;
+		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+		if (retval)
+			/* cannot allocate the heap */
+			return retval;
+	}
+
+ again:
+	/*
+	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
+	 * to determine which are of interest, and using the scanner's
+	 * "process_task" callback to process any of them that need an update.
+	 * Since we don't want to hold any locks during the task updates,
+	 * gather tasks to be processed in a heap structure.
+	 * The heap is sorted by descending task start time.
+	 * If the statically-sized heap fills up, we overflow tasks that
+	 * started later, and in future iterations only consider tasks that
+	 * started after the latest task in the previous pass. This
+	 * guarantees forward progress and that we don't miss any tasks.
+	 */
+	heap->size = 0;
+	cgroup_iter_start(scan->cg, &it);
+	while ((p = cgroup_iter_next(scan->cg, &it))) {
+		/*
+		 * Only affect tasks that qualify per the caller's callback,
+		 * if he provided one
+		 */
+		if (scan->test_task && !scan->test_task(p, scan))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(heap, p);
+		if (dropped == NULL) {
+			/*
+			 * The new task was inserted; the heap wasn't
+			 * previously full
+			 */
+			get_task_struct(p);
+		} else if (dropped != p) {
+			/*
+			 * The new task was inserted, and pushed out a
+			 * different task
+			 */
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+		/*
+		 * Else the new task was newer than anything already in
+		 * the heap and wasn't inserted
+		 */
+	}
+	cgroup_iter_end(scan->cg, &it);
+
+	if (heap->size) {
+		for (i = 0; i < heap->size; i++) {
+			struct task_struct *p = heap->ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			/* Process the task per the caller's callback */
+			scan->process_task(p, scan);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't get processed.
+		 * Not the most efficient way to do it, but it avoids
+		 * having to take callback_mutex in the fork path
+		 */
+		goto again;
+	}
+	if (heap == &tmp_heap)
+		heap_free(&tmp_heap);
+	return 0;
+}
+
 /*
  * Stuff for reading the 'tasks' file.
 *
@@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
 		if (unlikely(n == npids))
 			break;
-		pidarray[n++] = task_pid_nr(tsk);
+		pidarray[n++] = task_pid_vnr(tsk);
 	}
 	cgroup_iter_end(cgrp, &it);
 	return n;
@@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp)
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
 		 * release agent to be called anyway. */
-		if (css && atomic_read(&css->refcnt)) {
+		if (css && atomic_read(&css->refcnt))
 			return 1;
-		}
 	}
 	return 0;
 }
@@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
-	struct cgroup_subsys *ss;
 	struct super_block *sb;
 	struct cgroupfs_root *root;
 
@@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	parent = cgrp->parent;
 	root = cgrp->root;
 	sb = root->sb;
+	/*
+	 * Call pre_destroy handlers of subsys
+	 */
+	cgroup_call_pre_destroy(cgrp);
+	/*
+	 * Notify subsyses that rmdir() request comes.
+	 */
 
 	if (cgroup_has_css_refs(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
 
-	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(ss, cgrp);
-	}
-
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
@@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	cgroup_d_remove_dir(d);
 	dput(d);
-	root->number_of_cgroups--;
 
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 
 	mutex_unlock(&cgroup_mutex);
-	/* Drop the active superblock reference that we took when we
-	 * created the cgroup */
-	deactivate_super(sb);
 	return 0;
 }
 
@@ -2324,7 +2514,7 @@ out:
  * - Used for /proc/<pid>/cgroup.
 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *   doesn't really matter if tsk->cgroup changes after we read it,
- *   and we take cgroup_mutex, keeping attach_task() from changing it
+ *   and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *   anyway. No need to check that tsk->cgroup != NULL, thanks to
 *   the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
 *   cgroup to top_cgroup.
@@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct(). However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. attach_task() might
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced cgroup group to be removed and freed.
 *
@@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
 * attach us to a different cgroup, decrementing the count on
 * the first cgroup that we never incremented. But in this case,
 * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any attach_task() attempts, or task is a failed
- * fork, never visible to attach_task.
+ * which wards off any cgroup_attach_task() attempts, or task is a failed
+ * fork, never visible to cgroup_attach_task.
 *
 */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	}
 
 	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = attach_task(child, tsk);
+	ret = cgroup_attach_task(child, tsk);
 	mutex_unlock(&cgroup_mutex);
 
  out_release:
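As a rough userspace illustration of the interface these cgroup.c changes sit behind (not part of the patch): writing a PID into a cgroup's "tasks" file lands in attach_task_by_pid() and, after this change, cgroup_attach_task(); the PID is looked up with find_task_by_vpid() and read back via task_pid_vnr(), so both sides are interpreted in the caller's PID namespace. The mount point and group name below are illustrative assumptions only.

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* assumes a cgroup hierarchy already mounted at /dev/cgroup
             * with an existing group "mygroup" (illustrative paths only) */
            FILE *f = fopen("/dev/cgroup/mygroup/tasks", "w");

            if (!f) {
                    perror("open tasks file");
                    return 1;
            }
            /* resolved in-kernel via find_task_by_vpid(), i.e. the PID as
             * seen in the writer's own namespace */
            fprintf(f, "%d\n", getpid());
            return fclose(f) ? 1 : 0;
    }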
diff --git a/kernel/compat.c b/kernel/compat.c
index 42a1ed4b61b1..5f0e201bcfd3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -40,10 +40,35 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
 		__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+	struct compat_timespec __user *rmtp;
+	struct timespec rmt;
+	mm_segment_t oldfs;
+	long ret;
+
+	rmtp = (struct compat_timespec __user *)(restart->arg1);
+	restart->arg1 = (unsigned long)&rmt;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = hrtimer_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if (ret) {
+		restart->arg1 = (unsigned long)rmtp;
+
+		if (rmtp && put_compat_timespec(&rmt, rmtp))
+			return -EFAULT;
+	}
+
+	return ret;
+}
+
 asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 				     struct compat_timespec __user *rmtp)
 {
 	struct timespec tu, rmt;
+	mm_segment_t oldfs;
 	long ret;
 
 	if (get_compat_timespec(&tu, rqtp))
@@ -52,11 +77,21 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 	if (!timespec_valid(&tu))
 		return -EINVAL;
 
-	ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
-				CLOCK_MONOTONIC);
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = hrtimer_nanosleep(&tu,
+				rmtp ? (struct timespec __user *)&rmt : NULL,
+				HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	set_fs(oldfs);
+
+	if (ret) {
+		struct restart_block *restart
+			= &current_thread_info()->restart_block;
+
+		restart->fn = compat_nanosleep_restart;
+		restart->arg1 = (unsigned long)rmtp;
 
-	if (ret && rmtp) {
-		if (put_compat_timespec(&rmt, rmtp))
+		if (rmtp && put_compat_timespec(&rmt, rmtp))
 			return -EFAULT;
 	}
 
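The compat wrapper above exists so that a 32-bit task interrupted mid-sleep still gets the remaining time copied back into its 32-bit rmtp, both on the first call and when the syscall is restarted; the get_fs()/set_fs(KERNEL_DS) dance is only there so the native hrtimer helpers can write into a kernel-stack timespec. The user-visible contract being preserved is plain nanosleep(2) behaviour, sketched below for the native case with standard POSIX calls only; nothing here is specific to the patch.

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static void on_alarm(int sig) { (void)sig; /* just interrupt the sleep */ }

    int main(void)
    {
            struct timespec req = { .tv_sec = 5, .tv_nsec = 0 };
            struct timespec rem = { 0, 0 };

            signal(SIGALRM, on_alarm);
            alarm(1);                 /* interrupt the 5 s sleep after ~1 s */

            if (nanosleep(&req, &rem) == -1 && errno == EINTR)
                    printf("interrupted, %ld.%09ld s left\n",
                           (long)rem.tv_sec, rem.tv_nsec);
            return 0;
    }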
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e0d3a4f56ecb..2eff3f63abed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -389,7 +389,7 @@ int disable_nonboot_cpus(void)
 	return error;
 }
 
-void enable_nonboot_cpus(void)
+void __ref enable_nonboot_cpus(void)
 {
 	int cpu, error;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..3e296ed81d4d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,7 +38,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
-#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -56,6 +55,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -64,7 +65,7 @@
  */
 int number_of_cpusets __read_mostly;
 
-/* Retrieve the cpuset from a cgroup */
+/* Forward declare cgroup structures */
 struct cgroup_subsys cpuset_subsys;
 struct cpuset;
 
@@ -96,6 +97,9 @@ struct cpuset {
 
 	/* partition number for rebuild_sched_domains() */
 	int pn;
+
+	/* used for walking a cpuset heirarchy */
+	struct list_head stack_list;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-
+struct cpuset_hotplug_scanner {
+	struct cgroup_scanner scan;
+	struct cgroup *to;
+};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs)
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
  *
- * A single, global generation is needed because cpuset_attach_task() could
+ * A single, global generation is needed because cpuset_attach_task() could
  * reattach a task to a different cpuset, which must not have its
  * generation numbers aliased with those of that tasks previous cpuset.
  *
  * Generations are needed for mems_allowed because one task cannot
- * modify anothers memory placement. So we must enable every task,
+ * modify another's memory placement. So we must enable every task,
  * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
 *
- * Since cpuset_mems_generation is guarded by manage_mutex,
+ * Since writes to cpuset_mems_generation are guarded by the cgroup lock
 * there is no need to mark it atomic.
 */
 static int cpuset_mems_generation;
@@ -182,17 +189,20 @@ static struct cpuset top_cpuset = {
 };
 
 /*
- * We have two global cpuset mutexes below. They can nest.
- * It is ok to first take manage_mutex, then nest callback_mutex. We also
- * require taking task_lock() when dereferencing a tasks cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global mutexes guarding cpuset structures. The first
+ * is the main control groups cgroup_mutex, accessed via
+ * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
+ * callback_mutex, below. They can nest. It is ok to first take
+ * cgroup_mutex, then nest callback_mutex. We also require taking
+ * task_lock() when dereferencing a task's cpuset pointer. See "The
+ * task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets. If a task
- * holds manage_mutex, then it blocks others wanting that mutex,
+ * holds cgroup_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets. It can perform various checks on
 * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding manage_mutex. While it is
+ * also allocate memory while just holding cgroup_mutex. While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets. Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
@@ -208,60 +218,16 @@ static struct cpuset top_cpuset = {
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so require no locks.
 *
- * Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_mutex or callback_mutex can't rely
- * on the count field not changing. However, if the count goes to
- * zero, then only attach_task(), which holds both mutexes, can
- * increment it again. Because a count of zero means that no tasks
- * are currently attached, therefore there is no way a task attached
- * to that cpuset can fork (the other way to increment the count).
- * So code holding manage_mutex or callback_mutex can safely assume that
- * if the count is zero, it will stay zero. Similarly, if a task
- * holds manage_mutex or callback_mutex on a cpuset with zero count, it
- * knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those mutexes.
- *
 * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_mutex across the entire operation,
+ * the cpuset hierarchy holds cgroup_mutex across the entire operation,
 * single threading all such cpuset modifications across the system.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
- * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits. Then manage_mutex
- * is taken, and if the cpuset count is zero, a usermode call made
- * to /sbin/cpuset_release_agent with the name of the cpuset (path
- * relative to the root of cpuset file system) as the argument.
+ * Accessing a task's cpuset should be done in accordance with the
+ * guidelines for accessing subsystem state in kernel/cgroup.c
239 *
240 * A cpuset can only be deleted if both its 'count' of using tasks
241 * is zero, and its list of 'children' cpusets is empty. Since all
242 * tasks in the system use _some_ cpuset, and since there is always at
243 * least one task in the system (init), therefore, top_cpuset
244 * always has either children cpusets and/or using tasks. So we don't
245 * need a special hack to ensure that top_cpuset cannot be deleted.
246 *
247 * The above "Tale of Two Semaphores" would be complete, but for:
248 *
249 * The task_lock() exception
250 *
251 * The need for this exception arises from the action of attach_task(),
252 * which overwrites one tasks cpuset pointer with another. It does
253 * so using both mutexes, however there are several performance
254 * critical places that need to reference task->cpuset without the
255 * expense of grabbing a system global mutex. Therefore except as
256 * noted below, when dereferencing or, as in attach_task(), modifying
257 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
258 * (task->alloc_lock) already in the task_struct routinely used for
259 * such matters.
260 *
261 * P.S. One more locking exception. RCU is used to guard the
262 * update of a tasks cpuset pointer by attach_task() and the
263 * access of task->cpuset->mems_generation via that pointer in
264 * the routine cpuset_update_task_memory_state().
265 */ 231 */
266 232
267static DEFINE_MUTEX(callback_mutex); 233static DEFINE_MUTEX(callback_mutex);
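
The locking comment above boils down to one nesting rule: the long-held cgroup_mutex is taken first and callback_mutex is only ever nested inside it, while readers take callback_mutex alone for short sections. Below is a minimal userspace model of that rule using pthread mutexes; the names and the stand-in critical sections are illustrative, not the kernel's API.

/* Illustrative model of the documented nesting order: take the "manage"
 * (cgroup) lock first, then the "callback" lock. Not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cgroup_lock_model = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t callback_lock_model = PTHREAD_MUTEX_INITIALIZER;

static void modify_cpuset_model(void)
{
	pthread_mutex_lock(&cgroup_lock_model);    /* long-held, single writer */
	/* ... validate and allocate while nothing can change ... */
	pthread_mutex_lock(&callback_lock_model);  /* short critical section */
	/* ... publish the new masks ... */
	pthread_mutex_unlock(&callback_lock_model);
	pthread_mutex_unlock(&cgroup_lock_model);
}

static void read_cpuset_model(void)
{
	pthread_mutex_lock(&callback_lock_model);  /* readers take only this */
	/* ... copy out a cpumask/nodemask ... */
	pthread_mutex_unlock(&callback_lock_model);
}

int main(void)
{
	modify_cpuset_model();
	read_cpuset_model();
	puts("lock ordering model ok");
	return 0;
}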
@@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
354 * Do not call this routine if in_interrupt(). 320 * Do not call this routine if in_interrupt().
355 * 321 *
356 * Call without callback_mutex or task_lock() held. May be 322 * Call without callback_mutex or task_lock() held. May be
357 * called with or without manage_mutex held. Thanks in part to 323 * called with or without cgroup_mutex held. Thanks in part to
358 * 'the_top_cpuset_hack', the tasks cpuset pointer will never 324 * 'the_top_cpuset_hack', the task's cpuset pointer will never
359 * be NULL. This routine also might acquire callback_mutex and 325 * be NULL. This routine also might acquire callback_mutex and
360 * current->mm->mmap_sem during call. 326 * current->mm->mmap_sem during call.
361 * 327 *
362 * Reading current->cpuset->mems_generation doesn't need task_lock 328 * Reading current->cpuset->mems_generation doesn't need task_lock
363 * to guard the current->cpuset dereference, because it is guarded 329 * to guard the current->cpuset dereference, because it is guarded
364 * from concurrent freeing of current->cpuset by attach_task(), 330 * from concurrent freeing of current->cpuset using RCU.
365 * using RCU.
366 * 331 *
367 * The rcu_dereference() is technically probably not needed, 332 * The rcu_dereference() is technically probably not needed,
368 * as I don't actually mind if I see a new cpuset pointer but 333 * as I don't actually mind if I see a new cpuset pointer but
@@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void)
424 * 389 *
425 * One cpuset is a subset of another if all its allowed CPUs and 390 * One cpuset is a subset of another if all its allowed CPUs and
426 * Memory Nodes are a subset of the other, and its exclusive flags 391 * Memory Nodes are a subset of the other, and its exclusive flags
427 * are only set if the other's are set. Call holding manage_mutex. 392 * are only set if the other's are set. Call holding cgroup_mutex.
428 */ 393 */
429 394
430static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 395static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
442 * If we replaced the flag and mask values of the current cpuset 407 * If we replaced the flag and mask values of the current cpuset
443 * (cur) with those values in the trial cpuset (trial), would 408 * (cur) with those values in the trial cpuset (trial), would
444 * our various subset and exclusive rules still be valid? Presumes 409 * our various subset and exclusive rules still be valid? Presumes
445 * manage_mutex held. 410 * cgroup_mutex held.
446 * 411 *
447 * 'cur' is the address of an actual, in-use cpuset. Operations 412 * 'cur' is the address of an actual, in-use cpuset. Operations
448 * such as list traversal that depend on the actual address of the 413 * such as list traversal that depend on the actual address of the
@@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
476 if (!is_cpuset_subset(trial, par)) 441 if (!is_cpuset_subset(trial, par))
477 return -EACCES; 442 return -EACCES;
478 443
479 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 444 /*
445 * If either I or some sibling (!= me) is exclusive, we can't
446 * overlap
447 */
480 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 448 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
481 c = cgroup_cs(cont); 449 c = cgroup_cs(cont);
482 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 450 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
@@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2)
732 return started_after_time(t1, &t2->start_time, t2); 700 return started_after_time(t1, &t2->start_time, t2);
733} 701}
734 702
735/* 703/**
736 * Call with manage_mutex held. May take callback_mutex during call. 704 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
705 * @tsk: task to test
706 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
707 *
708 * Call with cgroup_mutex held. May take callback_mutex during call.
709 * Called for each task in a cgroup by cgroup_scan_tasks().
710 * Return nonzero if this task's cpus_allowed mask should be changed (in other
711 * words, if its mask is not equal to its cpuset's mask).
712 */
713int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
714{
715 return !cpus_equal(tsk->cpus_allowed,
716 (cgroup_cs(scan->cg))->cpus_allowed);
717}
718
719/**
720 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
721 * @tsk: task to test
722 * @scan: struct cgroup_scanner containing the cgroup of the task
723 *
724 * Called by cgroup_scan_tasks() for each task in a cgroup whose
725 * cpus_allowed mask needs to be changed.
726 *
727 * We don't need to re-check for the cgroup/cpuset membership, since we're
728 * holding cgroup_lock() at this point.
737 */ 729 */
730void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
731{
732 set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed);
733}
738 734
735/**
736 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
737 * @cs: the cpuset to consider
738 * @buf: buffer of cpu numbers written to this cpuset
739 */
739static int update_cpumask(struct cpuset *cs, char *buf) 740static int update_cpumask(struct cpuset *cs, char *buf)
740{ 741{
741 struct cpuset trialcs; 742 struct cpuset trialcs;
742 int retval, i; 743 struct cgroup_scanner scan;
743 int is_load_balanced;
744 struct cgroup_iter it;
745 struct cgroup *cgrp = cs->css.cgroup;
746 struct task_struct *p, *dropped;
747 /* Never dereference latest_task, since it's not refcounted */
748 struct task_struct *latest_task = NULL;
749 struct ptr_heap heap; 744 struct ptr_heap heap;
750 struct timespec latest_time = { 0, 0 }; 745 int retval;
746 int is_load_balanced;
751 747
752 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 748 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
753 if (cs == &top_cpuset) 749 if (cs == &top_cpuset)
@@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
756 trialcs = *cs; 752 trialcs = *cs;
757 753
758 /* 754 /*
759 * An empty cpus_allowed is ok iff there are no tasks in the cpuset. 755 * An empty cpus_allowed is ok only if the cpuset has no tasks.
760 * Since cpulist_parse() fails on an empty mask, we special case 756 * Since cpulist_parse() fails on an empty mask, we special case
761 * that parsing. The validate_change() call ensures that cpusets 757 * that parsing. The validate_change() call ensures that cpusets
762 * with tasks have cpus. 758 * with tasks have cpus.
@@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
777 /* Nothing to do if the cpus didn't change */ 773 /* Nothing to do if the cpus didn't change */
778 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 774 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
779 return 0; 775 return 0;
776
780 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 777 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
781 if (retval) 778 if (retval)
782 return retval; 779 return retval;
@@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf)
787 cs->cpus_allowed = trialcs.cpus_allowed; 784 cs->cpus_allowed = trialcs.cpus_allowed;
788 mutex_unlock(&callback_mutex); 785 mutex_unlock(&callback_mutex);
789 786
790 again:
791 /* 787 /*
792 * Scan tasks in the cpuset, and update the cpumasks of any 788 * Scan tasks in the cpuset, and update the cpumasks of any
793 * that need an update. Since we can't call set_cpus_allowed() 789 * that need an update.
794 * while holding tasklist_lock, gather tasks to be processed
795 * in a heap structure. If the statically-sized heap fills up,
796 * overflow tasks that started later, and in future iterations
797 * only consider tasks that started after the latest task in
798 * the previous pass. This guarantees forward progress and
799 * that we don't miss any tasks
800 */ 790 */
801 heap.size = 0; 791 scan.cg = cs->css.cgroup;
802 cgroup_iter_start(cgrp, &it); 792 scan.test_task = cpuset_test_cpumask;
803 while ((p = cgroup_iter_next(cgrp, &it))) { 793 scan.process_task = cpuset_change_cpumask;
804 /* Only affect tasks that don't have the right cpus_allowed */ 794 scan.heap = &heap;
805 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) 795 cgroup_scan_tasks(&scan);
806 continue;
807 /*
808 * Only process tasks that started after the last task
809 * we processed
810 */
811 if (!started_after_time(p, &latest_time, latest_task))
812 continue;
813 dropped = heap_insert(&heap, p);
814 if (dropped == NULL) {
815 get_task_struct(p);
816 } else if (dropped != p) {
817 get_task_struct(p);
818 put_task_struct(dropped);
819 }
820 }
821 cgroup_iter_end(cgrp, &it);
822 if (heap.size) {
823 for (i = 0; i < heap.size; i++) {
824 struct task_struct *p = heap.ptrs[i];
825 if (i == 0) {
826 latest_time = p->start_time;
827 latest_task = p;
828 }
829 set_cpus_allowed(p, cs->cpus_allowed);
830 put_task_struct(p);
831 }
832 /*
833 * If we had to process any tasks at all, scan again
834 * in case some of them were in the middle of forking
835 * children that didn't notice the new cpumask
836 * restriction. Not the most efficient way to do it,
837 * but it avoids having to take callback_mutex in the
838 * fork path
839 */
840 goto again;
841 }
842 heap_free(&heap); 796 heap_free(&heap);
797
843 if (is_load_balanced) 798 if (is_load_balanced)
844 rebuild_sched_domains(); 799 rebuild_sched_domains();
845
846 return 0; 800 return 0;
847} 801}
848 802
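
update_cpumask() now delegates the per-task walk to cgroup_scan_tasks(), wiring a test callback (should this task change?) and a process callback (apply the change) into a scanner. The following is a toy model of that split in plain C; toy_task and toy_scanner are made-up names and nothing here assumes the real cgroup_scanner layout.

/* Toy model of the test_task/process_task split used by cgroup_scan_tasks():
 * the scanner visits every task, the test callback filters, the process
 * callback applies the change. Plain C, not the kernel interface. */
#include <stdio.h>

struct toy_task { int id; int cpus_mask; };

struct toy_scanner {
	int wanted_mask;
	int (*test)(struct toy_task *, struct toy_scanner *);
	void (*process)(struct toy_task *, struct toy_scanner *);
};

static int needs_update(struct toy_task *t, struct toy_scanner *s)
{
	return t->cpus_mask != s->wanted_mask;   /* like cpuset_test_cpumask() */
}

static void apply_update(struct toy_task *t, struct toy_scanner *s)
{
	t->cpus_mask = s->wanted_mask;           /* like cpuset_change_cpumask() */
}

static void toy_scan(struct toy_task *tasks, int n, struct toy_scanner *s)
{
	for (int i = 0; i < n; i++)
		if (!s->test || s->test(&tasks[i], s))
			s->process(&tasks[i], s);
}

int main(void)
{
	struct toy_task tasks[] = { {1, 0x3}, {2, 0xf}, {3, 0x3} };
	struct toy_scanner s = { 0xf, needs_update, apply_update };

	toy_scan(tasks, 3, &s);
	for (int i = 0; i < 3; i++)
		printf("task %d mask %#x\n", tasks[i].id, tasks[i].cpus_mask);
	return 0;
}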
@@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf)
854 * Temporarily set task's mems_allowed to target nodes of migration, 808 * Temporarily set task's mems_allowed to target nodes of migration,
855 * so that the migration code can allocate pages on these nodes. 809 * so that the migration code can allocate pages on these nodes.
856 * 810 *
857 * Call holding manage_mutex, so our current->cpuset won't change 811 * Call holding cgroup_mutex, so current's cpuset won't change
858 * during this call, as manage_mutex holds off any attach_task() 812 * during this call, as cgroup_mutex holds off any cpuset_attach()
859 * calls. Therefore we don't need to take task_lock around the 813 * calls. Therefore we don't need to take task_lock around the
860 * call to guarantee_online_mems(), as we know no one is changing 814 * call to guarantee_online_mems(), as we know no one is changing
861 * our tasks cpuset. 815 * our task's cpuset.
862 * 816 *
863 * Hold callback_mutex around the two modifications of our task's 817 * Hold callback_mutex around the two modifications of our task's
864 * mems_allowed to synchronize with cpuset_mems_allowed(). 818 * mems_allowed to synchronize with cpuset_mems_allowed().
@@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
903 * the cpuset is marked 'memory_migrate', migrate the tasks 857 * the cpuset is marked 'memory_migrate', migrate the tasks
904 * pages to the new memory. 858 * pages to the new memory.
905 * 859 *
906 * Call with manage_mutex held. May take callback_mutex during call. 860 * Call with cgroup_mutex held. May take callback_mutex during call.
907 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 861 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
908 * lock each such task's mm->mmap_sem, scan its vmas and rebind 862 * lock each such task's mm->mmap_sem, scan its vmas and rebind
909 * their mempolicies to the cpuset's new mems_allowed. 863 * their mempolicies to the cpuset's new mems_allowed.
@@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1016 * tasklist_lock. Forks can happen again now - the mpol_copy() 970 * tasklist_lock. Forks can happen again now - the mpol_copy()
1017 * cpuset_being_rebound check will catch such forks, and rebind 971 * cpuset_being_rebound check will catch such forks, and rebind
1018 * their vma mempolicies too. Because we still hold the global 972 * their vma mempolicies too. Because we still hold the global
1019 * cpuset manage_mutex, we know that no other rebind effort will 973 * cgroup_mutex, we know that no other rebind effort will
1020 * be contending for the global variable cpuset_being_rebound. 974 * be contending for the global variable cpuset_being_rebound.
1021 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 975 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1022 * is idempotent. Also migrate pages in each mm to new nodes. 976 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1031 mmput(mm); 985 mmput(mm);
1032 } 986 }
1033 987
1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 988 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1035 kfree(mmarray); 989 kfree(mmarray);
1036 cpuset_being_rebound = NULL; 990 cpuset_being_rebound = NULL;
1037 retval = 0; 991 retval = 0;
@@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void)
1045} 999}
1046 1000
1047/* 1001/*
1048 * Call with manage_mutex held. 1002 * Call with cgroup_mutex held.
1049 */ 1003 */
1050 1004
1051static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 1005static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1066 * cs: the cpuset to update 1020 * cs: the cpuset to update
1067 * buf: the buffer where we read the 0 or 1 1021 * buf: the buffer where we read the 0 or 1
1068 * 1022 *
1069 * Call with manage_mutex held. 1023 * Call with cgroup_mutex held.
1070 */ 1024 */
1071 1025
1072static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1026static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1200 return val; 1154 return val;
1201} 1155}
1202 1156
1157/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1203static int cpuset_can_attach(struct cgroup_subsys *ss, 1158static int cpuset_can_attach(struct cgroup_subsys *ss,
1204 struct cgroup *cont, struct task_struct *tsk) 1159 struct cgroup *cont, struct task_struct *tsk)
1205{ 1160{
@@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1547 * If this becomes a problem for some users who wish to 1502 * If this becomes a problem for some users who wish to
1548 * allow that scenario, then cpuset_post_clone() could be 1503 * allow that scenario, then cpuset_post_clone() could be
1549 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1504 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550 * (and likewise for mems) to the new cgroup. 1505 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1506 * held.
1551 */ 1507 */
1552static void cpuset_post_clone(struct cgroup_subsys *ss, 1508static void cpuset_post_clone(struct cgroup_subsys *ss,
1553 struct cgroup *cgroup) 1509 struct cgroup *cgroup)
@@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1571 1527
1572/* 1528/*
1573 * cpuset_create - create a cpuset 1529 * cpuset_create - create a cpuset
1574 * parent: cpuset that will be parent of the new cpuset. 1530 * ss: cpuset cgroup subsystem
1575 * name: name of the new cpuset. Will be strcpy'ed. 1531 * cont: control group that the new cpuset will be part of
1576 * mode: mode to set on new inode
1577 *
1578 * Must be called with the mutex on the parent inode held
1579 */ 1532 */
1580 1533
1581static struct cgroup_subsys_state *cpuset_create( 1534static struct cgroup_subsys_state *cpuset_create(
@@ -1687,53 +1640,140 @@ int __init cpuset_init(void)
1687 return 0; 1640 return 0;
1688} 1641}
1689 1642
1643/**
1644 * cpuset_do_move_task - move a given task to another cpuset
1645 * @tsk: pointer to the task_struct of the task to move
1646 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
1647 *
1648 * Called by cgroup_scan_tasks() for each task in a cgroup.
1649 * Return nonzero to stop the walk through the tasks.
1650 */
1651void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
1652{
1653 struct cpuset_hotplug_scanner *chsp;
1654
1655 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
1656 cgroup_attach_task(chsp->to, tsk);
1657}
1658
1659/**
1660 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
1661 * @from: cpuset in which the tasks currently reside
1662 * @to: cpuset to which the tasks will be moved
1663 *
1664 * Called with cgroup_mutex held
1665 * callback_mutex must not be held, as cpuset_attach() will take it.
1666 *
1667 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1668 * calling callback functions for each.
1669 */
1670static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1671{
1672 struct cpuset_hotplug_scanner scan;
1673
1674 scan.scan.cg = from->css.cgroup;
1675 scan.scan.test_task = NULL; /* select all tasks in cgroup */
1676 scan.scan.process_task = cpuset_do_move_task;
1677 scan.scan.heap = NULL;
1678 scan.to = to->css.cgroup;
1679
1680 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
1681 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1682 "cgroup_scan_tasks failed\n");
1683}
1684
1690/* 1685/*
1691 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1686 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692 * or memory nodes, we need to walk over the cpuset hierarchy, 1687 * or memory nodes, we need to walk over the cpuset hierarchy,
1693 * removing that CPU or node from all cpusets. If this removes the 1688 * removing that CPU or node from all cpusets. If this removes the
1694 * last CPU or node from a cpuset, then the guarantee_online_cpus() 1689 * last CPU or node from a cpuset, then move the tasks in the empty
1695 * or guarantee_online_mems() code will use that emptied cpusets 1690 * cpuset to its next-highest non-empty parent.
1696 * parent online CPUs or nodes. Cpusets that were already empty of 1691 *
1697 * CPUs or nodes are left empty. 1692 * Called with cgroup_mutex held
1693 * callback_mutex must not be held, as cpuset_attach() will take it.
1694 */
1695static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1696{
1697 struct cpuset *parent;
1698
1699 /*
1700 * The cgroup's css_sets list is in use if there are tasks
1701 * in the cpuset; the list is empty if there are none;
1702 * the cs->css.refcnt always seems to be 0.
1703 */
1704 if (list_empty(&cs->css.cgroup->css_sets))
1705 return;
1706
1707 /*
1708 * Find its next-highest non-empty parent, (top cpuset
1709 * has online cpus, so can't be empty).
1710 */
1711 parent = cs->parent;
1712 while (cpus_empty(parent->cpus_allowed) ||
1713 nodes_empty(parent->mems_allowed))
1714 parent = parent->parent;
1715
1716 move_member_tasks_to_cpuset(cs, parent);
1717}
1718
1719/*
1720 * Walk the specified cpuset subtree and look for empty cpusets.
1721 * The tasks of such a cpuset must be moved to a parent cpuset.
1698 * 1722 *
1699 * This routine is intentionally inefficient in a couple of regards. 1723 * Called with cgroup_mutex held. We take callback_mutex to modify
1700 * It will check all cpusets in a subtree even if the top cpuset of 1724 * cpus_allowed and mems_allowed.
1701 * the subtree has no offline CPUs or nodes. It checks both CPUs and
1702 * nodes, even though the caller could have been coded to know that
1703 * only one of CPUs or nodes needed to be checked on a given call.
1704 * This was done to minimize text size rather than cpu cycles.
1705 * 1725 *
1706 * Call with both manage_mutex and callback_mutex held. 1726 * This walk processes the tree from top to bottom, completing one layer
1727 * before dropping down to the next. It always processes a node before
1728 * any of its children.
1707 * 1729 *
1708 * Recursive, on depth of cpuset subtree. 1730 * For now, since we lack memory hot unplug, we'll never see a cpuset
1731 * that has tasks along with an empty 'mems'. But if we did see such
1732 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1709 */ 1733 */
1710 1734static void scan_for_empty_cpusets(const struct cpuset *root)
1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1712{ 1735{
1736 struct cpuset *cp; /* scans cpusets being updated */
1737 struct cpuset *child; /* scans child cpusets of cp */
1738 struct list_head queue;
1713 struct cgroup *cont; 1739 struct cgroup *cont;
1714 struct cpuset *c;
1715 1740
1716 /* Each of our child cpusets mems must be online */ 1741 INIT_LIST_HEAD(&queue);
1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 1742
1718 c = cgroup_cs(cont); 1743 list_add_tail((struct list_head *)&root->stack_list, &queue);
1719 guarantee_online_cpus_mems_in_subtree(c); 1744
1720 if (!cpus_empty(c->cpus_allowed)) 1745 while (!list_empty(&queue)) {
1721 guarantee_online_cpus(c, &c->cpus_allowed); 1746 cp = container_of(queue.next, struct cpuset, stack_list);
1722 if (!nodes_empty(c->mems_allowed)) 1747 list_del(queue.next);
1723 guarantee_online_mems(c, &c->mems_allowed); 1748 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1749 child = cgroup_cs(cont);
1750 list_add_tail(&child->stack_list, &queue);
1751 }
1752 cont = cp->css.cgroup;
1753
1754 /* Continue past cpusets with all cpus, mems online */
1755 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
1756 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1757 continue;
1758
1759 /* Remove offline cpus and mems from this cpuset. */
1760 mutex_lock(&callback_mutex);
1761 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
1762 nodes_and(cp->mems_allowed, cp->mems_allowed,
1763 node_states[N_HIGH_MEMORY]);
1764 mutex_unlock(&callback_mutex);
1765
1766 /* Move tasks from the empty cpuset to a parent */
1767 if (cpus_empty(cp->cpus_allowed) ||
1768 nodes_empty(cp->mems_allowed))
1769 remove_tasks_in_empty_cpuset(cp);
1724 } 1770 }
1725} 1771}
1726 1772
1727/* 1773/*
1728 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track 1774 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to 1775 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1730 * track what's online after any CPU or memory node hotplug or unplug 1776 * track what's online after any CPU or memory node hotplug or unplug event.
1731 * event.
1732 *
1733 * To ensure that we don't remove a CPU or node from the top cpuset
1734 * that is currently in use by a child cpuset (which would violate
1735 * the rule that cpusets must be subsets of their parent), we first
1736 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737 * 1777 *
1738 * Since there are two callers of this routine, one for CPU hotplug 1778 * Since there are two callers of this routine, one for CPU hotplug
1739 * events and one for memory node hotplug events, we could have coded 1779 * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1744static void common_cpu_mem_hotplug_unplug(void) 1784static void common_cpu_mem_hotplug_unplug(void)
1745{ 1785{
1746 cgroup_lock(); 1786 cgroup_lock();
1747 mutex_lock(&callback_mutex);
1748 1787
1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750 top_cpuset.cpus_allowed = cpu_online_map; 1788 top_cpuset.cpus_allowed = cpu_online_map;
1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1789 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1790 scan_for_empty_cpusets(&top_cpuset);
1752 1791
1753 mutex_unlock(&callback_mutex);
1754 cgroup_unlock(); 1792 cgroup_unlock();
1755} 1793}
1756 1794
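
scan_for_empty_cpusets() replaces the old recursion with an iterative breadth-first walk: each cpuset goes onto a queue and its children are enqueued after it, so a node is always processed before any of its descendants. Here is a self-contained sketch of the same traversal over a toy tree; the fixed-size queue and the node type are invented for illustration, not the kernel's list_head machinery.

/* Sketch of the breadth-first walk scan_for_empty_cpusets() performs:
 * a FIFO queue guarantees each node is handled before any of its
 * children. Plain userspace C, with a fixed-size queue for brevity. */
#include <stdio.h>

struct node {
	const char *name;
	struct node *child[2];
	int empty;                      /* stands in for "no cpus/mems left" */
};

static void walk_top_down(struct node *root)
{
	struct node *queue[16];
	int head = 0, tail = 0;

	queue[tail++] = root;
	while (head < tail) {
		struct node *n = queue[head++];

		for (int i = 0; i < 2; i++)
			if (n->child[i])
				queue[tail++] = n->child[i];

		if (n->empty)
			printf("would move tasks out of %s\n", n->name);
	}
}

int main(void)
{
	struct node leaf = { "leaf", { 0, 0 }, 1 };
	struct node mid  = { "mid",  { &leaf, 0 }, 0 };
	struct node top  = { "top",  { &mid, 0 }, 0 };

	walk_top_down(&top);
	return 0;
}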
@@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1826 1864
1827/** 1865/**
1828 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 1866 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1829 * Must be called with callback_mutex held. 1867 * Must be called with callback_mutex held.
1830 **/ 1868 **/
1831cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) 1869cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832{ 1870{
@@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void)
2163 * - Used for /proc/<pid>/cpuset. 2201 * - Used for /proc/<pid>/cpuset.
2164 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2202 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2165 * doesn't really matter if tsk->cpuset changes after we read it, 2203 * doesn't really matter if tsk->cpuset changes after we read it,
2166 * and we take manage_mutex, keeping attach_task() from changing it 2204 * and we take cgroup_mutex, keeping cpuset_attach() from changing it
2167 * anyway. No need to check that tsk->cpuset != NULL, thanks to 2205 * anyway.
2168 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2169 * cpuset to top_cpuset.
2170 */ 2206 */
2171static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2207static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2172{ 2208{
@@ -2219,13 +2255,14 @@ const struct file_operations proc_cpuset_operations = {
2219#endif /* CONFIG_PROC_PID_CPUSET */ 2255#endif /* CONFIG_PROC_PID_CPUSET */
2220 2256
2221/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2257/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2222char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) 2258void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2223{ 2259{
2224 buffer += sprintf(buffer, "Cpus_allowed:\t"); 2260 seq_printf(m, "Cpus_allowed:\t");
2225 buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed); 2261 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
2226 buffer += sprintf(buffer, "\n"); 2262 task->cpus_allowed);
2227 buffer += sprintf(buffer, "Mems_allowed:\t"); 2263 seq_printf(m, "\n");
2228 buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed); 2264 seq_printf(m, "Mems_allowed:\t");
2229 buffer += sprintf(buffer, "\n"); 2265 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
2230 return buffer; 2266 task->mems_allowed);
2267 seq_printf(m, "\n");
2231} 2268}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e459fefda77..506a957b665a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -293,26 +293,27 @@ static void reparent_to_kthreadd(void)
293 switch_uid(INIT_USER); 293 switch_uid(INIT_USER);
294} 294}
295 295
296void __set_special_pids(pid_t session, pid_t pgrp) 296void __set_special_pids(struct pid *pid)
297{ 297{
298 struct task_struct *curr = current->group_leader; 298 struct task_struct *curr = current->group_leader;
299 pid_t nr = pid_nr(pid);
299 300
300 if (task_session_nr(curr) != session) { 301 if (task_session(curr) != pid) {
301 detach_pid(curr, PIDTYPE_SID); 302 detach_pid(curr, PIDTYPE_SID);
302 set_task_session(curr, session); 303 attach_pid(curr, PIDTYPE_SID, pid);
303 attach_pid(curr, PIDTYPE_SID, find_pid(session)); 304 set_task_session(curr, nr);
304 } 305 }
305 if (task_pgrp_nr(curr) != pgrp) { 306 if (task_pgrp(curr) != pid) {
306 detach_pid(curr, PIDTYPE_PGID); 307 detach_pid(curr, PIDTYPE_PGID);
307 set_task_pgrp(curr, pgrp); 308 attach_pid(curr, PIDTYPE_PGID, pid);
308 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); 309 set_task_pgrp(curr, nr);
309 } 310 }
310} 311}
311 312
312static void set_special_pids(pid_t session, pid_t pgrp) 313static void set_special_pids(struct pid *pid)
313{ 314{
314 write_lock_irq(&tasklist_lock); 315 write_lock_irq(&tasklist_lock);
315 __set_special_pids(session, pgrp); 316 __set_special_pids(pid);
316 write_unlock_irq(&tasklist_lock); 317 write_unlock_irq(&tasklist_lock);
317} 318}
318 319
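
The rewritten __set_special_pids() compares struct pid pointers (task_session(curr) != pid) instead of numeric ids, because with pid namespaces the same number can name different processes depending on where you look. A small illustrative model of that distinction follows; toy_pid is invented, and only the compare-by-object idea carries over.

#include <stdio.h>

struct toy_pid { int nr_in_ns[2]; };   /* the id as seen from each namespace */

/* Comparing locally visible numbers can falsely match across namespaces. */
static int same_by_local_number(struct toy_pid *a, int a_ns,
				struct toy_pid *b, int b_ns)
{
	return a->nr_in_ns[a_ns] == b->nr_in_ns[b_ns];
}

/* Comparing the objects themselves is what the new code does. */
static int same_by_object(struct toy_pid *a, struct toy_pid *b)
{
	return a == b;
}

int main(void)
{
	struct toy_pid task_a = { { 100,   0 } };  /* pid 100 in the init ns */
	struct toy_pid task_b = { { 354, 100 } };  /* pid 100 inside a child ns */

	printf("by number: %d\n", same_by_local_number(&task_a, 0, &task_b, 1));
	printf("by object: %d\n", same_by_object(&task_a, &task_b));
	return 0;
}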
@@ -383,7 +384,11 @@ void daemonize(const char *name, ...)
383 */ 384 */
384 current->flags |= PF_NOFREEZE; 385 current->flags |= PF_NOFREEZE;
385 386
386 set_special_pids(1, 1); 387 if (current->nsproxy != &init_nsproxy) {
388 get_nsproxy(&init_nsproxy);
389 switch_task_namespaces(current, &init_nsproxy);
390 }
391 set_special_pids(&init_struct_pid);
387 proc_clear_tty(current); 392 proc_clear_tty(current);
388 393
389 /* Block and flush all signals */ 394 /* Block and flush all signals */
@@ -398,11 +403,6 @@ void daemonize(const char *name, ...)
398 current->fs = fs; 403 current->fs = fs;
399 atomic_inc(&fs->count); 404 atomic_inc(&fs->count);
400 405
401 if (current->nsproxy != init_task.nsproxy) {
402 get_nsproxy(init_task.nsproxy);
403 switch_task_namespaces(current, init_task.nsproxy);
404 }
405
406 exit_files(current); 406 exit_files(current);
407 current->files = init_task.files; 407 current->files = init_task.files;
408 atomic_inc(&current->files->count); 408 atomic_inc(&current->files->count);
@@ -458,7 +458,7 @@ struct files_struct *get_files_struct(struct task_struct *task)
458 return files; 458 return files;
459} 459}
460 460
461void fastcall put_files_struct(struct files_struct *files) 461void put_files_struct(struct files_struct *files)
462{ 462{
463 struct fdtable *fdt; 463 struct fdtable *fdt;
464 464
@@ -512,14 +512,10 @@ static void __put_fs_struct(struct fs_struct *fs)
512{ 512{
513 /* No need to hold fs->lock if we are killing it */ 513 /* No need to hold fs->lock if we are killing it */
514 if (atomic_dec_and_test(&fs->count)) { 514 if (atomic_dec_and_test(&fs->count)) {
515 dput(fs->root); 515 path_put(&fs->root);
516 mntput(fs->rootmnt); 516 path_put(&fs->pwd);
517 dput(fs->pwd); 517 if (fs->altroot.dentry)
518 mntput(fs->pwdmnt); 518 path_put(&fs->altroot);
519 if (fs->altroot) {
520 dput(fs->altroot);
521 mntput(fs->altrootmnt);
522 }
523 kmem_cache_free(fs_cachep, fs); 519 kmem_cache_free(fs_cachep, fs);
524 } 520 }
525} 521}
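
The __put_fs_struct() hunk reflects the switch of fs->root, fs->pwd and fs->altroot from separate dentry/vfsmount pairs to a single struct path, so each dput()/mntput() pair collapses into one path_put(). Below is a toy refcount model of that consolidation; toy_ref and toy_path are invented, and only the "one helper drops both references" shape is the point, not the VFS API.

#include <stdio.h>

struct toy_ref { int count; };
struct toy_path { struct toy_ref *dentry, *mnt; };

static void toy_ref_put(struct toy_ref *r)
{
	if (r && --r->count == 0)
		printf("freed object\n");
}

static void toy_path_put(struct toy_path *p)   /* analogue of path_put() */
{
	toy_ref_put(p->dentry);
	toy_ref_put(p->mnt);
}

int main(void)
{
	struct toy_ref d = { 1 }, m = { 1 };
	struct toy_path root = { &d, &m };

	toy_path_put(&root);                    /* one call instead of two */
	return 0;
}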
@@ -745,24 +741,6 @@ static void exit_notify(struct task_struct *tsk)
745 struct task_struct *t; 741 struct task_struct *t;
746 struct pid *pgrp; 742 struct pid *pgrp;
747 743
748 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
749 && !thread_group_empty(tsk)) {
750 /*
751 * This occurs when there was a race between our exit
752 * syscall and a group signal choosing us as the one to
753 * wake up. It could be that we are the only thread
754 * alerted to check for pending signals, but another thread
755 * should be woken now to take the signal since we will not.
756 * Now we'll wake all the threads in the group just to make
757 * sure someone gets all the pending signals.
758 */
759 spin_lock_irq(&tsk->sighand->siglock);
760 for (t = next_thread(tsk); t != tsk; t = next_thread(t))
761 if (!signal_pending(t) && !(t->flags & PF_EXITING))
762 recalc_sigpending_and_wake(t);
763 spin_unlock_irq(&tsk->sighand->siglock);
764 }
765
766 /* 744 /*
767 * This does two things: 745 * This does two things:
768 * 746 *
@@ -905,7 +883,7 @@ static inline void exit_child_reaper(struct task_struct *tsk)
905 zap_pid_ns_processes(tsk->nsproxy->pid_ns); 883 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
906} 884}
907 885
908fastcall NORET_TYPE void do_exit(long code) 886NORET_TYPE void do_exit(long code)
909{ 887{
910 struct task_struct *tsk = current; 888 struct task_struct *tsk = current;
911 int group_dead; 889 int group_dead;
@@ -947,7 +925,7 @@ fastcall NORET_TYPE void do_exit(long code)
947 schedule(); 925 schedule();
948 } 926 }
949 927
950 tsk->flags |= PF_EXITING; 928 exit_signals(tsk); /* sets PF_EXITING */
951 /* 929 /*
952 * tsk->flags are checked in the futex code to protect against 930 * tsk->flags are checked in the futex code to protect against
953 * an exiting task cleaning up the robust pi futexes. 931 * an exiting task cleaning up the robust pi futexes.
@@ -1083,11 +1061,12 @@ do_group_exit(int exit_code)
1083 struct signal_struct *const sig = current->signal; 1061 struct signal_struct *const sig = current->signal;
1084 struct sighand_struct *const sighand = current->sighand; 1062 struct sighand_struct *const sighand = current->sighand;
1085 spin_lock_irq(&sighand->siglock); 1063 spin_lock_irq(&sighand->siglock);
1086 if (sig->flags & SIGNAL_GROUP_EXIT) 1064 if (signal_group_exit(sig))
1087 /* Another thread got here before we took the lock. */ 1065 /* Another thread got here before we took the lock. */
1088 exit_code = sig->group_exit_code; 1066 exit_code = sig->group_exit_code;
1089 else { 1067 else {
1090 sig->group_exit_code = exit_code; 1068 sig->group_exit_code = exit_code;
1069 sig->flags = SIGNAL_GROUP_EXIT;
1091 zap_other_threads(current); 1070 zap_other_threads(current);
1092 } 1071 }
1093 spin_unlock_irq(&sighand->siglock); 1072 spin_unlock_irq(&sighand->siglock);
@@ -1107,20 +1086,23 @@ asmlinkage void sys_exit_group(int error_code)
1107 do_group_exit((error_code & 0xff) << 8); 1086 do_group_exit((error_code & 0xff) << 8);
1108} 1087}
1109 1088
1110static int eligible_child(pid_t pid, int options, struct task_struct *p) 1089static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1090{
1091 struct pid *pid = NULL;
1092 if (type == PIDTYPE_PID)
1093 pid = task->pids[type].pid;
1094 else if (type < PIDTYPE_MAX)
1095 pid = task->group_leader->pids[type].pid;
1096 return pid;
1097}
1098
1099static int eligible_child(enum pid_type type, struct pid *pid, int options,
1100 struct task_struct *p)
1111{ 1101{
1112 int err; 1102 int err;
1113 struct pid_namespace *ns;
1114 1103
1115 ns = current->nsproxy->pid_ns; 1104 if (type < PIDTYPE_MAX) {
1116 if (pid > 0) { 1105 if (task_pid_type(p, type) != pid)
1117 if (task_pid_nr_ns(p, ns) != pid)
1118 return 0;
1119 } else if (!pid) {
1120 if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
1121 return 0;
1122 } else if (pid != -1) {
1123 if (task_pgrp_nr_ns(p, ns) != -pid)
1124 return 0; 1106 return 0;
1125 } 1107 }
1126 1108
@@ -1139,18 +1121,16 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
1139 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1121 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
1140 && !(options & __WALL)) 1122 && !(options & __WALL))
1141 return 0; 1123 return 0;
1142 /*
1143 * Do not consider thread group leaders that are
1144 * in a non-empty thread group:
1145 */
1146 if (delay_group_leader(p))
1147 return 2;
1148 1124
1149 err = security_task_wait(p); 1125 err = security_task_wait(p);
1150 if (err) 1126 if (likely(!err))
1151 return err; 1127 return 1;
1152 1128
1153 return 1; 1129 if (type != PIDTYPE_PID)
1130 return 0;
1131 /* This child was explicitly requested, abort */
1132 read_unlock(&tasklist_lock);
1133 return err;
1154} 1134}
1155 1135
1156static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1136static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
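
eligible_child() now filters children by a (type, struct pid) pair: PIDTYPE_MAX means "any child", otherwise the child's pid of the requested type must be the one being waited for. A reduced standalone model of that filter is shown below; the TOY_* constants and toy_child are not kernel names.

#include <stdio.h>

enum toy_type { TOY_PID, TOY_PGID, TOY_SID, TOY_ANY };

struct toy_child { int pid, pgid, sid; };

static int toy_eligible(enum toy_type type, int want, struct toy_child *c)
{
	if (type == TOY_ANY)
		return 1;                       /* like type == PIDTYPE_MAX */
	switch (type) {
	case TOY_PID:  return c->pid == want;
	case TOY_PGID: return c->pgid == want;
	case TOY_SID:  return c->sid == want;
	default:       return 0;
	}
}

int main(void)
{
	struct toy_child c = { 42, 7, 1 };

	printf("%d %d %d\n",
	       toy_eligible(TOY_ANY, 0, &c),
	       toy_eligible(TOY_PID, 42, &c),
	       toy_eligible(TOY_PGID, 9, &c));
	return 0;
}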
@@ -1190,20 +1170,13 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1190{ 1170{
1191 unsigned long state; 1171 unsigned long state;
1192 int retval, status, traced; 1172 int retval, status, traced;
1193 struct pid_namespace *ns; 1173 pid_t pid = task_pid_vnr(p);
1194
1195 ns = current->nsproxy->pid_ns;
1196 1174
1197 if (unlikely(noreap)) { 1175 if (unlikely(noreap)) {
1198 pid_t pid = task_pid_nr_ns(p, ns);
1199 uid_t uid = p->uid; 1176 uid_t uid = p->uid;
1200 int exit_code = p->exit_code; 1177 int exit_code = p->exit_code;
1201 int why, status; 1178 int why, status;
1202 1179
1203 if (unlikely(p->exit_state != EXIT_ZOMBIE))
1204 return 0;
1205 if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
1206 return 0;
1207 get_task_struct(p); 1180 get_task_struct(p);
1208 read_unlock(&tasklist_lock); 1181 read_unlock(&tasklist_lock);
1209 if ((exit_code & 0x7f) == 0) { 1182 if ((exit_code & 0x7f) == 0) {
@@ -1314,11 +1287,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1314 retval = put_user(status, &infop->si_status); 1287 retval = put_user(status, &infop->si_status);
1315 } 1288 }
1316 if (!retval && infop) 1289 if (!retval && infop)
1317 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); 1290 retval = put_user(pid, &infop->si_pid);
1318 if (!retval && infop) 1291 if (!retval && infop)
1319 retval = put_user(p->uid, &infop->si_uid); 1292 retval = put_user(p->uid, &infop->si_uid);
1320 if (!retval) 1293 if (!retval)
1321 retval = task_pid_nr_ns(p, ns); 1294 retval = pid;
1322 1295
1323 if (traced) { 1296 if (traced) {
1324 write_lock_irq(&tasklist_lock); 1297 write_lock_irq(&tasklist_lock);
@@ -1350,21 +1323,38 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1350 * the lock and this task is uninteresting. If we return nonzero, we have 1323 * the lock and this task is uninteresting. If we return nonzero, we have
1351 * released the lock and the system call should return. 1324 * released the lock and the system call should return.
1352 */ 1325 */
1353static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, 1326static int wait_task_stopped(struct task_struct *p,
1354 int noreap, struct siginfo __user *infop, 1327 int noreap, struct siginfo __user *infop,
1355 int __user *stat_addr, struct rusage __user *ru) 1328 int __user *stat_addr, struct rusage __user *ru)
1356{ 1329{
1357 int retval, exit_code; 1330 int retval, exit_code, why;
1331 uid_t uid = 0; /* unneeded, required by compiler */
1358 pid_t pid; 1332 pid_t pid;
1359 1333
1360 if (!p->exit_code) 1334 exit_code = 0;
1361 return 0; 1335 spin_lock_irq(&p->sighand->siglock);
1362 if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && 1336
1363 p->signal->group_stop_count > 0) 1337 if (unlikely(!task_is_stopped_or_traced(p)))
1338 goto unlock_sig;
1339
1340 if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
1364 /* 1341 /*
1365 * A group stop is in progress and this is the group leader. 1342 * A group stop is in progress and this is the group leader.
1366 * We won't report until all threads have stopped. 1343 * We won't report until all threads have stopped.
1367 */ 1344 */
1345 goto unlock_sig;
1346
1347 exit_code = p->exit_code;
1348 if (!exit_code)
1349 goto unlock_sig;
1350
1351 if (!noreap)
1352 p->exit_code = 0;
1353
1354 uid = p->uid;
1355unlock_sig:
1356 spin_unlock_irq(&p->sighand->siglock);
1357 if (!exit_code)
1368 return 0; 1358 return 0;
1369 1359
1370 /* 1360 /*
@@ -1374,65 +1364,15 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1374 * keep holding onto the tasklist_lock while we call getrusage and 1364 * keep holding onto the tasklist_lock while we call getrusage and
1375 * possibly take page faults for user memory. 1365 * possibly take page faults for user memory.
1376 */ 1366 */
1377 pid = task_pid_nr_ns(p, current->nsproxy->pid_ns);
1378 get_task_struct(p); 1367 get_task_struct(p);
1368 pid = task_pid_vnr(p);
1369 why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1379 read_unlock(&tasklist_lock); 1370 read_unlock(&tasklist_lock);
1380 1371
1381 if (unlikely(noreap)) { 1372 if (unlikely(noreap))
1382 uid_t uid = p->uid;
1383 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1384
1385 exit_code = p->exit_code;
1386 if (unlikely(!exit_code) || unlikely(p->exit_state))
1387 goto bail_ref;
1388 return wait_noreap_copyout(p, pid, uid, 1373 return wait_noreap_copyout(p, pid, uid,
1389 why, exit_code, 1374 why, exit_code,
1390 infop, ru); 1375 infop, ru);
1391 }
1392
1393 write_lock_irq(&tasklist_lock);
1394
1395 /*
1396 * This uses xchg to be atomic with the thread resuming and setting
1397 * it. It must also be done with the write lock held to prevent a
1398 * race with the EXIT_ZOMBIE case.
1399 */
1400 exit_code = xchg(&p->exit_code, 0);
1401 if (unlikely(p->exit_state)) {
1402 /*
1403 * The task resumed and then died. Let the next iteration
1404 * catch it in EXIT_ZOMBIE. Note that exit_code might
1405 * already be zero here if it resumed and did _exit(0).
1406 * The task itself is dead and won't touch exit_code again;
1407 * other processors in this function are locked out.
1408 */
1409 p->exit_code = exit_code;
1410 exit_code = 0;
1411 }
1412 if (unlikely(exit_code == 0)) {
1413 /*
1414 * Another thread in this function got to it first, or it
1415 * resumed, or it resumed and then died.
1416 */
1417 write_unlock_irq(&tasklist_lock);
1418bail_ref:
1419 put_task_struct(p);
1420 /*
1421 * We are returning to the wait loop without having successfully
1422 * removed the process and having released the lock. We cannot
1423 * continue, since the "p" task pointer is potentially stale.
1424 *
1425 * Return -EAGAIN, and do_wait() will restart the loop from the
1426 * beginning. Do _not_ re-acquire the lock.
1427 */
1428 return -EAGAIN;
1429 }
1430
1431 /* move to end of parent's list to avoid starvation */
1432 remove_parent(p);
1433 add_parent(p);
1434
1435 write_unlock_irq(&tasklist_lock);
1436 1376
1437 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1377 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1438 if (!retval && stat_addr) 1378 if (!retval && stat_addr)
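
The new wait_task_stopped() takes the child's siglock once, checks that the task really is stopped or traced, and reads (and, unless WNOWAIT, clears) exit_code inside that single critical section, which is what lets the old xchg()/tasklist_lock dance above be deleted. A pthread-based sketch of that read-and-consume pattern; the structure and field names are stand-ins, not task_struct.

#include <pthread.h>
#include <stdio.h>

struct toy_child {
	pthread_mutex_t siglock;
	int stopped;
	int exit_code;
};

static int toy_collect_stop_code(struct toy_child *c, int peek_only)
{
	int code = 0;

	pthread_mutex_lock(&c->siglock);
	if (c->stopped && c->exit_code) {
		code = c->exit_code;
		if (!peek_only)
			c->exit_code = 0;      /* consume the report */
	}
	pthread_mutex_unlock(&c->siglock);
	return code;
}

int main(void)
{
	struct toy_child c = { PTHREAD_MUTEX_INITIALIZER, 1, 0x13 };

	printf("peek: %#x\n", toy_collect_stop_code(&c, 1));
	printf("reap: %#x\n", toy_collect_stop_code(&c, 0));
	printf("again: %#x\n", toy_collect_stop_code(&c, 0));
	return 0;
}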
@@ -1442,15 +1382,13 @@ bail_ref:
1442 if (!retval && infop) 1382 if (!retval && infop)
1443 retval = put_user(0, &infop->si_errno); 1383 retval = put_user(0, &infop->si_errno);
1444 if (!retval && infop) 1384 if (!retval && infop)
1445 retval = put_user((short)((p->ptrace & PT_PTRACED) 1385 retval = put_user(why, &infop->si_code);
1446 ? CLD_TRAPPED : CLD_STOPPED),
1447 &infop->si_code);
1448 if (!retval && infop) 1386 if (!retval && infop)
1449 retval = put_user(exit_code, &infop->si_status); 1387 retval = put_user(exit_code, &infop->si_status);
1450 if (!retval && infop) 1388 if (!retval && infop)
1451 retval = put_user(pid, &infop->si_pid); 1389 retval = put_user(pid, &infop->si_pid);
1452 if (!retval && infop) 1390 if (!retval && infop)
1453 retval = put_user(p->uid, &infop->si_uid); 1391 retval = put_user(uid, &infop->si_uid);
1454 if (!retval) 1392 if (!retval)
1455 retval = pid; 1393 retval = pid;
1456 put_task_struct(p); 1394 put_task_struct(p);
@@ -1472,7 +1410,6 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1472 int retval; 1410 int retval;
1473 pid_t pid; 1411 pid_t pid;
1474 uid_t uid; 1412 uid_t uid;
1475 struct pid_namespace *ns;
1476 1413
1477 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1414 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1478 return 0; 1415 return 0;
@@ -1487,8 +1424,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1487 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1424 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1488 spin_unlock_irq(&p->sighand->siglock); 1425 spin_unlock_irq(&p->sighand->siglock);
1489 1426
1490 ns = current->nsproxy->pid_ns; 1427 pid = task_pid_vnr(p);
1491 pid = task_pid_nr_ns(p, ns);
1492 uid = p->uid; 1428 uid = p->uid;
1493 get_task_struct(p); 1429 get_task_struct(p);
1494 read_unlock(&tasklist_lock); 1430 read_unlock(&tasklist_lock);
@@ -1499,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1499 if (!retval && stat_addr) 1435 if (!retval && stat_addr)
1500 retval = put_user(0xffff, stat_addr); 1436 retval = put_user(0xffff, stat_addr);
1501 if (!retval) 1437 if (!retval)
1502 retval = task_pid_nr_ns(p, ns); 1438 retval = pid;
1503 } else { 1439 } else {
1504 retval = wait_noreap_copyout(p, pid, uid, 1440 retval = wait_noreap_copyout(p, pid, uid,
1505 CLD_CONTINUED, SIGCONT, 1441 CLD_CONTINUED, SIGCONT,
@@ -1510,103 +1446,63 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1510 return retval; 1446 return retval;
1511} 1447}
1512 1448
1513 1449static long do_wait(enum pid_type type, struct pid *pid, int options,
1514static inline int my_ptrace_child(struct task_struct *p) 1450 struct siginfo __user *infop, int __user *stat_addr,
1515{ 1451 struct rusage __user *ru)
1516 if (!(p->ptrace & PT_PTRACED))
1517 return 0;
1518 if (!(p->ptrace & PT_ATTACHED))
1519 return 1;
1520 /*
1521 * This child was PTRACE_ATTACH'd. We should be seeing it only if
1522 * we are the attacher. If we are the real parent, this is a race
1523 * inside ptrace_attach. It is waiting for the tasklist_lock,
1524 * which we have to switch the parent links, but has already set
1525 * the flags in p->ptrace.
1526 */
1527 return (p->parent != p->real_parent);
1528}
1529
1530static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
1531 int __user *stat_addr, struct rusage __user *ru)
1532{ 1452{
1533 DECLARE_WAITQUEUE(wait, current); 1453 DECLARE_WAITQUEUE(wait, current);
1534 struct task_struct *tsk; 1454 struct task_struct *tsk;
1535 int flag, retval; 1455 int flag, retval;
1536 int allowed, denied;
1537 1456
1538 add_wait_queue(&current->signal->wait_chldexit,&wait); 1457 add_wait_queue(&current->signal->wait_chldexit,&wait);
1539repeat: 1458repeat:
1459 /* If there is nothing that can match our criteria just get out */
1460 retval = -ECHILD;
1461 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
1462 goto end;
1463
1540 /* 1464 /*
1541 * We will set this flag if we see any child that might later 1465 * We will set this flag if we see any child that might later
1542 * match our criteria, even if we are not able to reap it yet. 1466 * match our criteria, even if we are not able to reap it yet.
1543 */ 1467 */
1544 flag = 0; 1468 flag = retval = 0;
1545 allowed = denied = 0;
1546 current->state = TASK_INTERRUPTIBLE; 1469 current->state = TASK_INTERRUPTIBLE;
1547 read_lock(&tasklist_lock); 1470 read_lock(&tasklist_lock);
1548 tsk = current; 1471 tsk = current;
1549 do { 1472 do {
1550 struct task_struct *p; 1473 struct task_struct *p;
1551 int ret;
1552 1474
1553 list_for_each_entry(p, &tsk->children, sibling) { 1475 list_for_each_entry(p, &tsk->children, sibling) {
1554 ret = eligible_child(pid, options, p); 1476 int ret = eligible_child(type, pid, options, p);
1555 if (!ret) 1477 if (!ret)
1556 continue; 1478 continue;
1557 1479
1558 if (unlikely(ret < 0)) { 1480 if (unlikely(ret < 0)) {
1559 denied = ret; 1481 retval = ret;
1560 continue; 1482 } else if (task_is_stopped_or_traced(p)) {
1561 }
1562 allowed = 1;
1563
1564 if (task_is_stopped_or_traced(p)) {
1565 /* 1483 /*
1566 * It's stopped now, so it might later 1484 * It's stopped now, so it might later
1567 * continue, exit, or stop again. 1485 * continue, exit, or stop again.
1568 *
1569 * When we hit the race with PTRACE_ATTACH, we
1570 * will not report this child. But the race
1571 * means it has not yet been moved to our
1572 * ptrace_children list, so we need to set the
1573 * flag here to avoid a spurious ECHILD when
1574 * the race happens with the only child.
1575 */ 1486 */
1576 flag = 1; 1487 flag = 1;
1488 if (!(p->ptrace & PT_PTRACED) &&
1489 !(options & WUNTRACED))
1490 continue;
1577 1491
1578 if (!my_ptrace_child(p)) { 1492 retval = wait_task_stopped(p,
1579 if (task_is_traced(p))
1580 continue;
1581 if (!(options & WUNTRACED))
1582 continue;
1583 }
1584
1585 retval = wait_task_stopped(p, ret == 2,
1586 (options & WNOWAIT), infop, 1493 (options & WNOWAIT), infop,
1587 stat_addr, ru); 1494 stat_addr, ru);
1588 if (retval == -EAGAIN) 1495 } else if (p->exit_state == EXIT_ZOMBIE &&
1589 goto repeat; 1496 !delay_group_leader(p)) {
1590 if (retval != 0) /* He released the lock. */
1591 goto end;
1592 } else if (p->exit_state == EXIT_DEAD) {
1593 continue;
1594 } else if (p->exit_state == EXIT_ZOMBIE) {
1595 /* 1497 /*
1596 * Eligible but we cannot release it yet: 1498 * We don't reap group leaders with subthreads.
1597 */ 1499 */
1598 if (ret == 2)
1599 goto check_continued;
1600 if (!likely(options & WEXITED)) 1500 if (!likely(options & WEXITED))
1601 continue; 1501 continue;
1602 retval = wait_task_zombie(p, 1502 retval = wait_task_zombie(p,
1603 (options & WNOWAIT), infop, 1503 (options & WNOWAIT), infop,
1604 stat_addr, ru); 1504 stat_addr, ru);
1605 /* He released the lock. */ 1505 } else if (p->exit_state != EXIT_DEAD) {
1606 if (retval != 0)
1607 goto end;
1608 } else {
1609check_continued:
1610 /* 1506 /*
1611 * It's running now, so it might later 1507 * It's running now, so it might later
1612 * exit, stop, or stop and then continue. 1508 * exit, stop, or stop and then continue.
@@ -1617,17 +1513,20 @@ check_continued:
1617 retval = wait_task_continued(p, 1513 retval = wait_task_continued(p,
1618 (options & WNOWAIT), infop, 1514 (options & WNOWAIT), infop,
1619 stat_addr, ru); 1515 stat_addr, ru);
1620 if (retval != 0) /* He released the lock. */
1621 goto end;
1622 } 1516 }
1517 if (retval != 0) /* tasklist_lock released */
1518 goto end;
1623 } 1519 }
1624 if (!flag) { 1520 if (!flag) {
1625 list_for_each_entry(p, &tsk->ptrace_children, 1521 list_for_each_entry(p, &tsk->ptrace_children,
1626 ptrace_list) { 1522 ptrace_list) {
1627 if (!eligible_child(pid, options, p)) 1523 flag = eligible_child(type, pid, options, p);
1524 if (!flag)
1628 continue; 1525 continue;
1629 flag = 1; 1526 if (likely(flag > 0))
1630 break; 1527 break;
1528 retval = flag;
1529 goto end;
1631 } 1530 }
1632 } 1531 }
1633 if (options & __WNOTHREAD) 1532 if (options & __WNOTHREAD)
@@ -1635,10 +1534,9 @@ check_continued:
1635 tsk = next_thread(tsk); 1534 tsk = next_thread(tsk);
1636 BUG_ON(tsk->signal != current->signal); 1535 BUG_ON(tsk->signal != current->signal);
1637 } while (tsk != current); 1536 } while (tsk != current);
1638
1639 read_unlock(&tasklist_lock); 1537 read_unlock(&tasklist_lock);
1538
1640 if (flag) { 1539 if (flag) {
1641 retval = 0;
1642 if (options & WNOHANG) 1540 if (options & WNOHANG)
1643 goto end; 1541 goto end;
1644 retval = -ERESTARTSYS; 1542 retval = -ERESTARTSYS;
@@ -1648,14 +1546,12 @@ check_continued:
1648 goto repeat; 1546 goto repeat;
1649 } 1547 }
1650 retval = -ECHILD; 1548 retval = -ECHILD;
1651 if (unlikely(denied) && !allowed)
1652 retval = denied;
1653end: 1549end:
1654 current->state = TASK_RUNNING; 1550 current->state = TASK_RUNNING;
1655 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1551 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1656 if (infop) { 1552 if (infop) {
1657 if (retval > 0) 1553 if (retval > 0)
1658 retval = 0; 1554 retval = 0;
1659 else { 1555 else {
1660 /* 1556 /*
1661 * For a WNOHANG return, clear out all the fields 1557 * For a WNOHANG return, clear out all the fields
@@ -1679,10 +1575,12 @@ end:
1679 return retval; 1575 return retval;
1680} 1576}
1681 1577
1682asmlinkage long sys_waitid(int which, pid_t pid, 1578asmlinkage long sys_waitid(int which, pid_t upid,
1683 struct siginfo __user *infop, int options, 1579 struct siginfo __user *infop, int options,
1684 struct rusage __user *ru) 1580 struct rusage __user *ru)
1685{ 1581{
1582 struct pid *pid = NULL;
1583 enum pid_type type;
1686 long ret; 1584 long ret;
1687 1585
1688 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) 1586 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
@@ -1692,37 +1590,58 @@ asmlinkage long sys_waitid(int which, pid_t pid,
1692 1590
1693 switch (which) { 1591 switch (which) {
1694 case P_ALL: 1592 case P_ALL:
1695 pid = -1; 1593 type = PIDTYPE_MAX;
1696 break; 1594 break;
1697 case P_PID: 1595 case P_PID:
1698 if (pid <= 0) 1596 type = PIDTYPE_PID;
1597 if (upid <= 0)
1699 return -EINVAL; 1598 return -EINVAL;
1700 break; 1599 break;
1701 case P_PGID: 1600 case P_PGID:
1702 if (pid <= 0) 1601 type = PIDTYPE_PGID;
1602 if (upid <= 0)
1703 return -EINVAL; 1603 return -EINVAL;
1704 pid = -pid;
1705 break; 1604 break;
1706 default: 1605 default:
1707 return -EINVAL; 1606 return -EINVAL;
1708 } 1607 }
1709 1608
1710 ret = do_wait(pid, options, infop, NULL, ru); 1609 if (type < PIDTYPE_MAX)
1610 pid = find_get_pid(upid);
1611 ret = do_wait(type, pid, options, infop, NULL, ru);
1612 put_pid(pid);
1711 1613
1712 /* avoid REGPARM breakage on x86: */ 1614 /* avoid REGPARM breakage on x86: */
1713 prevent_tail_call(ret); 1615 prevent_tail_call(ret);
1714 return ret; 1616 return ret;
1715} 1617}
1716 1618
1717asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, 1619asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1718 int options, struct rusage __user *ru) 1620 int options, struct rusage __user *ru)
1719{ 1621{
1622 struct pid *pid = NULL;
1623 enum pid_type type;
1720 long ret; 1624 long ret;
1721 1625
1722 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| 1626 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1723 __WNOTHREAD|__WCLONE|__WALL)) 1627 __WNOTHREAD|__WCLONE|__WALL))
1724 return -EINVAL; 1628 return -EINVAL;
1725 ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); 1629
1630 if (upid == -1)
1631 type = PIDTYPE_MAX;
1632 else if (upid < 0) {
1633 type = PIDTYPE_PGID;
1634 pid = find_get_pid(-upid);
1635 } else if (upid == 0) {
1636 type = PIDTYPE_PGID;
1637 pid = get_pid(task_pgrp(current));
1638 } else /* upid > 0 */ {
1639 type = PIDTYPE_PID;
1640 pid = find_get_pid(upid);
1641 }
1642
1643 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
1644 put_pid(pid);
1726 1645
1727 /* avoid REGPARM breakage on x86: */ 1646 /* avoid REGPARM breakage on x86: */
1728 prevent_tail_call(ret); 1647 prevent_tail_call(ret);
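
For reference, the reworked sys_wait4() above boils down to a small classification of the user-supplied pid argument before the struct pid lookup. A minimal standalone sketch of just that mapping (plain userspace C; names are local to this example, not the kernel's code):

#include <stdio.h>

enum pid_type { PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_MAX };

static enum pid_type classify_wait4_arg(long upid)
{
	if (upid == -1)
		return PIDTYPE_MAX;	/* wait for any child */
	if (upid < 0)
		return PIDTYPE_PGID;	/* wait for process group -upid */
	if (upid == 0)
		return PIDTYPE_PGID;	/* wait for the caller's own process group */
	return PIDTYPE_PID;		/* wait for exactly this pid */
}

int main(void)
{
	long samples[] = { -1, -42, 0, 1234 };

	for (int i = 0; i < 4; i++)
		printf("upid %ld -> pid type %d\n",
		       samples[i], (int)classify_wait4_arg(samples[i]));
	return 0;
}
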
diff --git a/kernel/fork.c b/kernel/fork.c
index 05e0b6f4365b..dd249c37b3a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/audit.h> 42#include <linux/audit.h>
43#include <linux/memcontrol.h>
43#include <linux/profile.h> 44#include <linux/profile.h>
44#include <linux/rmap.h> 45#include <linux/rmap.h>
45#include <linux/acct.h> 46#include <linux/acct.h>
@@ -325,7 +326,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
325 326
326static inline void mm_free_pgd(struct mm_struct * mm) 327static inline void mm_free_pgd(struct mm_struct * mm)
327{ 328{
328 pgd_free(mm->pgd); 329 pgd_free(mm, mm->pgd);
329} 330}
330#else 331#else
331#define dup_mmap(mm, oldmm) (0) 332#define dup_mmap(mm, oldmm) (0)
@@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
340 341
341#include <linux/init_task.h> 342#include <linux/init_task.h>
342 343
343static struct mm_struct * mm_init(struct mm_struct * mm) 344static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
344{ 345{
345 atomic_set(&mm->mm_users, 1); 346 atomic_set(&mm->mm_users, 1);
346 atomic_set(&mm->mm_count, 1); 347 atomic_set(&mm->mm_count, 1);
@@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
357 mm->ioctx_list = NULL; 358 mm->ioctx_list = NULL;
358 mm->free_area_cache = TASK_UNMAPPED_BASE; 359 mm->free_area_cache = TASK_UNMAPPED_BASE;
359 mm->cached_hole_size = ~0UL; 360 mm->cached_hole_size = ~0UL;
361 mm_init_cgroup(mm, p);
360 362
361 if (likely(!mm_alloc_pgd(mm))) { 363 if (likely(!mm_alloc_pgd(mm))) {
362 mm->def_flags = 0; 364 mm->def_flags = 0;
363 return mm; 365 return mm;
364 } 366 }
367
368 mm_free_cgroup(mm);
365 free_mm(mm); 369 free_mm(mm);
366 return NULL; 370 return NULL;
367} 371}
@@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void)
376 mm = allocate_mm(); 380 mm = allocate_mm();
377 if (mm) { 381 if (mm) {
378 memset(mm, 0, sizeof(*mm)); 382 memset(mm, 0, sizeof(*mm));
379 mm = mm_init(mm); 383 mm = mm_init(mm, current);
380 } 384 }
381 return mm; 385 return mm;
382} 386}
@@ -386,10 +390,11 @@ struct mm_struct * mm_alloc(void)
386 * is dropped: either by a lazy thread or by 390 * is dropped: either by a lazy thread or by
387 * mmput. Free the page directory and the mm. 391 * mmput. Free the page directory and the mm.
388 */ 392 */
389void fastcall __mmdrop(struct mm_struct *mm) 393void __mmdrop(struct mm_struct *mm)
390{ 394{
391 BUG_ON(mm == &init_mm); 395 BUG_ON(mm == &init_mm);
392 mm_free_pgd(mm); 396 mm_free_pgd(mm);
397 mm_free_cgroup(mm);
393 destroy_context(mm); 398 destroy_context(mm);
394 free_mm(mm); 399 free_mm(mm);
395} 400}
@@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
511 mm->token_priority = 0; 516 mm->token_priority = 0;
512 mm->last_interval = 0; 517 mm->last_interval = 0;
513 518
514 if (!mm_init(mm)) 519 if (!mm_init(mm, tsk))
515 goto fail_nomem; 520 goto fail_nomem;
516 521
517 if (init_new_context(tsk, mm)) 522 if (init_new_context(tsk, mm))
@@ -595,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
595 rwlock_init(&fs->lock); 600 rwlock_init(&fs->lock);
596 fs->umask = old->umask; 601 fs->umask = old->umask;
597 read_lock(&old->lock); 602 read_lock(&old->lock);
598 fs->rootmnt = mntget(old->rootmnt); 603 fs->root = old->root;
599 fs->root = dget(old->root); 604 path_get(&old->root);
600 fs->pwdmnt = mntget(old->pwdmnt); 605 fs->pwd = old->pwd;
601 fs->pwd = dget(old->pwd); 606 path_get(&old->pwd);
602 if (old->altroot) { 607 if (old->altroot.dentry) {
603 fs->altrootmnt = mntget(old->altrootmnt); 608 fs->altroot = old->altroot;
604 fs->altroot = dget(old->altroot); 609 path_get(&old->altroot);
605 } else { 610 } else {
606 fs->altrootmnt = NULL; 611 fs->altroot.mnt = NULL;
607 fs->altroot = NULL; 612 fs->altroot.dentry = NULL;
608 } 613 }
609 read_unlock(&old->lock); 614 read_unlock(&old->lock);
610 } 615 }
@@ -904,7 +909,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 909 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
905 sig->it_real_incr.tv64 = 0; 910 sig->it_real_incr.tv64 = 0;
906 sig->real_timer.function = it_real_fn; 911 sig->real_timer.function = it_real_fn;
907 sig->tsk = tsk;
908 912
909 sig->it_virt_expires = cputime_zero; 913 sig->it_virt_expires = cputime_zero;
910 sig->it_virt_incr = cputime_zero; 914 sig->it_virt_incr = cputime_zero;
@@ -1118,6 +1122,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1118#ifdef CONFIG_SECURITY 1122#ifdef CONFIG_SECURITY
1119 p->security = NULL; 1123 p->security = NULL;
1120#endif 1124#endif
1125 p->cap_bset = current->cap_bset;
1121 p->io_context = NULL; 1126 p->io_context = NULL;
1122 p->audit_context = NULL; 1127 p->audit_context = NULL;
1123 cgroup_fork(p); 1128 cgroup_fork(p);
@@ -1332,6 +1337,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1332 if (clone_flags & CLONE_NEWPID) 1337 if (clone_flags & CLONE_NEWPID)
1333 p->nsproxy->pid_ns->child_reaper = p; 1338 p->nsproxy->pid_ns->child_reaper = p;
1334 1339
1340 p->signal->leader_pid = pid;
1335 p->signal->tty = current->signal->tty; 1341 p->signal->tty = current->signal->tty;
1336 set_task_pgrp(p, task_pgrp_nr(current)); 1342 set_task_pgrp(p, task_pgrp_nr(current));
1337 set_task_session(p, task_session_nr(current)); 1343 set_task_session(p, task_session_nr(current));
@@ -1398,7 +1404,7 @@ fork_out:
1398 return ERR_PTR(retval); 1404 return ERR_PTR(retval);
1399} 1405}
1400 1406
1401noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1407noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1402{ 1408{
1403 memset(regs, 0, sizeof(struct pt_regs)); 1409 memset(regs, 0, sizeof(struct pt_regs));
1404 return regs; 1410 return regs;
@@ -1450,6 +1456,23 @@ long do_fork(unsigned long clone_flags,
1450 int trace = 0; 1456 int trace = 0;
1451 long nr; 1457 long nr;
1452 1458
1459 /*
1460 * We hope to recycle these flags after 2.6.26
1461 */
1462 if (unlikely(clone_flags & CLONE_STOPPED)) {
1463 static int __read_mostly count = 100;
1464
1465 if (count > 0 && printk_ratelimit()) {
1466 char comm[TASK_COMM_LEN];
1467
1468 count--;
1469 printk(KERN_INFO "fork(): process `%s' used deprecated "
1470 "clone flags 0x%lx\n",
1471 get_task_comm(comm, current),
1472 clone_flags & CLONE_STOPPED);
1473 }
1474 }
1475
1453 if (unlikely(current->ptrace)) { 1476 if (unlikely(current->ptrace)) {
1454 trace = fork_traceflag (clone_flags); 1477 trace = fork_traceflag (clone_flags);
1455 if (trace) 1478 if (trace)
@@ -1465,13 +1488,7 @@ long do_fork(unsigned long clone_flags,
1465 if (!IS_ERR(p)) { 1488 if (!IS_ERR(p)) {
1466 struct completion vfork; 1489 struct completion vfork;
1467 1490
1468 /* 1491 nr = task_pid_vnr(p);
1469 * this is enough to call pid_nr_ns here, but this if
1470 * improves optimisation of regular fork()
1471 */
1472 nr = (clone_flags & CLONE_NEWPID) ?
1473 task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1474 task_pid_vnr(p);
1475 1492
1476 if (clone_flags & CLONE_PARENT_SETTID) 1493 if (clone_flags & CLONE_PARENT_SETTID)
1477 put_user(nr, parent_tidptr); 1494 put_user(nr, parent_tidptr);
@@ -1492,7 +1509,7 @@ long do_fork(unsigned long clone_flags,
1492 if (!(clone_flags & CLONE_STOPPED)) 1509 if (!(clone_flags & CLONE_STOPPED))
1493 wake_up_new_task(p, clone_flags); 1510 wake_up_new_task(p, clone_flags);
1494 else 1511 else
1495 p->state = TASK_STOPPED; 1512 __set_task_state(p, TASK_STOPPED);
1496 1513
1497 if (unlikely (trace)) { 1514 if (unlikely (trace)) {
1498 current->ptrace_message = nr; 1515 current->ptrace_message = nr;
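
The CLONE_STOPPED hunk above adds a capped, rate-limited deprecation warning instead of refusing the flag outright. A rough userspace analogue of that pattern (ratelimit() here is only a stand-in for printk_ratelimit(), not a kernel interface):

#include <stdio.h>
#include <time.h>

#define CLONE_STOPPED 0x02000000

/* crude stand-in for printk_ratelimit(): allow one message per second */
static int ratelimit(void)
{
	static time_t last;
	time_t now = time(NULL);

	if (now == last)
		return 0;
	last = now;
	return 1;
}

static void warn_deprecated_clone(unsigned long clone_flags, const char *comm)
{
	static int count = 100;		/* overall cap, as in the hunk above */

	if (!(clone_flags & CLONE_STOPPED))
		return;
	if (count > 0 && ratelimit()) {
		count--;
		printf("fork(): process `%s' used deprecated clone flags 0x%lx\n",
		       comm, clone_flags & CLONE_STOPPED);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		warn_deprecated_clone(CLONE_STOPPED, "demo");
	return 0;
}
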
diff --git a/kernel/futex.c b/kernel/futex.c
index a6baaec44b8f..221f2128a437 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2116,7 +2116,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2116 2116
2117 t = timespec_to_ktime(ts); 2117 t = timespec_to_ktime(ts);
2118 if (cmd == FUTEX_WAIT) 2118 if (cmd == FUTEX_WAIT)
2119 t = ktime_add(ktime_get(), t); 2119 t = ktime_add_safe(ktime_get(), t);
2120 tp = &t; 2120 tp = &t;
2121 } 2121 }
2122 /* 2122 /*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 133d558db452..7d5e4b016f39 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -176,7 +176,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
176 176
177 t = timespec_to_ktime(ts); 177 t = timespec_to_ktime(ts);
178 if (cmd == FUTEX_WAIT) 178 if (cmd == FUTEX_WAIT)
179 t = ktime_add(ktime_get(), t); 179 t = ktime_add_safe(ktime_get(), t);
180 tp = &t; 180 tp = &t;
181 } 181 }
182 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 182 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
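
Both futex paths now go through ktime_add_safe(), introduced in the hrtimer.c hunks just below: rather than letting a huge relative timeout wrap into the past, the sum is clamped to the largest representable timeout. A standalone sketch of that saturating addition on plain 64-bit nanosecond values (the constants are illustrative, not the kernel's ktime internals):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000LL
#define SEC_MAX		(INT64_MAX / NSEC_PER_SEC)	/* analogue of KTIME_SEC_MAX */

/* add two nanosecond values, clamping instead of wrapping on overflow */
static int64_t add_ns_safe(int64_t lhs, int64_t rhs)
{
	/* do the add unsigned to avoid signed-overflow undefined behaviour */
	int64_t res = (int64_t)((uint64_t)lhs + (uint64_t)rhs);

	if (res < 0 || res < lhs || res < rhs)
		res = SEC_MAX * NSEC_PER_SEC;
	return res;
}

int main(void)
{
	printf("near overflow: %lld\n", (long long)add_ns_safe(INT64_MAX - 5, 100));
	printf("normal case:   %lld\n", (long long)add_ns_safe(1000, 2000));
	return 0;
}
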
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1069998fe25f..98bee013f71f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
306/* 306/*
307 * Divide a ktime value by a nanosecond value 307 * Divide a ktime value by a nanosecond value
308 */ 308 */
309unsigned long ktime_divns(const ktime_t kt, s64 div) 309u64 ktime_divns(const ktime_t kt, s64 div)
310{ 310{
311 u64 dclc, inc, dns; 311 u64 dclc, inc, dns;
312 int sft = 0; 312 int sft = 0;
@@ -321,11 +321,28 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
321 dclc >>= sft; 321 dclc >>= sft;
322 do_div(dclc, (unsigned long) div); 322 do_div(dclc, (unsigned long) div);
323 323
324 return (unsigned long) dclc; 324 return dclc;
325} 325}
326#endif /* BITS_PER_LONG >= 64 */ 326#endif /* BITS_PER_LONG >= 64 */
327 327
328/* 328/*
329 * Add two ktime values and do a safety check for overflow:
330 */
331ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
332{
333 ktime_t res = ktime_add(lhs, rhs);
334
335 /*
336 * We use KTIME_SEC_MAX here, the maximum timeout which we can
337 * return to user space in a timespec:
338 */
339 if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
340 res = ktime_set(KTIME_SEC_MAX, 0);
341
342 return res;
343}
344
345/*
329 * Check, whether the timer is on the callback pending list 346 * Check, whether the timer is on the callback pending list
330 */ 347 */
331static inline int hrtimer_cb_pending(const struct hrtimer *timer) 348static inline int hrtimer_cb_pending(const struct hrtimer *timer)
@@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer,
425 ktime_t expires = ktime_sub(timer->expires, base->offset); 442 ktime_t expires = ktime_sub(timer->expires, base->offset);
426 int res; 443 int res;
427 444
445 WARN_ON_ONCE(timer->expires.tv64 < 0);
446
428 /* 447 /*
429 * When the callback is running, we do not reprogram the clock event 448 * When the callback is running, we do not reprogram the clock event
430 * device. The timer callback is either running on a different CPU or 449 * device. The timer callback is either running on a different CPU or
@@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
435 if (hrtimer_callback_running(timer)) 454 if (hrtimer_callback_running(timer))
436 return 0; 455 return 0;
437 456
457 /*
458 * CLOCK_REALTIME timer might be requested with an absolute
459 * expiry time which is less than base->offset. Nothing wrong
460 * about that, just avoid to call into the tick code, which
461 * has now objections against negative expiry values.
462 */
463 if (expires.tv64 < 0)
464 return -ETIME;
465
438 if (expires.tv64 >= expires_next->tv64) 466 if (expires.tv64 >= expires_next->tv64)
439 return 0; 467 return 0;
440 468
@@ -656,10 +684,9 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
656 * Forward the timer expiry so it will expire in the future. 684 * Forward the timer expiry so it will expire in the future.
657 * Returns the number of overruns. 685 * Returns the number of overruns.
658 */ 686 */
659unsigned long 687u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
660hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
661{ 688{
662 unsigned long orun = 1; 689 u64 orun = 1;
663 ktime_t delta; 690 ktime_t delta;
664 691
665 delta = ktime_sub(now, timer->expires); 692 delta = ktime_sub(now, timer->expires);
@@ -683,13 +710,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
683 */ 710 */
684 orun++; 711 orun++;
685 } 712 }
686 timer->expires = ktime_add(timer->expires, interval); 713 timer->expires = ktime_add_safe(timer->expires, interval);
687 /*
688 * Make sure, that the result did not wrap with a very large
689 * interval.
690 */
691 if (timer->expires.tv64 < 0)
692 timer->expires = ktime_set(KTIME_SEC_MAX, 0);
693 714
694 return orun; 715 return orun;
695} 716}
@@ -840,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
840 new_base = switch_hrtimer_base(timer, base); 861 new_base = switch_hrtimer_base(timer, base);
841 862
842 if (mode == HRTIMER_MODE_REL) { 863 if (mode == HRTIMER_MODE_REL) {
843 tim = ktime_add(tim, new_base->get_time()); 864 tim = ktime_add_safe(tim, new_base->get_time());
844 /* 865 /*
845 * CONFIG_TIME_LOW_RES is a temporary way for architectures 866 * CONFIG_TIME_LOW_RES is a temporary way for architectures
846 * to signal that they simply return xtime in 867 * to signal that they simply return xtime in
@@ -849,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
849 * timeouts. This will go away with the GTOD framework. 870 * timeouts. This will go away with the GTOD framework.
850 */ 871 */
851#ifdef CONFIG_TIME_LOW_RES 872#ifdef CONFIG_TIME_LOW_RES
852 tim = ktime_add(tim, base->resolution); 873 tim = ktime_add_safe(tim, base->resolution);
853#endif 874#endif
854 /*
855 * Careful here: User space might have asked for a
856 * very long sleep, so the add above might result in a
857 * negative number, which enqueues the timer in front
858 * of the queue.
859 */
860 if (tim.tv64 < 0)
861 tim.tv64 = KTIME_MAX;
862 } 875 }
863 timer->expires = tim; 876 timer->expires = tim;
864 877
@@ -1320,13 +1333,26 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1320 return t->task == NULL; 1333 return t->task == NULL;
1321} 1334}
1322 1335
1336static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1337{
1338 struct timespec rmt;
1339 ktime_t rem;
1340
1341 rem = ktime_sub(timer->expires, timer->base->get_time());
1342 if (rem.tv64 <= 0)
1343 return 0;
1344 rmt = ktime_to_timespec(rem);
1345
1346 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1347 return -EFAULT;
1348
1349 return 1;
1350}
1351
1323long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 1352long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1324{ 1353{
1325 struct hrtimer_sleeper t; 1354 struct hrtimer_sleeper t;
1326 struct timespec *rmtp; 1355 struct timespec __user *rmtp;
1327 ktime_t time;
1328
1329 restart->fn = do_no_restart_syscall;
1330 1356
1331 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); 1357 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
1332 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1358 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
@@ -1334,26 +1360,22 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1334 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1360 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1335 return 0; 1361 return 0;
1336 1362
1337 rmtp = (struct timespec *)restart->arg1; 1363 rmtp = (struct timespec __user *)restart->arg1;
1338 if (rmtp) { 1364 if (rmtp) {
1339 time = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1365 int ret = update_rmtp(&t.timer, rmtp);
1340 if (time.tv64 <= 0) 1366 if (ret <= 0)
1341 return 0; 1367 return ret;
1342 *rmtp = ktime_to_timespec(time);
1343 } 1368 }
1344 1369
1345 restart->fn = hrtimer_nanosleep_restart;
1346
1347 /* The other values in restart are already filled in */ 1370 /* The other values in restart are already filled in */
1348 return -ERESTART_RESTARTBLOCK; 1371 return -ERESTART_RESTARTBLOCK;
1349} 1372}
1350 1373
1351long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, 1374long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1352 const enum hrtimer_mode mode, const clockid_t clockid) 1375 const enum hrtimer_mode mode, const clockid_t clockid)
1353{ 1376{
1354 struct restart_block *restart; 1377 struct restart_block *restart;
1355 struct hrtimer_sleeper t; 1378 struct hrtimer_sleeper t;
1356 ktime_t rem;
1357 1379
1358 hrtimer_init(&t.timer, clockid, mode); 1380 hrtimer_init(&t.timer, clockid, mode);
1359 t.timer.expires = timespec_to_ktime(*rqtp); 1381 t.timer.expires = timespec_to_ktime(*rqtp);
@@ -1365,10 +1387,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
1365 return -ERESTARTNOHAND; 1387 return -ERESTARTNOHAND;
1366 1388
1367 if (rmtp) { 1389 if (rmtp) {
1368 rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1390 int ret = update_rmtp(&t.timer, rmtp);
1369 if (rem.tv64 <= 0) 1391 if (ret <= 0)
1370 return 0; 1392 return ret;
1371 *rmtp = ktime_to_timespec(rem);
1372 } 1393 }
1373 1394
1374 restart = &current_thread_info()->restart_block; 1395 restart = &current_thread_info()->restart_block;
@@ -1384,8 +1405,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
1384asmlinkage long 1405asmlinkage long
1385sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1406sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1386{ 1407{
1387 struct timespec tu, rmt; 1408 struct timespec tu;
1388 int ret;
1389 1409
1390 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1410 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1391 return -EFAULT; 1411 return -EFAULT;
@@ -1393,15 +1413,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1393 if (!timespec_valid(&tu)) 1413 if (!timespec_valid(&tu))
1394 return -EINVAL; 1414 return -EINVAL;
1395 1415
1396 ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL, 1416 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1397 CLOCK_MONOTONIC);
1398
1399 if (ret && rmtp) {
1400 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1401 return -EFAULT;
1402 }
1403
1404 return ret;
1405} 1417}
1406 1418
1407/* 1419/*
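
The nanosleep rework above factors the "how much sleep is left" copy-out into a single update_rmtp() helper with a small tristate result. A minimal userspace sketch of that contract (plain struct timespec arithmetic; the negative value is only a placeholder for the -EFAULT copy failure):

#include <stdio.h>
#include <time.h>

static int update_rmtp(const struct timespec *expires,
		       const struct timespec *now, struct timespec *rmtp)
{
	long long rem = (expires->tv_sec - now->tv_sec) * 1000000000LL
			+ (expires->tv_nsec - now->tv_nsec);

	if (rem <= 0)
		return 0;		/* nothing left, the sleep is complete */
	if (!rmtp)
		return -1;		/* stand-in for the copy_to_user() failure */
	rmtp->tv_sec = rem / 1000000000LL;
	rmtp->tv_nsec = rem % 1000000000LL;
	return 1;			/* remainder written, restart the sleep */
}

int main(void)
{
	struct timespec exp = { 5, 0 }, now = { 2, 500000000 }, rmt;

	if (update_rmtp(&exp, &now, &rmt) == 1)
		printf("remaining: %ld.%09ld s\n", (long)rmt.tv_sec, rmt.tv_nsec);
	return 0;
}
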
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 44019ce30a14..cc54c6276356 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -286,7 +286,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
286 * Note: The caller is expected to handle the ack, clear, mask and 286 * Note: The caller is expected to handle the ack, clear, mask and
287 * unmask issues if necessary. 287 * unmask issues if necessary.
288 */ 288 */
289void fastcall 289void
290handle_simple_irq(unsigned int irq, struct irq_desc *desc) 290handle_simple_irq(unsigned int irq, struct irq_desc *desc)
291{ 291{
292 struct irqaction *action; 292 struct irqaction *action;
@@ -327,7 +327,7 @@ out_unlock:
327 * it after the associated handler has acknowledged the device, so the 327 * it after the associated handler has acknowledged the device, so the
328 * interrupt line is back to inactive. 328 * interrupt line is back to inactive.
329 */ 329 */
330void fastcall 330void
331handle_level_irq(unsigned int irq, struct irq_desc *desc) 331handle_level_irq(unsigned int irq, struct irq_desc *desc)
332{ 332{
333 unsigned int cpu = smp_processor_id(); 333 unsigned int cpu = smp_processor_id();
@@ -375,7 +375,7 @@ out_unlock:
375 * for modern forms of interrupt handlers, which handle the flow 375 * for modern forms of interrupt handlers, which handle the flow
376 * details in hardware, transparently. 376 * details in hardware, transparently.
377 */ 377 */
378void fastcall 378void
379handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 379handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
380{ 380{
381 unsigned int cpu = smp_processor_id(); 381 unsigned int cpu = smp_processor_id();
@@ -434,7 +434,7 @@ out:
434 * the handler was running. If all pending interrupts are handled, the 434 * the handler was running. If all pending interrupts are handled, the
435 * loop is left. 435 * loop is left.
436 */ 436 */
437void fastcall 437void
438handle_edge_irq(unsigned int irq, struct irq_desc *desc) 438handle_edge_irq(unsigned int irq, struct irq_desc *desc)
439{ 439{
440 const unsigned int cpu = smp_processor_id(); 440 const unsigned int cpu = smp_processor_id();
@@ -505,7 +505,7 @@ out_unlock:
505 * 505 *
506 * Per CPU interrupts on SMP machines without locking requirements 506 * Per CPU interrupts on SMP machines without locking requirements
507 */ 507 */
508void fastcall 508void
509handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 509handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
510{ 510{
511 irqreturn_t action_ret; 511 irqreturn_t action_ret;
@@ -589,3 +589,39 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
589 set_irq_chip(irq, chip); 589 set_irq_chip(irq, chip);
590 __set_irq_handler(irq, handle, 0, name); 590 __set_irq_handler(irq, handle, 0, name);
591} 591}
592
593void __init set_irq_noprobe(unsigned int irq)
594{
595 struct irq_desc *desc;
596 unsigned long flags;
597
598 if (irq >= NR_IRQS) {
599 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
600
601 return;
602 }
603
604 desc = irq_desc + irq;
605
606 spin_lock_irqsave(&desc->lock, flags);
607 desc->status |= IRQ_NOPROBE;
608 spin_unlock_irqrestore(&desc->lock, flags);
609}
610
611void __init set_irq_probe(unsigned int irq)
612{
613 struct irq_desc *desc;
614 unsigned long flags;
615
616 if (irq >= NR_IRQS) {
617 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
618
619 return;
620 }
621
622 desc = irq_desc + irq;
623
624 spin_lock_irqsave(&desc->lock, flags);
625 desc->status &= ~IRQ_NOPROBE;
626 spin_unlock_irqrestore(&desc->lock, flags);
627}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index dc335ad27525..5fa6198e9139 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -25,7 +25,7 @@
25 * 25 *
26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
27 */ 27 */
28void fastcall 28void
29handle_bad_irq(unsigned int irq, struct irq_desc *desc) 29handle_bad_irq(unsigned int irq, struct irq_desc *desc)
30{ 30{
31 print_irq_desc(irq, desc); 31 print_irq_desc(irq, desc);
@@ -163,7 +163,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
163 * This is the original x86 implementation which is used for every 163 * This is the original x86 implementation which is used for every
164 * interrupt type. 164 * interrupt type.
165 */ 165 */
166fastcall unsigned int __do_IRQ(unsigned int irq) 166unsigned int __do_IRQ(unsigned int irq)
167{ 167{
168 struct irq_desc *desc = irq_desc + irq; 168 struct irq_desc *desc = irq_desc + irq;
169 struct irqaction *action; 169 struct irqaction *action;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 2fab344dbf56..ab982747d9bd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -132,7 +132,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132 struct signal_struct *sig = 132 struct signal_struct *sig =
133 container_of(timer, struct signal_struct, real_timer); 133 container_of(timer, struct signal_struct, real_timer);
134 134
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 135 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
136 136
137 return HRTIMER_NORESTART; 137 return HRTIMER_NORESTART;
138} 138}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7dadc71ce516..f091d13def00 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -53,14 +53,6 @@ static inline int is_kernel_inittext(unsigned long addr)
53 return 0; 53 return 0;
54} 54}
55 55
56static inline int is_kernel_extratext(unsigned long addr)
57{
58 if (addr >= (unsigned long)_sextratext
59 && addr <= (unsigned long)_eextratext)
60 return 1;
61 return 0;
62}
63
64static inline int is_kernel_text(unsigned long addr) 56static inline int is_kernel_text(unsigned long addr)
65{ 57{
66 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 58 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
@@ -80,8 +72,7 @@ static int is_ksym_addr(unsigned long addr)
80 if (all_var) 72 if (all_var)
81 return is_kernel(addr); 73 return is_kernel(addr);
82 74
83 return is_kernel_text(addr) || is_kernel_inittext(addr) || 75 return is_kernel_text(addr) || is_kernel_inittext(addr);
84 is_kernel_extratext(addr);
85} 76}
86 77
87/* expand a compressed symbol data into the resulting uncompressed string, 78/* expand a compressed symbol data into the resulting uncompressed string,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a26eec9eb04..06a0e2775651 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1361 1361
1362static int __init crash_save_vmcoreinfo_init(void) 1362static int __init crash_save_vmcoreinfo_init(void)
1363{ 1363{
1364 vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); 1364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1365 vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); 1365 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1366 1366
1367 VMCOREINFO_SYMBOL(init_uts_ns); 1367 VMCOREINFO_SYMBOL(init_uts_ns);
1368 VMCOREINFO_SYMBOL(node_online_map); 1368 VMCOREINFO_SYMBOL(node_online_map);
@@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void)
1376#ifdef CONFIG_SPARSEMEM 1376#ifdef CONFIG_SPARSEMEM
1377 VMCOREINFO_SYMBOL(mem_section); 1377 VMCOREINFO_SYMBOL(mem_section);
1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1379 VMCOREINFO_SIZE(mem_section); 1379 VMCOREINFO_STRUCT_SIZE(mem_section);
1380 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1380 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1381#endif 1381#endif
1382 VMCOREINFO_SIZE(page); 1382 VMCOREINFO_STRUCT_SIZE(page);
1383 VMCOREINFO_SIZE(pglist_data); 1383 VMCOREINFO_STRUCT_SIZE(pglist_data);
1384 VMCOREINFO_SIZE(zone); 1384 VMCOREINFO_STRUCT_SIZE(zone);
1385 VMCOREINFO_SIZE(free_area); 1385 VMCOREINFO_STRUCT_SIZE(free_area);
1386 VMCOREINFO_SIZE(list_head); 1386 VMCOREINFO_STRUCT_SIZE(list_head);
1387 VMCOREINFO_TYPEDEF_SIZE(nodemask_t); 1387 VMCOREINFO_SIZE(nodemask_t);
1388 VMCOREINFO_OFFSET(page, flags); 1388 VMCOREINFO_OFFSET(page, flags);
1389 VMCOREINFO_OFFSET(page, _count); 1389 VMCOREINFO_OFFSET(page, _count);
1390 VMCOREINFO_OFFSET(page, mapping); 1390 VMCOREINFO_OFFSET(page, mapping);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb7df2a28bd7..22be3ff3f363 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data)
173 */ 173 */
174 set_user_nice(current, 0); 174 set_user_nice(current, 0);
175 175
176 retval = -EPERM; 176 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
177 if (current->fs->root)
178 retval = kernel_execve(sub_info->path,
179 sub_info->argv, sub_info->envp);
180 177
181 /* Exec failed? */ 178 /* Exec failed? */
182 sub_info->retval = retval; 179 sub_info->retval = retval;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d0493eafea3e..7a86e6432338 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -699,6 +699,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
699 struct kretprobe_instance, uflist); 699 struct kretprobe_instance, uflist);
700 ri->rp = rp; 700 ri->rp = rp;
701 ri->task = current; 701 ri->task = current;
702
703 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
704 spin_unlock_irqrestore(&kretprobe_lock, flags);
705 return 0;
706 }
707
702 arch_prepare_kretprobe(ri, regs); 708 arch_prepare_kretprobe(ri, regs);
703 709
704 /* XXX(hch): why is there no hlist_move_head? */ 710 /* XXX(hch): why is there no hlist_move_head? */
@@ -745,7 +751,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
745 INIT_HLIST_HEAD(&rp->used_instances); 751 INIT_HLIST_HEAD(&rp->used_instances);
746 INIT_HLIST_HEAD(&rp->free_instances); 752 INIT_HLIST_HEAD(&rp->free_instances);
747 for (i = 0; i < rp->maxactive; i++) { 753 for (i = 0; i < rp->maxactive; i++) {
748 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); 754 inst = kmalloc(sizeof(struct kretprobe_instance) +
755 rp->data_size, GFP_KERNEL);
749 if (inst == NULL) { 756 if (inst == NULL) {
750 free_rp_inst(rp); 757 free_rp_inst(rp);
751 return -ENOMEM; 758 return -ENOMEM;
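
The two kprobes hunks above add per-return-instance data (data_size bytes reachable via ri->data) and an optional entry_handler that can veto arming the return probe for a given call. A sketch of how a module might combine them to time a function, modelled loosely on the in-tree kretprobe example; the probed symbol and all names are illustrative only:

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>

struct my_data {
	ktime_t entry_stamp;
};

static int my_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *data = (struct my_data *)ri->data;

	data->entry_stamp = ktime_get();
	return 0;	/* 0: arm the return probe; nonzero would skip this call */
}

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *data = (struct my_data *)ri->data;
	s64 delta = ktime_to_ns(ktime_sub(ktime_get(), data->entry_stamp));

	printk(KERN_INFO "probed function returned after %lld ns\n",
	       (long long)delta);
	return 0;
}

static struct kretprobe my_kretprobe = {
	.entry_handler	= my_entry_handler,
	.handler	= my_ret_handler,
	.data_size	= sizeof(struct my_data),
	.maxactive	= 20,
	.kp.symbol_name	= "do_fork",	/* illustrative target */
};

static int __init my_probe_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit my_probe_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");
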
diff --git a/kernel/latency.c b/kernel/latency.c
deleted file mode 100644
index e63fcacb61a7..000000000000
--- a/kernel/latency.c
+++ /dev/null
@@ -1,280 +0,0 @@
1/*
2 * latency.c: Explicit system-wide latency-expectation infrastructure
3 *
4 * The purpose of this infrastructure is to allow device drivers to set
5 * latency constraint they have and to collect and summarize these
6 * expectations globally. The cummulated result can then be used by
7 * power management and similar users to make decisions that have
8 * tradoffs with a latency component.
9 *
10 * An example user of this are the x86 C-states; each higher C state saves
11 * more power, but has a higher exit latency. For the idle loop power
12 * code to make a good decision which C-state to use, information about
13 * acceptable latencies is required.
14 *
15 * An example announcer of latency is an audio driver that knowns it
16 * will get an interrupt when the hardware has 200 usec of samples
17 * left in the DMA buffer; in that case the driver can set a latency
18 * constraint of, say, 150 usec.
19 *
20 * Multiple drivers can each announce their maximum accepted latency,
21 * to keep these appart, a string based identifier is used.
22 *
23 *
24 * (C) Copyright 2006 Intel Corporation
25 * Author: Arjan van de Ven <arjan@linux.intel.com>
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation; version 2
30 * of the License.
31 */
32
33#include <linux/latency.h>
34#include <linux/list.h>
35#include <linux/spinlock.h>
36#include <linux/slab.h>
37#include <linux/module.h>
38#include <linux/notifier.h>
39#include <linux/jiffies.h>
40#include <asm/atomic.h>
41
42struct latency_info {
43 struct list_head list;
44 int usecs;
45 char *identifier;
46};
47
48/*
49 * locking rule: all modifications to current_max_latency and
50 * latency_list need to be done while holding the latency_lock.
51 * latency_lock needs to be taken _irqsave.
52 */
53static atomic_t current_max_latency;
54static DEFINE_SPINLOCK(latency_lock);
55
56static LIST_HEAD(latency_list);
57static BLOCKING_NOTIFIER_HEAD(latency_notifier);
58
59/*
60 * This function returns the maximum latency allowed, which
61 * happens to be the minimum of all maximum latencies on the
62 * list.
63 */
64static int __find_max_latency(void)
65{
66 int min = INFINITE_LATENCY;
67 struct latency_info *info;
68
69 list_for_each_entry(info, &latency_list, list) {
70 if (info->usecs < min)
71 min = info->usecs;
72 }
73 return min;
74}
75
76/**
77 * set_acceptable_latency - sets the maximum latency acceptable
78 * @identifier: string that identifies this driver
79 * @usecs: maximum acceptable latency for this driver
80 *
81 * This function informs the kernel that this device(driver)
82 * can accept at most usecs latency. This setting is used for
83 * power management and similar tradeoffs.
84 *
85 * This function sleeps and can only be called from process
86 * context.
87 * Calling this function with an existing identifier is valid
88 * and will cause the existing latency setting to be changed.
89 */
90void set_acceptable_latency(char *identifier, int usecs)
91{
92 struct latency_info *info, *iter;
93 unsigned long flags;
94 int found_old = 0;
95
96 info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
97 if (!info)
98 return;
99 info->usecs = usecs;
100 info->identifier = kstrdup(identifier, GFP_KERNEL);
101 if (!info->identifier)
102 goto free_info;
103
104 spin_lock_irqsave(&latency_lock, flags);
105 list_for_each_entry(iter, &latency_list, list) {
106 if (strcmp(iter->identifier, identifier)==0) {
107 found_old = 1;
108 iter->usecs = usecs;
109 break;
110 }
111 }
112 if (!found_old)
113 list_add(&info->list, &latency_list);
114
115 if (usecs < atomic_read(&current_max_latency))
116 atomic_set(&current_max_latency, usecs);
117
118 spin_unlock_irqrestore(&latency_lock, flags);
119
120 blocking_notifier_call_chain(&latency_notifier,
121 atomic_read(&current_max_latency), NULL);
122
123 /*
124 * if we inserted the new one, we're done; otherwise there was
125 * an existing one so we need to free the redundant data
126 */
127 if (!found_old)
128 return;
129
130 kfree(info->identifier);
131free_info:
132 kfree(info);
133}
134EXPORT_SYMBOL_GPL(set_acceptable_latency);
135
136/**
137 * modify_acceptable_latency - changes the maximum latency acceptable
138 * @identifier: string that identifies this driver
139 * @usecs: maximum acceptable latency for this driver
140 *
141 * This function informs the kernel that this device(driver)
142 * can accept at most usecs latency. This setting is used for
143 * power management and similar tradeoffs.
144 *
145 * This function does not sleep and can be called in any context.
146 * Trying to use a non-existing identifier silently gets ignored.
147 *
148 * Due to the atomic nature of this function, the modified latency
149 * value will only be used for future decisions; past decisions
150 * can still lead to longer latencies in the near future.
151 */
152void modify_acceptable_latency(char *identifier, int usecs)
153{
154 struct latency_info *iter;
155 unsigned long flags;
156
157 spin_lock_irqsave(&latency_lock, flags);
158 list_for_each_entry(iter, &latency_list, list) {
159 if (strcmp(iter->identifier, identifier) == 0) {
160 iter->usecs = usecs;
161 break;
162 }
163 }
164 if (usecs < atomic_read(&current_max_latency))
165 atomic_set(&current_max_latency, usecs);
166 spin_unlock_irqrestore(&latency_lock, flags);
167}
168EXPORT_SYMBOL_GPL(modify_acceptable_latency);
169
170/**
171 * remove_acceptable_latency - removes the maximum latency acceptable
172 * @identifier: string that identifies this driver
173 *
174 * This function removes a previously set maximum latency setting
175 * for the driver and frees up any resources associated with the
176 * bookkeeping needed for this.
177 *
178 * This function does not sleep and can be called in any context.
179 * Trying to use a non-existing identifier silently gets ignored.
180 */
181void remove_acceptable_latency(char *identifier)
182{
183 unsigned long flags;
184 int newmax = 0;
185 struct latency_info *iter, *temp;
186
187 spin_lock_irqsave(&latency_lock, flags);
188
189 list_for_each_entry_safe(iter, temp, &latency_list, list) {
190 if (strcmp(iter->identifier, identifier) == 0) {
191 list_del(&iter->list);
192 newmax = iter->usecs;
193 kfree(iter->identifier);
194 kfree(iter);
195 break;
196 }
197 }
198
199 /* If we just deleted the system wide value, we need to
200 * recalculate with a full search
201 */
202 if (newmax == atomic_read(&current_max_latency)) {
203 newmax = __find_max_latency();
204 atomic_set(&current_max_latency, newmax);
205 }
206 spin_unlock_irqrestore(&latency_lock, flags);
207}
208EXPORT_SYMBOL_GPL(remove_acceptable_latency);
209
210/**
211 * system_latency_constraint - queries the system wide latency maximum
212 *
213 * This function returns the system wide maximum latency in
214 * microseconds.
215 *
216 * This function does not sleep and can be called in any context.
217 */
218int system_latency_constraint(void)
219{
220 return atomic_read(&current_max_latency);
221}
222EXPORT_SYMBOL_GPL(system_latency_constraint);
223
224/**
225 * synchronize_acceptable_latency - recalculates all latency decisions
226 *
227 * This function will cause a callback to various kernel pieces that
228 * will make those pieces rethink their latency decisions. This implies
229 * that if there are overlong latencies in hardware state already, those
230 * latencies get taken right now. When this call completes no overlong
231 * latency decisions should be active anymore.
232 *
233 * Typical usecase of this is after a modify_acceptable_latency() call,
234 * which in itself is non-blocking and non-synchronizing.
235 *
236 * This function blocks and should not be called with locks held.
237 */
238
239void synchronize_acceptable_latency(void)
240{
241 blocking_notifier_call_chain(&latency_notifier,
242 atomic_read(&current_max_latency), NULL);
243}
244EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
245
246/*
247 * Latency notifier: this notifier gets called when a non-atomic new
248 * latency value gets set. The expectation nof the caller of the
249 * non-atomic set is that when the call returns, future latencies
250 * are within bounds, so the functions on the notifier list are
251 * expected to take the overlong latencies immediately, inside the
252 * callback, and not make a overlong latency decision anymore.
253 *
254 * The callback gets called when the new latency value is made
255 * active so system_latency_constraint() returns the new latency.
256 */
257int register_latency_notifier(struct notifier_block * nb)
258{
259 return blocking_notifier_chain_register(&latency_notifier, nb);
260}
261EXPORT_SYMBOL_GPL(register_latency_notifier);
262
263int unregister_latency_notifier(struct notifier_block * nb)
264{
265 return blocking_notifier_chain_unregister(&latency_notifier, nb);
266}
267EXPORT_SYMBOL_GPL(unregister_latency_notifier);
268
269static __init int latency_init(void)
270{
271 atomic_set(&current_max_latency, INFINITE_LATENCY);
272 /*
273 * we don't want by default to have longer latencies than 2 ticks,
274 * since that would cause lost ticks
275 */
276 set_acceptable_latency("kernel", 2*1000000/HZ);
277 return 0;
278}
279
280module_init(latency_init);
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbce..c4c2cd8b61f5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,35 +27,42 @@
27extern struct marker __start___markers[]; 27extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 28extern struct marker __stop___markers[];
29 29
30/* Set to 1 to enable marker debug output */
31const int marker_debug;
32
30/* 33/*
31 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync. 35 * and module markers and the hash table.
33 */ 36 */
34static DEFINE_MUTEX(markers_mutex); 37static DEFINE_MUTEX(markers_mutex);
35 38
36/* 39/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers. 40 * Marker hash table, containing the active markers.
48 * Protected by module_mutex. 41 * Protected by module_mutex.
49 */ 42 */
50#define MARKER_HASH_BITS 6 43#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 44#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52 45
46/*
47 * Note about RCU :
48 * It is used to make sure every handler has finished using its private data
49 * between two consecutive operation (add or remove) on a given marker. It is
50 * also used to delay the free of multiple probes array until a quiescent state
51 * is reached.
52 * marker entries modifications are protected by the markers_mutex.
53 */
53struct marker_entry { 54struct marker_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 char *format; 56 char *format;
56 marker_probe_func *probe; 57 void (*call)(const struct marker *mdata, /* Probe wrapper */
57 void *private; 58 void *call_private, const char *fmt, ...);
59 struct marker_probe_closure single;
60 struct marker_probe_closure *multi;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 61 int refcount; /* Number of times armed. 0 if disarmed. */
62 struct rcu_head rcu;
63 void *oldptr;
64 char rcu_pending:1;
65 char ptype:1;
59 char name[0]; /* Contains name'\0'format'\0' */ 66 char name[0]; /* Contains name'\0'format'\0' */
60}; 67};
61 68
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63 70
64/** 71/**
65 * __mark_empty_function - Empty probe callback 72 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker 73 * @probe_private: probe private data
74 * @call_private: call site private data
67 * @fmt: format string 75 * @fmt: format string
68 * @...: variable argument list 76 * @...: variable argument list
69 * 77 *
@@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
72 * though the function pointer change and the marker enabling are two distinct 80 * though the function pointer change and the marker enabling are two distinct
73 * operations that modifies the execution flow of preemptible code. 81 * operations that modifies the execution flow of preemptible code.
74 */ 82 */
75void __mark_empty_function(const struct marker *mdata, void *private, 83void __mark_empty_function(void *probe_private, void *call_private,
76 const char *fmt, ...) 84 const char *fmt, va_list *args)
77{ 85{
78} 86}
79EXPORT_SYMBOL_GPL(__mark_empty_function); 87EXPORT_SYMBOL_GPL(__mark_empty_function);
80 88
81/* 89/*
90 * marker_probe_cb Callback that prepares the variable argument list for probes.
91 * @mdata: pointer of type struct marker
92 * @call_private: caller site private data
93 * @fmt: format string
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private,
101 const char *fmt, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * disabling preemption to make sure the teardown of the callbacks can
108 * be done correctly when they are in modules and they insure RCU read
109 * coherency.
110 */
111 preempt_disable();
112 ptype = ACCESS_ONCE(mdata->ptype);
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = ACCESS_ONCE(mdata->single.func);
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, fmt);
123 func(mdata->single.probe_private, call_private, fmt, &args);
124 va_end(args);
125 } else {
126 struct marker_probe_closure *multi;
127 int i;
128 /*
129 * multi points to an array, therefore accessing the array
130 * depends on reading multi. However, even in this case,
131 * we must insure that the pointer is read _before_ the array
132 * data. Same as rcu_dereference, but we need a full smp_rmb()
133 * in the fast path, so put the explicit barrier here.
134 */
135 smp_read_barrier_depends();
136 multi = ACCESS_ONCE(mdata->multi);
137 for (i = 0; multi[i].func; i++) {
138 va_start(args, fmt);
139 multi[i].func(multi[i].probe_private, call_private, fmt,
140 &args);
141 va_end(args);
142 }
143 }
144 preempt_enable();
145}
146EXPORT_SYMBOL_GPL(marker_probe_cb);
147
148/*
149 * marker_probe_cb Callback that does not prepare the variable argument list.
150 * @mdata: pointer of type struct marker
151 * @call_private: caller site private data
152 * @fmt: format string
153 * @...: Variable argument list.
154 *
155 * Should be connected to markers "MARK_NOARGS".
156 */
157void marker_probe_cb_noarg(const struct marker *mdata,
158 void *call_private, const char *fmt, ...)
159{
160 va_list args; /* not initialized */
161 char ptype;
162
163 preempt_disable();
164 ptype = ACCESS_ONCE(mdata->ptype);
165 if (likely(!ptype)) {
166 marker_probe_func *func;
167 /* Must read the ptype before ptr. They are not data dependant,
168 * so we put an explicit smp_rmb() here. */
169 smp_rmb();
170 func = ACCESS_ONCE(mdata->single.func);
171 /* Must read the ptr before private data. They are not data
172 * dependant, so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func(mdata->single.probe_private, call_private, fmt, &args);
175 } else {
176 struct marker_probe_closure *multi;
177 int i;
178 /*
179 * multi points to an array, therefore accessing the array
180 * depends on reading multi. However, even in this case,
181 * we must insure that the pointer is read _before_ the array
182 * data. Same as rcu_dereference, but we need a full smp_rmb()
183 * in the fast path, so put the explicit barrier here.
184 */
185 smp_read_barrier_depends();
186 multi = ACCESS_ONCE(mdata->multi);
187 for (i = 0; multi[i].func; i++)
188 multi[i].func(multi[i].probe_private, call_private, fmt,
189 &args);
190 }
191 preempt_enable();
192}
193EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
194
195static void free_old_closure(struct rcu_head *head)
196{
197 struct marker_entry *entry = container_of(head,
198 struct marker_entry, rcu);
199 kfree(entry->oldptr);
200 /* Make sure we free the data before setting the pending flag to 0 */
201 smp_wmb();
202 entry->rcu_pending = 0;
203}
204
205static void debug_print_probes(struct marker_entry *entry)
206{
207 int i;
208
209 if (!marker_debug)
210 return;
211
212 if (!entry->ptype) {
213 printk(KERN_DEBUG "Single probe : %p %p\n",
214 entry->single.func,
215 entry->single.probe_private);
216 } else {
217 for (i = 0; entry->multi[i].func; i++)
218 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
219 entry->multi[i].func,
220 entry->multi[i].probe_private);
221 }
222}
223
224static struct marker_probe_closure *
225marker_entry_add_probe(struct marker_entry *entry,
226 marker_probe_func *probe, void *probe_private)
227{
228 int nr_probes = 0;
229 struct marker_probe_closure *old, *new;
230
231 WARN_ON(!probe);
232
233 debug_print_probes(entry);
234 old = entry->multi;
235 if (!entry->ptype) {
236 if (entry->single.func == probe &&
237 entry->single.probe_private == probe_private)
238 return ERR_PTR(-EBUSY);
239 if (entry->single.func == __mark_empty_function) {
240 /* 0 -> 1 probes */
241 entry->single.func = probe;
242 entry->single.probe_private = probe_private;
243 entry->refcount = 1;
244 entry->ptype = 0;
245 debug_print_probes(entry);
246 return NULL;
247 } else {
248 /* 1 -> 2 probes */
249 nr_probes = 1;
250 old = NULL;
251 }
252 } else {
253 /* (N -> N+1), (N != 0, 1) probes */
254 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
255 if (old[nr_probes].func == probe
256 && old[nr_probes].probe_private
257 == probe_private)
258 return ERR_PTR(-EBUSY);
259 }
260 /* + 2 : one for new probe, one for NULL func */
261 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
262 GFP_KERNEL);
263 if (new == NULL)
264 return ERR_PTR(-ENOMEM);
265 if (!old)
266 new[0] = entry->single;
267 else
268 memcpy(new, old,
269 nr_probes * sizeof(struct marker_probe_closure));
270 new[nr_probes].func = probe;
271 new[nr_probes].probe_private = probe_private;
272 entry->refcount = nr_probes + 1;
273 entry->multi = new;
274 entry->ptype = 1;
275 debug_print_probes(entry);
276 return old;
277}
278
279static struct marker_probe_closure *
280marker_entry_remove_probe(struct marker_entry *entry,
281 marker_probe_func *probe, void *probe_private)
282{
283 int nr_probes = 0, nr_del = 0, i;
284 struct marker_probe_closure *old, *new;
285
286 old = entry->multi;
287
288 debug_print_probes(entry);
289 if (!entry->ptype) {
290 /* 0 -> N is an error */
291 WARN_ON(entry->single.func == __mark_empty_function);
292 /* 1 -> 0 probes */
293 WARN_ON(probe && entry->single.func != probe);
294 WARN_ON(entry->single.probe_private != probe_private);
295 entry->single.func = __mark_empty_function;
296 entry->refcount = 0;
297 entry->ptype = 0;
298 debug_print_probes(entry);
299 return NULL;
300 } else {
301 /* (N -> M), (N > 1, M >= 0) probes */
302 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
303 if ((!probe || old[nr_probes].func == probe)
304 && old[nr_probes].probe_private
305 == probe_private)
306 nr_del++;
307 }
308 }
309
310 if (nr_probes - nr_del == 0) {
311 /* N -> 0, (N > 1) */
312 entry->single.func = __mark_empty_function;
313 entry->refcount = 0;
314 entry->ptype = 0;
315 } else if (nr_probes - nr_del == 1) {
316 /* N -> 1, (N > 1) */
317 for (i = 0; old[i].func; i++)
318 if ((probe && old[i].func != probe) ||
319 old[i].probe_private != probe_private)
320 entry->single = old[i];
321 entry->refcount = 1;
322 entry->ptype = 0;
323 } else {
324 int j = 0;
325 /* N -> M, (N > 1, M > 1) */
326 /* + 1 for NULL */
327 new = kzalloc((nr_probes - nr_del + 1)
328 * sizeof(struct marker_probe_closure), GFP_KERNEL);
329 if (new == NULL)
330 return ERR_PTR(-ENOMEM);
331 for (i = 0; old[i].func; i++)
332 if ((probe && old[i].func != probe) ||
333 old[i].probe_private != probe_private)
334 new[j++] = old[i];
335 entry->refcount = nr_probes - nr_del;
336 entry->ptype = 1;
337 entry->multi = new;
338 }
339 debug_print_probes(entry);
340 return old;
341}
342
343/*
82 * Get marker if the marker is present in the marker hash table. 344 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held. 345 * Must be called with markers_mutex held.
84 * Returns NULL if not present. 346 * Returns NULL if not present.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
102 * Add the marker to the marker hash table. Must be called with markers_mutex 364 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held. 365 * held.
104 */ 366 */
105static int add_marker(const char *name, const char *format, 367static struct marker_entry *add_marker(const char *name, const char *format)
106 marker_probe_func *probe, void *private)
107{ 368{
108 struct hlist_head *head; 369 struct hlist_head *head;
109 struct hlist_node *node; 370 struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
118 hlist_for_each_entry(e, node, head, hlist) { 379 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) { 380 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE 381 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n", 382 "Marker %s busy\n", name);
122 name, e->probe); 383 return ERR_PTR(-EBUSY); /* Already there */
123 return -EBUSY; /* Already there */
124 } 384 }
125 } 385 }
126 /* 386 /*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 390 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL); 391 GFP_KERNEL);
132 if (!e) 392 if (!e)
133 return -ENOMEM; 393 return ERR_PTR(-ENOMEM);
134 memcpy(&e->name[0], name, name_len); 394 memcpy(&e->name[0], name, name_len);
135 if (format) { 395 if (format) {
136 e->format = &e->name[name_len]; 396 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len); 397 memcpy(e->format, format, format_len);
398 if (strcmp(e->format, MARK_NOARGS) == 0)
399 e->call = marker_probe_cb_noarg;
400 else
401 e->call = marker_probe_cb;
138 trace_mark(core_marker_format, "name %s format %s", 402 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format); 403 e->name, e->format);
140 } else 404 } else {
141 e->format = NULL; 405 e->format = NULL;
142 e->probe = probe; 406 e->call = marker_probe_cb;
143 e->private = private; 407 }
408 e->single.func = __mark_empty_function;
409 e->single.probe_private = NULL;
410 e->multi = NULL;
411 e->ptype = 0;
144 e->refcount = 0; 412 e->refcount = 0;
413 e->rcu_pending = 0;
145 hlist_add_head(&e->hlist, head); 414 hlist_add_head(&e->hlist, head);
146 return 0; 415 return e;
147} 416}
148 417
149/* 418/*
150 * Remove the marker from the marker hash table. Must be called with mutex_lock 419 * Remove the marker from the marker hash table. Must be called with mutex_lock
151 * held. 420 * held.
152 */ 421 */
153static void *remove_marker(const char *name) 422static int remove_marker(const char *name)
154{ 423{
155 struct hlist_head *head; 424 struct hlist_head *head;
156 struct hlist_node *node; 425 struct hlist_node *node;
157 struct marker_entry *e; 426 struct marker_entry *e;
158 int found = 0; 427 int found = 0;
159 size_t len = strlen(name) + 1; 428 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0); 429 u32 hash = jhash(name, len-1, 0);
162 430
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 431 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
167 break; 435 break;
168 } 436 }
169 } 437 }
170 if (found) { 438 if (!found)
171 private = e->private; 439 return -ENOENT;
172 hlist_del(&e->hlist); 440 if (e->single.func != __mark_empty_function)
173 kfree(e); 441 return -EBUSY;
174 } 442 hlist_del(&e->hlist);
175 return private; 443 /* Make sure the call_rcu has been executed */
444 if (e->rcu_pending)
445 rcu_barrier();
446 kfree(e);
447 return 0;
176} 448}
177 449
178/* 450/*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
184 size_t name_len = strlen((*entry)->name) + 1; 456 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1; 457 size_t format_len = strlen(format) + 1;
186 458
459
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 460 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL); 461 GFP_KERNEL);
189 if (!e) 462 if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
191 memcpy(&e->name[0], (*entry)->name, name_len); 464 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len]; 465 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len); 466 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe; 467 if (strcmp(e->format, MARK_NOARGS) == 0)
195 e->private = (*entry)->private; 468 e->call = marker_probe_cb_noarg;
469 else
470 e->call = marker_probe_cb;
471 e->single = (*entry)->single;
472 e->multi = (*entry)->multi;
473 e->ptype = (*entry)->ptype;
196 e->refcount = (*entry)->refcount; 474 e->refcount = (*entry)->refcount;
475 e->rcu_pending = 0;
197 hlist_add_before(&e->hlist, &(*entry)->hlist); 476 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist); 477 hlist_del(&(*entry)->hlist);
478 /* Make sure the call_rcu has been executed */
479 if ((*entry)->rcu_pending)
480 rcu_barrier();
199 kfree(*entry); 481 kfree(*entry);
200 *entry = e; 482 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s", 483 trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
206/* 488/*
207 * Sets the probe callback corresponding to one marker. 489 * Sets the probe callback corresponding to one marker.
208 */ 490 */
209static int set_marker(struct marker_entry **entry, struct marker *elem) 491static int set_marker(struct marker_entry **entry, struct marker *elem,
492 int active)
210{ 493{
211 int ret; 494 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 495 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
226 if (ret) 509 if (ret)
227 return ret; 510 return ret;
228 } 511 }
229 elem->call = (*entry)->probe; 512
230 elem->private = (*entry)->private; 513 /*
231 elem->state = 1; 514 * probe_cb setup (statically known) is done here. It is
515 * asynchronous with the rest of execution, therefore we only
516 * pass from a "safe" callback (with argument) to an "unsafe"
517 * callback (does not set arguments).
518 */
519 elem->call = (*entry)->call;
520 /*
521 * Sanity check :
522 * We only update the single probe private data when the ptr is
523 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
524 */
525 WARN_ON(elem->single.func != __mark_empty_function
526 && elem->single.probe_private
527 != (*entry)->single.probe_private &&
528 !elem->ptype);
529 elem->single.probe_private = (*entry)->single.probe_private;
530 /*
531 * Make sure the private data is valid when we update the
532 * single probe ptr.
533 */
534 smp_wmb();
535 elem->single.func = (*entry)->single.func;
536 /*
537 * We also make sure that the new probe callbacks array is consistent
538 * before setting a pointer to it.
539 */
540 rcu_assign_pointer(elem->multi, (*entry)->multi);
541 /*
542 * Update the function or multi probe array pointer before setting the
543 * ptype.
544 */
545 smp_wmb();
546 elem->ptype = (*entry)->ptype;
547 elem->state = active;
548
232 return 0; 549 return 0;
233} 550}
234 551
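
The barrier choreography in set_marker() above (mirrored by disable_marker() below) follows a common publish pattern: initialise the payload, order it with smp_wmb(), then expose the pointer or flag that the lockless fast path inspects first. A minimal sketch of that pattern with made-up names, not the marker structures themselves:

#include <linux/rcupdate.h>	/* rcu_assign_pointer() */
#include <asm/system.h>		/* smp_wmb() on kernels of this vintage */

struct payload {
	int value;
};

static struct payload *published;	/* dereferenced by a lockless reader */
static int enabled;			/* reader tests this before anything else */

static void publish(struct payload *p, int value)
{
	p->value = value;			/* 1. fill in the private data            */
	smp_wmb();				/* 2. data visible before the pointer     */
	rcu_assign_pointer(published, p);	/* 3. publish the pointer (ordered store) */
	smp_wmb();				/* 4. pointer visible before the flag     */
	enabled = 1;				/* 5. the reader tests this last write    */
}
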
@@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
240 */ 557 */
241static void disable_marker(struct marker *elem) 558static void disable_marker(struct marker *elem)
242{ 559{
560 /* leave "call" as is. It is known statically. */
243 elem->state = 0; 561 elem->state = 0;
244 elem->call = __mark_empty_function; 562 elem->single.func = __mark_empty_function;
563 /* Update the function before setting the ptype */
564 smp_wmb();
565 elem->ptype = 0; /* single probe */
245 /* 566 /*
246 * Leave the private data and id there, because removal is racy and 567 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used 568 * should be done only after a synchronize_sched(). These are never used
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
253 * marker_update_probe_range - Update a probe range 574 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range 575 * @begin: beginning of the range
255 * @end: end of the range 576 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 * 577 *
259 * Updates the probe callback corresponding to a range of markers. 578 * Updates the probe callback corresponding to a range of markers.
260 */ 579 */
261void marker_update_probe_range(struct marker *begin, 580void marker_update_probe_range(struct marker *begin,
262 struct marker *end, struct module *probe_module, 581 struct marker *end)
263 int *refcount)
264{ 582{
265 struct marker *iter; 583 struct marker *iter;
266 struct marker_entry *mark_entry; 584 struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
268 mutex_lock(&markers_mutex); 586 mutex_lock(&markers_mutex);
269 for (iter = begin; iter < end; iter++) { 587 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name); 588 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) { 589 if (mark_entry) {
272 set_marker(&mark_entry, iter); 590 set_marker(&mark_entry, iter,
591 !!mark_entry->refcount);
273 /* 592 /*
274 * ignore error, continue 593 * ignore error, continue
275 */ 594 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else { 595 } else {
281 disable_marker(iter); 596 disable_marker(iter);
282 } 597 }
@@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin,
289 * Issues a synchronize_sched() when no reference to the module passed 604 * Issues a synchronize_sched() when no reference to the module passed
290 * as parameter is found in the probes so the probe module can be 605 * as parameter is found in the probes so the probe module can be
291 * safely unloaded from now on. 606 * safely unloaded from now on.
607 *
608 * Internal callback only changed before the first probe is connected to it.
609 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
610 * transitions. All other transitions will leave the old private data valid.
611 * This makes the non-atomicity of the callback/private data updates valid.
612 *
613 * "special case" updates :
614 * 0 -> 1 callback
615 * 1 -> 0 callback
616 * 1 -> 2 callbacks
617 * 2 -> 1 callbacks
618 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
 619 * Side effect : marker_set_format may delete the marker entry (creating a
620 * replacement).
292 */ 621 */
293static void marker_update_probes(struct module *probe_module) 622static void marker_update_probes(void)
294{ 623{
295 int refcount = 0;
296
297 /* Core kernel markers */ 624 /* Core kernel markers */
298 marker_update_probe_range(__start___markers, 625 marker_update_probe_range(__start___markers, __stop___markers);
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */ 626 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount); 627 module_update_markers();
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306} 628}
307 629
308/** 630/**
@@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module)
310 * @name: marker name 632 * @name: marker name
311 * @format: format string 633 * @format: format string
312 * @probe: probe handler 634 * @probe: probe handler
313 * @private: probe private data 635 * @probe_private: probe private data
314 * 636 *
315 * private data must be a valid allocated memory address, or NULL. 637 * private data must be a valid allocated memory address, or NULL.
316 * Returns 0 if ok, error value on error. 638 * Returns 0 if ok, error value on error.
639 * The probe address must at least be aligned on the architecture pointer size.
317 */ 640 */
318int marker_probe_register(const char *name, const char *format, 641int marker_probe_register(const char *name, const char *format,
319 marker_probe_func *probe, void *private) 642 marker_probe_func *probe, void *probe_private)
320{ 643{
321 struct marker_entry *entry; 644 struct marker_entry *entry;
322 int ret = 0; 645 int ret = 0;
646 struct marker_probe_closure *old;
323 647
324 mutex_lock(&markers_mutex); 648 mutex_lock(&markers_mutex);
325 entry = get_marker(name); 649 entry = get_marker(name);
326 if (entry && entry->refcount) { 650 if (!entry) {
327 ret = -EBUSY; 651 entry = add_marker(name, format);
328 goto end; 652 if (IS_ERR(entry)) {
329 } 653 ret = PTR_ERR(entry);
330 if (deferred_sync) { 654 goto end;
331 synchronize_sched(); 655 }
332 deferred_sync = 0;
333 } 656 }
334 ret = add_marker(name, format, probe, private); 657 /*
335 if (ret) 658 * If we detect that a call_rcu is pending for this marker,
659 * make sure it's executed now.
660 */
661 if (entry->rcu_pending)
662 rcu_barrier();
663 old = marker_entry_add_probe(entry, probe, probe_private);
664 if (IS_ERR(old)) {
665 ret = PTR_ERR(old);
336 goto end; 666 goto end;
667 }
337 mutex_unlock(&markers_mutex); 668 mutex_unlock(&markers_mutex);
338 marker_update_probes(NULL); 669 marker_update_probes(); /* may update entry */
339 return ret; 670 mutex_lock(&markers_mutex);
671 entry = get_marker(name);
672 WARN_ON(!entry);
673 entry->oldptr = old;
674 entry->rcu_pending = 1;
675 /* write rcu_pending before calling the RCU callback */
676 smp_wmb();
677 call_rcu(&entry->rcu, free_old_closure);
340end: 678end:
341 mutex_unlock(&markers_mutex); 679 mutex_unlock(&markers_mutex);
342 return ret; 680 return ret;
@@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
346/** 684/**
347 * marker_probe_unregister - Disconnect a probe from a marker 685 * marker_probe_unregister - Disconnect a probe from a marker
348 * @name: marker name 686 * @name: marker name
687 * @probe: probe function pointer
688 * @probe_private: probe private data
349 * 689 *
350 * Returns the private data given to marker_probe_register, or an ERR_PTR(). 690 * Returns the private data given to marker_probe_register, or an ERR_PTR().
691 * We do not need to call a synchronize_sched to make sure the probes have
692 * finished running before doing a module unload, because the module unload
 693 * itself uses stop_machine(), which ensures that every preempt-disabled section
 694 * has finished.
351 */ 695 */
352void *marker_probe_unregister(const char *name) 696int marker_probe_unregister(const char *name,
697 marker_probe_func *probe, void *probe_private)
353{ 698{
354 struct module *probe_module;
355 struct marker_entry *entry; 699 struct marker_entry *entry;
356 void *private; 700 struct marker_probe_closure *old;
701 int ret = 0;
357 702
358 mutex_lock(&markers_mutex); 703 mutex_lock(&markers_mutex);
359 entry = get_marker(name); 704 entry = get_marker(name);
360 if (!entry) { 705 if (!entry) {
361 private = ERR_PTR(-ENOENT); 706 ret = -ENOENT;
362 goto end; 707 goto end;
363 } 708 }
364 entry->refcount = 0; 709 if (entry->rcu_pending)
365 /* In what module is the probe handler ? */ 710 rcu_barrier();
366 probe_module = __module_text_address((unsigned long)entry->probe); 711 old = marker_entry_remove_probe(entry, probe, probe_private);
367 private = remove_marker(name);
368 deferred_sync = 1;
369 mutex_unlock(&markers_mutex); 712 mutex_unlock(&markers_mutex);
370 marker_update_probes(probe_module); 713 marker_update_probes(); /* may update entry */
371 return private; 714 mutex_lock(&markers_mutex);
715 entry = get_marker(name);
716 entry->oldptr = old;
717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb();
720 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */
372end: 722end:
373 mutex_unlock(&markers_mutex); 723 mutex_unlock(&markers_mutex);
374 return private; 724 return ret;
375} 725}
376EXPORT_SYMBOL_GPL(marker_probe_unregister); 726EXPORT_SYMBOL_GPL(marker_probe_unregister);
377 727
378/** 728static struct marker_entry *
379 * marker_probe_unregister_private_data - Disconnect a probe from a marker 729get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
380 * @private: probe private data
381 *
382 * Unregister a marker by providing the registered private data.
383 * Returns the private data given to marker_probe_register, or an ERR_PTR().
384 */
385void *marker_probe_unregister_private_data(void *private)
386{ 730{
387 struct module *probe_module;
388 struct hlist_head *head;
389 struct hlist_node *node;
390 struct marker_entry *entry; 731 struct marker_entry *entry;
391 int found = 0;
392 unsigned int i; 732 unsigned int i;
733 struct hlist_head *head;
734 struct hlist_node *node;
393 735
394 mutex_lock(&markers_mutex);
395 for (i = 0; i < MARKER_TABLE_SIZE; i++) { 736 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
396 head = &marker_table[i]; 737 head = &marker_table[i];
397 hlist_for_each_entry(entry, node, head, hlist) { 738 hlist_for_each_entry(entry, node, head, hlist) {
398 if (entry->private == private) { 739 if (!entry->ptype) {
399 found = 1; 740 if (entry->single.func == probe
400 goto iter_end; 741 && entry->single.probe_private
742 == probe_private)
743 return entry;
744 } else {
745 struct marker_probe_closure *closure;
746 closure = entry->multi;
747 for (i = 0; closure[i].func; i++) {
748 if (closure[i].func == probe &&
749 closure[i].probe_private
750 == probe_private)
751 return entry;
752 }
401 } 753 }
402 } 754 }
403 } 755 }
404iter_end: 756 return NULL;
405 if (!found) {
406 private = ERR_PTR(-ENOENT);
407 goto end;
408 }
409 entry->refcount = 0;
410 /* In what module is the probe handler ? */
411 probe_module = __module_text_address((unsigned long)entry->probe);
412 private = remove_marker(entry->name);
413 deferred_sync = 1;
414 mutex_unlock(&markers_mutex);
415 marker_update_probes(probe_module);
416 return private;
417end:
418 mutex_unlock(&markers_mutex);
419 return private;
420} 757}
421EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
422 758
423/** 759/**
424 * marker_arm - Arm a marker 760 * marker_probe_unregister_private_data - Disconnect a probe from a marker
425 * @name: marker name 761 * @probe: probe function
762 * @probe_private: probe private data
426 * 763 *
427 * Activate a marker. It keeps a reference count of the number of 764 * Unregister a probe by providing the registered private data.
428 * arming/disarming done. 765 * Only removes the first marker found in hash table.
429 * Returns 0 if ok, error value on error. 766 * Return 0 on success or error value.
767 * We do not need to call a synchronize_sched to make sure the probes have
768 * finished running before doing a module unload, because the module unload
 769 * itself uses stop_machine(), which ensures that every preempt-disabled section
 770 * has finished.
430 */ 771 */
431int marker_arm(const char *name) 772int marker_probe_unregister_private_data(marker_probe_func *probe,
773 void *probe_private)
432{ 774{
433 struct marker_entry *entry; 775 struct marker_entry *entry;
434 int ret = 0; 776 int ret = 0;
777 struct marker_probe_closure *old;
435 778
436 mutex_lock(&markers_mutex); 779 mutex_lock(&markers_mutex);
437 entry = get_marker(name); 780 entry = get_marker_from_private_data(probe, probe_private);
438 if (!entry) { 781 if (!entry) {
439 ret = -ENOENT; 782 ret = -ENOENT;
440 goto end; 783 goto end;
441 } 784 }
442 /* 785 if (entry->rcu_pending)
443 * Only need to update probes when refcount passes from 0 to 1. 786 rcu_barrier();
444 */ 787 old = marker_entry_remove_probe(entry, NULL, probe_private);
445 if (entry->refcount++)
446 goto end;
447end:
448 mutex_unlock(&markers_mutex); 788 mutex_unlock(&markers_mutex);
449 marker_update_probes(NULL); 789 marker_update_probes(); /* may update entry */
450 return ret;
451}
452EXPORT_SYMBOL_GPL(marker_arm);
453
454/**
455 * marker_disarm - Disarm a marker
456 * @name: marker name
457 *
458 * Disarm a marker. It keeps a reference count of the number of arming/disarming
459 * done.
460 * Returns 0 if ok, error value on error.
461 */
462int marker_disarm(const char *name)
463{
464 struct marker_entry *entry;
465 int ret = 0;
466
467 mutex_lock(&markers_mutex); 790 mutex_lock(&markers_mutex);
468 entry = get_marker(name); 791 entry = get_marker_from_private_data(probe, probe_private);
469 if (!entry) { 792 WARN_ON(!entry);
470 ret = -ENOENT; 793 entry->oldptr = old;
471 goto end; 794 entry->rcu_pending = 1;
472 } 795 /* write rcu_pending before calling the RCU callback */
473 /* 796 smp_wmb();
474 * Only permit decrement refcount if higher than 0. 797 call_rcu(&entry->rcu, free_old_closure);
475 * Do probe update only on 1 -> 0 transition. 798 remove_marker(entry->name); /* Ignore busy error message */
476 */
477 if (entry->refcount) {
478 if (--entry->refcount)
479 goto end;
480 } else {
481 ret = -EPERM;
482 goto end;
483 }
484end: 799end:
485 mutex_unlock(&markers_mutex); 800 mutex_unlock(&markers_mutex);
486 marker_update_probes(NULL);
487 return ret; 801 return ret;
488} 802}
489EXPORT_SYMBOL_GPL(marker_disarm); 803EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
490 804
491/** 805/**
492 * marker_get_private_data - Get a marker's probe private data 806 * marker_get_private_data - Get a marker's probe private data
493 * @name: marker name 807 * @name: marker name
808 * @probe: probe to match
809 * @num: get the nth matching probe's private data
494 * 810 *
811 * Returns the nth private data pointer (starting from 0) matching, or an
812 * ERR_PTR.
495 * Returns the private data pointer, or an ERR_PTR. 813 * Returns the private data pointer, or an ERR_PTR.
496 * The private data pointer should _only_ be dereferenced if the caller is the 814 * The private data pointer should _only_ be dereferenced if the caller is the
497 * owner of the data, or its content could vanish. This is mostly used to 815 * owner of the data, or its content could vanish. This is mostly used to
498 * confirm that a caller is the owner of a registered probe. 816 * confirm that a caller is the owner of a registered probe.
499 */ 817 */
500void *marker_get_private_data(const char *name) 818void *marker_get_private_data(const char *name, marker_probe_func *probe,
819 int num)
501{ 820{
502 struct hlist_head *head; 821 struct hlist_head *head;
503 struct hlist_node *node; 822 struct hlist_node *node;
504 struct marker_entry *e; 823 struct marker_entry *e;
505 size_t name_len = strlen(name) + 1; 824 size_t name_len = strlen(name) + 1;
506 u32 hash = jhash(name, name_len-1, 0); 825 u32 hash = jhash(name, name_len-1, 0);
507 int found = 0; 826 int i;
508 827
509 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 828 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
510 hlist_for_each_entry(e, node, head, hlist) { 829 hlist_for_each_entry(e, node, head, hlist) {
511 if (!strcmp(name, e->name)) { 830 if (!strcmp(name, e->name)) {
512 found = 1; 831 if (!e->ptype) {
513 return e->private; 832 if (num == 0 && e->single.func == probe)
833 return e->single.probe_private;
834 else
835 break;
836 } else {
837 struct marker_probe_closure *closure;
838 int match = 0;
839 closure = e->multi;
840 for (i = 0; closure[i].func; i++) {
841 if (closure[i].func != probe)
842 continue;
843 if (match++ == num)
844 return closure[i].probe_private;
845 }
846 }
514 } 847 }
515 } 848 }
516 return ERR_PTR(-ENOENT); 849 return ERR_PTR(-ENOENT);
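
For reference, a registration round-trip against the reworked marker interface above might look like the following sketch. The marker name "subsys_event", its format string and the probe body are invented, and the (probe_private, call_private, fmt, va_list *) probe signature is assumed from the markers sample code of this era rather than shown in these hunks.

#include <linux/kernel.h>
#include <linux/marker.h>
#include <linux/module.h>

static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	int value = va_arg(*args, int);		/* matches the "value %d" format */

	printk(KERN_INFO "subsys_event fired: %d\n", value);
}

static int __init my_probe_init(void)
{
	/* With marker_arm()/marker_disarm() gone, registering also arms. */
	return marker_probe_register("subsys_event", "value %d",
				     my_probe, NULL);
}

static void __exit my_probe_exit(void)
{
	/* The probe function and its private data identify which probe to drop. */
	marker_probe_unregister("subsys_event", my_probe, NULL);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");
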
diff --git a/kernel/module.c b/kernel/module.c
index bd60278ee703..92595bad3812 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,6 +46,7 @@
46#include <asm/semaphore.h> 46#include <asm/semaphore.h>
47#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
48#include <linux/license.h> 48#include <linux/license.h>
49#include <asm/sections.h>
49 50
50#if 0 51#if 0
51#define DEBUGP printk 52#define DEBUGP printk
@@ -290,7 +291,7 @@ static unsigned long __find_symbol(const char *name,
290 } 291 }
291 } 292 }
292 DEBUGP("Failed to find symbol %s\n", name); 293 DEBUGP("Failed to find symbol %s\n", name);
293 return 0; 294 return -ENOENT;
294} 295}
295 296
296/* Search for module by name: must hold module_mutex. */ 297/* Search for module by name: must hold module_mutex. */
@@ -343,9 +344,6 @@ static inline unsigned int block_size(int val)
343 return val; 344 return val;
344} 345}
345 346
346/* Created by linker magic */
347extern char __per_cpu_start[], __per_cpu_end[];
348
349static void *percpu_modalloc(unsigned long size, unsigned long align, 347static void *percpu_modalloc(unsigned long size, unsigned long align,
350 const char *name) 348 const char *name)
351{ 349{
@@ -783,7 +781,7 @@ void __symbol_put(const char *symbol)
783 const unsigned long *crc; 781 const unsigned long *crc;
784 782
785 preempt_disable(); 783 preempt_disable();
786 if (!__find_symbol(symbol, &owner, &crc, 1)) 784 if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1)))
787 BUG(); 785 BUG();
788 module_put(owner); 786 module_put(owner);
789 preempt_enable(); 787 preempt_enable();
@@ -929,7 +927,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
929 const unsigned long *crc; 927 const unsigned long *crc;
930 struct module *owner; 928 struct module *owner;
931 929
932 if (!__find_symbol("struct_module", &owner, &crc, 1)) 930 if (IS_ERR_VALUE(__find_symbol("struct_module",
931 &owner, &crc, 1)))
933 BUG(); 932 BUG();
934 return check_version(sechdrs, versindex, "struct_module", mod, 933 return check_version(sechdrs, versindex, "struct_module", mod,
935 crc); 934 crc);
@@ -978,12 +977,12 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
978 977
979 ret = __find_symbol(name, &owner, &crc, 978 ret = __find_symbol(name, &owner, &crc,
980 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 979 !(mod->taints & TAINT_PROPRIETARY_MODULE));
981 if (ret) { 980 if (!IS_ERR_VALUE(ret)) {
982 /* use_module can fail due to OOM, 981 /* use_module can fail due to OOM,
983 or module initialization or unloading */ 982 or module initialization or unloading */
984 if (!check_version(sechdrs, versindex, name, mod, crc) || 983 if (!check_version(sechdrs, versindex, name, mod, crc) ||
985 !use_module(mod, owner)) 984 !use_module(mod, owner))
986 ret = 0; 985 ret = -EINVAL;
987 } 986 }
988 return ret; 987 return ret;
989} 988}
@@ -1371,7 +1370,9 @@ void *__symbol_get(const char *symbol)
1371 1370
1372 preempt_disable(); 1371 preempt_disable();
1373 value = __find_symbol(symbol, &owner, &crc, 1); 1372 value = __find_symbol(symbol, &owner, &crc, 1);
1374 if (value && strong_try_module_get(owner) != 0) 1373 if (IS_ERR_VALUE(value))
1374 value = 0;
1375 else if (strong_try_module_get(owner))
1375 value = 0; 1376 value = 0;
1376 preempt_enable(); 1377 preempt_enable();
1377 1378
@@ -1391,14 +1392,16 @@ static int verify_export_symbols(struct module *mod)
1391 const unsigned long *crc; 1392 const unsigned long *crc;
1392 1393
1393 for (i = 0; i < mod->num_syms; i++) 1394 for (i = 0; i < mod->num_syms; i++)
1394 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { 1395 if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name,
1396 &owner, &crc, 1))) {
1395 name = mod->syms[i].name; 1397 name = mod->syms[i].name;
1396 ret = -ENOEXEC; 1398 ret = -ENOEXEC;
1397 goto dup; 1399 goto dup;
1398 } 1400 }
1399 1401
1400 for (i = 0; i < mod->num_gpl_syms; i++) 1402 for (i = 0; i < mod->num_gpl_syms; i++)
1401 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { 1403 if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name,
1404 &owner, &crc, 1))) {
1402 name = mod->gpl_syms[i].name; 1405 name = mod->gpl_syms[i].name;
1403 ret = -ENOEXEC; 1406 ret = -ENOEXEC;
1404 goto dup; 1407 goto dup;
@@ -1448,7 +1451,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1448 strtab + sym[i].st_name, mod); 1451 strtab + sym[i].st_name, mod);
1449 1452
1450 /* Ok if resolved. */ 1453 /* Ok if resolved. */
1451 if (sym[i].st_value != 0) 1454 if (!IS_ERR_VALUE(sym[i].st_value))
1452 break; 1455 break;
1453 /* Ok if weak. */ 1456 /* Ok if weak. */
1454 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1457 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
@@ -2035,7 +2038,7 @@ static struct module *load_module(void __user *umod,
2035#ifdef CONFIG_MARKERS 2038#ifdef CONFIG_MARKERS
2036 if (!mod->taints) 2039 if (!mod->taints)
2037 marker_update_probe_range(mod->markers, 2040 marker_update_probe_range(mod->markers,
2038 mod->markers + mod->num_markers, NULL, NULL); 2041 mod->markers + mod->num_markers);
2039#endif 2042#endif
2040 err = module_finalize(hdr, sechdrs, mod); 2043 err = module_finalize(hdr, sechdrs, mod);
2041 if (err < 0) 2044 if (err < 0)
@@ -2250,7 +2253,7 @@ static const char *get_ksymbol(struct module *mod,
2250 2253
2251/* For kallsyms to ask for address resolution. NULL means not found. Careful 2254/* For kallsyms to ask for address resolution. NULL means not found. Careful
2252 * not to lock to avoid deadlock on oopses, simply disable preemption. */ 2255 * not to lock to avoid deadlock on oopses, simply disable preemption. */
2253char *module_address_lookup(unsigned long addr, 2256const char *module_address_lookup(unsigned long addr,
2254 unsigned long *size, 2257 unsigned long *size,
2255 unsigned long *offset, 2258 unsigned long *offset,
2256 char **modname, 2259 char **modname,
@@ -2275,7 +2278,7 @@ char *module_address_lookup(unsigned long addr,
2275 ret = namebuf; 2278 ret = namebuf;
2276 } 2279 }
2277 preempt_enable(); 2280 preempt_enable();
2278 return (char *)ret; 2281 return ret;
2279} 2282}
2280 2283
2281int lookup_module_symbol_name(unsigned long addr, char *symname) 2284int lookup_module_symbol_name(unsigned long addr, char *symname)
@@ -2561,7 +2564,7 @@ EXPORT_SYMBOL(struct_module);
2561#endif 2564#endif
2562 2565
2563#ifdef CONFIG_MARKERS 2566#ifdef CONFIG_MARKERS
2564void module_update_markers(struct module *probe_module, int *refcount) 2567void module_update_markers(void)
2565{ 2568{
2566 struct module *mod; 2569 struct module *mod;
2567 2570
@@ -2569,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
2569 list_for_each_entry(mod, &modules, list) 2572 list_for_each_entry(mod, &modules, list)
2570 if (!mod->taints) 2573 if (!mod->taints)
2571 marker_update_probe_range(mod->markers, 2574 marker_update_probe_range(mod->markers,
2572 mod->markers + mod->num_markers, 2575 mod->markers + mod->num_markers);
2573 probe_module, refcount);
2574 mutex_unlock(&module_mutex); 2576 mutex_unlock(&module_mutex);
2575} 2577}
2576#endif 2578#endif
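
The module.c hunks above change __find_symbol() from returning 0 on failure to returning a negative errno packed into the unsigned long, which callers now test with IS_ERR_VALUE(). A small sketch of that convention, with lookup_thing() as a made-up stand-in:

#include <linux/err.h>		/* IS_ERR_VALUE() */
#include <linux/errno.h>

/* Mimics the new convention: address on success, -errno on failure. */
static unsigned long lookup_thing(int exists)
{
	if (!exists)
		return -ENOENT;
	return 0xc0000000UL;	/* pretend this is a symbol address */
}

static int use_thing(void)
{
	unsigned long addr = lookup_thing(1);

	if (IS_ERR_VALUE(addr))		/* only the topmost -MAX_ERRNO..-1 range matches */
		return (int)addr;	/* propagate -ENOENT and friends */
	/* ... use addr ... */
	return 0;
}

This avoids overloading 0, which can otherwise be a legitimate symbol value, and lets callers forward a meaningful error code.
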
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index d17436cdea1b..3aaa06c561de 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -107,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
107 * use of the mutex is forbidden. The mutex must not be locked when 107 * use of the mutex is forbidden. The mutex must not be locked when
108 * this function is called. 108 * this function is called.
109 */ 109 */
110void fastcall mutex_destroy(struct mutex *lock) 110void mutex_destroy(struct mutex *lock)
111{ 111{
112 DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); 112 DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
113 lock->magic = NULL; 113 lock->magic = NULL;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d9ec9b666250..d046a345d365 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
58 * We also put the fastpath first in the kernel image, to make sure the 58 * We also put the fastpath first in the kernel image, to make sure the
59 * branch is predicted by the CPU as default-untaken. 59 * branch is predicted by the CPU as default-untaken.
60 */ 60 */
61static void fastcall noinline __sched 61static void noinline __sched
62__mutex_lock_slowpath(atomic_t *lock_count); 62__mutex_lock_slowpath(atomic_t *lock_count);
63 63
64/*** 64/***
@@ -82,7 +82,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
82 * 82 *
83 * This function is similar to (but not equivalent to) down(). 83 * This function is similar to (but not equivalent to) down().
84 */ 84 */
85void inline fastcall __sched mutex_lock(struct mutex *lock) 85void inline __sched mutex_lock(struct mutex *lock)
86{ 86{
87 might_sleep(); 87 might_sleep();
88 /* 88 /*
@@ -95,8 +95,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock)
95EXPORT_SYMBOL(mutex_lock); 95EXPORT_SYMBOL(mutex_lock);
96#endif 96#endif
97 97
98static void fastcall noinline __sched 98static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
99__mutex_unlock_slowpath(atomic_t *lock_count);
100 99
101/*** 100/***
102 * mutex_unlock - release the mutex 101 * mutex_unlock - release the mutex
@@ -109,7 +108,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count);
109 * 108 *
110 * This function is similar to (but not equivalent to) up(). 109 * This function is similar to (but not equivalent to) up().
111 */ 110 */
112void fastcall __sched mutex_unlock(struct mutex *lock) 111void __sched mutex_unlock(struct mutex *lock)
113{ 112{
114 /* 113 /*
115 * The unlocking fastpath is the 0->1 transition from 'locked' 114 * The unlocking fastpath is the 0->1 transition from 'locked'
@@ -234,7 +233,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
234/* 233/*
235 * Release the lock, slowpath: 234 * Release the lock, slowpath:
236 */ 235 */
237static fastcall inline void 236static inline void
238__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) 237__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
239{ 238{
240 struct mutex *lock = container_of(lock_count, struct mutex, count); 239 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -271,7 +270,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
271/* 270/*
272 * Release the lock, slowpath: 271 * Release the lock, slowpath:
273 */ 272 */
274static fastcall noinline void 273static noinline void
275__mutex_unlock_slowpath(atomic_t *lock_count) 274__mutex_unlock_slowpath(atomic_t *lock_count)
276{ 275{
277 __mutex_unlock_common_slowpath(lock_count, 1); 276 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -282,10 +281,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
282 * Here come the less common (and hence less performance-critical) APIs: 281 * Here come the less common (and hence less performance-critical) APIs:
283 * mutex_lock_interruptible() and mutex_trylock(). 282 * mutex_lock_interruptible() and mutex_trylock().
284 */ 283 */
285static int fastcall noinline __sched 284static noinline int __sched
286__mutex_lock_killable_slowpath(atomic_t *lock_count); 285__mutex_lock_killable_slowpath(atomic_t *lock_count);
287 286
288static noinline int fastcall __sched 287static noinline int __sched
289__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 288__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
290 289
291/*** 290/***
@@ -299,7 +298,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
299 * 298 *
300 * This function is similar to (but not equivalent to) down_interruptible(). 299 * This function is similar to (but not equivalent to) down_interruptible().
301 */ 300 */
302int fastcall __sched mutex_lock_interruptible(struct mutex *lock) 301int __sched mutex_lock_interruptible(struct mutex *lock)
303{ 302{
304 might_sleep(); 303 might_sleep();
305 return __mutex_fastpath_lock_retval 304 return __mutex_fastpath_lock_retval
@@ -308,7 +307,7 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
308 307
309EXPORT_SYMBOL(mutex_lock_interruptible); 308EXPORT_SYMBOL(mutex_lock_interruptible);
310 309
311int fastcall __sched mutex_lock_killable(struct mutex *lock) 310int __sched mutex_lock_killable(struct mutex *lock)
312{ 311{
313 might_sleep(); 312 might_sleep();
314 return __mutex_fastpath_lock_retval 313 return __mutex_fastpath_lock_retval
@@ -316,7 +315,7 @@ int fastcall __sched mutex_lock_killable(struct mutex *lock)
316} 315}
317EXPORT_SYMBOL(mutex_lock_killable); 316EXPORT_SYMBOL(mutex_lock_killable);
318 317
319static void fastcall noinline __sched 318static noinline void __sched
320__mutex_lock_slowpath(atomic_t *lock_count) 319__mutex_lock_slowpath(atomic_t *lock_count)
321{ 320{
322 struct mutex *lock = container_of(lock_count, struct mutex, count); 321 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -324,7 +323,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
324 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 323 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
325} 324}
326 325
327static int fastcall noinline __sched 326static noinline int __sched
328__mutex_lock_killable_slowpath(atomic_t *lock_count) 327__mutex_lock_killable_slowpath(atomic_t *lock_count)
329{ 328{
330 struct mutex *lock = container_of(lock_count, struct mutex, count); 329 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -332,7 +331,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
332 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 331 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
333} 332}
334 333
335static noinline int fastcall __sched 334static noinline int __sched
336__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 335__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
337{ 336{
338 struct mutex *lock = container_of(lock_count, struct mutex, count); 337 struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -381,7 +380,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
381 * This function must not be used in interrupt context. The 380 * This function must not be used in interrupt context. The
382 * mutex must be released by the same task that acquired it. 381 * mutex must be released by the same task that acquired it.
383 */ 382 */
384int fastcall __sched mutex_trylock(struct mutex *lock) 383int __sched mutex_trylock(struct mutex *lock)
385{ 384{
386 return __mutex_fastpath_trylock(&lock->count, 385 return __mutex_fastpath_trylock(&lock->count,
387 __mutex_trylock_slowpath); 386 __mutex_trylock_slowpath);
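
Beyond dropping the obsolete fastcall annotation, the mutex hunks above show the sleeping-lock entry points side by side, including mutex_lock_killable(). Both it and mutex_lock_interruptible() return 0 on success and -EINTR when a signal (a fatal one, in the killable case) cuts the wait short, so the return value must be checked; a small hypothetical sketch:

#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);		/* hypothetical lock */

static int do_protected_work(void)
{
	/* Sleep until the lock is ours, but give up if a signal arrives. */
	if (mutex_lock_interruptible(&my_lock))
		return -ERESTARTSYS;	/* typical reaction on a syscall path */

	/* ... critical section ... */

	mutex_unlock(&my_lock);
	return 0;
}
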
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4253f472f060..643360d1bb14 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -4,6 +4,7 @@
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7#include <linux/reboot.h>
7 8
8/* 9/*
9 * Notifier list for kernel code which wants to be called 10 * Notifier list for kernel code which wants to be called
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 79f871bc0ef4..f5d332cf8c63 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,7 @@
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h>
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
diff --git a/kernel/panic.c b/kernel/panic.c
index d9e90cfe3298..24af9f8bac99 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -161,7 +161,7 @@ const char *print_tainted(void)
161{ 161{
162 static char buf[20]; 162 static char buf[20];
163 if (tainted) { 163 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", 164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -169,7 +169,8 @@ const char *print_tainted(void)
169 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 169 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 170 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 171 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' '); 172 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ');
173 } 174 }
174 else 175 else
175 snprintf(buf, sizeof(buf), "Not tainted"); 176 snprintf(buf, sizeof(buf), "Not tainted");
diff --git a/kernel/params.c b/kernel/params.c
index 42fe5e6126c0..afc46a23eb6d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -180,12 +180,12 @@ int parse_args(const char *name,
180#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 180#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
181 int param_set_##name(const char *val, struct kernel_param *kp) \ 181 int param_set_##name(const char *val, struct kernel_param *kp) \
182 { \ 182 { \
183 char *endp; \
184 tmptype l; \ 183 tmptype l; \
184 int ret; \
185 \ 185 \
186 if (!val) return -EINVAL; \ 186 if (!val) return -EINVAL; \
187 l = strtolfn(val, &endp, 0); \ 187 ret = strtolfn(val, 0, &l); \
188 if (endp == val || ((type)l != l)) \ 188 if (ret == -EINVAL || ((type)l != l)) \
189 return -EINVAL; \ 189 return -EINVAL; \
190 *((type *)kp->arg) = l; \ 190 *((type *)kp->arg) = l; \
191 return 0; \ 191 return 0; \
@@ -195,13 +195,13 @@ int parse_args(const char *name,
195 return sprintf(buffer, format, *((type *)kp->arg)); \ 195 return sprintf(buffer, format, *((type *)kp->arg)); \
196 } 196 }
197 197
198STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); 198STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
199STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); 199STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
200STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); 200STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
201STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); 201STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
202STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); 202STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
203STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); 203STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
204STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); 204STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
205 205
206int param_set_charp(const char *val, struct kernel_param *kp) 206int param_set_charp(const char *val, struct kernel_param *kp)
207{ 207{
@@ -272,7 +272,7 @@ static int param_array(const char *name,
272 unsigned int min, unsigned int max, 272 unsigned int min, unsigned int max,
273 void *elem, int elemsize, 273 void *elem, int elemsize,
274 int (*set)(const char *, struct kernel_param *kp), 274 int (*set)(const char *, struct kernel_param *kp),
275 int *num) 275 unsigned int *num)
276{ 276{
277 int ret; 277 int ret;
278 struct kernel_param kp; 278 struct kernel_param kp;
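
params.c now parses module parameters with strict_strtoul()/strict_strtol(), which report failure through the return value and reject trailing garbage instead of silently stopping at it. A sketch of the same call pattern the macro above generates (parse_count() and its caller are invented for illustration):

#include <linux/errno.h>
#include <linux/kernel.h>	/* strict_strtoul() */

static int parse_count(const char *val, unsigned int *out)
{
	unsigned long l;
	int ret;

	ret = strict_strtoul(val, 0, &l);	/* base 0: decimal, 0x.. or 0.. */
	if (ret || (unsigned int)l != l)	/* parse error, or value overflows the target type */
		return -EINVAL;

	*out = l;
	return 0;
}

/* parse_count("42", &n) succeeds; parse_count("42abc", &n) now fails. */
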
diff --git a/kernel/pid.c b/kernel/pid.c
index f815455431bf..477691576b33 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,7 +41,6 @@
41static struct hlist_head *pid_hash; 41static struct hlist_head *pid_hash;
42static int pidhash_shift; 42static int pidhash_shift;
43struct pid init_struct_pid = INIT_STRUCT_PID; 43struct pid init_struct_pid = INIT_STRUCT_PID;
44static struct kmem_cache *pid_ns_cachep;
45 44
46int pid_max = PID_MAX_DEFAULT; 45int pid_max = PID_MAX_DEFAULT;
47 46
@@ -112,7 +111,7 @@ EXPORT_SYMBOL(is_container_init);
112 111
113static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
114 113
115static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) 114static void free_pidmap(struct pid_namespace *pid_ns, int pid)
116{ 115{
117 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 116 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
118 int offset = pid & BITS_PER_PAGE_MASK; 117 int offset = pid & BITS_PER_PAGE_MASK;
@@ -181,7 +180,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
181 return -1; 180 return -1;
182} 181}
183 182
184static int next_pidmap(struct pid_namespace *pid_ns, int last) 183int next_pidmap(struct pid_namespace *pid_ns, int last)
185{ 184{
186 int offset; 185 int offset;
187 struct pidmap *map, *end; 186 struct pidmap *map, *end;
@@ -199,7 +198,7 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last)
199 return -1; 198 return -1;
200} 199}
201 200
202fastcall void put_pid(struct pid *pid) 201void put_pid(struct pid *pid)
203{ 202{
204 struct pid_namespace *ns; 203 struct pid_namespace *ns;
205 204
@@ -221,7 +220,7 @@ static void delayed_put_pid(struct rcu_head *rhp)
221 put_pid(pid); 220 put_pid(pid);
222} 221}
223 222
224fastcall void free_pid(struct pid *pid) 223void free_pid(struct pid *pid)
225{ 224{
226 /* We can be called with write_lock_irq(&tasklist_lock) held */ 225 /* We can be called with write_lock_irq(&tasklist_lock) held */
227 int i; 226 int i;
@@ -287,7 +286,7 @@ out_free:
287 goto out; 286 goto out;
288} 287}
289 288
290struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns) 289struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
291{ 290{
292 struct hlist_node *elem; 291 struct hlist_node *elem;
293 struct upid *pnr; 292 struct upid *pnr;
@@ -317,7 +316,7 @@ EXPORT_SYMBOL_GPL(find_pid);
317/* 316/*
318 * attach_pid() must be called with the tasklist_lock write-held. 317 * attach_pid() must be called with the tasklist_lock write-held.
319 */ 318 */
320int fastcall attach_pid(struct task_struct *task, enum pid_type type, 319int attach_pid(struct task_struct *task, enum pid_type type,
321 struct pid *pid) 320 struct pid *pid)
322{ 321{
323 struct pid_link *link; 322 struct pid_link *link;
@@ -329,7 +328,7 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type,
329 return 0; 328 return 0;
330} 329}
331 330
332void fastcall detach_pid(struct task_struct *task, enum pid_type type) 331void detach_pid(struct task_struct *task, enum pid_type type)
333{ 332{
334 struct pid_link *link; 333 struct pid_link *link;
335 struct pid *pid; 334 struct pid *pid;
@@ -349,7 +348,7 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
349} 348}
350 349
351/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 350/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
352void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, 351void transfer_pid(struct task_struct *old, struct task_struct *new,
353 enum pid_type type) 352 enum pid_type type)
354{ 353{
355 new->pids[type].pid = old->pids[type].pid; 354 new->pids[type].pid = old->pids[type].pid;
@@ -357,7 +356,7 @@ void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
357 old->pids[type].pid = NULL; 356 old->pids[type].pid = NULL;
358} 357}
359 358
360struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) 359struct task_struct *pid_task(struct pid *pid, enum pid_type type)
361{ 360{
362 struct task_struct *result = NULL; 361 struct task_struct *result = NULL;
363 if (pid) { 362 if (pid) {
@@ -368,6 +367,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
368 } 367 }
369 return result; 368 return result;
370} 369}
370EXPORT_SYMBOL(pid_task);
371 371
372/* 372/*
373 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 373 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
@@ -408,7 +408,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
408 return pid; 408 return pid;
409} 409}
410 410
411struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) 411struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
412{ 412{
413 struct task_struct *result; 413 struct task_struct *result;
414 rcu_read_lock(); 414 rcu_read_lock();
@@ -443,6 +443,12 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
443 return nr; 443 return nr;
444} 444}
445 445
446pid_t pid_vnr(struct pid *pid)
447{
448 return pid_nr_ns(pid, current->nsproxy->pid_ns);
449}
450EXPORT_SYMBOL_GPL(pid_vnr);
451
446pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 452pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
447{ 453{
448 return pid_nr_ns(task_pid(tsk), ns); 454 return pid_nr_ns(task_pid(tsk), ns);
@@ -487,180 +493,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
487} 493}
488EXPORT_SYMBOL_GPL(find_get_pid); 494EXPORT_SYMBOL_GPL(find_get_pid);
489 495
490struct pid_cache {
491 int nr_ids;
492 char name[16];
493 struct kmem_cache *cachep;
494 struct list_head list;
495};
496
497static LIST_HEAD(pid_caches_lh);
498static DEFINE_MUTEX(pid_caches_mutex);
499
500/*
501 * creates the kmem cache to allocate pids from.
502 * @nr_ids: the number of numerical ids this pid will have to carry
503 */
504
505static struct kmem_cache *create_pid_cachep(int nr_ids)
506{
507 struct pid_cache *pcache;
508 struct kmem_cache *cachep;
509
510 mutex_lock(&pid_caches_mutex);
511 list_for_each_entry (pcache, &pid_caches_lh, list)
512 if (pcache->nr_ids == nr_ids)
513 goto out;
514
515 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
516 if (pcache == NULL)
517 goto err_alloc;
518
519 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
520 cachep = kmem_cache_create(pcache->name,
521 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
522 0, SLAB_HWCACHE_ALIGN, NULL);
523 if (cachep == NULL)
524 goto err_cachep;
525
526 pcache->nr_ids = nr_ids;
527 pcache->cachep = cachep;
528 list_add(&pcache->list, &pid_caches_lh);
529out:
530 mutex_unlock(&pid_caches_mutex);
531 return pcache->cachep;
532
533err_cachep:
534 kfree(pcache);
535err_alloc:
536 mutex_unlock(&pid_caches_mutex);
537 return NULL;
538}
539
540#ifdef CONFIG_PID_NS
541static struct pid_namespace *create_pid_namespace(int level)
542{
543 struct pid_namespace *ns;
544 int i;
545
546 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
547 if (ns == NULL)
548 goto out;
549
550 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
551 if (!ns->pidmap[0].page)
552 goto out_free;
553
554 ns->pid_cachep = create_pid_cachep(level + 1);
555 if (ns->pid_cachep == NULL)
556 goto out_free_map;
557
558 kref_init(&ns->kref);
559 ns->last_pid = 0;
560 ns->child_reaper = NULL;
561 ns->level = level;
562
563 set_bit(0, ns->pidmap[0].page);
564 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
565
566 for (i = 1; i < PIDMAP_ENTRIES; i++) {
567 ns->pidmap[i].page = 0;
568 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
569 }
570
571 return ns;
572
573out_free_map:
574 kfree(ns->pidmap[0].page);
575out_free:
576 kmem_cache_free(pid_ns_cachep, ns);
577out:
578 return ERR_PTR(-ENOMEM);
579}
580
581static void destroy_pid_namespace(struct pid_namespace *ns)
582{
583 int i;
584
585 for (i = 0; i < PIDMAP_ENTRIES; i++)
586 kfree(ns->pidmap[i].page);
587 kmem_cache_free(pid_ns_cachep, ns);
588}
589
590struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
591{
592 struct pid_namespace *new_ns;
593
594 BUG_ON(!old_ns);
595 new_ns = get_pid_ns(old_ns);
596 if (!(flags & CLONE_NEWPID))
597 goto out;
598
599 new_ns = ERR_PTR(-EINVAL);
600 if (flags & CLONE_THREAD)
601 goto out_put;
602
603 new_ns = create_pid_namespace(old_ns->level + 1);
604 if (!IS_ERR(new_ns))
605 new_ns->parent = get_pid_ns(old_ns);
606
607out_put:
608 put_pid_ns(old_ns);
609out:
610 return new_ns;
611}
612
613void free_pid_ns(struct kref *kref)
614{
615 struct pid_namespace *ns, *parent;
616
617 ns = container_of(kref, struct pid_namespace, kref);
618
619 parent = ns->parent;
620 destroy_pid_namespace(ns);
621
622 if (parent != NULL)
623 put_pid_ns(parent);
624}
625#endif /* CONFIG_PID_NS */
626
627void zap_pid_ns_processes(struct pid_namespace *pid_ns)
628{
629 int nr;
630 int rc;
631
632 /*
633 * The last thread in the cgroup-init thread group is terminating.
634 * Find remaining pid_ts in the namespace, signal and wait for them
635 * to exit.
636 *
637 * Note: This signals each threads in the namespace - even those that
638 * belong to the same thread group, To avoid this, we would have
639 * to walk the entire tasklist looking a processes in this
640 * namespace, but that could be unnecessarily expensive if the
641 * pid namespace has just a few processes. Or we need to
642 * maintain a tasklist for each pid namespace.
643 *
644 */
645 read_lock(&tasklist_lock);
646 nr = next_pidmap(pid_ns, 1);
647 while (nr > 0) {
648 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
649 nr = next_pidmap(pid_ns, nr);
650 }
651 read_unlock(&tasklist_lock);
652
653 do {
654 clear_thread_flag(TIF_SIGPENDING);
655 rc = sys_wait4(-1, NULL, __WALL, NULL);
656 } while (rc != -ECHILD);
657
658
659 /* Child reaper for the pid namespace is going away */
660 pid_ns->child_reaper = NULL;
661 return;
662}
663
664/* 496/*
665 * The pid hash table is scaled according to the amount of memory in the 497 * The pid hash table is scaled according to the amount of memory in the
666 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 498 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -693,9 +525,6 @@ void __init pidmap_init(void)
693 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
694 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 526 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
695 527
696 init_pid_ns.pid_cachep = create_pid_cachep(1); 528 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
697 if (init_pid_ns.pid_cachep == NULL) 529 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
698 panic("Can't create pid_1 cachep\n");
699
700 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
701} 530}
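
Among the pid.c changes above, pid_vnr() is added as shorthand for pid_nr_ns() against the caller's own namespace, and pid_task() gains an export. A hedged usage sketch (report_task() is invented; the caller is assumed to keep tsk valid, e.g. under rcu_read_lock()):

#include <linux/kernel.h>
#include <linux/pid.h>
#include <linux/sched.h>

static void report_task(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	/* Global pid number versus the number seen from current's pid namespace. */
	printk(KERN_INFO "global %d, in current ns %d\n",
	       pid_nr(pid), pid_vnr(pid));
}
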
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
new file mode 100644
index 000000000000..6d792b66d854
--- /dev/null
+++ b/kernel/pid_namespace.c
@@ -0,0 +1,197 @@
1/*
2 * Pid namespaces
3 *
4 * Authors:
5 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
6 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
7 * Many thanks to Oleg Nesterov for comments and help
8 *
9 */
10
11#include <linux/pid.h>
12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h>
14#include <linux/err.h>
15
16#define BITS_PER_PAGE (PAGE_SIZE*8)
17
18struct pid_cache {
19 int nr_ids;
20 char name[16];
21 struct kmem_cache *cachep;
22 struct list_head list;
23};
24
25static LIST_HEAD(pid_caches_lh);
26static DEFINE_MUTEX(pid_caches_mutex);
27static struct kmem_cache *pid_ns_cachep;
28
29/*
30 * creates the kmem cache to allocate pids from.
31 * @nr_ids: the number of numerical ids this pid will have to carry
32 */
33
34static struct kmem_cache *create_pid_cachep(int nr_ids)
35{
36 struct pid_cache *pcache;
37 struct kmem_cache *cachep;
38
39 mutex_lock(&pid_caches_mutex);
40 list_for_each_entry(pcache, &pid_caches_lh, list)
41 if (pcache->nr_ids == nr_ids)
42 goto out;
43
44 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
45 if (pcache == NULL)
46 goto err_alloc;
47
48 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
49 cachep = kmem_cache_create(pcache->name,
50 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
51 0, SLAB_HWCACHE_ALIGN, NULL);
52 if (cachep == NULL)
53 goto err_cachep;
54
55 pcache->nr_ids = nr_ids;
56 pcache->cachep = cachep;
57 list_add(&pcache->list, &pid_caches_lh);
58out:
59 mutex_unlock(&pid_caches_mutex);
60 return pcache->cachep;
61
62err_cachep:
63 kfree(pcache);
64err_alloc:
65 mutex_unlock(&pid_caches_mutex);
66 return NULL;
67}
68
69static struct pid_namespace *create_pid_namespace(int level)
70{
71 struct pid_namespace *ns;
72 int i;
73
74 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
75 if (ns == NULL)
76 goto out;
77
78 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
79 if (!ns->pidmap[0].page)
80 goto out_free;
81
82 ns->pid_cachep = create_pid_cachep(level + 1);
83 if (ns->pid_cachep == NULL)
84 goto out_free_map;
85
86 kref_init(&ns->kref);
87 ns->last_pid = 0;
88 ns->child_reaper = NULL;
89 ns->level = level;
90
91 set_bit(0, ns->pidmap[0].page);
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 }
98
99 return ns;
100
101out_free_map:
102 kfree(ns->pidmap[0].page);
103out_free:
104 kmem_cache_free(pid_ns_cachep, ns);
105out:
106 return ERR_PTR(-ENOMEM);
107}
108
109static void destroy_pid_namespace(struct pid_namespace *ns)
110{
111 int i;
112
113 for (i = 0; i < PIDMAP_ENTRIES; i++)
114 kfree(ns->pidmap[i].page);
115 kmem_cache_free(pid_ns_cachep, ns);
116}
117
118struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
119{
120 struct pid_namespace *new_ns;
121
122 BUG_ON(!old_ns);
123 new_ns = get_pid_ns(old_ns);
124 if (!(flags & CLONE_NEWPID))
125 goto out;
126
127 new_ns = ERR_PTR(-EINVAL);
128 if (flags & CLONE_THREAD)
129 goto out_put;
130
131 new_ns = create_pid_namespace(old_ns->level + 1);
132 if (!IS_ERR(new_ns))
133 new_ns->parent = get_pid_ns(old_ns);
134
135out_put:
136 put_pid_ns(old_ns);
137out:
138 return new_ns;
139}
140
141void free_pid_ns(struct kref *kref)
142{
143 struct pid_namespace *ns, *parent;
144
145 ns = container_of(kref, struct pid_namespace, kref);
146
147 parent = ns->parent;
148 destroy_pid_namespace(ns);
149
150 if (parent != NULL)
151 put_pid_ns(parent);
152}
153
154void zap_pid_ns_processes(struct pid_namespace *pid_ns)
155{
156 int nr;
157 int rc;
158
159 /*
160 * The last thread in the cgroup-init thread group is terminating.
161 * Find remaining pid_ts in the namespace, signal and wait for them
162 * to exit.
163 *
 164 * Note: This signals each thread in the namespace - even those that
 165 * belong to the same thread group. To avoid this, we would have
 166 * to walk the entire tasklist looking for processes in this
167 * namespace, but that could be unnecessarily expensive if the
168 * pid namespace has just a few processes. Or we need to
169 * maintain a tasklist for each pid namespace.
170 *
171 */
172 read_lock(&tasklist_lock);
173 nr = next_pidmap(pid_ns, 1);
174 while (nr > 0) {
175 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
176 nr = next_pidmap(pid_ns, nr);
177 }
178 read_unlock(&tasklist_lock);
179
180 do {
181 clear_thread_flag(TIF_SIGPENDING);
182 rc = sys_wait4(-1, NULL, __WALL, NULL);
183 } while (rc != -ECHILD);
184
185
186 /* Child reaper for the pid namespace is going away */
187 pid_ns->child_reaper = NULL;
188 return;
189}
190
191static __init int pid_namespaces_init(void)
192{
193 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
194 return 0;
195}
196
197__initcall(pid_namespaces_init);
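
copy_pid_ns() above only builds a child namespace when CLONE_NEWPID is set and rejects the CLONE_NEWPID|CLONE_THREAD combination. From user space the whole mechanism is driven by that clone flag; a hypothetical, privilege-requiring sketch (the flag value is taken from <linux/sched.h> in case the libc headers predate it):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef CLONE_NEWPID
#define CLONE_NEWPID 0x20000000	/* new pid namespace, from <linux/sched.h> */
#endif

static char child_stack[16 * 1024];

static int child(void *arg)
{
	/* Inside the fresh namespace this prints 1. */
	printf("child sees itself as pid %d\n", (int)getpid());
	return 0;
}

int main(void)
{
	/* CLONE_NEWPID is what routes clone() through copy_pid_ns() above. */
	pid_t pid = clone(child, child_stack + sizeof(child_stack),
			  CLONE_NEWPID | SIGCHLD, NULL);

	if (pid < 0) {
		perror("clone");
		return EXIT_FAILURE;
	}
	waitpid(pid, NULL, 0);
	return 0;
}
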
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
new file mode 100644
index 000000000000..0afe32be4c85
--- /dev/null
+++ b/kernel/pm_qos_params.c
@@ -0,0 +1,425 @@
1/*
2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of:
4 *
5 * Dependents on a QoS value : register requirements
6 * Watchers of QoS value : get notified when target QoS value changes
7 *
8 * This QoS design is best effort based. Dependents register their QoS needs.
9 * Watchers register to keep track of the current QoS needs of the system.
10 *
11 * There are 3 basic classes of QoS parameter: latency, timeout, throughput
12 * each have defined units:
13 * latency: usec
14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec)
16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers
18 *
19 * User mode requirements on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 *
27 * mark gross mgross@linux.intel.com
28 */
29
30#include <linux/pm_qos_params.h>
31#include <linux/sched.h>
32#include <linux/spinlock.h>
33#include <linux/slab.h>
34#include <linux/time.h>
35#include <linux/fs.h>
36#include <linux/device.h>
37#include <linux/miscdevice.h>
38#include <linux/string.h>
39#include <linux/platform_device.h>
40#include <linux/init.h>
41
42#include <linux/uaccess.h>
43
44/*
45 * locking rule: all changes to target_value or requirements or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all
48 */
49struct requirement_list {
50 struct list_head list;
51 union {
52 s32 value;
53 s32 usec;
54 s32 kbps;
55 };
56 char *name;
57};
58
59static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2);
61
62struct pm_qos_object {
63 struct requirement_list requirements;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev;
66 char *name;
67 s32 default_value;
68 s32 target_value;
69 s32 (*comparitor)(s32, s32);
70};
71
72static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)},
76 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC,
79 .target_value = 2000 * USEC_PER_SEC,
80 .comparitor = min_compare
81};
82
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)},
86 .notifiers = &network_lat_notifier,
87 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC,
89 .target_value = 2000 * USEC_PER_SEC,
90 .comparitor = min_compare
91};
92
93
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements =
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput",
100 .default_value = 0,
101 .target_value = 0,
102 .comparitor = max_compare
103};
104
105
106static struct pm_qos_object *pm_qos_array[] = {
107 &null_pm_qos,
108 &cpu_dma_pm_qos,
109 &network_lat_pm_qos,
110 &network_throughput_pm_qos
111};
112
113static DEFINE_SPINLOCK(pm_qos_lock);
114
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
116 size_t count, loff_t *f_pos);
117static int pm_qos_power_open(struct inode *inode, struct file *filp);
118static int pm_qos_power_release(struct inode *inode, struct file *filp);
119
120static const struct file_operations pm_qos_power_fops = {
121 .write = pm_qos_power_write,
122 .open = pm_qos_power_open,
123 .release = pm_qos_power_release,
124};
125
126/* static helper functions */
127static s32 max_compare(s32 v1, s32 v2)
128{
129 return max(v1, v2);
130}
131
132static s32 min_compare(s32 v1, s32 v2)
133{
134 return min(v1, v2);
135}
136
137
138static void update_target(int target)
139{
140 s32 extreme_value;
141 struct requirement_list *node;
142 unsigned long flags;
143 int call_notifier = 0;
144
145 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value;
147 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor(
150 extreme_value, node->value);
151 }
152 if (pm_qos_array[target]->target_value != extreme_value) {
153 call_notifier = 1;
154 pm_qos_array[target]->target_value = extreme_value;
155		pr_debug("new target for qos %d is %d\n", target,
156 pm_qos_array[target]->target_value);
157 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags);
159
160 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers,
162 (unsigned long) extreme_value, NULL);
163}
164
165static int register_pm_qos_misc(struct pm_qos_object *qos)
166{
167 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
168 qos->pm_qos_power_miscdev.name = qos->name;
169 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
170
171 return misc_register(&qos->pm_qos_power_miscdev);
172}
173
174static int find_pm_qos_object_by_minor(int minor)
175{
176 int pm_qos_class;
177
178 for (pm_qos_class = 0;
179 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
180 if (minor ==
181 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
182 return pm_qos_class;
183 }
184 return -1;
185}
186
187/**
188 * pm_qos_requirement - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested
190 *
191 * This function returns the current target value in an atomic manner.
192 */
193int pm_qos_requirement(int pm_qos_class)
194{
195 int ret_val;
196 unsigned long flags;
197
198 spin_lock_irqsave(&pm_qos_lock, flags);
199 ret_val = pm_qos_array[pm_qos_class]->target_value;
200 spin_unlock_irqrestore(&pm_qos_lock, flags);
201
202 return ret_val;
203}
204EXPORT_SYMBOL_GPL(pm_qos_requirement);
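
On the read side, a consumer can poll the aggregate before committing to something latency-sensitive. A hedged sketch, assuming the same PM_QOS_CPU_DMA_LATENCY class constant; the caller and its exit-latency parameter are hypothetical, not from this patch.

/* Illustrative check a cpuidle-style consumer could make before entering a
 * deep sleep state whose wakeup latency is exit_latency_usec. */
static int mydrv_state_is_allowed(s32 exit_latency_usec)
{
	return exit_latency_usec <= pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
}
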
205
206/**
207 * pm_qos_add_requirement - inserts new qos request into the list
208 * @pm_qos_class: identifies which list of qos requests to use
209 * @name: identifies the request
210 * @value: defines the qos request
211 *
212 * This function inserts a new entry in the pm_qos_class list of requested qos
213 * performance characteristics. It recomputes the aggregate QoS expectations for
214 * the pm_qos_class of parameters.
215 */
216int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
217{
218 struct requirement_list *dep;
219 unsigned long flags;
220
221 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL);
222 if (dep) {
223 if (value == PM_QOS_DEFAULT_VALUE)
224 dep->value = pm_qos_array[pm_qos_class]->default_value;
225 else
226 dep->value = value;
227 dep->name = kstrdup(name, GFP_KERNEL);
228 if (!dep->name)
229 goto cleanup;
230
231 spin_lock_irqsave(&pm_qos_lock, flags);
232 list_add(&dep->list,
233 &pm_qos_array[pm_qos_class]->requirements.list);
234 spin_unlock_irqrestore(&pm_qos_lock, flags);
235 update_target(pm_qos_class);
236
237 return 0;
238 }
239
240cleanup:
241 kfree(dep);
242 return -ENOMEM;
243}
244EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
245
246/**
247 * pm_qos_update_requirement - modifies an existing qos request
248 * @pm_qos_class: identifies which list of qos requests to use
249 * @name: identifies the request
250 * @value: defines the qos request
251 *
252 * Updates an existing qos requirement for the pm_qos_class of parameters along
253 * with updating the target pm_qos_class value.
254 *
255 * If the named request isn't in the list then no change is made.
256 */
257int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
258{
259 unsigned long flags;
260 struct requirement_list *node;
261 int pending_update = 0;
262
263 spin_lock_irqsave(&pm_qos_lock, flags);
264 list_for_each_entry(node,
265 &pm_qos_array[pm_qos_class]->requirements.list, list) {
266 if (strcmp(node->name, name) == 0) {
267 if (new_value == PM_QOS_DEFAULT_VALUE)
268 node->value =
269 pm_qos_array[pm_qos_class]->default_value;
270 else
271 node->value = new_value;
272 pending_update = 1;
273 break;
274 }
275 }
276 spin_unlock_irqrestore(&pm_qos_lock, flags);
277 if (pending_update)
278 update_target(pm_qos_class);
279
280 return 0;
281}
282EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
283
284/**
285 * pm_qos_remove_requirement - modifies an existing qos request
286 * @pm_qos_class: identifies which list of qos requests to use
287 * @name: identifies the request
288 *
289 * Will remove the named qos request from the pm_qos_class list of parameters and
290 * recompute the current target value for the pm_qos_class.
291 */
292void pm_qos_remove_requirement(int pm_qos_class, char *name)
293{
294 unsigned long flags;
295 struct requirement_list *node;
296 int pending_update = 0;
297
298 spin_lock_irqsave(&pm_qos_lock, flags);
299 list_for_each_entry(node,
300 &pm_qos_array[pm_qos_class]->requirements.list, list) {
301 if (strcmp(node->name, name) == 0) {
302 kfree(node->name);
303 list_del(&node->list);
304 kfree(node);
305 pending_update = 1;
306 break;
307 }
308 }
309 spin_unlock_irqrestore(&pm_qos_lock, flags);
310 if (pending_update)
311 update_target(pm_qos_class);
312}
313EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
314
315/**
316 * pm_qos_add_notifier - sets notification entry for changes to target value
317 * @pm_qos_class: identifies which qos target changes should be notified.
318 * @notifier: notifier block managed by caller.
319 *
320 * Will register the notifier into a notification chain that gets called
321 * upon changes to the pm_qos_class target value.
322 */
323int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
324{
325 int retval;
326
327 retval = blocking_notifier_chain_register(
328 pm_qos_array[pm_qos_class]->notifiers, notifier);
329
330 return retval;
331}
332EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
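
A watcher registers a standard notifier_block; the recomputed aggregate arrives as the unsigned long argument of the callback. Sketch under the same assumptions as above; the callback name and what it does with the value are hypothetical.

static int mydrv_lat_notify(struct notifier_block *nb, unsigned long new_target,
			    void *data)
{
	/* new_target is the recomputed cpu_dma_latency aggregate, in usec. */
	pr_debug("cpu_dma_latency target is now %lu usec\n", new_target);
	return NOTIFY_OK;
}

static struct notifier_block mydrv_lat_nb = {
	.notifier_call = mydrv_lat_notify,
};

/* In driver init (illustrative):
 *	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, &mydrv_lat_nb);
 * with the matching pm_qos_remove_notifier() call on teardown.
 */
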
333
334/**
335 * pm_qos_remove_notifier - deletes notification entry from chain.
336 * @pm_qos_class: identifies which qos target changes are notified.
337 * @notifier: notifier block to be removed.
338 *
339 * Will remove the notifier from the notification chain that gets called
340 * upon changes to the pm_qos_class target value.
341 */
342int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343{
344 int retval;
345
346 retval = blocking_notifier_chain_unregister(
347 pm_qos_array[pm_qos_class]->notifiers, notifier);
348
349 return retval;
350}
351EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
352
353#define PID_NAME_LEN sizeof("process_1234567890")
354static char name[PID_NAME_LEN];
355
356static int pm_qos_power_open(struct inode *inode, struct file *filp)
357{
358 int ret;
359 long pm_qos_class;
360
361 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
362 if (pm_qos_class >= 0) {
363 filp->private_data = (void *)pm_qos_class;
364 sprintf(name, "process_%d", current->pid);
365 ret = pm_qos_add_requirement(pm_qos_class, name,
366 PM_QOS_DEFAULT_VALUE);
367 if (ret >= 0)
368 return 0;
369 }
370
371 return -EPERM;
372}
373
374static int pm_qos_power_release(struct inode *inode, struct file *filp)
375{
376 int pm_qos_class;
377
378 pm_qos_class = (long)filp->private_data;
379 sprintf(name, "process_%d", current->pid);
380 pm_qos_remove_requirement(pm_qos_class, name);
381
382 return 0;
383}
384
385static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
386 size_t count, loff_t *f_pos)
387{
388 s32 value;
389 int pm_qos_class;
390
391 pm_qos_class = (long)filp->private_data;
392 if (count != sizeof(s32))
393 return -EINVAL;
394 if (copy_from_user(&value, buf, sizeof(s32)))
395 return -EFAULT;
396 sprintf(name, "process_%d", current->pid);
397 pm_qos_update_requirement(pm_qos_class, name, value);
398
399 return sizeof(s32);
400}
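
From user space, the misc device nodes registered below take a raw binary s32 and keep the requirement alive for as long as the file descriptor stays open, exactly as the open/write/release handlers above implement. A hedged user-space sketch; the node name /dev/cpu_dma_latency is assumed to follow from the miscdevice name, and error handling is trimmed.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t target = 50;			/* request <= 50 usec DMA latency */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, &target, sizeof(target));	/* must be exactly sizeof(s32) */
	pause();				/* requirement held while fd is open */
	close(fd);				/* release drops the requirement */
	return 0;
}
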
401
402
403static int __init pm_qos_power_init(void)
404{
405 int ret = 0;
406
407 ret = register_pm_qos_misc(&cpu_dma_pm_qos);
408 if (ret < 0) {
409 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
410 return ret;
411 }
412 ret = register_pm_qos_misc(&network_lat_pm_qos);
413 if (ret < 0) {
414 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
415 return ret;
416 }
417 ret = register_pm_qos_misc(&network_throughput_pm_qos);
418 if (ret < 0)
419 printk(KERN_ERR
420 "pm_qos_param: network_throughput setup failed\n");
421
422 return ret;
423}
424
425late_initcall(pm_qos_power_init);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0b7c82ac467e..2eae91f954ca 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -20,7 +20,7 @@ static int check_clock(const clockid_t which_clock)
20 return 0; 20 return 0;
21 21
22 read_lock(&tasklist_lock); 22 read_lock(&tasklist_lock);
23 p = find_task_by_pid(pid); 23 p = find_task_by_vpid(pid);
24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
25 same_thread_group(p, current) : thread_group_leader(p))) { 25 same_thread_group(p, current) : thread_group_leader(p))) {
26 error = -EINVAL; 26 error = -EINVAL;
@@ -305,7 +305,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 rcu_read_lock(); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_vpid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
311 if (same_thread_group(p, current)) { 311 if (same_thread_group(p, current)) {
@@ -354,7 +354,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
354 if (pid == 0) { 354 if (pid == 0) {
355 p = current; 355 p = current;
356 } else { 356 } else {
357 p = find_task_by_pid(pid); 357 p = find_task_by_vpid(pid);
358 if (p && !same_thread_group(p, current)) 358 if (p && !same_thread_group(p, current))
359 p = NULL; 359 p = NULL;
360 } 360 }
@@ -362,7 +362,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
362 if (pid == 0) { 362 if (pid == 0) {
363 p = current->group_leader; 363 p = current->group_leader;
364 } else { 364 } else {
365 p = find_task_by_pid(pid); 365 p = find_task_by_vpid(pid);
366 if (p && !thread_group_leader(p)) 366 if (p && !thread_group_leader(p))
367 p = NULL; 367 p = NULL;
368 } 368 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 36d563fd9e3b..a9b04203a66d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,8 +256,9 @@ static void schedule_next_timer(struct k_itimer *timr)
256 if (timr->it.real.interval.tv64 == 0) 256 if (timr->it.real.interval.tv64 == 0)
257 return; 257 return;
258 258
259 timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), 259 timr->it_overrun += (unsigned int) hrtimer_forward(timer,
260 timr->it.real.interval); 260 timer->base->get_time(),
261 timr->it.real.interval);
261 262
262 timr->it_overrun_last = timr->it_overrun; 263 timr->it_overrun_last = timr->it_overrun;
263 timr->it_overrun = -1; 264 timr->it_overrun = -1;
@@ -386,7 +387,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
386 now = ktime_add(now, kj); 387 now = ktime_add(now, kj);
387 } 388 }
388#endif 389#endif
389 timr->it_overrun += 390 timr->it_overrun += (unsigned int)
390 hrtimer_forward(timer, now, 391 hrtimer_forward(timer, now,
391 timr->it.real.interval); 392 timr->it.real.interval);
392 ret = HRTIMER_RESTART; 393 ret = HRTIMER_RESTART;
@@ -403,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
403 struct task_struct *rtn = current->group_leader; 404 struct task_struct *rtn = current->group_leader;
404 405
405 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 406 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || 407 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
407 !same_thread_group(rtn, current) || 408 !same_thread_group(rtn, current) ||
408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 409 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
409 return NULL; 410 return NULL;
@@ -662,7 +663,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
662 */ 663 */
663 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || 664 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
664 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) 665 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
665 timr->it_overrun += hrtimer_forward(timer, now, iv); 666 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
666 667
667 remaining = ktime_sub(timer->expires, now); 668 remaining = ktime_sub(timer->expires, now);
668 /* Return 0 only, when the timer is expired and not pending */ 669 /* Return 0 only, when the timer is expired and not pending */
@@ -766,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags,
766 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 767 /* SIGEV_NONE timers are not queued ! See common_timer_get */
767 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 768 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
768 /* Setup correct expiry time for relative timers */ 769 /* Setup correct expiry time for relative timers */
769 if (mode == HRTIMER_MODE_REL) 770 if (mode == HRTIMER_MODE_REL) {
770 timer->expires = ktime_add(timer->expires, 771 timer->expires =
771 timer->base->get_time()); 772 ktime_add_safe(timer->expires,
773 timer->base->get_time());
774 }
772 return 0; 775 return 0;
773 } 776 }
774 777
@@ -981,20 +984,9 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
981static int common_nsleep(const clockid_t which_clock, int flags, 984static int common_nsleep(const clockid_t which_clock, int flags,
982 struct timespec *tsave, struct timespec __user *rmtp) 985 struct timespec *tsave, struct timespec __user *rmtp)
983{ 986{
984 struct timespec rmt; 987 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
985 int ret; 988 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
986 989 which_clock);
987 ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL,
988 flags & TIMER_ABSTIME ?
989 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
990 which_clock);
991
992 if (ret && rmtp) {
993 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
994 return -EFAULT;
995 }
996
997 return ret;
998} 990}
999 991
1000asmlinkage long 992asmlinkage long
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ef9b802738a5..79833170bb9c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -74,8 +74,8 @@ config PM_TRACE_RTC
74 RTC across reboots, so that you can debug a machine that just hangs 74 RTC across reboots, so that you can debug a machine that just hangs
75 during suspend (or more commonly, during resume). 75 during suspend (or more commonly, during resume).
76 76
77 To use this debugging feature you should attempt to suspend the machine, 77 To use this debugging feature you should attempt to suspend the
78 then reboot it, then run 78 machine, reboot it and then run
79 79
80 dmesg -s 1000000 | grep 'hash matches' 80 dmesg -s 1000000 | grep 'hash matches'
81 81
@@ -123,7 +123,10 @@ config HIBERNATION
123 called "hibernation" in user interfaces. STD checkpoints the 123 called "hibernation" in user interfaces. STD checkpoints the
124 system and powers it off; and restores that checkpoint on reboot. 124 system and powers it off; and restores that checkpoint on reboot.
125 125
126 You can suspend your machine with 'echo disk > /sys/power/state'. 126 You can suspend your machine with 'echo disk > /sys/power/state'
127 after placing resume=/dev/swappartition on the kernel command line
128 in your bootloader's configuration file.
129
127 Alternatively, you can use the additional userland tools available 130 Alternatively, you can use the additional userland tools available
128 from <http://suspend.sf.net>. 131 from <http://suspend.sf.net>.
129 132
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d09da0895174..859a8e59773a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,7 +26,7 @@
26 26
27 27
28static int noresume = 0; 28static int noresume = 0;
29char resume_file[256] = CONFIG_PM_STD_PARTITION; 29static char resume_file[256] = CONFIG_PM_STD_PARTITION;
30dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block; 31sector_t swsusp_resume_block;
32 32
@@ -185,7 +185,7 @@ static void platform_restore_cleanup(int platform_mode)
185 * reappears in this routine after a restore. 185 * reappears in this routine after a restore.
186 */ 186 */
187 187
188int create_image(int platform_mode) 188static int create_image(int platform_mode)
189{ 189{
190 int error; 190 int error;
191 191
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f6a5df934f8d..95250d7c8d91 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1203,7 +1203,7 @@ asmlinkage int swsusp_save(void)
1203 1203
1204 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1204 printk(KERN_INFO "PM: Creating hibernation image: \n");
1205 1205
1206 drain_local_pages(); 1206 drain_local_pages(NULL);
1207 nr_pages = count_data_pages(); 1207 nr_pages = count_data_pages();
1208 nr_highmem = count_highmem_pages(); 1208 nr_highmem = count_highmem_pages();
1209 printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); 1209 printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
@@ -1221,7 +1221,7 @@ asmlinkage int swsusp_save(void)
1221 /* During allocating of suspend pagedir, new cold pages may appear. 1221 /* During allocating of suspend pagedir, new cold pages may appear.
1222 * Kill them. 1222 * Kill them.
1223 */ 1223 */
1224 drain_local_pages(); 1224 drain_local_pages(NULL);
1225 copy_data_pages(&copy_bm, &orig_bm); 1225 copy_data_pages(&copy_bm, &orig_bm);
1226 1226
1227 /* 1227 /*
diff --git a/kernel/printk.c b/kernel/printk.c
index 29ae1e99cde0..bee36100f110 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,7 +32,6 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/jiffies.h>
36 35
37#include <asm/uaccess.h> 36#include <asm/uaccess.h>
38 37
@@ -93,16 +92,16 @@ static int console_locked, console_suspended;
93 */ 92 */
94static DEFINE_SPINLOCK(logbuf_lock); 93static DEFINE_SPINLOCK(logbuf_lock);
95 94
96#define LOG_BUF_MASK (log_buf_len-1) 95#define LOG_BUF_MASK (log_buf_len-1)
97#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 96#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
98 97
99/* 98/*
100 * The indices into log_buf are not constrained to log_buf_len - they 99 * The indices into log_buf are not constrained to log_buf_len - they
101 * must be masked before subscripting 100 * must be masked before subscripting
102 */ 101 */
103static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */ 102static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
104static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */ 103static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
105static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */ 104static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
106 105
107/* 106/*
108 * Array of consoles built from command line options (console=) 107 * Array of consoles built from command line options (console=)
@@ -128,17 +127,17 @@ static int console_may_schedule;
128static char __log_buf[__LOG_BUF_LEN]; 127static char __log_buf[__LOG_BUF_LEN];
129static char *log_buf = __log_buf; 128static char *log_buf = __log_buf;
130static int log_buf_len = __LOG_BUF_LEN; 129static int log_buf_len = __LOG_BUF_LEN;
131static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ 130static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
132 131
133static int __init log_buf_len_setup(char *str) 132static int __init log_buf_len_setup(char *str)
134{ 133{
135 unsigned long size = memparse(str, &str); 134 unsigned size = memparse(str, &str);
136 unsigned long flags; 135 unsigned long flags;
137 136
138 if (size) 137 if (size)
139 size = roundup_pow_of_two(size); 138 size = roundup_pow_of_two(size);
140 if (size > log_buf_len) { 139 if (size > log_buf_len) {
141 unsigned long start, dest_idx, offset; 140 unsigned start, dest_idx, offset;
142 char *new_log_buf; 141 char *new_log_buf;
143 142
144 new_log_buf = alloc_bootmem(size); 143 new_log_buf = alloc_bootmem(size);
@@ -295,7 +294,7 @@ int log_buf_read(int idx)
295 */ 294 */
296int do_syslog(int type, char __user *buf, int len) 295int do_syslog(int type, char __user *buf, int len)
297{ 296{
298 unsigned long i, j, limit, count; 297 unsigned i, j, limit, count;
299 int do_clear = 0; 298 int do_clear = 0;
300 char c; 299 char c;
301 int error = 0; 300 int error = 0;
@@ -436,7 +435,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len)
436/* 435/*
437 * Call the console drivers on a range of log_buf 436 * Call the console drivers on a range of log_buf
438 */ 437 */
439static void __call_console_drivers(unsigned long start, unsigned long end) 438static void __call_console_drivers(unsigned start, unsigned end)
440{ 439{
441 struct console *con; 440 struct console *con;
442 441
@@ -463,8 +462,8 @@ early_param("ignore_loglevel", ignore_loglevel_setup);
463/* 462/*
464 * Write out chars from start to end - 1 inclusive 463 * Write out chars from start to end - 1 inclusive
465 */ 464 */
466static void _call_console_drivers(unsigned long start, 465static void _call_console_drivers(unsigned start,
467 unsigned long end, int msg_log_level) 466 unsigned end, int msg_log_level)
468{ 467{
469 if ((msg_log_level < console_loglevel || ignore_loglevel) && 468 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
470 console_drivers && start != end) { 469 console_drivers && start != end) {
@@ -484,12 +483,12 @@ static void _call_console_drivers(unsigned long start,
484 * log_buf[start] to log_buf[end - 1]. 483 * log_buf[start] to log_buf[end - 1].
485 * The console_sem must be held. 484 * The console_sem must be held.
486 */ 485 */
487static void call_console_drivers(unsigned long start, unsigned long end) 486static void call_console_drivers(unsigned start, unsigned end)
488{ 487{
489 unsigned long cur_index, start_print; 488 unsigned cur_index, start_print;
490 static int msg_level = -1; 489 static int msg_level = -1;
491 490
492 BUG_ON(((long)(start - end)) > 0); 491 BUG_ON(((int)(start - end)) > 0);
493 492
494 cur_index = start; 493 cur_index = start;
495 start_print = start; 494 start_print = start;
@@ -567,19 +566,6 @@ static int printk_time = 0;
567#endif 566#endif
568module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 567module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
569 568
570static int __init printk_time_setup(char *str)
571{
572 if (*str)
573 return 0;
574 printk_time = 1;
575 printk(KERN_NOTICE "The 'time' option is deprecated and "
576 "is scheduled for removal in early 2008\n");
577 printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n");
578 return 1;
579}
580
581__setup("time", printk_time_setup);
582
583/* Check if we have any console registered that can be called early in boot. */ 569/* Check if we have any console registered that can be called early in boot. */
584static int have_callable_console(void) 570static int have_callable_console(void)
585{ 571{
@@ -790,7 +776,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len)
790 return -ENOSYS; 776 return -ENOSYS;
791} 777}
792 778
793static void call_console_drivers(unsigned long start, unsigned long end) 779static void call_console_drivers(unsigned start, unsigned end)
794{ 780{
795} 781}
796 782
@@ -983,8 +969,8 @@ void wake_up_klogd(void)
983void release_console_sem(void) 969void release_console_sem(void)
984{ 970{
985 unsigned long flags; 971 unsigned long flags;
986 unsigned long _con_start, _log_end; 972 unsigned _con_start, _log_end;
987 unsigned long wake_klogd = 0; 973 unsigned wake_klogd = 0;
988 974
989 if (console_suspended) { 975 if (console_suspended) {
990 up(&secondary_console_sem); 976 up(&secondary_console_sem);
@@ -1265,6 +1251,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1265 return; 1251 return;
1266} 1252}
1267 1253
1254#if defined CONFIG_PRINTK
1268/* 1255/*
1269 * printk rate limiting, lifted from the networking subsystem. 1256 * printk rate limiting, lifted from the networking subsystem.
1270 * 1257 *
@@ -1275,7 +1262,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1275int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1262int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1276{ 1263{
1277 static DEFINE_SPINLOCK(ratelimit_lock); 1264 static DEFINE_SPINLOCK(ratelimit_lock);
1278 static unsigned long toks = 10 * 5 * HZ; 1265 static unsigned toks = 10 * 5 * HZ;
1279 static unsigned long last_msg; 1266 static unsigned long last_msg;
1280 static int missed; 1267 static int missed;
1281 unsigned long flags; 1268 unsigned long flags;
@@ -1334,3 +1321,4 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1334 return false; 1321 return false;
1335} 1322}
1336EXPORT_SYMBOL(printk_timed_ratelimit); 1323EXPORT_SYMBOL(printk_timed_ratelimit);
1324#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index e64c2da11c0f..3b7a1b055122 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -20,7 +20,6 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/cpumask.h> 21#include <linux/cpumask.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/profile.h>
24#include <linux/highmem.h> 23#include <linux/highmem.h>
25#include <linux/mutex.h> 24#include <linux/mutex.h>
26#include <asm/sections.h> 25#include <asm/sections.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b0d4ab4dfd3d..fdb34e86f923 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
20#include <linux/signal.h> 20#include <linux/signal.h>
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h>
23 24
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -53,7 +54,7 @@ void ptrace_untrace(struct task_struct *child)
53 spin_lock(&child->sighand->siglock); 54 spin_lock(&child->sighand->siglock);
54 if (task_is_traced(child)) { 55 if (task_is_traced(child)) {
55 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 56 if (child->signal->flags & SIGNAL_STOP_STOPPED) {
56 child->state = TASK_STOPPED; 57 __set_task_state(child, TASK_STOPPED);
57 } else { 58 } else {
58 signal_wake_up(child, 1); 59 signal_wake_up(child, 1);
59 } 60 }
@@ -98,23 +99,23 @@ int ptrace_check_attach(struct task_struct *child, int kill)
98 * be changed by us so it's not changing right after this. 99 * be changed by us so it's not changing right after this.
99 */ 100 */
100 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
101 if ((child->ptrace & PT_PTRACED) && child->parent == current && 102 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
102 (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
103 && child->signal != NULL) {
104 ret = 0; 103 ret = 0;
104 /*
105 * child->sighand can't be NULL, release_task()
106 * does ptrace_unlink() before __exit_signal().
107 */
105 spin_lock_irq(&child->sighand->siglock); 108 spin_lock_irq(&child->sighand->siglock);
106 if (task_is_stopped(child)) { 109 if (task_is_stopped(child))
107 child->state = TASK_TRACED; 110 child->state = TASK_TRACED;
108 } else if (!task_is_traced(child) && !kill) { 111 else if (!task_is_traced(child) && !kill)
109 ret = -ESRCH; 112 ret = -ESRCH;
110 }
111 spin_unlock_irq(&child->sighand->siglock); 113 spin_unlock_irq(&child->sighand->siglock);
112 } 114 }
113 read_unlock(&tasklist_lock); 115 read_unlock(&tasklist_lock);
114 116
115 if (!ret && !kill) { 117 if (!ret && !kill)
116 wait_task_inactive(child); 118 wait_task_inactive(child);
117 }
118 119
119 /* All systems go.. */ 120 /* All systems go.. */
120 return ret; 121 return ret;
@@ -201,8 +202,7 @@ repeat:
201 goto bad; 202 goto bad;
202 203
203 /* Go */ 204 /* Go */
204 task->ptrace |= PT_PTRACED | ((task->real_parent != current) 205 task->ptrace |= PT_PTRACED;
205 ? PT_ATTACHED : 0);
206 if (capable(CAP_SYS_PTRACE)) 206 if (capable(CAP_SYS_PTRACE))
207 task->ptrace |= PT_PTRACE_CAP; 207 task->ptrace |= PT_PTRACE_CAP;
208 208
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a00..c09605f8d16c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
58 58
59/* Because of FASTCALL declaration of complete, we use this wrapper */ 59/*
60 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed.
62 */
60static void wakeme_after_rcu(struct rcu_head *head) 63static void wakeme_after_rcu(struct rcu_head *head)
61{ 64{
62 struct rcu_synchronize *rcu; 65 struct rcu_synchronize *rcu;
diff --git a/kernel/relay.c b/kernel/relay.c
index 7c0373322f18..d080b9d161a7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -37,37 +37,31 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
37} 37}
38 38
39/* 39/*
40 * nopage() vm_op implementation for relay file mapping. 40 * fault() vm_op implementation for relay file mapping.
41 */ 41 */
42static struct page *relay_buf_nopage(struct vm_area_struct *vma, 42static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
43 unsigned long address,
44 int *type)
45{ 43{
46 struct page *page; 44 struct page *page;
47 struct rchan_buf *buf = vma->vm_private_data; 45 struct rchan_buf *buf = vma->vm_private_data;
48 unsigned long offset = address - vma->vm_start; 46 pgoff_t pgoff = vmf->pgoff;
49 47
50 if (address > vma->vm_end)
51 return NOPAGE_SIGBUS; /* Disallow mremap */
52 if (!buf) 48 if (!buf)
53 return NOPAGE_OOM; 49 return VM_FAULT_OOM;
54 50
55 page = vmalloc_to_page(buf->start + offset); 51 page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
56 if (!page) 52 if (!page)
57 return NOPAGE_OOM; 53 return VM_FAULT_SIGBUS;
58 get_page(page); 54 get_page(page);
55 vmf->page = page;
59 56
60 if (type) 57 return 0;
61 *type = VM_FAULT_MINOR;
62
63 return page;
64} 58}
65 59
66/* 60/*
67 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
68 */ 62 */
69static struct vm_operations_struct relay_file_mmap_ops = { 63static struct vm_operations_struct relay_file_mmap_ops = {
70 .nopage = relay_buf_nopage, 64 .fault = relay_buf_fault,
71 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
72}; 66};
73 67
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
new file mode 100644
index 000000000000..16cbec2d5d60
--- /dev/null
+++ b/kernel/res_counter.c
@@ -0,0 +1,134 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15
16void res_counter_init(struct res_counter *counter)
17{
18 spin_lock_init(&counter->lock);
19 counter->limit = (unsigned long long)LLONG_MAX;
20}
21
22int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
23{
24 if (counter->usage + val > counter->limit) {
25 counter->failcnt++;
26 return -ENOMEM;
27 }
28
29 counter->usage += val;
30 return 0;
31}
32
33int res_counter_charge(struct res_counter *counter, unsigned long val)
34{
35 int ret;
36 unsigned long flags;
37
38 spin_lock_irqsave(&counter->lock, flags);
39 ret = res_counter_charge_locked(counter, val);
40 spin_unlock_irqrestore(&counter->lock, flags);
41 return ret;
42}
43
44void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
45{
46 if (WARN_ON(counter->usage < val))
47 val = counter->usage;
48
49 counter->usage -= val;
50}
51
52void res_counter_uncharge(struct res_counter *counter, unsigned long val)
53{
54 unsigned long flags;
55
56 spin_lock_irqsave(&counter->lock, flags);
57 res_counter_uncharge_locked(counter, val);
58 spin_unlock_irqrestore(&counter->lock, flags);
59}
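
The charge/uncharge pair above is the whole accounting interface a controller needs. A minimal sketch of a controller using it; the wrapper structure, function names and the 4096-byte charge size are illustrative, not part of this patch.

#include <linux/res_counter.h>

/* Illustrative controller-side usage of the res_counter API defined above. */
struct mycg_state {
	struct res_counter res;
};

static void mycg_init(struct mycg_state *st)
{
	res_counter_init(&st->res);	/* limit starts at LLONG_MAX (unlimited) */
}

static int mycg_account_page(struct mycg_state *st)
{
	/* Fails with -ENOMEM and bumps failcnt once usage would exceed limit. */
	return res_counter_charge(&st->res, 4096);
}

static void mycg_unaccount_page(struct mycg_state *st)
{
	res_counter_uncharge(&st->res, 4096);
}
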
60
61
62static inline unsigned long long *
63res_counter_member(struct res_counter *counter, int member)
64{
65 switch (member) {
66 case RES_USAGE:
67 return &counter->usage;
68 case RES_LIMIT:
69 return &counter->limit;
70 case RES_FAILCNT:
71 return &counter->failcnt;
72 };
73
74 BUG();
75 return NULL;
76}
77
78ssize_t res_counter_read(struct res_counter *counter, int member,
79 const char __user *userbuf, size_t nbytes, loff_t *pos,
80 int (*read_strategy)(unsigned long long val, char *st_buf))
81{
82 unsigned long long *val;
83 char buf[64], *s;
84
85 s = buf;
86 val = res_counter_member(counter, member);
87 if (read_strategy)
88 s += read_strategy(*val, s);
89 else
90 s += sprintf(s, "%llu\n", *val);
91 return simple_read_from_buffer((void __user *)userbuf, nbytes,
92 pos, buf, s - buf);
93}
94
95ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val))
98{
99 int ret;
100 char *buf, *end;
101 unsigned long flags;
102 unsigned long long tmp, *val;
103
104 buf = kmalloc(nbytes + 1, GFP_KERNEL);
105 ret = -ENOMEM;
106 if (buf == NULL)
107 goto out;
108
109 buf[nbytes] = '\0';
110 ret = -EFAULT;
111 if (copy_from_user(buf, userbuf, nbytes))
112 goto out_free;
113
114 ret = -EINVAL;
115
116 if (write_strategy) {
117 if (write_strategy(buf, &tmp)) {
118 goto out_free;
119 }
120 } else {
121 tmp = simple_strtoull(buf, &end, 10);
122 if (*end != '\0')
123 goto out_free;
124 }
125 spin_lock_irqsave(&counter->lock, flags);
126 val = res_counter_member(counter, member);
127 *val = tmp;
128 spin_unlock_irqrestore(&counter->lock, flags);
129 ret = nbytes;
130out_free:
131 kfree(buf);
132out:
133 return ret;
134}
diff --git a/kernel/resource.c b/kernel/resource.c
index 2eb553d9b517..82aea814d409 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -228,7 +228,7 @@ int release_resource(struct resource *old)
228 228
229EXPORT_SYMBOL(release_resource); 229EXPORT_SYMBOL(release_resource);
230 230
231#ifdef CONFIG_MEMORY_HOTPLUG 231#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
232/* 232/*
233 * Finds the lowest memory resource that exists within [res->start..res->end) 233 * Finds the lowest memory resource that exists within [res->start..res->end)
234 * the caller must specify res->start, res->end, res->flags. 234 * the caller must specify res->start, res->end, res->flags.
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 56d73cb8826d..5fcb4fe645e2 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -130,7 +130,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
130 130
131 task = rt_mutex_owner(act_waiter->lock); 131 task = rt_mutex_owner(act_waiter->lock);
132 if (task && task != current) { 132 if (task && task != current) {
133 act_waiter->deadlock_task_pid = task->pid; 133 act_waiter->deadlock_task_pid = get_pid(task_pid(task));
134 act_waiter->deadlock_lock = lock; 134 act_waiter->deadlock_lock = lock;
135 } 135 }
136} 136}
@@ -142,9 +142,12 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
142 if (!waiter->deadlock_lock || !rt_trace_on) 142 if (!waiter->deadlock_lock || !rt_trace_on)
143 return; 143 return;
144 144
145 task = find_task_by_pid(waiter->deadlock_task_pid); 145 rcu_read_lock();
146 if (!task) 146 task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
147 if (!task) {
148 rcu_read_unlock();
147 return; 149 return;
150 }
148 151
149 TRACE_OFF_NOLOCK(); 152 TRACE_OFF_NOLOCK();
150 153
@@ -173,6 +176,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
173 current->comm, task_pid_nr(current)); 176 current->comm, task_pid_nr(current));
174 dump_stack(); 177 dump_stack();
175 debug_show_all_locks(); 178 debug_show_all_locks();
179 rcu_read_unlock();
176 180
177 printk("[ turning off deadlock detection." 181 printk("[ turning off deadlock detection."
178 "Please report this trace. ]\n\n"); 182 "Please report this trace. ]\n\n");
@@ -203,10 +207,12 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
203 memset(waiter, 0x11, sizeof(*waiter)); 207 memset(waiter, 0x11, sizeof(*waiter));
204 plist_node_init(&waiter->list_entry, MAX_PRIO); 208 plist_node_init(&waiter->list_entry, MAX_PRIO);
205 plist_node_init(&waiter->pi_list_entry, MAX_PRIO); 209 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
210 waiter->deadlock_task_pid = NULL;
206} 211}
207 212
208void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
209{ 214{
215 put_pid(waiter->deadlock_task_pid);
210 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
211 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
212 TRACE_WARN_ON(waiter->task); 218 TRACE_WARN_ON(waiter->task);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d2..6522ae5b14a2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
630 set_current_state(state); 630 set_current_state(state);
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start(&timeout->timer, timeout->timer.expires,
635 HRTIMER_MODE_ABS); 635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL;
638 }
636 639
637 for (;;) { 640 for (;;) {
638 /* Try to acquire the lock: */ 641 /* Try to acquire the lock: */
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 2d3b83593ca3..e124bf5800ea 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -51,7 +51,7 @@ struct rt_mutex_waiter {
51 struct rt_mutex *lock; 51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES 52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip; 53 unsigned long ip;
54 pid_t deadlock_task_pid; 54 struct pid *deadlock_task_pid;
55 struct rt_mutex *deadlock_lock; 55 struct rt_mutex *deadlock_lock;
56#endif 56#endif
57}; 57};
diff --git a/kernel/sched.c b/kernel/sched.c
index 9474b23c28bf..f28f19e65b59 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 155 struct list_head queue[MAX_RT_PRIO];
156}; 156};
157 157
158#ifdef CONFIG_FAIR_GROUP_SCHED 158#ifdef CONFIG_GROUP_SCHED
159 159
160#include <linux/cgroup.h> 160#include <linux/cgroup.h>
161 161
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
165 165
166/* task group related information */ 166/* task group related information */
167struct task_group { 167struct task_group {
168#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_CGROUP_SCHED
169 struct cgroup_subsys_state css; 169 struct cgroup_subsys_state css;
170#endif 170#endif
171
172#ifdef CONFIG_FAIR_GROUP_SCHED
171 /* schedulable entities of this group on each cpu */ 173 /* schedulable entities of this group on each cpu */
172 struct sched_entity **se; 174 struct sched_entity **se;
173 /* runqueue "owned" by this group on each cpu */ 175 /* runqueue "owned" by this group on each cpu */
174 struct cfs_rq **cfs_rq; 176 struct cfs_rq **cfs_rq;
175 177
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /* 178 /*
182 * shares assigned to a task group governs how much of cpu bandwidth 179 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is 180 * is allocated to the group. The more shares a group has, the more is
@@ -213,33 +210,46 @@ struct task_group {
213 * 210 *
214 */ 211 */
215 unsigned long shares; 212 unsigned long shares;
213#endif
214
215#ifdef CONFIG_RT_GROUP_SCHED
216 struct sched_rt_entity **rt_se;
217 struct rt_rq **rt_rq;
218
219 u64 rt_runtime;
220#endif
216 221
217 struct rcu_head rcu; 222 struct rcu_head rcu;
218 struct list_head list; 223 struct list_head list;
219}; 224};
220 225
226#ifdef CONFIG_FAIR_GROUP_SCHED
221/* Default task group's sched entity on each cpu */ 227/* Default task group's sched entity on each cpu */
222static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 228static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
223/* Default task group's cfs_rq on each cpu */ 229/* Default task group's cfs_rq on each cpu */
224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 230static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
225 231
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
229static struct sched_entity *init_sched_entity_p[NR_CPUS]; 232static struct sched_entity *init_sched_entity_p[NR_CPUS];
230static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 233static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
238static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
231 239
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 240static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS]; 241static struct rt_rq *init_rt_rq_p[NR_CPUS];
242#endif
234 243
235/* task_group_mutex serializes add/remove of task groups and also changes to 244/* task_group_lock serializes add/remove of task groups and also changes to
236 * a task group's cpu shares. 245 * a task group's cpu shares.
237 */ 246 */
238static DEFINE_MUTEX(task_group_mutex); 247static DEFINE_SPINLOCK(task_group_lock);
239 248
240/* doms_cur_mutex serializes access to doms_cur[] array */ 249/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex); 250static DEFINE_MUTEX(doms_cur_mutex);
242 251
252#ifdef CONFIG_FAIR_GROUP_SCHED
243#ifdef CONFIG_SMP 253#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */ 254/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task; 255static struct task_struct *lb_monitor_task;
@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
248 258
249static void set_se_shares(struct sched_entity *se, unsigned long shares); 259static void set_se_shares(struct sched_entity *se, unsigned long shares);
250 260
261#ifdef CONFIG_USER_SCHED
262# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263#else
264# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265#endif
266
267#define MIN_GROUP_SHARES 2
268
269static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270#endif
271
251/* Default task group. 272/* Default task group.
252 * Every task in system belong to this group at bootup. 273 * Every task in system belong to this group at bootup.
253 */ 274 */
254struct task_group init_task_group = { 275struct task_group init_task_group = {
276#ifdef CONFIG_FAIR_GROUP_SCHED
255 .se = init_sched_entity_p, 277 .se = init_sched_entity_p,
256 .cfs_rq = init_cfs_rq_p, 278 .cfs_rq = init_cfs_rq_p,
279#endif
257 280
281#ifdef CONFIG_RT_GROUP_SCHED
258 .rt_se = init_sched_rt_entity_p, 282 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p, 283 .rt_rq = init_rt_rq_p,
260};
261
262#ifdef CONFIG_FAIR_USER_SCHED
263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
264#else
265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
266#endif 284#endif
267 285};
268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
271 286
272/* return group to which a task belongs */ 287/* return group to which a task belongs */
273static inline struct task_group *task_group(struct task_struct *p) 288static inline struct task_group *task_group(struct task_struct *p)
274{ 289{
275 struct task_group *tg; 290 struct task_group *tg;
276 291
277#ifdef CONFIG_FAIR_USER_SCHED 292#ifdef CONFIG_USER_SCHED
278 tg = p->user->tg; 293 tg = p->user->tg;
279#elif defined(CONFIG_FAIR_CGROUP_SCHED) 294#elif defined(CONFIG_CGROUP_SCHED)
280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 295 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
281 struct task_group, css); 296 struct task_group, css);
282#else 297#else
@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 303/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
289static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 304static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
290{ 305{
306#ifdef CONFIG_FAIR_GROUP_SCHED
291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 307 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
292 p->se.parent = task_group(p)->se[cpu]; 308 p->se.parent = task_group(p)->se[cpu];
309#endif
293 310
311#ifdef CONFIG_RT_GROUP_SCHED
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 312 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu]; 313 p->rt.parent = task_group(p)->rt_se[cpu];
296} 314#endif
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306} 315}
307 316
308static inline void lock_doms_cur(void) 317static inline void lock_doms_cur(void)
@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void)
318#else 327#else
319 328
320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 329static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { } 330static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { } 331static inline void unlock_doms_cur(void) { }
325 332
326#endif /* CONFIG_FAIR_GROUP_SCHED */ 333#endif /* CONFIG_GROUP_SCHED */
327 334
328/* CFS-related fields in a runqueue */ 335/* CFS-related fields in a runqueue */
329struct cfs_rq { 336struct cfs_rq {
@@ -363,7 +370,7 @@ struct cfs_rq {
363struct rt_rq { 370struct rt_rq {
364 struct rt_prio_array active; 371 struct rt_prio_array active;
365 unsigned long rt_nr_running; 372 unsigned long rt_nr_running;
366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 373#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */ 374 int highest_prio; /* highest queued rt task prio */
368#endif 375#endif
369#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
@@ -373,7 +380,9 @@ struct rt_rq {
373 int rt_throttled; 380 int rt_throttled;
374 u64 rt_time; 381 u64 rt_time;
375 382
376#ifdef CONFIG_FAIR_GROUP_SCHED 383#ifdef CONFIG_RT_GROUP_SCHED
384 unsigned long rt_nr_boosted;
385
377 struct rq *rq; 386 struct rq *rq;
378 struct list_head leaf_rt_rq_list; 387 struct list_head leaf_rt_rq_list;
379 struct task_group *tg; 388 struct task_group *tg;
@@ -447,6 +456,8 @@ struct rq {
447#ifdef CONFIG_FAIR_GROUP_SCHED 456#ifdef CONFIG_FAIR_GROUP_SCHED
448 /* list of leaf cfs_rq on this cpu: */ 457 /* list of leaf cfs_rq on this cpu: */
449 struct list_head leaf_cfs_rq_list; 458 struct list_head leaf_cfs_rq_list;
459#endif
460#ifdef CONFIG_RT_GROUP_SCHED
450 struct list_head leaf_rt_rq_list; 461 struct list_head leaf_rt_rq_list;
451#endif 462#endif
452 463
@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features =
652const_debug unsigned int sysctl_sched_nr_migrate = 32; 663const_debug unsigned int sysctl_sched_nr_migrate = 32;
653 664
654/* 665/*
655 * period over which we measure -rt task cpu usage in ms. 666 * period over which we measure -rt task cpu usage in us.
656 * default: 1s 667 * default: 1s
657 */ 668 */
658const_debug unsigned int sysctl_sched_rt_period = 1000; 669unsigned int sysctl_sched_rt_period = 1000000;
659 670
660#define SCHED_RT_FRAC_SHIFT 16 671/*
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) 672 * part of the period that we allow rt tasks to run in us.
673 * default: 0.95s
674 */
675int sysctl_sched_rt_runtime = 950000;
662 676
663/* 677/*
664 * ratio of time -rt tasks may consume. 678 * single value that denotes runtime == period, ie unlimited time.
665 * default: 95%
666 */ 679 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259; 680#define RUNTIME_INF ((u64)~0ULL)
668 681
669/* 682/*
670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 683 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1893,13 +1906,13 @@ out:
1893 return success; 1906 return success;
1894} 1907}
1895 1908
1896int fastcall wake_up_process(struct task_struct *p) 1909int wake_up_process(struct task_struct *p)
1897{ 1910{
1898 return try_to_wake_up(p, TASK_ALL, 0); 1911 return try_to_wake_up(p, TASK_ALL, 0);
1899} 1912}
1900EXPORT_SYMBOL(wake_up_process); 1913EXPORT_SYMBOL(wake_up_process);
1901 1914
1902int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1915int wake_up_state(struct task_struct *p, unsigned int state)
1903{ 1916{
1904 return try_to_wake_up(p, state, 0); 1917 return try_to_wake_up(p, state, 0);
1905} 1918}
@@ -1986,7 +1999,7 @@ void sched_fork(struct task_struct *p, int clone_flags)
1986 * that must be done for every newly created context, then puts the task 1999 * that must be done for every newly created context, then puts the task
1987 * on the runqueue and wakes it. 2000 * on the runqueue and wakes it.
1988 */ 2001 */
1989void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2002void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1990{ 2003{
1991 unsigned long flags; 2004 unsigned long flags;
1992 struct rq *rq; 2005 struct rq *rq;
@@ -3753,7 +3766,7 @@ void scheduler_tick(void)
3753 3766
3754#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3767#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3755 3768
3756void fastcall add_preempt_count(int val) 3769void add_preempt_count(int val)
3757{ 3770{
3758 /* 3771 /*
3759 * Underflow? 3772 * Underflow?
@@ -3769,7 +3782,7 @@ void fastcall add_preempt_count(int val)
3769} 3782}
3770EXPORT_SYMBOL(add_preempt_count); 3783EXPORT_SYMBOL(add_preempt_count);
3771 3784
3772void fastcall sub_preempt_count(int val) 3785void sub_preempt_count(int val)
3773{ 3786{
3774 /* 3787 /*
3775 * Underflow? 3788 * Underflow?
@@ -4067,7 +4080,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4067 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4080 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4068 * @key: is directly passed to the wakeup function 4081 * @key: is directly passed to the wakeup function
4069 */ 4082 */
4070void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 4083void __wake_up(wait_queue_head_t *q, unsigned int mode,
4071 int nr_exclusive, void *key) 4084 int nr_exclusive, void *key)
4072{ 4085{
4073 unsigned long flags; 4086 unsigned long flags;
@@ -4081,7 +4094,7 @@ EXPORT_SYMBOL(__wake_up);
4081/* 4094/*
4082 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4095 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4083 */ 4096 */
4084void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4097void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4085{ 4098{
4086 __wake_up_common(q, mode, 1, 0, NULL); 4099 __wake_up_common(q, mode, 1, 0, NULL);
4087} 4100}
@@ -4099,7 +4112,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4099 * 4112 *
4100 * On UP it can prevent extra preemption. 4113 * On UP it can prevent extra preemption.
4101 */ 4114 */
4102void fastcall 4115void
4103__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4116__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4104{ 4117{
4105 unsigned long flags; 4118 unsigned long flags;
@@ -4571,6 +4584,15 @@ recheck:
4571 return -EPERM; 4584 return -EPERM;
4572 } 4585 }
4573 4586
4587#ifdef CONFIG_RT_GROUP_SCHED
4588 /*
4589 * Do not allow realtime tasks into groups that have no runtime
4590 * assigned.
4591 */
4592 if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
4593 return -EPERM;
4594#endif
4595
4574 retval = security_task_setscheduler(p, policy, param); 4596 retval = security_task_setscheduler(p, policy, param);
4575 if (retval) 4597 if (retval)
4576 return retval; 4598 return retval;
@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7112 /* delimiter for bitsearch: */ 7134 /* delimiter for bitsearch: */
7113 __set_bit(MAX_RT_PRIO, array->bitmap); 7135 __set_bit(MAX_RT_PRIO, array->bitmap);
7114 7136
7115#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 7137#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7116 rt_rq->highest_prio = MAX_RT_PRIO; 7138 rt_rq->highest_prio = MAX_RT_PRIO;
7117#endif 7139#endif
7118#ifdef CONFIG_SMP 7140#ifdef CONFIG_SMP
@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7123 rt_rq->rt_time = 0; 7145 rt_rq->rt_time = 0;
7124 rt_rq->rt_throttled = 0; 7146 rt_rq->rt_throttled = 0;
7125 7147
7126#ifdef CONFIG_FAIR_GROUP_SCHED 7148#ifdef CONFIG_RT_GROUP_SCHED
7149 rt_rq->rt_nr_boosted = 0;
7127 rt_rq->rq = rq; 7150 rt_rq->rq = rq;
7128#endif 7151#endif
7129} 7152}
@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7146 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7169 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7147 se->parent = NULL; 7170 se->parent = NULL;
7148} 7171}
7172#endif
7149 7173
7174#ifdef CONFIG_RT_GROUP_SCHED
7150static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 7175static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7151 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 7176 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7152 int cpu, int add) 7177 int cpu, int add)
@@ -7175,7 +7200,7 @@ void __init sched_init(void)
7175 init_defrootdomain(); 7200 init_defrootdomain();
7176#endif 7201#endif
7177 7202
7178#ifdef CONFIG_FAIR_GROUP_SCHED 7203#ifdef CONFIG_GROUP_SCHED
7179 list_add(&init_task_group.list, &task_groups); 7204 list_add(&init_task_group.list, &task_groups);
7180#endif 7205#endif
7181 7206
@@ -7196,7 +7221,10 @@ void __init sched_init(void)
7196 &per_cpu(init_cfs_rq, i), 7221 &per_cpu(init_cfs_rq, i),
7197 &per_cpu(init_sched_entity, i), i, 1); 7222 &per_cpu(init_sched_entity, i), i, 1);
7198 7223
7199 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ 7224#endif
7225#ifdef CONFIG_RT_GROUP_SCHED
7226 init_task_group.rt_runtime =
7227 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7200 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7228 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7201 init_tg_rt_entry(rq, &init_task_group, 7229 init_tg_rt_entry(rq, &init_task_group,
7202 &per_cpu(init_rt_rq, i), 7230 &per_cpu(init_rt_rq, i),
@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void)
7303 unsigned long flags; 7331 unsigned long flags;
7304 struct rq *rq; 7332 struct rq *rq;
7305 7333
7306 read_lock_irq(&tasklist_lock); 7334 read_lock_irqsave(&tasklist_lock, flags);
7307 do_each_thread(g, p) { 7335 do_each_thread(g, p) {
7308 /* 7336 /*
7309 * Only normalize user tasks: 7337 * Only normalize user tasks:
@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void)
7329 continue; 7357 continue;
7330 } 7358 }
7331 7359
7332 spin_lock_irqsave(&p->pi_lock, flags); 7360 spin_lock(&p->pi_lock);
7333 rq = __task_rq_lock(p); 7361 rq = __task_rq_lock(p);
7334 7362
7335 normalize_task(rq, p); 7363 normalize_task(rq, p);
7336 7364
7337 __task_rq_unlock(rq); 7365 __task_rq_unlock(rq);
7338 spin_unlock_irqrestore(&p->pi_lock, flags); 7366 spin_unlock(&p->pi_lock);
7339 } while_each_thread(g, p); 7367 } while_each_thread(g, p);
7340 7368
7341 read_unlock_irq(&tasklist_lock); 7369 read_unlock_irqrestore(&tasklist_lock, flags);
7342} 7370}
7343 7371
7344#endif /* CONFIG_MAGIC_SYSRQ */ 7372#endif /* CONFIG_MAGIC_SYSRQ */
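The hunk above moves interrupt disabling in normalize_rt_tasks() from the inner per-task pi_lock out to the tasklist_lock: read_lock_irqsave() saves the flags once for the whole walk, so the inner lock can become a plain spin_lock(). The little program below is only an illustration of that nesting; every "lock" in it is a user-space stub invented for the example, not kernel code.

/* Illustration only: fake locks model the new nesting -- interrupts are
 * "disabled" once by the outer read_lock_irqsave(), so the inner per-task
 * lock needs no flag saving of its own. */
#include <stdio.h>

static void read_lock_irqsave(unsigned long *flags)
{
	*flags = 1;	/* pretend we saved the IRQ state here */
	puts("tasklist_lock taken, interrupts disabled once");
}

static void read_unlock_irqrestore(unsigned long flags)
{
	(void)flags;
	puts("tasklist_lock released, interrupts restored");
}

static void spin_lock(const char *name)   { printf("  %s taken (plain spin_lock)\n", name); }
static void spin_unlock(const char *name) { printf("  %s released\n", name); }

int main(void)
{
	unsigned long flags;
	int task;

	read_lock_irqsave(&flags);		/* was read_lock_irq() */
	for (task = 0; task < 3; task++) {
		spin_lock("p->pi_lock");	/* was spin_lock_irqsave() */
		printf("  normalize task %d\n", task);
		spin_unlock("p->pi_lock");	/* was spin_unlock_irqrestore() */
	}
	read_unlock_irqrestore(flags);
	return 0;
}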
@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p)
7387 7415
7388#endif 7416#endif
7389 7417
7390#ifdef CONFIG_FAIR_GROUP_SCHED 7418#ifdef CONFIG_GROUP_SCHED
7391 7419
7392#ifdef CONFIG_SMP 7420#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7393/* 7421/*
7394 * distribute shares of all task groups among their schedulable entities, 7422 * distribute shares of all task groups among their schedulable entities,
7395 * to reflect load distribution across cpus. 7423 * to reflect load distribution across cpus.
@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused)
7540} 7568}
7541#endif /* CONFIG_SMP */ 7569#endif /* CONFIG_SMP */
7542 7570
7543static void free_sched_group(struct task_group *tg) 7571#ifdef CONFIG_FAIR_GROUP_SCHED
7572static void free_fair_sched_group(struct task_group *tg)
7544{ 7573{
7545 int i; 7574 int i;
7546 7575
@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg)
7549 kfree(tg->cfs_rq[i]); 7578 kfree(tg->cfs_rq[i]);
7550 if (tg->se) 7579 if (tg->se)
7551 kfree(tg->se[i]); 7580 kfree(tg->se[i]);
7552 if (tg->rt_rq)
7553 kfree(tg->rt_rq[i]);
7554 if (tg->rt_se)
7555 kfree(tg->rt_se[i]);
7556 } 7581 }
7557 7582
7558 kfree(tg->cfs_rq); 7583 kfree(tg->cfs_rq);
7559 kfree(tg->se); 7584 kfree(tg->se);
7560 kfree(tg->rt_rq);
7561 kfree(tg->rt_se);
7562 kfree(tg);
7563} 7585}
7564 7586
7565/* allocate runqueue etc for a new task group */ 7587static int alloc_fair_sched_group(struct task_group *tg)
7566struct task_group *sched_create_group(void)
7567{ 7588{
7568 struct task_group *tg;
7569 struct cfs_rq *cfs_rq; 7589 struct cfs_rq *cfs_rq;
7570 struct sched_entity *se; 7590 struct sched_entity *se;
7571 struct rt_rq *rt_rq;
7572 struct sched_rt_entity *rt_se;
7573 struct rq *rq; 7591 struct rq *rq;
7574 int i; 7592 int i;
7575 7593
7576 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7577 if (!tg)
7578 return ERR_PTR(-ENOMEM);
7579
7580 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 7594 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
7581 if (!tg->cfs_rq) 7595 if (!tg->cfs_rq)
7582 goto err; 7596 goto err;
7583 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7597 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7584 if (!tg->se) 7598 if (!tg->se)
7585 goto err; 7599 goto err;
7586 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7587 if (!tg->rt_rq)
7588 goto err;
7589 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7590 if (!tg->rt_se)
7591 goto err;
7592 7600
7593 tg->shares = NICE_0_LOAD; 7601 tg->shares = NICE_0_LOAD;
7594 tg->rt_ratio = 0; /* XXX */
7595 7602
7596 for_each_possible_cpu(i) { 7603 for_each_possible_cpu(i) {
7597 rq = cpu_rq(i); 7604 rq = cpu_rq(i);
@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void)
7606 if (!se) 7613 if (!se)
7607 goto err; 7614 goto err;
7608 7615
7616 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7617 }
7618
7619 return 1;
7620
7621 err:
7622 return 0;
7623}
7624
7625static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7626{
7627 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7628 &cpu_rq(cpu)->leaf_cfs_rq_list);
7629}
7630
7631static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7632{
7633 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7634}
7635#else
7636static inline void free_fair_sched_group(struct task_group *tg)
7637{
7638}
7639
7640static inline int alloc_fair_sched_group(struct task_group *tg)
7641{
7642 return 1;
7643}
7644
7645static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7646{
7647}
7648
7649static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7650{
7651}
7652#endif
7653
7654#ifdef CONFIG_RT_GROUP_SCHED
7655static void free_rt_sched_group(struct task_group *tg)
7656{
7657 int i;
7658
7659 for_each_possible_cpu(i) {
7660 if (tg->rt_rq)
7661 kfree(tg->rt_rq[i]);
7662 if (tg->rt_se)
7663 kfree(tg->rt_se[i]);
7664 }
7665
7666 kfree(tg->rt_rq);
7667 kfree(tg->rt_se);
7668}
7669
7670static int alloc_rt_sched_group(struct task_group *tg)
7671{
7672 struct rt_rq *rt_rq;
7673 struct sched_rt_entity *rt_se;
7674 struct rq *rq;
7675 int i;
7676
7677 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7678 if (!tg->rt_rq)
7679 goto err;
7680 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7681 if (!tg->rt_se)
7682 goto err;
7683
7684 tg->rt_runtime = 0;
7685
7686 for_each_possible_cpu(i) {
7687 rq = cpu_rq(i);
7688
7609 rt_rq = kmalloc_node(sizeof(struct rt_rq), 7689 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7610 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7690 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7611 if (!rt_rq) 7691 if (!rt_rq)
@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void)
7616 if (!rt_se) 7696 if (!rt_se)
7617 goto err; 7697 goto err;
7618 7698
7619 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7620 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 7699 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7621 } 7700 }
7622 7701
7623 lock_task_group_list(); 7702 return 1;
7703
7704 err:
7705 return 0;
7706}
7707
7708static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7709{
7710 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
7711 &cpu_rq(cpu)->leaf_rt_rq_list);
7712}
7713
7714static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7715{
7716 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
7717}
7718#else
7719static inline void free_rt_sched_group(struct task_group *tg)
7720{
7721}
7722
7723static inline int alloc_rt_sched_group(struct task_group *tg)
7724{
7725 return 1;
7726}
7727
7728static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7729{
7730}
7731
7732static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7733{
7734}
7735#endif
7736
7737static void free_sched_group(struct task_group *tg)
7738{
7739 free_fair_sched_group(tg);
7740 free_rt_sched_group(tg);
7741 kfree(tg);
7742}
7743
7744/* allocate runqueue etc for a new task group */
7745struct task_group *sched_create_group(void)
7746{
7747 struct task_group *tg;
7748 unsigned long flags;
7749 int i;
7750
7751 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7752 if (!tg)
7753 return ERR_PTR(-ENOMEM);
7754
7755 if (!alloc_fair_sched_group(tg))
7756 goto err;
7757
7758 if (!alloc_rt_sched_group(tg))
7759 goto err;
7760
7761 spin_lock_irqsave(&task_group_lock, flags);
7624 for_each_possible_cpu(i) { 7762 for_each_possible_cpu(i) {
7625 rq = cpu_rq(i); 7763 register_fair_sched_group(tg, i);
7626 cfs_rq = tg->cfs_rq[i]; 7764 register_rt_sched_group(tg, i);
7627 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7628 rt_rq = tg->rt_rq[i];
7629 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7630 } 7765 }
7631 list_add_rcu(&tg->list, &task_groups); 7766 list_add_rcu(&tg->list, &task_groups);
7632 unlock_task_group_list(); 7767 spin_unlock_irqrestore(&task_group_lock, flags);
7633 7768
7634 return tg; 7769 return tg;
7635 7770
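sched_create_group() is split above into class-specific helpers: all allocation happens with no lock held, and only the cheap list insertions run under the new task_group_lock spinlock. A minimal user-space sketch of that allocate-then-publish shape, with invented names (struct group, group_lock, create_group), might look like this:

/* Sketch of the allocate-first, publish-under-lock pattern used by the
 * reworked sched_create_group(). All names here are made up. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	int id;
	struct group *next;
};

static struct group *group_list;	/* protected by group_lock */
static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static struct group *create_group(int id)
{
	/* Step 1: allocate and initialise with no lock held (may block). */
	struct group *g = calloc(1, sizeof(*g));
	if (!g)
		return NULL;
	g->id = id;

	/* Step 2: publish under the lock -- only cheap pointer updates here. */
	pthread_mutex_lock(&group_lock);
	g->next = group_list;
	group_list = g;
	pthread_mutex_unlock(&group_lock);

	return g;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		create_group(i);
	for (struct group *g = group_list; g; g = g->next)
		printf("group %d\n", g->id);
	return 0;
}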
@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7648/* Destroy runqueue etc associated with a task group */ 7783/* Destroy runqueue etc associated with a task group */
7649void sched_destroy_group(struct task_group *tg) 7784void sched_destroy_group(struct task_group *tg)
7650{ 7785{
7651 struct cfs_rq *cfs_rq = NULL; 7786 unsigned long flags;
7652 struct rt_rq *rt_rq = NULL;
7653 int i; 7787 int i;
7654 7788
7655 lock_task_group_list(); 7789 spin_lock_irqsave(&task_group_lock, flags);
7656 for_each_possible_cpu(i) { 7790 for_each_possible_cpu(i) {
7657 cfs_rq = tg->cfs_rq[i]; 7791 unregister_fair_sched_group(tg, i);
7658 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7792 unregister_rt_sched_group(tg, i);
7659 rt_rq = tg->rt_rq[i];
7660 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7661 } 7793 }
7662 list_del_rcu(&tg->list); 7794 list_del_rcu(&tg->list);
7663 unlock_task_group_list(); 7795 spin_unlock_irqrestore(&task_group_lock, flags);
7664
7665 BUG_ON(!cfs_rq);
7666 7796
7667 /* wait for possible concurrent references to cfs_rqs complete */ 7797 /* wait for possible concurrent references to cfs_rqs complete */
7668 call_rcu(&tg->rcu, free_sched_group_rcu); 7798 call_rcu(&tg->rcu, free_sched_group_rcu);
@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk)
7703 task_rq_unlock(rq, &flags); 7833 task_rq_unlock(rq, &flags);
7704} 7834}
7705 7835
7836#ifdef CONFIG_FAIR_GROUP_SCHED
7706/* rq->lock to be locked by caller */ 7837/* rq->lock to be locked by caller */
7707static void set_se_shares(struct sched_entity *se, unsigned long shares) 7838static void set_se_shares(struct sched_entity *se, unsigned long shares)
7708{ 7839{
@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
7728 } 7859 }
7729} 7860}
7730 7861
7862static DEFINE_MUTEX(shares_mutex);
7863
7731int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7864int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7732{ 7865{
7733 int i; 7866 int i;
7734 struct cfs_rq *cfs_rq; 7867 unsigned long flags;
7735 struct rq *rq;
7736 7868
7737 lock_task_group_list(); 7869 mutex_lock(&shares_mutex);
7738 if (tg->shares == shares) 7870 if (tg->shares == shares)
7739 goto done; 7871 goto done;
7740 7872
@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7746 * load_balance_fair) from referring to this group first, 7878 * load_balance_fair) from referring to this group first,
7747 * by taking it off the rq->leaf_cfs_rq_list on each cpu. 7879 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7748 */ 7880 */
7749 for_each_possible_cpu(i) { 7881 spin_lock_irqsave(&task_group_lock, flags);
7750 cfs_rq = tg->cfs_rq[i]; 7882 for_each_possible_cpu(i)
7751 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7883 unregister_fair_sched_group(tg, i);
7752 } 7884 spin_unlock_irqrestore(&task_group_lock, flags);
7753 7885
7754 /* wait for any ongoing reference to this group to finish */ 7886 /* wait for any ongoing reference to this group to finish */
7755 synchronize_sched(); 7887 synchronize_sched();
@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7769 * Enable load balance activity on this group, by inserting it back on 7901 * Enable load balance activity on this group, by inserting it back on
7770 * each cpu's rq->leaf_cfs_rq_list. 7902 * each cpu's rq->leaf_cfs_rq_list.
7771 */ 7903 */
7772 for_each_possible_cpu(i) { 7904 spin_lock_irqsave(&task_group_lock, flags);
7773 rq = cpu_rq(i); 7905 for_each_possible_cpu(i)
7774 cfs_rq = tg->cfs_rq[i]; 7906 register_fair_sched_group(tg, i);
7775 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7907 spin_unlock_irqrestore(&task_group_lock, flags);
7776 }
7777done: 7908done:
7778 unlock_task_group_list(); 7909 mutex_unlock(&shares_mutex);
7779 return 0; 7910 return 0;
7780} 7911}
7781 7912
@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg)
7783{ 7914{
7784 return tg->shares; 7915 return tg->shares;
7785} 7916}
7917#endif
7786 7918
7919#ifdef CONFIG_RT_GROUP_SCHED
7787/* 7920/*
7788 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio 7921 * Ensure that the real time constraints are schedulable.
7789 */ 7922 */
7790int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) 7923static DEFINE_MUTEX(rt_constraints_mutex);
7924
7925static unsigned long to_ratio(u64 period, u64 runtime)
7926{
7927 if (runtime == RUNTIME_INF)
7928 return 1ULL << 16;
7929
7930 runtime *= (1ULL << 16);
7931 div64_64(runtime, period);
7932 return runtime;
7933}
7934
7935static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7791{ 7936{
7792 struct task_group *tgi; 7937 struct task_group *tgi;
7793 unsigned long total = 0; 7938 unsigned long total = 0;
7939 unsigned long global_ratio =
7940 to_ratio(sysctl_sched_rt_period,
7941 sysctl_sched_rt_runtime < 0 ?
7942 RUNTIME_INF : sysctl_sched_rt_runtime);
7794 7943
7795 rcu_read_lock(); 7944 rcu_read_lock();
7796 list_for_each_entry_rcu(tgi, &task_groups, list) 7945 list_for_each_entry_rcu(tgi, &task_groups, list) {
7797 total += tgi->rt_ratio; 7946 if (tgi == tg)
7798 rcu_read_unlock(); 7947 continue;
7799 7948
7800 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) 7949 total += to_ratio(period, tgi->rt_runtime);
7801 return -EINVAL; 7950 }
7951 rcu_read_unlock();
7802 7952
7803 tg->rt_ratio = rt_ratio; 7953 return total + to_ratio(period, runtime) < global_ratio;
7804 return 0;
7805} 7954}
7806 7955
7807unsigned long sched_group_rt_ratio(struct task_group *tg) 7956int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7808{ 7957{
7809 return tg->rt_ratio; 7958 u64 rt_runtime, rt_period;
7959 int err = 0;
7960
7961 rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
7962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7963 if (rt_runtime_us == -1)
7964 rt_runtime = rt_period;
7965
7966 mutex_lock(&rt_constraints_mutex);
7967 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7968 err = -EINVAL;
7969 goto unlock;
7970 }
7971 if (rt_runtime_us == -1)
7972 rt_runtime = RUNTIME_INF;
7973 tg->rt_runtime = rt_runtime;
7974 unlock:
7975 mutex_unlock(&rt_constraints_mutex);
7976
7977 return err;
7810} 7978}
7811 7979
7812#endif /* CONFIG_FAIR_GROUP_SCHED */ 7980long sched_group_rt_runtime(struct task_group *tg)
7981{
7982 u64 rt_runtime_us;
7983
7984 if (tg->rt_runtime == RUNTIME_INF)
7985 return -1;
7986
7987 rt_runtime_us = tg->rt_runtime;
7988 do_div(rt_runtime_us, NSEC_PER_USEC);
7989 return rt_runtime_us;
7990}
7991#endif
7992#endif /* CONFIG_GROUP_SCHED */
7813 7993
7814#ifdef CONFIG_FAIR_CGROUP_SCHED 7994#ifdef CONFIG_CGROUP_SCHED
7815 7995
7816/* return corresponding task_group object of a cgroup */ 7996/* return corresponding task_group object of a cgroup */
7817static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7997static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
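The new __rt_schedulable() expresses each group's bandwidth as a 16-bit fixed-point fraction of its period and rejects a setting whose sum would pass the global limit derived from sysctl_sched_rt_period/sysctl_sched_rt_runtime. The stand-alone program below recomputes that check with plain integer math; the period, runtimes and group list are made up for the example, and it returns the quotient explicitly rather than mirroring the kernel helper line for line.

/* Fixed-point admission check in the spirit of to_ratio()/__rt_schedulable().
 * Example values are invented; RUNTIME_INF mirrors the "-1 means unlimited"
 * convention of rt_runtime_us. */
#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 16;		/* 100% in 16-bit fixed point */
	return (runtime << 16) / period;	/* fraction of the period */
}

int main(void)
{
	/* Global limit: 950000us of RT time per 1000000us period (95%). */
	uint64_t period = 1000000, global_runtime = 950000;
	uint64_t global = to_ratio(period, global_runtime);

	/* Existing groups' runtimes, plus the one we want to admit. */
	uint64_t groups[] = { 200000, 300000 };
	uint64_t new_runtime = 400000;

	uint64_t total = 0;
	for (unsigned i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
		total += to_ratio(period, groups[i]);

	if (total + to_ratio(period, new_runtime) < global)
		puts("admit: combined RT bandwidth stays under the global cap");
	else
		puts("reject: would exceed the global RT bandwidth");
	return 0;
}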
@@ -7857,9 +8037,15 @@ static int
7857cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8037cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858 struct task_struct *tsk) 8038 struct task_struct *tsk)
7859{ 8039{
8040#ifdef CONFIG_RT_GROUP_SCHED
8041 /* Don't accept realtime tasks when there is no way for them to run */
8042 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
8043 return -EINVAL;
8044#else
7860 /* We don't support RT-tasks being in separate groups */ 8045 /* We don't support RT-tasks being in separate groups */
7861 if (tsk->sched_class != &fair_sched_class) 8046 if (tsk->sched_class != &fair_sched_class)
7862 return -EINVAL; 8047 return -EINVAL;
8048#endif
7863 8049
7864 return 0; 8050 return 0;
7865} 8051}
@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7871 sched_move_task(tsk); 8057 sched_move_task(tsk);
7872} 8058}
7873 8059
8060#ifdef CONFIG_FAIR_GROUP_SCHED
7874static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8061static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7875 u64 shareval) 8062 u64 shareval)
7876{ 8063{
@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7883 8070
7884 return (u64) tg->shares; 8071 return (u64) tg->shares;
7885} 8072}
8073#endif
7886 8074
7887static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8075#ifdef CONFIG_RT_GROUP_SCHED
7888 u64 rt_ratio_val) 8076static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8077 struct file *file,
8078 const char __user *userbuf,
8079 size_t nbytes, loff_t *unused_ppos)
7889{ 8080{
7890 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); 8081 char buffer[64];
8082 int retval = 0;
8083 s64 val;
8084 char *end;
8085
8086 if (!nbytes)
8087 return -EINVAL;
8088 if (nbytes >= sizeof(buffer))
8089 return -E2BIG;
8090 if (copy_from_user(buffer, userbuf, nbytes))
8091 return -EFAULT;
8092
8093 buffer[nbytes] = 0; /* nul-terminate */
8094
8095 /* strip newline if necessary */
8096 if (nbytes && (buffer[nbytes-1] == '\n'))
8097 buffer[nbytes-1] = 0;
8098 val = simple_strtoll(buffer, &end, 0);
8099 if (*end)
8100 return -EINVAL;
8101
8102 /* Pass to subsystem */
8103 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8104 if (!retval)
8105 retval = nbytes;
8106 return retval;
7891} 8107}
7892 8108
7893static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) 8109static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
8110 struct file *file,
8111 char __user *buf, size_t nbytes,
8112 loff_t *ppos)
7894{ 8113{
7895 struct task_group *tg = cgroup_tg(cgrp); 8114 char tmp[64];
8115 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
8116 int len = sprintf(tmp, "%ld\n", val);
7896 8117
7897 return (u64) tg->rt_ratio; 8118 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
7898} 8119}
8120#endif
7899 8121
7900static struct cftype cpu_files[] = { 8122static struct cftype cpu_files[] = {
8123#ifdef CONFIG_FAIR_GROUP_SCHED
7901 { 8124 {
7902 .name = "shares", 8125 .name = "shares",
7903 .read_uint = cpu_shares_read_uint, 8126 .read_uint = cpu_shares_read_uint,
7904 .write_uint = cpu_shares_write_uint, 8127 .write_uint = cpu_shares_write_uint,
7905 }, 8128 },
8129#endif
8130#ifdef CONFIG_RT_GROUP_SCHED
7906 { 8131 {
7907 .name = "rt_ratio", 8132 .name = "rt_runtime_us",
7908 .read_uint = cpu_rt_ratio_read_uint, 8133 .read = cpu_rt_runtime_read,
7909 .write_uint = cpu_rt_ratio_write_uint, 8134 .write = cpu_rt_runtime_write,
7910 }, 8135 },
8136#endif
7911}; 8137};
7912 8138
7913static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8139static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
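With CONFIG_RT_GROUP_SCHED the cpu controller now exposes rt_runtime_us in place of rt_ratio, and the write handler parses the value with simple_strtoll so that -1 (unlimited) can be written. Assuming the cpu cgroup is mounted at /dev/cgroup and a child group already exists (both are assumptions for the example, not something this patch provides), a user-space caller could set the budget roughly like this:

/* Rough user-space sketch: set an RT runtime budget for one cgroup.
 * The mount point and group name are assumed for illustration. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/cgroup/rtgroup/cpu.rt_runtime_us";
	const char *val  = "500000\n";	/* 500ms of RT time per period */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}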
@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7926 .early_init = 1, 8152 .early_init = 1,
7927}; 8153};
7928 8154
7929#endif /* CONFIG_FAIR_CGROUP_SCHED */ 8155#endif /* CONFIG_CGROUP_SCHED */
7930 8156
7931#ifdef CONFIG_CGROUP_CPUACCT 8157#ifdef CONFIG_CGROUP_CPUACCT
7932 8158
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef2..f54792b175b2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
55 return !list_empty(&rt_se->run_list); 55 return !list_empty(&rt_se->run_list);
56} 56}
57 57
58#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_RT_GROUP_SCHED
59 59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 60static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
61{ 61{
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_ratio; 65 return rt_rq->tg->rt_runtime;
66} 66}
67 67
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 68#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91 91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 92static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
93{ 93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se; 94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95 95
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
102 } 102 }
103} 103}
104 104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 105static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
106{ 106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se; 107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108 108
@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
110 dequeue_rt_entity(rt_se); 110 dequeue_rt_entity(rt_se);
111} 111}
112 112
113static inline int rt_rq_throttled(struct rt_rq *rt_rq)
114{
115 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
116}
117
118static int rt_se_boosted(struct sched_rt_entity *rt_se)
119{
120 struct rt_rq *rt_rq = group_rt_rq(rt_se);
121 struct task_struct *p;
122
123 if (rt_rq)
124 return !!rt_rq->rt_nr_boosted;
125
126 p = rt_task_of(rt_se);
127 return p->prio != p->normal_prio;
128}
129
113#else 130#else
114 131
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) 132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
116{ 133{
117 return sysctl_sched_rt_ratio; 134 if (sysctl_sched_rt_runtime == -1)
135 return RUNTIME_INF;
136
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
118} 138}
119 139
120#define for_each_leaf_rt_rq(rt_rq, rq) \ 140#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
141 return NULL; 161 return NULL;
142} 162}
143 163
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) 164static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
145{ 165{
146} 166}
147 167
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) 168static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
149{ 169{
150} 170}
151 171
172static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{
174 return rt_rq->rt_throttled;
175}
152#endif 176#endif
153 177
154static inline int rt_se_prio(struct sched_rt_entity *rt_se) 178static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{ 179{
156#ifdef CONFIG_FAIR_GROUP_SCHED 180#ifdef CONFIG_RT_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se); 181 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158 182
159 if (rt_rq) 183 if (rt_rq)
@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
163 return rt_task_of(rt_se)->prio; 187 return rt_task_of(rt_se)->prio;
164} 188}
165 189
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) 190static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
167{ 191{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq); 192 u64 runtime = sched_rt_runtime(rt_rq);
169 u64 period, ratio;
170 193
171 if (rt_ratio == SCHED_RT_FRAC) 194 if (runtime == RUNTIME_INF)
172 return 0; 195 return 0;
173 196
174 if (rt_rq->rt_throttled) 197 if (rt_rq->rt_throttled)
175 return 1; 198 return rt_rq_throttled(rt_rq);
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179 199
180 if (rt_rq->rt_time > ratio) { 200 if (rt_rq->rt_time > runtime) {
181 struct rq *rq = rq_of_rt_rq(rt_rq); 201 struct rq *rq = rq_of_rt_rq(rt_rq);
182 202
183 rq->rt_throttled = 1; 203 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1; 204 rt_rq->rt_throttled = 1;
185 205
186 sched_rt_ratio_dequeue(rt_rq); 206 if (rt_rq_throttled(rt_rq)) {
187 return 1; 207 sched_rt_rq_dequeue(rt_rq);
208 return 1;
209 }
188 } 210 }
189 211
190 return 0; 212 return 0;
@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
196 u64 period; 218 u64 period;
197 219
198 while (rq->clock > rq->rt_period_expire) { 220 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
200 rq->rt_period_expire += period; 222 rq->rt_period_expire += period;
201 223
202 for_each_leaf_rt_rq(rt_rq, rq) { 224 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq); 225 u64 runtime = sched_rt_runtime(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205 226
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio); 227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
207 if (rt_rq->rt_throttled) { 228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
208 rt_rq->rt_throttled = 0; 229 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq); 230 sched_rt_rq_enqueue(rt_rq);
210 } 231 }
211 } 232 }
212 233
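Taken together, sched_rt_runtime_exceeded() and update_sched_rt_period() implement a simple budget: rt_time accumulates while RT tasks run, the queue is throttled once it passes the runtime, and at each period boundary the accumulated time is paid back and the throttle lifted once the balance drops below the budget again. A toy user-space model of that bookkeeping (all numbers invented) is:

/* Toy model of the RT throttling bookkeeping: accumulate rt_time,
 * throttle past the runtime budget, replenish once per period. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t runtime = 950;	/* budget per period */
	uint64_t rt_time = 0;
	int throttled = 0;

	for (int tick = 1; tick <= 12; tick++) {
		if (!throttled)
			rt_time += 200;		/* RT task ran for 200 units */

		if (!throttled && rt_time > runtime) {
			throttled = 1;
			printf("tick %2d: rt_time=%4llu -> throttled\n",
			       tick, (unsigned long long)rt_time);
		}

		if (tick % 5 == 0) {		/* period boundary */
			rt_time -= rt_time < runtime ? rt_time : runtime;
			if (throttled && rt_time < runtime) {
				throttled = 0;
				printf("tick %2d: replenished, rt_time=%llu -> unthrottled\n",
				       tick, (unsigned long long)rt_time);
			}
		}
	}
	return 0;
}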
@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
239 cpuacct_charge(curr, delta_exec); 260 cpuacct_charge(curr, delta_exec);
240 261
241 rt_rq->rt_time += delta_exec; 262 rt_rq->rt_time += delta_exec;
242 /* 263 if (sched_rt_runtime_exceeded(rt_rq))
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr); 264 resched_task(curr);
249} 265}
250 266
@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{ 269{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 270 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++; 271 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 272#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 273 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se); 274 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif 275#endif
@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
265 281
266 update_rt_migration(rq_of_rt_rq(rt_rq)); 282 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif 283#endif
284#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++;
287#endif
268} 288}
269 289
270static inline 290static inline
@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
273 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 293 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running); 294 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--; 295 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
277 if (rt_rq->rt_nr_running) { 297 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array; 298 struct rt_prio_array *array;
279 299
@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
295 315
296 update_rt_migration(rq_of_rt_rq(rt_rq)); 316 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */ 317#endif /* CONFIG_SMP */
318#ifdef CONFIG_RT_GROUP_SCHED
319 if (rt_se_boosted(rt_se))
320 rt_rq->rt_nr_boosted--;
321
322 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
323#endif
298} 324}
299 325
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 326static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
303 struct rt_prio_array *array = &rt_rq->active; 329 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se); 330 struct rt_rq *group_rq = group_rt_rq(rt_se);
305 331
306 if (group_rq && group_rq->rt_throttled) 332 if (group_rq && rt_rq_throttled(group_rq))
307 return; 333 return;
308 334
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 335 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
496 if (unlikely(!rt_rq->rt_nr_running)) 522 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL; 523 return NULL;
498 524
499 if (sched_rt_ratio_exceeded(rt_rq)) 525 if (rt_rq_throttled(rt_rq))
500 return NULL; 526 return NULL;
501 527
502 do { 528 do {
diff --git a/kernel/signal.c b/kernel/signal.c
index 4333b6dbb424..84917fe507f7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -911,27 +911,6 @@ __group_complete_signal(int sig, struct task_struct *p)
911 } while_each_thread(p, t); 911 } while_each_thread(p, t);
912 return; 912 return;
913 } 913 }
914
915 /*
916 * There will be a core dump. We make all threads other
917 * than the chosen one go into a group stop so that nothing
918 * happens until it gets scheduled, takes the signal off
919 * the shared queue, and does the core dump. This is a
920 * little more complicated than strictly necessary, but it
921 * keeps the signal state that winds up in the core dump
922 * unchanged from the death state, e.g. which thread had
923 * the core-dump signal unblocked.
924 */
925 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
926 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
927 p->signal->group_stop_count = 0;
928 p->signal->group_exit_task = t;
929 p = t;
930 do {
931 p->signal->group_stop_count++;
932 signal_wake_up(t, t == p);
933 } while_each_thread(p, t);
934 return;
935 } 914 }
936 915
937 /* 916 /*
@@ -978,7 +957,6 @@ void zap_other_threads(struct task_struct *p)
978{ 957{
979 struct task_struct *t; 958 struct task_struct *t;
980 959
981 p->signal->flags = SIGNAL_GROUP_EXIT;
982 p->signal->group_stop_count = 0; 960 p->signal->group_stop_count = 0;
983 961
984 for (t = next_thread(p); t != p; t = next_thread(t)) { 962 for (t = next_thread(p); t != p; t = next_thread(t)) {
@@ -994,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
994 } 972 }
995} 973}
996 974
997int fastcall __fatal_signal_pending(struct task_struct *tsk) 975int __fatal_signal_pending(struct task_struct *tsk)
998{ 976{
999 return sigismember(&tsk->pending.signal, SIGKILL); 977 return sigismember(&tsk->pending.signal, SIGKILL);
1000} 978}
@@ -1040,7 +1018,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1040} 1018}
1041 1019
1042/* 1020/*
1043 * kill_pgrp_info() sends a signal to a process group: this is what the tty 1021 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
1044 * control characters do (^C, ^Z etc) 1022 * control characters do (^C, ^Z etc)
1045 */ 1023 */
1046 1024
@@ -1059,30 +1037,28 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1059 return success ? 0 : retval; 1037 return success ? 0 : retval;
1060} 1038}
1061 1039
1062int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1063{
1064 int retval;
1065
1066 read_lock(&tasklist_lock);
1067 retval = __kill_pgrp_info(sig, info, pgrp);
1068 read_unlock(&tasklist_lock);
1069
1070 return retval;
1071}
1072
1073int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) 1040int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1074{ 1041{
1075 int error; 1042 int error = -ESRCH;
1076 struct task_struct *p; 1043 struct task_struct *p;
1077 1044
1078 rcu_read_lock(); 1045 rcu_read_lock();
1079 if (unlikely(sig_needs_tasklist(sig))) 1046 if (unlikely(sig_needs_tasklist(sig)))
1080 read_lock(&tasklist_lock); 1047 read_lock(&tasklist_lock);
1081 1048
1049retry:
1082 p = pid_task(pid, PIDTYPE_PID); 1050 p = pid_task(pid, PIDTYPE_PID);
1083 error = -ESRCH; 1051 if (p) {
1084 if (p)
1085 error = group_send_sig_info(sig, info, p); 1052 error = group_send_sig_info(sig, info, p);
1053 if (unlikely(error == -ESRCH))
1054 /*
1055 * The task was unhashed in between, try again.
1056 * If it is dead, pid_task() will return NULL,
1057 * if we race with de_thread() it will find the
1058 * new leader.
1059 */
1060 goto retry;
1061 }
1086 1062
1087 if (unlikely(sig_needs_tasklist(sig))) 1063 if (unlikely(sig_needs_tasklist(sig)))
1088 read_unlock(&tasklist_lock); 1064 read_unlock(&tasklist_lock);
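kill_pid_info() above now retries the pid lookup when group_send_sig_info() returns -ESRCH, because the target can be unhashed by de_thread() between the lookup and the send. The loop below is a generic user-space rendering of that look-up, act, retry-if-it-vanished idiom; the table, the simulated race and the helper names are all invented for illustration.

/* Generic "look up, act, retry if the object went away" loop in the
 * shape of the reworked kill_pid_info(). Everything here is invented. */
#include <errno.h>
#include <stdio.h>

struct obj { const char *name; int alive; };

static struct obj table[] = { { "old-leader", 0 }, { "new-leader", 1 } };
static int lookups;

/* Pretend the first lookup races with the object being replaced. */
static struct obj *lookup(const char *name)
{
	(void)name;
	return &table[lookups++ ? 1 : 0];
}

static int act(struct obj *o)
{
	return o->alive ? 0 : -ESRCH;	/* -ESRCH: it went away under us */
}

int main(void)
{
	int err = -ESRCH;
	struct obj *o;

retry:
	o = lookup("leader");
	if (o) {
		err = act(o);
		if (err == -ESRCH)	/* raced with replacement: try again */
			goto retry;
	}
	printf("result: %d after %d lookup(s)\n", err, lookups);
	return 0;
}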
@@ -1147,14 +1123,22 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1147static int kill_something_info(int sig, struct siginfo *info, int pid) 1123static int kill_something_info(int sig, struct siginfo *info, int pid)
1148{ 1124{
1149 int ret; 1125 int ret;
1150 rcu_read_lock(); 1126
1151 if (!pid) { 1127 if (pid > 0) {
1152 ret = kill_pgrp_info(sig, info, task_pgrp(current)); 1128 rcu_read_lock();
1153 } else if (pid == -1) { 1129 ret = kill_pid_info(sig, info, find_vpid(pid));
1130 rcu_read_unlock();
1131 return ret;
1132 }
1133
1134 read_lock(&tasklist_lock);
1135 if (pid != -1) {
1136 ret = __kill_pgrp_info(sig, info,
1137 pid ? find_vpid(-pid) : task_pgrp(current));
1138 } else {
1154 int retval = 0, count = 0; 1139 int retval = 0, count = 0;
1155 struct task_struct * p; 1140 struct task_struct * p;
1156 1141
1157 read_lock(&tasklist_lock);
1158 for_each_process(p) { 1142 for_each_process(p) {
1159 if (p->pid > 1 && !same_thread_group(p, current)) { 1143 if (p->pid > 1 && !same_thread_group(p, current)) {
1160 int err = group_send_sig_info(sig, info, p); 1144 int err = group_send_sig_info(sig, info, p);
@@ -1163,14 +1147,10 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1163 retval = err; 1147 retval = err;
1164 } 1148 }
1165 } 1149 }
1166 read_unlock(&tasklist_lock);
1167 ret = count ? retval : -ESRCH; 1150 ret = count ? retval : -ESRCH;
1168 } else if (pid < 0) {
1169 ret = kill_pgrp_info(sig, info, find_vpid(-pid));
1170 } else {
1171 ret = kill_pid_info(sig, info, find_vpid(pid));
1172 } 1151 }
1173 rcu_read_unlock(); 1152 read_unlock(&tasklist_lock);
1153
1174 return ret; 1154 return ret;
1175} 1155}
1176 1156
@@ -1218,20 +1198,6 @@ send_sig(int sig, struct task_struct *p, int priv)
1218 return send_sig_info(sig, __si_special(priv), p); 1198 return send_sig_info(sig, __si_special(priv), p);
1219} 1199}
1220 1200
1221/*
1222 * This is the entry point for "process-wide" signals.
1223 * They will go to an appropriate thread in the thread group.
1224 */
1225int
1226send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1227{
1228 int ret;
1229 read_lock(&tasklist_lock);
1230 ret = group_send_sig_info(sig, info, p);
1231 read_unlock(&tasklist_lock);
1232 return ret;
1233}
1234
1235void 1201void
1236force_sig(int sig, struct task_struct *p) 1202force_sig(int sig, struct task_struct *p)
1237{ 1203{
@@ -1259,7 +1225,13 @@ force_sigsegv(int sig, struct task_struct *p)
1259 1225
1260int kill_pgrp(struct pid *pid, int sig, int priv) 1226int kill_pgrp(struct pid *pid, int sig, int priv)
1261{ 1227{
1262 return kill_pgrp_info(sig, __si_special(priv), pid); 1228 int ret;
1229
1230 read_lock(&tasklist_lock);
1231 ret = __kill_pgrp_info(sig, __si_special(priv), pid);
1232 read_unlock(&tasklist_lock);
1233
1234 return ret;
1263} 1235}
1264EXPORT_SYMBOL(kill_pgrp); 1236EXPORT_SYMBOL(kill_pgrp);
1265 1237
@@ -1578,11 +1550,6 @@ static inline int may_ptrace_stop(void)
1578{ 1550{
1579 if (!likely(current->ptrace & PT_PTRACED)) 1551 if (!likely(current->ptrace & PT_PTRACED))
1580 return 0; 1552 return 0;
1581
1582 if (unlikely(current->parent == current->real_parent &&
1583 (current->ptrace & PT_ATTACHED)))
1584 return 0;
1585
1586 /* 1553 /*
1587 * Are we in the middle of do_coredump? 1554 * Are we in the middle of do_coredump?
1588 * If so and our tracer is also part of the coredump stopping 1555 * If so and our tracer is also part of the coredump stopping
@@ -1600,6 +1567,17 @@ static inline int may_ptrace_stop(void)
1600} 1567}
1601 1568
1602/* 1569/*
1570 * Return nonzero if there is a SIGKILL that should be waking us up.
1571 * Called with the siglock held.
1572 */
1573static int sigkill_pending(struct task_struct *tsk)
1574{
1575 return ((sigismember(&tsk->pending.signal, SIGKILL) ||
1576 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
1577 !unlikely(sigismember(&tsk->blocked, SIGKILL)));
1578}
1579
1580/*
1603 * This must be called with current->sighand->siglock held. 1581 * This must be called with current->sighand->siglock held.
1604 * 1582 *
1605 * This should be the path for all ptrace stops. 1583 * This should be the path for all ptrace stops.
@@ -1607,11 +1585,31 @@ static inline int may_ptrace_stop(void)
1607 * That makes it a way to test a stopped process for 1585 * That makes it a way to test a stopped process for
1608 * being ptrace-stopped vs being job-control-stopped. 1586 * being ptrace-stopped vs being job-control-stopped.
1609 * 1587 *
1610 * If we actually decide not to stop at all because the tracer is gone, 1588 * If we actually decide not to stop at all because the tracer
1611 * we leave nostop_code in current->exit_code. 1589 * is gone, we keep current->exit_code unless clear_code.
1612 */ 1590 */
1613static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) 1591static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1614{ 1592{
1593 int killed = 0;
1594
1595 if (arch_ptrace_stop_needed(exit_code, info)) {
1596 /*
1597 * The arch code has something special to do before a
1598 * ptrace stop. This is allowed to block, e.g. for faults
1599 * on user stack pages. We can't keep the siglock while
1600 * calling arch_ptrace_stop, so we must release it now.
1601 * To preserve proper semantics, we must do this before
1602 * any signal bookkeeping like checking group_stop_count.
1603 * Meanwhile, a SIGKILL could come in before we retake the
1604 * siglock. That must prevent us from sleeping in TASK_TRACED.
1605 * So after regaining the lock, we must check for SIGKILL.
1606 */
1607 spin_unlock_irq(&current->sighand->siglock);
1608 arch_ptrace_stop(exit_code, info);
1609 spin_lock_irq(&current->sighand->siglock);
1610 killed = sigkill_pending(current);
1611 }
1612
1615 /* 1613 /*
1616 * If there is a group stop in progress, 1614 * If there is a group stop in progress,
1617 * we must participate in the bookkeeping. 1615 * we must participate in the bookkeeping.
@@ -1623,22 +1621,23 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1623 current->exit_code = exit_code; 1621 current->exit_code = exit_code;
1624 1622
1625 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1626 set_current_state(TASK_TRACED); 1624 __set_current_state(TASK_TRACED);
1627 spin_unlock_irq(&current->sighand->siglock); 1625 spin_unlock_irq(&current->sighand->siglock);
1628 try_to_freeze(); 1626 try_to_freeze();
1629 read_lock(&tasklist_lock); 1627 read_lock(&tasklist_lock);
1630 if (may_ptrace_stop()) { 1628 if (!unlikely(killed) && may_ptrace_stop()) {
1631 do_notify_parent_cldstop(current, CLD_TRAPPED); 1629 do_notify_parent_cldstop(current, CLD_TRAPPED);
1632 read_unlock(&tasklist_lock); 1630 read_unlock(&tasklist_lock);
1633 schedule(); 1631 schedule();
1634 } else { 1632 } else {
1635 /* 1633 /*
1636 * By the time we got the lock, our tracer went away. 1634 * By the time we got the lock, our tracer went away.
1637 * Don't stop here. 1635 * Don't drop the lock yet, another tracer may come.
1638 */ 1636 */
1637 __set_current_state(TASK_RUNNING);
1638 if (clear_code)
1639 current->exit_code = 0;
1639 read_unlock(&tasklist_lock); 1640 read_unlock(&tasklist_lock);
1640 set_current_state(TASK_RUNNING);
1641 current->exit_code = nostop_code;
1642 } 1641 }
1643 1642
1644 /* 1643 /*
@@ -1671,7 +1670,7 @@ void ptrace_notify(int exit_code)
1671 1670
1672 /* Let the debugger run. */ 1671 /* Let the debugger run. */
1673 spin_lock_irq(&current->sighand->siglock); 1672 spin_lock_irq(&current->sighand->siglock);
1674 ptrace_stop(exit_code, 0, &info); 1673 ptrace_stop(exit_code, 1, &info);
1675 spin_unlock_irq(&current->sighand->siglock); 1674 spin_unlock_irq(&current->sighand->siglock);
1676} 1675}
1677 1676
@@ -1709,9 +1708,6 @@ static int do_signal_stop(int signr)
1709 struct signal_struct *sig = current->signal; 1708 struct signal_struct *sig = current->signal;
1710 int stop_count; 1709 int stop_count;
1711 1710
1712 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1713 return 0;
1714
1715 if (sig->group_stop_count > 0) { 1711 if (sig->group_stop_count > 0) {
1716 /* 1712 /*
1717 * There is a group stop in progress. We don't need to 1713 * There is a group stop in progress. We don't need to
@@ -1719,12 +1715,15 @@ static int do_signal_stop(int signr)
1719 */ 1715 */
1720 stop_count = --sig->group_stop_count; 1716 stop_count = --sig->group_stop_count;
1721 } else { 1717 } else {
1718 struct task_struct *t;
1719
1720 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
1721 unlikely(sig->group_exit_task))
1722 return 0;
1722 /* 1723 /*
1723 * There is no group stop already in progress. 1724 * There is no group stop already in progress.
1724 * We must initiate one now. 1725 * We must initiate one now.
1725 */ 1726 */
1726 struct task_struct *t;
1727
1728 sig->group_exit_code = signr; 1727 sig->group_exit_code = signr;
1729 1728
1730 stop_count = 0; 1729 stop_count = 0;
@@ -1734,7 +1733,7 @@ static int do_signal_stop(int signr)
1734 * stop is always done with the siglock held, 1733 * stop is always done with the siglock held,
1735 * so this check has no races. 1734 * so this check has no races.
1736 */ 1735 */
1737 if (!t->exit_state && 1736 if (!(t->flags & PF_EXITING) &&
1738 !task_is_stopped_or_traced(t)) { 1737 !task_is_stopped_or_traced(t)) {
1739 stop_count++; 1738 stop_count++;
1740 signal_wake_up(t, 0); 1739 signal_wake_up(t, 0);
@@ -1752,47 +1751,6 @@ static int do_signal_stop(int signr)
1752 return 1; 1751 return 1;
1753} 1752}
1754 1753
1755/*
1756 * Do appropriate magic when group_stop_count > 0.
1757 * We return nonzero if we stopped, after releasing the siglock.
1758 * We return zero if we still hold the siglock and should look
1759 * for another signal without checking group_stop_count again.
1760 */
1761static int handle_group_stop(void)
1762{
1763 int stop_count;
1764
1765 if (current->signal->group_exit_task == current) {
1766 /*
1767 * Group stop is so we can do a core dump,
1768 * We are the initiating thread, so get on with it.
1769 */
1770 current->signal->group_exit_task = NULL;
1771 return 0;
1772 }
1773
1774 if (current->signal->flags & SIGNAL_GROUP_EXIT)
1775 /*
1776 * Group stop is so another thread can do a core dump,
1777 * or else we are racing against a death signal.
1778 * Just punt the stop so we can get the next signal.
1779 */
1780 return 0;
1781
1782 /*
1783 * There is a group stop in progress. We stop
1784 * without any associated signal being in our queue.
1785 */
1786 stop_count = --current->signal->group_stop_count;
1787 if (stop_count == 0)
1788 current->signal->flags = SIGNAL_STOP_STOPPED;
1789 current->exit_code = current->signal->group_exit_code;
1790 set_current_state(TASK_STOPPED);
1791 spin_unlock_irq(&current->sighand->siglock);
1792 finish_stop(stop_count);
1793 return 1;
1794}
1795
1796int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1754int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1797 struct pt_regs *regs, void *cookie) 1755 struct pt_regs *regs, void *cookie)
1798{ 1756{
@@ -1807,7 +1765,7 @@ relock:
1807 struct k_sigaction *ka; 1765 struct k_sigaction *ka;
1808 1766
1809 if (unlikely(current->signal->group_stop_count > 0) && 1767 if (unlikely(current->signal->group_stop_count > 0) &&
1810 handle_group_stop()) 1768 do_signal_stop(0))
1811 goto relock; 1769 goto relock;
1812 1770
1813 signr = dequeue_signal(current, mask, info); 1771 signr = dequeue_signal(current, mask, info);
@@ -1819,7 +1777,7 @@ relock:
1819 ptrace_signal_deliver(regs, cookie); 1777 ptrace_signal_deliver(regs, cookie);
1820 1778
1821 /* Let the debugger run. */ 1779 /* Let the debugger run. */
1822 ptrace_stop(signr, signr, info); 1780 ptrace_stop(signr, 0, info);
1823 1781
1824 /* We're back. Did the debugger cancel the sig? */ 1782 /* We're back. Did the debugger cancel the sig? */
1825 signr = current->exit_code; 1783 signr = current->exit_code;
@@ -1936,6 +1894,48 @@ relock:
1936 return signr; 1894 return signr;
1937} 1895}
1938 1896
1897void exit_signals(struct task_struct *tsk)
1898{
1899 int group_stop = 0;
1900 struct task_struct *t;
1901
1902 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
1903 tsk->flags |= PF_EXITING;
1904 return;
1905 }
1906
1907 spin_lock_irq(&tsk->sighand->siglock);
1908 /*
1909 * From now this task is not visible for group-wide signals,
1910 * see wants_signal(), do_signal_stop().
1911 */
1912 tsk->flags |= PF_EXITING;
1913 if (!signal_pending(tsk))
1914 goto out;
1915
1916 /* It could be that __group_complete_signal() choose us to
1917 * notify about group-wide signal. Another thread should be
1918 * woken now to take the signal since we will not.
1919 */
1920 for (t = tsk; (t = next_thread(t)) != tsk; )
1921 if (!signal_pending(t) && !(t->flags & PF_EXITING))
1922 recalc_sigpending_and_wake(t);
1923
1924 if (unlikely(tsk->signal->group_stop_count) &&
1925 !--tsk->signal->group_stop_count) {
1926 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1927 group_stop = 1;
1928 }
1929out:
1930 spin_unlock_irq(&tsk->sighand->siglock);
1931
1932 if (unlikely(group_stop)) {
1933 read_lock(&tasklist_lock);
1934 do_notify_parent_cldstop(tsk, CLD_STOPPED);
1935 read_unlock(&tasklist_lock);
1936 }
1937}
1938
1939EXPORT_SYMBOL(recalc_sigpending); 1939EXPORT_SYMBOL(recalc_sigpending);
1940EXPORT_SYMBOL_GPL(dequeue_signal); 1940EXPORT_SYMBOL_GPL(dequeue_signal);
1941EXPORT_SYMBOL(flush_signals); 1941EXPORT_SYMBOL(flush_signals);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7837d45419e..5b3aea5f471e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -320,7 +320,7 @@ void irq_exit(void)
320/* 320/*
321 * This function must run with irqs disabled! 321 * This function must run with irqs disabled!
322 */ 322 */
323inline fastcall void raise_softirq_irqoff(unsigned int nr) 323inline void raise_softirq_irqoff(unsigned int nr)
324{ 324{
325 __raise_softirq_irqoff(nr); 325 __raise_softirq_irqoff(nr);
326 326
@@ -337,7 +337,7 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
337 wakeup_softirqd(); 337 wakeup_softirqd();
338} 338}
339 339
340void fastcall raise_softirq(unsigned int nr) 340void raise_softirq(unsigned int nr)
341{ 341{
342 unsigned long flags; 342 unsigned long flags;
343 343
@@ -363,7 +363,7 @@ struct tasklet_head
363static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; 363static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
364static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; 364static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
365 365
366void fastcall __tasklet_schedule(struct tasklet_struct *t) 366void __tasklet_schedule(struct tasklet_struct *t)
367{ 367{
368 unsigned long flags; 368 unsigned long flags;
369 369
@@ -376,7 +376,7 @@ void fastcall __tasklet_schedule(struct tasklet_struct *t)
376 376
377EXPORT_SYMBOL(__tasklet_schedule); 377EXPORT_SYMBOL(__tasklet_schedule);
378 378
379void fastcall __tasklet_hi_schedule(struct tasklet_struct *t) 379void __tasklet_hi_schedule(struct tasklet_struct *t)
380{ 380{
381 unsigned long flags; 381 unsigned long flags;
382 382
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 3507cabe963b..b0aeeaf22ce4 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -74,7 +74,7 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
74 * severe errors when invoked on an active srcu_struct. That said, it 74 * severe errors when invoked on an active srcu_struct. That said, it
75 * can be useful as an error check at cleanup time. 75 * can be useful as an error check at cleanup time.
76 */ 76 */
77int srcu_readers_active(struct srcu_struct *sp) 77static int srcu_readers_active(struct srcu_struct *sp)
78{ 78{
79 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 79 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
80} 80}
@@ -255,4 +255,3 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
255EXPORT_SYMBOL_GPL(srcu_read_unlock); 255EXPORT_SYMBOL_GPL(srcu_read_unlock);
256EXPORT_SYMBOL_GPL(synchronize_srcu); 256EXPORT_SYMBOL_GPL(synchronize_srcu);
257EXPORT_SYMBOL_GPL(srcu_batches_completed); 257EXPORT_SYMBOL_GPL(srcu_batches_completed);
258EXPORT_SYMBOL_GPL(srcu_readers_active);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51b5ee53571a..6f4e0e13f70c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -29,7 +29,6 @@ enum stopmachine_state {
29static enum stopmachine_state stopmachine_state; 29static enum stopmachine_state stopmachine_state;
30static unsigned int stopmachine_num_threads; 30static unsigned int stopmachine_num_threads;
31static atomic_t stopmachine_thread_ack; 31static atomic_t stopmachine_thread_ack;
32static DECLARE_MUTEX(stopmachine_mutex);
33 32
34static int stopmachine(void *cpu) 33static int stopmachine(void *cpu)
35{ 34{
@@ -170,6 +169,7 @@ static int do_stop(void *_smdata)
170struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 169struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
171 unsigned int cpu) 170 unsigned int cpu)
172{ 171{
172 static DEFINE_MUTEX(stopmachine_mutex);
173 struct stop_machine_data smdata; 173 struct stop_machine_data smdata;
174 struct task_struct *p; 174 struct task_struct *p;
175 175
@@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
177 smdata.data = data; 177 smdata.data = data;
178 init_completion(&smdata.done); 178 init_completion(&smdata.done);
179 179
180 down(&stopmachine_mutex); 180 mutex_lock(&stopmachine_mutex);
181 181
182 /* If they don't care which CPU fn runs on, bind to any online one. */ 182 /* If they don't care which CPU fn runs on, bind to any online one. */
183 if (cpu == NR_CPUS) 183 if (cpu == NR_CPUS)
@@ -193,7 +193,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
193 wake_up_process(p); 193 wake_up_process(p);
194 wait_for_completion(&smdata.done); 194 wait_for_completion(&smdata.done);
195 } 195 }
196 up(&stopmachine_mutex); 196 mutex_unlock(&stopmachine_mutex);
197 return p; 197 return p;
198} 198}
199 199
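__stop_machine_run() drops the file-scope DECLARE_MUTEX() semaphore in favour of a function-local static DEFINE_MUTEX() taken with mutex_lock()/mutex_unlock(). The same shape in portable user-space code, with a pthread mutex standing in for the kernel mutex, would be roughly:

/* User-space analogue of the stop_machine change: the lock lives as a
 * function-local static and is a real mutex, not a counting semaphore. */
#include <pthread.h>
#include <stdio.h>

static int run_serialized(int (*fn)(void *), void *data)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	int ret;

	pthread_mutex_lock(&lock);	/* was down(&stopmachine_mutex) */
	ret = fn(data);
	pthread_mutex_unlock(&lock);	/* was up(&stopmachine_mutex) */
	return ret;
}

static int hello(void *data)
{
	printf("running %s with the lock held\n", (const char *)data);
	return 0;
}

int main(void)
{
	return run_serialized(hello, "example");
}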
diff --git a/kernel/sys.c b/kernel/sys.c
index d1fe71eb4546..a626116af5db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -315,7 +315,7 @@ static void kernel_kexec(void)
315#endif 315#endif
316} 316}
317 317
318void kernel_shutdown_prepare(enum system_states state) 318static void kernel_shutdown_prepare(enum system_states state)
319{ 319{
320 blocking_notifier_call_chain(&reboot_notifier_list, 320 blocking_notifier_call_chain(&reboot_notifier_list,
321 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 321 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
@@ -916,8 +916,8 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
916{ 916{
917 struct task_struct *p; 917 struct task_struct *p;
918 struct task_struct *group_leader = current->group_leader; 918 struct task_struct *group_leader = current->group_leader;
919 int err = -EINVAL; 919 struct pid *pgrp;
920 struct pid_namespace *ns; 920 int err;
921 921
922 if (!pid) 922 if (!pid)
923 pid = task_pid_vnr(group_leader); 923 pid = task_pid_vnr(group_leader);
@@ -929,12 +929,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
929 /* From this point forward we keep holding onto the tasklist lock 929 /* From this point forward we keep holding onto the tasklist lock
930 * so that our parent does not change from under us. -DaveM 930 * so that our parent does not change from under us. -DaveM
931 */ 931 */
932 ns = current->nsproxy->pid_ns;
933
934 write_lock_irq(&tasklist_lock); 932 write_lock_irq(&tasklist_lock);
935 933
936 err = -ESRCH; 934 err = -ESRCH;
937 p = find_task_by_pid_ns(pid, ns); 935 p = find_task_by_vpid(pid);
938 if (!p) 936 if (!p)
939 goto out; 937 goto out;
940 938
@@ -942,7 +940,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
942 if (!thread_group_leader(p)) 940 if (!thread_group_leader(p))
943 goto out; 941 goto out;
944 942
945 if (p->real_parent->tgid == group_leader->tgid) { 943 if (same_thread_group(p->real_parent, group_leader)) {
946 err = -EPERM; 944 err = -EPERM;
947 if (task_session(p) != task_session(group_leader)) 945 if (task_session(p) != task_session(group_leader))
948 goto out; 946 goto out;
@@ -959,10 +957,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
959 if (p->signal->leader) 957 if (p->signal->leader)
960 goto out; 958 goto out;
961 959
960 pgrp = task_pid(p);
962 if (pgid != pid) { 961 if (pgid != pid) {
963 struct task_struct *g; 962 struct task_struct *g;
964 963
965 g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns); 964 pgrp = find_vpid(pgid);
965 g = pid_task(pgrp, PIDTYPE_PGID);
966 if (!g || task_session(g) != task_session(group_leader)) 966 if (!g || task_session(g) != task_session(group_leader))
967 goto out; 967 goto out;
968 } 968 }
@@ -971,13 +971,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
971 if (err) 971 if (err)
972 goto out; 972 goto out;
973 973
974 if (task_pgrp_nr_ns(p, ns) != pgid) { 974 if (task_pgrp(p) != pgrp) {
975 struct pid *pid;
976
977 detach_pid(p, PIDTYPE_PGID); 975 detach_pid(p, PIDTYPE_PGID);
978 pid = find_vpid(pgid); 976 attach_pid(p, PIDTYPE_PGID, pgrp);
979 attach_pid(p, PIDTYPE_PGID, pid); 977 set_task_pgrp(p, pid_nr(pgrp));
980 set_task_pgrp(p, pid_nr(pid));
981 } 978 }
982 979
983 err = 0; 980 err = 0;
@@ -994,17 +991,14 @@ asmlinkage long sys_getpgid(pid_t pid)
994 else { 991 else {
995 int retval; 992 int retval;
996 struct task_struct *p; 993 struct task_struct *p;
997 struct pid_namespace *ns;
998
999 ns = current->nsproxy->pid_ns;
1000 994
1001 read_lock(&tasklist_lock); 995 read_lock(&tasklist_lock);
1002 p = find_task_by_pid_ns(pid, ns); 996 p = find_task_by_vpid(pid);
1003 retval = -ESRCH; 997 retval = -ESRCH;
1004 if (p) { 998 if (p) {
1005 retval = security_task_getpgid(p); 999 retval = security_task_getpgid(p);
1006 if (!retval) 1000 if (!retval)
1007 retval = task_pgrp_nr_ns(p, ns); 1001 retval = task_pgrp_vnr(p);
1008 } 1002 }
1009 read_unlock(&tasklist_lock); 1003 read_unlock(&tasklist_lock);
1010 return retval; 1004 return retval;
@@ -1028,19 +1022,16 @@ asmlinkage long sys_getsid(pid_t pid)
1028 else { 1022 else {
1029 int retval; 1023 int retval;
1030 struct task_struct *p; 1024 struct task_struct *p;
1031 struct pid_namespace *ns;
1032
1033 ns = current->nsproxy->pid_ns;
1034 1025
1035 read_lock(&tasklist_lock); 1026 rcu_read_lock();
1036 p = find_task_by_pid_ns(pid, ns); 1027 p = find_task_by_vpid(pid);
1037 retval = -ESRCH; 1028 retval = -ESRCH;
1038 if (p) { 1029 if (p) {
1039 retval = security_task_getsid(p); 1030 retval = security_task_getsid(p);
1040 if (!retval) 1031 if (!retval)
1041 retval = task_session_nr_ns(p, ns); 1032 retval = task_session_vnr(p);
1042 } 1033 }
1043 read_unlock(&tasklist_lock); 1034 rcu_read_unlock();
1044 return retval; 1035 return retval;
1045 } 1036 }
1046} 1037}
@@ -1048,35 +1039,29 @@ asmlinkage long sys_getsid(pid_t pid)
1048asmlinkage long sys_setsid(void) 1039asmlinkage long sys_setsid(void)
1049{ 1040{
1050 struct task_struct *group_leader = current->group_leader; 1041 struct task_struct *group_leader = current->group_leader;
1051 pid_t session; 1042 struct pid *sid = task_pid(group_leader);
1043 pid_t session = pid_vnr(sid);
1052 int err = -EPERM; 1044 int err = -EPERM;
1053 1045
1054 write_lock_irq(&tasklist_lock); 1046 write_lock_irq(&tasklist_lock);
1055
1056 /* Fail if I am already a session leader */ 1047 /* Fail if I am already a session leader */
1057 if (group_leader->signal->leader) 1048 if (group_leader->signal->leader)
1058 goto out; 1049 goto out;
1059 1050
1060 session = group_leader->pid;
1061 /* Fail if a process group id already exists that equals the 1051 /* Fail if a process group id already exists that equals the
1062 * proposed session id. 1052 * proposed session id.
1063 *
1064 * Don't check if session id == 1 because kernel threads use this
1065 * session id and so the check will always fail and make it so
1066 * init cannot successfully call setsid.
1067 */ 1053 */
1068 if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID, 1054 if (pid_task(sid, PIDTYPE_PGID))
1069 session, &init_pid_ns))
1070 goto out; 1055 goto out;
1071 1056
1072 group_leader->signal->leader = 1; 1057 group_leader->signal->leader = 1;
1073 __set_special_pids(session, session); 1058 __set_special_pids(sid);
1074 1059
1075 spin_lock(&group_leader->sighand->siglock); 1060 spin_lock(&group_leader->sighand->siglock);
1076 group_leader->signal->tty = NULL; 1061 group_leader->signal->tty = NULL;
1077 spin_unlock(&group_leader->sighand->siglock); 1062 spin_unlock(&group_leader->sighand->siglock);
1078 1063
1079 err = task_pgrp_vnr(group_leader); 1064 err = session;
1080out: 1065out:
1081 write_unlock_irq(&tasklist_lock); 1066 write_unlock_irq(&tasklist_lock);
1082 return err; 1067 return err;
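
The new sys_setsid() above expresses the "proposed session id must not already name a process group" rule directly on the struct pid: pid_task(sid, PIDTYPE_PGID) is non-NULL exactly when some task already uses that pid as its pgrp, which also removes the old numeric special case for session id 1. A condensed sketch of the core of the new path, under the same locking assumptions as the syscall:

	struct pid *sid = task_pid(group_leader);

	/* Fails iff some task is already attached to this pid as a pgrp,
	 * replacing the old lookup by number in the init namespace. */
	if (pid_task(sid, PIDTYPE_PGID))
		goto out;

	group_leader->signal->leader = 1;
	__set_special_pids(sid);	/* become session and group leader */
	err = pid_vnr(sid);		/* report the id in the caller's namespace */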
@@ -1145,16 +1130,16 @@ static int groups_to_user(gid_t __user *grouplist,
1145 struct group_info *group_info) 1130 struct group_info *group_info)
1146{ 1131{
1147 int i; 1132 int i;
1148 int count = group_info->ngroups; 1133 unsigned int count = group_info->ngroups;
1149 1134
1150 for (i = 0; i < group_info->nblocks; i++) { 1135 for (i = 0; i < group_info->nblocks; i++) {
1151 int cp_count = min(NGROUPS_PER_BLOCK, count); 1136 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1152 int off = i * NGROUPS_PER_BLOCK; 1137 unsigned int len = cp_count * sizeof(*grouplist);
1153 int len = cp_count * sizeof(*grouplist);
1154 1138
1155 if (copy_to_user(grouplist+off, group_info->blocks[i], len)) 1139 if (copy_to_user(grouplist, group_info->blocks[i], len))
1156 return -EFAULT; 1140 return -EFAULT;
1157 1141
1142 grouplist += NGROUPS_PER_BLOCK;
1158 count -= cp_count; 1143 count -= cp_count;
1159 } 1144 }
1160 return 0; 1145 return 0;
@@ -1165,16 +1150,16 @@ static int groups_from_user(struct group_info *group_info,
1165 gid_t __user *grouplist) 1150 gid_t __user *grouplist)
1166{ 1151{
1167 int i; 1152 int i;
1168 int count = group_info->ngroups; 1153 unsigned int count = group_info->ngroups;
1169 1154
1170 for (i = 0; i < group_info->nblocks; i++) { 1155 for (i = 0; i < group_info->nblocks; i++) {
1171 int cp_count = min(NGROUPS_PER_BLOCK, count); 1156 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1172 int off = i * NGROUPS_PER_BLOCK; 1157 unsigned int len = cp_count * sizeof(*grouplist);
1173 int len = cp_count * sizeof(*grouplist);
1174 1158
1175 if (copy_from_user(group_info->blocks[i], grouplist+off, len)) 1159 if (copy_from_user(group_info->blocks[i], grouplist, len))
1176 return -EFAULT; 1160 return -EFAULT;
1177 1161
1162 grouplist += NGROUPS_PER_BLOCK;
1178 count -= cp_count; 1163 count -= cp_count;
1179 } 1164 }
1180 return 0; 1165 return 0;
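
groups_to_user()/groups_from_user() above now advance the user pointer one block at a time instead of recomputing an offset, and switch the counters to unsigned. The same chunked-copy pattern in a stand-alone user-space form (plain memcpy() standing in for copy_to_user(); the block size is illustrative only):

	#include <string.h>

	#define GROUPS_PER_BLOCK 32			/* illustrative block size */

	/* Copy 'count' ids out of an array of fixed-size blocks into 'dst'. */
	static int blocks_to_array(unsigned int *dst, unsigned int *blocks[],
				   unsigned int nblocks, unsigned int count)
	{
		unsigned int i;

		for (i = 0; i < nblocks; i++) {
			unsigned int n = count < GROUPS_PER_BLOCK ? count : GROUPS_PER_BLOCK;

			memcpy(dst, blocks[i], n * sizeof(*dst));
			dst += GROUPS_PER_BLOCK;	/* advance the destination, as the patch does */
			count -= n;
		}
		return 0;
	}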
@@ -1472,7 +1457,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1472 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1457 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1473 !capable(CAP_SYS_RESOURCE)) 1458 !capable(CAP_SYS_RESOURCE))
1474 return -EPERM; 1459 return -EPERM;
1475 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1460 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1476 return -EPERM; 1461 return -EPERM;
1477 1462
1478 retval = security_task_setrlimit(resource, &new_rlim); 1463 retval = security_task_setrlimit(resource, &new_rlim);
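
With the change above, the RLIMIT_NOFILE hard limit is checked against the runtime-tunable fs.nr_open sysctl rather than the compile-time NR_OPEN constant. A user-space sketch of raising the per-process limit up to whatever the current ceiling allows (behaviour assumed from the patched kernel; raising rlim_max still needs CAP_SYS_RESOURCE; minimal error handling):

	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl;
		unsigned long nr_open = 0;
		FILE *f = fopen("/proc/sys/fs/nr_open", "r");	/* the new ceiling */

		if (f) {
			if (fscanf(f, "%lu", &nr_open) != 1)
				nr_open = 0;
			fclose(f);
		}
		if (nr_open && getrlimit(RLIMIT_NOFILE, &rl) == 0) {
			rl.rlim_max = nr_open;	/* values above fs.nr_open now get -EPERM */
			rl.rlim_cur = nr_open;
			if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
				perror("setrlimit");
		}
		return 0;
	}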
@@ -1637,7 +1622,7 @@ asmlinkage long sys_umask(int mask)
1637 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1622 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1638 return mask; 1623 return mask;
1639} 1624}
1640 1625
1641asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1626asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1642 unsigned long arg4, unsigned long arg5) 1627 unsigned long arg4, unsigned long arg5)
1643{ 1628{
@@ -1742,6 +1727,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1742 error = prctl_set_seccomp(arg2); 1727 error = prctl_set_seccomp(arg2);
1743 break; 1728 break;
1744 1729
1730 case PR_CAPBSET_READ:
1731 if (!cap_valid(arg2))
1732 return -EINVAL;
1733 return !!cap_raised(current->cap_bset, arg2);
1734 case PR_CAPBSET_DROP:
1735#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
1736 return cap_prctl_drop(arg2);
1737#else
1738 return -EINVAL;
1739#endif
1740
1745 default: 1741 default:
1746 error = -EINVAL; 1742 error = -EINVAL;
1747 break; 1743 break;
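
The two new prctl() options above expose the per-task capability bounding set to user space: PR_CAPBSET_READ tests a single capability bit and PR_CAPBSET_DROP removes one (the drop path lives behind CONFIG_SECURITY_FILE_CAPABILITIES and requires CAP_SETPCAP). A user-space sketch, assuming the constants from <linux/prctl.h> and <linux/capability.h> on a patched kernel:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/capability.h>

	#ifndef PR_CAPBSET_READ
	#define PR_CAPBSET_READ	23	/* assumed values if the headers predate this patch */
	#define PR_CAPBSET_DROP	24
	#endif

	int main(void)
	{
		int have = prctl(PR_CAPBSET_READ, CAP_NET_RAW, 0, 0, 0);

		printf("CAP_NET_RAW in bounding set: %d\n", have);

		/* Permanently remove it for this task and its children (needs CAP_SETPCAP). */
		if (prctl(PR_CAPBSET_DROP, CAP_NET_RAW, 0, 0, 0) != 0)
			perror("PR_CAPBSET_DROP");

		return 0;
	}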
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index beee5b3b68a2..5b9b467de070 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -154,7 +154,10 @@ cond_syscall(sys_ioprio_get);
154 154
155/* New file descriptors */ 155/* New file descriptors */
156cond_syscall(sys_signalfd); 156cond_syscall(sys_signalfd);
157cond_syscall(sys_timerfd);
158cond_syscall(compat_sys_signalfd); 157cond_syscall(compat_sys_signalfd);
159cond_syscall(compat_sys_timerfd); 158cond_syscall(sys_timerfd_create);
159cond_syscall(sys_timerfd_settime);
160cond_syscall(sys_timerfd_gettime);
161cond_syscall(compat_sys_timerfd_settime);
162cond_syscall(compat_sys_timerfd_gettime);
160cond_syscall(sys_eventfd); 163cond_syscall(sys_eventfd);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7cb1ac3e6fff..8b7e95411795 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,7 +37,6 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/security.h>
41#include <linux/initrd.h> 40#include <linux/initrd.h>
42#include <linux/times.h> 41#include <linux/times.h>
43#include <linux/limits.h> 42#include <linux/limits.h>
@@ -67,14 +66,13 @@ extern int sysctl_overcommit_memory;
67extern int sysctl_overcommit_ratio; 66extern int sysctl_overcommit_ratio;
68extern int sysctl_panic_on_oom; 67extern int sysctl_panic_on_oom;
69extern int sysctl_oom_kill_allocating_task; 68extern int sysctl_oom_kill_allocating_task;
69extern int sysctl_oom_dump_tasks;
70extern int max_threads; 70extern int max_threads;
71extern int core_uses_pid; 71extern int core_uses_pid;
72extern int suid_dumpable; 72extern int suid_dumpable;
73extern char core_pattern[]; 73extern char core_pattern[];
74extern int pid_max; 74extern int pid_max;
75extern int min_free_kbytes; 75extern int min_free_kbytes;
76extern int printk_ratelimit_jiffies;
77extern int printk_ratelimit_burst;
78extern int pid_max_min, pid_max_max; 76extern int pid_max_min, pid_max_max;
79extern int sysctl_drop_caches; 77extern int sysctl_drop_caches;
80extern int percpu_pagelist_fraction; 78extern int percpu_pagelist_fraction;
@@ -84,8 +82,11 @@ extern int sysctl_stat_interval;
84extern int latencytop_enabled; 82extern int latencytop_enabled;
85 83
86/* Constants used for minimum and maximum */ 84/* Constants used for minimum and maximum */
87#ifdef CONFIG_DETECT_SOFTLOCKUP 85#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
88static int one = 1; 86static int one = 1;
87#endif
88
89#ifdef CONFIG_DETECT_SOFTLOCKUP
89static int sixty = 60; 90static int sixty = 60;
90#endif 91#endif
91 92
@@ -310,22 +311,6 @@ static struct ctl_table kern_table[] = {
310 .mode = 0644, 311 .mode = 0644,
311 .proc_handler = &proc_dointvec, 312 .proc_handler = &proc_dointvec,
312 }, 313 },
313 {
314 .ctl_name = CTL_UNNUMBERED,
315 .procname = "sched_rt_period_ms",
316 .data = &sysctl_sched_rt_period,
317 .maxlen = sizeof(unsigned int),
318 .mode = 0644,
319 .proc_handler = &proc_dointvec,
320 },
321 {
322 .ctl_name = CTL_UNNUMBERED,
323 .procname = "sched_rt_ratio",
324 .data = &sysctl_sched_rt_ratio,
325 .maxlen = sizeof(unsigned int),
326 .mode = 0644,
327 .proc_handler = &proc_dointvec,
328 },
329#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 314#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
330 { 315 {
331 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -347,6 +332,22 @@ static struct ctl_table kern_table[] = {
347#endif 332#endif
348 { 333 {
349 .ctl_name = CTL_UNNUMBERED, 334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "sched_rt_period_us",
336 .data = &sysctl_sched_rt_period,
337 .maxlen = sizeof(unsigned int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec,
340 },
341 {
342 .ctl_name = CTL_UNNUMBERED,
343 .procname = "sched_rt_runtime_us",
344 .data = &sysctl_sched_rt_runtime,
345 .maxlen = sizeof(int),
346 .mode = 0644,
347 .proc_handler = &proc_dointvec,
348 },
349 {
350 .ctl_name = CTL_UNNUMBERED,
350 .procname = "sched_compat_yield", 351 .procname = "sched_compat_yield",
351 .data = &sysctl_sched_compat_yield, 352 .data = &sysctl_sched_compat_yield,
352 .maxlen = sizeof(unsigned int), 353 .maxlen = sizeof(unsigned int),
@@ -416,15 +417,6 @@ static struct ctl_table kern_table[] = {
416 .proc_handler = &proc_dointvec, 417 .proc_handler = &proc_dointvec,
417 }, 418 },
418#endif 419#endif
419#ifdef CONFIG_SECURITY_CAPABILITIES
420 {
421 .procname = "cap-bound",
422 .data = &cap_bset,
423 .maxlen = sizeof(kernel_cap_t),
424 .mode = 0600,
425 .proc_handler = &proc_dointvec_bset,
426 },
427#endif /* def CONFIG_SECURITY_CAPABILITIES */
428#ifdef CONFIG_BLK_DEV_INITRD 420#ifdef CONFIG_BLK_DEV_INITRD
429 { 421 {
430 .ctl_name = KERN_REALROOTDEV, 422 .ctl_name = KERN_REALROOTDEV,
@@ -496,14 +488,6 @@ static struct ctl_table kern_table[] = {
496 .mode = 0644, 488 .mode = 0644,
497 .proc_handler = &proc_dointvec, 489 .proc_handler = &proc_dointvec,
498 }, 490 },
499 {
500 .ctl_name = KERN_PRINTK,
501 .procname = "printk",
502 .data = &console_loglevel,
503 .maxlen = 4*sizeof(int),
504 .mode = 0644,
505 .proc_handler = &proc_dointvec,
506 },
507#ifdef CONFIG_KMOD 491#ifdef CONFIG_KMOD
508 { 492 {
509 .ctl_name = KERN_MODPROBE, 493 .ctl_name = KERN_MODPROBE,
@@ -650,6 +634,15 @@ static struct ctl_table kern_table[] = {
650 .mode = 0644, 634 .mode = 0644,
651 .proc_handler = &proc_dointvec, 635 .proc_handler = &proc_dointvec,
652 }, 636 },
637#if defined CONFIG_PRINTK
638 {
639 .ctl_name = KERN_PRINTK,
640 .procname = "printk",
641 .data = &console_loglevel,
642 .maxlen = 4*sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
653 { 646 {
654 .ctl_name = KERN_PRINTK_RATELIMIT, 647 .ctl_name = KERN_PRINTK_RATELIMIT,
655 .procname = "printk_ratelimit", 648 .procname = "printk_ratelimit",
@@ -667,6 +660,7 @@ static struct ctl_table kern_table[] = {
667 .mode = 0644, 660 .mode = 0644,
668 .proc_handler = &proc_dointvec, 661 .proc_handler = &proc_dointvec,
669 }, 662 },
663#endif
670 { 664 {
671 .ctl_name = KERN_NGROUPS_MAX, 665 .ctl_name = KERN_NGROUPS_MAX,
672 .procname = "ngroups_max", 666 .procname = "ngroups_max",
@@ -877,6 +871,14 @@ static struct ctl_table vm_table[] = {
877 .proc_handler = &proc_dointvec, 871 .proc_handler = &proc_dointvec,
878 }, 872 },
879 { 873 {
874 .ctl_name = CTL_UNNUMBERED,
875 .procname = "oom_dump_tasks",
876 .data = &sysctl_oom_dump_tasks,
877 .maxlen = sizeof(sysctl_oom_dump_tasks),
878 .mode = 0644,
879 .proc_handler = &proc_dointvec,
880 },
881 {
880 .ctl_name = VM_OVERCOMMIT_RATIO, 882 .ctl_name = VM_OVERCOMMIT_RATIO,
881 .procname = "overcommit_ratio", 883 .procname = "overcommit_ratio",
882 .data = &sysctl_overcommit_ratio, 884 .data = &sysctl_overcommit_ratio,
@@ -976,10 +978,10 @@ static struct ctl_table vm_table[] = {
976 { 978 {
977 .ctl_name = CTL_UNNUMBERED, 979 .ctl_name = CTL_UNNUMBERED,
978 .procname = "nr_overcommit_hugepages", 980 .procname = "nr_overcommit_hugepages",
979 .data = &nr_overcommit_huge_pages, 981 .data = &sysctl_overcommit_huge_pages,
980 .maxlen = sizeof(nr_overcommit_huge_pages), 982 .maxlen = sizeof(sysctl_overcommit_huge_pages),
981 .mode = 0644, 983 .mode = 0644,
982 .proc_handler = &proc_doulongvec_minmax, 984 .proc_handler = &hugetlb_overcommit_handler,
983 }, 985 },
984#endif 986#endif
985 { 987 {
@@ -1150,6 +1152,19 @@ static struct ctl_table vm_table[] = {
1150 .extra1 = &zero, 1152 .extra1 = &zero,
1151 }, 1153 },
1152#endif 1154#endif
1155#ifdef CONFIG_HIGHMEM
1156 {
1157 .ctl_name = CTL_UNNUMBERED,
1158 .procname = "highmem_is_dirtyable",
1159 .data = &vm_highmem_is_dirtyable,
1160 .maxlen = sizeof(vm_highmem_is_dirtyable),
1161 .mode = 0644,
1162 .proc_handler = &proc_dointvec_minmax,
1163 .strategy = &sysctl_intvec,
1164 .extra1 = &zero,
1165 .extra2 = &one,
1166 },
1167#endif
1153/* 1168/*
1154 * NOTE: do not add new entries to this table unless you have read 1169 * NOTE: do not add new entries to this table unless you have read
1155 * Documentation/sysctl/ctl_unnumbered.txt 1170 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1196,6 +1211,14 @@ static struct ctl_table fs_table[] = {
1196 .proc_handler = &proc_dointvec, 1211 .proc_handler = &proc_dointvec,
1197 }, 1212 },
1198 { 1213 {
1214 .ctl_name = CTL_UNNUMBERED,
1215 .procname = "nr_open",
1216 .data = &sysctl_nr_open,
1217 .maxlen = sizeof(int),
1218 .mode = 0644,
1219 .proc_handler = &proc_dointvec,
1220 },
1221 {
1199 .ctl_name = FS_DENTRY, 1222 .ctl_name = FS_DENTRY,
1200 .procname = "dentry-state", 1223 .procname = "dentry-state",
1201 .data = &dentry_stat, 1224 .data = &dentry_stat,
@@ -2080,26 +2103,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2080 return 0; 2103 return 0;
2081} 2104}
2082 2105
2083#ifdef CONFIG_SECURITY_CAPABILITIES
2084/*
2085 * init may raise the set.
2086 */
2087
2088int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
2089 void __user *buffer, size_t *lenp, loff_t *ppos)
2090{
2091 int op;
2092
2093 if (write && !capable(CAP_SYS_MODULE)) {
2094 return -EPERM;
2095 }
2096
2097 op = is_global_init(current) ? OP_SET : OP_AND;
2098 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
2099 do_proc_dointvec_bset_conv,&op);
2100}
2101#endif /* def CONFIG_SECURITY_CAPABILITIES */
2102
2103/* 2106/*
2104 * Taint values can only be increased 2107 * Taint values can only be increased
2105 */ 2108 */
@@ -2484,7 +2487,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2484 pid_t tmp; 2487 pid_t tmp;
2485 int r; 2488 int r;
2486 2489
2487 tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns); 2490 tmp = pid_vnr(cad_pid);
2488 2491
2489 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2492 r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
2490 lenp, ppos, NULL, NULL); 2493 lenp, ppos, NULL, NULL);
@@ -2513,12 +2516,6 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2513 return -ENOSYS; 2516 return -ENOSYS;
2514} 2517}
2515 2518
2516int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
2517 void __user *buffer, size_t *lenp, loff_t *ppos)
2518{
2519 return -ENOSYS;
2520}
2521
2522int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2519int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
2523 void __user *buffer, size_t *lenp, loff_t *ppos) 2520 void __user *buffer, size_t *lenp, loff_t *ppos)
2524{ 2521{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c3206fa50048..c09350d564f2 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -8,10 +8,10 @@
8struct trans_ctl_table { 8struct trans_ctl_table {
9 int ctl_name; 9 int ctl_name;
10 const char *procname; 10 const char *procname;
11 struct trans_ctl_table *child; 11 const struct trans_ctl_table *child;
12}; 12};
13 13
14static struct trans_ctl_table trans_random_table[] = { 14static const struct trans_ctl_table trans_random_table[] = {
15 { RANDOM_POOLSIZE, "poolsize" }, 15 { RANDOM_POOLSIZE, "poolsize" },
16 { RANDOM_ENTROPY_COUNT, "entropy_avail" }, 16 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
17 { RANDOM_READ_THRESH, "read_wakeup_threshold" }, 17 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
@@ -21,13 +21,13 @@ static struct trans_ctl_table trans_random_table[] = {
21 {} 21 {}
22}; 22};
23 23
24static struct trans_ctl_table trans_pty_table[] = { 24static const struct trans_ctl_table trans_pty_table[] = {
25 { PTY_MAX, "max" }, 25 { PTY_MAX, "max" },
26 { PTY_NR, "nr" }, 26 { PTY_NR, "nr" },
27 {} 27 {}
28}; 28};
29 29
30static struct trans_ctl_table trans_kern_table[] = { 30static const struct trans_ctl_table trans_kern_table[] = {
31 { KERN_OSTYPE, "ostype" }, 31 { KERN_OSTYPE, "ostype" },
32 { KERN_OSRELEASE, "osrelease" }, 32 { KERN_OSRELEASE, "osrelease" },
33 /* KERN_OSREV not used */ 33 /* KERN_OSREV not used */
@@ -37,10 +37,6 @@ static struct trans_ctl_table trans_kern_table[] = {
37 { KERN_NODENAME, "hostname" }, 37 { KERN_NODENAME, "hostname" },
38 { KERN_DOMAINNAME, "domainname" }, 38 { KERN_DOMAINNAME, "domainname" },
39 39
40#ifdef CONFIG_SECURITY_CAPABILITIES
41 { KERN_CAP_BSET, "cap-bound" },
42#endif /* def CONFIG_SECURITY_CAPABILITIES */
43
44 { KERN_PANIC, "panic" }, 40 { KERN_PANIC, "panic" },
45 { KERN_REALROOTDEV, "real-root-dev" }, 41 { KERN_REALROOTDEV, "real-root-dev" },
46 42
@@ -111,7 +107,7 @@ static struct trans_ctl_table trans_kern_table[] = {
111 {} 107 {}
112}; 108};
113 109
114static struct trans_ctl_table trans_vm_table[] = { 110static const struct trans_ctl_table trans_vm_table[] = {
115 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, 111 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
116 { VM_PAGE_CLUSTER, "page-cluster" }, 112 { VM_PAGE_CLUSTER, "page-cluster" },
117 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, 113 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
@@ -143,7 +139,7 @@ static struct trans_ctl_table trans_vm_table[] = {
143 {} 139 {}
144}; 140};
145 141
146static struct trans_ctl_table trans_net_core_table[] = { 142static const struct trans_ctl_table trans_net_core_table[] = {
147 { NET_CORE_WMEM_MAX, "wmem_max" }, 143 { NET_CORE_WMEM_MAX, "wmem_max" },
148 { NET_CORE_RMEM_MAX, "rmem_max" }, 144 { NET_CORE_RMEM_MAX, "rmem_max" },
149 { NET_CORE_WMEM_DEFAULT, "wmem_default" }, 145 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
@@ -169,14 +165,14 @@ static struct trans_ctl_table trans_net_core_table[] = {
169 {}, 165 {},
170}; 166};
171 167
172static struct trans_ctl_table trans_net_unix_table[] = { 168static const struct trans_ctl_table trans_net_unix_table[] = {
173 /* NET_UNIX_DESTROY_DELAY unused */ 169 /* NET_UNIX_DESTROY_DELAY unused */
174 /* NET_UNIX_DELETE_DELAY unused */ 170 /* NET_UNIX_DELETE_DELAY unused */
175 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, 171 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
176 {} 172 {}
177}; 173};
178 174
179static struct trans_ctl_table trans_net_ipv4_route_table[] = { 175static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
180 { NET_IPV4_ROUTE_FLUSH, "flush" }, 176 { NET_IPV4_ROUTE_FLUSH, "flush" },
181 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, 177 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
182 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, 178 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
@@ -199,7 +195,7 @@ static struct trans_ctl_table trans_net_ipv4_route_table[] = {
199 {} 195 {}
200}; 196};
201 197
202static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { 198static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
203 { NET_IPV4_CONF_FORWARDING, "forwarding" }, 199 { NET_IPV4_CONF_FORWARDING, "forwarding" },
204 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, 200 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
205 201
@@ -226,14 +222,14 @@ static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
226 {} 222 {}
227}; 223};
228 224
229static struct trans_ctl_table trans_net_ipv4_conf_table[] = { 225static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
230 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, 226 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
231 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, 227 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
232 { 0, NULL, trans_net_ipv4_conf_vars_table }, 228 { 0, NULL, trans_net_ipv4_conf_vars_table },
233 {} 229 {}
234}; 230};
235 231
236static struct trans_ctl_table trans_net_neigh_vars_table[] = { 232static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
237 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, 233 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
238 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, 234 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
239 { NET_NEIGH_APP_SOLICIT, "app_solicit" }, 235 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
@@ -255,13 +251,13 @@ static struct trans_ctl_table trans_net_neigh_vars_table[] = {
255 {} 251 {}
256}; 252};
257 253
258static struct trans_ctl_table trans_net_neigh_table[] = { 254static const struct trans_ctl_table trans_net_neigh_table[] = {
259 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, 255 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
260 { 0, NULL, trans_net_neigh_vars_table }, 256 { 0, NULL, trans_net_neigh_vars_table },
261 {} 257 {}
262}; 258};
263 259
264static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { 260static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
265 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, 261 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
266 262
267 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, 263 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
@@ -298,7 +294,7 @@ static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
298 {} 294 {}
299}; 295};
300 296
301static struct trans_ctl_table trans_net_ipv4_table[] = { 297static const struct trans_ctl_table trans_net_ipv4_table[] = {
302 { NET_IPV4_FORWARD, "ip_forward" }, 298 { NET_IPV4_FORWARD, "ip_forward" },
303 { NET_IPV4_DYNADDR, "ip_dynaddr" }, 299 { NET_IPV4_DYNADDR, "ip_dynaddr" },
304 300
@@ -397,13 +393,13 @@ static struct trans_ctl_table trans_net_ipv4_table[] = {
397 {} 393 {}
398}; 394};
399 395
400static struct trans_ctl_table trans_net_ipx_table[] = { 396static const struct trans_ctl_table trans_net_ipx_table[] = {
401 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, 397 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
402 /* NET_IPX_FORWARDING unused */ 398 /* NET_IPX_FORWARDING unused */
403 {} 399 {}
404}; 400};
405 401
406static struct trans_ctl_table trans_net_atalk_table[] = { 402static const struct trans_ctl_table trans_net_atalk_table[] = {
407 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, 403 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
408 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, 404 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
409 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, 405 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
@@ -411,7 +407,7 @@ static struct trans_ctl_table trans_net_atalk_table[] = {
411 {}, 407 {},
412}; 408};
413 409
414static struct trans_ctl_table trans_net_netrom_table[] = { 410static const struct trans_ctl_table trans_net_netrom_table[] = {
415 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, 411 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
416 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, 412 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
417 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, 413 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
@@ -427,7 +423,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = {
427 {} 423 {}
428}; 424};
429 425
430static struct trans_ctl_table trans_net_ax25_param_table[] = { 426static const struct trans_ctl_table trans_net_ax25_param_table[] = {
431 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, 427 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
432 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, 428 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
433 { NET_AX25_BACKOFF_TYPE, "backoff_type" }, 429 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
@@ -445,12 +441,12 @@ static struct trans_ctl_table trans_net_ax25_param_table[] = {
445 {} 441 {}
446}; 442};
447 443
448static struct trans_ctl_table trans_net_ax25_table[] = { 444static const struct trans_ctl_table trans_net_ax25_table[] = {
449 { 0, NULL, trans_net_ax25_param_table }, 445 { 0, NULL, trans_net_ax25_param_table },
450 {} 446 {}
451}; 447};
452 448
453static struct trans_ctl_table trans_net_bridge_table[] = { 449static const struct trans_ctl_table trans_net_bridge_table[] = {
454 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, 450 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
455 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, 451 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
456 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, 452 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
@@ -459,7 +455,7 @@ static struct trans_ctl_table trans_net_bridge_table[] = {
459 {} 455 {}
460}; 456};
461 457
462static struct trans_ctl_table trans_net_rose_table[] = { 458static const struct trans_ctl_table trans_net_rose_table[] = {
463 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, 459 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
464 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, 460 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
465 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, 461 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
@@ -473,7 +469,7 @@ static struct trans_ctl_table trans_net_rose_table[] = {
473 {} 469 {}
474}; 470};
475 471
476static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { 472static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
477 { NET_IPV6_FORWARDING, "forwarding" }, 473 { NET_IPV6_FORWARDING, "forwarding" },
478 { NET_IPV6_HOP_LIMIT, "hop_limit" }, 474 { NET_IPV6_HOP_LIMIT, "hop_limit" },
479 { NET_IPV6_MTU, "mtu" }, 475 { NET_IPV6_MTU, "mtu" },
@@ -501,14 +497,14 @@ static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
501 {} 497 {}
502}; 498};
503 499
504static struct trans_ctl_table trans_net_ipv6_conf_table[] = { 500static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
505 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, 501 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
506 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, 502 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
507 { 0, NULL, trans_net_ipv6_conf_var_table }, 503 { 0, NULL, trans_net_ipv6_conf_var_table },
508 {} 504 {}
509}; 505};
510 506
511static struct trans_ctl_table trans_net_ipv6_route_table[] = { 507static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
512 { NET_IPV6_ROUTE_FLUSH, "flush" }, 508 { NET_IPV6_ROUTE_FLUSH, "flush" },
513 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, 509 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
514 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, 510 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
@@ -522,12 +518,12 @@ static struct trans_ctl_table trans_net_ipv6_route_table[] = {
522 {} 518 {}
523}; 519};
524 520
525static struct trans_ctl_table trans_net_ipv6_icmp_table[] = { 521static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
526 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, 522 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
527 {} 523 {}
528}; 524};
529 525
530static struct trans_ctl_table trans_net_ipv6_table[] = { 526static const struct trans_ctl_table trans_net_ipv6_table[] = {
531 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, 527 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
532 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, 528 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
533 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, 529 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
@@ -542,7 +538,7 @@ static struct trans_ctl_table trans_net_ipv6_table[] = {
542 {} 538 {}
543}; 539};
544 540
545static struct trans_ctl_table trans_net_x25_table[] = { 541static const struct trans_ctl_table trans_net_x25_table[] = {
546 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, 542 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
547 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, 543 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
548 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, 544 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
@@ -552,13 +548,13 @@ static struct trans_ctl_table trans_net_x25_table[] = {
552 {} 548 {}
553}; 549};
554 550
555static struct trans_ctl_table trans_net_tr_table[] = { 551static const struct trans_ctl_table trans_net_tr_table[] = {
556 { NET_TR_RIF_TIMEOUT, "rif_timeout" }, 552 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
557 {} 553 {}
558}; 554};
559 555
560 556
561static struct trans_ctl_table trans_net_decnet_conf_vars[] = { 557static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
562 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, 558 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
563 { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, 559 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
564 { NET_DECNET_CONF_DEV_T2, "t2" }, 560 { NET_DECNET_CONF_DEV_T2, "t2" },
@@ -566,12 +562,12 @@ static struct trans_ctl_table trans_net_decnet_conf_vars[] = {
566 {} 562 {}
567}; 563};
568 564
569static struct trans_ctl_table trans_net_decnet_conf[] = { 565static const struct trans_ctl_table trans_net_decnet_conf[] = {
570 { 0, NULL, trans_net_decnet_conf_vars }, 566 { 0, NULL, trans_net_decnet_conf_vars },
571 {} 567 {}
572}; 568};
573 569
574static struct trans_ctl_table trans_net_decnet_table[] = { 570static const struct trans_ctl_table trans_net_decnet_table[] = {
575 { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, 571 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
576 { NET_DECNET_NODE_ADDRESS, "node_address" }, 572 { NET_DECNET_NODE_ADDRESS, "node_address" },
577 { NET_DECNET_NODE_NAME, "node_name" }, 573 { NET_DECNET_NODE_NAME, "node_name" },
@@ -589,7 +585,7 @@ static struct trans_ctl_table trans_net_decnet_table[] = {
589 {} 585 {}
590}; 586};
591 587
592static struct trans_ctl_table trans_net_sctp_table[] = { 588static const struct trans_ctl_table trans_net_sctp_table[] = {
593 { NET_SCTP_RTO_INITIAL, "rto_initial" }, 589 { NET_SCTP_RTO_INITIAL, "rto_initial" },
594 { NET_SCTP_RTO_MIN, "rto_min" }, 590 { NET_SCTP_RTO_MIN, "rto_min" },
595 { NET_SCTP_RTO_MAX, "rto_max" }, 591 { NET_SCTP_RTO_MAX, "rto_max" },
@@ -610,7 +606,7 @@ static struct trans_ctl_table trans_net_sctp_table[] = {
610 {} 606 {}
611}; 607};
612 608
613static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { 609static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
614 { NET_LLC2_ACK_TIMEOUT, "ack" }, 610 { NET_LLC2_ACK_TIMEOUT, "ack" },
615 { NET_LLC2_P_TIMEOUT, "p" }, 611 { NET_LLC2_P_TIMEOUT, "p" },
616 { NET_LLC2_REJ_TIMEOUT, "rej" }, 612 { NET_LLC2_REJ_TIMEOUT, "rej" },
@@ -618,23 +614,23 @@ static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
618 {} 614 {}
619}; 615};
620 616
621static struct trans_ctl_table trans_net_llc_station_table[] = { 617static const struct trans_ctl_table trans_net_llc_station_table[] = {
622 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, 618 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
623 {} 619 {}
624}; 620};
625 621
626static struct trans_ctl_table trans_net_llc_llc2_table[] = { 622static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
627 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, 623 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
628 {} 624 {}
629}; 625};
630 626
631static struct trans_ctl_table trans_net_llc_table[] = { 627static const struct trans_ctl_table trans_net_llc_table[] = {
632 { NET_LLC2, "llc2", trans_net_llc_llc2_table }, 628 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
633 { NET_LLC_STATION, "station", trans_net_llc_station_table }, 629 { NET_LLC_STATION, "station", trans_net_llc_station_table },
634 {} 630 {}
635}; 631};
636 632
637static struct trans_ctl_table trans_net_netfilter_table[] = { 633static const struct trans_ctl_table trans_net_netfilter_table[] = {
638 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, 634 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
639 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, 635 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
640 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, 636 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
@@ -671,12 +667,12 @@ static struct trans_ctl_table trans_net_netfilter_table[] = {
671 {} 667 {}
672}; 668};
673 669
674static struct trans_ctl_table trans_net_dccp_table[] = { 670static const struct trans_ctl_table trans_net_dccp_table[] = {
675 { NET_DCCP_DEFAULT, "default" }, 671 { NET_DCCP_DEFAULT, "default" },
676 {} 672 {}
677}; 673};
678 674
679static struct trans_ctl_table trans_net_irda_table[] = { 675static const struct trans_ctl_table trans_net_irda_table[] = {
680 { NET_IRDA_DISCOVERY, "discovery" }, 676 { NET_IRDA_DISCOVERY, "discovery" },
681 { NET_IRDA_DEVNAME, "devname" }, 677 { NET_IRDA_DEVNAME, "devname" },
682 { NET_IRDA_DEBUG, "debug" }, 678 { NET_IRDA_DEBUG, "debug" },
@@ -694,7 +690,7 @@ static struct trans_ctl_table trans_net_irda_table[] = {
694 {} 690 {}
695}; 691};
696 692
697static struct trans_ctl_table trans_net_table[] = { 693static const struct trans_ctl_table trans_net_table[] = {
698 { NET_CORE, "core", trans_net_core_table }, 694 { NET_CORE, "core", trans_net_core_table },
699 /* NET_ETHER not used */ 695 /* NET_ETHER not used */
700 /* NET_802 not used */ 696 /* NET_802 not used */
@@ -720,7 +716,7 @@ static struct trans_ctl_table trans_net_table[] = {
720 {} 716 {}
721}; 717};
722 718
723static struct trans_ctl_table trans_fs_quota_table[] = { 719static const struct trans_ctl_table trans_fs_quota_table[] = {
724 { FS_DQ_LOOKUPS, "lookups" }, 720 { FS_DQ_LOOKUPS, "lookups" },
725 { FS_DQ_DROPS, "drops" }, 721 { FS_DQ_DROPS, "drops" },
726 { FS_DQ_READS, "reads" }, 722 { FS_DQ_READS, "reads" },
@@ -733,7 +729,7 @@ static struct trans_ctl_table trans_fs_quota_table[] = {
733 {} 729 {}
734}; 730};
735 731
736static struct trans_ctl_table trans_fs_xfs_table[] = { 732static const struct trans_ctl_table trans_fs_xfs_table[] = {
737 { XFS_RESTRICT_CHOWN, "restrict_chown" }, 733 { XFS_RESTRICT_CHOWN, "restrict_chown" },
738 { XFS_SGID_INHERIT, "irix_sgid_inherit" }, 734 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
739 { XFS_SYMLINK_MODE, "irix_symlink_mode" }, 735 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
@@ -754,24 +750,24 @@ static struct trans_ctl_table trans_fs_xfs_table[] = {
754 {} 750 {}
755}; 751};
756 752
757static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { 753static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
758 { 1, "hb_ctl_path" }, 754 { 1, "hb_ctl_path" },
759 {} 755 {}
760}; 756};
761 757
762static struct trans_ctl_table trans_fs_ocfs2_table[] = { 758static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
763 { 1, "nm", trans_fs_ocfs2_nm_table }, 759 { 1, "nm", trans_fs_ocfs2_nm_table },
764 {} 760 {}
765}; 761};
766 762
767static struct trans_ctl_table trans_inotify_table[] = { 763static const struct trans_ctl_table trans_inotify_table[] = {
768 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, 764 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
769 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, 765 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
770 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, 766 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
771 {} 767 {}
772}; 768};
773 769
774static struct trans_ctl_table trans_fs_table[] = { 770static const struct trans_ctl_table trans_fs_table[] = {
775 { FS_NRINODE, "inode-nr" }, 771 { FS_NRINODE, "inode-nr" },
776 { FS_STATINODE, "inode-state" }, 772 { FS_STATINODE, "inode-state" },
777 /* FS_MAXINODE unused */ 773 /* FS_MAXINODE unused */
@@ -797,11 +793,11 @@ static struct trans_ctl_table trans_fs_table[] = {
797 {} 793 {}
798}; 794};
799 795
800static struct trans_ctl_table trans_debug_table[] = { 796static const struct trans_ctl_table trans_debug_table[] = {
801 {} 797 {}
802}; 798};
803 799
804static struct trans_ctl_table trans_cdrom_table[] = { 800static const struct trans_ctl_table trans_cdrom_table[] = {
805 { DEV_CDROM_INFO, "info" }, 801 { DEV_CDROM_INFO, "info" },
806 { DEV_CDROM_AUTOCLOSE, "autoclose" }, 802 { DEV_CDROM_AUTOCLOSE, "autoclose" },
807 { DEV_CDROM_AUTOEJECT, "autoeject" }, 803 { DEV_CDROM_AUTOEJECT, "autoeject" },
@@ -811,12 +807,12 @@ static struct trans_ctl_table trans_cdrom_table[] = {
811 {} 807 {}
812}; 808};
813 809
814static struct trans_ctl_table trans_ipmi_table[] = { 810static const struct trans_ctl_table trans_ipmi_table[] = {
815 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, 811 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
816 {} 812 {}
817}; 813};
818 814
819static struct trans_ctl_table trans_mac_hid_files[] = { 815static const struct trans_ctl_table trans_mac_hid_files[] = {
820 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ 816 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
821 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ 817 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
822 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, 818 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
@@ -826,35 +822,35 @@ static struct trans_ctl_table trans_mac_hid_files[] = {
826 {} 822 {}
827}; 823};
828 824
829static struct trans_ctl_table trans_raid_table[] = { 825static const struct trans_ctl_table trans_raid_table[] = {
830 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, 826 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
831 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, 827 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
832 {} 828 {}
833}; 829};
834 830
835static struct trans_ctl_table trans_scsi_table[] = { 831static const struct trans_ctl_table trans_scsi_table[] = {
836 { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, 832 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
837 {} 833 {}
838}; 834};
839 835
840static struct trans_ctl_table trans_parport_default_table[] = { 836static const struct trans_ctl_table trans_parport_default_table[] = {
841 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, 837 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
842 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, 838 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
843 {} 839 {}
844}; 840};
845 841
846static struct trans_ctl_table trans_parport_device_table[] = { 842static const struct trans_ctl_table trans_parport_device_table[] = {
847 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, 843 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
848 {} 844 {}
849}; 845};
850 846
851static struct trans_ctl_table trans_parport_devices_table[] = { 847static const struct trans_ctl_table trans_parport_devices_table[] = {
852 { DEV_PARPORT_DEVICES_ACTIVE, "active" }, 848 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
853 { 0, NULL, trans_parport_device_table }, 849 { 0, NULL, trans_parport_device_table },
854 {} 850 {}
855}; 851};
856 852
857static struct trans_ctl_table trans_parport_parport_table[] = { 853static const struct trans_ctl_table trans_parport_parport_table[] = {
858 { DEV_PARPORT_SPINTIME, "spintime" }, 854 { DEV_PARPORT_SPINTIME, "spintime" },
859 { DEV_PARPORT_BASE_ADDR, "base-addr" }, 855 { DEV_PARPORT_BASE_ADDR, "base-addr" },
860 { DEV_PARPORT_IRQ, "irq" }, 856 { DEV_PARPORT_IRQ, "irq" },
@@ -868,13 +864,13 @@ static struct trans_ctl_table trans_parport_parport_table[] = {
868 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, 864 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
869 {} 865 {}
870}; 866};
871static struct trans_ctl_table trans_parport_table[] = { 867static const struct trans_ctl_table trans_parport_table[] = {
872 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, 868 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
873 { 0, NULL, trans_parport_parport_table }, 869 { 0, NULL, trans_parport_parport_table },
874 {} 870 {}
875}; 871};
876 872
877static struct trans_ctl_table trans_dev_table[] = { 873static const struct trans_ctl_table trans_dev_table[] = {
878 { DEV_CDROM, "cdrom", trans_cdrom_table }, 874 { DEV_CDROM, "cdrom", trans_cdrom_table },
879 /* DEV_HWMON unused */ 875 /* DEV_HWMON unused */
880 { DEV_PARPORT, "parport", trans_parport_table }, 876 { DEV_PARPORT, "parport", trans_parport_table },
@@ -885,19 +881,19 @@ static struct trans_ctl_table trans_dev_table[] = {
885 {} 881 {}
886}; 882};
887 883
888static struct trans_ctl_table trans_bus_isa_table[] = { 884static const struct trans_ctl_table trans_bus_isa_table[] = {
889 { BUS_ISA_MEM_BASE, "membase" }, 885 { BUS_ISA_MEM_BASE, "membase" },
890 { BUS_ISA_PORT_BASE, "portbase" }, 886 { BUS_ISA_PORT_BASE, "portbase" },
891 { BUS_ISA_PORT_SHIFT, "portshift" }, 887 { BUS_ISA_PORT_SHIFT, "portshift" },
892 {} 888 {}
893}; 889};
894 890
895static struct trans_ctl_table trans_bus_table[] = { 891static const struct trans_ctl_table trans_bus_table[] = {
896 { CTL_BUS_ISA, "isa", trans_bus_isa_table }, 892 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
897 {} 893 {}
898}; 894};
899 895
900static struct trans_ctl_table trans_arlan_conf_table0[] = { 896static const struct trans_ctl_table trans_arlan_conf_table0[] = {
901 { 1, "spreadingCode" }, 897 { 1, "spreadingCode" },
902 { 2, "channelNumber" }, 898 { 2, "channelNumber" },
903 { 3, "scramblingDisable" }, 899 { 3, "scramblingDisable" },
@@ -968,7 +964,7 @@ static struct trans_ctl_table trans_arlan_conf_table0[] = {
968 {} 964 {}
969}; 965};
970 966
971static struct trans_ctl_table trans_arlan_conf_table1[] = { 967static const struct trans_ctl_table trans_arlan_conf_table1[] = {
972 { 1, "spreadingCode" }, 968 { 1, "spreadingCode" },
973 { 2, "channelNumber" }, 969 { 2, "channelNumber" },
974 { 3, "scramblingDisable" }, 970 { 3, "scramblingDisable" },
@@ -1039,7 +1035,7 @@ static struct trans_ctl_table trans_arlan_conf_table1[] = {
1039 {} 1035 {}
1040}; 1036};
1041 1037
1042static struct trans_ctl_table trans_arlan_conf_table2[] = { 1038static const struct trans_ctl_table trans_arlan_conf_table2[] = {
1043 { 1, "spreadingCode" }, 1039 { 1, "spreadingCode" },
1044 { 2, "channelNumber" }, 1040 { 2, "channelNumber" },
1045 { 3, "scramblingDisable" }, 1041 { 3, "scramblingDisable" },
@@ -1110,7 +1106,7 @@ static struct trans_ctl_table trans_arlan_conf_table2[] = {
1110 {} 1106 {}
1111}; 1107};
1112 1108
1113static struct trans_ctl_table trans_arlan_conf_table3[] = { 1109static const struct trans_ctl_table trans_arlan_conf_table3[] = {
1114 { 1, "spreadingCode" }, 1110 { 1, "spreadingCode" },
1115 { 2, "channelNumber" }, 1111 { 2, "channelNumber" },
1116 { 3, "scramblingDisable" }, 1112 { 3, "scramblingDisable" },
@@ -1181,7 +1177,7 @@ static struct trans_ctl_table trans_arlan_conf_table3[] = {
1181 {} 1177 {}
1182}; 1178};
1183 1179
1184static struct trans_ctl_table trans_arlan_table[] = { 1180static const struct trans_ctl_table trans_arlan_table[] = {
1185 { 1, "arlan0", trans_arlan_conf_table0 }, 1181 { 1, "arlan0", trans_arlan_conf_table0 },
1186 { 2, "arlan1", trans_arlan_conf_table1 }, 1182 { 2, "arlan1", trans_arlan_conf_table1 },
1187 { 3, "arlan2", trans_arlan_conf_table2 }, 1183 { 3, "arlan2", trans_arlan_conf_table2 },
@@ -1189,13 +1185,13 @@ static struct trans_ctl_table trans_arlan_table[] = {
1189 {} 1185 {}
1190}; 1186};
1191 1187
1192static struct trans_ctl_table trans_s390dbf_table[] = { 1188static const struct trans_ctl_table trans_s390dbf_table[] = {
1193 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, 1189 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1194 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, 1190 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1195 {} 1191 {}
1196}; 1192};
1197 1193
1198static struct trans_ctl_table trans_sunrpc_table[] = { 1194static const struct trans_ctl_table trans_sunrpc_table[] = {
1199 { CTL_RPCDEBUG, "rpc_debug" }, 1195 { CTL_RPCDEBUG, "rpc_debug" },
1200 { CTL_NFSDEBUG, "nfs_debug" }, 1196 { CTL_NFSDEBUG, "nfs_debug" },
1201 { CTL_NFSDDEBUG, "nfsd_debug" }, 1197 { CTL_NFSDDEBUG, "nfsd_debug" },
@@ -1207,7 +1203,7 @@ static struct trans_ctl_table trans_sunrpc_table[] = {
1207 {} 1203 {}
1208}; 1204};
1209 1205
1210static struct trans_ctl_table trans_pm_table[] = { 1206static const struct trans_ctl_table trans_pm_table[] = {
1211 { 1 /* CTL_PM_SUSPEND */, "suspend" }, 1207 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1212 { 2 /* CTL_PM_CMODE */, "cmode" }, 1208 { 2 /* CTL_PM_CMODE */, "cmode" },
1213 { 3 /* CTL_PM_P0 */, "p0" }, 1209 { 3 /* CTL_PM_P0 */, "p0" },
@@ -1215,13 +1211,13 @@ static struct trans_ctl_table trans_pm_table[] = {
1215 {} 1211 {}
1216}; 1212};
1217 1213
1218static struct trans_ctl_table trans_frv_table[] = { 1214static const struct trans_ctl_table trans_frv_table[] = {
1219 { 1, "cache-mode" }, 1215 { 1, "cache-mode" },
1220 { 2, "pin-cxnr" }, 1216 { 2, "pin-cxnr" },
1221 {} 1217 {}
1222}; 1218};
1223 1219
1224static struct trans_ctl_table trans_root_table[] = { 1220static const struct trans_ctl_table trans_root_table[] = {
1225 { CTL_KERN, "kernel", trans_kern_table }, 1221 { CTL_KERN, "kernel", trans_kern_table },
1226 { CTL_VM, "vm", trans_vm_table }, 1222 { CTL_VM, "vm", trans_vm_table },
1227 { CTL_NET, "net", trans_net_table }, 1223 { CTL_NET, "net", trans_net_table },
@@ -1265,15 +1261,14 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1265 return table; 1261 return table;
1266} 1262}
1267 1263
1268static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) 1264static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1269{ 1265{
1270 struct ctl_table *test; 1266 struct ctl_table *test;
1271 struct trans_ctl_table *ref; 1267 const struct trans_ctl_table *ref;
1272 int depth, cur_depth; 1268 int cur_depth;
1273 1269
1274 depth = sysctl_depth(table); 1270 cur_depth = sysctl_depth(table);
1275 1271
1276 cur_depth = depth;
1277 ref = trans_root_table; 1272 ref = trans_root_table;
1278repeat: 1273repeat:
1279 test = sysctl_parent(table, cur_depth); 1274 test = sysctl_parent(table, cur_depth);
@@ -1441,7 +1436,7 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
1441 1436
1442static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) 1437static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1443{ 1438{
1444 struct trans_ctl_table *ref; 1439 const struct trans_ctl_table *ref;
1445 1440
1446 ref = sysctl_binary_lookup(table); 1441 ref = sysctl_binary_lookup(table);
1447 if (table->ctl_name && !ref) 1442 if (table->ctl_name && !ref)
@@ -1498,9 +1493,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1498 (table->strategy == sysctl_ms_jiffies) || 1493 (table->strategy == sysctl_ms_jiffies) ||
1499 (table->proc_handler == proc_dostring) || 1494 (table->proc_handler == proc_dostring) ||
1500 (table->proc_handler == proc_dointvec) || 1495 (table->proc_handler == proc_dointvec) ||
1501#ifdef CONFIG_SECURITY_CAPABILITIES
1502 (table->proc_handler == proc_dointvec_bset) ||
1503#endif /* def CONFIG_SECURITY_CAPABILITIES */
1504 (table->proc_handler == proc_dointvec_minmax) || 1496 (table->proc_handler == proc_dointvec_minmax) ||
1505 (table->proc_handler == proc_dointvec_jiffies) || 1497 (table->proc_handler == proc_dointvec_jiffies) ||
1506 (table->proc_handler == proc_dointvec_userhz_jiffies) || 1498 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 88cdb109e13c..06b6395b45b2 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -135,6 +135,12 @@ static int test_jprobe(void)
135#ifdef CONFIG_KRETPROBES 135#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 136static u32 krph_val;
137 137
138static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{
140 krph_val = (rand1 / div_factor);
141 return 0;
142}
143
138static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) 144static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{ 145{
140 unsigned long ret = regs_return_value(regs); 146 unsigned long ret = regs_return_value(regs);
@@ -144,13 +150,19 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
144 printk(KERN_ERR "Kprobe smoke test failed: " 150 printk(KERN_ERR "Kprobe smoke test failed: "
145 "incorrect value in kretprobe handler\n"); 151 "incorrect value in kretprobe handler\n");
146 } 152 }
153 if (krph_val == 0) {
154 handler_errors++;
155 printk(KERN_ERR "Kprobe smoke test failed: "
156 "call to kretprobe entry handler failed\n");
157 }
147 158
148 krph_val = (rand1 / div_factor); 159 krph_val = rand1;
149 return 0; 160 return 0;
150} 161}
151 162
152static struct kretprobe rp = { 163static struct kretprobe rp = {
153 .handler = return_handler, 164 .handler = return_handler,
165 .entry_handler = entry_handler,
154 .kp.symbol_name = "kprobe_target" 166 .kp.symbol_name = "kprobe_target"
155}; 167};
156 168
@@ -167,7 +179,7 @@ static int test_kretprobe(void)
167 179
168 ret = kprobe_target(rand1); 180 ret = kprobe_target(rand1);
169 unregister_kretprobe(&rp); 181 unregister_kretprobe(&rp);
170 if (krph_val == 0) { 182 if (krph_val != rand1) {
171 printk(KERN_ERR "Kprobe smoke test failed: " 183 printk(KERN_ERR "Kprobe smoke test failed: "
172 "kretprobe handler not called\n"); 184 "kretprobe handler not called\n");
173 handler_errors++; 185 handler_errors++;
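
The updated smoke test above exercises the new kretprobe entry_handler hook: it runs when the probed function is entered, while .handler still runs on return, so per-call state can be handed from entry to exit. A kernel-module-context sketch of registering such a pair, using the per-instance data area from the same series (symbol name and timing printout are purely illustrative):

	#include <linux/kprobes.h>
	#include <linux/ktime.h>

	static int my_entry(struct kretprobe_instance *ri, struct pt_regs *regs)
	{
		*(ktime_t *)ri->data = ktime_get();	/* stash the entry timestamp */
		return 0;				/* 0 = track this instance */
	}

	static int my_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
	{
		ktime_t delta = ktime_sub(ktime_get(), *(ktime_t *)ri->data);

		printk(KERN_INFO "probed call took %lld ns\n",
		       (long long)ktime_to_ns(delta));
		return 0;
	}

	static struct kretprobe my_rp = {
		.entry_handler	= my_entry,
		.handler	= my_ret,
		.data_size	= sizeof(ktime_t),	/* per-instance scratch space */
		.kp.symbol_name	= "do_fork",		/* illustrative target */
	};

	/* register_kretprobe(&my_rp) in module init, unregister_kretprobe(&my_rp) on exit. */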
diff --git a/kernel/time.c b/kernel/time.c
index 4064c0566e77..a5ec013b6c80 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -39,6 +39,8 @@
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/unistd.h> 40#include <asm/unistd.h>
41 41
42#include "timeconst.h"
43
42/* 44/*
43 * The timezone where the local system is located. Used as a default by some 45 * The timezone where the local system is located. Used as a default by some
44 * programs who obtain this value by using gettimeofday. 46 * programs who obtain this value by using gettimeofday.
@@ -93,7 +95,8 @@ asmlinkage long sys_stime(time_t __user *tptr)
93 95
94#endif /* __ARCH_WANT_SYS_TIME */ 96#endif /* __ARCH_WANT_SYS_TIME */
95 97
96asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) 98asmlinkage long sys_gettimeofday(struct timeval __user *tv,
99 struct timezone __user *tz)
97{ 100{
98 if (likely(tv != NULL)) { 101 if (likely(tv != NULL)) {
99 struct timeval ktv; 102 struct timeval ktv;
@@ -118,7 +121,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
118 * hard to make the program warp the clock precisely n hours) or 121 * hard to make the program warp the clock precisely n hours) or
119 * compile in the timezone information into the kernel. Bad, bad.... 122 * compile in the timezone information into the kernel. Bad, bad....
120 * 123 *
121 * - TYT, 1992-01-01 124 * - TYT, 1992-01-01
122 * 125 *
123 * The best thing to do is to keep the CMOS clock in universal time (UTC) 126 * The best thing to do is to keep the CMOS clock in universal time (UTC)
124 * as real UNIX machines always do it. This avoids all headaches about 127 * as real UNIX machines always do it. This avoids all headaches about
@@ -240,7 +243,11 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
240#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) 243#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
241 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); 244 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
242#else 245#else
243 return (j * MSEC_PER_SEC) / HZ; 246# if BITS_PER_LONG == 32
247 return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
248# else
249 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
250# endif
244#endif 251#endif
245} 252}
246EXPORT_SYMBOL(jiffies_to_msecs); 253EXPORT_SYMBOL(jiffies_to_msecs);
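
The generic jiffies_to_msecs() branch above replaces the 32-bit multiply-then-divide with constants generated into timeconst.h: a scaled reciprocal of HZ, so that (MUL32 * j) >> SHR32 reproduces j * MSEC_PER_SEC / HZ without a runtime division. A small stand-alone check of the idea with hand-picked constants for HZ = 250 (chosen so the arithmetic is exact and easy to follow; these are not the values timeconst.pl emits):

	#include <stdio.h>
	#include <stdint.h>

	#define HZ		250
	#define MSEC_PER_SEC	1000

	/* Illustrative constants: 1000/250 == 4 exactly, so MUL32 is 4 in Q29. */
	#define HZ_TO_MSEC_SHR32	29
	#define HZ_TO_MSEC_MUL32	((uint64_t)4 << HZ_TO_MSEC_SHR32)

	int main(void)
	{
		uint32_t j;

		for (j = 0; j < 1000000; j++) {
			uint32_t fast = (uint32_t)((HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32);
			uint32_t slow = (uint32_t)(((uint64_t)j * MSEC_PER_SEC) / HZ);

			if (fast != slow) {
				printf("mismatch at j=%u: %u vs %u\n", j, fast, slow);
				return 1;
			}
		}
		printf("multiply+shift matches the divide over the tested range\n");
		return 0;
	}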
@@ -252,7 +259,11 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
252#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 259#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
253 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); 260 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
254#else 261#else
255 return (j * USEC_PER_SEC) / HZ; 262# if BITS_PER_LONG == 32
263 return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
264# else
265 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
266# endif
256#endif 267#endif
257} 268}
258EXPORT_SYMBOL(jiffies_to_usecs); 269EXPORT_SYMBOL(jiffies_to_usecs);
@@ -267,7 +278,7 @@ EXPORT_SYMBOL(jiffies_to_usecs);
267 * 278 *
268 * This function should be only used for timestamps returned by 279 * This function should be only used for timestamps returned by
269 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because 280 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
270 * it doesn't handle the better resolution of the later. 281 * it doesn't handle the better resolution of the latter.
271 */ 282 */
272struct timespec timespec_trunc(struct timespec t, unsigned gran) 283struct timespec timespec_trunc(struct timespec t, unsigned gran)
273{ 284{
@@ -315,7 +326,7 @@ EXPORT_SYMBOL_GPL(getnstimeofday);
315 * This algorithm was first published by Gauss (I think). 326 * This algorithm was first published by Gauss (I think).
316 * 327 *
317 * WARNING: this function will overflow on 2106-02-07 06:28:16 on 328 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
318 * machines were long is 32-bit! (However, as time_t is signed, we 329 * machines where long is 32-bit! (However, as time_t is signed, we
319 * will already get problems at other places on 2038-01-19 03:14:08) 330 * will already get problems at other places on 2038-01-19 03:14:08)
320 */ 331 */
321unsigned long 332unsigned long
@@ -352,7 +363,7 @@ EXPORT_SYMBOL(mktime);
352 * normalize to the timespec storage format 363 * normalize to the timespec storage format
353 * 364 *
354 * Note: The tv_nsec part is always in the range of 365 * Note: The tv_nsec part is always in the range of
355 * 0 <= tv_nsec < NSEC_PER_SEC 366 * 0 <= tv_nsec < NSEC_PER_SEC
356 * For negative values only the tv_sec field is negative ! 367 * For negative values only the tv_sec field is negative !
357 */ 368 */
358void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 369void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
@@ -453,12 +464,13 @@ unsigned long msecs_to_jiffies(const unsigned int m)
453 /* 464 /*
454 * Generic case - multiply, round and divide. But first 465 * Generic case - multiply, round and divide. But first
455 * check that if we are doing a net multiplication, that 466 * check that if we are doing a net multiplication, that
456 * we wouldnt overflow: 467 * we wouldn't overflow:
457 */ 468 */
458 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) 469 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
459 return MAX_JIFFY_OFFSET; 470 return MAX_JIFFY_OFFSET;
460 471
461 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; 472 return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
473 >> MSEC_TO_HZ_SHR32;
462#endif 474#endif
463} 475}
464EXPORT_SYMBOL(msecs_to_jiffies); 476EXPORT_SYMBOL(msecs_to_jiffies);
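
The converse conversion above adds an ADJ32 term before the shift so the result rounds up, matching the old (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC: with HZ = 250, one millisecond must still map to one jiffy even though 1 * 250 / 1000 truncates to zero. A tiny check of that rounding behaviour with illustrative fixed-point constants (again not the generated values):

	#include <assert.h>
	#include <stdint.h>

	/* Illustrative constants for HZ = 250 (1 jiffy = 4 ms):
	 * MUL32 encodes 1/4 in Q30, ADJ32 encodes +3/4 so the shift rounds up. */
	#define MSEC_TO_HZ_SHR32	30
	#define MSEC_TO_HZ_MUL32	((uint64_t)1 << (MSEC_TO_HZ_SHR32 - 2))
	#define MSEC_TO_HZ_ADJ32	(((uint64_t)3 << MSEC_TO_HZ_SHR32) / 4)

	int main(void)
	{
		uint32_t m;

		for (m = 0; m < 100000; m++) {
			uint32_t fast = (uint32_t)((MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
						   >> MSEC_TO_HZ_SHR32);
			uint32_t slow = (m * 250u + 999u) / 1000u;	/* old round-up divide */

			assert(fast == slow);
		}
		return 0;
	}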
@@ -472,7 +484,8 @@ unsigned long usecs_to_jiffies(const unsigned int u)
472#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 484#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
473 return u * (HZ / USEC_PER_SEC); 485 return u * (HZ / USEC_PER_SEC);
474#else 486#else
475 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; 487 return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
488 >> USEC_TO_HZ_SHR32;
476#endif 489#endif
477} 490}
478EXPORT_SYMBOL(usecs_to_jiffies); 491EXPORT_SYMBOL(usecs_to_jiffies);
@@ -566,7 +579,11 @@ EXPORT_SYMBOL(jiffies_to_timeval);
566clock_t jiffies_to_clock_t(long x) 579clock_t jiffies_to_clock_t(long x)
567{ 580{
568#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 581#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
582# if HZ < USER_HZ
583 return x * (USER_HZ / HZ);
584# else
569 return x / (HZ / USER_HZ); 585 return x / (HZ / USER_HZ);
586# endif
570#else 587#else
571 u64 tmp = (u64)x * TICK_NSEC; 588 u64 tmp = (u64)x * TICK_NSEC;
572 do_div(tmp, (NSEC_PER_SEC / USER_HZ)); 589 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
@@ -599,7 +616,14 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
599u64 jiffies_64_to_clock_t(u64 x) 616u64 jiffies_64_to_clock_t(u64 x)
600{ 617{
601#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 618#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
619# if HZ < USER_HZ
620 x *= USER_HZ;
621 do_div(x, HZ);
622# elif HZ > USER_HZ
602 do_div(x, HZ / USER_HZ); 623 do_div(x, HZ / USER_HZ);
624# else
625 /* Nothing to do */
626# endif
603#else 627#else
604 /* 628 /*
605 * There are better ways that don't overflow early, 629 * There are better ways that don't overflow early,
@@ -611,7 +635,6 @@ u64 jiffies_64_to_clock_t(u64 x)
611#endif 635#endif
612 return x; 636 return x;
613} 637}
614
615EXPORT_SYMBOL(jiffies_64_to_clock_t); 638EXPORT_SYMBOL(jiffies_64_to_clock_t);
616 639
617u64 nsec_to_clock_t(u64 x) 640u64 nsec_to_clock_t(u64 x)
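The two clock_t hunks above add an explicit HZ < USER_HZ branch. With integer macros, the old x / (HZ / USER_HZ) truncates HZ / USER_HZ to zero whenever HZ is smaller than USER_HZ, so the conversion has to multiply by USER_HZ / HZ instead. A small compile-time sketch of the branch selection; HZ == 25 is an assumed example value, USER_HZ == 100 is the conventional one:

#include <stdio.h>

#define HZ      25	/* assumed example; real kernels set this via Kconfig */
#define USER_HZ 100

/* Model of jiffies_to_clock_t() for the exact-ratio case only. */
static long jiffies_to_clock_t_model(long x)
{
#if HZ < USER_HZ
	return x * (USER_HZ / HZ);	/* new branch: scale up */
#else
	return x / (HZ / USER_HZ);	/* old path: scale down */
#endif
}

int main(void)
{
	/* 25 jiffies at HZ == 25 is one second, i.e. 100 USER_HZ ticks. */
	printf("%ld\n", jiffies_to_clock_t_model(25));	/* prints 100 */
	return 0;
}

The model deliberately omits the outer TICK_NSEC test that chooses between this exact-ratio path and the do_div() fallback.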
@@ -646,7 +669,6 @@ u64 get_jiffies_64(void)
646 } while (read_seqretry(&xtime_lock, seq)); 669 } while (read_seqretry(&xtime_lock, seq));
647 return ret; 670 return ret;
648} 671}
649
650EXPORT_SYMBOL(get_jiffies_64); 672EXPORT_SYMBOL(get_jiffies_64);
651#endif 673#endif
652 674
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3e59fce6dd43..3d1e3e1a1971 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -133,7 +133,7 @@ static void clockevents_do_notify(unsigned long reason, void *dev)
133} 133}
134 134
135/* 135/*
136 * Called after a notify add to make devices availble which were 136 * Called after a notify add to make devices available which were
137 * released from the notifier call. 137 * released from the notifier call.
138 */ 138 */
139static void clockevents_notify_released(void) 139static void clockevents_notify_released(void)
@@ -218,6 +218,8 @@ void clockevents_exchange_device(struct clock_event_device *old,
218 */ 218 */
219void clockevents_notify(unsigned long reason, void *arg) 219void clockevents_notify(unsigned long reason, void *arg)
220{ 220{
221 struct list_head *node, *tmp;
222
221 spin_lock(&clockevents_lock); 223 spin_lock(&clockevents_lock);
222 clockevents_do_notify(reason, arg); 224 clockevents_do_notify(reason, arg);
223 225
@@ -227,13 +229,8 @@ void clockevents_notify(unsigned long reason, void *arg)
227 * Unregister the clock event devices which were 229 * Unregister the clock event devices which were
228 * released from the users in the notify chain. 230 * released from the users in the notify chain.
229 */ 231 */
230 while (!list_empty(&clockevents_released)) { 232 list_for_each_safe(node, tmp, &clockevents_released)
231 struct clock_event_device *dev; 233 list_del(node);
232
233 dev = list_entry(clockevents_released.next,
234 struct clock_event_device, list);
235 list_del(&dev->list);
236 }
237 break; 234 break;
238 default: 235 default:
239 break; 236 break;
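clockevents_notify() now drains clockevents_released with list_for_each_safe(), which caches the next pointer so the current node can be unlinked without invalidating the iterator. A user-space model of that idiom is sketched below; the list primitives are hand-rolled stand-ins for <linux/list.h>, kept just close enough to show why the _safe variant is required when deleting while iterating.

#include <stdio.h>

/* Minimal stand-ins for the kernel's list primitives (illustration only). */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = NULL;	/* kernel poisons these */
}

#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
	     pos = n, n = pos->next)

struct item { struct list_head list; int id; };

int main(void)
{
	struct list_head released = LIST_HEAD_INIT(released);
	struct list_head *node, *tmp;
	struct item items[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

	for (int i = 0; i < 3; i++)
		list_add_tail(&items[i].list, &released);

	/* Same shape as the drain loop in clockevents_notify() above. */
	list_for_each_safe(node, tmp, &released)
		list_del(node);

	printf("list empty: %s\n", released.next == &released ? "yes" : "no");
	return 0;
}

With a plain iterator the loop would advance through node->next after list_del() has already rewritten it; the lookahead pointer avoids that.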
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6e9259a5d501..548c436a776b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -91,7 +91,6 @@ static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
91 cs->name, delta); 91 cs->name, delta);
92 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 92 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
93 clocksource_change_rating(cs, 0); 93 clocksource_change_rating(cs, 0);
94 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
95 list_del(&cs->wd_list); 94 list_del(&cs->wd_list);
96} 95}
97 96
@@ -363,15 +362,13 @@ void clocksource_unregister(struct clocksource *cs)
363static ssize_t 362static ssize_t
364sysfs_show_current_clocksources(struct sys_device *dev, char *buf) 363sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
365{ 364{
366 char *curr = buf; 365 ssize_t count = 0;
367 366
368 spin_lock_irq(&clocksource_lock); 367 spin_lock_irq(&clocksource_lock);
369 curr += sprintf(curr, "%s ", curr_clocksource->name); 368 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
370 spin_unlock_irq(&clocksource_lock); 369 spin_unlock_irq(&clocksource_lock);
371 370
372 curr += sprintf(curr, "\n"); 371 return count;
373
374 return curr - buf;
375} 372}
376 373
377/** 374/**
@@ -439,17 +436,20 @@ static ssize_t
439sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 436sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
440{ 437{
441 struct clocksource *src; 438 struct clocksource *src;
442 char *curr = buf; 439 ssize_t count = 0;
443 440
444 spin_lock_irq(&clocksource_lock); 441 spin_lock_irq(&clocksource_lock);
445 list_for_each_entry(src, &clocksource_list, list) { 442 list_for_each_entry(src, &clocksource_list, list) {
446 curr += sprintf(curr, "%s ", src->name); 443 count += snprintf(buf + count,
444 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
445 "%s ", src->name);
447 } 446 }
448 spin_unlock_irq(&clocksource_lock); 447 spin_unlock_irq(&clocksource_lock);
449 448
450 curr += sprintf(curr, "\n"); 449 count += snprintf(buf + count,
450 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
451 451
452 return curr - buf; 452 return count;
453} 453}
454 454
455/* 455/*
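Both sysfs show routines above switch from unbounded sprintf() into the caller's page to snprintf() capped at PAGE_SIZE, clamping the remaining space to zero so a long list of registered clocksources can no longer overrun the buffer. A user-space sketch of the same accumulation pattern; PAGE_SIZE, the max() stand-in and the clocksource names are all assumptions of the example:

#include <stdio.h>
#include <sys/types.h>

#define PAGE_SIZE 4096	/* stand-in for the kernel constant */

/* local stand-in for the kernel's type-checked max() */
#define MAX_SSIZE(a, b) ((a) > (b) ? (a) : (b))

/* Append each name, never writing past buf[PAGE_SIZE - 1]. */
static ssize_t show_available(char *buf, const char *names[], int n)
{
	ssize_t count = 0;

	for (int i = 0; i < n; i++)
		count += snprintf(buf + count,
				  MAX_SSIZE((ssize_t)PAGE_SIZE - count,
					    (ssize_t)0),
				  "%s ", names[i]);
	count += snprintf(buf + count,
			  MAX_SSIZE((ssize_t)PAGE_SIZE - count, (ssize_t)0),
			  "\n");
	return count;
}

int main(void)
{
	const char *names[] = { "tsc", "hpet", "acpi_pm" };	/* made up */
	char buf[PAGE_SIZE];

	show_available(buf, names, 3);
	fputs(buf, stdout);
	return 0;
}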
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index e64efaf957e8..c88b5910e7ab 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -43,10 +43,6 @@ long time_freq; /* frequency offset (scaled ppm)*/
43static long time_reftime; /* time at last adjustment (s) */ 43static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 44long time_adjust;
45 45
46#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE)
47#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
48 (s64)CLOCK_TICK_RATE)
49
50static void ntp_update_frequency(void) 46static void ntp_update_frequency(void)
51{ 47{
52 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 48 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 88267f0a8471..fa9bb73dbdb4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -681,7 +681,7 @@ int tick_check_oneshot_change(int allow_nohz)
681 if (ts->nohz_mode != NOHZ_MODE_INACTIVE) 681 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
682 return 0; 682 return 0;
683 683
684 if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) 684 if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
685 return 0; 685 return 0;
686 686
687 if (!allow_nohz) 687 if (!allow_nohz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cd5dbc4579c9..1af9fb050fe2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -201,9 +201,9 @@ static inline s64 __get_nsec_offset(void) { return 0; }
201#endif 201#endif
202 202
203/** 203/**
204 * timekeeping_is_continuous - check to see if timekeeping is free running 204 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
205 */ 205 */
206int timekeeping_is_continuous(void) 206int timekeeping_valid_for_hres(void)
207{ 207{
208 unsigned long seq; 208 unsigned long seq;
209 int ret; 209 int ret;
@@ -364,7 +364,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
364 * with losing too many ticks, otherwise we would overadjust and 364 * with losing too many ticks, otherwise we would overadjust and
365 * produce an even larger error. The smaller the adjustment the 365 * produce an even larger error. The smaller the adjustment the
366 * faster we try to adjust for it, as lost ticks can do less harm 366 * faster we try to adjust for it, as lost ticks can do less harm
367 * here. This is tuned so that an error of about 1 msec is adusted 367 * here. This is tuned so that an error of about 1 msec is adjusted
368 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 368 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
369 */ 369 */
370 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); 370 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
new file mode 100644
index 000000000000..41468035473c
--- /dev/null
+++ b/kernel/timeconst.pl
@@ -0,0 +1,402 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
24 125,3,
25 '0xc49ba5e4','0x1fbe76c8b4',37,
26 '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
27 3,125,
28 '0xa2c2aaab','0xaaaa',16,
29 '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
30 125000,3,
31 '0xc9539b89','0x7fffbce4217d',47,
32 '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
33 3,125000,
34 ], 32 => [
35 '0xfa000000','0x6000000',27,
36 '0xfa00000000000000','0x600000000000000',59,
37 125,4,
38 '0x83126e98','0xfdf3b645a',36,
39 '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
40 4,125,
41 '0xf4240000','0x0',17,
42 '0xf424000000000000','0x0',49,
43 31250,1,
44 '0x8637bd06','0x3fff79c842fa',46,
45 '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
46 1,31250,
47 ], 48 => [
48 '0xa6aaaaab','0x6aaaaaa',27,
49 '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
50 125,6,
51 '0xc49ba5e4','0xfdf3b645a',36,
52 '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
53 6,125,
54 '0xa2c2aaab','0x15555',17,
55 '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
56 62500,3,
57 '0xc9539b89','0x3fffbce4217d',46,
58 '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
59 3,62500,
60 ], 64 => [
61 '0xfa000000','0xe000000',28,
62 '0xfa00000000000000','0xe00000000000000',60,
63 125,8,
64 '0x83126e98','0x7ef9db22d',35,
65 '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
66 8,125,
67 '0xf4240000','0x0',18,
68 '0xf424000000000000','0x0',50,
69 15625,1,
70 '0x8637bd06','0x1fff79c842fa',45,
71 '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
72 1,15625,
73 ], 100 => [
74 '0xa0000000','0x0',28,
75 '0xa000000000000000','0x0',60,
76 10,1,
77 '0xcccccccd','0x733333333',35,
78 '0xcccccccccccccccd','0x73333333333333333',67,
79 1,10,
80 '0x9c400000','0x0',18,
81 '0x9c40000000000000','0x0',50,
82 10000,1,
83 '0xd1b71759','0x1fff2e48e8a7',45,
84 '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
85 1,10000,
86 ], 122 => [
87 '0x8325c53f','0xfbcda3a',28,
88 '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
89 500,61,
90 '0xf9db22d1','0x7fbe76c8b',35,
91 '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
92 61,500,
93 '0x8012e2a0','0x3ef36',18,
94 '0x8012e29f79b47583','0x3ef368eb04325',50,
95 500000,61,
96 '0xffda4053','0x1ffffbce4217',45,
97 '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
98 61,500000,
99 ], 128 => [
100 '0xfa000000','0x1e000000',29,
101 '0xfa00000000000000','0x1e00000000000000',61,
102 125,16,
103 '0x83126e98','0x3f7ced916',34,
104 '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
105 16,125,
106 '0xf4240000','0x40000',19,
107 '0xf424000000000000','0x4000000000000',51,
108 15625,2,
109 '0x8637bd06','0xfffbce4217d',44,
110 '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
111 2,15625,
112 ], 200 => [
113 '0xa0000000','0x0',29,
114 '0xa000000000000000','0x0',61,
115 5,1,
116 '0xcccccccd','0x333333333',34,
117 '0xcccccccccccccccd','0x33333333333333333',66,
118 1,5,
119 '0x9c400000','0x0',19,
120 '0x9c40000000000000','0x0',51,
121 5000,1,
122 '0xd1b71759','0xfff2e48e8a7',44,
123 '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
124 1,5000,
125 ], 250 => [
126 '0x80000000','0x0',29,
127 '0x8000000000000000','0x0',61,
128 4,1,
129 '0x80000000','0x180000000',33,
130 '0x8000000000000000','0x18000000000000000',65,
131 1,4,
132 '0xfa000000','0x0',20,
133 '0xfa00000000000000','0x0',52,
134 4000,1,
135 '0x83126e98','0x7ff7ced9168',43,
136 '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
137 1,4000,
138 ], 256 => [
139 '0xfa000000','0x3e000000',30,
140 '0xfa00000000000000','0x3e00000000000000',62,
141 125,32,
142 '0x83126e98','0x1fbe76c8b',33,
143 '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
144 32,125,
145 '0xf4240000','0xc0000',20,
146 '0xf424000000000000','0xc000000000000',52,
147 15625,4,
148 '0x8637bd06','0x7ffde7210be',43,
149 '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
150 4,15625,
151 ], 300 => [
152 '0xd5555556','0x2aaaaaaa',30,
153 '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
154 10,3,
155 '0x9999999a','0x1cccccccc',33,
156 '0x999999999999999a','0x1cccccccccccccccc',65,
157 3,10,
158 '0xd0555556','0xaaaaa',20,
159 '0xd055555555555556','0xaaaaaaaaaaaaa',52,
160 10000,3,
161 '0x9d495183','0x7ffcb923a29',43,
162 '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
163 3,10000,
164 ], 512 => [
165 '0xfa000000','0x7e000000',31,
166 '0xfa00000000000000','0x7e00000000000000',63,
167 125,64,
168 '0x83126e98','0xfdf3b645',32,
169 '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
170 64,125,
171 '0xf4240000','0x1c0000',21,
172 '0xf424000000000000','0x1c000000000000',53,
173 15625,8,
174 '0x8637bd06','0x3ffef39085f',42,
175 '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
176 8,15625,
177 ], 1000 => [
178 '0x80000000','0x0',31,
179 '0x8000000000000000','0x0',63,
180 1,1,
181 '0x80000000','0x0',31,
182 '0x8000000000000000','0x0',63,
183 1,1,
184 '0xfa000000','0x0',22,
185 '0xfa00000000000000','0x0',54,
186 1000,1,
187 '0x83126e98','0x1ff7ced9168',41,
188 '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
189 1,1000,
190 ], 1024 => [
191 '0xfa000000','0xfe000000',32,
192 '0xfa00000000000000','0xfe00000000000000',64,
193 125,128,
194 '0x83126e98','0x7ef9db22',31,
195 '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
196 128,125,
197 '0xf4240000','0x3c0000',22,
198 '0xf424000000000000','0x3c000000000000',54,
199 15625,16,
200 '0x8637bd06','0x1fff79c842f',41,
201 '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
202 16,15625,
203 ], 1200 => [
204 '0xd5555556','0xd5555555',32,
205 '0xd555555555555556','0xd555555555555555',64,
206 5,6,
207 '0x9999999a','0x66666666',31,
208 '0x999999999999999a','0x6666666666666666',63,
209 6,5,
210 '0xd0555556','0x2aaaaa',22,
211 '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
212 2500,3,
213 '0x9d495183','0x1ffcb923a29',41,
214 '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
215 3,2500,
216 ]
217);
218
219$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
220
221sub bint($)
222{
223 my($x) = @_;
224 return Math::BigInt->new($x);
225}
226
227#
228# Constants for division by reciprocal multiplication.
229# (bits, numerator, denominator)
230#
231sub fmul($$$)
232{
233 my ($b,$n,$d) = @_;
234
235 $n = bint($n);
236 $d = bint($d);
237
238 return scalar (($n << $b)+$d-bint(1))/$d;
239}
240
241sub fadj($$$)
242{
243 my($b,$n,$d) = @_;
244
245 $n = bint($n);
246 $d = bint($d);
247
248 $d = $d/bgcd($n, $d);
249 return scalar (($d-bint(1)) << $b)/$d;
250}
251
252sub fmuls($$$) {
253 my($b,$n,$d) = @_;
254 my($s,$m);
255 my($thres) = bint(1) << ($b-1);
256
257 $n = bint($n);
258 $d = bint($d);
259
260 for ($s = 0; 1; $s++) {
261 $m = fmul($s,$n,$d);
262 return $s if ($m >= $thres);
263 }
264 return 0;
265}
266
267# Provides mul, adj, and shr factors for a specific
268# (bit, time, hz) combination
269sub muladj($$$) {
270 my($b, $t, $hz) = @_;
271 my $s = fmuls($b, $t, $hz);
272 my $m = fmul($s, $t, $hz);
273 my $a = fadj($s, $t, $hz);
274 return ($m->as_hex(), $a->as_hex(), $s);
275}
276
277# Provides numerator, denominator values
278sub numden($$) {
279 my($n, $d) = @_;
280 my $g = bgcd($n, $d);
281 return ($n/$g, $d/$g);
282}
283
284# All values for a specific (time, hz) combo
285sub conversions($$) {
286 my ($t, $hz) = @_;
287 my @val = ();
288
289 # HZ_TO_xx
290 push(@val, muladj(32, $t, $hz));
291 push(@val, muladj(64, $t, $hz));
292 push(@val, numden($t, $hz));
293
294 # xx_TO_HZ
295 push(@val, muladj(32, $hz, $t));
296 push(@val, muladj(64, $hz, $t));
297 push(@val, numden($hz, $t));
298
299 return @val;
300}
301
302sub compute_values($) {
303 my($hz) = @_;
304 my @val = ();
305 my $s, $m, $a, $g;
306
307 if (!$has_bigint) {
308 die "$0: HZ == $hz not canned and ".
309 "Math::BigInt not available\n";
310 }
311
312 # MSEC conversions
313 push(@val, conversions(1000, $hz));
314
315 # USEC conversions
316 push(@val, conversions(1000000, $hz));
317
318 return @val;
319}
320
321sub output($@)
322{
323 my($hz, @val) = @_;
324 my $pfx, $bit, $suf, $s, $m, $a;
325
326 print "/* Automatically generated by kernel/timeconst.pl */\n";
327 print "/* Conversion constants for HZ == $hz */\n";
328 print "\n";
329 print "#ifndef KERNEL_TIMECONST_H\n";
330 print "#define KERNEL_TIMECONST_H\n";
331 print "\n";
332
333 print "#include <linux/param.h>\n";
334
335 print "\n";
336 print "#if HZ != $hz\n";
337 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
338 print "#endif\n";
339 print "\n";
340
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n",
346 "${pfx}_$suf$bit", shift(@val);
347 }
348 }
349 foreach $suf ('NUM', 'DEN') {
350 printf "#define %-23s %s\n",
351 "${pfx}_$suf", shift(@val);
352 }
353 }
354
355 print "\n";
356 print "#endif /* KERNEL_TIMECONST_H */\n";
357}
358
359($hz) = @ARGV;
360
361# Use this to generate the %canned_values structure
362if ($hz eq '--can') {
363 shift(@ARGV);
364 @hzlist = sort {$a <=> $b} (@ARGV);
365
366 print "# Precomputed values for systems without Math::BigInt\n";
367 print "# Generated by:\n";
368 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
369 print "\%canned_values = (\n";
370 my $pf = "\t";
371 foreach $hz (@hzlist) {
372 my @values = compute_values($hz);
373 print "$pf$hz => [\n";
374 while (scalar(@values)) {
375 my $bit;
376 foreach $bit (32, 64) {
377 my $m = shift(@values);
378 my $a = shift(@values);
379 my $s = shift(@values);
380 print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n";
381 }
382 my $n = shift(@values);
383 my $d = shift(@values);
384 print "\t\t",$n,',',$d,",\n";
385 }
386 print "\t]";
387 $pf = ', ';
388 }
389 print "\n);\n";
390} else {
391 $hz += 0; # Force to number
392 if ($hz < 1) {
393 die "Usage: $0 HZ\n";
394 }
395
396 @val = @{$canned_values{$hz}};
397 if (!defined(@val)) {
398 @val = compute_values($hz);
399 }
400 output($hz, @val);
401}
402exit 0;
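Each block of the canned table above, and compute_values() for any other HZ, encodes a round-up division as multiply, add, shift: converted = (MUL * value + ADJ) >> SHR, where MUL = ceil(n * 2^SHR / d), ADJ = ((d' - 1) << SHR) / d' with d' = d / gcd(n, d), and SHR is the smallest shift that pushes MUL to at least 2^(bits-1) (see fmul(), fadj() and fmuls() above). The C sketch below re-derives the four 32-bit HZ == 100 entries with plain 64-bit arithmetic; it is a cross-check of the table, not part of the build.

#include <stdio.h>
#include <stdint.h>

static uint64_t gcd(uint64_t a, uint64_t b)
{
	while (b) {
		uint64_t t = a % b;
		a = b;
		b = t;
	}
	return a;
}

/* ceil(n * 2^s / d), cf. fmul() in the script above */
static uint64_t fmul(unsigned s, uint64_t n, uint64_t d)
{
	return ((n << s) + d - 1) / d;
}

/* ((d' - 1) << s) / d' with d' = d / gcd(n, d), cf. fadj() */
static uint64_t fadj(unsigned s, uint64_t n, uint64_t d)
{
	d /= gcd(n, d);
	return ((d - 1) << s) / d;
}

/* smallest shift that makes fmul() reach 2^(bits-1), cf. fmuls() */
static unsigned fmuls(unsigned bits, uint64_t n, uint64_t d)
{
	unsigned s;

	for (s = 0; fmul(s, n, d) < (1ULL << (bits - 1)); s++)
		;
	return s;
}

static void emit(const char *name, uint64_t n, uint64_t d)
{
	unsigned s = fmuls(32, n, d);

	printf("%s: mul32=0x%llx adj32=0x%llx shr32=%u\n", name,
	       (unsigned long long)fmul(s, n, d),
	       (unsigned long long)fadj(s, n, d), s);
}

int main(void)
{
	/* HZ == 100; compare with the canned 100 => [...] entry above */
	emit("HZ_TO_MSEC", 1000, 100);		/* 0xa0000000, 0x0, 28 */
	emit("MSEC_TO_HZ", 100, 1000);		/* 0xcccccccd, 0x733333333, 35 */
	emit("HZ_TO_USEC", 1000000, 100);	/* 0x9c400000, 0x0, 18 */
	emit("USEC_TO_HZ", 100, 1000000);	/* 0xd1b71759, 0x1fff2e48e8a7, 45 */
	return 0;
}

The printed values should match the HZ == 100 block of %canned_values and the constants consumed by kernel/time.c earlier in this diff.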
diff --git a/kernel/timer.c b/kernel/timer.c
index 9fbb472b8cf0..99b00a25f88b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -327,7 +327,7 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
327 * init_timer() must be done to a timer prior calling *any* of the 327 * init_timer() must be done to a timer prior calling *any* of the
328 * other timer functions. 328 * other timer functions.
329 */ 329 */
330void fastcall init_timer(struct timer_list *timer) 330void init_timer(struct timer_list *timer)
331{ 331{
332 timer->entry.next = NULL; 332 timer->entry.next = NULL;
333 timer->base = __raw_get_cpu_var(tvec_bases); 333 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -339,7 +339,7 @@ void fastcall init_timer(struct timer_list *timer)
339} 339}
340EXPORT_SYMBOL(init_timer); 340EXPORT_SYMBOL(init_timer);
341 341
342void fastcall init_timer_deferrable(struct timer_list *timer) 342void init_timer_deferrable(struct timer_list *timer)
343{ 343{
344 init_timer(timer); 344 init_timer(timer);
345 timer_set_deferrable(timer); 345 timer_set_deferrable(timer);
@@ -818,12 +818,14 @@ unsigned long next_timer_interrupt(void)
818#ifndef CONFIG_VIRT_CPU_ACCOUNTING 818#ifndef CONFIG_VIRT_CPU_ACCOUNTING
819void account_process_tick(struct task_struct *p, int user_tick) 819void account_process_tick(struct task_struct *p, int user_tick)
820{ 820{
821 cputime_t one_jiffy = jiffies_to_cputime(1);
822
821 if (user_tick) { 823 if (user_tick) {
822 account_user_time(p, jiffies_to_cputime(1)); 824 account_user_time(p, one_jiffy);
823 account_user_time_scaled(p, jiffies_to_cputime(1)); 825 account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
824 } else { 826 } else {
825 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); 827 account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
826 account_system_time_scaled(p, jiffies_to_cputime(1)); 828 account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
827 } 829 }
828} 830}
829#endif 831#endif
@@ -977,7 +979,7 @@ asmlinkage long sys_getppid(void)
977 int pid; 979 int pid;
978 980
979 rcu_read_lock(); 981 rcu_read_lock();
980 pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns); 982 pid = task_tgid_vnr(current->real_parent);
981 rcu_read_unlock(); 983 rcu_read_unlock();
982 984
983 return pid; 985 return pid;
@@ -1040,7 +1042,7 @@ static void process_timeout(unsigned long __data)
1040 * 1042 *
1041 * In all cases the return value is guaranteed to be non-negative. 1043 * In all cases the return value is guaranteed to be non-negative.
1042 */ 1044 */
1043fastcall signed long __sched schedule_timeout(signed long timeout) 1045signed long __sched schedule_timeout(signed long timeout)
1044{ 1046{
1045 struct timer_list timer; 1047 struct timer_list timer;
1046 unsigned long expire; 1048 unsigned long expire;
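The schedule_timeout() hunk above only drops the fastcall annotation; the calling convention in the comment still applies: set the task state first, pass a timeout in jiffies, and the non-negative return value is the time remaining if the sleep ended early. A kernel-style fragment, illustrative only and not buildable on its own:

#include <linux/jiffies.h>
#include <linux/sched.h>

/* Sleep for roughly 100 ms, waking early if the task is signalled. */
static signed long sleep_a_bit(void)
{
	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(msecs_to_jiffies(100));

	/* 0 means the full timeout elapsed; > 0 means an early wakeup. */
	return remaining;
}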
diff --git a/kernel/user.c b/kernel/user.c
index bc1c48d35cb3..7132022a040c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,6 +17,14 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20struct user_namespace init_user_ns = {
21 .kref = {
22 .refcount = ATOMIC_INIT(2),
23 },
24 .root_user = &root_user,
25};
26EXPORT_SYMBOL_GPL(init_user_ns);
27
20/* 28/*
21 * UID task count cache, to get fast user lookup in "alloc_uid" 29 * UID task count cache, to get fast user lookup in "alloc_uid"
22 * when changing user ID's (ie setuid() and friends). 30 * when changing user ID's (ie setuid() and friends).
@@ -49,7 +57,7 @@ struct user_struct root_user = {
49 .uid_keyring = &root_user_keyring, 57 .uid_keyring = &root_user_keyring,
50 .session_keyring = &root_session_keyring, 58 .session_keyring = &root_session_keyring,
51#endif 59#endif
52#ifdef CONFIG_FAIR_USER_SCHED 60#ifdef CONFIG_USER_SCHED
53 .tg = &init_task_group, 61 .tg = &init_task_group,
54#endif 62#endif
55}; 63};
@@ -82,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
82 return NULL; 90 return NULL;
83} 91}
84 92
85#ifdef CONFIG_FAIR_USER_SCHED 93#ifdef CONFIG_USER_SCHED
86 94
87static void sched_destroy_user(struct user_struct *up) 95static void sched_destroy_user(struct user_struct *up)
88{ 96{
@@ -105,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
105 sched_move_task(p); 113 sched_move_task(p);
106} 114}
107 115
108#else /* CONFIG_FAIR_USER_SCHED */ 116#else /* CONFIG_USER_SCHED */
109 117
110static void sched_destroy_user(struct user_struct *up) { } 118static void sched_destroy_user(struct user_struct *up) { }
111static int sched_create_user(struct user_struct *up) { return 0; } 119static int sched_create_user(struct user_struct *up) { return 0; }
112static void sched_switch_user(struct task_struct *p) { } 120static void sched_switch_user(struct task_struct *p) { }
113 121
114#endif /* CONFIG_FAIR_USER_SCHED */ 122#endif /* CONFIG_USER_SCHED */
115 123
116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 124#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
117 125
118static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 126static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
119static DEFINE_MUTEX(uids_mutex); 127static DEFINE_MUTEX(uids_mutex);
@@ -129,6 +137,7 @@ static inline void uids_mutex_unlock(void)
129} 137}
130 138
131/* uid directory attributes */ 139/* uid directory attributes */
140#ifdef CONFIG_FAIR_GROUP_SCHED
132static ssize_t cpu_shares_show(struct kobject *kobj, 141static ssize_t cpu_shares_show(struct kobject *kobj,
133 struct kobj_attribute *attr, 142 struct kobj_attribute *attr,
134 char *buf) 143 char *buf)
@@ -155,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
155 164
156static struct kobj_attribute cpu_share_attr = 165static struct kobj_attribute cpu_share_attr =
157 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); 166 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
167#endif
168
169#ifdef CONFIG_RT_GROUP_SCHED
170static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
171 struct kobj_attribute *attr,
172 char *buf)
173{
174 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
175
176 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
177}
178
179static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 struct kobj_attribute *attr,
181 const char *buf, size_t size)
182{
183 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
184 unsigned long rt_runtime;
185 int rc;
186
187 sscanf(buf, "%lu", &rt_runtime);
188
189 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
190
191 return (rc ? rc : size);
192}
193
194static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
196#endif
158 197
159/* default attributes per uid directory */ 198/* default attributes per uid directory */
160static struct attribute *uids_attributes[] = { 199static struct attribute *uids_attributes[] = {
200#ifdef CONFIG_FAIR_GROUP_SCHED
161 &cpu_share_attr.attr, 201 &cpu_share_attr.attr,
202#endif
203#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr,
205#endif
162 NULL 206 NULL
163}; 207};
164 208
@@ -261,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
261 schedule_work(&up->work); 305 schedule_work(&up->work);
262} 306}
263 307
264#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 308#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
265 309
266int uids_sysfs_init(void) { return 0; } 310int uids_sysfs_init(void) { return 0; }
267static inline int uids_user_create(struct user_struct *up) { return 0; } 311static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -365,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
365 spin_lock_irq(&uidhash_lock); 409 spin_lock_irq(&uidhash_lock);
366 up = uid_hash_find(uid, hashent); 410 up = uid_hash_find(uid, hashent);
367 if (up) { 411 if (up) {
368 /* This case is not possible when CONFIG_FAIR_USER_SCHED 412 /* This case is not possible when CONFIG_USER_SCHED
369 * is defined, since we serialize alloc_uid() using 413 * is defined, since we serialize alloc_uid() using
370 * uids_mutex. Hence no need to call 414 * uids_mutex. Hence no need to call
371 * sched_destroy_user() or remove_user_sysfs_dir(). 415 * sched_destroy_user() or remove_user_sysfs_dir().
@@ -427,6 +471,7 @@ void switch_uid(struct user_struct *new_user)
427 suid_keys(current); 471 suid_keys(current);
428} 472}
429 473
474#ifdef CONFIG_USER_NS
430void release_uids(struct user_namespace *ns) 475void release_uids(struct user_namespace *ns)
431{ 476{
432 int i; 477 int i;
@@ -451,6 +496,7 @@ void release_uids(struct user_namespace *ns)
451 496
452 free_uid(ns->root_user); 497 free_uid(ns->root_user);
453} 498}
499#endif
454 500
455static int __init uid_cache_init(void) 501static int __init uid_cache_init(void)
456{ 502{
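The cpu_rt_runtime addition above follows the usual kobj_attribute pattern backing the files under /sys/kernel/uids/. A generic, hedged sketch of that pattern; the attribute name, backing variable and formats are invented for illustration:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long demo_value;	/* hypothetical backing state */

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%lu\n", demo_value);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t size)
{
	sscanf(buf, "%lu", &demo_value);
	return size;
}

static struct kobj_attribute demo_attr =
	__ATTR(demo, 0644, demo_show, demo_store);

Adding &demo_attr.attr to an attribute array like uids_attributes above is what makes the file appear once the kobject is registered.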
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7af90fc4f0fd..4c9006275df7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -10,17 +10,6 @@
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12 12
13struct user_namespace init_user_ns = {
14 .kref = {
15 .refcount = ATOMIC_INIT(2),
16 },
17 .root_user = &root_user,
18};
19
20EXPORT_SYMBOL_GPL(init_user_ns);
21
22#ifdef CONFIG_USER_NS
23
24/* 13/*
25 * Clone a new ns copying an original user ns, setting refcount to 1 14 * Clone a new ns copying an original user ns, setting refcount to 1
26 * @old_ns: namespace to clone 15 * @old_ns: namespace to clone
@@ -84,5 +73,3 @@ void free_user_ns(struct kref *kref)
84 release_uids(ns); 73 release_uids(ns);
85 kfree(ns); 74 kfree(ns);
86} 75}
87
88#endif /* CONFIG_USER_NS */
diff --git a/kernel/wait.c b/kernel/wait.c
index f9876888a569..c275c56cf2d3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -18,7 +18,7 @@ void init_waitqueue_head(wait_queue_head_t *q)
18 18
19EXPORT_SYMBOL(init_waitqueue_head); 19EXPORT_SYMBOL(init_waitqueue_head);
20 20
21void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 22{
23 unsigned long flags; 23 unsigned long flags;
24 24
@@ -29,7 +29,7 @@ void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
29} 29}
30EXPORT_SYMBOL(add_wait_queue); 30EXPORT_SYMBOL(add_wait_queue);
31 31
32void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) 32void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
33{ 33{
34 unsigned long flags; 34 unsigned long flags;
35 35
@@ -40,7 +40,7 @@ void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
40} 40}
41EXPORT_SYMBOL(add_wait_queue_exclusive); 41EXPORT_SYMBOL(add_wait_queue_exclusive);
42 42
43void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 43void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
44{ 44{
45 unsigned long flags; 45 unsigned long flags;
46 46
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(remove_wait_queue);
63 * stops them from bleeding out - it would still allow subsequent 63 * stops them from bleeding out - it would still allow subsequent
64 * loads to move into the critical region). 64 * loads to move into the critical region).
65 */ 65 */
66void fastcall 66void
67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) 67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
68{ 68{
69 unsigned long flags; 69 unsigned long flags;
@@ -82,7 +82,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
82} 82}
83EXPORT_SYMBOL(prepare_to_wait); 83EXPORT_SYMBOL(prepare_to_wait);
84 84
85void fastcall 85void
86prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) 86prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
87{ 87{
88 unsigned long flags; 88 unsigned long flags;
@@ -101,7 +101,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
101} 101}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 102EXPORT_SYMBOL(prepare_to_wait_exclusive);
103 103
104void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 104void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
105{ 105{
106 unsigned long flags; 106 unsigned long flags;
107 107
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(wake_bit_function);
157 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are 157 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
158 * permitted return codes. Nonzero return codes halt waiting and return. 158 * permitted return codes. Nonzero return codes halt waiting and return.
159 */ 159 */
160int __sched fastcall 160int __sched
161__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 161__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
162 int (*action)(void *), unsigned mode) 162 int (*action)(void *), unsigned mode)
163{ 163{
@@ -173,7 +173,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
173} 173}
174EXPORT_SYMBOL(__wait_on_bit); 174EXPORT_SYMBOL(__wait_on_bit);
175 175
176int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, 176int __sched out_of_line_wait_on_bit(void *word, int bit,
177 int (*action)(void *), unsigned mode) 177 int (*action)(void *), unsigned mode)
178{ 178{
179 wait_queue_head_t *wq = bit_waitqueue(word, bit); 179 wait_queue_head_t *wq = bit_waitqueue(word, bit);
@@ -183,7 +183,7 @@ int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
183} 183}
184EXPORT_SYMBOL(out_of_line_wait_on_bit); 184EXPORT_SYMBOL(out_of_line_wait_on_bit);
185 185
186int __sched fastcall 186int __sched
187__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 187__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
188 int (*action)(void *), unsigned mode) 188 int (*action)(void *), unsigned mode)
189{ 189{
@@ -201,7 +201,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
201} 201}
202EXPORT_SYMBOL(__wait_on_bit_lock); 202EXPORT_SYMBOL(__wait_on_bit_lock);
203 203
204int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, 204int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
205 int (*action)(void *), unsigned mode) 205 int (*action)(void *), unsigned mode)
206{ 206{
207 wait_queue_head_t *wq = bit_waitqueue(word, bit); 207 wait_queue_head_t *wq = bit_waitqueue(word, bit);
@@ -211,7 +211,7 @@ int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
211} 211}
212EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); 212EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
213 213
214void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) 214void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
215{ 215{
216 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 216 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
217 if (waitqueue_active(wq)) 217 if (waitqueue_active(wq))
@@ -236,13 +236,13 @@ EXPORT_SYMBOL(__wake_up_bit);
236 * may need to use a less regular barrier, such fs/inode.c's smp_mb(), 236 * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
237 * because spin_unlock() does not guarantee a memory barrier. 237 * because spin_unlock() does not guarantee a memory barrier.
238 */ 238 */
239void fastcall wake_up_bit(void *word, int bit) 239void wake_up_bit(void *word, int bit)
240{ 240{
241 __wake_up_bit(bit_waitqueue(word, bit), word, bit); 241 __wake_up_bit(bit_waitqueue(word, bit), word, bit);
242} 242}
243EXPORT_SYMBOL(wake_up_bit); 243EXPORT_SYMBOL(wake_up_bit);
244 244
245fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit) 245wait_queue_head_t *bit_waitqueue(void *word, int bit)
246{ 246{
247 const int shift = BITS_PER_LONG == 32 ? 5 : 6; 247 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
248 const struct zone *zone = page_zone(virt_to_page(word)); 248 const struct zone *zone = page_zone(virt_to_page(word));
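The bit-wait helpers above lose their fastcall markers but keep the contract from the comment earlier in this file: the action callback decides how to sleep, returning 0 to keep waiting and nonzero to abort. A minimal kernel-style fragment; the flags word, bit number and function names are hypothetical:

#include <linux/sched.h>
#include <linux/wait.h>

#define MY_BIT_BUSY 0	/* hypothetical bit in some flags word */

/* Action callback: sleep uninterruptibly, keep waiting (return 0). */
static int my_wait_action(void *word)
{
	schedule();
	return 0;
}

/* Blocks until another path clears the bit and calls wake_up_bit(). */
static void wait_until_clear(unsigned long *flags)
{
	out_of_line_wait_on_bit(flags, MY_BIT_BUSY, my_wait_action,
				TASK_UNINTERRUPTIBLE);
}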
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52db48e7f6e7..ff06611655af 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -161,7 +161,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
161 * We queue the work to the CPU it was submitted, but there is no 161 * We queue the work to the CPU it was submitted, but there is no
162 * guarantee that it will be processed by that CPU. 162 * guarantee that it will be processed by that CPU.
163 */ 163 */
164int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) 164int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 165{
166 int ret = 0; 166 int ret = 0;
167 167
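queue_work() keeps the semantics described in the comment above (the work is queued to the submitting CPU but may run elsewhere); only the fastcall marker goes away. A hedged kernel-style sketch of the surrounding API; the workqueue name, handler and setup/teardown hooks are all made up for the example:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* hypothetical private workqueue */

static void my_work_handler(struct work_struct *work)
{
	/* runs in process context and may sleep */
}

static DECLARE_WORK(my_work, my_work_handler);

static int __init my_setup(void)
{
	my_wq = create_singlethread_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;

	queue_work(my_wq, &my_work);	/* non-zero if newly queued */
	return 0;
}

static void __exit my_teardown(void)
{
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}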
@@ -175,7 +175,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
175} 175}
176EXPORT_SYMBOL_GPL(queue_work); 176EXPORT_SYMBOL_GPL(queue_work);
177 177
178void delayed_work_timer_fn(unsigned long __data) 178static void delayed_work_timer_fn(unsigned long __data)
179{ 179{
180 struct delayed_work *dwork = (struct delayed_work *)__data; 180 struct delayed_work *dwork = (struct delayed_work *)__data;
181 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); 181 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
@@ -192,7 +192,7 @@ void delayed_work_timer_fn(unsigned long __data)
192 * 192 *
193 * Returns 0 if @work was already on a queue, non-zero otherwise. 193 * Returns 0 if @work was already on a queue, non-zero otherwise.
194 */ 194 */
195int fastcall queue_delayed_work(struct workqueue_struct *wq, 195int queue_delayed_work(struct workqueue_struct *wq,
196 struct delayed_work *dwork, unsigned long delay) 196 struct delayed_work *dwork, unsigned long delay)
197{ 197{
198 timer_stats_timer_set_start_info(&dwork->timer); 198 timer_stats_timer_set_start_info(&dwork->timer);
@@ -388,7 +388,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
388 * This function used to run the workqueues itself. Now we just wait for the 388 * This function used to run the workqueues itself. Now we just wait for the
389 * helper threads to do it. 389 * helper threads to do it.
390 */ 390 */
391void fastcall flush_workqueue(struct workqueue_struct *wq) 391void flush_workqueue(struct workqueue_struct *wq)
392{ 392{
393 const cpumask_t *cpu_map = wq_cpu_map(wq); 393 const cpumask_t *cpu_map = wq_cpu_map(wq);
394 int cpu; 394 int cpu;
@@ -546,7 +546,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
546 * 546 *
547 * This puts a job in the kernel-global workqueue. 547 * This puts a job in the kernel-global workqueue.
548 */ 548 */
549int fastcall schedule_work(struct work_struct *work) 549int schedule_work(struct work_struct *work)
550{ 550{
551 return queue_work(keventd_wq, work); 551 return queue_work(keventd_wq, work);
552} 552}
@@ -560,7 +560,7 @@ EXPORT_SYMBOL(schedule_work);
560 * After waiting for a given time this puts a job in the kernel-global 560 * After waiting for a given time this puts a job in the kernel-global
561 * workqueue. 561 * workqueue.
562 */ 562 */
563int fastcall schedule_delayed_work(struct delayed_work *dwork, 563int schedule_delayed_work(struct delayed_work *dwork,
564 unsigned long delay) 564 unsigned long delay)
565{ 565{
566 timer_stats_timer_set_start_info(&dwork->timer); 566 timer_stats_timer_set_start_info(&dwork->timer);