Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 44
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/cgroup.c | 325
-rw-r--r--  kernel/compat.c | 74
-rw-r--r--  kernel/cpuset.c | 884
-rw-r--r--  kernel/debug/debug_core.c | 1
-rw-r--r--  kernel/debug/debug_core.h | 2
-rw-r--r--  kernel/debug/gdbstub.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 20
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 25
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 135
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/events/core.c | 18
-rw-r--r--  kernel/exit.c | 6
-rw-r--r--  kernel/fork.c | 14
-rw-r--r--  kernel/futex.c | 50
-rw-r--r--  kernel/futex_compat.c | 21
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kfifo.c | 609
-rw-r--r--  kernel/kprobes.c | 35
-rw-r--r--  kernel/lockdep.c | 32
-rw-r--r--  kernel/module.c | 142
-rw-r--r--  kernel/nsproxy.c | 5
-rw-r--r--  kernel/panic.c | 34
-rw-r--r--  kernel/pid.c | 3
-rw-r--r--  kernel/posix-timers.c | 25
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/main.c | 29
-rw-r--r--  kernel/power/process.c | 4
-rw-r--r--  kernel/power/qos.c | 9
-rw-r--r--  kernel/power/suspend.c | 69
-rw-r--r--  kernel/power/suspend_test.c | 11
-rw-r--r--  kernel/printk.c | 9
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 189
-rw-r--r--  kernel/sched/cputime.c | 2
-rw-r--r--  kernel/sched/debug.c | 97
-rw-r--r--  kernel/sched/stats.c | 79
-rw-r--r--  kernel/signal.c | 349
-rw-r--r--  kernel/smp.c | 183
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 38
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 311
-rw-r--r--  kernel/sysctl.c | 19
-rw-r--r--  kernel/sysctl_binary.c | 43
-rw-r--r--  kernel/time.c | 4
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/ntp.c | 26
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 26
-rw-r--r--  kernel/timeconst.bc | 108
-rw-r--r--  kernel/timeconst.pl | 376
-rw-r--r--  kernel/trace/Kconfig | 15
-rw-r--r--  kernel/trace/blktrace.c | 28
-rw-r--r--  kernel/trace/ftrace.c | 24
-rw-r--r--  kernel/trace/power-traces.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 3
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 7
-rw-r--r--  kernel/user_namespace.c | 66
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 3
-rw-r--r--  kernel/watchdog.c | 10
-rw-r--r--  kernel/workqueue.c | 20
70 files changed, 2212 insertions, 2578 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/ 26obj-y += power/
27 27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) 28obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
29obj-$(CONFIG_X86) += kcmp.o
30endif
31obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 30obj-$(CONFIG_PROFILING) += profile.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 31obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -127,11 +125,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
127 125
128$(obj)/time.o: $(obj)/timeconst.h 126$(obj)/time.o: $(obj)/timeconst.h
129 127
130quiet_cmd_timeconst = TIMEC $@ 128quiet_cmd_hzfile = HZFILE $@
131 cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ 129 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
130
131targets += hz.bc
132$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
133 $(call if_changed,hzfile)
134
135quiet_cmd_bc = BC $@
136 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
137
132targets += timeconst.h 138targets += timeconst.h
133$(obj)/timeconst.h: $(src)/timeconst.pl FORCE 139$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
134 $(call if_changed,timeconst) 140 $(call if_changed,bc)
135 141
136ifeq ($(CONFIG_MODULE_SIG),y) 142ifeq ($(CONFIG_MODULE_SIG),y)
137# 143#
@@ -153,23 +159,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
153# fail and that the kernel may be used afterwards. 159# fail and that the kernel may be used afterwards.
154# 160#
155############################################################################### 161###############################################################################
156sign_key_with_hash := 162ifndef CONFIG_MODULE_SIG_HASH
157ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
158sign_key_with_hash := -sha1
159endif
160ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
161sign_key_with_hash := -sha224
162endif
163ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
164sign_key_with_hash := -sha256
165endif
166ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
167sign_key_with_hash := -sha384
168endif
169ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
170sign_key_with_hash := -sha512
171endif
172ifeq ($(sign_key_with_hash),)
173$(error Could not determine digest type to use from kernel config) 163$(error Could not determine digest type to use from kernel config)
174endif 164endif
175 165
@@ -182,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
182 @echo "### needs to be run as root, and uses a hardware random" 172 @echo "### needs to be run as root, and uses a hardware random"
183 @echo "### number generator if one is available." 173 @echo "### number generator if one is available."
184 @echo "###" 174 @echo "###"
185 openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ 175 openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
186 -x509 -config x509.genkey \ 176 -batch -x509 -config x509.genkey \
187 -outform DER -out signing_key.x509 \ 177 -outform DER -out signing_key.x509 \
188 -keyout signing_key.priv 178 -keyout signing_key.priv
189 @echo "###" 179 @echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index e8b1627ab9c7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
205 if (IS_ERR(file)) 205 if (IS_ERR(file))
206 return PTR_ERR(file); 206 return PTR_ERR(file);
207 207
208 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { 208 if (!S_ISREG(file_inode(file)->i_mode)) {
209 filp_close(file, NULL); 209 filp_close(file, NULL);
210 return -EACCES; 210 return -EACCES;
211 } 211 }
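
The acct_on() hunk above is one instance of a tree-wide move to the file_inode() accessor. As a rough sketch (the exact definition lives in <linux/fs.h> and may differ in detail), the helper amounts to:

	static inline struct inode *file_inode(const struct file *f)
	{
		return f->f_inode;
	}

Callers such as the S_ISREG() check above no longer spell out file->f_path.dentry->d_inode, which keeps them correct even if the way a struct file reaches its inode changes.
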
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
52#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/delayacct.h> 53#include <linux/delayacct.h>
54#include <linux/cgroupstats.h> 54#include <linux/cgroupstats.h>
55#include <linux/hash.h> 55#include <linux/hashtable.h>
56#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
58#include <linux/idr.h> 58#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
376 * account cgroups in empty hierarchies. 376 * account cgroups in empty hierarchies.
377 */ 377 */
378#define CSS_SET_HASH_BITS 7 378#define CSS_SET_HASH_BITS 7
379#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 379static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
380static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
381 380
382static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 381static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383{ 382{
384 int i; 383 int i;
385 int index; 384 unsigned long key = 0UL;
386 unsigned long tmp = 0UL;
387 385
388 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
389 tmp += (unsigned long)css[i]; 387 key += (unsigned long)css[i];
390 tmp = (tmp >> 16) ^ tmp; 388 key = (key >> 16) ^ key;
391 389
392 index = hash_long(tmp, CSS_SET_HASH_BITS); 390 return key;
393
394 return &css_set_table[index];
395} 391}
396 392
397/* We don't maintain the lists running through each css_set to its 393/* We don't maintain the lists running through each css_set to its
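
The css_set lookups above (and the later hunks in this file) move from a hand-rolled bucket array to the generic <linux/hashtable.h> API. A self-contained sketch of that API, using a hypothetical struct item in place of css_set:

	#include <linux/hashtable.h>

	struct item {
		unsigned long key;
		struct hlist_node hlist;	/* links the item into its bucket */
	};

	/* 2^7 buckets, statically initialized; no INIT_HLIST_HEAD() loop needed */
	static DEFINE_HASHTABLE(item_table, 7);

	static void item_insert(struct item *it)
	{
		/* the key is hashed down to a bucket index internally */
		hash_add(item_table, &it->hlist, it->key);
	}

	static struct item *item_lookup(unsigned long key)
	{
		struct item *it;

		/* walk only the bucket that the key hashes to */
		hash_for_each_possible(item_table, it, hlist, key)
			if (it->key == key)
				return it;
		return NULL;
	}

	static void item_remove(struct item *it)
	{
		hash_del(&it->hlist);
	}

This is why css_set_hash() now returns a bare unsigned long key rather than a bucket pointer, and why the INIT_HLIST_HEAD() loop disappears from cgroup_init_early() further down.
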
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
418 } 414 }
419 415
420 /* This css_set is dead. unlink it and release cgroup refcounts */ 416 /* This css_set is dead. unlink it and release cgroup refcounts */
421 hlist_del(&cg->hlist); 417 hash_del(&cg->hlist);
422 css_set_count--; 418 css_set_count--;
423 419
424 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 420 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
426 struct cgroup *cgrp = link->cgrp; 422 struct cgroup *cgrp = link->cgrp;
427 list_del(&link->cg_link_list); 423 list_del(&link->cg_link_list);
428 list_del(&link->cgrp_link_list); 424 list_del(&link->cgrp_link_list);
425
426 /*
427 * We may not be holding cgroup_mutex, and if cgrp->count is
428 * dropped to 0 the cgroup can be destroyed at any time, hence
429 * rcu_read_lock is used to keep it alive.
430 */
431 rcu_read_lock();
429 if (atomic_dec_and_test(&cgrp->count) && 432 if (atomic_dec_and_test(&cgrp->count) &&
430 notify_on_release(cgrp)) { 433 notify_on_release(cgrp)) {
431 if (taskexit) 434 if (taskexit)
432 set_bit(CGRP_RELEASABLE, &cgrp->flags); 435 set_bit(CGRP_RELEASABLE, &cgrp->flags);
433 check_for_release(cgrp); 436 check_for_release(cgrp);
434 } 437 }
438 rcu_read_unlock();
435 439
436 kfree(link); 440 kfree(link);
437 } 441 }
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set(
550{ 554{
551 int i; 555 int i;
552 struct cgroupfs_root *root = cgrp->root; 556 struct cgroupfs_root *root = cgrp->root;
553 struct hlist_head *hhead;
554 struct hlist_node *node;
555 struct css_set *cg; 557 struct css_set *cg;
558 unsigned long key;
556 559
557 /* 560 /*
558 * Build the set of subsystem state objects that we want to see in the 561 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set(
572 } 575 }
573 } 576 }
574 577
575 hhead = css_set_hash(template); 578 key = css_set_hash(template);
576 hlist_for_each_entry(cg, node, hhead, hlist) { 579 hash_for_each_possible(css_set_table, cg, hlist, key) {
577 if (!compare_css_sets(cg, oldcg, cgrp, template)) 580 if (!compare_css_sets(cg, oldcg, cgrp, template))
578 continue; 581 continue;
579 582
@@ -657,8 +660,8 @@ static struct css_set *find_css_set(
657 660
658 struct list_head tmp_cg_links; 661 struct list_head tmp_cg_links;
659 662
660 struct hlist_head *hhead;
661 struct cg_cgroup_link *link; 663 struct cg_cgroup_link *link;
664 unsigned long key;
662 665
663 /* First see if we already have a cgroup group that matches 666 /* First see if we already have a cgroup group that matches
664 * the desired set */ 667 * the desired set */
@@ -704,8 +707,8 @@ static struct css_set *find_css_set(
704 css_set_count++; 707 css_set_count++;
705 708
706 /* Add this cgroup group to the hash table */ 709 /* Add this cgroup group to the hash table */
707 hhead = css_set_hash(res->subsys); 710 key = css_set_hash(res->subsys);
708 hlist_add_head(&res->hlist, hhead); 711 hash_add(css_set_table, &res->hlist, key);
709 712
710 write_unlock(&css_set_lock); 713 write_unlock(&css_set_lock);
711 714
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
856 return inode; 859 return inode;
857} 860}
858 861
859static void cgroup_diput(struct dentry *dentry, struct inode *inode) 862static void cgroup_free_fn(struct work_struct *work)
860{ 863{
861 /* is dentry a directory ? if so, kfree() associated cgroup */ 864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
862 if (S_ISDIR(inode->i_mode)) { 865 struct cgroup_subsys *ss;
863 struct cgroup *cgrp = dentry->d_fsdata;
864 struct cgroup_subsys *ss;
865 BUG_ON(!(cgroup_is_removed(cgrp)));
866 /* It's possible for external users to be holding css
867 * reference counts on a cgroup; css_put() needs to
868 * be able to access the cgroup after decrementing
869 * the reference count in order to know if it needs to
870 * queue the cgroup to be handled by the release
871 * agent */
872 synchronize_rcu();
873 866
874 mutex_lock(&cgroup_mutex); 867 mutex_lock(&cgroup_mutex);
875 /* 868 /*
876 * Release the subsystem state objects. 869 * Release the subsystem state objects.
877 */ 870 */
878 for_each_subsys(cgrp->root, ss) 871 for_each_subsys(cgrp->root, ss)
879 ss->css_free(cgrp); 872 ss->css_free(cgrp);
880 873
881 cgrp->root->number_of_cgroups--; 874 cgrp->root->number_of_cgroups--;
882 mutex_unlock(&cgroup_mutex); 875 mutex_unlock(&cgroup_mutex);
883 876
884 /* 877 /*
885 * Drop the active superblock reference that we took when we 878 * Drop the active superblock reference that we took when we
886 * created the cgroup 879 * created the cgroup
887 */ 880 */
888 deactivate_super(cgrp->root->sb); 881 deactivate_super(cgrp->root->sb);
889 882
890 /* 883 /*
891 * if we're getting rid of the cgroup, refcount should ensure 884 * if we're getting rid of the cgroup, refcount should ensure
892 * that there are no pidlists left. 885 * that there are no pidlists left.
893 */ 886 */
894 BUG_ON(!list_empty(&cgrp->pidlists)); 887 BUG_ON(!list_empty(&cgrp->pidlists));
888
889 simple_xattrs_free(&cgrp->xattrs);
890
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
892 kfree(cgrp);
893}
895 894
896 simple_xattrs_free(&cgrp->xattrs); 895static void cgroup_free_rcu(struct rcu_head *head)
896{
897 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
898
899 schedule_work(&cgrp->free_work);
900}
901
902static void cgroup_diput(struct dentry *dentry, struct inode *inode)
903{
904 /* is dentry a directory ? if so, kfree() associated cgroup */
905 if (S_ISDIR(inode->i_mode)) {
906 struct cgroup *cgrp = dentry->d_fsdata;
897 907
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 908 BUG_ON(!(cgroup_is_removed(cgrp)));
899 kfree_rcu(cgrp, rcu_head); 909 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
900 } else { 910 } else {
901 struct cfent *cfe = __d_cfe(dentry); 911 struct cfent *cfe = __d_cfe(dentry);
902 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
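
The cgroup_diput() rework above replaces kfree_rcu() with a two-stage teardown: call_rcu() waits out the RCU grace period, and because RCU callbacks run in atomic context, the callback merely queues a work item that does the cleanup that may sleep (taking cgroup_mutex, deactivate_super(), and so on). A sketch of the pattern with a hypothetical struct foo (in the actual patch the work item is initialized once in init_cgroup_housekeeping()):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct foo {
		struct rcu_head rcu_head;
		struct work_struct free_work;
		/* ... payload ... */
	};

	static void foo_free_fn(struct work_struct *work)
	{
		struct foo *f = container_of(work, struct foo, free_work);

		/* process context: may take mutexes, drop superblock refs, etc. */
		kfree(f);
	}

	static void foo_free_rcu(struct rcu_head *head)
	{
		struct foo *f = container_of(head, struct foo, rcu_head);

		/* softirq context: cannot sleep, so punt to a workqueue */
		schedule_work(&f->free_work);
	}

	static void foo_release(struct foo *f)
	{
		INIT_WORK(&f->free_work, foo_free_fn);
		call_rcu(&f->rcu_head, foo_free_rcu);
	}
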
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d)
925 dput(parent); 935 dput(parent);
926} 936}
927 937
928static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 938static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
929{ 939{
930 struct cfent *cfe; 940 struct cfent *cfe;
931 941
932 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 942 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
933 lockdep_assert_held(&cgroup_mutex); 943 lockdep_assert_held(&cgroup_mutex);
934 944
945 /*
946 * If we're doing cleanup due to failure of cgroup_create(),
947 * the corresponding @cfe may not exist.
948 */
935 list_for_each_entry(cfe, &cgrp->files, node) { 949 list_for_each_entry(cfe, &cgrp->files, node) {
936 struct dentry *d = cfe->dentry; 950 struct dentry *d = cfe->dentry;
937 951
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
944 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
945 dput(d); 959 dput(d);
946 960
947 return 0; 961 break;
948 } 962 }
949 return -ENOENT;
950} 963}
951 964
952/** 965/**
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1083 } 1096 }
1084 } 1097 }
1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1098 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1086 synchronize_rcu();
1087 1099
1088 return 0; 1100 return 0;
1089} 1101}
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1393 INIT_LIST_HEAD(&cgrp->allcg_node); 1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1394 INIT_LIST_HEAD(&cgrp->release_list); 1406 INIT_LIST_HEAD(&cgrp->release_list);
1395 INIT_LIST_HEAD(&cgrp->pidlists); 1407 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1396 mutex_init(&cgrp->pidlist_mutex); 1409 mutex_init(&cgrp->pidlist_mutex);
1397 INIT_LIST_HEAD(&cgrp->event_list); 1410 INIT_LIST_HEAD(&cgrp->event_list);
1398 spin_lock_init(&cgrp->event_list_lock); 1411 spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 struct cgroupfs_root *existing_root; 1610 struct cgroupfs_root *existing_root;
1598 const struct cred *cred; 1611 const struct cred *cred;
1599 int i; 1612 int i;
1613 struct css_set *cg;
1600 1614
1601 BUG_ON(sb->s_root != NULL); 1615 BUG_ON(sb->s_root != NULL);
1602 1616
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1650 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1651 * the css_set objects */ 1665 * the css_set objects */
1652 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
1653 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1667 hash_for_each(css_set_table, i, cg, hlist)
1654 struct hlist_head *hhead = &css_set_table[i]; 1668 link_css_set(&tmp_cg_links, cg, root_cgrp);
1655 struct hlist_node *node;
1656 struct css_set *cg;
1657
1658 hlist_for_each_entry(cg, node, hhead, hlist)
1659 link_css_set(&tmp_cg_links, cg, root_cgrp);
1660 }
1661 write_unlock(&css_set_lock); 1669 write_unlock(&css_set_lock);
1662 1670
1663 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1774 "cgroup_path() called without proper locking"); 1782 "cgroup_path() called without proper locking");
1775 1783
1776 if (!dentry || cgrp == dummytop) { 1784 if (cgrp == dummytop) {
1777 /* 1785 /*
1778 * Inactive subsystems have no dentry for their root 1786 * Inactive subsystems have no dentry for their root
1779 * cgroup 1787 * cgroup
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1982 ss->attach(cgrp, &tset); 1990 ss->attach(cgrp, &tset);
1983 } 1991 }
1984 1992
1985 synchronize_rcu();
1986out: 1993out:
1987 if (retval) { 1994 if (retval) {
1988 for_each_subsys(root, ss) { 1995 for_each_subsys(root, ss) {
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2151 /* 2158 /*
2152 * step 5: success! and cleanup 2159 * step 5: success! and cleanup
2153 */ 2160 */
2154 synchronize_rcu();
2155 retval = 0; 2161 retval = 0;
2156out_put_css_set_refs: 2162out_put_css_set_refs:
2157 if (retval) { 2163 if (retval) {
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
2637 */ 2643 */
2638static inline struct cftype *__file_cft(struct file *file) 2644static inline struct cftype *__file_cft(struct file *file)
2639{ 2645{
2640 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2646 if (file_inode(file)->i_fop != &cgroup_file_operations)
2641 return ERR_PTR(-EINVAL); 2647 return ERR_PTR(-EINVAL);
2642 return __d_cft(file->f_dentry); 2648 return __d_cft(file->f_dentry);
2643} 2649}
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue; 2776 continue;
2771 2777
2772 if (is_add) 2778 if (is_add) {
2773 err = cgroup_add_file(cgrp, subsys, cft); 2779 err = cgroup_add_file(cgrp, subsys, cft);
2774 else 2780 if (err)
2775 err = cgroup_rm_file(cgrp, cft); 2781 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2776 if (err) { 2782 cft->name, err);
2777 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2778 is_add ? "add" : "remove", cft->name, err);
2779 ret = err; 2783 ret = err;
2784 } else {
2785 cgroup_rm_file(cgrp, cft);
2780 } 2786 }
2781 } 2787 }
2782 return ret; 2788 return ret;
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
3017} 3023}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3024EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019 3025
3026/**
3027 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
3028 * @pos: cgroup of interest
3029 *
3030 * Return the rightmost descendant of @pos. If there's no descendant,
3031 * @pos is returned. This can be used during pre-order traversal to skip
3032 * subtree of @pos.
3033 */
3034struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3035{
3036 struct cgroup *last, *tmp;
3037
3038 WARN_ON_ONCE(!rcu_read_lock_held());
3039
3040 do {
3041 last = pos;
3042 /* ->prev isn't RCU safe, walk ->next till the end */
3043 pos = NULL;
3044 list_for_each_entry_rcu(tmp, &last->children, sibling)
3045 pos = tmp;
3046 } while (pos);
3047
3048 return last;
3049}
3050EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
3051
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3052static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{ 3053{
3022 struct cgroup *last; 3054 struct cgroup *last;
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
3752 remove); 3784 remove);
3753 struct cgroup *cgrp = event->cgrp; 3785 struct cgroup *cgrp = event->cgrp;
3754 3786
3787 remove_wait_queue(event->wqh, &event->wait);
3788
3755 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3789 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3756 3790
3791 /* Notify userspace the event is going away. */
3792 eventfd_signal(event->eventfd, 1);
3793
3757 eventfd_ctx_put(event->eventfd); 3794 eventfd_ctx_put(event->eventfd);
3758 kfree(event); 3795 kfree(event);
3759 dput(cgrp->dentry); 3796 dput(cgrp->dentry);
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3773 unsigned long flags = (unsigned long)key; 3810 unsigned long flags = (unsigned long)key;
3774 3811
3775 if (flags & POLLHUP) { 3812 if (flags & POLLHUP) {
3776 __remove_wait_queue(event->wqh, &event->wait);
3777 spin_lock(&cgrp->event_list_lock);
3778 list_del_init(&event->list);
3779 spin_unlock(&cgrp->event_list_lock);
3780 /* 3813 /*
3781 * We are in atomic context, but cgroup_event_remove() may 3814 * If the event has been detached at cgroup removal, we
3782 * sleep, so we have to call it in workqueue. 3815 * can simply return knowing the other side will cleanup
3816 * for us.
3817 *
3818 * We can't race against event freeing since the other
3819 * side will require wqh->lock via remove_wait_queue(),
3820 * which we hold.
3783 */ 3821 */
3784 schedule_work(&event->remove); 3822 spin_lock(&cgrp->event_list_lock);
3823 if (!list_empty(&event->list)) {
3824 list_del_init(&event->list);
3825 /*
3826 * We are in atomic context, but cgroup_event_remove()
3827 * may sleep, so we have to call it in workqueue.
3828 */
3829 schedule_work(&event->remove);
3830 }
3831 spin_unlock(&cgrp->event_list_lock);
3785 } 3832 }
3786 3833
3787 return 0; 3834 return 0;
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3807 const char *buffer) 3854 const char *buffer)
3808{ 3855{
3809 struct cgroup_event *event = NULL; 3856 struct cgroup_event *event = NULL;
3857 struct cgroup *cgrp_cfile;
3810 unsigned int efd, cfd; 3858 unsigned int efd, cfd;
3811 struct file *efile = NULL; 3859 struct file *efile = NULL;
3812 struct file *cfile = NULL; 3860 struct file *cfile = NULL;
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3852 3900
3853 /* the process need read permission on control file */ 3901 /* the process need read permission on control file */
3854 /* AV: shouldn't we check that it's been opened for read instead? */ 3902 /* AV: shouldn't we check that it's been opened for read instead? */
3855 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); 3903 ret = inode_permission(file_inode(cfile), MAY_READ);
3856 if (ret < 0) 3904 if (ret < 0)
3857 goto fail; 3905 goto fail;
3858 3906
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3862 goto fail; 3910 goto fail;
3863 } 3911 }
3864 3912
3913 /*
3914 * The file to be monitored must be in the same cgroup as
3915 * cgroup.event_control is.
3916 */
3917 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
3918 if (cgrp_cfile != cgrp) {
3919 ret = -EINVAL;
3920 goto fail;
3921 }
3922
3865 if (!event->cft->register_event || !event->cft->unregister_event) { 3923 if (!event->cft->register_event || !event->cft->unregister_event) {
3866 ret = -EINVAL; 3924 ret = -EINVAL;
3867 goto fail; 3925 goto fail;
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4135 4193
4136 init_cgroup_housekeeping(cgrp); 4194 init_cgroup_housekeeping(cgrp);
4137 4195
4196 dentry->d_fsdata = cgrp;
4197 cgrp->dentry = dentry;
4198
4138 cgrp->parent = parent; 4199 cgrp->parent = parent;
4139 cgrp->root = parent->root; 4200 cgrp->root = parent->root;
4140 cgrp->top_cgroup = parent->top_cgroup; 4201 cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4172 lockdep_assert_held(&dentry->d_inode->i_mutex); 4233 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173 4234
4174 /* allocation complete, commit to creation */ 4235 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4236 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++; 4238 root->number_of_cgroups++;
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 /* 4399 /*
4341 * Unregister events and notify userspace. 4400 * Unregister events and notify userspace.
4342 * Notify userspace about cgroup removing only after rmdir of cgroup 4401 * Notify userspace about cgroup removing only after rmdir of cgroup
4343 * directory to avoid race between userspace and kernelspace. Use 4402 * directory to avoid race between userspace and kernelspace.
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4347 */ 4403 */
4348 spin_lock(&cgrp->event_list_lock); 4404 spin_lock(&cgrp->event_list_lock);
4349 list_splice_init(&cgrp->event_list, &tmp_list); 4405 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4350 spin_unlock(&cgrp->event_list_lock);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list); 4406 list_del_init(&event->list);
4353 remove_wait_queue(event->wqh, &event->wait);
4354 eventfd_signal(event->eventfd, 1);
4355 schedule_work(&event->remove); 4407 schedule_work(&event->remove);
4356 } 4408 }
4409 spin_unlock(&cgrp->event_list_lock);
4357 4410
4358 return 0; 4411 return 0;
4359} 4412}
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4438{ 4491{
4439 struct cgroup_subsys_state *css; 4492 struct cgroup_subsys_state *css;
4440 int i, ret; 4493 int i, ret;
4494 struct hlist_node *tmp;
4495 struct css_set *cg;
4496 unsigned long key;
4441 4497
4442 /* check name and function validity */ 4498 /* check name and function validity */
4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4499 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4503 * this is all done under the css_set_lock. 4559 * this is all done under the css_set_lock.
4504 */ 4560 */
4505 write_lock(&css_set_lock); 4561 write_lock(&css_set_lock);
4506 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4562 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
4507 struct css_set *cg; 4563 /* skip entries that we already rehashed */
4508 struct hlist_node *node, *tmp; 4564 if (cg->subsys[ss->subsys_id])
4509 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4565 continue;
4510 4566 /* remove existing entry */
4511 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4567 hash_del(&cg->hlist);
4512 /* skip entries that we already rehashed */ 4568 /* set new value */
4513 if (cg->subsys[ss->subsys_id]) 4569 cg->subsys[ss->subsys_id] = css;
4514 continue; 4570 /* recompute hash and restore entry */
4515 /* remove existing entry */ 4571 key = css_set_hash(cg->subsys);
4516 hlist_del(&cg->hlist); 4572 hash_add(css_set_table, &cg->hlist, key);
4517 /* set new value */
4518 cg->subsys[ss->subsys_id] = css;
4519 /* recompute hash and restore entry */
4520 new_bucket = css_set_hash(cg->subsys);
4521 hlist_add_head(&cg->hlist, new_bucket);
4522 }
4523 } 4573 }
4524 write_unlock(&css_set_lock); 4574 write_unlock(&css_set_lock);
4525 4575
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4551void cgroup_unload_subsys(struct cgroup_subsys *ss) 4601void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552{ 4602{
4553 struct cg_cgroup_link *link; 4603 struct cg_cgroup_link *link;
4554 struct hlist_head *hhead;
4555 4604
4556 BUG_ON(ss->module == NULL); 4605 BUG_ON(ss->module == NULL);
4557 4606
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 offline_css(ss, dummytop); 4616 offline_css(ss, dummytop);
4568 ss->active = 0; 4617 ss->active = 0;
4569 4618
4570 if (ss->use_id) { 4619 if (ss->use_id)
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr); 4620 idr_destroy(&ss->idr);
4573 }
4574 4621
4575 /* deassign the subsys_id */ 4622 /* deassign the subsys_id */
4576 subsys[ss->subsys_id] = NULL; 4623 subsys[ss->subsys_id] = NULL;
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4585 write_lock(&css_set_lock); 4632 write_lock(&css_set_lock);
4586 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4633 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4587 struct css_set *cg = link->cg; 4634 struct css_set *cg = link->cg;
4635 unsigned long key;
4588 4636
4589 hlist_del(&cg->hlist); 4637 hash_del(&cg->hlist);
4590 cg->subsys[ss->subsys_id] = NULL; 4638 cg->subsys[ss->subsys_id] = NULL;
4591 hhead = css_set_hash(cg->subsys); 4639 key = css_set_hash(cg->subsys);
4592 hlist_add_head(&cg->hlist, hhead); 4640 hash_add(css_set_table, &cg->hlist, key);
4593 } 4641 }
4594 write_unlock(&css_set_lock); 4642 write_unlock(&css_set_lock);
4595 4643
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void)
4631 list_add(&init_css_set_link.cg_link_list, 4679 list_add(&init_css_set_link.cg_link_list,
4632 &init_css_set.cg_links); 4680 &init_css_set.cg_links);
4633 4681
4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4635 INIT_HLIST_HEAD(&css_set_table[i]);
4636
4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4682 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4638 struct cgroup_subsys *ss = subsys[i]; 4683 struct cgroup_subsys *ss = subsys[i];
4639 4684
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void)
4667{ 4712{
4668 int err; 4713 int err;
4669 int i; 4714 int i;
4670 struct hlist_head *hhead; 4715 unsigned long key;
4671 4716
4672 err = bdi_init(&cgroup_backing_dev_info); 4717 err = bdi_init(&cgroup_backing_dev_info);
4673 if (err) 4718 if (err)
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void)
4686 } 4731 }
4687 4732
4688 /* Add init_css_set to the hash table */ 4733 /* Add init_css_set to the hash table */
4689 hhead = css_set_hash(init_css_set.subsys); 4734 key = css_set_hash(init_css_set.subsys);
4690 hlist_add_head(&init_css_set.hlist, hhead); 4735 hash_add(css_set_table, &init_css_set.hlist, key);
4691 BUG_ON(!init_root_id(&rootnode)); 4736 BUG_ON(!init_root_id(&rootnode));
4692 4737
4693 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4738 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4982 } 5027 }
4983 task_unlock(tsk); 5028 task_unlock(tsk);
4984 5029
4985 if (cg) 5030 put_css_set_taskexit(cg);
4986 put_css_set_taskexit(cg);
4987} 5031}
4988 5032
4989/** 5033/**
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
5274static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5318static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5275{ 5319{
5276 struct css_id *newid; 5320 struct css_id *newid;
5277 int myid, error, size; 5321 int ret, size;
5278 5322
5279 BUG_ON(!ss->use_id); 5323 BUG_ON(!ss->use_id);
5280 5324
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5282 newid = kzalloc(size, GFP_KERNEL); 5326 newid = kzalloc(size, GFP_KERNEL);
5283 if (!newid) 5327 if (!newid)
5284 return ERR_PTR(-ENOMEM); 5328 return ERR_PTR(-ENOMEM);
5285 /* get id */ 5329
5286 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 5330 idr_preload(GFP_KERNEL);
5287 error = -ENOMEM;
5288 goto err_out;
5289 }
5290 spin_lock(&ss->id_lock); 5331 spin_lock(&ss->id_lock);
5291 /* Don't use 0. allocates an ID of 1-65535 */ 5332 /* Don't use 0. allocates an ID of 1-65535 */
5292 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5333 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5293 spin_unlock(&ss->id_lock); 5334 spin_unlock(&ss->id_lock);
5335 idr_preload_end();
5294 5336
5295 /* Returns error when there are no free spaces for new ID.*/ 5337 /* Returns error when there are no free spaces for new ID.*/
5296 if (error) { 5338 if (ret < 0)
5297 error = -ENOSPC;
5298 goto err_out; 5339 goto err_out;
5299 }
5300 if (myid > CSS_ID_MAX)
5301 goto remove_idr;
5302 5340
5303 newid->id = myid; 5341 newid->id = ret;
5304 newid->depth = depth; 5342 newid->depth = depth;
5305 return newid; 5343 return newid;
5306remove_idr:
5307 error = -ENOSPC;
5308 spin_lock(&ss->id_lock);
5309 idr_remove(&ss->idr, myid);
5310 spin_unlock(&ss->id_lock);
5311err_out: 5344err_out:
5312 kfree(newid); 5345 kfree(newid);
5313 return ERR_PTR(error); 5346 return ERR_PTR(ret);
5314 5347
5315} 5348}
5316 5349
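
get_new_cssid() above moves from the old idr_pre_get()/idr_get_new_above() two-step to the idr_preload()/idr_alloc() API. A sketch of the resulting pattern, using a hypothetical IDR and spinlock:

	#include <linux/idr.h>
	#include <linux/spinlock.h>
	#include <linux/gfp.h>

	static DEFINE_IDR(example_idr);
	static DEFINE_SPINLOCK(example_lock);

	static int example_assign_id(void *ptr)
	{
		int id;

		idr_preload(GFP_KERNEL);	/* may sleep; preallocates per-cpu nodes */
		spin_lock(&example_lock);
		/* lowest free ID in [1, 65536); returns the ID or a negative errno */
		id = idr_alloc(&example_idr, ptr, 1, 65536, GFP_NOWAIT);
		spin_unlock(&example_lock);
		idr_preload_end();

		return id;
	}

Because idr_alloc() takes the exclusive upper bound directly (CSS_ID_MAX + 1 in the patch), the separate CSS_ID_MAX check and the remove_idr unwind path become unnecessary.
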
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5441 struct inode *inode; 5474 struct inode *inode;
5442 struct cgroup_subsys_state *css; 5475 struct cgroup_subsys_state *css;
5443 5476
5444 inode = f->f_dentry->d_inode; 5477 inode = file_inode(f);
5445 /* check in cgroup filesystem dir */ 5478 /* check in cgroup filesystem dir */
5446 if (inode->i_op != &cgroup_dir_inode_operations) 5479 if (inode->i_op != &cgroup_dir_inode_operations)
5447 return ERR_PTR(-EBADF); 5480 return ERR_PTR(-EBADF);
diff --git a/kernel/compat.c b/kernel/compat.c
index 36700e9e2be9..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); 290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
291} 291}
292 292
293asmlinkage long compat_sys_getitimer(int which, 293COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
294 struct compat_itimerval __user *it) 294 struct compat_itimerval __user *, it)
295{ 295{
296 struct itimerval kit; 296 struct itimerval kit;
297 int error; 297 int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
302 return error; 302 return error;
303} 303}
304 304
305asmlinkage long compat_sys_setitimer(int which, 305COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
306 struct compat_itimerval __user *in, 306 struct compat_itimerval __user *, in,
307 struct compat_itimerval __user *out) 307 struct compat_itimerval __user *, out)
308{ 308{
309 struct itimerval kin, kout; 309 struct itimerval kin, kout;
310 int error; 310 int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
381 memcpy(blocked->sig, &set, sizeof(set)); 381 memcpy(blocked->sig, &set, sizeof(set));
382} 382}
383 383
384asmlinkage long compat_sys_sigprocmask(int how, 384COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
385 compat_old_sigset_t __user *nset, 385 compat_old_sigset_t __user *, nset,
386 compat_old_sigset_t __user *oset) 386 compat_old_sigset_t __user *, oset)
387{ 387{
388 old_sigset_t old_set, new_set; 388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked; 389 sigset_t new_blocked;
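
The compat entry points in this file are being converted from open-coded asmlinkage definitions to the COMPAT_SYSCALL_DEFINEn() macros, mirroring SYSCALL_DEFINEn() for native syscalls. A sketch of the shape of such a conversion for a hypothetical example_call (not an actual syscall):

	#include <linux/compat.h>

	/*
	 * Old style, as removed above:
	 *
	 *	asmlinkage long compat_sys_example_call(int flags,
	 *						compat_ulong_t __user *arg);
	 */

	/* New style: type/name pairs are handed to the macro, which emits the
	 * compat_sys_example_call() entry point with a uniform calling convention. */
	COMPAT_SYSCALL_DEFINE2(example_call, int, flags, compat_ulong_t __user *, arg)
	{
		return 0;	/* real body elided in this sketch */
	}
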
@@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
593 else 593 else
594 ret = put_compat_rusage(&ru, uru); 594 ret = put_compat_rusage(&ru, uru);
595 if (ret) 595 if (ret)
596 return ret; 596 return -EFAULT;
597 } 597 }
598 598
599 BUG_ON(info.si_code & __SI_MASK); 599 BUG_ON(info.si_code & __SI_MASK);
@@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
971} 971}
972 972
973void 973void
974sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 974sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
975{ 975{
976 switch (_NSIG_WORDS) { 976 switch (_NSIG_WORDS) {
977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -982,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
982} 982}
983EXPORT_SYMBOL_GPL(sigset_from_compat); 983EXPORT_SYMBOL_GPL(sigset_from_compat);
984 984
985asmlinkage long 985void
986compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 986sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
987 struct compat_siginfo __user *uinfo, 987{
988 struct compat_timespec __user *uts, compat_size_t sigsetsize) 988 switch (_NSIG_WORDS) {
989 case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
990 case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
991 case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
992 case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
993 }
994}
995
996COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
997 struct compat_siginfo __user *, uinfo,
998 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
989{ 999{
990 compat_sigset_t s32; 1000 compat_sigset_t s32;
991 sigset_t s; 1001 sigset_t s;
@@ -1013,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
1013 } 1023 }
1014 1024
1015 return ret; 1025 return ret;
1016
1017}
1018
1019asmlinkage long
1020compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
1021 struct compat_siginfo __user *uinfo)
1022{
1023 siginfo_t info;
1024
1025 if (copy_siginfo_from_user32(&info, uinfo))
1026 return -EFAULT;
1027 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
1028} 1026}
1029 1027
1030#ifdef __ARCH_WANT_COMPAT_SYS_TIME 1028#ifdef __ARCH_WANT_COMPAT_SYS_TIME
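
As a quick worked example of the sigset_to_compat() helper added above: on a 64-bit kernel _NSIG_WORDS is 1, so only the final case runs, and a native set->sig[0] of 0x0000000100000002 is split into compat->sig[1] = 0x00000001 (the high half) and compat->sig[0] = 0x00000002 (the low half); sigset_from_compat() performs the inverse shift-and-or.
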
@@ -1067,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
1067 1065
1068#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 1066#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1069 1067
1070#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
1071asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
1072{
1073 sigset_t newset;
1074 compat_sigset_t newset32;
1075
1076 /* XXX: Don't preclude handling different sized sigset_t's. */
1077 if (sigsetsize != sizeof(sigset_t))
1078 return -EINVAL;
1079
1080 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1081 return -EFAULT;
1082 sigset_from_compat(&newset, &newset32);
1083 return sigsuspend(&newset);
1084}
1085#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1086
1087asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1068asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
1088{ 1069{
1089 struct timex txc; 1070 struct timex txc;
@@ -1222,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1222 return 0; 1203 return 0;
1223} 1204}
1224 1205
1225#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL 1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1226asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, 1207 compat_pid_t, pid,
1227 struct compat_timespec __user *interval) 1208 struct compat_timespec __user *, interval)
1228{ 1209{
1229 struct timespec t; 1210 struct timespec t;
1230 int ret; 1211 int ret;
@@ -1237,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1237 return -EFAULT; 1218 return -EFAULT;
1238 return ret; 1219 return ret;
1239} 1220}
1240#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1241 1221
1242/* 1222/*
1243 * Allocate user-space memory for the duration of a single system call, 1223 * Allocate user-space memory for the duration of a single system call,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
72 * Tracks how many cpusets are currently defined in system. 64 * Tracks how many cpusets are currently defined in system.
73 * When there is only one cpuset (the root cpuset) we can 65 * When there is only one cpuset (the root cpuset) we can
74 * short circuit some hooks. 66 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
97 89
98 struct cpuset *parent; /* my parent */
99
100 struct fmeter fmeter; /* memory_pressure filter */ 90 struct fmeter fmeter; /* memory_pressure filter */
101 91
92 /*
93 * Tasks are being attached to this cpuset. Used to prevent
94 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
95 */
96 int attach_in_progress;
97
102 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
103 int pn; 99 int pn;
104 100
105 /* for custom sched domain */ 101 /* for custom sched domain */
106 int relax_domain_level; 102 int relax_domain_level;
107 103
108 /* used for walking a cpuset hierarchy */ 104 struct work_struct hotplug_work;
109 struct list_head stack_list;
110}; 105};
111 106
112/* Retrieve the cpuset for a cgroup */ 107/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 118 struct cpuset, css);
124} 119}
125 120
121static inline struct cpuset *parent_cs(const struct cpuset *cs)
122{
123 struct cgroup *pcgrp = cs->css.cgroup->parent;
124
125 if (pcgrp)
126 return cgroup_cs(pcgrp);
127 return NULL;
128}
129
126#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task) 131static inline bool task_has_mempolicy(struct task_struct *task)
128{ 132{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
138 142
139/* bits in struct cpuset flags field */ 143/* bits in struct cpuset flags field */
140typedef enum { 144typedef enum {
145 CS_ONLINE,
141 CS_CPU_EXCLUSIVE, 146 CS_CPU_EXCLUSIVE,
142 CS_MEM_EXCLUSIVE, 147 CS_MEM_EXCLUSIVE,
143 CS_MEM_HARDWALL, 148 CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 152 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 153} cpuset_flagbits_t;
149 154
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 155/* convenient tests for these bits */
156static inline bool is_cpuset_online(const struct cpuset *cs)
157{
158 return test_bit(CS_ONLINE, &cs->flags);
159}
160
157static inline int is_cpu_exclusive(const struct cpuset *cs) 161static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 162{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 163 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
190} 194}
191 195
192static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
193 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
198 (1 << CS_MEM_EXCLUSIVE)),
194}; 199};
195 200
201/**
202 * cpuset_for_each_child - traverse online children of a cpuset
203 * @child_cs: loop cursor pointing to the current child
204 * @pos_cgrp: used for iteration
205 * @parent_cs: target cpuset to walk children of
206 *
207 * Walk @child_cs through the online children of @parent_cs. Must be used
208 * with RCU read locked.
209 */
210#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
211 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
212 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
213
214/**
215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
216 * @des_cs: loop cursor pointing to the current descendant
217 * @pos_cgrp: used for iteration
218 * @root_cs: target cpuset to walk ancestor of
219 *
220 * Walk @des_cs through the online descendants of @root_cs. Must be used
221 * with RCU read locked. The caller may modify @pos_cgrp by calling
222 * cgroup_rightmost_descendant() to skip subtree.
223 */
224#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
225 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
226 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
227
196/* 228/*
197 * There are two global mutexes guarding cpuset structures. The first 229 * There are two global mutexes guarding cpuset structures - cpuset_mutex
198 * is the main control groups cgroup_mutex, accessed via 230 * and callback_mutex. The latter may nest inside the former. We also
199 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 231 * require taking task_lock() when dereferencing a task's cpuset pointer.
200 * callback_mutex, below. They can nest. It is ok to first take 232 * See "The task_lock() exception", at the end of this comment.
201 * cgroup_mutex, then nest callback_mutex. We also require taking 233 *
202 * task_lock() when dereferencing a task's cpuset pointer. See "The 234 * A task must hold both mutexes to modify cpusets. If a task holds
203 * task_lock() exception", at the end of this comment. 235 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
204 * 236 * is the only task able to also acquire callback_mutex and be able to
205 * A task must hold both mutexes to modify cpusets. If a task 237 * modify cpusets. It can perform various checks on the cpuset structure
206 * holds cgroup_mutex, then it blocks others wanting that mutex, 238 * first, knowing nothing will change. It can also allocate memory while
207 * ensuring that it is the only task able to also acquire callback_mutex 239 * just holding cpuset_mutex. While it is performing these checks, various
208 * and be able to modify cpusets. It can perform various checks on 240 * callback routines can briefly acquire callback_mutex to query cpusets.
209 * the cpuset structure first, knowing nothing will change. It can 241 * Once it is ready to make the changes, it takes callback_mutex, blocking
210 * also allocate memory while just holding cgroup_mutex. While it is 242 * everyone else.
211 * performing these checks, various callback routines can briefly
212 * acquire callback_mutex to query cpusets. Once it is ready to make
213 * the changes, it takes callback_mutex, blocking everyone else.
214 * 243 *
215 * Calls to the kernel memory allocator can not be made while holding 244 * Calls to the kernel memory allocator can not be made while holding
216 * callback_mutex, as that would risk double tripping on callback_mutex 245 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
232 * guidelines for accessing subsystem state in kernel/cgroup.c 261 * guidelines for accessing subsystem state in kernel/cgroup.c
233 */ 262 */
234 263
264static DEFINE_MUTEX(cpuset_mutex);
235static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
236 266
237/* 267/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
246static DEFINE_SPINLOCK(cpuset_buffer_lock); 276static DEFINE_SPINLOCK(cpuset_buffer_lock);
247 277
248/* 278/*
279 * CPU / memory hotplug is handled asynchronously.
280 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq;
282
283static void cpuset_hotplug_workfn(struct work_struct *work);
284static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
285static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
286
287static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
288
289/*
249 * This is ugly, but preserves the userspace API for existing cpuset 290 * This is ugly, but preserves the userspace API for existing cpuset
250 * users. If someone tries to mount the "cpuset" filesystem, we 291 * users. If someone tries to mount the "cpuset" filesystem, we
251 * silently switch it to mount "cgroup" instead 292 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask) 330 struct cpumask *pmask)
290{ 331{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 332 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent; 333 cs = parent_cs(cs);
293 if (cs) 334 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 335 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else 336 else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 355{
315 while (cs && !nodes_intersects(cs->mems_allowed, 356 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_MEMORY])) 357 node_states[N_MEMORY]))
317 cs = cs->parent; 358 cs = parent_cs(cs);
318 if (cs) 359 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 360 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_MEMORY]); 361 node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
326/* 367/*
327 * update task's spread flag if cpuset's page/slab spread flag is set 368 * update task's spread flag if cpuset's page/slab spread flag is set
328 * 369 *
329 * Called with callback_mutex/cgroup_mutex held 370 * Called with callback_mutex/cpuset_mutex held
330 */ 371 */
331static void cpuset_update_task_spread_flag(struct cpuset *cs, 372static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk) 373 struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
346 * 387 *
347 * One cpuset is a subset of another if all its allowed CPUs and 388 * One cpuset is a subset of another if all its allowed CPUs and
348 * Memory Nodes are a subset of the other, and its exclusive flags 389 * Memory Nodes are a subset of the other, and its exclusive flags
349 * are only set if the other's are set. Call holding cgroup_mutex. 390 * are only set if the other's are set. Call holding cpuset_mutex.
350 */ 391 */
351 392
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 393static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
395 * If we replaced the flag and mask values of the current cpuset 436 * If we replaced the flag and mask values of the current cpuset
396 * (cur) with those values in the trial cpuset (trial), would 437 * (cur) with those values in the trial cpuset (trial), would
397 * our various subset and exclusive rules still be valid? Presumes 438 * our various subset and exclusive rules still be valid? Presumes
398 * cgroup_mutex held. 439 * cpuset_mutex held.
399 * 440 *
400 * 'cur' is the address of an actual, in-use cpuset. Operations 441 * 'cur' is the address of an actual, in-use cpuset. Operations
401 * such as list traversal that depend on the actual address of the 442 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{ 453{
413 struct cgroup *cont; 454 struct cgroup *cont;
414 struct cpuset *c, *par; 455 struct cpuset *c, *par;
456 int ret;
457
458 rcu_read_lock();
415 459
416 /* Each of our child cpusets must be a subset of us */ 460 /* Each of our child cpusets must be a subset of us */
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 461 ret = -EBUSY;
418 if (!is_cpuset_subset(cgroup_cs(cont), trial)) 462 cpuset_for_each_child(c, cont, cur)
419 return -EBUSY; 463 if (!is_cpuset_subset(c, trial))
420 } 464 goto out;
421 465
422 /* Remaining checks don't apply to root cpuset */ 466 /* Remaining checks don't apply to root cpuset */
467 ret = 0;
423 if (cur == &top_cpuset) 468 if (cur == &top_cpuset)
424 return 0; 469 goto out;
425 470
426 par = cur->parent; 471 par = parent_cs(cur);
427 472
428 /* We must be a subset of our parent cpuset */ 473 /* We must be a subset of our parent cpuset */
474 ret = -EACCES;
429 if (!is_cpuset_subset(trial, par)) 475 if (!is_cpuset_subset(trial, par))
430 return -EACCES; 476 goto out;
431 477
432 /* 478 /*
433 * If either I or some sibling (!= me) is exclusive, we can't 479 * If either I or some sibling (!= me) is exclusive, we can't
434 * overlap 480 * overlap
435 */ 481 */
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 482 ret = -EINVAL;
437 c = cgroup_cs(cont); 483 cpuset_for_each_child(c, cont, par) {
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 484 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur && 485 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 486 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL; 487 goto out;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 488 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur && 489 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 490 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL; 491 goto out;
446 } 492 }
447 493
448 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 494 /*
449 if (cgroup_task_count(cur->css.cgroup)) { 495 * Cpusets with tasks - existing or newly being attached - can't
450 if (cpumask_empty(trial->cpus_allowed) || 496 * have empty cpus_allowed or mems_allowed.
451 nodes_empty(trial->mems_allowed)) { 497 */
452 return -ENOSPC; 498 ret = -ENOSPC;
453 } 499 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
454 } 500 (cpumask_empty(trial->cpus_allowed) ||
501 nodes_empty(trial->mems_allowed)))
502 goto out;
455 503
456 return 0; 504 ret = 0;
505out:
506 rcu_read_unlock();
507 return ret;
457} 508}
458 509
459#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
474 return; 525 return;
475} 526}
476 527
477static void 528static void update_domain_attr_tree(struct sched_domain_attr *dattr,
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 529 struct cpuset *root_cs)
479{ 530{
480 LIST_HEAD(q); 531 struct cpuset *cp;
481 532 struct cgroup *pos_cgrp;
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490 533
491 if (cpumask_empty(cp->cpus_allowed)) 534 rcu_read_lock();
535 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
536 /* skip the whole subtree if @cp doesn't have any CPU */
537 if (cpumask_empty(cp->cpus_allowed)) {
538 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
492 continue; 539 continue;
540 }
493 541
494 if (is_sched_load_balance(cp)) 542 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp); 543 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 } 544 }
545 rcu_read_unlock();
502} 546}
503 547
504/* 548/*
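update_domain_attr_tree() now relies on cpuset_for_each_descendant_pre() plus cgroup_rightmost_descendant() to prune a subtree instead of managing its own list-based queue. The sketch below shows the same pre-order-with-pruning idea over a generic n-ary tree using an explicit stack; the node layout and the 64-entry depth cap are assumptions made for brevity, not anything taken from the cgroup iterator.

#include <stdbool.h>
#include <stddef.h>

struct node {
	bool has_cpus;
	struct node **children;
	size_t nr_children;
};

/* Pre-order walk that prunes whole subtrees: not pushing a node's
 * children is the explicit-stack equivalent of jumping to the
 * rightmost descendant in the cgroup iterator. */
void walk_pre(struct node *root, void (*visit)(struct node *))
{
	struct node *stack[64];		/* assumed depth bound for the sketch */
	size_t top = 0;

	if (root)
		stack[top++] = root;

	while (top) {
		struct node *n = stack[--top];

		if (!n->has_cpus)
			continue;	/* skip n and everything below it */

		visit(n);

		for (size_t i = n->nr_children; i && top < 64; i--)
			stack[top++] = n->children[i - 1];
	}
}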
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
520 * domains when operating in the severe memory shortage situations 564 * domains when operating in the severe memory shortage situations
521 * that could cause allocation failures below. 565 * that could cause allocation failures below.
522 * 566 *
523 * Must be called with cgroup_lock held. 567 * Must be called with cpuset_mutex held.
524 * 568 *
525 * The three key local variables below are: 569 * The three key local variables below are:
526 * q - a linked-list queue of cpuset pointers, used to implement a 570 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
558static int generate_sched_domains(cpumask_var_t **domains, 602static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes) 603 struct sched_domain_attr **attributes)
560{ 604{
561 LIST_HEAD(q); /* queue of cpusets to be scanned */
562 struct cpuset *cp; /* scans q */ 605 struct cpuset *cp; /* scans q */
563 struct cpuset **csa; /* array of all cpuset ptrs */ 606 struct cpuset **csa; /* array of all cpuset ptrs */
564 int csn; /* how many cpuset ptrs in csa so far */ 607 int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
567 struct sched_domain_attr *dattr; /* attributes for custom domains */ 610 struct sched_domain_attr *dattr; /* attributes for custom domains */
568 int ndoms = 0; /* number of sched domains in result */ 611 int ndoms = 0; /* number of sched domains in result */
569 int nslot; /* next empty doms[] struct cpumask slot */ 612 int nslot; /* next empty doms[] struct cpumask slot */
613 struct cgroup *pos_cgrp;
570 614
571 doms = NULL; 615 doms = NULL;
572 dattr = NULL; 616 dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
594 goto done; 638 goto done;
595 csn = 0; 639 csn = 0;
596 640
597 list_add(&top_cpuset.stack_list, &q); 641 rcu_read_lock();
598 while (!list_empty(&q)) { 642 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
599 struct cgroup *cont;
600 struct cpuset *child; /* scans child cpusets of cp */
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608 /* 643 /*
609 * All child cpusets contain a subset of the parent's cpus, so 644 * Continue traversing beyond @cp iff @cp has some CPUs and
610 * just skip them, and then we call update_domain_attr_tree() 645 * isn't load balancing. The former is obvious. The
611 * to calc relax_domain_level of the corresponding sched 646 * latter: All child cpusets contain a subset of the
612 * domain. 647 * parent's cpus, so just skip them, and then we call
648 * update_domain_attr_tree() to calc relax_domain_level of
649 * the corresponding sched domain.
613 */ 650 */
614 if (is_sched_load_balance(cp)) { 651 if (!cpumask_empty(cp->cpus_allowed) &&
615 csa[csn++] = cp; 652 !is_sched_load_balance(cp))
616 continue; 653 continue;
617 }
618 654
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 655 if (is_sched_load_balance(cp))
620 child = cgroup_cs(cont); 656 csa[csn++] = cp;
621 list_add_tail(&child->stack_list, &q); 657
622 } 658 /* skip @cp's subtree */
623 } 659 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
660 }
661 rcu_read_unlock();
624 662
625 for (i = 0; i < csn; i++) 663 for (i = 0; i < csn; i++)
626 csa[i]->pn = i; 664 csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
725/* 763/*
726 * Rebuild scheduler domains. 764 * Rebuild scheduler domains.
727 * 765 *
728 * Call with neither cgroup_mutex held nor within get_online_cpus(). 766 * If the flag 'sched_load_balance' of any cpuset with non-empty
729 * Takes both cgroup_mutex and get_online_cpus(). 767 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
768 * which has that flag enabled, or if any cpuset with a non-empty
769 * 'cpus' is removed, then call this routine to rebuild the
770 * scheduler's dynamic sched domains.
730 * 771 *
731 * Cannot be directly called from cpuset code handling changes 772 * Call with cpuset_mutex held. Takes get_online_cpus().
732 * to the cpuset pseudo-filesystem, because it cannot be called
733 * from code that already holds cgroup_mutex.
734 */ 773 */
735static void do_rebuild_sched_domains(struct work_struct *unused) 774static void rebuild_sched_domains_locked(void)
736{ 775{
737 struct sched_domain_attr *attr; 776 struct sched_domain_attr *attr;
738 cpumask_var_t *doms; 777 cpumask_var_t *doms;
739 int ndoms; 778 int ndoms;
740 779
780 lockdep_assert_held(&cpuset_mutex);
741 get_online_cpus(); 781 get_online_cpus();
742 782
743 /* Generate domain masks and attrs */ 783 /* Generate domain masks and attrs */
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr); 784 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747 785
748 /* Have scheduler rebuild the domains */ 786 /* Have scheduler rebuild the domains */
749 partition_sched_domains(ndoms, doms, attr); 787 partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
751 put_online_cpus(); 789 put_online_cpus();
752} 790}
753#else /* !CONFIG_SMP */ 791#else /* !CONFIG_SMP */
754static void do_rebuild_sched_domains(struct work_struct *unused) 792static void rebuild_sched_domains_locked(void)
755{ 793{
756} 794}
757 795
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
763} 801}
764#endif /* CONFIG_SMP */ 802#endif /* CONFIG_SMP */
765 803
766static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768/*
769 * Rebuild scheduler domains, asynchronously via workqueue.
770 *
771 * If the flag 'sched_load_balance' of any cpuset with non-empty
772 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
773 * which has that flag enabled, or if any cpuset with a non-empty
774 * 'cpus' is removed, then call this routine to rebuild the
775 * scheduler's dynamic sched domains.
776 *
777 * The rebuild_sched_domains() and partition_sched_domains()
778 * routines must nest cgroup_lock() inside get_online_cpus(),
779 * but such cpuset changes as these must nest that locking the
780 * other way, holding cgroup_lock() for much of the code.
781 *
782 * So in order to avoid an ABBA deadlock, the cpuset code handling
783 * these user changes delegates the actual sched domain rebuilding
784 * to a separate workqueue thread, which ends up processing the
785 * above do_rebuild_sched_domains() function.
786 */
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792/*
793 * Accomplishes the same scheduler domain rebuild as the above
794 * async_rebuild_sched_domains(), however it directly calls the
795 * rebuild routine synchronously rather than calling it via an
796 * asynchronous work thread.
797 *
798 * This can only be called from code that is not holding
799 * cgroup_mutex (not nested in a cgroup_lock() call.)
800 */
801void rebuild_sched_domains(void) 804void rebuild_sched_domains(void)
802{ 805{
803 do_rebuild_sched_domains(NULL); 806 mutex_lock(&cpuset_mutex);
807 rebuild_sched_domains_locked();
808 mutex_unlock(&cpuset_mutex);
804} 809}
805 810
806/** 811/**
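async_rebuild_sched_domains() disappears in this hunk: rebuild_sched_domains() becomes a thin wrapper that takes cpuset_mutex and calls rebuild_sched_domains_locked(), which asserts the lock with lockdep. A small sketch of that locked/unlocked split (pthread mutex instead of cpuset_mutex; function names are illustrative):

#include <pthread.h>

static pthread_mutex_t cfg_mutex = PTHREAD_MUTEX_INITIALIZER;
static int generation;

/* Internal variant: the caller must already hold cfg_mutex (the kernel
 * documents and enforces this with lockdep_assert_held(&cpuset_mutex)). */
static void rebuild_locked(void)
{
	generation++;		/* ...recompute derived state under the lock... */
}

/* Public entry point for callers that hold no locks at all. */
void rebuild(void)
{
	pthread_mutex_lock(&cfg_mutex);
	rebuild_locked();
	pthread_mutex_unlock(&cfg_mutex);
}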
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
808 * @tsk: task to test 813 * @tsk: task to test
809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 814 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
810 * 815 *
811 * Call with cgroup_mutex held. May take callback_mutex during call. 816 * Call with cpuset_mutex held. May take callback_mutex during call.
812 * Called for each task in a cgroup by cgroup_scan_tasks(). 817 * Called for each task in a cgroup by cgroup_scan_tasks().
813 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 818 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
814 * words, if its mask is not equal to its cpuset's mask). 819 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829 * cpus_allowed mask needs to be changed. 834 * cpus_allowed mask needs to be changed.
830 * 835 *
831 * We don't need to re-check for the cgroup/cpuset membership, since we're 836 * We don't need to re-check for the cgroup/cpuset membership, since we're
832 * holding cgroup_lock() at this point. 837 * holding cpuset_mutex at this point.
833 */ 838 */
834static void cpuset_change_cpumask(struct task_struct *tsk, 839static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan) 840 struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 847 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 848 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
844 * 849 *
845 * Called with cgroup_mutex held 850 * Called with cpuset_mutex held
846 * 851 *
847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 852 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
848 * calling callback functions for each. 853 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 heap_free(&heap); 925 heap_free(&heap);
921 926
922 if (is_load_balanced) 927 if (is_load_balanced)
923 async_rebuild_sched_domains(); 928 rebuild_sched_domains_locked();
924 return 0; 929 return 0;
925} 930}
926 931
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
932 * Temporarilly set tasks mems_allowed to target nodes of migration, 937 * Temporarilly set tasks mems_allowed to target nodes of migration,
933 * so that the migration code can allocate pages on these nodes. 938 * so that the migration code can allocate pages on these nodes.
934 * 939 *
935 * Call holding cgroup_mutex, so current's cpuset won't change 940 * Call holding cpuset_mutex, so current's cpuset won't change
936 * during this call, as manage_mutex holds off any cpuset_attach() 941 * during this call, as manage_mutex holds off any cpuset_attach()
937 * calls. Therefore we don't need to take task_lock around the 942 * calls. Therefore we don't need to take task_lock around the
938 * call to guarantee_online_mems(), as we know no one is changing 943 * call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007/* 1012/*
1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1013 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1014 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1010 * memory_migrate flag is set. Called with cgroup_mutex held. 1015 * memory_migrate flag is set. Called with cpuset_mutex held.
1011 */ 1016 */
1012static void cpuset_change_nodemask(struct task_struct *p, 1017static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan) 1018 struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1021 struct cpuset *cs;
1017 int migrate; 1022 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1023 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems; /* protected by cgroup_mutex */ 1024 static nodemask_t newmems; /* protected by cpuset_mutex */
1020 1025
1021 cs = cgroup_cs(scan->cg); 1026 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems); 1027 guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
1043 * @oldmem: old mems_allowed of cpuset cs 1048 * @oldmem: old mems_allowed of cpuset cs
1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1049 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1045 * 1050 *
1046 * Called with cgroup_mutex held 1051 * Called with cpuset_mutex held
1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1052 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1048 * if @heap != NULL. 1053 * if @heap != NULL.
1049 */ 1054 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1065 * take while holding tasklist_lock. Forks can happen - the 1070 * take while holding tasklist_lock. Forks can happen - the
1066 * mpol_dup() cpuset_being_rebound check will catch such forks, 1071 * mpol_dup() cpuset_being_rebound check will catch such forks,
1067 * and rebind their vma mempolicies too. Because we still hold 1072 * and rebind their vma mempolicies too. Because we still hold
1068 * the global cgroup_mutex, we know that no other rebind effort 1073 * the global cpuset_mutex, we know that no other rebind effort
1069 * will be contending for the global variable cpuset_being_rebound. 1074 * will be contending for the global variable cpuset_being_rebound.
1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1075 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1071 * is idempotent. Also migrate pages in each mm to new nodes. 1076 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1084 * mempolicies and if the cpuset is marked 'memory_migrate', 1089 * mempolicies and if the cpuset is marked 'memory_migrate',
1085 * migrate the tasks pages to the new memory. 1090 * migrate the tasks pages to the new memory.
1086 * 1091 *
1087 * Call with cgroup_mutex held. May take callback_mutex during call. 1092 * Call with cpuset_mutex held. May take callback_mutex during call.
1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1093 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1089 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1094 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1090 * their mempolicies to the cpusets new mems_allowed. 1095 * their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1168 cs->relax_domain_level = val; 1173 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) && 1174 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs)) 1175 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1176 rebuild_sched_domains_locked();
1172 } 1177 }
1173 1178
1174 return 0; 1179 return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182 * Called by cgroup_scan_tasks() for each task in a cgroup. 1187 * Called by cgroup_scan_tasks() for each task in a cgroup.
1183 * 1188 *
1184 * We don't need to re-check for the cgroup/cpuset membership, since we're 1189 * We don't need to re-check for the cgroup/cpuset membership, since we're
1185 * holding cgroup_lock() at this point. 1190 * holding cpuset_mutex at this point.
1186 */ 1191 */
1187static void cpuset_change_flag(struct task_struct *tsk, 1192static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan) 1193 struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1195 * @cs: the cpuset in which each task's spread flags needs to be changed 1200 * @cs: the cpuset in which each task's spread flags needs to be changed
1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1201 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1197 * 1202 *
1198 * Called with cgroup_mutex held 1203 * Called with cpuset_mutex held
1199 * 1204 *
1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1205 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1201 * calling callback functions for each. 1206 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1220 * cs: the cpuset to update 1225 * cs: the cpuset to update
1221 * turning_on: whether the flag is being set or cleared 1226 * turning_on: whether the flag is being set or cleared
1222 * 1227 *
1223 * Call with cgroup_mutex held. 1228 * Call with cpuset_mutex held.
1224 */ 1229 */
1225 1230
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1231static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1260 mutex_unlock(&callback_mutex); 1265 mutex_unlock(&callback_mutex);
1261 1266
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1267 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains(); 1268 rebuild_sched_domains_locked();
1264 1269
1265 if (spread_flag_changed) 1270 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap); 1271 update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1373 return val;
1369} 1374}
1370 1375
1371/* 1376/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must
1374 * persist until attach.
1375 */
1376static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to;
1379
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1377static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{ 1378{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1379 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task; 1380 struct task_struct *task;
1385 int ret; 1381 int ret;
1386 1382
1383 mutex_lock(&cpuset_mutex);
1384
1385 ret = -ENOSPC;
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC; 1387 goto out_unlock;
1389 1388
1390 cgroup_taskset_for_each(task, cgrp, tset) { 1389 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /* 1390 /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1397 * set_cpus_allowed_ptr() on all attached tasks before 1396 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed. 1397 * cpus_allowed may be changed.
1399 */ 1398 */
1399 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL; 1401 goto out_unlock;
1402 if ((ret = security_task_setscheduler(task))) 1402 ret = security_task_setscheduler(task);
1403 return ret; 1403 if (ret)
1404 goto out_unlock;
1404 } 1405 }
1405 1406
1406 /* prepare for attach */ 1407 /*
1407 if (cs == &top_cpuset) 1408 * Mark attach is in progress. This makes validate_change() fail
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1409 * changes which zero cpus/mems_allowed.
1409 else 1410 */
1410 guarantee_online_cpus(cs, cpus_attach); 1411 cs->attach_in_progress++;
1411 1412 ret = 0;
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1413out_unlock:
1414 mutex_unlock(&cpuset_mutex);
1415 return ret;
1416}
1413 1417
1414 return 0; 1418static void cpuset_cancel_attach(struct cgroup *cgrp,
1419 struct cgroup_taskset *tset)
1420{
1421 mutex_lock(&cpuset_mutex);
1422 cgroup_cs(cgrp)->attach_in_progress--;
1423 mutex_unlock(&cpuset_mutex);
1415} 1424}
1416 1425
1426/*
1427 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1428 * but we can't allocate it dynamically there. Define it global and
1429 * allocate from cpuset_init().
1430 */
1431static cpumask_var_t cpus_attach;
1432
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1433static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{ 1434{
1435 /* static bufs protected by cpuset_mutex */
1436 static nodemask_t cpuset_attach_nodemask_from;
1437 static nodemask_t cpuset_attach_nodemask_to;
1419 struct mm_struct *mm; 1438 struct mm_struct *mm;
1420 struct task_struct *task; 1439 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset); 1440 struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp); 1442 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1443 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1444
1445 mutex_lock(&cpuset_mutex);
1446
1447 /* prepare for attach */
1448 if (cs == &top_cpuset)
1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1450 else
1451 guarantee_online_cpus(cs, cpus_attach);
1452
1453 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1454
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1455 cgroup_taskset_for_each(task, cgrp, tset) {
1427 /* 1456 /*
1428 * can_attach beforehand should guarantee that this doesn't 1457 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1448 &cpuset_attach_nodemask_to); 1477 &cpuset_attach_nodemask_to);
1449 mmput(mm); 1478 mmput(mm);
1450 } 1479 }
1480
1481 cs->attach_in_progress--;
1482
1483 /*
1484 * We may have raced with CPU/memory hotunplug. Trigger hotplug
1485 * propagation if @cs doesn't have any CPU or memory. It will move
1486 * the newly added tasks to the nearest parent which can execute.
1487 */
1488 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1489 schedule_cpuset_propagate_hotplug(cs);
1490
1491 mutex_unlock(&cpuset_mutex);
1451} 1492}
1452 1493
1453/* The various types of files and directories in a cpuset file system */ 1494/* The various types of files and directories in a cpuset file system */
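The can_attach/cancel_attach/attach trio above turns attachment into a three-phase transaction: reserve under cpuset_mutex by bumping attach_in_progress, undo the reservation if cgroup core aborts, and drop it only once the tasks have actually been moved. A stripped-down sketch of that pattern with an ordinary mutex and invented struct/field names:

#include <errno.h>
#include <pthread.h>

struct group {
	pthread_mutex_t lock;
	int attach_in_progress;	/* reservations not yet committed */
	int nr_cpus;
};

/* Phase 1: validate and reserve.  While attach_in_progress > 0, other
 * writers must reject changes that would leave the group empty. */
int group_can_attach(struct group *g)
{
	int ret = -ENOSPC;

	pthread_mutex_lock(&g->lock);
	if (g->nr_cpus > 0) {
		g->attach_in_progress++;
		ret = 0;
	}
	pthread_mutex_unlock(&g->lock);
	return ret;
}

/* Phase 2a: the framework aborted; give the reservation back. */
void group_cancel_attach(struct group *g)
{
	pthread_mutex_lock(&g->lock);
	g->attach_in_progress--;
	pthread_mutex_unlock(&g->lock);
}

/* Phase 2b: commit; the reservation is dropped only after the move. */
void group_attach(struct group *g)
{
	pthread_mutex_lock(&g->lock);
	/* ...migrate tasks into the group here... */
	g->attach_in_progress--;
	pthread_mutex_unlock(&g->lock);
}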
@@ -1469,12 +1510,13 @@ typedef enum {
1469 1510
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1511static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{ 1512{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp); 1513 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private; 1514 cpuset_filetype_t type = cft->private;
1515 int retval = -ENODEV;
1475 1516
1476 if (!cgroup_lock_live_group(cgrp)) 1517 mutex_lock(&cpuset_mutex);
1477 return -ENODEV; 1518 if (!is_cpuset_online(cs))
1519 goto out_unlock;
1478 1520
1479 switch (type) { 1521 switch (type) {
1480 case FILE_CPU_EXCLUSIVE: 1522 case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1508 retval = -EINVAL; 1550 retval = -EINVAL;
1509 break; 1551 break;
1510 } 1552 }
1511 cgroup_unlock(); 1553out_unlock:
1554 mutex_unlock(&cpuset_mutex);
1512 return retval; 1555 return retval;
1513} 1556}
1514 1557
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1558static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{ 1559{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp); 1560 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private; 1561 cpuset_filetype_t type = cft->private;
1562 int retval = -ENODEV;
1520 1563
1521 if (!cgroup_lock_live_group(cgrp)) 1564 mutex_lock(&cpuset_mutex);
1522 return -ENODEV; 1565 if (!is_cpuset_online(cs))
1566 goto out_unlock;
1523 1567
1524 switch (type) { 1568 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1569 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1529 retval = -EINVAL; 1573 retval = -EINVAL;
1530 break; 1574 break;
1531 } 1575 }
1532 cgroup_unlock(); 1576out_unlock:
1577 mutex_unlock(&cpuset_mutex);
1533 return retval; 1578 return retval;
1534} 1579}
1535 1580
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1584static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf) 1585 const char *buf)
1541{ 1586{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1587 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs; 1588 struct cpuset *trialcs;
1589 int retval = -ENODEV;
1590
1591 /*
1592 * CPU or memory hotunplug may leave @cs w/o any execution
1593 * resources, in which case the hotplug code asynchronously updates
1594 * configuration and transfers all tasks to the nearest ancestor
1595 * which can execute.
1596 *
1597 * As writes to "cpus" or "mems" may restore @cs's execution
1598 * resources, wait for the previously scheduled operations before
1599 * proceeding, so that we don't end up keep removing tasks added
1600 * after execution capability is restored.
1601 *
1602 * Flushing cpuset_hotplug_work is enough to synchronize against
1603 * hotplug handling; however, cpuset_attach() may schedule
1603 * hotplug handling; however, cpuset_attach() may schedule
1604 * propagation work directly. Flush the workqueue too.
1605 */
1606 flush_work(&cpuset_hotplug_work);
1607 flush_workqueue(cpuset_propagate_hotplug_wq);
1545 1608
1546 if (!cgroup_lock_live_group(cgrp)) 1609 mutex_lock(&cpuset_mutex);
1547 return -ENODEV; 1610 if (!is_cpuset_online(cs))
1611 goto out_unlock;
1548 1612
1549 trialcs = alloc_trial_cpuset(cs); 1613 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) { 1614 if (!trialcs) {
1551 retval = -ENOMEM; 1615 retval = -ENOMEM;
1552 goto out; 1616 goto out_unlock;
1553 } 1617 }
1554 1618
1555 switch (cft->private) { 1619 switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 } 1629 }
1566 1630
1567 free_trial_cpuset(trialcs); 1631 free_trial_cpuset(trialcs);
1568out: 1632out_unlock:
1569 cgroup_unlock(); 1633 mutex_unlock(&cpuset_mutex);
1570 return retval; 1634 return retval;
1571} 1635}
1572 1636
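cpuset_write_resmask() now flushes any pending hotplug work, takes cpuset_mutex itself, and bails out with -ENODEV if the cpuset has gone offline, replacing the old cgroup_lock_live_group() dance. A compact userspace sketch of that write path (the mutex, the online flag, and the names are all illustrative):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct obj {
	pthread_mutex_t lock;
	bool online;	/* cleared by the offline path under lock */
	long value;
};

/* Write handler: check liveness under the same lock the offline path
 * uses, and route every exit through one unlock label. */
int obj_write(struct obj *o, long val)
{
	int ret = -ENODEV;

	pthread_mutex_lock(&o->lock);
	if (!o->online)
		goto out_unlock;

	o->value = val;
	ret = 0;
out_unlock:
	pthread_mutex_unlock(&o->lock);
	return ret;
}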
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
1790 1854
1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1855static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1792{ 1856{
1793 struct cgroup *parent_cg = cont->parent; 1857 struct cpuset *cs;
1794 struct cgroup *tmp_cg;
1795 struct cpuset *parent, *cs;
1796 1858
1797 if (!parent_cg) 1859 if (!cont->parent)
1798 return &top_cpuset.css; 1860 return &top_cpuset.css;
1799 parent = cgroup_cs(parent_cg);
1800 1861
1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1862 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1802 if (!cs) 1863 if (!cs)
1803 return ERR_PTR(-ENOMEM); 1864 return ERR_PTR(-ENOMEM);
1804 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1865 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1806 return ERR_PTR(-ENOMEM); 1867 return ERR_PTR(-ENOMEM);
1807 } 1868 }
1808 1869
1809 cs->flags = 0;
1810 if (is_spread_page(parent))
1811 set_bit(CS_SPREAD_PAGE, &cs->flags);
1812 if (is_spread_slab(parent))
1813 set_bit(CS_SPREAD_SLAB, &cs->flags);
1814 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1815 cpumask_clear(cs->cpus_allowed); 1871 cpumask_clear(cs->cpus_allowed);
1816 nodes_clear(cs->mems_allowed); 1872 nodes_clear(cs->mems_allowed);
1817 fmeter_init(&cs->fmeter); 1873 fmeter_init(&cs->fmeter);
1874 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1818 cs->relax_domain_level = -1; 1875 cs->relax_domain_level = -1;
1819 1876
1820 cs->parent = parent; 1877 return &cs->css;
1878}
1879
1880static int cpuset_css_online(struct cgroup *cgrp)
1881{
1882 struct cpuset *cs = cgroup_cs(cgrp);
1883 struct cpuset *parent = parent_cs(cs);
1884 struct cpuset *tmp_cs;
1885 struct cgroup *pos_cg;
1886
1887 if (!parent)
1888 return 0;
1889
1890 mutex_lock(&cpuset_mutex);
1891
1892 set_bit(CS_ONLINE, &cs->flags);
1893 if (is_spread_page(parent))
1894 set_bit(CS_SPREAD_PAGE, &cs->flags);
1895 if (is_spread_slab(parent))
1896 set_bit(CS_SPREAD_SLAB, &cs->flags);
1897
1821 number_of_cpusets++; 1898 number_of_cpusets++;
1822 1899
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) 1900 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1824 goto skip_clone; 1901 goto out_unlock;
1825 1902
1826 /* 1903 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 1904 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1913 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup. 1914 * (and likewise for mems) to the new cgroup.
1838 */ 1915 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { 1916 rcu_read_lock();
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg); 1917 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1841 1918 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) 1919 rcu_read_unlock();
1843 goto skip_clone; 1920 goto out_unlock;
1921 }
1844 } 1922 }
1923 rcu_read_unlock();
1845 1924
1846 mutex_lock(&callback_mutex); 1925 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed; 1926 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1927 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex); 1928 mutex_unlock(&callback_mutex);
1850skip_clone: 1929out_unlock:
1851 return &cs->css; 1930 mutex_unlock(&cpuset_mutex);
1931 return 0;
1932}
1933
1934static void cpuset_css_offline(struct cgroup *cgrp)
1935{
1936 struct cpuset *cs = cgroup_cs(cgrp);
1937
1938 mutex_lock(&cpuset_mutex);
1939
1940 if (is_sched_load_balance(cs))
1941 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1942
1943 number_of_cpusets--;
1944 clear_bit(CS_ONLINE, &cs->flags);
1945
1946 mutex_unlock(&cpuset_mutex);
1852} 1947}
1853 1948
1854/* 1949/*
1855 * If the cpuset being removed has its flag 'sched_load_balance' 1950 * If the cpuset being removed has its flag 'sched_load_balance'
1856 * enabled, then simulate turning sched_load_balance off, which 1951 * enabled, then simulate turning sched_load_balance off, which
1857 * will call async_rebuild_sched_domains(). 1952 * will call rebuild_sched_domains_locked().
1858 */ 1953 */
1859 1954
1860static void cpuset_css_free(struct cgroup *cont) 1955static void cpuset_css_free(struct cgroup *cont)
1861{ 1956{
1862 struct cpuset *cs = cgroup_cs(cont); 1957 struct cpuset *cs = cgroup_cs(cont);
1863 1958
1864 if (is_sched_load_balance(cs))
1865 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1866
1867 number_of_cpusets--;
1868 free_cpumask_var(cs->cpus_allowed); 1959 free_cpumask_var(cs->cpus_allowed);
1869 kfree(cs); 1960 kfree(cs);
1870} 1961}
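css_alloc() is reduced to pure memory setup, while the parts that need cpuset_mutex and the parent's state move into the new css_online()/css_offline() pair; css_free() then only releases memory. The sketch below mirrors that split for a generic object, with an invented subsystem lock and counter:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t subsys_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_online;

struct item {
	bool online;
	bool inherit_flags;
};

/* Allocation needs no locks and touches no shared state. */
struct item *item_alloc(void)
{
	return calloc(1, sizeof(struct item));
}

/* Online/offline bracket the object's visible lifetime: this is where
 * shared counters are updated and parent state is inherited. */
int item_online(struct item *it, const struct item *parent)
{
	pthread_mutex_lock(&subsys_lock);
	it->inherit_flags = parent ? parent->inherit_flags : false;
	it->online = true;
	nr_online++;
	pthread_mutex_unlock(&subsys_lock);
	return 0;
}

void item_offline(struct item *it)
{
	pthread_mutex_lock(&subsys_lock);
	it->online = false;
	nr_online--;
	pthread_mutex_unlock(&subsys_lock);
}

/* Free is the mirror image of alloc: memory only. */
void item_free(struct item *it)
{
	free(it);
}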
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
1872struct cgroup_subsys cpuset_subsys = { 1963struct cgroup_subsys cpuset_subsys = {
1873 .name = "cpuset", 1964 .name = "cpuset",
1874 .css_alloc = cpuset_css_alloc, 1965 .css_alloc = cpuset_css_alloc,
1966 .css_online = cpuset_css_online,
1967 .css_offline = cpuset_css_offline,
1875 .css_free = cpuset_css_free, 1968 .css_free = cpuset_css_free,
1876 .can_attach = cpuset_can_attach, 1969 .can_attach = cpuset_can_attach,
1970 .cancel_attach = cpuset_cancel_attach,
1877 .attach = cpuset_attach, 1971 .attach = cpuset_attach,
1878 .subsys_id = cpuset_subsys_id, 1972 .subsys_id = cpuset_subsys_id,
1879 .base_cftypes = files, 1973 .base_cftypes = files,
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1924{ 2018{
1925 struct cgroup *new_cgroup = scan->data; 2019 struct cgroup *new_cgroup = scan->data;
1926 2020
2021 cgroup_lock();
1927 cgroup_attach_task(new_cgroup, tsk); 2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
1928} 2024}
1929 2025
1930/** 2026/**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932 * @from: cpuset in which the tasks currently reside 2028 * @from: cpuset in which the tasks currently reside
1933 * @to: cpuset to which the tasks will be moved 2029 * @to: cpuset to which the tasks will be moved
1934 * 2030 *
1935 * Called with cgroup_mutex held 2031 * Called with cpuset_mutex held
1936 * callback_mutex must not be held, as cpuset_attach() will take it. 2032 * callback_mutex must not be held, as cpuset_attach() will take it.
1937 * 2033 *
1938 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1959 * removing that CPU or node from all cpusets. If this removes the 2055 * removing that CPU or node from all cpusets. If this removes the
1960 * last CPU or node from a cpuset, then move the tasks in the empty 2056 * last CPU or node from a cpuset, then move the tasks in the empty
1961 * cpuset to its next-highest non-empty parent. 2057 * cpuset to its next-highest non-empty parent.
1962 *
1963 * Called with cgroup_mutex held
1964 * callback_mutex must not be held, as cpuset_attach() will take it.
1965 */ 2058 */
1966static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2059static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1967{ 2060{
1968 struct cpuset *parent; 2061 struct cpuset *parent;
1969 2062
1970 /* 2063 /*
1971 * The cgroup's css_sets list is in use if there are tasks
1972 * in the cpuset; the list is empty if there are none;
1973 * the cs->css.refcnt seems always 0.
1974 */
1975 if (list_empty(&cs->css.cgroup->css_sets))
1976 return;
1977
1978 /*
1979 * Find its next-highest non-empty parent, (top cpuset 2064 * Find its next-highest non-empty parent, (top cpuset
1980 * has online cpus, so can't be empty). 2065 * has online cpus, so can't be empty).
1981 */ 2066 */
1982 parent = cs->parent; 2067 parent = parent_cs(cs);
1983 while (cpumask_empty(parent->cpus_allowed) || 2068 while (cpumask_empty(parent->cpus_allowed) ||
1984 nodes_empty(parent->mems_allowed)) 2069 nodes_empty(parent->mems_allowed))
1985 parent = parent->parent; 2070 parent = parent_cs(parent);
1986 2071
1987 move_member_tasks_to_cpuset(cs, parent); 2072 move_member_tasks_to_cpuset(cs, parent);
1988} 2073}
1989 2074
1990/* 2075/**
1991 * Helper function to traverse cpusets. 2076 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
1992 * It can be used to walk the cpuset tree from top to bottom, completing 2077 * @cs: cpuset in interest
1993 * one layer before dropping down to the next (thus always processing a 2078 *
1994 * node before any of its children). 2079 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2080 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2081 * all its tasks are moved to the nearest ancestor with both resources.
1995 */ 2082 */
1996static struct cpuset *cpuset_next(struct list_head *queue) 2083static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
1997{ 2084{
1998 struct cpuset *cp; 2085 static cpumask_t off_cpus;
1999 struct cpuset *child; /* scans child cpusets of cp */ 2086 static nodemask_t off_mems, tmp_mems;
2000 struct cgroup *cont; 2087 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2088 bool is_empty;
2001 2089
2002 if (list_empty(queue)) 2090 mutex_lock(&cpuset_mutex);
2003 return NULL; 2091
2092 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2093 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2004 2094
2005 cp = list_first_entry(queue, struct cpuset, stack_list); 2095 /* remove offline cpus from @cs */
2006 list_del(queue->next); 2096 if (!cpumask_empty(&off_cpus)) {
2007 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2097 mutex_lock(&callback_mutex);
2008 child = cgroup_cs(cont); 2098 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2009 list_add_tail(&child->stack_list, queue); 2099 mutex_unlock(&callback_mutex);
2100 update_tasks_cpumask(cs, NULL);
2101 }
2102
2103 /* remove offline mems from @cs */
2104 if (!nodes_empty(off_mems)) {
2105 tmp_mems = cs->mems_allowed;
2106 mutex_lock(&callback_mutex);
2107 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2108 mutex_unlock(&callback_mutex);
2109 update_tasks_nodemask(cs, &tmp_mems, NULL);
2010 } 2110 }
2011 2111
2012 return cp; 2112 is_empty = cpumask_empty(cs->cpus_allowed) ||
2113 nodes_empty(cs->mems_allowed);
2114
2115 mutex_unlock(&cpuset_mutex);
2116
2117 /*
2118 * If @cs became empty, move tasks to the nearest ancestor with
2119 * execution resources. This is full cgroup operation which will
2120 * also call back into cpuset. Should be done outside any lock.
2121 */
2122 if (is_empty)
2123 remove_tasks_in_empty_cpuset(cs);
2124
2125 /* the following may free @cs, should be the last operation */
2126 css_put(&cs->css);
2013} 2127}
2014 2128
2129/**
2130 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2131 * @cs: cpuset of interest
2132 *
2133 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2134 * memory masks according to top_cpuset.
2135 */
2136static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2137{
2138 /*
2139 * Pin @cs. The refcnt will be released when the work item
2140 * finishes executing.
2141 */
2142 if (!css_tryget(&cs->css))
2143 return;
2015 2144
2016/* 2145 /*
2017 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory 2146 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2018 * online/offline) and update the cpusets accordingly. 2147 * cpuset_propagate_hotplug_wq is ordered and propagation will
2019 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such 2148 * happen in the order this function is called.
2020 * cpuset must be moved to a parent cpuset. 2149 */
2150 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2151 css_put(&cs->css);
2152}
2153
2154/**
2155 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2021 * 2156 *
2022 * Called with cgroup_mutex held. We take callback_mutex to modify 2157 * This function is called after either CPU or memory configuration has
2023 * cpus_allowed and mems_allowed. 2158 * changed and updates cpuset accordingly. The top_cpuset is always
2159 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2160 * order to make cpusets transparent (of no affect) on systems that are
2161 * actively using CPU hotplug but making no active use of cpusets.
2024 * 2162 *
2025 * This walk processes the tree from top to bottom, completing one layer 2163 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2026 * before dropping down to the next. It always processes a node before 2164 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
2027 * any of its children. 2165 * descendants.
2028 * 2166 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY 2167 * Note that CPU offlining during suspend is ignored. We don't modify
2030 * if all present pages from a node are offlined. 2168 * cpusets across suspend/resume cycles at all.
2031 */ 2169 */
2032static void 2170static void cpuset_hotplug_workfn(struct work_struct *work)
2033scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2034{ 2171{
2035 LIST_HEAD(queue); 2172 static cpumask_t new_cpus, tmp_cpus;
2036 struct cpuset *cp; /* scans cpusets being updated */ 2173 static nodemask_t new_mems, tmp_mems;
2037 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2174 bool cpus_updated, mems_updated;
2175 bool cpus_offlined, mems_offlined;
2038 2176
2039 list_add_tail((struct list_head *)&root->stack_list, &queue); 2177 mutex_lock(&cpuset_mutex);
2040 2178
2041 switch (event) { 2179 /* fetch the available cpus/mems and find out which changed how */
2042 case CPUSET_CPU_OFFLINE: 2180 cpumask_copy(&new_cpus, cpu_active_mask);
2043 while ((cp = cpuset_next(&queue)) != NULL) { 2181 new_mems = node_states[N_MEMORY];
2044 2182
2045 /* Continue past cpusets with all cpus online */ 2183 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2046 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2184 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2047 continue; 2185 &new_cpus);
2048 2186
2049 /* Remove offline cpus from this cpuset. */ 2187 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2050 mutex_lock(&callback_mutex); 2188 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2051 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2189 mems_offlined = !nodes_empty(tmp_mems);
2052 cpu_active_mask);
2053 mutex_unlock(&callback_mutex);
2054 2190
2055 /* Move tasks from the empty cpuset to a parent */ 2191 /* synchronize cpus_allowed to cpu_active_mask */
2056 if (cpumask_empty(cp->cpus_allowed)) 2192 if (cpus_updated) {
2057 remove_tasks_in_empty_cpuset(cp); 2193 mutex_lock(&callback_mutex);
2058 else 2194 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2059 update_tasks_cpumask(cp, NULL); 2195 mutex_unlock(&callback_mutex);
2060 } 2196 /* we don't mess with cpumasks of tasks in top_cpuset */
2061 break; 2197 }
2062 2198
2063 case CPUSET_MEM_OFFLINE: 2199 /* synchronize mems_allowed to N_MEMORY */
2064 while ((cp = cpuset_next(&queue)) != NULL) { 2200 if (mems_updated) {
2201 tmp_mems = top_cpuset.mems_allowed;
2202 mutex_lock(&callback_mutex);
2203 top_cpuset.mems_allowed = new_mems;
2204 mutex_unlock(&callback_mutex);
2205 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
2206 }
2065 2207
2066 /* Continue past cpusets with all mems online */ 2208 /* if cpus or mems went down, we need to propagate to descendants */
2067 if (nodes_subset(cp->mems_allowed, 2209 if (cpus_offlined || mems_offlined) {
2068 node_states[N_MEMORY])) 2210 struct cpuset *cs;
2069 continue; 2211 struct cgroup *pos_cgrp;
2070 2212
2071 oldmems = cp->mems_allowed; 2213 rcu_read_lock();
2214 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2215 schedule_cpuset_propagate_hotplug(cs);
2216 rcu_read_unlock();
2217 }
2072 2218
2073 /* Remove offline mems from this cpuset. */ 2219 mutex_unlock(&cpuset_mutex);
2074 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed,
2076 node_states[N_MEMORY]);
2077 mutex_unlock(&callback_mutex);
2078 2220
2079 /* Move tasks from the empty cpuset to a parent */ 2221 /* wait for propagations to finish */
2080 if (nodes_empty(cp->mems_allowed)) 2222 flush_workqueue(cpuset_propagate_hotplug_wq);
2081 remove_tasks_in_empty_cpuset(cp); 2223
2082 else 2224 /* rebuild sched domains if cpus_allowed has changed */
2083 update_tasks_nodemask(cp, &oldmems, NULL); 2225 if (cpus_updated) {
2084 } 2226 struct sched_domain_attr *attr;
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2085 } 2235 }
2086} 2236}
2087 2237
2088/*
2089 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2090 * period. This is necessary in order to make cpusets transparent
2091 * (of no affect) on systems that are actively using CPU hotplug
2092 * but making no active use of cpusets.
2093 *
2094 * The only exception to this is suspend/resume, where we don't
2095 * modify cpusets at all.
2096 *
2097 * This routine ensures that top_cpuset.cpus_allowed tracks
2098 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2099 *
2100 * Called within get_online_cpus(). Needs to call cgroup_lock()
2101 * before calling generate_sched_domains().
2102 *
2103 * @cpu_online: Indicates whether this is a CPU online event (true) or
2104 * a CPU offline event (false).
2105 */
2106void cpuset_update_active_cpus(bool cpu_online) 2238void cpuset_update_active_cpus(bool cpu_online)
2107{ 2239{
2108 struct sched_domain_attr *attr; 2240 /*
2109 cpumask_var_t *doms; 2241 * We're inside cpu hotplug critical region which usually nests
2110 int ndoms; 2242 * inside cgroup synchronization. Bounce actual hotplug processing
2111 2243 * to a work item to avoid reverse locking order.
2112 cgroup_lock(); 2244 *
2113 mutex_lock(&callback_mutex); 2245 * We still need to do partition_sched_domains() synchronously;
2114 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2246 * otherwise, the scheduler will get confused and put tasks to the
2115 mutex_unlock(&callback_mutex); 2247 * dead CPU. Fall back to the default single domain.
2116 2248 * cpuset_hotplug_workfn() will rebuild it as necessary.
2117 if (!cpu_online) 2249 */
2118 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); 2250 partition_sched_domains(1, NULL, NULL);
2119 2251 schedule_work(&cpuset_hotplug_work);
2120 ndoms = generate_sched_domains(&doms, &attr);
2121 cgroup_unlock();
2122
2123 /* Have scheduler rebuild the domains */
2124 partition_sched_domains(ndoms, doms, attr);
2125} 2252}
2126 2253
2127#ifdef CONFIG_MEMORY_HOTPLUG 2254#ifdef CONFIG_MEMORY_HOTPLUG
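schedule_cpuset_propagate_hotplug() pins the cpuset with css_tryget() before queueing, drops the reference immediately if the work was already pending, and the work function puts it as its very last step since that put may free the cpuset. The refcount sketch below reproduces that hand-off with C11 atomics; the item_* names, the queued flag, and the free-on-last-put behaviour are assumptions of the sketch, not the css API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct work_item {
	atomic_int refcnt;	/* starts at 1 while the item is live */
	atomic_bool queued;
};

/* css_tryget() analogue: only pin an item whose count is still > 0. */
static bool item_tryget(struct work_item *w)
{
	int old = atomic_load(&w->refcnt);

	while (old > 0)
		if (atomic_compare_exchange_weak(&w->refcnt, &old, old + 1))
			return true;
	return false;
}

static void item_put(struct work_item *w)
{
	if (atomic_fetch_sub(&w->refcnt, 1) == 1)
		free(w);		/* last reference dropped */
}

/* Queueing side: take a reference first; if the item turns out to be
 * pending already, give the reference straight back. */
void item_schedule(struct work_item *w)
{
	if (!item_tryget(w))
		return;
	if (atomic_exchange(&w->queued, true))
		item_put(w);	/* already queued, lose the extra ref */
}

/* Worker side: do the work, then drop the reference taken at queue
 * time as the very last operation, since it may free the item. */
void item_workfn(struct work_item *w)
{
	atomic_store(&w->queued, false);
	/* ...propagate the configuration change... */
	item_put(w);
}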
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2260static int cpuset_track_online_nodes(struct notifier_block *self,
2134 unsigned long action, void *arg) 2261 unsigned long action, void *arg)
2135{ 2262{
2136 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2263 schedule_work(&cpuset_hotplug_work);
2137
2138 cgroup_lock();
2139 switch (action) {
2140 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_MEMORY];
2144 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break;
2147 case MEM_OFFLINE:
2148 /*
2149 * needn't update top_cpuset.mems_allowed explicitly because
2150 * scan_cpusets_upon_hotplug() will update it.
2151 */
2152 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2153 break;
2154 default:
2155 break;
2156 }
2157 cgroup_unlock();
2158
2159 return NOTIFY_OK; 2264 return NOTIFY_OK;
2160} 2265}
2161#endif 2266#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
2173 2278
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2280
2176 cpuset_wq = create_singlethread_workqueue("cpuset"); 2281 cpuset_propagate_hotplug_wq =
2177 BUG_ON(!cpuset_wq); 2282 alloc_ordered_workqueue("cpuset_hotplug", 0);
2283 BUG_ON(!cpuset_propagate_hotplug_wq);
2178} 2284}
2179 2285
2180/** 2286/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2273 */ 2379 */
2274static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2380static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2275{ 2381{
2276 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2382 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2277 cs = cs->parent; 2383 cs = parent_cs(cs);
2278 return cs; 2384 return cs;
2279} 2385}
2280 2386
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2412} 2518}
2413 2519
2414/** 2520/**
2415 * cpuset_unlock - release lock on cpuset changes
2416 *
2417 * Undo the lock taken in a previous cpuset_lock() call.
2418 */
2419
2420void cpuset_unlock(void)
2421{
2422 mutex_unlock(&callback_mutex);
2423}
2424
2425/**
2426 * cpuset_mem_spread_node() - On which node to begin search for a file page 2521 * cpuset_mem_spread_node() - On which node to begin search for a file page
2427 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2522 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2428 * 2523 *
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2511 2606
2512 dentry = task_cs(tsk)->css.cgroup->dentry; 2607 dentry = task_cs(tsk)->css.cgroup->dentry;
2513 spin_lock(&cpuset_buffer_lock); 2608 spin_lock(&cpuset_buffer_lock);
2514 snprintf(cpuset_name, CPUSET_NAME_LEN, 2609
2515 dentry ? (const char *)dentry->d_name.name : "/"); 2610 if (!dentry) {
2611 strcpy(cpuset_name, "/");
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618
2516 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2517 tsk->mems_allowed); 2620 tsk->mems_allowed);
2518 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
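The cpuset_print_task_mems_allowed() change fixes two problems at once: the dentry name was being passed to snprintf() as the format string, and it was read without d_lock. The sketch below only illustrates the format-string half in portable C; strlcpy() and d_lock are kernel-specific, so a bounded snprintf("%s", ...) stands in for them.

#include <stdio.h>
#include <string.h>

#define NAME_LEN 64

/* Copy an externally controlled name into a fixed buffer.  Passing the
 * name itself as the format (snprintf(dst, len, name)) would let any
 * '%' in it be parsed as a conversion specifier. */
void copy_name(char dst[NAME_LEN], const char *name)
{
	if (!name) {
		strcpy(dst, "/");
		return;
	}
	snprintf(dst, NAME_LEN, "%s", name);	/* bounded, fixed format */
}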
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
2560 * - Used for /proc/<pid>/cpuset. 2663 * - Used for /proc/<pid>/cpuset.
2561 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2664 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2562 * doesn't really matter if tsk->cpuset changes after we read it, 2665 * doesn't really matter if tsk->cpuset changes after we read it,
2563 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2564 * anyway. 2667 * anyway.
2565 */ 2668 */
2566static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2669static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2582 if (!tsk) 2685 if (!tsk)
2583 goto out_free; 2686 goto out_free;
2584 2687
2585 retval = -EINVAL; 2688 rcu_read_lock();
2586 cgroup_lock();
2587 css = task_subsys_state(tsk, cpuset_subsys_id); 2689 css = task_subsys_state(tsk, cpuset_subsys_id);
2588 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2690 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2691 rcu_read_unlock();
2589 if (retval < 0) 2692 if (retval < 0)
2590 goto out_unlock; 2693 goto out_put_task;
2591 seq_puts(m, buf); 2694 seq_puts(m, buf);
2592 seq_putc(m, '\n'); 2695 seq_putc(m, '\n');
2593out_unlock: 2696out_put_task:
2594 cgroup_unlock();
2595 put_task_struct(tsk); 2697 put_task_struct(tsk);
2596out_free: 2698out_free:
2597 kfree(buf); 2699 kfree(buf);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..c26278fd4851 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
29 */ 29 */
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 31#include <linux/clocksource.h>
32#include <linux/serial_core.h>
32#include <linux/interrupt.h> 33#include <linux/interrupt.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/console.h> 35#include <linux/console.h>
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7a..2235967e78b0 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
72#ifdef CONFIG_KGDB_KDB 72#ifdef CONFIG_KGDB_KDB
73extern int kdb_stub(struct kgdb_state *ks); 73extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 74extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void);
75#else /* ! CONFIG_KGDB_KDB */ 77#else /* ! CONFIG_KGDB_KDB */
76static inline int kdb_stub(struct kgdb_state *ks) 78static inline int kdb_stub(struct kgdb_state *ks)
77{ 79{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..19d9a578c753 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/kgdb.h> 32#include <linux/kgdb.h>
33#include <linux/kdb.h> 33#include <linux/kdb.h>
34#include <linux/serial_core.h>
34#include <linux/reboot.h> 35#include <linux/reboot.h>
35#include <linux/uaccess.h> 36#include <linux/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
782 len = len / 2; 783 len = len / 2;
783 remcom_out_buffer[len++] = 0; 784 remcom_out_buffer[len++] = 0;
784 785
786 kdb_common_init_state(ks);
785 kdb_parse(remcom_out_buffer); 787 kdb_parse(remcom_out_buffer);
788 kdb_common_deinit_state();
789
786 strcpy(remcom_out_buffer, "OK"); 790 strcpy(remcom_out_buffer, "OK");
787 } 791 }
788 break; 792 break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5d..70a504601dc3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
486/* 486/*
487 * kdb_ss 487 * kdb_ss
488 * 488 *
489 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) 489 * Process the 'ss' (Single Step) command.
490 * commands.
491 * 490 *
492 * ss 491 * ss
493 * ssb
494 * 492 *
495 * Parameters: 493 * Parameters:
496 * argc Argument count 494 * argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
498 * Outputs: 496 * Outputs:
499 * None. 497 * None.
500 * Returns: 498 * Returns:
501 * KDB_CMD_SS[B] for success, a kdb error if failure. 499 * KDB_CMD_SS for success, a kdb error if failure.
502 * Locking: 500 * Locking:
503 * None. 501 * None.
504 * Remarks: 502 * Remarks:
505 * 503 *
506 * Set the arch specific option to trigger a debug trap after the next 504 * Set the arch specific option to trigger a debug trap after the next
507 * instruction. 505 * instruction.
508 *
509 * For 'ssb', set the trace flag in the debug trap handler
510 * after printing the current insn and return directly without
511 * invoking the kdb command processor, until a branch instruction
512 * is encountered.
513 */ 506 */
514 507
515static int kdb_ss(int argc, const char **argv) 508static int kdb_ss(int argc, const char **argv)
516{ 509{
517 int ssb = 0;
518
519 ssb = (strcmp(argv[0], "ssb") == 0);
520 if (argc != 0) 510 if (argc != 0)
521 return KDB_ARGCOUNT; 511 return KDB_ARGCOUNT;
522 /* 512 /*
523 * Set trace flag and go. 513 * Set trace flag and go.
524 */ 514 */
525 KDB_STATE_SET(DOING_SS); 515 KDB_STATE_SET(DOING_SS);
526 if (ssb) {
527 KDB_STATE_SET(DOING_SSB);
528 return KDB_CMD_SSB;
529 }
530 return KDB_CMD_SS; 516 return KDB_CMD_SS;
531} 517}
532 518
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
561 547
562 kdb_register_repeat("ss", kdb_ss, "", 548 kdb_register_repeat("ss", kdb_ss, "",
563 "Single Step", 1, KDB_REPEAT_NO_ARGS); 549 "Single Step", 1, KDB_REPEAT_NO_ARGS);
564 kdb_register_repeat("ssb", kdb_ss, "",
565 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
566 /* 550 /*
567 * Architecture dependent initialization. 551 * Architecture dependent initialization.
568 */ 552 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d30..328d18ef31e4 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
34 34
35static struct kgdb_state *kdb_ks; 35static struct kgdb_state *kdb_ks;
36 36
37int kdb_common_init_state(struct kgdb_state *ks)
38{
39 kdb_initial_cpu = atomic_read(&kgdb_active);
40 kdb_current_task = kgdb_info[ks->cpu].task;
41 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
42 return 0;
43}
44
45int kdb_common_deinit_state(void)
46{
47 kdb_initial_cpu = -1;
48 kdb_current_task = NULL;
49 kdb_current_regs = NULL;
50 return 0;
51}
52
37int kdb_stub(struct kgdb_state *ks) 53int kdb_stub(struct kgdb_state *ks)
38{ 54{
39 int error = 0; 55 int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
94 } 110 }
95 /* Set initial kdb state variables */ 111 /* Set initial kdb state variables */
96 KDB_STATE_CLEAR(KGDB_TRANS); 112 KDB_STATE_CLEAR(KGDB_TRANS);
97 kdb_initial_cpu = atomic_read(&kgdb_active); 113 kdb_common_init_state(ks);
98 kdb_current_task = kgdb_info[ks->cpu].task;
99 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
100 /* Remove any breakpoints as needed by kdb and clear single step */ 114 /* Remove any breakpoints as needed by kdb and clear single step */
101 kdb_bp_remove(); 115 kdb_bp_remove();
102 KDB_STATE_CLEAR(DOING_SS); 116 KDB_STATE_CLEAR(DOING_SS);
103 KDB_STATE_CLEAR(DOING_SSB);
104 KDB_STATE_SET(PAGER); 117 KDB_STATE_SET(PAGER);
105 /* zero out any offline cpu data */ 118 /* zero out any offline cpu data */
106 for_each_present_cpu(i) { 119 for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
125 * Upon exit from the kdb main loop setup break points and restart 138 * Upon exit from the kdb main loop setup break points and restart
126 * the system based on the requested continue state 139 * the system based on the requested continue state
127 */ 140 */
128 kdb_initial_cpu = -1; 141 kdb_common_deinit_state();
129 kdb_current_task = NULL;
130 kdb_current_regs = NULL;
131 KDB_STATE_CLEAR(PAGER); 142 KDB_STATE_CLEAR(PAGER);
132 kdbnearsym_cleanup(); 143 kdbnearsym_cleanup();
133 if (error == KDB_CMD_KGDB) { 144 if (error == KDB_CMD_KGDB) {
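
The kdb_debugger.c hunks factor the debugger entry/exit bookkeeping (initial cpu, current task, current regs) into kdb_common_init_state() and kdb_common_deinit_state(), which is what lets the gdbstub query path above bracket kdb_parse() with the same setup and teardown as the main kdb loop. A minimal standalone sketch of that shared-setup/teardown pattern (all names and globals below are illustrative, not kernel API):

#include <stdio.h>

/* Illustrative globals standing in for the kdb state variables. */
static int dbg_cpu = -1;
static const char *dbg_task;

/* Shared setup used by every debugger entry path. */
static void dbg_init_state(int cpu, const char *task)
{
        dbg_cpu = cpu;
        dbg_task = task;
}

/* Shared teardown, mirroring the init helper. */
static void dbg_deinit_state(void)
{
        dbg_cpu = -1;
        dbg_task = NULL;
}

/* Two entry points, both bracketed by the same helpers. */
static void main_loop_entry(void)
{
        dbg_init_state(0, "swapper");
        printf("main loop on cpu %d, task %s\n", dbg_cpu, dbg_task);
        dbg_deinit_state();
}

static void remote_query_entry(void)
{
        dbg_init_state(1, "bash");
        printf("remote query on cpu %d, task %s\n", dbg_cpu, dbg_task);
        dbg_deinit_state();
}

int main(void)
{
        main_loop_entry();
        remote_query_entry();
        return 0;
}
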
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 8875254120b6..00eb8f7fbf41 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
124}; 124};
125#undef KDBMSG 125#undef KDBMSG
126 126
127static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); 127static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
128 128
129 129
130/* 130/*
@@ -175,7 +175,7 @@ static char *__env[] = {
175 (char *)0, 175 (char *)0,
176}; 176};
177 177
178static const int __nenv = (sizeof(__env) / sizeof(char *)); 178static const int __nenv = ARRAY_SIZE(__env);
179 179
180struct task_struct *kdb_curr_task(int cpu) 180struct task_struct *kdb_curr_task(int cpu)
181{ 181{
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
681 } 681 }
682 if (argc != 3) 682 if (argc != 3)
683 return KDB_ARGCOUNT; 683 return KDB_ARGCOUNT;
684 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), 684 if (in_dbg_master()) {
685 GFP_KDB); 685 kdb_printf("Command only available during kdb_init()\n");
686 if (!defcmd_set) {
687 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
688 argv[1]);
689 defcmd_set = save_defcmd_set;
690 return KDB_NOTIMP; 686 return KDB_NOTIMP;
691 } 687 }
688 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
689 GFP_KDB);
690 if (!defcmd_set)
691 goto fail_defcmd;
692 memcpy(defcmd_set, save_defcmd_set, 692 memcpy(defcmd_set, save_defcmd_set,
693 defcmd_set_count * sizeof(*defcmd_set)); 693 defcmd_set_count * sizeof(*defcmd_set));
694 kfree(save_defcmd_set);
695 s = defcmd_set + defcmd_set_count; 694 s = defcmd_set + defcmd_set_count;
696 memset(s, 0, sizeof(*s)); 695 memset(s, 0, sizeof(*s));
697 s->usable = 1; 696 s->usable = 1;
698 s->name = kdb_strdup(argv[1], GFP_KDB); 697 s->name = kdb_strdup(argv[1], GFP_KDB);
698 if (!s->name)
699 goto fail_name;
699 s->usage = kdb_strdup(argv[2], GFP_KDB); 700 s->usage = kdb_strdup(argv[2], GFP_KDB);
701 if (!s->usage)
702 goto fail_usage;
700 s->help = kdb_strdup(argv[3], GFP_KDB); 703 s->help = kdb_strdup(argv[3], GFP_KDB);
704 if (!s->help)
705 goto fail_help;
701 if (s->usage[0] == '"') { 706 if (s->usage[0] == '"') {
702 strcpy(s->usage, s->usage+1); 707 strcpy(s->usage, argv[2]+1);
703 s->usage[strlen(s->usage)-1] = '\0'; 708 s->usage[strlen(s->usage)-1] = '\0';
704 } 709 }
705 if (s->help[0] == '"') { 710 if (s->help[0] == '"') {
706 strcpy(s->help, s->help+1); 711 strcpy(s->help, argv[3]+1);
707 s->help[strlen(s->help)-1] = '\0'; 712 s->help[strlen(s->help)-1] = '\0';
708 } 713 }
709 ++defcmd_set_count; 714 ++defcmd_set_count;
710 defcmd_in_progress = 1; 715 defcmd_in_progress = 1;
716 kfree(save_defcmd_set);
711 return 0; 717 return 0;
718fail_help:
719 kfree(s->usage);
720fail_usage:
721 kfree(s->name);
722fail_name:
723 kfree(defcmd_set);
724fail_defcmd:
725 kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
726 defcmd_set = save_defcmd_set;
727 return KDB_NOTIMP;
712} 728}
713 729
714/* 730/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
1112 * KDB_CMD_GO User typed 'go'. 1128 * KDB_CMD_GO User typed 'go'.
1113 * KDB_CMD_CPU User switched to another cpu. 1129 * KDB_CMD_CPU User switched to another cpu.
1114 * KDB_CMD_SS Single step. 1130 * KDB_CMD_SS Single step.
1115 * KDB_CMD_SSB Single step until branch.
1116 */ 1131 */
1117static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, 1132static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1118 kdb_dbtrap_t db_result) 1133 kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1151 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", 1166 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1152 instruction_pointer(regs)); 1167 instruction_pointer(regs));
1153 break; 1168 break;
1154 case KDB_DB_SSB:
1155 /*
1156 * In the midst of ssb command. Just return.
1157 */
1158 KDB_DEBUG_STATE("kdb_local 3", reason);
1159 return KDB_CMD_SSB; /* Continue with SSB command */
1160
1161 break;
1162 case KDB_DB_SS: 1169 case KDB_DB_SS:
1163 break; 1170 break;
1164 case KDB_DB_SSBPT: 1171 case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
1281 if (diag == KDB_CMD_GO 1288 if (diag == KDB_CMD_GO
1282 || diag == KDB_CMD_CPU 1289 || diag == KDB_CMD_CPU
1283 || diag == KDB_CMD_SS 1290 || diag == KDB_CMD_SS
1284 || diag == KDB_CMD_SSB
1285 || diag == KDB_CMD_KGDB) 1291 || diag == KDB_CMD_KGDB)
1286 break; 1292 break;
1287 1293
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1368 break; 1374 break;
1369 } 1375 }
1370 1376
1371 if (result == KDB_CMD_SSB) {
1372 KDB_STATE_SET(DOING_SS);
1373 KDB_STATE_SET(DOING_SSB);
1374 break;
1375 }
1376
1377 if (result == KDB_CMD_KGDB) { 1377 if (result == KDB_CMD_KGDB) {
1378 if (!KDB_STATE(DOING_KGDB)) 1378 if (!KDB_STATE(DOING_KGDB))
1379 kdb_printf("Entering please attach debugger " 1379 kdb_printf("Entering please attach debugger "
@@ -2350,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
2350 return 0; 2350 return 0;
2351} 2351}
2352 2352
2353/*
2354 * kdb_ll - This function implements the 'll' command which follows a
2355 * linked list and executes an arbitrary command for each
2356 * element.
2357 */
2358static int kdb_ll(int argc, const char **argv)
2359{
2360 int diag = 0;
2361 unsigned long addr;
2362 long offset = 0;
2363 unsigned long va;
2364 unsigned long linkoffset;
2365 int nextarg;
2366 const char *command;
2367
2368 if (argc != 3)
2369 return KDB_ARGCOUNT;
2370
2371 nextarg = 1;
2372 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2373 if (diag)
2374 return diag;
2375
2376 diag = kdbgetularg(argv[2], &linkoffset);
2377 if (diag)
2378 return diag;
2379
2380 /*
2381 * Using the starting address as
2382 * the first element in the list, and assuming that
2383 * the list ends with a null pointer.
2384 */
2385
2386 va = addr;
2387 command = kdb_strdup(argv[3], GFP_KDB);
2388 if (!command) {
2389 kdb_printf("%s: cannot duplicate command\n", __func__);
2390 return 0;
2391 }
2392 /* Recursive use of kdb_parse, do not use argv after this point */
2393 argv = NULL;
2394
2395 while (va) {
2396 char buf[80];
2397
2398 if (KDB_FLAG(CMD_INTERRUPT))
2399 goto out;
2400
2401 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2402 diag = kdb_parse(buf);
2403 if (diag)
2404 goto out;
2405
2406 addr = va + linkoffset;
2407 if (kdb_getword(&va, addr, sizeof(va)))
2408 goto out;
2409 }
2410
2411out:
2412 kfree(command);
2413 return diag;
2414}
2415
2416static int kdb_kgdb(int argc, const char **argv) 2353static int kdb_kgdb(int argc, const char **argv)
2417{ 2354{
2418 return KDB_CMD_KGDB; 2355 return KDB_CMD_KGDB;
@@ -2430,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
2430 kdb_printf("-----------------------------" 2367 kdb_printf("-----------------------------"
2431 "-----------------------------\n"); 2368 "-----------------------------\n");
2432 for_each_kdbcmd(kt, i) { 2369 for_each_kdbcmd(kt, i) {
2433 if (kt->cmd_name) 2370 char *space = "";
2434 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2435 kt->cmd_usage, kt->cmd_help);
2436 if (KDB_FLAG(CMD_INTERRUPT)) 2371 if (KDB_FLAG(CMD_INTERRUPT))
2437 return 0; 2372 return 0;
2373 if (!kt->cmd_name)
2374 continue;
2375 if (strlen(kt->cmd_usage) > 20)
2376 space = "\n ";
2377 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
2378 kt->cmd_usage, space, kt->cmd_help);
2438 } 2379 }
2439 return 0; 2380 return 0;
2440} 2381}
@@ -2739,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
2739 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); 2680 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2740 kfree(kdb_commands); 2681 kfree(kdb_commands);
2741 } 2682 }
2742 memset(new + kdb_max_commands, 0, 2683 memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
2743 kdb_command_extend * sizeof(*new)); 2684 kdb_command_extend * sizeof(*new));
2744 kdb_commands = new; 2685 kdb_commands = new;
2745 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; 2686 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
@@ -2843,15 +2784,13 @@ static void __init kdb_inittab(void)
2843 "Stack traceback", 1, KDB_REPEAT_NONE); 2784 "Stack traceback", 1, KDB_REPEAT_NONE);
2844 kdb_register_repeat("btp", kdb_bt, "<pid>", 2785 kdb_register_repeat("btp", kdb_bt, "<pid>",
2845 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2786 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2846 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", 2787 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Display stack all processes", 0, KDB_REPEAT_NONE); 2788 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
2848 kdb_register_repeat("btc", kdb_bt, "", 2789 kdb_register_repeat("btc", kdb_bt, "",
2849 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2790 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2850 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2791 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2851 "Backtrace process given its struct task address", 0, 2792 "Backtrace process given its struct task address", 0,
2852 KDB_REPEAT_NONE); 2793 KDB_REPEAT_NONE);
2853 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2854 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2855 kdb_register_repeat("env", kdb_env, "", 2794 kdb_register_repeat("env", kdb_env, "",
2856 "Show environment variables", 0, KDB_REPEAT_NONE); 2795 "Show environment variables", 0, KDB_REPEAT_NONE);
2857 kdb_register_repeat("set", kdb_set, "", 2796 kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a25844..7afd3c8c41d5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
19#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
20#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 22#define KDB_CMD_KGDB (-1005)
24 23
25/* Internal debug flags */ 24/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
125 * kdb control */ 124 * kdb control */
126#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ 125#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
127#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ 126#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
128#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
129 * DOING_SS is also set */
130#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint 127#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
131 * after one ss, independent of 128 * after one ss, independent of
132 * DOING_SS */ 129 * DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
191typedef enum { 188typedef enum {
192 KDB_DB_BPT, /* Breakpoint */ 189 KDB_DB_BPT, /* Breakpoint */
193 KDB_DB_SS, /* Single-step trap */ 190 KDB_DB_SS, /* Single-step trap */
194 KDB_DB_SSB, /* Single step to branch */
195 KDB_DB_SSBPT, /* Single step over breakpoint */ 191 KDB_DB_SSBPT, /* Single step over breakpoint */
196 KDB_DB_NOBPT /* Spurious breakpoint */ 192 KDB_DB_NOBPT /* Spurious breakpoint */
197} kdb_dbtrap_t; 193} kdb_dbtrap_t;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5c75791d7269..b0cd86501c30 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3691,7 +3691,7 @@ unlock:
3691 3691
3692static int perf_fasync(int fd, struct file *filp, int on) 3692static int perf_fasync(int fd, struct file *filp, int on)
3693{ 3693{
3694 struct inode *inode = filp->f_path.dentry->d_inode; 3694 struct inode *inode = file_inode(filp);
3695 struct perf_event *event = filp->private_data; 3695 struct perf_event *event = filp->private_data;
3696 int retval; 3696 int retval;
3697 3697
@@ -5126,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5126{ 5126{
5127 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5127 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5128 struct perf_event *event; 5128 struct perf_event *event;
5129 struct hlist_node *node;
5130 struct hlist_head *head; 5129 struct hlist_head *head;
5131 5130
5132 rcu_read_lock(); 5131 rcu_read_lock();
@@ -5134,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5134 if (!head) 5133 if (!head)
5135 goto end; 5134 goto end;
5136 5135
5137 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5136 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5138 if (perf_swevent_match(event, type, event_id, data, regs)) 5137 if (perf_swevent_match(event, type, event_id, data, regs))
5139 perf_swevent_event(event, nr, data, regs); 5138 perf_swevent_event(event, nr, data, regs);
5140 } 5139 }
@@ -5419,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5419{ 5418{
5420 struct perf_sample_data data; 5419 struct perf_sample_data data;
5421 struct perf_event *event; 5420 struct perf_event *event;
5422 struct hlist_node *node;
5423 5421
5424 struct perf_raw_record raw = { 5422 struct perf_raw_record raw = {
5425 .size = entry_size, 5423 .size = entry_size,
@@ -5429,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5429 perf_sample_data_init(&data, addr, 0); 5427 perf_sample_data_init(&data, addr, 0);
5430 data.raw = &raw; 5428 data.raw = &raw;
5431 5429
5432 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5430 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5433 if (perf_tp_event_match(event, &data, regs)) 5431 if (perf_tp_event_match(event, &data, regs))
5434 perf_swevent_event(event, count, &data, regs); 5432 perf_swevent_event(event, count, &data, regs);
5435 } 5433 }
@@ -5965,13 +5963,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
5965 pmu->name = name; 5963 pmu->name = name;
5966 5964
5967 if (type < 0) { 5965 if (type < 0) {
5968 int err = idr_pre_get(&pmu_idr, GFP_KERNEL); 5966 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
5969 if (!err) 5967 if (type < 0) {
5970 goto free_pdc; 5968 ret = type;
5971
5972 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5973 if (err) {
5974 ret = err;
5975 goto free_pdc; 5969 goto free_pdc;
5976 } 5970 }
5977 } 5971 }
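
The perf_pmu_register() hunk above replaces the old two-call id allocation (reserve, then allocate above a floor) with a single call that returns either the new id or a negative error, which is why the error handling collapses to one branch. A hedged userspace analogue of that calling convention (id_alloc, slots and MAX_IDS are invented names, not the kernel's idr implementation):

#include <stdio.h>
#include <errno.h>

#define MAX_IDS 8
static void *slots[MAX_IDS];

/* One call: returns the allocated id (>= start) or a negative errno. */
static int id_alloc(void *ptr, int start)
{
        int id;

        if (!ptr || start < 0)
                return -EINVAL;
        for (id = start; id < MAX_IDS; id++) {
                if (!slots[id]) {
                        slots[id] = ptr;
                        return id;   /* success: the id itself */
                }
        }
        return -ENOSPC;              /* failure: negative errno */
}

int main(void)
{
        static int pmu_a, pmu_b;
        int type;

        type = id_alloc(&pmu_a, 6);  /* like allocating above a fixed maximum */
        printf("first id: %d\n", type);

        type = id_alloc(&pmu_b, 6);
        printf("second id: %d\n", type);
        return 0;
}

The caller only has to test for a negative return, exactly as the new perf code does with the type it gets back.
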
diff --git a/kernel/exit.c b/kernel/exit.c
index 7dd20408707c..51e485ca9935 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h> 22#include <linux/fdtable.h>
23#include <linux/freezer.h>
23#include <linux/binfmts.h> 24#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h> 26#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
31#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 33#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 34#include <linux/delayacct.h>
34#include <linux/freezer.h>
35#include <linux/cgroup.h> 35#include <linux/cgroup.h>
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/signal.h> 37#include <linux/signal.h>
@@ -485,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
485 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 485 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
486 if (!self.task) /* see coredump_finish() */ 486 if (!self.task) /* see coredump_finish() */
487 break; 487 break;
488 schedule(); 488 freezable_schedule();
489 } 489 }
490 __set_task_state(tsk, TASK_RUNNING); 490 __set_task_state(tsk, TASK_RUNNING);
491 down_read(&mm->mmap_sem); 491 down_read(&mm->mmap_sem);
@@ -835,7 +835,7 @@ void do_exit(long code)
835 /* 835 /*
836 * Make sure we are holding no locks: 836 * Make sure we are holding no locks:
837 */ 837 */
838 debug_check_no_locks_held(tsk); 838 debug_check_no_locks_held();
839 /* 839 /*
840 * We can do this unlocked here. The futex code uses this flag 840 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 841 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/fork.c b/kernel/fork.c
index 4133876d8cd2..1766d324d5e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
413 tmp->vm_next = tmp->vm_prev = NULL; 413 tmp->vm_next = tmp->vm_prev = NULL;
414 file = tmp->vm_file; 414 file = tmp->vm_file;
415 if (file) { 415 if (file) {
416 struct inode *inode = file->f_path.dentry->d_inode; 416 struct inode *inode = file_inode(file);
417 struct address_space *mapping = file->f_mapping; 417 struct address_space *mapping = file->f_mapping;
418 418
419 get_file(file); 419 get_file(file);
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1142 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
1143 1143
1144 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1145 return ERR_PTR(-EINVAL);
1146
1144 /* 1147 /*
1145 * Thread groups must share signals as well, and detached threads 1148 * Thread groups must share signals as well, and detached threads
1146 * can only be started up within the thread group. 1149 * can only be started up within the thread group.
@@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1807 * If unsharing a user namespace must also unshare the thread. 1810 * If unsharing a user namespace must also unshare the thread.
1808 */ 1811 */
1809 if (unshare_flags & CLONE_NEWUSER) 1812 if (unshare_flags & CLONE_NEWUSER)
1810 unshare_flags |= CLONE_THREAD; 1813 unshare_flags |= CLONE_THREAD | CLONE_FS;
1811 /* 1814 /*
1812 * If unsharing a pid namespace must also unshare the thread. 1815 * If unsharing a pid namespace must also unshare the thread.
1813 */ 1816 */
@@ -1861,10 +1864,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1861 exit_sem(current); 1864 exit_sem(current);
1862 } 1865 }
1863 1866
1864 if (new_nsproxy) { 1867 if (new_nsproxy)
1865 switch_task_namespaces(current, new_nsproxy); 1868 switch_task_namespaces(current, new_nsproxy);
1866 new_nsproxy = NULL;
1867 }
1868 1869
1869 task_lock(current); 1870 task_lock(current);
1870 1871
@@ -1894,9 +1895,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1894 } 1895 }
1895 } 1896 }
1896 1897
1897 if (new_nsproxy)
1898 put_nsproxy(new_nsproxy);
1899
1900bad_unshare_cleanup_cred: 1898bad_unshare_cleanup_cred:
1901 if (new_cred) 1899 if (new_cred)
1902 put_cred(new_cred); 1900 put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 9618b6e9fb36..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -223,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key)
223 * @rw: mapping needs to be read/write (values: VERIFY_READ, 223 * @rw: mapping needs to be read/write (values: VERIFY_READ,
224 * VERIFY_WRITE) 224 * VERIFY_WRITE)
225 * 225 *
226 * Returns a negative error code or 0 226 * Return: a negative error code or 0
227 *
227 * The key words are stored in *key on success. 228 * The key words are stored in *key on success.
228 * 229 *
229 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 230 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
230 * offset_within_page). For private mappings, it's (uaddr, current->mm). 231 * offset_within_page). For private mappings, it's (uaddr, current->mm).
231 * We can usually work out the index without swapping in the page. 232 * We can usually work out the index without swapping in the page.
232 * 233 *
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
705 * be "current" except in the case of requeue pi. 706 * be "current" except in the case of requeue pi.
706 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 707 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
707 * 708 *
708 * Returns: 709 * Return:
709 * 0 - ready to wait 710 * 0 - ready to wait;
710 * 1 - acquired the lock 711 * 1 - acquired the lock;
711 * <0 - error 712 * <0 - error
712 * 713 *
713 * The hb->lock and futex_key refs shall be held by the caller. 714 * The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1191 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1192 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1192 * hb1 and hb2 must be held by the caller. 1193 * hb1 and hb2 must be held by the caller.
1193 * 1194 *
1194 * Returns: 1195 * Return:
1195 * 0 - failed to acquire the lock atomicly 1196 * 0 - failed to acquire the lock atomically;
1196 * 1 - acquired the lock 1197 * 1 - acquired the lock;
1197 * <0 - error 1198 * <0 - error
1198 */ 1199 */
1199static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1200static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1254 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1255 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1255 * uaddr2 atomically on behalf of the top waiter. 1256 * uaddr2 atomically on behalf of the top waiter.
1256 * 1257 *
1257 * Returns: 1258 * Return:
1258 * >=0 - on success, the number of tasks requeued or woken 1259 * >=0 - on success, the number of tasks requeued or woken;
1259 * <0 - on error 1260 * <0 - on error
1260 */ 1261 */
1261static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1262static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1536 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1537 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1537 * be paired with exactly one earlier call to queue_me(). 1538 * be paired with exactly one earlier call to queue_me().
1538 * 1539 *
1539 * Returns: 1540 * Return:
1540 * 1 - if the futex_q was still queued (and we removed unqueued it) 1541 * 1 - if the futex_q was still queued (and we removed unqueued it);
1541 * 0 - if the futex_q was already removed by the waking thread 1542 * 0 - if the futex_q was already removed by the waking thread
1542 */ 1543 */
1543static int unqueue_me(struct futex_q *q) 1544static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
1707 * the pi_state owner as well as handle race conditions that may allow us to 1708 * the pi_state owner as well as handle race conditions that may allow us to
1708 * acquire the lock. Must be called with the hb lock held. 1709 * acquire the lock. Must be called with the hb lock held.
1709 * 1710 *
1710 * Returns: 1711 * Return:
1711 * 1 - success, lock taken 1712 * 1 - success, lock taken;
1712 * 0 - success, lock not taken 1713 * 0 - success, lock not taken;
1713 * <0 - on error (-EFAULT) 1714 * <0 - on error (-EFAULT)
1714 */ 1715 */
1715static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1716static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1824 * Return with the hb lock held and a q.key reference on success, and unlocked 1825 * Return with the hb lock held and a q.key reference on success, and unlocked
1825 * with no q.key reference on failure. 1826 * with no q.key reference on failure.
1826 * 1827 *
1827 * Returns: 1828 * Return:
1828 * 0 - uaddr contains val and hb has been locked 1829 * 0 - uaddr contains val and hb has been locked;
1829 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1830 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1830 */ 1831 */
1831static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1832static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
2203 * the wakeup and return the appropriate error code to the caller. Must be 2204 * the wakeup and return the appropriate error code to the caller. Must be
2204 * called with the hb lock held. 2205 * called with the hb lock held.
2205 * 2206 *
2206 * Returns 2207 * Return:
2207 * 0 - no early wakeup detected 2208 * 0 = no early wakeup detected;
2208 * <0 - -ETIMEDOUT or -ERESTARTNOINTR 2209 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2209 */ 2210 */
2210static inline 2211static inline
2211int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2212int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2247 * @val: the expected value of uaddr 2248 * @val: the expected value of uaddr
2248 * @abs_time: absolute timeout 2249 * @abs_time: absolute timeout
2249 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2250 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2250 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2251 * @uaddr2: the pi futex we will take prior to returning to user-space 2251 * @uaddr2: the pi futex we will take prior to returning to user-space
2252 * 2252 *
2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2258 * there was a need to. 2258 * there was a need to.
2259 * 2259 *
2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2261 * via the following: 2261 * via the following--
2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2263 * 2) wakeup on uaddr2 after a requeue 2263 * 2) wakeup on uaddr2 after a requeue
2264 * 3) signal 2264 * 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2276 * 2276 *
2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2278 * 2278 *
2279 * Returns: 2279 * Return:
2280 * 0 - On success 2280 * 0 - On success;
2281 * <0 - On error 2281 * <0 - On error
2282 */ 2282 */
2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2472,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2472 if (!futex_cmpxchg_enabled) 2472 if (!futex_cmpxchg_enabled)
2473 return -ENOSYS; 2473 return -ENOSYS;
2474 2474
2475 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2476
2477 rcu_read_lock(); 2475 rcu_read_lock();
2478 2476
2479 ret = -ESRCH; 2477 ret = -ESRCH;
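
The futex.c changes above are documentation-only: each comment's ad-hoc "Returns:" block becomes the "Return:" section that the kernel-doc tooling recognises, with list items punctuated consistently. For reference, a tiny self-contained example of a function documented in that style (the function itself is invented purely for illustration):

#include <stdio.h>

/**
 * try_take_slot() - try to claim a slot
 * @slot: pointer to the slot flag
 *
 * Return:
 *  1 - acquired the slot;
 *  0 - slot was already taken
 */
static int try_take_slot(int *slot)
{
        if (*slot)
                return 0;
        *slot = 1;
        return 1;
}

int main(void)
{
        int slot = 0;

        printf("%d %d\n", try_take_slot(&slot), try_take_slot(&slot));
        return 0;
}
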
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/syscalls.h>
14 15
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
116 } 117 }
117} 118}
118 119
119asmlinkage long 120COMPAT_SYSCALL_DEFINE2(set_robust_list,
120compat_sys_set_robust_list(struct compat_robust_list_head __user *head, 121 struct compat_robust_list_head __user *, head,
121 compat_size_t len) 122 compat_size_t, len)
122{ 123{
123 if (!futex_cmpxchg_enabled) 124 if (!futex_cmpxchg_enabled)
124 return -ENOSYS; 125 return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
131 return 0; 132 return 0;
132} 133}
133 134
134asmlinkage long 135COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
135compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, 136 compat_uptr_t __user *, head_ptr,
136 compat_size_t __user *len_ptr) 137 compat_size_t __user *, len_ptr)
137{ 138{
138 struct compat_robust_list_head __user *head; 139 struct compat_robust_list_head __user *head;
139 unsigned long ret; 140 unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
142 if (!futex_cmpxchg_enabled) 143 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 144 return -ENOSYS;
144 145
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock(); 146 rcu_read_lock();
148 147
149 ret = -ESRCH; 148 ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
172 return ret; 171 return ret;
173} 172}
174 173
175asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, 174COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
176 struct compat_timespec __user *utime, u32 __user *uaddr2, 175 struct compat_timespec __user *, utime, u32 __user *, uaddr2,
177 u32 val3) 176 u32, val3)
178{ 177{
179 struct timespec ts; 178 struct timespec ts;
180 ktime_t t, *tp = NULL; 179 ktime_t t, *tp = NULL;
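
futex_compat.c stops open-coding asmlinkage compat_sys_* prototypes and declares its entry points through COMPAT_SYSCALL_DEFINEn instead, so the compat argument handling is generated in one place. A toy sketch of the macro-generated-definition idea (MY_SYSCALL_DEFINE2 and my_sys_ are invented names; the real kernel macros also emit metadata and argument-widening wrappers):

#include <stdio.h>

/*
 * Toy 2-argument DEFINE macro: the macro owns the function's naming
 * and signature so every entry point is declared the same way.
 */
#define MY_SYSCALL_DEFINE2(name, t1, a1, t2, a2) \
        long my_sys_##name(t1 a1, t2 a2)

MY_SYSCALL_DEFINE2(add_pair, int, x, int, y)
{
        return (long)x + y;
}

int main(void)
{
        printf("%ld\n", my_sys_add_pair(40, 2));
        return 0;
}
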
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..397db02209ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file_inode(file))->data;
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
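
Several hunks in this series (events/core.c, fork.c, the futex.c comment, and irq/proc.c above) replace the open-coded filp->f_path.dentry->d_inode chain with the file_inode() helper. The point is encapsulation: if the chain from a file to its inode ever changes, only the helper needs updating. A minimal sketch with made-up struct names (my_*), not the real VFS types:

#include <stdio.h>

struct my_inode  { unsigned long ino; };
struct my_dentry { struct my_inode *d_inode; };
struct my_path   { struct my_dentry *dentry; };
struct my_file   { struct my_path f_path; };

/* Accessor hides the pointer chain from every call site. */
static struct my_inode *my_file_inode(const struct my_file *f)
{
        return f->f_path.dentry->d_inode;
}

int main(void)
{
        struct my_inode inode = { .ino = 42 };
        struct my_dentry dentry = { .d_inode = &inode };
        struct my_file file = { .f_path = { .dentry = &dentry } };

        printf("inode %lu\n", my_file_inode(&file)->ino);
        return 0;
}
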
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..bddd3d7a74b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
54 .end = 0, 54 .end = 0,
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = {
58 .name = "Crash kernel low",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
62};
57 63
58int kexec_should_crash(struct task_struct *p) 64int kexec_should_crash(struct task_struct *p)
59{ 65{
@@ -223,6 +229,8 @@ out:
223 229
224} 230}
225 231
232static void kimage_free_page_list(struct list_head *list);
233
226static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 234static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
227 unsigned long nr_segments, 235 unsigned long nr_segments,
228 struct kexec_segment __user *segments) 236 struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 if (result) 244 if (result)
237 goto out; 245 goto out;
238 246
239 *rimage = image;
240
241 /* 247 /*
242 * Find a location for the control code buffer, and add it 248 * Find a location for the control code buffer, and add it
243 * the vector of segments so that it's pages will also be 249 * the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
248 get_order(KEXEC_CONTROL_PAGE_SIZE)); 254 get_order(KEXEC_CONTROL_PAGE_SIZE));
249 if (!image->control_code_page) { 255 if (!image->control_code_page) {
250 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 256 printk(KERN_ERR "Could not allocate control_code_buffer\n");
251 goto out; 257 goto out_free;
252 } 258 }
253 259
254 image->swap_page = kimage_alloc_control_pages(image, 0); 260 image->swap_page = kimage_alloc_control_pages(image, 0);
255 if (!image->swap_page) { 261 if (!image->swap_page) {
256 printk(KERN_ERR "Could not allocate swap buffer\n"); 262 printk(KERN_ERR "Could not allocate swap buffer\n");
257 goto out; 263 goto out_free;
258 } 264 }
259 265
260 result = 0; 266 *rimage = image;
261 out: 267 return 0;
262 if (result == 0)
263 *rimage = image;
264 else
265 kfree(image);
266 268
269out_free:
270 kimage_free_page_list(&image->control_pages);
271 kfree(image);
272out:
267 return result; 273 return result;
268} 274}
269 275
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
310 mend = mstart + image->segment[i].memsz - 1; 316 mend = mstart + image->segment[i].memsz - 1;
311 /* Ensure we are within the crash kernel limits */ 317 /* Ensure we are within the crash kernel limits */
312 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 318 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
313 goto out; 319 goto out_free;
314 } 320 }
315 321
316 /* 322 /*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
323 get_order(KEXEC_CONTROL_PAGE_SIZE)); 329 get_order(KEXEC_CONTROL_PAGE_SIZE));
324 if (!image->control_code_page) { 330 if (!image->control_code_page) {
325 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 331 printk(KERN_ERR "Could not allocate control_code_buffer\n");
326 goto out; 332 goto out_free;
327 } 333 }
328 334
329 result = 0; 335 *rimage = image;
330out: 336 return 0;
331 if (result == 0)
332 *rimage = image;
333 else
334 kfree(image);
335 337
338out_free:
339 kfree(image);
340out:
336 return result; 341 return result;
337} 342}
338 343
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
497 502
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 503 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
499 break; 504 break;
500 if (hole_end > crashk_res.end)
501 break;
502 /* See if I overlap any of the segments */ 505 /* See if I overlap any of the segments */
503 for (i = 0; i < image->nr_segments; i++) { 506 for (i = 0; i < image->nr_segments; i++) {
504 unsigned long mstart, mend; 507 unsigned long mstart, mend;
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline,
1369 * That function is the entry point for command line parsing and should be 1372 * That function is the entry point for command line parsing and should be
1370 * called from the arch-specific code. 1373 * called from the arch-specific code.
1371 */ 1374 */
1372int __init parse_crashkernel(char *cmdline, 1375static int __init __parse_crashkernel(char *cmdline,
1373 unsigned long long system_ram, 1376 unsigned long long system_ram,
1374 unsigned long long *crash_size, 1377 unsigned long long *crash_size,
1375 unsigned long long *crash_base) 1378 unsigned long long *crash_base,
1379 const char *name)
1376{ 1380{
1377 char *p = cmdline, *ck_cmdline = NULL; 1381 char *p = cmdline, *ck_cmdline = NULL;
1378 char *first_colon, *first_space; 1382 char *first_colon, *first_space;
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline,
1382 *crash_base = 0; 1386 *crash_base = 0;
1383 1387
1384 /* find crashkernel and use the last one if there are more */ 1388 /* find crashkernel and use the last one if there are more */
1385 p = strstr(p, "crashkernel="); 1389 p = strstr(p, name);
1386 while (p) { 1390 while (p) {
1387 ck_cmdline = p; 1391 ck_cmdline = p;
1388 p = strstr(p+1, "crashkernel="); 1392 p = strstr(p+1, name);
1389 } 1393 }
1390 1394
1391 if (!ck_cmdline) 1395 if (!ck_cmdline)
1392 return -EINVAL; 1396 return -EINVAL;
1393 1397
1394 ck_cmdline += 12; /* strlen("crashkernel=") */ 1398 ck_cmdline += strlen(name);
1395 1399
1396 /* 1400 /*
1397 * if the commandline contains a ':', then that's the extended 1401 * if the commandline contains a ':', then that's the extended
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline,
1409 return 0; 1413 return 0;
1410} 1414}
1411 1415
1416int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram,
1418 unsigned long long *crash_size,
1419 unsigned long long *crash_base)
1420{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel=");
1423}
1424
1425int __init parse_crashkernel_low(char *cmdline,
1426 unsigned long long system_ram,
1427 unsigned long long *crash_size,
1428 unsigned long long *crash_base)
1429{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low=");
1432}
1412 1433
1413static void update_vmcoreinfo_note(void) 1434static void update_vmcoreinfo_note(void)
1414{ 1435{
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1490 VMCOREINFO_OFFSET(page, _count); 1511 VMCOREINFO_OFFSET(page, _count);
1491 VMCOREINFO_OFFSET(page, mapping); 1512 VMCOREINFO_OFFSET(page, mapping);
1492 VMCOREINFO_OFFSET(page, lru); 1513 VMCOREINFO_OFFSET(page, lru);
1514 VMCOREINFO_OFFSET(page, _mapcount);
1515 VMCOREINFO_OFFSET(page, private);
1493 VMCOREINFO_OFFSET(pglist_data, node_zones); 1516 VMCOREINFO_OFFSET(pglist_data, node_zones);
1494 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1517 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1495#ifdef CONFIG_FLAT_NODE_MEM_MAP 1518#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void)
1512 VMCOREINFO_NUMBER(PG_lru); 1535 VMCOREINFO_NUMBER(PG_lru);
1513 VMCOREINFO_NUMBER(PG_private); 1536 VMCOREINFO_NUMBER(PG_private);
1514 VMCOREINFO_NUMBER(PG_swapcache); 1537 VMCOREINFO_NUMBER(PG_swapcache);
1538 VMCOREINFO_NUMBER(PG_slab);
1539#ifdef CONFIG_MEMORY_FAILURE
1540 VMCOREINFO_NUMBER(PG_hwpoison);
1541#endif
1542 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1515 1543
1516 arch_crash_save_vmcoreinfo(); 1544 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note(); 1545 update_vmcoreinfo_note();
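
The kexec.c hunks factor the command-line scan into __parse_crashkernel(), parameterised on the option name, so "crashkernel=" and the new "crashkernel_low=" share one parser that always honours the last occurrence of the option. The same scanning idea in a standalone form (parse_size_option() and the sample command line are invented for illustration; the real parser also handles the extended range syntax):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long parse_size_option(const char *cmdline, const char *name)
{
        const char *p = cmdline, *last = NULL;

        /* use the last occurrence if the option is given more than once */
        p = strstr(p, name);
        while (p) {
                last = p;
                p = strstr(p + 1, name);
        }
        if (!last)
                return -1;

        return strtol(last + strlen(name), NULL, 0);
}

int main(void)
{
        const char *cmdline =
                "quiet crashkernel=64 ro crashkernel=128 crashkernel_low=72";

        printf("crashkernel=%ld\n", parse_size_option(cmdline, "crashkernel="));
        printf("crashkernel_low=%ld\n",
               parse_size_option(cmdline, "crashkernel_low="));
        return 0;
}
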
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
1/*
2 * A generic kernel FIFO implementation
3 *
4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 */
21
22#include <linux/kernel.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/err.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
29
30/*
31 * internal helper to calculate the unused elements in a fifo
32 */
33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
34{
35 return (fifo->mask + 1) - (fifo->in - fifo->out);
36}
37
38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
39 size_t esize, gfp_t gfp_mask)
40{
41 /*
42 * round down to the next power of 2, since our 'let the indices
43 * wrap' technique works only in this case.
44 */
45 if (!is_power_of_2(size))
46 size = rounddown_pow_of_two(size);
47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
56 }
57
58 fifo->data = kmalloc(size * esize, gfp_mask);
59
60 if (!fifo->data) {
61 fifo->mask = 0;
62 return -ENOMEM;
63 }
64 fifo->mask = size - 1;
65
66 return 0;
67}
68EXPORT_SYMBOL(__kfifo_alloc);
69
70void __kfifo_free(struct __kfifo *fifo)
71{
72 kfree(fifo->data);
73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
78}
79EXPORT_SYMBOL(__kfifo_free);
80
81int __kfifo_init(struct __kfifo *fifo, void *buffer,
82 unsigned int size, size_t esize)
83{
84 size /= esize;
85
86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
97 }
98 fifo->mask = size - 1;
99
100 return 0;
101}
102EXPORT_SYMBOL(__kfifo_init);
103
104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
105 unsigned int len, unsigned int off)
106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
109 unsigned int l;
110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
121 /*
122 * make sure that the data in the fifo is up to date before
123 * incrementing the fifo->in index counter
124 */
125 smp_wmb();
126}
127
128unsigned int __kfifo_in(struct __kfifo *fifo,
129 const void *buf, unsigned int len)
130{
131 unsigned int l;
132
133 l = kfifo_unused(fifo);
134 if (len > l)
135 len = l;
136
137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 fifo->in += len;
139 return len;
140}
141EXPORT_SYMBOL(__kfifo_in);
142
143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
144 unsigned int len, unsigned int off)
145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
148 unsigned int l;
149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
160 /*
161 * make sure that the data is copied before
162 * incrementing the fifo->out index counter
163 */
164 smp_wmb();
165}
166
167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
171
172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
175
176 kfifo_copy_out(fifo, buf, len, fifo->out);
177 return len;
178}
179EXPORT_SYMBOL(__kfifo_out_peek);
180
181unsigned int __kfifo_out(struct __kfifo *fifo,
182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
187}
188EXPORT_SYMBOL(__kfifo_out);
189
190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
191 const void __user *from, unsigned int len, unsigned int off,
192 unsigned int *copied)
193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
196 unsigned int l;
197 unsigned long ret;
198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
215 /*
216 * make sure that the data in the fifo is up to date before
217 * incrementing the fifo->in index counter
218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
224
225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
232
233 if (esize != 1)
234 len /= esize;
235
236 l = kfifo_unused(fifo);
237 if (len > l)
238 len = l;
239
240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
241 if (unlikely(ret)) {
242 len -= ret;
243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
248}
249EXPORT_SYMBOL(__kfifo_from_user);
250
251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
252 unsigned int len, unsigned int off, unsigned int *copied)
253{
254 unsigned int l;
255 unsigned long ret;
256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
275 /*
276 * make sure that the data is copied before
277 * incrementing the fifo->out index counter
278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
284
285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
292
293 if (esize != 1)
294 len /= esize;
295
296 l = fifo->in - fifo->out;
297 if (len > l)
298 len = l;
299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
300 if (unlikely(ret)) {
301 len -= ret;
302 err = -EFAULT;
303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
309
310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
311 int nents, unsigned int len)
312{
313 int n;
314 unsigned int l;
315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
344 }
345 sg_set_page(sgl, page, len, off);
346 return n + 1;
347}
348
349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
350 int nents, unsigned int len, unsigned int off)
351{
352 unsigned int size = fifo->mask + 1;
353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
356
357 off &= fifo->mask;
358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
369}
370
371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
372 struct scatterlist *sgl, int nents, unsigned int len)
373{
374 unsigned int l;
375
376 l = kfifo_unused(fifo);
377 if (len > l)
378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
381}
382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
383
384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
385 struct scatterlist *sgl, int nents, unsigned int len)
386{
387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
394}
395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
396
397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
398{
399 unsigned int max = (1 << (recsize << 3)) - 1;
400
401 if (len > max)
402 return max;
403 return len;
404}
405EXPORT_SYMBOL(__kfifo_max_r);
406
407#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)])
409/*
410 * __kfifo_peek_n internal helper function for determinate the length of
411 * the next record in the fifo
412 */
413static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
414{
415 unsigned int l;
416 unsigned int mask = fifo->mask;
417 unsigned char *data = fifo->data;
418
419 l = __KFIFO_PEEK(data, fifo->out, mask);
420
421 if (--recsize)
422 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
423
424 return l;
425}
426
427#define __KFIFO_POKE(data, in, mask, val) \
428 ( \
429 (data)[(in) & (mask)] = (unsigned char)(val) \
430 )
431
432/*
433 * __kfifo_poke_n internal helper function for storeing the length of
434 * the record into the fifo
435 */
436static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
437{
438 unsigned int mask = fifo->mask;
439 unsigned char *data = fifo->data;
440
441 __KFIFO_POKE(data, fifo->in, mask, n);
442
443 if (recsize > 1)
444 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
445}
446
447unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
448{
449 return __kfifo_peek_n(fifo, recsize);
450}
451EXPORT_SYMBOL(__kfifo_len_r);
452
453unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
454 unsigned int len, size_t recsize)
455{
456 if (len + recsize > kfifo_unused(fifo))
457 return 0;
458
459 __kfifo_poke_n(fifo, len, recsize);
460
461 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
462 fifo->in += len + recsize;
463 return len;
464}
465EXPORT_SYMBOL(__kfifo_in_r);
466
467static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
468 void *buf, unsigned int len, size_t recsize, unsigned int *n)
469{
470 *n = __kfifo_peek_n(fifo, recsize);
471
472 if (len > *n)
473 len = *n;
474
475 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
476 return len;
477}
478
479unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
480 unsigned int len, size_t recsize)
481{
482 unsigned int n;
483
484 if (fifo->in == fifo->out)
485 return 0;
486
487 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
488}
489EXPORT_SYMBOL(__kfifo_out_peek_r);
490
491unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
492 unsigned int len, size_t recsize)
493{
494 unsigned int n;
495
496 if (fifo->in == fifo->out)
497 return 0;
498
499 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
500 fifo->out += n + recsize;
501 return len;
502}
503EXPORT_SYMBOL(__kfifo_out_r);
504
505void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
506{
507 unsigned int n;
508
509 n = __kfifo_peek_n(fifo, recsize);
510 fifo->out += n + recsize;
511}
512EXPORT_SYMBOL(__kfifo_skip_r);
513
514int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
515 unsigned long len, unsigned int *copied, size_t recsize)
516{
517 unsigned long ret;
518
519 len = __kfifo_max_r(len, recsize);
520
521 if (len + recsize > kfifo_unused(fifo)) {
522 *copied = 0;
523 return 0;
524 }
525
526 __kfifo_poke_n(fifo, len, recsize);
527
528 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
529 if (unlikely(ret)) {
530 *copied = 0;
531 return -EFAULT;
532 }
533 fifo->in += len + recsize;
534 return 0;
535}
536EXPORT_SYMBOL(__kfifo_from_user_r);
537
538int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
539 unsigned long len, unsigned int *copied, size_t recsize)
540{
541 unsigned long ret;
542 unsigned int n;
543
544 if (fifo->in == fifo->out) {
545 *copied = 0;
546 return 0;
547 }
548
549 n = __kfifo_peek_n(fifo, recsize);
550 if (len > n)
551 len = n;
552
553 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
554 if (unlikely(ret)) {
555 *copied = 0;
556 return -EFAULT;
557 }
558 fifo->out += n + recsize;
559 return 0;
560}
561EXPORT_SYMBOL(__kfifo_to_user_r);
562
563unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
564 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
565{
566 if (!nents)
567 BUG();
568
569 len = __kfifo_max_r(len, recsize);
570
571 if (len + recsize > kfifo_unused(fifo))
572 return 0;
573
574 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
575}
576EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
577
578void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
579 unsigned int len, size_t recsize)
580{
581 len = __kfifo_max_r(len, recsize);
582 __kfifo_poke_n(fifo, len, recsize);
583 fifo->in += len + recsize;
584}
585EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
586
587unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
588 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
589{
590 if (!nents)
591 BUG();
592
593 len = __kfifo_max_r(len, recsize);
594
595 if (len + recsize > fifo->in - fifo->out)
596 return 0;
597
598 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
599}
600EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
601
602void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
603{
604 unsigned int len;
605
606 len = __kfifo_peek_n(fifo, recsize);
607 fifo->out += len + recsize;
608}
609EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
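
The whole of kernel/kfifo.c is removed above; presumably the implementation is relocated rather than dropped, though these hunks alone do not show its new home. Its central technique is still worth noting: a power-of-two sized buffer indexed by free-running in/out counters that are masked only on access, so the element count is just the unsigned difference in - out. A minimal standalone sketch of that technique (fifo_put/fifo_get are invented names, not the kfifo API):

#include <stdio.h>

#define FIFO_SIZE 8                     /* must be a power of two */
#define FIFO_MASK (FIFO_SIZE - 1)

static unsigned char data[FIFO_SIZE];
static unsigned int fifo_in, fifo_out;  /* free-running counters */

static unsigned int fifo_used(void)   { return fifo_in - fifo_out; }
static unsigned int fifo_unused(void) { return FIFO_SIZE - fifo_used(); }

static int fifo_put(unsigned char c)
{
        if (!fifo_unused())
                return 0;
        data[fifo_in++ & FIFO_MASK] = c;  /* mask only when indexing */
        return 1;
}

static int fifo_get(unsigned char *c)
{
        if (!fifo_used())
                return 0;
        *c = data[fifo_out++ & FIFO_MASK];
        return 1;
}

int main(void)
{
        unsigned char c;

        for (c = 'a'; c <= 'j'; c++)      /* tries to overfill */
                fifo_put(c);
        while (fifo_get(&c))
                putchar(c);               /* prints abcdefgh */
        putchar('\n');
        return 0;
}

Because the counters wrap naturally as unsigned integers, no separate "full" flag is needed and one slot is never wasted, which is exactly what the rounddown_pow_of_two() sizing in the removed code was protecting.
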
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 550294d58a02..e35be53f6613 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
334struct kprobe __kprobes *get_kprobe(void *addr) 334struct kprobe __kprobes *get_kprobe(void *addr)
335{ 335{
336 struct hlist_head *head; 336 struct hlist_head *head;
337 struct hlist_node *node;
338 struct kprobe *p; 337 struct kprobe *p;
339 338
340 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 339 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
341 hlist_for_each_entry_rcu(p, node, head, hlist) { 340 hlist_for_each_entry_rcu(p, head, hlist) {
342 if (p->addr == addr) 341 if (p->addr == addr)
343 return p; 342 return p;
344 } 343 }
@@ -799,7 +798,6 @@ out:
799static void __kprobes optimize_all_kprobes(void) 798static void __kprobes optimize_all_kprobes(void)
800{ 799{
801 struct hlist_head *head; 800 struct hlist_head *head;
802 struct hlist_node *node;
803 struct kprobe *p; 801 struct kprobe *p;
804 unsigned int i; 802 unsigned int i;
805 803
@@ -810,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void)
810 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
811 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
812 head = &kprobe_table[i]; 810 head = &kprobe_table[i];
813 hlist_for_each_entry_rcu(p, node, head, hlist) 811 hlist_for_each_entry_rcu(p, head, hlist)
814 if (!kprobe_disabled(p)) 812 if (!kprobe_disabled(p))
815 optimize_kprobe(p); 813 optimize_kprobe(p);
816 } 814 }
@@ -821,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void)
821static void __kprobes unoptimize_all_kprobes(void) 819static void __kprobes unoptimize_all_kprobes(void)
822{ 820{
823 struct hlist_head *head; 821 struct hlist_head *head;
824 struct hlist_node *node;
825 struct kprobe *p; 822 struct kprobe *p;
826 unsigned int i; 823 unsigned int i;
827 824
@@ -832,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void)
832 kprobes_allow_optimization = false; 829 kprobes_allow_optimization = false;
833 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
834 head = &kprobe_table[i]; 831 head = &kprobe_table[i];
835 hlist_for_each_entry_rcu(p, node, head, hlist) { 832 hlist_for_each_entry_rcu(p, head, hlist) {
836 if (!kprobe_disabled(p)) 833 if (!kprobe_disabled(p))
837 unoptimize_kprobe(p, false); 834 unoptimize_kprobe(p, false);
838 } 835 }
@@ -1148,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1148{ 1145{
1149 struct kretprobe_instance *ri; 1146 struct kretprobe_instance *ri;
1150 struct hlist_head *head, empty_rp; 1147 struct hlist_head *head, empty_rp;
1151 struct hlist_node *node, *tmp; 1148 struct hlist_node *tmp;
1152 unsigned long hash, flags = 0; 1149 unsigned long hash, flags = 0;
1153 1150
1154 if (unlikely(!kprobes_initialized)) 1151 if (unlikely(!kprobes_initialized))
@@ -1159,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1159 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1156 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1160 head = &kretprobe_inst_table[hash]; 1157 head = &kretprobe_inst_table[hash];
1161 kretprobe_table_lock(hash, &flags); 1158 kretprobe_table_lock(hash, &flags);
1162 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 1159 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
1163 if (ri->task == tk) 1160 if (ri->task == tk)
1164 recycle_rp_inst(ri, &empty_rp); 1161 recycle_rp_inst(ri, &empty_rp);
1165 } 1162 }
1166 kretprobe_table_unlock(hash, &flags); 1163 kretprobe_table_unlock(hash, &flags);
1167 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1164 hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
1168 hlist_del(&ri->hlist); 1165 hlist_del(&ri->hlist);
1169 kfree(ri); 1166 kfree(ri);
1170 } 1167 }
@@ -1173,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1173static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1174{ 1171{
1175 struct kretprobe_instance *ri; 1172 struct kretprobe_instance *ri;
1176 struct hlist_node *pos, *next; 1173 struct hlist_node *next;
1177 1174
1178 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { 1175 hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
1179 hlist_del(&ri->hlist); 1176 hlist_del(&ri->hlist);
1180 kfree(ri); 1177 kfree(ri);
1181 } 1178 }
@@ -1185,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1185{ 1182{
1186 unsigned long flags, hash; 1183 unsigned long flags, hash;
1187 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
1188 struct hlist_node *pos, *next; 1185 struct hlist_node *next;
1189 struct hlist_head *head; 1186 struct hlist_head *head;
1190 1187
1191 /* No race here */ 1188 /* No race here */
1192 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { 1189 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
1193 kretprobe_table_lock(hash, &flags); 1190 kretprobe_table_lock(hash, &flags);
1194 head = &kretprobe_inst_table[hash]; 1191 head = &kretprobe_inst_table[hash];
1195 hlist_for_each_entry_safe(ri, pos, next, head, hlist) { 1192 hlist_for_each_entry_safe(ri, next, head, hlist) {
1196 if (ri->rp == rp) 1193 if (ri->rp == rp)
1197 ri->rp = NULL; 1194 ri->rp = NULL;
1198 } 1195 }
@@ -2028,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2028{ 2025{
2029 struct module *mod = data; 2026 struct module *mod = data;
2030 struct hlist_head *head; 2027 struct hlist_head *head;
2031 struct hlist_node *node;
2032 struct kprobe *p; 2028 struct kprobe *p;
2033 unsigned int i; 2029 unsigned int i;
2034 int checkcore = (val == MODULE_STATE_GOING); 2030 int checkcore = (val == MODULE_STATE_GOING);
@@ -2045,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2045 mutex_lock(&kprobe_mutex); 2041 mutex_lock(&kprobe_mutex);
2046 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2042 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2047 head = &kprobe_table[i]; 2043 head = &kprobe_table[i];
2048 hlist_for_each_entry_rcu(p, node, head, hlist) 2044 hlist_for_each_entry_rcu(p, head, hlist)
2049 if (within_module_init((unsigned long)p->addr, mod) || 2045 if (within_module_init((unsigned long)p->addr, mod) ||
2050 (checkcore && 2046 (checkcore &&
2051 within_module_core((unsigned long)p->addr, mod))) { 2047 within_module_core((unsigned long)p->addr, mod))) {
@@ -2192,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
2192static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2193{ 2189{
2194 struct hlist_head *head; 2190 struct hlist_head *head;
2195 struct hlist_node *node;
2196 struct kprobe *p, *kp; 2191 struct kprobe *p, *kp;
2197 const char *sym = NULL; 2192 const char *sym = NULL;
2198 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
@@ -2201,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2201 2196
2202 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2203 preempt_disable(); 2198 preempt_disable();
2204 hlist_for_each_entry_rcu(p, node, head, hlist) { 2199 hlist_for_each_entry_rcu(p, head, hlist) {
2205 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 2200 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
2206 &offset, &modname, namebuf); 2201 &offset, &modname, namebuf);
2207 if (kprobe_aggrprobe(p)) { 2202 if (kprobe_aggrprobe(p)) {
@@ -2236,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = {
2236static void __kprobes arm_all_kprobes(void) 2231static void __kprobes arm_all_kprobes(void)
2237{ 2232{
2238 struct hlist_head *head; 2233 struct hlist_head *head;
2239 struct hlist_node *node;
2240 struct kprobe *p; 2234 struct kprobe *p;
2241 unsigned int i; 2235 unsigned int i;
2242 2236
@@ -2249,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void)
2249 /* Arming kprobes doesn't optimize kprobe itself */ 2243 /* Arming kprobes doesn't optimize kprobe itself */
2250 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2251 head = &kprobe_table[i]; 2245 head = &kprobe_table[i];
2252 hlist_for_each_entry_rcu(p, node, head, hlist) 2246 hlist_for_each_entry_rcu(p, head, hlist)
2253 if (!kprobe_disabled(p)) 2247 if (!kprobe_disabled(p))
2254 arm_kprobe(p); 2248 arm_kprobe(p);
2255 } 2249 }
@@ -2265,7 +2259,6 @@ already_enabled:
2265static void __kprobes disarm_all_kprobes(void) 2259static void __kprobes disarm_all_kprobes(void)
2266{ 2260{
2267 struct hlist_head *head; 2261 struct hlist_head *head;
2268 struct hlist_node *node;
2269 struct kprobe *p; 2262 struct kprobe *p;
2270 unsigned int i; 2263 unsigned int i;
2271 2264
@@ -2282,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void)
2282 2275
2283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2284 head = &kprobe_table[i]; 2277 head = &kprobe_table[i];
2285 hlist_for_each_entry_rcu(p, node, head, hlist) { 2278 hlist_for_each_entry_rcu(p, head, hlist) {
2286 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2287 disarm_kprobe(p, false); 2280 disarm_kprobe(p, false);
2288 } 2281 }
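The kprobes hunks above are part of the tree-wide hlist iterator cleanup: hlist_for_each_entry_rcu() and its variants no longer take a separate struct hlist_node cursor. A condensed sketch of the new convention, lifted from the get_kprobe() change above:

#include <linux/kprobes.h>
#include <linux/rculist.h>

/* Look up a kprobe in one hash bucket; note there is no hlist_node
 * cursor variable any more, only the entry pointer itself. */
static struct kprobe *bucket_lookup(struct hlist_head *head, void *addr)
{
	struct kprobe *p;

	hlist_for_each_entry_rcu(p, head, hlist) {
		if (p->addr == addr)
			return p;
	}
	return NULL;
}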
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..259db207b5d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3190#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3192 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH);
3194 printk("turning off the locking correctness validator.\n"); 3195 printk("turning off the locking correctness validator.\n");
3196
3197 lockdep_print_held_locks(current);
3198 debug_show_all_locks();
3195 dump_stack(); 3199 dump_stack();
3200
3196 return 0; 3201 return 0;
3197 } 3202 }
3198 3203
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3203} 3208}
3204 3209
3205static int 3210static int
3206print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3211print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3207 unsigned long ip) 3212 unsigned long ip)
3208{ 3213{
3209 if (!debug_locks_off()) 3214 if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3246 return 0; 3251 return 0;
3247 3252
3248 if (curr->lockdep_depth <= 0) 3253 if (curr->lockdep_depth <= 0)
3249 return print_unlock_inbalance_bug(curr, lock, ip); 3254 return print_unlock_imbalance_bug(curr, lock, ip);
3250 3255
3251 return 1; 3256 return 1;
3252} 3257}
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3317 goto found_it; 3322 goto found_it;
3318 prev_hlock = hlock; 3323 prev_hlock = hlock;
3319 } 3324 }
3320 return print_unlock_inbalance_bug(curr, lock, ip); 3325 return print_unlock_imbalance_bug(curr, lock, ip);
3321 3326
3322found_it: 3327found_it:
3323 lockdep_init_map(lock, name, key, 0); 3328 lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
3384 goto found_it; 3389 goto found_it;
3385 prev_hlock = hlock; 3390 prev_hlock = hlock;
3386 } 3391 }
3387 return print_unlock_inbalance_bug(curr, lock, ip); 3392 return print_unlock_imbalance_bug(curr, lock, ip);
3388 3393
3389found_it: 3394found_it:
3390 if (hlock->instance == lock) 3395 if (hlock->instance == lock)
@@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4083} 4088}
4084EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4089EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4085 4090
4086static void print_held_locks_bug(struct task_struct *curr) 4091static void print_held_locks_bug(void)
4087{ 4092{
4088 if (!debug_locks_off()) 4093 if (!debug_locks_off())
4089 return; 4094 return;
@@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4092 4097
4093 printk("\n"); 4098 printk("\n");
4094 printk("=====================================\n"); 4099 printk("=====================================\n");
4095 printk("[ BUG: lock held at task exit time! ]\n"); 4100 printk("[ BUG: %s/%d still has locks held! ]\n",
4101 current->comm, task_pid_nr(current));
4096 print_kernel_ident(); 4102 print_kernel_ident();
4097 printk("-------------------------------------\n"); 4103 printk("-------------------------------------\n");
4098 printk("%s/%d is exiting with locks still held!\n", 4104 lockdep_print_held_locks(current);
4099 curr->comm, task_pid_nr(curr));
4100 lockdep_print_held_locks(curr);
4101
4102 printk("\nstack backtrace:\n"); 4105 printk("\nstack backtrace:\n");
4103 dump_stack(); 4106 dump_stack();
4104} 4107}
4105 4108
4106void debug_check_no_locks_held(struct task_struct *task) 4109void debug_check_no_locks_held(void)
4107{ 4110{
4108 if (unlikely(task->lockdep_depth > 0)) 4111 if (unlikely(current->lockdep_depth > 0))
4109 print_held_locks_bug(task); 4112 print_held_locks_bug();
4110} 4113}
4114EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4111 4115
4112void debug_show_all_locks(void) 4116void debug_show_all_locks(void)
4113{ 4117{
diff --git a/kernel/module.c b/kernel/module.c
index eab08274ec9b..0925c9a71975 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
197 return -ENOENT; 197 return -ENOENT;
198} 198}
199 199
200static inline void add_taint_module(struct module *mod, unsigned flag) 200static inline void add_taint_module(struct module *mod, unsigned flag,
201 enum lockdep_ok lockdep_ok)
201{ 202{
202 add_taint(flag); 203 add_taint(flag, lockdep_ok);
203 mod->taints |= (1U << flag); 204 mod->taints |= (1U << flag);
204} 205}
205 206
@@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
727{ 728{
728 int ret = (flags & O_TRUNC); 729 int ret = (flags & O_TRUNC);
729 if (ret) 730 if (ret)
730 add_taint(TAINT_FORCED_RMMOD); 731 add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
731 return ret; 732 return ret;
732} 733}
733#else 734#else
@@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1138 if (!test_taint(TAINT_FORCED_MODULE)) 1139 if (!test_taint(TAINT_FORCED_MODULE))
1139 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1140 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1140 mod->name, reason); 1141 mod->name, reason);
1141 add_taint_module(mod, TAINT_FORCED_MODULE); 1142 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1142 return 0; 1143 return 0;
1143#else 1144#else
1144 return -ENOEXEC; 1145 return -ENOEXEC;
@@ -2147,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
2147 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2148 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2148 printk(KERN_WARNING "%s: module license '%s' taints " 2149 printk(KERN_WARNING "%s: module license '%s' taints "
2149 "kernel.\n", mod->name, license); 2150 "kernel.\n", mod->name, license);
2150 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2151 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2152 LOCKDEP_NOW_UNRELIABLE);
2151 } 2153 }
2152} 2154}
2153 2155
@@ -2539,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2539 if (err) 2541 if (err)
2540 goto out; 2542 goto out;
2541 2543
2542 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); 2544 err = vfs_getattr(&file->f_path, &stat);
2543 if (err) 2545 if (err)
2544 goto out; 2546 goto out;
2545 2547
@@ -2700,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2700 } 2702 }
2701 2703
2702 if (!get_modinfo(info, "intree")) 2704 if (!get_modinfo(info, "intree"))
2703 add_taint_module(mod, TAINT_OOT_MODULE); 2705 add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
2704 2706
2705 if (get_modinfo(info, "staging")) { 2707 if (get_modinfo(info, "staging")) {
2706 add_taint_module(mod, TAINT_CRAP); 2708 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2707 printk(KERN_WARNING "%s: module is from the staging directory," 2709 printk(KERN_WARNING "%s: module is from the staging directory,"
2708 " the quality is unknown, you have been warned.\n", 2710 " the quality is unknown, you have been warned.\n",
2709 mod->name); 2711 mod->name);
@@ -2869,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
2869 * using GPL-only symbols it needs. 2871 * using GPL-only symbols it needs.
2870 */ 2872 */
2871 if (strcmp(mod->name, "ndiswrapper") == 0) 2873 if (strcmp(mod->name, "ndiswrapper") == 0)
2872 add_taint(TAINT_PROPRIETARY_MODULE); 2874 add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
2873 2875
2874 /* driverloader was caught wrongly pretending to be under GPL */ 2876 /* driverloader was caught wrongly pretending to be under GPL */
2875 if (strcmp(mod->name, "driverloader") == 0) 2877 if (strcmp(mod->name, "driverloader") == 0)
2876 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2878 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2879 LOCKDEP_NOW_UNRELIABLE);
2877 2880
2878 /* lve claims to be GPL but upstream won't provide source */ 2881 /* lve claims to be GPL but upstream won't provide source */
2879 if (strcmp(mod->name, "lve") == 0) 2882 if (strcmp(mod->name, "lve") == 0)
2880 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2883 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2884 LOCKDEP_NOW_UNRELIABLE);
2881 2885
2882#ifdef CONFIG_MODVERSIONS 2886#ifdef CONFIG_MODVERSIONS
2883 if ((mod->num_syms && !mod->crcs) 2887 if ((mod->num_syms && !mod->crcs)
@@ -3141,12 +3145,72 @@ static int may_init_module(void)
3141 return 0; 3145 return 0;
3142} 3146}
3143 3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before
3150 * we dedicate too many resources; in particular, this avoids temporary
3151 * percpu memory exhaustion.
3152 */
3153static int add_unformed_module(struct module *mod)
3154{
3155 int err;
3156 struct module *old;
3157
3158 mod->state = MODULE_STATE_UNFORMED;
3159
3160again:
3161 mutex_lock(&module_mutex);
3162 if ((old = find_module_all(mod->name, true)) != NULL) {
3163 if (old->state == MODULE_STATE_COMING
3164 || old->state == MODULE_STATE_UNFORMED) {
3165 /* Wait in case it fails to load. */
3166 mutex_unlock(&module_mutex);
3167 err = wait_event_interruptible(module_wq,
3168 finished_loading(mod->name));
3169 if (err)
3170 goto out_unlocked;
3171 goto again;
3172 }
3173 err = -EEXIST;
3174 goto out;
3175 }
3176 list_add_rcu(&mod->list, &modules);
3177 err = 0;
3178
3179out:
3180 mutex_unlock(&module_mutex);
3181out_unlocked:
3182 return err;
3183}
3184
3185static int complete_formation(struct module *mod, struct load_info *info)
3186{
3187 int err;
3188
3189 mutex_lock(&module_mutex);
3190
3191 /* Find duplicate symbols (must be called under lock). */
3192 err = verify_export_symbols(mod);
3193 if (err < 0)
3194 goto out;
3195
3196 /* This relies on module_mutex for list integrity. */
3197 module_bug_finalize(info->hdr, info->sechdrs, mod);
3198
3199 /* Mark state as coming so strong_try_module_get() ignores us,
3200 * but kallsyms etc. can see us. */
3201 mod->state = MODULE_STATE_COMING;
3202
3203out:
3204 mutex_unlock(&module_mutex);
3205 return err;
3206}
3207
3144/* Allocate and load the module: note that size of section 0 is always 3208/* Allocate and load the module: note that size of section 0 is always
3145 zero, and we rely on this for optional sections. */ 3209 zero, and we rely on this for optional sections. */
3146static int load_module(struct load_info *info, const char __user *uargs, 3210static int load_module(struct load_info *info, const char __user *uargs,
3147 int flags) 3211 int flags)
3148{ 3212{
3149 struct module *mod, *old; 3213 struct module *mod;
3150 long err; 3214 long err;
3151 3215
3152 err = module_sig_check(info); 3216 err = module_sig_check(info);
@@ -3164,36 +3228,20 @@ static int load_module(struct load_info *info, const char __user *uargs,
3164 goto free_copy; 3228 goto free_copy;
3165 } 3229 }
3166 3230
3167 /* 3231 /* Reserve our place in the list. */
3168 * We try to place it in the list now to make sure it's unique 3232 err = add_unformed_module(mod);
3169 * before we dedicate too many resources. In particular, 3233 if (err)
3170 * temporary percpu memory exhaustion.
3171 */
3172 mod->state = MODULE_STATE_UNFORMED;
3173again:
3174 mutex_lock(&module_mutex);
3175 if ((old = find_module_all(mod->name, true)) != NULL) {
3176 if (old->state == MODULE_STATE_COMING
3177 || old->state == MODULE_STATE_UNFORMED) {
3178 /* Wait in case it fails to load. */
3179 mutex_unlock(&module_mutex);
3180 err = wait_event_interruptible(module_wq,
3181 finished_loading(mod->name));
3182 if (err)
3183 goto free_module;
3184 goto again;
3185 }
3186 err = -EEXIST;
3187 mutex_unlock(&module_mutex);
3188 goto free_module; 3234 goto free_module;
3189 }
3190 list_add_rcu(&mod->list, &modules);
3191 mutex_unlock(&module_mutex);
3192 3235
3193#ifdef CONFIG_MODULE_SIG 3236#ifdef CONFIG_MODULE_SIG
3194 mod->sig_ok = info->sig_ok; 3237 mod->sig_ok = info->sig_ok;
3195 if (!mod->sig_ok) 3238 if (!mod->sig_ok) {
3196 add_taint_module(mod, TAINT_FORCED_MODULE); 3239 printk_once(KERN_NOTICE
3240 "%s: module verification failed: signature and/or"
3241 " required key missing - tainting kernel\n",
3242 mod->name);
3243 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3244 }
3197#endif 3245#endif
3198 3246
3199 /* Now module is in final location, initialize linked lists, etc. */ 3247 /* Now module is in final location, initialize linked lists, etc. */
@@ -3236,21 +3284,11 @@ again:
3236 3284
3237 dynamic_debug_setup(info->debug, info->num_debug); 3285 dynamic_debug_setup(info->debug, info->num_debug);
3238 3286
3239 mutex_lock(&module_mutex); 3287 /* Finally it's fully formed, ready to start executing. */
3240 /* Find duplicate symbols (must be called under lock). */ 3288 err = complete_formation(mod, info);
3241 err = verify_export_symbols(mod); 3289 if (err)
3242 if (err < 0)
3243 goto ddebug_cleanup; 3290 goto ddebug_cleanup;
3244 3291
3245 /* This relies on module_mutex for list integrity. */
3246 module_bug_finalize(info->hdr, info->sechdrs, mod);
3247
3248 /* Mark state as coming so strong_try_module_get() ignores us,
3249 * but kallsyms etc. can see us. */
3250 mod->state = MODULE_STATE_COMING;
3251
3252 mutex_unlock(&module_mutex);
3253
3254 /* Module is ready to execute: parsing args may do that. */ 3292 /* Module is ready to execute: parsing args may do that. */
3255 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3293 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3256 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3294 -32768, 32767, &ddebug_dyndbg_module_param_cb);
@@ -3274,8 +3312,8 @@ again:
3274 /* module_bug_cleanup needs module_mutex protection */ 3312 /* module_bug_cleanup needs module_mutex protection */
3275 mutex_lock(&module_mutex); 3313 mutex_lock(&module_mutex);
3276 module_bug_cleanup(mod); 3314 module_bug_cleanup(mod);
3277 ddebug_cleanup:
3278 mutex_unlock(&module_mutex); 3315 mutex_unlock(&module_mutex);
3316 ddebug_cleanup:
3279 dynamic_debug_remove(info->debug); 3317 dynamic_debug_remove(info->debug);
3280 synchronize_sched(); 3318 synchronize_sched();
3281 kfree(mod->args); 3319 kfree(mod->args);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb20165..afc0456f227a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
153 goto out; 153 goto out;
154 } 154 }
155 155
156 new_ns = create_new_namespaces(flags, tsk, 156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
157 task_cred_xxx(tsk, user_ns), tsk->fs);
158 if (IS_ERR(new_ns)) { 157 if (IS_ERR(new_ns)) {
159 err = PTR_ERR(new_ns); 158 err = PTR_ERR(new_ns);
160 goto out; 159 goto out;
@@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
251 return PTR_ERR(file); 250 return PTR_ERR(file);
252 251
253 err = -EINVAL; 252 err = -EINVAL;
254 ei = PROC_I(file->f_dentry->d_inode); 253 ei = PROC_I(file_inode(file));
255 ops = ei->ns_ops; 254 ops = ei->ns_ops;
256 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
257 goto out; 256 goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..7c57cc9eee2c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
259 return tainted_mask; 259 return tainted_mask;
260} 260}
261 261
262void add_taint(unsigned flag) 262/**
263 * add_taint: add a taint flag if not already set.
264 * @flag: one of the TAINT_* constants.
265 * @lockdep_ok: whether lock debugging is still OK.
266 *
 267 * If something bad has gone wrong, pass @lockdep_ok = LOCKDEP_NOW_UNRELIABLE; for
 268 * noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK.
269 */
270void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
263{ 271{
264 /* 272 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
265 * Can't trust the integrity of the kernel anymore. 273 printk(KERN_WARNING
266 * We don't call directly debug_locks_off() because the issue 274 "Disabling lock debugging due to kernel taint\n");
267 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree
269 * development and post-warning case.
270 */
271 switch (flag) {
272 case TAINT_CRAP:
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 275
283 set_bit(flag, &tainted_mask); 276 set_bit(flag, &tainted_mask);
284} 277}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
421 print_modules(); 414 print_modules();
422 dump_stack(); 415 dump_stack();
423 print_oops_end_marker(); 416 print_oops_end_marker();
424 add_taint(taint); 417 /* Just a warning, don't kill lockdep. */
418 add_taint(taint, LOCKDEP_STILL_OK);
425} 419}
426 420
427void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 421void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
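With the two-argument add_taint(), every caller now states whether lockdep remains trustworthy after the taint. A short sketch of the two flavours, mirroring the call sites in the hunks above:

#include <linux/kernel.h>

/* Sketch only: choosing the lockdep_ok argument for add_taint(). */
static void taint_examples(void)
{
	/* A plain warning does not invalidate lock debugging. */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);

	/* A forced rmmod may leave corrupted state, so lockdep is shut off. */
	add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
}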
diff --git a/kernel/pid.c b/kernel/pid.c
index f2c6a6825098..047dc6264638 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns)
350 350
351struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 351struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
352{ 352{
353 struct hlist_node *elem;
354 struct upid *pnr; 353 struct upid *pnr;
355 354
356 hlist_for_each_entry_rcu(pnr, elem, 355 hlist_for_each_entry_rcu(pnr,
357 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 356 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
358 if (pnr->nr == nr && pnr->ns == ns) 357 if (pnr->nr == nr && pnr->ns == ns)
359 return container_of(pnr, struct pid, 358 return container_of(pnr, struct pid,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 10349d5f2ec3..6edbb2c55c22 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 552 return -EAGAIN;
553 553
554 spin_lock_init(&new_timer->it_lock); 554 spin_lock_init(&new_timer->it_lock);
555 retry: 555
556 if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { 556 idr_preload(GFP_KERNEL);
557 error = -EAGAIN;
558 goto out;
559 }
560 spin_lock_irq(&idr_lock); 557 spin_lock_irq(&idr_lock);
561 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); 558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
562 spin_unlock_irq(&idr_lock); 559 spin_unlock_irq(&idr_lock);
563 if (error) { 560 idr_preload_end();
564 if (error == -EAGAIN) 561 if (error < 0) {
565 goto retry;
566 /* 562 /*
567 * Weird looking, but we return EAGAIN if the IDR is 563 * Weird looking, but we return EAGAIN if the IDR is
568 * full (proper POSIX return value for this) 564 * full (proper POSIX return value for this)
569 */ 565 */
570 error = -EAGAIN; 566 if (error == -ENOSPC)
567 error = -EAGAIN;
571 goto out; 568 goto out;
572 } 569 }
570 new_timer_id = error;
573 571
574 it_id_set = IT_ID_SET; 572 it_id_set = IT_ID_SET;
575 new_timer->it_id = (timer_t) new_timer_id; 573 new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
639{ 637{
640 struct k_itimer *timr; 638 struct k_itimer *timr;
641 639
640 /*
641 * timer_t could be any type >= int and we want to make sure any
642 * @timer_id outside positive int range fails lookup.
643 */
644 if ((unsigned long long)timer_id > INT_MAX)
645 return NULL;
646
642 rcu_read_lock(); 647 rcu_read_lock();
643 timr = idr_find(&posix_timers_id, (int)timer_id); 648 timr = idr_find(&posix_timers_id, (int)timer_id);
644 if (timr) { 649 if (timr) {
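The timer_create() conversion above follows the idr_preload()/idr_alloc() pattern that replaces the old idr_pre_get()/idr_get_new() retry loop. The generic shape of that pattern, as a sketch (my_idr, my_lock and obj are placeholders, not names from this diff):

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int install(void *obj)
{
	int id;

	idr_preload(GFP_KERNEL);	/* may sleep; done outside the lock */
	spin_lock(&my_lock);
	id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
	spin_unlock(&my_lock);
	idr_preload_end();

	return id;	/* allocated id, or -ENOSPC / -ENOMEM on failure */
}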
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
66 66
67void queue_up_suspend_work(void) 67void queue_up_suspend_work(void)
68{ 68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) 69 if (autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work); 70 queue_work(autosleep_wq, &suspend_work);
71} 71}
72 72
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
313static suspend_state_t decode_state(const char *buf, size_t n) 313static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_STANDBY; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 const char * const *s;
318#endif 318#endif
319 char *p; 319 char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
553 553
554#endif /* CONFIG_PM_TRACE */ 554#endif /* CONFIG_PM_TRACE */
555 555
556#ifdef CONFIG_FREEZER
557static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
558 struct kobj_attribute *attr, char *buf)
559{
560 return sprintf(buf, "%u\n", freeze_timeout_msecs);
561}
562
563static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
564 struct kobj_attribute *attr,
565 const char *buf, size_t n)
566{
567 unsigned long val;
568
569 if (kstrtoul(buf, 10, &val))
570 return -EINVAL;
571
572 freeze_timeout_msecs = val;
573 return n;
574}
575
576power_attr(pm_freeze_timeout);
577
578#endif /* CONFIG_FREEZER*/
579
556static struct attribute * g[] = { 580static struct attribute * g[] = {
557 &state_attr.attr, 581 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 582#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
576 &pm_print_times_attr.attr, 600 &pm_print_times_attr.attr,
577#endif 601#endif
578#endif 602#endif
603#ifdef CONFIG_FREEZER
604 &pm_freeze_timeout_attr.attr,
605#endif
579 NULL, 606 NULL,
580}; 607};
581 608
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
25 25
26static int try_to_freeze_tasks(bool user_only) 26static int try_to_freeze_tasks(bool user_only)
27{ 27{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
36 36
37 do_gettimeofday(&start); 37 do_gettimeofday(&start);
38 38
39 end_time = jiffies + TIMEOUT; 39 end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
40 40
41 if (!user_only) 41 if (!user_only)
42 freeze_workqueues_begin(); 42 freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
359 return; 359 return;
360 } 360 }
361 361
362 if (delayed_work_pending(&req->work)) 362 cancel_delayed_work_sync(&req->work);
363 cancel_delayed_work_sync(&req->work);
364 363
365 if (new_value != req->node.prio) 364 if (new_value != req->node.prio)
366 pm_qos_update_target( 365 pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
386 "%s called for unknown object.", __func__)) 385 "%s called for unknown object.", __func__))
387 return; 386 return;
388 387
389 if (delayed_work_pending(&req->work)) 388 cancel_delayed_work_sync(&req->work);
390 cancel_delayed_work_sync(&req->work);
391 389
392 if (new_value != req->node.prio) 390 if (new_value != req->node.prio)
393 pm_qos_update_target( 391 pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 return; 414 return;
417 } 415 }
418 416
419 if (delayed_work_pending(&req->work)) 417 cancel_delayed_work_sync(&req->work);
420 cancel_delayed_work_sync(&req->work);
421 418
422 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
423 &req->node, PM_QOS_REMOVE_REQ, 420 &req->node, PM_QOS_REMOVE_REQ,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
30#include "power.h" 30#include "power.h"
31 31
32const char *const pm_states[PM_SUSPEND_MAX] = { 32const char *const pm_states[PM_SUSPEND_MAX] = {
33 [PM_SUSPEND_FREEZE] = "freeze",
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
36 37
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
40static bool need_suspend_ops(suspend_state_t state)
41{
42 return !!(state > PM_SUSPEND_FREEZE);
43}
44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
46static bool suspend_freeze_wake;
47
48static void freeze_begin(void)
49{
50 suspend_freeze_wake = false;
51}
52
53static void freeze_enter(void)
54{
55 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
56}
57
58void freeze_wake(void)
59{
60 suspend_freeze_wake = true;
61 wake_up(&suspend_freeze_wait_head);
62}
63EXPORT_SYMBOL_GPL(freeze_wake);
64
39/** 65/**
40 * suspend_set_ops - Set the global suspend method table. 66 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 67 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
50 76
51bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
52{ 78{
79 if (state == PM_SUSPEND_FREEZE)
80 return true;
53 /* 81 /*
 54 * All states need lowlevel support and need to be valid to the lowlevel 82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need lowlevel
83 * support and need to be valid to the lowlevel
55 * implementation, no valid callback implies that none are valid. 84 * implementation, no valid callback implies that none are valid.
56 */ 85 */
57 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 118 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes. 119 * freeze processes.
91 */ 120 */
92static int suspend_prepare(void) 121static int suspend_prepare(suspend_state_t state)
93{ 122{
94 int error; 123 int error;
95 124
96 if (!suspend_ops || !suspend_ops->enter) 125 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
97 return -EPERM; 126 return -EPERM;
98 127
99 pm_prepare_console(); 128 pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
137{ 166{
138 int error; 167 int error;
139 168
140 if (suspend_ops->prepare) { 169 if (need_suspend_ops(state) && suspend_ops->prepare) {
141 error = suspend_ops->prepare(); 170 error = suspend_ops->prepare();
142 if (error) 171 if (error)
143 goto Platform_finish; 172 goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
149 goto Platform_finish; 178 goto Platform_finish;
150 } 179 }
151 180
152 if (suspend_ops->prepare_late) { 181 if (need_suspend_ops(state) && suspend_ops->prepare_late) {
153 error = suspend_ops->prepare_late(); 182 error = suspend_ops->prepare_late();
154 if (error) 183 if (error)
155 goto Platform_wake; 184 goto Platform_wake;
156 } 185 }
157 186
187 /*
188 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors.
190 * Thus we should invoke freeze_enter() soon after
191 * all the devices are suspended.
192 */
193 if (state == PM_SUSPEND_FREEZE) {
194 freeze_enter();
195 goto Platform_wake;
196 }
197
158 if (suspend_test(TEST_PLATFORM)) 198 if (suspend_test(TEST_PLATFORM))
159 goto Platform_wake; 199 goto Platform_wake;
160 200
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
182 enable_nonboot_cpus(); 222 enable_nonboot_cpus();
183 223
184 Platform_wake: 224 Platform_wake:
185 if (suspend_ops->wake) 225 if (need_suspend_ops(state) && suspend_ops->wake)
186 suspend_ops->wake(); 226 suspend_ops->wake();
187 227
188 dpm_resume_start(PMSG_RESUME); 228 dpm_resume_start(PMSG_RESUME);
189 229
190 Platform_finish: 230 Platform_finish:
191 if (suspend_ops->finish) 231 if (need_suspend_ops(state) && suspend_ops->finish)
192 suspend_ops->finish(); 232 suspend_ops->finish();
193 233
194 return error; 234 return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
203 int error; 243 int error;
204 bool wakeup = false; 244 bool wakeup = false;
205 245
206 if (!suspend_ops) 246 if (need_suspend_ops(state) && !suspend_ops)
207 return -ENOSYS; 247 return -ENOSYS;
208 248
209 trace_machine_suspend(state); 249 trace_machine_suspend(state);
210 if (suspend_ops->begin) { 250 if (need_suspend_ops(state) && suspend_ops->begin) {
211 error = suspend_ops->begin(state); 251 error = suspend_ops->begin(state);
212 if (error) 252 if (error)
213 goto Close; 253 goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 266
227 do { 267 do {
228 error = suspend_enter(state, &wakeup); 268 error = suspend_enter(state, &wakeup);
229 } while (!error && !wakeup 269 } while (!error && !wakeup && need_suspend_ops(state)
230 && suspend_ops->suspend_again && suspend_ops->suspend_again()); 270 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 271
232 Resume_devices: 272 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
236 ftrace_start(); 276 ftrace_start();
237 resume_console(); 277 resume_console();
238 Close: 278 Close:
239 if (suspend_ops->end) 279 if (need_suspend_ops(state) && suspend_ops->end)
240 suspend_ops->end(); 280 suspend_ops->end();
241 trace_machine_suspend(PWR_EVENT_EXIT); 281 trace_machine_suspend(PWR_EVENT_EXIT);
242 return error; 282 return error;
243 283
244 Recover_platform: 284 Recover_platform:
245 if (suspend_ops->recover) 285 if (need_suspend_ops(state) && suspend_ops->recover)
246 suspend_ops->recover(); 286 suspend_ops->recover();
247 goto Resume_devices; 287 goto Resume_devices;
248} 288}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
278 if (!mutex_trylock(&pm_mutex)) 318 if (!mutex_trylock(&pm_mutex))
279 return -EBUSY; 319 return -EBUSY;
280 320
321 if (state == PM_SUSPEND_FREEZE)
322 freeze_begin();
323
281 printk(KERN_INFO "PM: Syncing filesystems ... "); 324 printk(KERN_INFO "PM: Syncing filesystems ... ");
282 sys_sync(); 325 sys_sync();
283 printk("done.\n"); 326 printk("done.\n");
284 327
285 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 328 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
286 error = suspend_prepare(); 329 error = suspend_prepare(state);
287 if (error) 330 if (error)
288 goto Unlock; 331 goto Unlock;
289 332
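freeze_wake() is the one new export in the suspend.c hunks: it is what ends the wait in freeze_enter() once all devices are suspended. A rough sketch of a caller, assuming an interrupt configured as a wakeup source and the matching prototype in <linux/suspend.h>; the handler name and wiring are hypothetical:

#include <linux/interrupt.h>
#include <linux/suspend.h>

/* Hypothetical wakeup interrupt: break the system out of the "freeze"
 * sleep state by satisfying the wait in freeze_enter(). */
static irqreturn_t demo_wake_isr(int irq, void *dev_id)
{
	freeze_wake();
	return IRQ_HANDLED;
}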
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
112 rtc_set_alarm(rtc, &alm); 112 rtc_set_alarm(rtc, &alm);
113} 113}
114 114
115static int __init has_wakealarm(struct device *dev, void *name_ptr) 115static int __init has_wakealarm(struct device *dev, const void *data)
116{ 116{
117 struct rtc_device *candidate = to_rtc_device(dev); 117 struct rtc_device *candidate = to_rtc_device(dev);
118 118
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
121 if (!device_may_wakeup(candidate->dev.parent)) 121 if (!device_may_wakeup(candidate->dev.parent))
122 return 0; 122 return 0;
123 123
124 *(const char **)name_ptr = dev_name(dev);
125 return 1; 124 return 1;
126} 125}
127 126
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
159 static char warn_no_rtc[] __initdata = 158 static char warn_no_rtc[] __initdata =
160 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; 159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
161 160
162 char *pony = NULL;
163 struct rtc_device *rtc = NULL; 161 struct rtc_device *rtc = NULL;
162 struct device *dev;
164 163
165 /* PM is initialized by now; is that state testable? */ 164 /* PM is initialized by now; is that state testable? */
166 if (test_state == PM_SUSPEND_ON) 165 if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
171 } 170 }
172 171
173 /* RTCs have initialized by now too ... can we use one? */ 172 /* RTCs have initialized by now too ... can we use one? */
174 class_find_device(rtc_class, NULL, &pony, has_wakealarm); 173 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
175 if (pony) 174 if (dev)
176 rtc = rtc_class_open(pony); 175 rtc = rtc_class_open(dev_name(dev));
177 if (!rtc) { 176 if (!rtc) {
178 printk(warn_no_rtc); 177 printk(warn_no_rtc);
179 goto done; 178 goto done;
diff --git a/kernel/printk.c b/kernel/printk.c
index f24633afa46a..0b31715f335a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -88,6 +88,12 @@ static DEFINE_SEMAPHORE(console_sem);
88struct console *console_drivers; 88struct console *console_drivers;
89EXPORT_SYMBOL_GPL(console_drivers); 89EXPORT_SYMBOL_GPL(console_drivers);
90 90
91#ifdef CONFIG_LOCKDEP
92static struct lockdep_map console_lock_dep_map = {
93 .name = "console_lock"
94};
95#endif
96
91/* 97/*
92 * This is used for debugging the mess that is the VT code by 98 * This is used for debugging the mess that is the VT code by
93 * keeping track if we have the console semaphore held. It's 99 * keeping track if we have the console semaphore held. It's
@@ -1919,6 +1925,7 @@ void console_lock(void)
1919 return; 1925 return;
1920 console_locked = 1; 1926 console_locked = 1;
1921 console_may_schedule = 1; 1927 console_may_schedule = 1;
1928 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1922} 1929}
1923EXPORT_SYMBOL(console_lock); 1930EXPORT_SYMBOL(console_lock);
1924 1931
@@ -1940,6 +1947,7 @@ int console_trylock(void)
1940 } 1947 }
1941 console_locked = 1; 1948 console_locked = 1;
1942 console_may_schedule = 0; 1949 console_may_schedule = 0;
1950 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1943 return 1; 1951 return 1;
1944} 1952}
1945EXPORT_SYMBOL(console_trylock); 1953EXPORT_SYMBOL(console_trylock);
@@ -2102,6 +2110,7 @@ skip:
2102 local_irq_restore(flags); 2110 local_irq_restore(flags);
2103 } 2111 }
2104 console_locked = 0; 2112 console_locked = 0;
2113 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2105 2114
2106 /* Release the exclusive_console once it is used */ 2115 /* Release the exclusive_console once it is used */
2107 if (unlikely(exclusive_console)) 2116 if (unlikely(exclusive_console))
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..01ab081ac53a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1139 if (!desc->count) 1139 if (!desc->count)
1140 return 0; 1140 return 0;
1141 1141
1142 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 1142 mutex_lock(&file_inode(filp)->i_mutex);
1143 do { 1143 do {
1144 if (!relay_file_read_avail(buf, *ppos)) 1144 if (!relay_file_read_avail(buf, *ppos))
1145 break; 1145 break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1159 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1159 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1160 } 1160 }
1161 } while (desc->count && ret); 1161 } while (desc->count && ret);
1162 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1162 mutex_unlock(&file_inode(filp)->i_mutex);
1163 1163
1164 return desc->written; 1164 return desc->written;
1165} 1165}
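The relay.c change above (like the nsproxy.c one earlier) is part of the file_inode() conversion: open-coded f_path.dentry->d_inode chains become a single helper call. The idiom in isolation, as a sketch:

#include <linux/fs.h>

/* file_inode(filp) replaces filp->f_path.dentry->d_inode dereferences. */
static struct inode *demo_inode(struct file *filp)
{
	return file_inode(filp);
}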
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
35 ag->tg->rt_se = NULL; 35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL; 36 ag->tg->rt_rq = NULL;
37#endif 37#endif
38 sched_offline_group(ag->tg);
38 sched_destroy_group(ag->tg); 39 sched_destroy_group(ag->tg);
39} 40}
40 41
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
76 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
77 goto out_free; 78 goto out_free;
78 79
80 sched_online_group(tg, &root_task_group);
81
79 kref_init(&ag->kref); 82 kref_init(&ag->kref);
80 init_rwsem(&ag->lock); 83 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr); 84 ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 03d7784b7bd2..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
1132 */ 1132 */
1133static int select_fallback_rq(int cpu, struct task_struct *p) 1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{ 1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1135 int nid = cpu_to_node(cpu);
1136 const struct cpumask *nodemask = NULL;
1136 enum { cpuset, possible, fail } state = cpuset; 1137 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu; 1138 int dest_cpu;
1138 1139
1139 /* Look for allowed, online CPU in same node. */ 1140 /*
1140 for_each_cpu(dest_cpu, nodemask) { 1141 * If the node that the cpu is on has been offlined, cpu_to_node()
1141 if (!cpu_online(dest_cpu)) 1142 * will return -1. There is no cpu on the node, and we should
 1142 continue; 1143 * select a cpu on another node.
1143 if (!cpu_active(dest_cpu)) 1144 */
1144 continue; 1145 if (nid != -1) {
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1146 nodemask = cpumask_of_node(nid);
1146 return dest_cpu; 1147
1148 /* Look for allowed, online CPU in same node. */
1149 for_each_cpu(dest_cpu, nodemask) {
1150 if (!cpu_online(dest_cpu))
1151 continue;
1152 if (!cpu_active(dest_cpu))
1153 continue;
1154 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1155 return dest_cpu;
1156 }
1147 } 1157 }
1148 1158
1149 for (;;) { 1159 for (;;) {
@@ -1742,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1742static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1752static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1743{ 1753{
1744 struct preempt_notifier *notifier; 1754 struct preempt_notifier *notifier;
1745 struct hlist_node *node;
1746 1755
1747 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1756 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1748 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1757 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1749} 1758}
1750 1759
@@ -1753,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
1753 struct task_struct *next) 1762 struct task_struct *next)
1754{ 1763{
1755 struct preempt_notifier *notifier; 1764 struct preempt_notifier *notifier;
1756 struct hlist_node *node;
1757 1765
1758 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1766 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1759 notifier->ops->sched_out(notifier, next); 1767 notifier->ops->sched_out(notifier, next);
1760} 1768}
1761 1769
@@ -1969,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
1969} 1977}
1970 1978
1971/* 1979/*
1972 * nr_running, nr_uninterruptible and nr_context_switches: 1980 * nr_running and nr_context_switches:
1973 * 1981 *
1974 * externally visible scheduler statistics: current number of runnable 1982 * externally visible scheduler statistics: current number of runnable
1975 * threads, current number of uninterruptible-sleeping threads, total 1983 * threads, total number of context switches performed since bootup.
1976 * number of context switches performed since bootup.
1977 */ 1984 */
1978unsigned long nr_running(void) 1985unsigned long nr_running(void)
1979{ 1986{
@@ -1985,23 +1992,6 @@ unsigned long nr_running(void)
1985 return sum; 1992 return sum;
1986} 1993}
1987 1994
1988unsigned long nr_uninterruptible(void)
1989{
1990 unsigned long i, sum = 0;
1991
1992 for_each_possible_cpu(i)
1993 sum += cpu_rq(i)->nr_uninterruptible;
1994
1995 /*
1996 * Since we read the counters lockless, it might be slightly
1997 * inaccurate. Do not allow it to go below zero though:
1998 */
1999 if (unlikely((long)sum < 0))
2000 sum = 0;
2001
2002 return sum;
2003}
2004
2005unsigned long long nr_context_switches(void) 1995unsigned long long nr_context_switches(void)
2006{ 1996{
2007 int i; 1997 int i;
@@ -2786,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
2786 if (irqs_disabled()) 2776 if (irqs_disabled())
2787 print_irqtrace_events(prev); 2777 print_irqtrace_events(prev);
2788 dump_stack(); 2778 dump_stack();
2789 add_taint(TAINT_WARN); 2779 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2790} 2780}
2791 2781
2792/* 2782/*
@@ -3268,7 +3258,8 @@ void complete_all(struct completion *x)
3268EXPORT_SYMBOL(complete_all); 3258EXPORT_SYMBOL(complete_all);
3269 3259
3270static inline long __sched 3260static inline long __sched
3271do_wait_for_common(struct completion *x, long timeout, int state) 3261do_wait_for_common(struct completion *x,
3262 long (*action)(long), long timeout, int state)
3272{ 3263{
3273 if (!x->done) { 3264 if (!x->done) {
3274 DECLARE_WAITQUEUE(wait, current); 3265 DECLARE_WAITQUEUE(wait, current);
@@ -3281,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3281 } 3272 }
3282 __set_current_state(state); 3273 __set_current_state(state);
3283 spin_unlock_irq(&x->wait.lock); 3274 spin_unlock_irq(&x->wait.lock);
3284 timeout = schedule_timeout(timeout); 3275 timeout = action(timeout);
3285 spin_lock_irq(&x->wait.lock); 3276 spin_lock_irq(&x->wait.lock);
3286 } while (!x->done && timeout); 3277 } while (!x->done && timeout);
3287 __remove_wait_queue(&x->wait, &wait); 3278 __remove_wait_queue(&x->wait, &wait);
@@ -3292,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3292 return timeout ?: 1; 3283 return timeout ?: 1;
3293} 3284}
3294 3285
3295static long __sched 3286static inline long __sched
3296wait_for_common(struct completion *x, long timeout, int state) 3287__wait_for_common(struct completion *x,
3288 long (*action)(long), long timeout, int state)
3297{ 3289{
3298 might_sleep(); 3290 might_sleep();
3299 3291
3300 spin_lock_irq(&x->wait.lock); 3292 spin_lock_irq(&x->wait.lock);
3301 timeout = do_wait_for_common(x, timeout, state); 3293 timeout = do_wait_for_common(x, action, timeout, state);
3302 spin_unlock_irq(&x->wait.lock); 3294 spin_unlock_irq(&x->wait.lock);
3303 return timeout; 3295 return timeout;
3304} 3296}
3305 3297
3298static long __sched
3299wait_for_common(struct completion *x, long timeout, int state)
3300{
3301 return __wait_for_common(x, schedule_timeout, timeout, state);
3302}
3303
3304static long __sched
3305wait_for_common_io(struct completion *x, long timeout, int state)
3306{
3307 return __wait_for_common(x, io_schedule_timeout, timeout, state);
3308}
3309
3306/** 3310/**
3307 * wait_for_completion: - waits for completion of a task 3311 * wait_for_completion: - waits for completion of a task
3308 * @x: holds the state of this particular completion 3312 * @x: holds the state of this particular completion
@@ -3339,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3339EXPORT_SYMBOL(wait_for_completion_timeout); 3343EXPORT_SYMBOL(wait_for_completion_timeout);
3340 3344
3341/** 3345/**
3346 * wait_for_completion_io: - waits for completion of a task
3347 * @x: holds the state of this particular completion
3348 *
3349 * This waits to be signaled for completion of a specific task. It is NOT
3350 * interruptible and there is no timeout. The caller is accounted as waiting
3351 * for IO.
3352 */
3353void __sched wait_for_completion_io(struct completion *x)
3354{
3355 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3356}
3357EXPORT_SYMBOL(wait_for_completion_io);
3358
3359/**
3360 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
3361 * @x: holds the state of this particular completion
3362 * @timeout: timeout value in jiffies
3363 *
3364 * This waits for either a completion of a specific task to be signaled or for a
3365 * specified timeout to expire. The timeout is in jiffies. It is not
3366 * interruptible. The caller is accounted as waiting for IO.
3367 *
3368 * The return value is 0 if timed out, and positive (at least 1, or number of
3369 * jiffies left till timeout) if completed.
3370 */
3371unsigned long __sched
3372wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
3373{
3374 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
3375}
3376EXPORT_SYMBOL(wait_for_completion_io_timeout);
3377
3378/**
3342 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3379 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3343 * @x: holds the state of this particular completion 3380 * @x: holds the state of this particular completion
3344 * 3381 *
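A minimal sketch of how the new IO-accounted wait above would be used; submit_demo_io() stands in for whatever actually starts the I/O and completes 'done' from its end-io path (a hypothetical helper, not part of this diff):

#include <linux/completion.h>

static void submit_demo_io(struct completion *done);	/* hypothetical */

static void sync_demo_io(void)
{
	DECLARE_COMPLETION_ONSTACK(done);

	submit_demo_io(&done);		/* end-io path calls complete(&done) */
	wait_for_completion_io(&done);	/* caller is charged as iowait */
}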
@@ -4364,7 +4401,10 @@ EXPORT_SYMBOL(yield);
4364 * It's the caller's job to ensure that the target task struct 4401 * It's the caller's job to ensure that the target task struct
4365 * can't go away on us before we can do any checks. 4402 * can't go away on us before we can do any checks.
4366 * 4403 *
4367 * Returns true if we indeed boosted the target task. 4404 * Returns:
4405 * true (>0) if we indeed boosted the target task.
4406 * false (0) if we failed to boost the target.
4407 * -ESRCH if there's no task to yield to.
4368 */ 4408 */
4369bool __sched yield_to(struct task_struct *p, bool preempt) 4409bool __sched yield_to(struct task_struct *p, bool preempt)
4370{ 4410{
@@ -4378,6 +4418,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
4378 4418
4379again: 4419again:
4380 p_rq = task_rq(p); 4420 p_rq = task_rq(p);
4421 /*
4422 * If we're the only runnable task on the rq and target rq also
4423 * has only one task, there's absolutely no point in yielding.
4424 */
4425 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4426 yielded = -ESRCH;
4427 goto out_irq;
4428 }
4429
4381 double_rq_lock(rq, p_rq); 4430 double_rq_lock(rq, p_rq);
4382 while (task_rq(p) != p_rq) { 4431 while (task_rq(p) != p_rq) {
4383 double_rq_unlock(rq, p_rq); 4432 double_rq_unlock(rq, p_rq);
@@ -4385,13 +4434,13 @@ again:
4385 } 4434 }
4386 4435
4387 if (!curr->sched_class->yield_to_task) 4436 if (!curr->sched_class->yield_to_task)
4388 goto out; 4437 goto out_unlock;
4389 4438
4390 if (curr->sched_class != p->sched_class) 4439 if (curr->sched_class != p->sched_class)
4391 goto out; 4440 goto out_unlock;
4392 4441
4393 if (task_running(p_rq, p) || p->state) 4442 if (task_running(p_rq, p) || p->state)
4394 goto out; 4443 goto out_unlock;
4395 4444
4396 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4445 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4397 if (yielded) { 4446 if (yielded) {
@@ -4404,11 +4453,12 @@ again:
4404 resched_task(p_rq->curr); 4453 resched_task(p_rq->curr);
4405 } 4454 }
4406 4455
4407out: 4456out_unlock:
4408 double_rq_unlock(rq, p_rq); 4457 double_rq_unlock(rq, p_rq);
4458out_irq:
4409 local_irq_restore(flags); 4459 local_irq_restore(flags);
4410 4460
4411 if (yielded) 4461 if (yielded > 0)
4412 schedule(); 4462 schedule();
4413 4463
4414 return yielded; 4464 return yielded;
@@ -7161,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
7161struct task_group *sched_create_group(struct task_group *parent) 7211struct task_group *sched_create_group(struct task_group *parent)
7162{ 7212{
7163 struct task_group *tg; 7213 struct task_group *tg;
7164 unsigned long flags;
7165 7214
7166 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7215 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7167 if (!tg) 7216 if (!tg)
@@ -7173,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
7173 if (!alloc_rt_sched_group(tg, parent)) 7222 if (!alloc_rt_sched_group(tg, parent))
7174 goto err; 7223 goto err;
7175 7224
7225 return tg;
7226
7227err:
7228 free_sched_group(tg);
7229 return ERR_PTR(-ENOMEM);
7230}
7231
7232void sched_online_group(struct task_group *tg, struct task_group *parent)
7233{
7234 unsigned long flags;
7235
7176 spin_lock_irqsave(&task_group_lock, flags); 7236 spin_lock_irqsave(&task_group_lock, flags);
7177 list_add_rcu(&tg->list, &task_groups); 7237 list_add_rcu(&tg->list, &task_groups);
7178 7238
@@ -7182,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
7182 INIT_LIST_HEAD(&tg->children); 7242 INIT_LIST_HEAD(&tg->children);
7183 list_add_rcu(&tg->siblings, &parent->children); 7243 list_add_rcu(&tg->siblings, &parent->children);
7184 spin_unlock_irqrestore(&task_group_lock, flags); 7244 spin_unlock_irqrestore(&task_group_lock, flags);
7185
7186 return tg;
7187
7188err:
7189 free_sched_group(tg);
7190 return ERR_PTR(-ENOMEM);
7191} 7245}
7192 7246
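sched_create_group() now only allocates and initialises the group; nothing becomes visible to the rest of the scheduler until sched_online_group() links it in under task_group_lock. A hedged sketch of the resulting call order for a creator of a task group (the cpu cgroup callbacks added further down are one real caller; the parent chosen here is illustrative):

	struct task_group *tg;

	tg = sched_create_group(&root_task_group);	/* allocate, still unlinked */
	if (IS_ERR(tg))
		return PTR_ERR(tg);
	sched_online_group(tg, &root_task_group);	/* publish on task_groups and the parent's children list */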
7193/* rcu callback to free various structures associated with a task group */ 7247/* rcu callback to free various structures associated with a task group */
@@ -7200,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7200/* Destroy runqueue etc associated with a task group */ 7254/* Destroy runqueue etc associated with a task group */
7201void sched_destroy_group(struct task_group *tg) 7255void sched_destroy_group(struct task_group *tg)
7202{ 7256{
 7257	/* wait for possible concurrent references to cfs_rqs to complete */
7258 call_rcu(&tg->rcu, free_sched_group_rcu);
7259}
7260
7261void sched_offline_group(struct task_group *tg)
7262{
7203 unsigned long flags; 7263 unsigned long flags;
7204 int i; 7264 int i;
7205 7265
@@ -7211,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
7211 list_del_rcu(&tg->list); 7271 list_del_rcu(&tg->list);
7212 list_del_rcu(&tg->siblings); 7272 list_del_rcu(&tg->siblings);
7213 spin_unlock_irqrestore(&task_group_lock, flags); 7273 spin_unlock_irqrestore(&task_group_lock, flags);
7214
7215 /* wait for possible concurrent references to cfs_rqs complete */
7216 call_rcu(&tg->rcu, free_sched_group_rcu);
7217} 7274}
7218 7275
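Teardown is split the same way: sched_offline_group() unhooks the group from the lists under task_group_lock, while sched_destroy_group() is reduced to queueing the RCU free. The expected order at a call site mirrors the pair above (sketch):

	sched_offline_group(tg);	/* list_del_rcu() of tg->list and tg->siblings */
	sched_destroy_group(tg);	/* call_rcu() frees once concurrent readers are done */

For the cpu controller this ordering falls out of the cgroup core, which invokes css_offline before css_free.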
7219/* change task's runqueue when it moves between groups. 7276/* change task's runqueue when it moves between groups.
@@ -7584,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7584 return &tg->css; 7641 return &tg->css;
7585} 7642}
7586 7643
7644static int cpu_cgroup_css_online(struct cgroup *cgrp)
7645{
7646 struct task_group *tg = cgroup_tg(cgrp);
7647 struct task_group *parent;
7648
7649 if (!cgrp->parent)
7650 return 0;
7651
7652 parent = cgroup_tg(cgrp->parent);
7653 sched_online_group(tg, parent);
7654 return 0;
7655}
7656
7587static void cpu_cgroup_css_free(struct cgroup *cgrp) 7657static void cpu_cgroup_css_free(struct cgroup *cgrp)
7588{ 7658{
7589 struct task_group *tg = cgroup_tg(cgrp); 7659 struct task_group *tg = cgroup_tg(cgrp);
@@ -7591,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
7591 sched_destroy_group(tg); 7661 sched_destroy_group(tg);
7592} 7662}
7593 7663
7664static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7665{
7666 struct task_group *tg = cgroup_tg(cgrp);
7667
7668 sched_offline_group(tg);
7669}
7670
7594static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7671static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7595 struct cgroup_taskset *tset) 7672 struct cgroup_taskset *tset)
7596{ 7673{
@@ -7946,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7946 .name = "cpu", 8023 .name = "cpu",
7947 .css_alloc = cpu_cgroup_css_alloc, 8024 .css_alloc = cpu_cgroup_css_alloc,
7948 .css_free = cpu_cgroup_css_free, 8025 .css_free = cpu_cgroup_css_free,
8026 .css_online = cpu_cgroup_css_online,
8027 .css_offline = cpu_cgroup_css_offline,
7949 .can_attach = cpu_cgroup_can_attach, 8028 .can_attach = cpu_cgroup_can_attach,
7950 .attach = cpu_cgroup_attach, 8029 .attach = cpu_cgroup_attach,
7951 .exit = cpu_cgroup_exit, 8030 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9857329ed280..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk)
604{ 604{
605 unsigned long long clock; 605 unsigned long long clock;
606 606
607 clock = sched_clock(); 607 clock = local_clock();
608 if (clock < tsk->vtime_snap) 608 if (clock < tsk->vtime_snap)
609 return 0; 609 return 0;
610 610
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
110 if (autogroup_path(tg, group_path, PATH_MAX)) 110 if (autogroup_path(tg, group_path, PATH_MAX))
111 return group_path; 111 return group_path;
112 112
113 /*
114 * May be NULL if the underlying cgroup isn't fully-created yet
115 */
116 if (!tg->css.cgroup) {
117 group_path[0] = '\0';
118 return group_path;
119 }
120 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 113 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
121 return group_path; 114 return group_path;
122} 115}
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
269 { 262 {
270 unsigned int freq = cpu_khz ? : 1; 263 unsigned int freq = cpu_khz ? : 1;
271 264
272 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", 265 SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
273 cpu, freq / 1000, (freq % 1000)); 266 cpu, freq / 1000, (freq % 1000));
274 } 267 }
275#else 268#else
276 SEQ_printf(m, "\ncpu#%d\n", cpu); 269 SEQ_printf(m, "cpu#%d\n", cpu);
277#endif 270#endif
278 271
279#define P(x) \ 272#define P(x) \
@@ -330,6 +323,7 @@ do { \
330 print_rq(m, rq, cpu); 323 print_rq(m, rq, cpu);
331 rcu_read_unlock(); 324 rcu_read_unlock();
332 spin_unlock_irqrestore(&sched_debug_lock, flags); 325 spin_unlock_irqrestore(&sched_debug_lock, flags);
326 SEQ_printf(m, "\n");
333} 327}
334 328
335static const char *sched_tunable_scaling_names[] = { 329static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
338 "linear" 332 "linear"
339}; 333};
340 334
341static int sched_debug_show(struct seq_file *m, void *v) 335static void sched_debug_header(struct seq_file *m)
342{ 336{
343 u64 ktime, sched_clk, cpu_clk; 337 u64 ktime, sched_clk, cpu_clk;
344 unsigned long flags; 338 unsigned long flags;
345 int cpu;
346 339
347 local_irq_save(flags); 340 local_irq_save(flags);
348 ktime = ktime_to_ns(ktime_get()); 341 ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
384#undef PN 377#undef PN
385#undef P 378#undef P
386 379
387 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", 380 SEQ_printf(m, " .%-40s: %d (%s)\n",
381 "sysctl_sched_tunable_scaling",
388 sysctl_sched_tunable_scaling, 382 sysctl_sched_tunable_scaling,
389 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); 383 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
384 SEQ_printf(m, "\n");
385}
390 386
391 for_each_online_cpu(cpu) 387static int sched_debug_show(struct seq_file *m, void *v)
392 print_cpu(m, cpu); 388{
389 int cpu = (unsigned long)(v - 2);
393 390
394 SEQ_printf(m, "\n"); 391 if (cpu != -1)
392 print_cpu(m, cpu);
393 else
394 sched_debug_header(m);
395 395
396 return 0; 396 return 0;
397} 397}
398 398
399void sysrq_sched_debug_show(void) 399void sysrq_sched_debug_show(void)
400{ 400{
401 sched_debug_show(NULL, NULL); 401 int cpu;
402
403 sched_debug_header(NULL);
404 for_each_online_cpu(cpu)
405 print_cpu(NULL, cpu);
406
407}
408
409/*
 410 * This iterator needs some explanation.
411 * It returns 1 for the header position.
412 * This means 2 is cpu 0.
413 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
414 * to use cpumask_* to iterate over the cpus.
415 */
416static void *sched_debug_start(struct seq_file *file, loff_t *offset)
417{
418 unsigned long n = *offset;
419
420 if (n == 0)
421 return (void *) 1;
422
423 n--;
424
425 if (n > 0)
426 n = cpumask_next(n - 1, cpu_online_mask);
427 else
428 n = cpumask_first(cpu_online_mask);
429
430 *offset = n + 1;
431
432 if (n < nr_cpu_ids)
433 return (void *)(unsigned long)(n + 2);
434 return NULL;
435}
436
437static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
438{
439 (*offset)++;
440 return sched_debug_start(file, offset);
441}
442
443static void sched_debug_stop(struct seq_file *file, void *data)
444{
445}
446
447static const struct seq_operations sched_debug_sops = {
448 .start = sched_debug_start,
449 .next = sched_debug_next,
450 .stop = sched_debug_stop,
451 .show = sched_debug_show,
452};
453
454static int sched_debug_release(struct inode *inode, struct file *file)
455{
456 seq_release(inode, file);
457
458 return 0;
402} 459}
403 460
404static int sched_debug_open(struct inode *inode, struct file *filp) 461static int sched_debug_open(struct inode *inode, struct file *filp)
405{ 462{
406 return single_open(filp, sched_debug_show, NULL); 463 int ret = 0;
464
465 ret = seq_open(filp, &sched_debug_sops);
466
467 return ret;
407} 468}
408 469
409static const struct file_operations sched_debug_fops = { 470static const struct file_operations sched_debug_fops = {
410 .open = sched_debug_open, 471 .open = sched_debug_open,
411 .read = seq_read, 472 .read = seq_read,
412 .llseek = seq_lseek, 473 .llseek = seq_lseek,
413 .release = single_release, 474 .release = sched_debug_release,
414}; 475};
415 476
416static int __init init_sched_debug_procfs(void) 477static int __init init_sched_debug_procfs(void)
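This iterator and the near-identical one added to kernel/sched/stats.c below share one encoding: seq position 0 maps to the token (void *)1 for the header record, and position n with n >= 1 maps to the (n-1)-th online cpu, encoded as cpu + 2. A small sketch of how a ->show() callback decodes the token (names are illustrative):

	#include <linux/seq_file.h>

	static int example_show(struct seq_file *m, void *v)
	{
		int cpu = (unsigned long)(v - 2);	/* the header token (void *)1 decodes to -1 */

		if (cpu == -1)
			seq_puts(m, "header record\n");
		else
			seq_printf(m, "record for cpu%d\n", cpu);
		return 0;
	}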
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
21 if (mask_str == NULL) 21 if (mask_str == NULL)
22 return -ENOMEM; 22 return -ENOMEM;
23 23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 24 if (v == (void *)1) {
25 seq_printf(seq, "timestamp %lu\n", jiffies); 25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
26 for_each_online_cpu(cpu) { 26 seq_printf(seq, "timestamp %lu\n", jiffies);
27 struct rq *rq = cpu_rq(cpu); 27 } else {
28 struct rq *rq;
28#ifdef CONFIG_SMP 29#ifdef CONFIG_SMP
29 struct sched_domain *sd; 30 struct sched_domain *sd;
30 int dcount = 0; 31 int dcount = 0;
31#endif 32#endif
33 cpu = (unsigned long)(v - 2);
34 rq = cpu_rq(cpu);
32 35
33 /* runqueue-specific stats */ 36 /* runqueue-specific stats */
34 seq_printf(seq, 37 seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
77 return 0; 80 return 0;
78} 81}
79 82
80static int schedstat_open(struct inode *inode, struct file *file) 83/*
 84 * This iterator needs some explanation.
85 * It returns 1 for the header position.
86 * This means 2 is cpu 0.
87 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
88 * to use cpumask_* to iterate over the cpus.
89 */
90static void *schedstat_start(struct seq_file *file, loff_t *offset)
81{ 91{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 92 unsigned long n = *offset;
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86 93
87 if (!buf) 94 if (n == 0)
88 return -ENOMEM; 95 return (void *) 1;
89 res = single_open(file, show_schedstat, NULL); 96
90 if (!res) { 97 n--;
91 m = file->private_data; 98
92 m->buf = buf; 99 if (n > 0)
93 m->size = size; 100 n = cpumask_next(n - 1, cpu_online_mask);
94 } else 101 else
95 kfree(buf); 102 n = cpumask_first(cpu_online_mask);
96 return res; 103
104 *offset = n + 1;
105
106 if (n < nr_cpu_ids)
107 return (void *)(unsigned long)(n + 2);
108 return NULL;
109}
110
111static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
112{
113 (*offset)++;
114 return schedstat_start(file, offset);
115}
116
117static void schedstat_stop(struct seq_file *file, void *data)
118{
119}
120
121static const struct seq_operations schedstat_sops = {
122 .start = schedstat_start,
123 .next = schedstat_next,
124 .stop = schedstat_stop,
125 .show = show_schedstat,
126};
127
128static int schedstat_open(struct inode *inode, struct file *file)
129{
130 return seq_open(file, &schedstat_sops);
97} 131}
98 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
99static const struct file_operations proc_schedstat_operations = { 138static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open, 139 .open = schedstat_open,
101 .read = seq_read, 140 .read = seq_read,
102 .llseek = seq_lseek, 141 .llseek = seq_lseek,
103 .release = single_release, 142 .release = schedstat_release,
104}; 143};
105 144
106static int __init proc_schedstat_init(void) 145static int __init proc_schedstat_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 7f82adbad480..dd72567767d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
485 if (force_default || ka->sa.sa_handler != SIG_IGN) 485 if (force_default || ka->sa.sa_handler != SIG_IGN)
486 ka->sa.sa_handler = SIG_DFL; 486 ka->sa.sa_handler = SIG_DFL;
487 ka->sa.sa_flags = 0; 487 ka->sa.sa_flags = 0;
488#ifdef __ARCH_HAS_SA_RESTORER
489 ka->sa.sa_restorer = NULL;
490#endif
488 sigemptyset(&ka->sa.sa_mask); 491 sigemptyset(&ka->sa.sa_mask);
489 ka++; 492 ka++;
490 } 493 }
@@ -1157,11 +1160,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1157static void print_fatal_signal(int signr) 1160static void print_fatal_signal(int signr)
1158{ 1161{
1159 struct pt_regs *regs = signal_pt_regs(); 1162 struct pt_regs *regs = signal_pt_regs();
1160 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1163 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
1161 current->comm, task_pid_nr(current), signr); 1164 current->comm, task_pid_nr(current), signr);
1162 1165
1163#if defined(__i386__) && !defined(__arch_um__) 1166#if defined(__i386__) && !defined(__arch_um__)
1164 printk("code at %08lx: ", regs->ip); 1167 printk(KERN_INFO "code at %08lx: ", regs->ip);
1165 { 1168 {
1166 int i; 1169 int i;
1167 for (i = 0; i < 16; i++) { 1170 for (i = 0; i < 16; i++) {
@@ -1169,11 +1172,11 @@ static void print_fatal_signal(int signr)
1169 1172
1170 if (get_user(insn, (unsigned char *)(regs->ip + i))) 1173 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1171 break; 1174 break;
1172 printk("%02x ", insn); 1175 printk(KERN_CONT "%02x ", insn);
1173 } 1176 }
1174 } 1177 }
1178 printk(KERN_CONT "\n");
1175#endif 1179#endif
1176 printk("\n");
1177 preempt_disable(); 1180 preempt_disable();
1178 show_regs(regs); 1181 show_regs(regs);
1179 preempt_enable(); 1182 preempt_enable();
@@ -2399,6 +2402,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2399 tracehook_signal_handler(sig, info, ka, regs, stepping); 2402 tracehook_signal_handler(sig, info, ka, regs, stepping);
2400} 2403}
2401 2404
2405void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2406{
2407 if (failed)
2408 force_sigsegv(ksig->sig, current);
2409 else
2410 signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
2411 signal_pt_regs(), stepping);
2412}
2413
2402/* 2414/*
2403 * It could be that complete_signal() picked us to notify about the 2415 * It could be that complete_signal() picked us to notify about the
2404 * group-wide signal. Other threads should be notified now to take 2416 * group-wide signal. Other threads should be notified now to take
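signal_setup_done() packages what every architecture currently open-codes at the end of its handle_signal(): force_sigsegv() when building the signal frame failed, signal_delivered() otherwise. A hedged sketch of the intended call site; setup_rt_frame(), oldset and the single-step test stand in for arch-specific code and are not defined by this patch:

	/* arch/<arch>/kernel/signal.c, once it carries a struct ksignal 'ksig': */
	int ret = setup_rt_frame(&ksig, oldset, regs);	/* arch-specific frame builder */
	signal_setup_done(ret, &ksig, test_thread_flag(TIF_SINGLESTEP));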
@@ -2616,40 +2628,95 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2616 return 0; 2628 return 0;
2617} 2629}
2618 2630
2619long do_sigpending(void __user *set, unsigned long sigsetsize) 2631#ifdef CONFIG_COMPAT
2632COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
2633 compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
2620{ 2634{
2621 long error = -EINVAL; 2635#ifdef __BIG_ENDIAN
2622 sigset_t pending; 2636 sigset_t old_set = current->blocked;
2637
2638 /* XXX: Don't preclude handling different sized sigset_t's. */
2639 if (sigsetsize != sizeof(sigset_t))
2640 return -EINVAL;
2641
2642 if (nset) {
2643 compat_sigset_t new32;
2644 sigset_t new_set;
2645 int error;
2646 if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
2647 return -EFAULT;
2648
2649 sigset_from_compat(&new_set, &new32);
2650 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2651
2652 error = sigprocmask(how, &new_set, NULL);
2653 if (error)
2654 return error;
2655 }
2656 if (oset) {
2657 compat_sigset_t old32;
2658 sigset_to_compat(&old32, &old_set);
2659 if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
2660 return -EFAULT;
2661 }
2662 return 0;
2663#else
2664 return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
2665 (sigset_t __user *)oset, sigsetsize);
2666#endif
2667}
2668#endif
2623 2669
2670static int do_sigpending(void *set, unsigned long sigsetsize)
2671{
2624 if (sigsetsize > sizeof(sigset_t)) 2672 if (sigsetsize > sizeof(sigset_t))
2625 goto out; 2673 return -EINVAL;
2626 2674
2627 spin_lock_irq(&current->sighand->siglock); 2675 spin_lock_irq(&current->sighand->siglock);
2628 sigorsets(&pending, &current->pending.signal, 2676 sigorsets(set, &current->pending.signal,
2629 &current->signal->shared_pending.signal); 2677 &current->signal->shared_pending.signal);
2630 spin_unlock_irq(&current->sighand->siglock); 2678 spin_unlock_irq(&current->sighand->siglock);
2631 2679
2632 /* Outside the lock because only this thread touches it. */ 2680 /* Outside the lock because only this thread touches it. */
2633 sigandsets(&pending, &current->blocked, &pending); 2681 sigandsets(set, &current->blocked, set);
2634 2682 return 0;
2635 error = -EFAULT;
2636 if (!copy_to_user(set, &pending, sigsetsize))
2637 error = 0;
2638
2639out:
2640 return error;
2641} 2683}
2642 2684
2643/** 2685/**
2644 * sys_rt_sigpending - examine a pending signal that has been raised 2686 * sys_rt_sigpending - examine a pending signal that has been raised
2645 * while blocked 2687 * while blocked
2646 * @set: stores pending signals 2688 * @uset: stores pending signals
2647 * @sigsetsize: size of sigset_t type or larger 2689 * @sigsetsize: size of sigset_t type or larger
2648 */ 2690 */
2649SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2691SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
2650{ 2692{
2651 return do_sigpending(set, sigsetsize); 2693 sigset_t set;
2694 int err = do_sigpending(&set, sigsetsize);
2695 if (!err && copy_to_user(uset, &set, sigsetsize))
2696 err = -EFAULT;
2697 return err;
2698}
2699
2700#ifdef CONFIG_COMPAT
2701COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2702 compat_size_t, sigsetsize)
2703{
2704#ifdef __BIG_ENDIAN
2705 sigset_t set;
2706 int err = do_sigpending(&set, sigsetsize);
2707 if (!err) {
2708 compat_sigset_t set32;
2709 sigset_to_compat(&set32, &set);
2710 /* we can get here only if sigsetsize <= sizeof(set) */
2711 if (copy_to_user(uset, &set32, sigsetsize))
2712 err = -EFAULT;
2713 }
2714 return err;
2715#else
2716 return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
2717#endif
2652} 2718}
2719#endif
2653 2720
2654#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2721#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2655 2722
@@ -2927,6 +2994,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2927 return do_tkill(0, pid, sig); 2994 return do_tkill(0, pid, sig);
2928} 2995}
2929 2996
2997static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2998{
2999 /* Not even root can pretend to send signals from the kernel.
3000 * Nor can they impersonate a kill()/tgkill(), which adds source info.
3001 */
3002 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3003 (task_pid_vnr(current) != pid)) {
3004 /* We used to allow any < 0 si_code */
3005 WARN_ON_ONCE(info->si_code < 0);
3006 return -EPERM;
3007 }
3008 info->si_signo = sig;
3009
3010 /* POSIX.1b doesn't mention process groups. */
3011 return kill_proc_info(sig, info, pid);
3012}
3013
2930/** 3014/**
2931 * sys_rt_sigqueueinfo - send signal information to a signal 3015 * sys_rt_sigqueueinfo - send signal information to a signal
2932 * @pid: the PID of the thread 3016 * @pid: the PID of the thread
@@ -2937,25 +3021,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2937 siginfo_t __user *, uinfo) 3021 siginfo_t __user *, uinfo)
2938{ 3022{
2939 siginfo_t info; 3023 siginfo_t info;
2940
2941 if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) 3024 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2942 return -EFAULT; 3025 return -EFAULT;
3026 return do_rt_sigqueueinfo(pid, sig, &info);
3027}
2943 3028
2944 /* Not even root can pretend to send signals from the kernel. 3029#ifdef CONFIG_COMPAT
2945 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3030COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
2946 */ 3031 compat_pid_t, pid,
2947 if (info.si_code >= 0 || info.si_code == SI_TKILL) { 3032 int, sig,
2948 /* We used to allow any < 0 si_code */ 3033 struct compat_siginfo __user *, uinfo)
2949 WARN_ON_ONCE(info.si_code < 0); 3034{
2950 return -EPERM; 3035 siginfo_t info;
2951 } 3036 int ret = copy_siginfo_from_user32(&info, uinfo);
2952 info.si_signo = sig; 3037 if (unlikely(ret))
2953 3038 return ret;
2954 /* POSIX.1b doesn't mention process groups. */ 3039 return do_rt_sigqueueinfo(pid, sig, &info);
2955 return kill_proc_info(sig, &info, pid);
2956} 3040}
3041#endif
2957 3042
2958long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) 3043static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2959{ 3044{
2960 /* This is only valid for single tasks */ 3045 /* This is only valid for single tasks */
2961 if (pid <= 0 || tgid <= 0) 3046 if (pid <= 0 || tgid <= 0)
@@ -2964,7 +3049,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2964 /* Not even root can pretend to send signals from the kernel. 3049 /* Not even root can pretend to send signals from the kernel.
2965 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3050 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2966 */ 3051 */
2967 if (info->si_code >= 0 || info->si_code == SI_TKILL) { 3052 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
3053 (task_pid_vnr(current) != pid)) {
2968 /* We used to allow any < 0 si_code */ 3054 /* We used to allow any < 0 si_code */
2969 WARN_ON_ONCE(info->si_code < 0); 3055 WARN_ON_ONCE(info->si_code < 0);
2970 return -EPERM; 3056 return -EPERM;
@@ -2985,6 +3071,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2985 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); 3071 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2986} 3072}
2987 3073
3074#ifdef CONFIG_COMPAT
3075COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3076 compat_pid_t, tgid,
3077 compat_pid_t, pid,
3078 int, sig,
3079 struct compat_siginfo __user *, uinfo)
3080{
3081 siginfo_t info;
3082
3083 if (copy_siginfo_from_user32(&info, uinfo))
3084 return -EFAULT;
3085 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
3086}
3087#endif
3088
2988int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3089int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2989{ 3090{
2990 struct task_struct *t = current; 3091 struct task_struct *t = current;
@@ -3030,7 +3131,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3030 return 0; 3131 return 0;
3031} 3132}
3032 3133
3033int 3134static int
3034do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3135do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3035{ 3136{
3036 stack_t oss; 3137 stack_t oss;
@@ -3095,12 +3196,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3095out: 3196out:
3096 return error; 3197 return error;
3097} 3198}
3098#ifdef CONFIG_GENERIC_SIGALTSTACK
3099SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) 3199SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3100{ 3200{
3101 return do_sigaltstack(uss, uoss, current_user_stack_pointer()); 3201 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3102} 3202}
3103#endif
3104 3203
3105int restore_altstack(const stack_t __user *uss) 3204int restore_altstack(const stack_t __user *uss)
3106{ 3205{
@@ -3118,7 +3217,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
3118} 3217}
3119 3218
3120#ifdef CONFIG_COMPAT 3219#ifdef CONFIG_COMPAT
3121#ifdef CONFIG_GENERIC_SIGALTSTACK
3122COMPAT_SYSCALL_DEFINE2(sigaltstack, 3220COMPAT_SYSCALL_DEFINE2(sigaltstack,
3123 const compat_stack_t __user *, uss_ptr, 3221 const compat_stack_t __user *, uss_ptr,
3124 compat_stack_t __user *, uoss_ptr) 3222 compat_stack_t __user *, uoss_ptr)
@@ -3168,7 +3266,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3168 __put_user(t->sas_ss_size, &uss->ss_size); 3266 __put_user(t->sas_ss_size, &uss->ss_size);
3169} 3267}
3170#endif 3268#endif
3171#endif
3172 3269
3173#ifdef __ARCH_WANT_SYS_SIGPENDING 3270#ifdef __ARCH_WANT_SYS_SIGPENDING
3174 3271
@@ -3178,7 +3275,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3178 */ 3275 */
3179SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 3276SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3180{ 3277{
3181 return do_sigpending(set, sizeof(*set)); 3278 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3182} 3279}
3183 3280
3184#endif 3281#endif
@@ -3234,7 +3331,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3234} 3331}
3235#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3332#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
3236 3333
3237#ifdef __ARCH_WANT_SYS_RT_SIGACTION 3334#ifndef CONFIG_ODD_RT_SIGACTION
3238/** 3335/**
3239 * sys_rt_sigaction - alter an action taken by a process 3336 * sys_rt_sigaction - alter an action taken by a process
3240 * @sig: signal to be sent 3337 * @sig: signal to be sent
@@ -3268,7 +3365,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
3268out: 3365out:
3269 return ret; 3366 return ret;
3270} 3367}
3271#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ 3368#ifdef CONFIG_COMPAT
3369COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3370 const struct compat_sigaction __user *, act,
3371 struct compat_sigaction __user *, oact,
3372 compat_size_t, sigsetsize)
3373{
3374 struct k_sigaction new_ka, old_ka;
3375 compat_sigset_t mask;
3376#ifdef __ARCH_HAS_SA_RESTORER
3377 compat_uptr_t restorer;
3378#endif
3379 int ret;
3380
3381 /* XXX: Don't preclude handling different sized sigset_t's. */
3382 if (sigsetsize != sizeof(compat_sigset_t))
3383 return -EINVAL;
3384
3385 if (act) {
3386 compat_uptr_t handler;
3387 ret = get_user(handler, &act->sa_handler);
3388 new_ka.sa.sa_handler = compat_ptr(handler);
3389#ifdef __ARCH_HAS_SA_RESTORER
3390 ret |= get_user(restorer, &act->sa_restorer);
3391 new_ka.sa.sa_restorer = compat_ptr(restorer);
3392#endif
3393 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3394 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
3395 if (ret)
3396 return -EFAULT;
3397 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
3398 }
3399
3400 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3401 if (!ret && oact) {
3402 sigset_to_compat(&mask, &old_ka.sa.sa_mask);
3403 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3404 &oact->sa_handler);
3405 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3406 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3407#ifdef __ARCH_HAS_SA_RESTORER
3408 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3409 &oact->sa_restorer);
3410#endif
3411 }
3412 return ret;
3413}
3414#endif
3415#endif /* !CONFIG_ODD_RT_SIGACTION */
3416
3417#ifdef CONFIG_OLD_SIGACTION
3418SYSCALL_DEFINE3(sigaction, int, sig,
3419 const struct old_sigaction __user *, act,
3420 struct old_sigaction __user *, oact)
3421{
3422 struct k_sigaction new_ka, old_ka;
3423 int ret;
3424
3425 if (act) {
3426 old_sigset_t mask;
3427 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3428 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
3429 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
3430 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3431 __get_user(mask, &act->sa_mask))
3432 return -EFAULT;
3433#ifdef __ARCH_HAS_KA_RESTORER
3434 new_ka.ka_restorer = NULL;
3435#endif
3436 siginitset(&new_ka.sa.sa_mask, mask);
3437 }
3438
3439 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3440
3441 if (!ret && oact) {
3442 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3443 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
3444 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
3445 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3446 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3447 return -EFAULT;
3448 }
3449
3450 return ret;
3451}
3452#endif
3453#ifdef CONFIG_COMPAT_OLD_SIGACTION
3454COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3455 const struct compat_old_sigaction __user *, act,
3456 struct compat_old_sigaction __user *, oact)
3457{
3458 struct k_sigaction new_ka, old_ka;
3459 int ret;
3460 compat_old_sigset_t mask;
3461 compat_uptr_t handler, restorer;
3462
3463 if (act) {
3464 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3465 __get_user(handler, &act->sa_handler) ||
3466 __get_user(restorer, &act->sa_restorer) ||
3467 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3468 __get_user(mask, &act->sa_mask))
3469 return -EFAULT;
3470
3471#ifdef __ARCH_HAS_KA_RESTORER
3472 new_ka.ka_restorer = NULL;
3473#endif
3474 new_ka.sa.sa_handler = compat_ptr(handler);
3475 new_ka.sa.sa_restorer = compat_ptr(restorer);
3476 siginitset(&new_ka.sa.sa_mask, mask);
3477 }
3478
3479 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3480
3481 if (!ret && oact) {
3482 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3483 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
3484 &oact->sa_handler) ||
3485 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3486 &oact->sa_restorer) ||
3487 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3488 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3489 return -EFAULT;
3490 }
3491 return ret;
3492}
3493#endif
3272 3494
3273#ifdef __ARCH_WANT_SYS_SGETMASK 3495#ifdef __ARCH_WANT_SYS_SGETMASK
3274 3496
@@ -3336,7 +3558,6 @@ int sigsuspend(sigset_t *set)
3336 return -ERESTARTNOHAND; 3558 return -ERESTARTNOHAND;
3337} 3559}
3338 3560
3339#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3340/** 3561/**
3341 * sys_rt_sigsuspend - replace the signal mask for a value with the 3562 * sys_rt_sigsuspend - replace the signal mask for a value with the
3342 * @unewset value until a signal is received 3563 * @unewset value until a signal is received
@@ -3355,7 +3576,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3355 return -EFAULT; 3576 return -EFAULT;
3356 return sigsuspend(&newset); 3577 return sigsuspend(&newset);
3357} 3578}
3358#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3579
3580#ifdef CONFIG_COMPAT
3581COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
3582{
3583#ifdef __BIG_ENDIAN
3584 sigset_t newset;
3585 compat_sigset_t newset32;
3586
3587 /* XXX: Don't preclude handling different sized sigset_t's. */
3588 if (sigsetsize != sizeof(sigset_t))
3589 return -EINVAL;
3590
3591 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
3592 return -EFAULT;
3593 sigset_from_compat(&newset, &newset32);
3594 return sigsuspend(&newset);
3595#else
3596 /* on little-endian bitmaps don't care about granularity */
3597 return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
3598#endif
3599}
3600#endif
3601
3602#ifdef CONFIG_OLD_SIGSUSPEND
3603SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
3604{
3605 sigset_t blocked;
3606 siginitset(&blocked, mask);
3607 return sigsuspend(&blocked);
3608}
3609#endif
3610#ifdef CONFIG_OLD_SIGSUSPEND3
3611SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
3612{
3613 sigset_t blocked;
3614 siginitset(&blocked, mask);
3615 return sigsuspend(&blocked);
3616}
3617#endif
3359 3618
3360__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) 3619__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
3361{ 3620{
diff --git a/kernel/smp.c b/kernel/smp.c
index 69f38bd98b42..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,22 +16,12 @@
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct {
20 struct list_head queue;
21 raw_spinlock_t lock;
22} call_function __cacheline_aligned_in_smp =
23 {
24 .queue = LIST_HEAD_INIT(call_function.queue),
25 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
26 };
27
28enum { 19enum {
29 CSD_FLAG_LOCK = 0x01, 20 CSD_FLAG_LOCK = 0x01,
30}; 21};
31 22
32struct call_function_data { 23struct call_function_data {
33 struct call_single_data csd; 24 struct call_single_data __percpu *csd;
34 atomic_t refs;
35 cpumask_var_t cpumask; 25 cpumask_var_t cpumask;
36 cpumask_var_t cpumask_ipi; 26 cpumask_var_t cpumask_ipi;
37}; 27};
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
60 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
61 cpu_to_node(cpu))) 51 cpu_to_node(cpu)))
62 return notifier_from_errno(-ENOMEM); 52 return notifier_from_errno(-ENOMEM);
53 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) {
55 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM);
57 }
63 break; 58 break;
64 59
65#ifdef CONFIG_HOTPLUG_CPU 60#ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
70 case CPU_DEAD_FROZEN: 65 case CPU_DEAD_FROZEN:
71 free_cpumask_var(cfd->cpumask); 66 free_cpumask_var(cfd->cpumask);
72 free_cpumask_var(cfd->cpumask_ipi); 67 free_cpumask_var(cfd->cpumask_ipi);
68 free_percpu(cfd->csd);
73 break; 69 break;
74#endif 70#endif
75 }; 71 };
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171} 167}
172 168
173/* 169/*
174 * Invoked by arch to handle an IPI for call function. Must be called with
175 * interrupts disabled.
176 */
177void generic_smp_call_function_interrupt(void)
178{
179 struct call_function_data *data;
180 int cpu = smp_processor_id();
181
182 /*
183 * Shouldn't receive this interrupt on a cpu that is not yet online.
184 */
185 WARN_ON_ONCE(!cpu_online(cpu));
186
187 /*
188 * Ensure entry is visible on call_function_queue after we have
189 * entered the IPI. See comment in smp_call_function_many.
190 * If we don't have this, then we may miss an entry on the list
191 * and never get another IPI to process it.
192 */
193 smp_mb();
194
195 /*
196 * It's ok to use list_for_each_rcu() here even though we may
197 * delete 'pos', since list_del_rcu() doesn't clear ->next
198 */
199 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
200 int refs;
201 smp_call_func_t func;
202
203 /*
204 * Since we walk the list without any locks, we might
205 * see an entry that was completed, removed from the
206 * list and is in the process of being reused.
207 *
208 * We must check that the cpu is in the cpumask before
209 * checking the refs, and both must be set before
210 * executing the callback on this cpu.
211 */
212
213 if (!cpumask_test_cpu(cpu, data->cpumask))
214 continue;
215
216 smp_rmb();
217
218 if (atomic_read(&data->refs) == 0)
219 continue;
220
221 func = data->csd.func; /* save for later warn */
222 func(data->csd.info);
223
224 /*
225 * If the cpu mask is not still set then func enabled
226 * interrupts (BUG), and this cpu took another smp call
227 * function interrupt and executed func(info) twice
228 * on this cpu. That nested execution decremented refs.
229 */
230 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
231 WARN(1, "%pf enabled interrupts and double executed\n", func);
232 continue;
233 }
234
235 refs = atomic_dec_return(&data->refs);
236 WARN_ON(refs < 0);
237
238 if (refs)
239 continue;
240
241 WARN_ON(!cpumask_empty(data->cpumask));
242
243 raw_spin_lock(&call_function.lock);
244 list_del_rcu(&data->csd.list);
245 raw_spin_unlock(&call_function.lock);
246
247 csd_unlock(&data->csd);
248 }
249
250}
251
252/*
253 * Invoked by arch to handle an IPI for call function single. Must be 170 * Invoked by arch to handle an IPI for call function single. Must be
254 * called from the arch with interrupts disabled. 171 * called from the arch with interrupts disabled.
255 */ 172 */
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
453 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
454{ 371{
455 struct call_function_data *data; 372 struct call_function_data *data;
456 unsigned long flags; 373 int cpu, next_cpu, this_cpu = smp_processor_id();
457 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
458 374
459 /* 375 /*
460 * Can deadlock when called with interrupts disabled. 376 * Can deadlock when called with interrupts disabled.
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask,
486 } 402 }
487 403
488 data = &__get_cpu_var(cfd_data); 404 data = &__get_cpu_var(cfd_data);
489 csd_lock(&data->csd);
490
491 /* This BUG_ON verifies our reuse assertions and can be removed */
492 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
493
494 /*
495 * The global call function queue list add and delete are protected
496 * by a lock, but the list is traversed without any lock, relying
497 * on the rcu list add and delete to allow safe concurrent traversal.
498 * We reuse the call function data without waiting for any grace
499 * period after some other cpu removes it from the global queue.
500 * This means a cpu might find our data block as it is being
501 * filled out.
502 *
503 * We hold off the interrupt handler on the other cpu by
504 * ordering our writes to the cpu mask vs our setting of the
505 * refs counter. We assert only the cpu owning the data block
506 * will set a bit in cpumask, and each bit will only be cleared
507 * by the subject cpu. Each cpu must first find its bit is
508 * set and then check that refs is set indicating the element is
509 * ready to be processed, otherwise it must skip the entry.
510 *
511 * On the previous iteration refs was set to 0 by another cpu.
512 * To avoid the use of transitivity, set the counter to 0 here
513 * so the wmb will pair with the rmb in the interrupt handler.
514 */
515 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
516
517 data->csd.func = func;
518 data->csd.info = info;
519 405
520 /* Ensure 0 refs is visible before mask. Also orders func and info */
521 smp_wmb();
522
523 /* We rely on the "and" being processed before the store */
524 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(data->cpumask, mask, cpu_online_mask);
525 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, data->cpumask);
526 refs = cpumask_weight(data->cpumask);
527 408
528 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
529 if (unlikely(!refs)) { 410 if (unlikely(!cpumask_weight(data->cpumask)))
530 csd_unlock(&data->csd);
531 return; 411 return;
532 }
533 412
534 /* 413 /*
535 * After we put an entry into the list, data->cpumask 414 * After we put an entry into the list, data->cpumask
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask,
537 * a SMP function call, so data->cpumask will be zero. 416 * a SMP function call, so data->cpumask will be zero.
538 */ 417 */
539 cpumask_copy(data->cpumask_ipi, data->cpumask); 418 cpumask_copy(data->cpumask_ipi, data->cpumask);
540 raw_spin_lock_irqsave(&call_function.lock, flags);
541 /*
542 * Place entry at the _HEAD_ of the list, so that any cpu still
543 * observing the entry in generic_smp_call_function_interrupt()
544 * will not miss any other list entries:
545 */
546 list_add_rcu(&data->csd.list, &call_function.queue);
547 /*
548 * We rely on the wmb() in list_add_rcu to complete our writes
549 * to the cpumask before this write to refs, which indicates
550 * data is on the list and is ready to be processed.
551 */
552 atomic_set(&data->refs, refs);
553 raw_spin_unlock_irqrestore(&call_function.lock, flags);
554 419
555 /* 420 for_each_cpu(cpu, data->cpumask) {
556 * Make the list addition visible before sending the ipi. 421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
557 * (IPIs must obey or appear to obey normal Linux cache 422 struct call_single_queue *dst =
558 * coherency rules -- see comment in generic_exec_single). 423 &per_cpu(call_single_queue, cpu);
559 */ 424 unsigned long flags;
560 smp_mb(); 425
426 csd_lock(csd);
427 csd->func = func;
428 csd->info = info;
429
430 raw_spin_lock_irqsave(&dst->lock, flags);
431 list_add_tail(&csd->list, &dst->list);
432 raw_spin_unlock_irqrestore(&dst->lock, flags);
433 }
561 434
562 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
563 arch_send_call_function_ipi_mask(data->cpumask_ipi); 436 arch_send_call_function_ipi_mask(data->cpumask_ipi);
564 437
565 /* Optionally wait for the CPUs to complete */ 438 if (wait) {
566 if (wait) 439 for_each_cpu(cpu, data->cpumask) {
567 csd_lock_wait(&data->csd); 440 struct call_single_data *csd =
441 per_cpu_ptr(data->csd, cpu);
442 csd_lock_wait(csd);
443 }
444 }
568} 445}
569EXPORT_SYMBOL(smp_call_function_many); 446EXPORT_SYMBOL(smp_call_function_many);
570 447
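The external contract of smp_call_function_many() is unchanged; what moved is the queueing, from one shared, reference-counted element on a global list to one call_single_data per destination cpu. A minimal, hypothetical caller for reference (none of these names come from the patch):

	#include <linux/smp.h>

	static void drain_local_stats(void *info)
	{
		/* runs on each cpu in the mask, from the IPI handler with irqs off */
	}

	static void drain_remote_stats(const struct cpumask *mask)
	{
		preempt_disable();	/* must not migrate while queueing the per-cpu csds */
		smp_call_function_many(mask, drain_local_stats, NULL, true);
		preempt_enable();
	}

With wait == true the caller now spins on each destination cpu's csd in turn (the csd_lock_wait() loop at the end of the function) rather than on a single shared descriptor.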
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d4abac261779..8eaed9aa9cf0 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -209,6 +209,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
209{ 209{
210 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 210 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
211 211
212 if (ht->pre_unpark)
213 ht->pre_unpark(cpu);
212 kthread_unpark(tsk); 214 kthread_unpark(tsk);
213} 215}
214 216
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f5cc25f147a6..14d7758074aa 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 198 * We restart softirq processing for at most 2 ms,
199 * and we fall back to softirqd after that. 199 * and if need_resched() is not set.
200 * 200 *
201 * This number has been established via experimentation. 201 * These limits have been established via experimentation.
 202 * The two things to balance are latency against fairness - 202 * The two things to balance are latency against fairness -
203 * we want to handle softirqs as soon as possible, but they 203 * we want to handle softirqs as soon as possible, but they
204 * should not be able to lock up the box. 204 * should not be able to lock up the box.
205 */ 205 */
206#define MAX_SOFTIRQ_RESTART 10 206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
207 207
208asmlinkage void __do_softirq(void) 208asmlinkage void __do_softirq(void)
209{ 209{
210 struct softirq_action *h; 210 struct softirq_action *h;
211 __u32 pending; 211 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 213 int cpu;
214 unsigned long old_flags = current->flags; 214 unsigned long old_flags = current->flags;
215 215
@@ -264,11 +264,12 @@ restart:
264 local_irq_disable(); 264 local_irq_disable();
265 265
266 pending = local_softirq_pending(); 266 pending = local_softirq_pending();
267 if (pending && --max_restart) 267 if (pending) {
268 goto restart; 268 if (time_before(jiffies, end) && !need_resched())
269 goto restart;
269 270
270 if (pending)
271 wakeup_softirqd(); 271 wakeup_softirqd();
272 }
272 273
273 lockdep_softirq_exit(); 274 lockdep_softirq_exit();
274 275
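Because the budget is measured in jiffies, its resolution depends on HZ: msecs_to_jiffies() rounds up, so the 2 ms cap is exact only with HZ=1000 and widens to a full tick on coarser configurations. A worked example of the deadline computed above:

	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;	/* MAX_SOFTIRQ_TIME == msecs_to_jiffies(2) */
	/* HZ=1000: end = jiffies + 2, roughly 2 ms of inline processing */
	/* HZ=250:  end = jiffies + 1, up to 4 ms (one tick)             */
	/* HZ=100:  end = jiffies + 1, up to 10 ms (one tick)            */

Independent of HZ, the restart loop also stops as soon as need_resched() is set and defers whatever is still pending to ksoftirqd.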
@@ -322,18 +323,10 @@ void irq_enter(void)
322 323
323static inline void invoke_softirq(void) 324static inline void invoke_softirq(void)
324{ 325{
325 if (!force_irqthreads) { 326 if (!force_irqthreads)
326#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
327 __do_softirq(); 327 __do_softirq();
328#else 328 else
329 do_softirq();
330#endif
331 } else {
332 __local_bh_disable((unsigned long)__builtin_return_address(0),
333 SOFTIRQ_OFFSET);
334 wakeup_softirqd(); 329 wakeup_softirqd();
335 __local_bh_enable(SOFTIRQ_OFFSET);
336 }
337} 330}
338 331
339/* 332/*
@@ -341,9 +334,15 @@ static inline void invoke_softirq(void)
341 */ 334 */
342void irq_exit(void) 335void irq_exit(void)
343{ 336{
337#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
338 local_irq_disable();
339#else
340 WARN_ON_ONCE(!irqs_disabled());
341#endif
342
344 account_irq_exit_time(current); 343 account_irq_exit_time(current);
345 trace_hardirq_exit(); 344 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 345 sub_preempt_count(HARDIRQ_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 346 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 347 invoke_softirq();
349 348
@@ -353,7 +352,6 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 352 tick_nohz_irq_exit();
354#endif 353#endif
355 rcu_irq_exit(); 354 rcu_irq_exit();
356 sched_preempt_enable_no_resched();
357} 355}
358 356
359/* 357/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 95d178c62d5a..c09f2955ae30 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
336 .create = cpu_stop_create, 336 .create = cpu_stop_create,
337 .setup = cpu_stop_unpark, 337 .setup = cpu_stop_unpark,
338 .park = cpu_stop_park, 338 .park = cpu_stop_park,
339 .unpark = cpu_stop_unpark, 339 .pre_unpark = cpu_stop_unpark,
340 .selfparking = true, 340 .selfparking = true,
341}; 341};
342 342
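Together with the kernel/smpboot.c hunk above, this runs cpu_stop_unpark() from the new pre_unpark hook, i.e. before kthread_unpark() releases the stopper thread, so the stopper is marked enabled by the time it can execute. A hedged sketch of a descriptor using the hook; field names other than .pre_unpark are assumed from struct smp_hotplug_thread in <linux/smpboot.h>:

	#include <linux/percpu.h>
	#include <linux/smpboot.h>

	static DEFINE_PER_CPU(struct task_struct *, example_task);

	static void example_pre_unpark(unsigned int cpu)
	{
		/* make per-cpu state usable before the kthread is released */
	}

	static int example_should_run(unsigned int cpu)
	{
		return 0;	/* nothing to do in this sketch */
	}

	static void example_thread_fn(unsigned int cpu)
	{
		/* per-cpu work goes here */
	}

	static struct smp_hotplug_thread example_threads = {
		.store			= &example_task,
		.thread_should_run	= example_should_run,
		.thread_fn		= example_thread_fn,
		.thread_comm		= "example/%u",
		.pre_unpark		= example_pre_unpark,	/* runs right before kthread_unpark() */
	};

The descriptor would then go to smpboot_register_percpu_thread() at init time, as cpu_stop_init() already does for cpu_stop_threads.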
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b37690421..81f56445fba9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
47#include <linux/syscalls.h> 47#include <linux/syscalls.h>
48#include <linux/kprobes.h> 48#include <linux/kprobes.h>
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h>
50 51
51#include <linux/kmsg_dump.h> 52#include <linux/kmsg_dump.h>
52/* Move somewhere else to avoid recompiling? */ 53/* Move somewhere else to avoid recompiling? */
@@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex);
433SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, 434SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
434 void __user *, arg) 435 void __user *, arg)
435{ 436{
437 struct pid_namespace *pid_ns = task_active_pid_ns(current);
436 char buffer[256]; 438 char buffer[256];
437 int ret = 0; 439 int ret = 0;
438 440
439 /* We only trust the superuser with rebooting the system. */ 441 /* We only trust the superuser with rebooting the system. */
440 if (!capable(CAP_SYS_BOOT)) 442 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
441 return -EPERM; 443 return -EPERM;
442 444
443 /* For safety, we require "magic" arguments. */ 445 /* For safety, we require "magic" arguments. */
@@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
453 * pid_namespace, the command is handled by reboot_pid_ns() which will 455 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit(). 456 * call do_exit().
455 */ 457 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd); 458 ret = reboot_pid_ns(pid_ns, cmd);
457 if (ret) 459 if (ret)
458 return ret; 460 return ret;
459 461
@@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1794static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{ 1795{
1794 struct fd exe; 1796 struct fd exe;
1795 struct dentry *dentry; 1797 struct inode *inode;
1796 int err; 1798 int err;
1797 1799
1798 exe = fdget(fd); 1800 exe = fdget(fd);
1799 if (!exe.file) 1801 if (!exe.file)
1800 return -EBADF; 1802 return -EBADF;
1801 1803
1802 dentry = exe.file->f_path.dentry; 1804 inode = file_inode(exe.file);
1803 1805
1804 /* 1806 /*
1805 * Because the original mm->exe_file points to executable file, make 1807 * Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1807 * overall picture. 1809 * overall picture.
1808 */ 1810 */
1809 err = -EACCES; 1811 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) || 1812 if (!S_ISREG(inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1813 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit; 1814 goto exit;
1813 1815
1814 err = inode_permission(dentry->d_inode, MAY_EXEC); 1816 err = inode_permission(inode, MAY_EXEC);
1815 if (err) 1817 if (err)
1816 goto exit; 1818 goto exit;
1817 1819
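file_inode() replaces the open-coded walk through exe.file->f_path.dentry->d_inode; any helper that only needs a file's inode for mode or permission checks can take the same shape. A sketch, with the surrounding function hypothetical:

	#include <linux/fs.h>

	static int may_exec_regular_file(struct file *filp)
	{
		struct inode *inode = file_inode(filp);

		if (!S_ISREG(inode->i_mode))
			return -EACCES;
		return inode_permission(inode, MAY_EXEC);
	}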
@@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2012 2014
2013 error = 0; 2015 error = 0;
2014 switch (option) { 2016 switch (option) {
2015 case PR_SET_PDEATHSIG: 2017 case PR_SET_PDEATHSIG:
2016 if (!valid_signal(arg2)) { 2018 if (!valid_signal(arg2)) {
2017 error = -EINVAL; 2019 error = -EINVAL;
2018 break;
2019 }
2020 me->pdeath_signal = arg2;
2021 break;
2022 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2);
2024 break;
2025 case PR_GET_DUMPABLE:
2026 error = get_dumpable(me->mm);
2027 break; 2020 break;
2028 case PR_SET_DUMPABLE: 2021 }
2029 if (arg2 < 0 || arg2 > 1) { 2022 me->pdeath_signal = arg2;
2030 error = -EINVAL; 2023 break;
2031 break; 2024 case PR_GET_PDEATHSIG:
2032 } 2025 error = put_user(me->pdeath_signal, (int __user *)arg2);
2033 set_dumpable(me->mm, arg2); 2026 break;
2027 case PR_GET_DUMPABLE:
2028 error = get_dumpable(me->mm);
2029 break;
2030 case PR_SET_DUMPABLE:
2031 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2032 error = -EINVAL;
2034 break; 2033 break;
2034 }
2035 set_dumpable(me->mm, arg2);
2036 break;
2035 2037
2036 case PR_SET_UNALIGN: 2038 case PR_SET_UNALIGN:
2037 error = SET_UNALIGN_CTL(me, arg2); 2039 error = SET_UNALIGN_CTL(me, arg2);
2038 break; 2040 break;
2039 case PR_GET_UNALIGN: 2041 case PR_GET_UNALIGN:
2040 error = GET_UNALIGN_CTL(me, arg2); 2042 error = GET_UNALIGN_CTL(me, arg2);
2041 break; 2043 break;
2042 case PR_SET_FPEMU: 2044 case PR_SET_FPEMU:
2043 error = SET_FPEMU_CTL(me, arg2); 2045 error = SET_FPEMU_CTL(me, arg2);
2044 break; 2046 break;
2045 case PR_GET_FPEMU: 2047 case PR_GET_FPEMU:
2046 error = GET_FPEMU_CTL(me, arg2); 2048 error = GET_FPEMU_CTL(me, arg2);
2047 break; 2049 break;
2048 case PR_SET_FPEXC: 2050 case PR_SET_FPEXC:
2049 error = SET_FPEXC_CTL(me, arg2); 2051 error = SET_FPEXC_CTL(me, arg2);
2050 break; 2052 break;
2051 case PR_GET_FPEXC: 2053 case PR_GET_FPEXC:
2052 error = GET_FPEXC_CTL(me, arg2); 2054 error = GET_FPEXC_CTL(me, arg2);
2053 break; 2055 break;
2054 case PR_GET_TIMING: 2056 case PR_GET_TIMING:
2055 error = PR_TIMING_STATISTICAL; 2057 error = PR_TIMING_STATISTICAL;
2056 break; 2058 break;
2057 case PR_SET_TIMING: 2059 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 2060 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 2061 error = -EINVAL;
2060 break; 2062 break;
2061 case PR_SET_NAME: 2063 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 2064 comm[sizeof(me->comm) - 1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 2065 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 2066 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 2067 return -EFAULT;
2066 set_task_comm(me, comm); 2068 set_task_comm(me, comm);
2067 proc_comm_connector(me); 2069 proc_comm_connector(me);
2068 break; 2070 break;
2069 case PR_GET_NAME: 2071 case PR_GET_NAME:
2070 get_task_comm(comm, me); 2072 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 2073 if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2072 sizeof(comm)))
2073 return -EFAULT; 2074 return -EFAULT;
2074 break; 2075 break;
2075 case PR_GET_ENDIAN: 2076 case PR_GET_ENDIAN:
2076 error = GET_ENDIAN(me, arg2); 2077 error = GET_ENDIAN(me, arg2);
2077 break; 2078 break;
2078 case PR_SET_ENDIAN: 2079 case PR_SET_ENDIAN:
2079 error = SET_ENDIAN(me, arg2); 2080 error = SET_ENDIAN(me, arg2);
2080 break; 2081 break;
2081 case PR_GET_SECCOMP: 2082 case PR_GET_SECCOMP:
2082 error = prctl_get_seccomp(); 2083 error = prctl_get_seccomp();
2083 break; 2084 break;
2084 case PR_SET_SECCOMP: 2085 case PR_SET_SECCOMP:
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2086 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2086 break; 2087 break;
2087 case PR_GET_TSC: 2088 case PR_GET_TSC:
2088 error = GET_TSC_CTL(arg2); 2089 error = GET_TSC_CTL(arg2);
2089 break; 2090 break;
2090 case PR_SET_TSC: 2091 case PR_SET_TSC:
2091 error = SET_TSC_CTL(arg2); 2092 error = SET_TSC_CTL(arg2);
2092 break; 2093 break;
2093 case PR_TASK_PERF_EVENTS_DISABLE: 2094 case PR_TASK_PERF_EVENTS_DISABLE:
2094 error = perf_event_task_disable(); 2095 error = perf_event_task_disable();
2095 break; 2096 break;
2096 case PR_TASK_PERF_EVENTS_ENABLE: 2097 case PR_TASK_PERF_EVENTS_ENABLE:
2097 error = perf_event_task_enable(); 2098 error = perf_event_task_enable();
2098 break; 2099 break;
2099 case PR_GET_TIMERSLACK: 2100 case PR_GET_TIMERSLACK:
2100 error = current->timer_slack_ns; 2101 error = current->timer_slack_ns;
2101 break; 2102 break;
2102 case PR_SET_TIMERSLACK: 2103 case PR_SET_TIMERSLACK:
2103 if (arg2 <= 0) 2104 if (arg2 <= 0)
2104 current->timer_slack_ns = 2105 current->timer_slack_ns =
2105 current->default_timer_slack_ns; 2106 current->default_timer_slack_ns;
2106 else 2107 else
2107 current->timer_slack_ns = arg2; 2108 current->timer_slack_ns = arg2;
2108 break; 2109 break;
2109 case PR_MCE_KILL: 2110 case PR_MCE_KILL:
2110 if (arg4 | arg5) 2111 if (arg4 | arg5)
2111 return -EINVAL; 2112 return -EINVAL;
2112 switch (arg2) { 2113 switch (arg2) {
2113 case PR_MCE_KILL_CLEAR: 2114 case PR_MCE_KILL_CLEAR:
2114 if (arg3 != 0) 2115 if (arg3 != 0)
2115 return -EINVAL; 2116 return -EINVAL;
2116 current->flags &= ~PF_MCE_PROCESS; 2117 current->flags &= ~PF_MCE_PROCESS;
2117 break; 2118 break;
2118 case PR_MCE_KILL_SET: 2119 case PR_MCE_KILL_SET:
2119 current->flags |= PF_MCE_PROCESS; 2120 current->flags |= PF_MCE_PROCESS;
2120 if (arg3 == PR_MCE_KILL_EARLY) 2121 if (arg3 == PR_MCE_KILL_EARLY)
2121 current->flags |= PF_MCE_EARLY; 2122 current->flags |= PF_MCE_EARLY;
2122 else if (arg3 == PR_MCE_KILL_LATE) 2123 else if (arg3 == PR_MCE_KILL_LATE)
2123 current->flags &= ~PF_MCE_EARLY; 2124 current->flags &= ~PF_MCE_EARLY;
2124 else if (arg3 == PR_MCE_KILL_DEFAULT) 2125 else if (arg3 == PR_MCE_KILL_DEFAULT)
2125 current->flags &= 2126 current->flags &=
2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS); 2127 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2127 else 2128 else
2128 return -EINVAL; 2129 return -EINVAL;
2129 break; 2130 break;
2130 default: 2131 default:
2131 return -EINVAL; 2132 return -EINVAL;
2132 } 2133 }
2133 break; 2134 break;
2134 case PR_MCE_KILL_GET: 2135 case PR_MCE_KILL_GET:
2135 if (arg2 | arg3 | arg4 | arg5) 2136 if (arg2 | arg3 | arg4 | arg5)
2136 return -EINVAL; 2137 return -EINVAL;
2137 if (current->flags & PF_MCE_PROCESS) 2138 if (current->flags & PF_MCE_PROCESS)
2138 error = (current->flags & PF_MCE_EARLY) ? 2139 error = (current->flags & PF_MCE_EARLY) ?
2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2140 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2140 else 2141 else
2141 error = PR_MCE_KILL_DEFAULT; 2142 error = PR_MCE_KILL_DEFAULT;
2142 break; 2143 break;
2143 case PR_SET_MM: 2144 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2145 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break; 2146 break;
2146 case PR_GET_TID_ADDRESS: 2147 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2); 2148 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break; 2149 break;
2149 case PR_SET_CHILD_SUBREAPER: 2150 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2; 2151 me->signal->is_child_subreaper = !!arg2;
2151 break; 2152 break;
2152 case PR_GET_CHILD_SUBREAPER: 2153 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper, 2154 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2); 2155 (int __user *)arg2);
2155 break; 2156 break;
2156 case PR_SET_NO_NEW_PRIVS: 2157 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5) 2158 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL; 2159 return -EINVAL;
2159 2160
2160 current->no_new_privs = 1; 2161 current->no_new_privs = 1;
2161 break; 2162 break;
2162 case PR_GET_NO_NEW_PRIVS: 2163 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5) 2164 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL; 2165 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0; 2166 return current->no_new_privs ? 1 : 0;
2166 default: 2167 default:
2167 error = -EINVAL; 2168 error = -EINVAL;
2168 break; 2169 break;
2169 } 2170 }
2170 return error; 2171 return error;
2171} 2172}
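For reference, a minimal userspace sketch exercising the PR_SET_NO_NEW_PRIVS and PR_GET_NO_NEW_PRIVS cases handled by the switch above; the fallback #defines are an assumption for older headers and are not part of this diff.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

int main(void)
{
	/* For PR_SET_NO_NEW_PRIVS, arg2 must be 1 and the trailing
	 * arguments must be zero, or the kernel returns -EINVAL. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0)
		perror("PR_SET_NO_NEW_PRIVS");

	/* PR_GET_NO_NEW_PRIVS reports the flag as the syscall return value. */
	printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}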
@@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2184 2185
2185char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2186char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2186 2187
2187static void argv_cleanup(struct subprocess_info *info)
2188{
2189 argv_free(info->argv);
2190}
2191
2192static int __orderly_poweroff(void) 2188static int __orderly_poweroff(void)
2193{ 2189{
2194 int argc; 2190 int argc;
@@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void)
2208 } 2204 }
2209 2205
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2206 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2211 NULL, argv_cleanup, NULL); 2207 NULL, NULL, NULL);
2212 if (ret == -ENOMEM) 2208 argv_free(argv);
2213 argv_free(argv);
2214 2209
2215 return ret; 2210 return ret;
2216} 2211}
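A minimal sketch of the caller-frees pattern __orderly_poweroff() switches to above: with UMH_WAIT_EXEC the call returns only after the helper has exec'ed (or failed), so argv can be freed unconditionally instead of through an argv_cleanup callback. The helper name and envp below are illustrative, not part of the diff.

#include <linux/kmod.h>
#include <linux/string.h>
#include <linux/gfp.h>

static int demo_run_helper(const char *cmd)
{
	static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	char **argv;
	int argc, ret;

	argv = argv_split(GFP_KERNEL, cmd, &argc);
	if (!argv)
		return -ENOMEM;

	/* UMH_WAIT_EXEC: no cleanup callback needed, free argv ourselves. */
	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
				      NULL, NULL, NULL);
	argv_free(argv);
	return ret;
}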
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4fc9be955c71..afc1dc60f3f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,7 +105,6 @@ extern char core_pattern[];
105extern unsigned int core_pipe_limit; 105extern unsigned int core_pipe_limit;
106#endif 106#endif
107extern int pid_max; 107extern int pid_max;
108extern int min_free_kbytes;
109extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
110extern int sysctl_drop_caches; 109extern int sysctl_drop_caches;
111extern int percpu_pagelist_fraction; 110extern int percpu_pagelist_fraction;
@@ -158,14 +157,20 @@ extern int sysctl_tsb_ratio;
158 157
159#ifdef __hppa__ 158#ifdef __hppa__
160extern int pwrsw_enabled; 159extern int pwrsw_enabled;
160#endif
161
162#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
161extern int unaligned_enabled; 163extern int unaligned_enabled;
162#endif 164#endif
163 165
164#ifdef CONFIG_IA64 166#ifdef CONFIG_IA64
165extern int no_unaligned_warning;
166extern int unaligned_dump_stack; 167extern int unaligned_dump_stack;
167#endif 168#endif
168 169
170#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
171extern int no_unaligned_warning;
172#endif
173
169#ifdef CONFIG_PROC_SYSCTL 174#ifdef CONFIG_PROC_SYSCTL
170static int proc_do_cad_pid(struct ctl_table *table, int write, 175static int proc_do_cad_pid(struct ctl_table *table, int write,
171 void __user *buffer, size_t *lenp, loff_t *ppos); 176 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -553,6 +558,8 @@ static struct ctl_table kern_table[] = {
553 .mode = 0644, 558 .mode = 0644,
554 .proc_handler = proc_dointvec, 559 .proc_handler = proc_dointvec,
555 }, 560 },
561#endif
562#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
556 { 563 {
557 .procname = "unaligned-trap", 564 .procname = "unaligned-trap",
558 .data = &unaligned_enabled, 565 .data = &unaligned_enabled,
@@ -919,7 +926,7 @@ static struct ctl_table kern_table[] = {
919 .proc_handler = proc_doulongvec_minmax, 926 .proc_handler = proc_doulongvec_minmax,
920 }, 927 },
921#endif 928#endif
922#ifdef CONFIG_IA64 929#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
923 { 930 {
924 .procname = "ignore-unaligned-usertrap", 931 .procname = "ignore-unaligned-usertrap",
925 .data = &no_unaligned_warning, 932 .data = &no_unaligned_warning,
@@ -927,6 +934,8 @@ static struct ctl_table kern_table[] = {
927 .mode = 0644, 934 .mode = 0644,
928 .proc_handler = proc_dointvec, 935 .proc_handler = proc_dointvec,
929 }, 936 },
937#endif
938#ifdef CONFIG_IA64
930 { 939 {
931 .procname = "unaligned-dump-stack", 940 .procname = "unaligned-dump-stack",
932 .data = &unaligned_dump_stack, 941 .data = &unaligned_dump_stack,
@@ -2014,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write,
2014 int i; 2023 int i;
2015 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { 2024 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2016 if ((tmptaint >> i) & 1) 2025 if ((tmptaint >> i) & 1)
2017 add_taint(i); 2026 add_taint(i, LOCKDEP_STILL_OK);
2018 } 2027 }
2019 } 2028 }
2020 2029
@@ -2091,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2091static void validate_coredump_safety(void) 2100static void validate_coredump_safety(void)
2092{ 2101{
2093#ifdef CONFIG_COREDUMP 2102#ifdef CONFIG_COREDUMP
2094 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2103 if (suid_dumpable == SUID_DUMP_ROOT &&
2095 core_pattern[0] != '/' && core_pattern[0] != '|') { 2104 core_pattern[0] != '/' && core_pattern[0] != '|') {
2096 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2105 printk(KERN_WARNING "Unsafe core_pattern used with "\
2097 "suid_dumpable=2. Pipe handler or fully qualified "\ 2106 "suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a6384450501..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, 387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, 388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, 389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
971static ssize_t bin_intvec(struct file *file, 970static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 971 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{ 972{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0; 973 ssize_t copied = 0;
976 char *buffer; 974 char *buffer;
977 ssize_t result; 975 ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
984 if (oldval && oldlen) { 982 if (oldval && oldlen) {
985 unsigned __user *vec = oldval; 983 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec); 984 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end; 985 char *str, *end;
989 int i; 986 int i;
990 987
991 set_fs(KERNEL_DS); 988 result = kernel_read(file, 0, buffer, BUFSZ - 1);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0) 989 if (result < 0)
995 goto out_kfree; 990 goto out_kfree;
996 991
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
1017 if (newval && newlen) { 1012 if (newval && newlen) {
1018 unsigned __user *vec = newval; 1013 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec); 1014 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end; 1015 char *str, *end;
1022 int i; 1016 int i;
1023 1017
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
1033 str += snprintf(str, end - str, "%lu\t", value); 1027 str += snprintf(str, end - str, "%lu\t", value);
1034 } 1028 }
1035 1029
1036 set_fs(KERNEL_DS); 1030 result = kernel_write(file, buffer, str - buffer, 0);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0) 1031 if (result < 0)
1040 goto out_kfree; 1032 goto out_kfree;
1041 } 1033 }
@@ -1049,7 +1041,6 @@ out:
1049static ssize_t bin_ulongvec(struct file *file, 1041static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1042 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{ 1043{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0; 1044 ssize_t copied = 0;
1054 char *buffer; 1045 char *buffer;
1055 ssize_t result; 1046 ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
1062 if (oldval && oldlen) { 1053 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval; 1054 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec); 1055 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end; 1056 char *str, *end;
1067 int i; 1057 int i;
1068 1058
1069 set_fs(KERNEL_DS); 1059 result = kernel_read(file, 0, buffer, BUFSZ - 1);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0) 1060 if (result < 0)
1073 goto out_kfree; 1061 goto out_kfree;
1074 1062
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (newval && newlen) { 1083 if (newval && newlen) {
1096 unsigned long __user *vec = newval; 1084 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec); 1085 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end; 1086 char *str, *end;
1100 int i; 1087 int i;
1101 1088
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
1111 str += snprintf(str, end - str, "%lu\t", value); 1098 str += snprintf(str, end - str, "%lu\t", value);
1112 } 1099 }
1113 1100
1114 set_fs(KERNEL_DS); 1101 result = kernel_write(file, buffer, str - buffer, 0);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0) 1102 if (result < 0)
1118 goto out_kfree; 1103 goto out_kfree;
1119 } 1104 }
@@ -1127,19 +1112,15 @@ out:
1127static ssize_t bin_uuid(struct file *file, 1112static ssize_t bin_uuid(struct file *file,
1128 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1113 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1129{ 1114{
1130 mm_segment_t old_fs = get_fs();
1131 ssize_t result, copied = 0; 1115 ssize_t result, copied = 0;
1132 1116
1133 /* Only supports reads */ 1117 /* Only supports reads */
1134 if (oldval && oldlen) { 1118 if (oldval && oldlen) {
1135 loff_t pos = 0;
1136 char buf[40], *str = buf; 1119 char buf[40], *str = buf;
1137 unsigned char uuid[16]; 1120 unsigned char uuid[16];
1138 int i; 1121 int i;
1139 1122
1140 set_fs(KERNEL_DS); 1123 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1141 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1142 set_fs(old_fs);
1143 if (result < 0) 1124 if (result < 0)
1144 goto out; 1125 goto out;
1145 1126
@@ -1175,18 +1156,14 @@ out:
1175static ssize_t bin_dn_node_address(struct file *file, 1156static ssize_t bin_dn_node_address(struct file *file,
1176 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1157 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1177{ 1158{
1178 mm_segment_t old_fs = get_fs();
1179 ssize_t result, copied = 0; 1159 ssize_t result, copied = 0;
1180 1160
1181 if (oldval && oldlen) { 1161 if (oldval && oldlen) {
1182 loff_t pos = 0;
1183 char buf[15], *nodep; 1162 char buf[15], *nodep;
1184 unsigned long area, node; 1163 unsigned long area, node;
1185 __le16 dnaddr; 1164 __le16 dnaddr;
1186 1165
1187 set_fs(KERNEL_DS); 1166 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1188 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1189 set_fs(old_fs);
1190 if (result < 0) 1167 if (result < 0)
1191 goto out; 1168 goto out;
1192 1169
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
1194 1171
1195 /* Convert the decnet address to binary */ 1172 /* Convert the decnet address to binary */
1196 result = -EIO; 1173 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1174 nodep = strchr(buf, '.');
1198 if (!nodep) 1175 if (!nodep)
1199 goto out; 1176 goto out;
1177 ++nodep;
1200 1178
1201 area = simple_strtoul(buf, NULL, 10); 1179 area = simple_strtoul(buf, NULL, 10);
1202 node = simple_strtoul(nodep, NULL, 10); 1180 node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
1215 } 1193 }
1216 1194
1217 if (newval && newlen) { 1195 if (newval && newlen) {
1218 loff_t pos = 0;
1219 __le16 dnaddr; 1196 __le16 dnaddr;
1220 char buf[15]; 1197 char buf[15];
1221 int len; 1198 int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1232 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1233 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1234 1211
1235 set_fs(KERNEL_DS); 1212 result = kernel_write(file, buf, len, 0);
1236 result = vfs_write(file, buf, len, &pos);
1237 set_fs(old_fs);
1238 if (result < 0) 1213 if (result < 0)
1239 goto out; 1214 goto out;
1240 } 1215 }
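The recurring change in bin_intvec()/bin_ulongvec()/bin_uuid()/bin_dn_node_address() above is mechanical: the set_fs(KERNEL_DS) + vfs_read()/vfs_write() + set_fs(old_fs) dance is replaced with kernel_read()/kernel_write(), which take a kernel buffer and an explicit file offset. A minimal sketch of the new pattern, using the signatures as called in this file (the demo helper names are illustrative):

#include <linux/fs.h>

/* Read a NUL-terminated snapshot of a proc file into buf (len > 0). */
static ssize_t demo_read_str(struct file *file, char *buf, size_t len)
{
	ssize_t n = kernel_read(file, 0, buf, len - 1);

	if (n < 0)
		return n;
	buf[n] = '\0';		/* terminate for simple_strtoul() parsing */
	return n;
}

/* Write a formatted kernel buffer back, starting at offset 0. */
static ssize_t demo_write_str(struct file *file, const char *buf, size_t len)
{
	return kernel_write(file, buf, len, 0);
}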
diff --git a/kernel/time.c b/kernel/time.c
index c2a27dd93142..f8342a41efa6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -240,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
240 * Avoid unnecessary multiplications/divisions in the 240 * Avoid unnecessary multiplications/divisions in the
241 * two most common HZ cases: 241 * two most common HZ cases:
242 */ 242 */
243inline unsigned int jiffies_to_msecs(const unsigned long j) 243unsigned int jiffies_to_msecs(const unsigned long j)
244{ 244{
245#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 245#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
246 return (MSEC_PER_SEC / HZ) * j; 246 return (MSEC_PER_SEC / HZ) * j;
@@ -256,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
256} 256}
257EXPORT_SYMBOL(jiffies_to_msecs); 257EXPORT_SYMBOL(jiffies_to_msecs);
258 258
259inline unsigned int jiffies_to_usecs(const unsigned long j) 259unsigned int jiffies_to_usecs(const unsigned long j)
260{ 260{
261#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 261#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
262 return (USEC_PER_SEC / HZ) * j; 262 return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977c..c6d6400ee137 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
339 clockevents_config(dev, freq); 339 clockevents_config(dev, freq);
340 clockevents_register_device(dev); 340 clockevents_register_device(dev);
341} 341}
342EXPORT_SYMBOL_GPL(clockevents_config_and_register);
342 343
343/** 344/**
344 * clockevents_update_freq - Update frequency and reprogram a clock event device. 345 * clockevents_update_freq - Update frequency and reprogram a clock event device.
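clockevents_config_and_register() is now exported, so modular timer drivers can use the combined configure-and-register helper. A hedged sketch of a hypothetical driver doing so; the device fields and callbacks below are placeholders, not taken from this diff.

#include <linux/clockchips.h>
#include <linux/init.h>

static int demo_set_next_event(unsigned long delta,
			       struct clock_event_device *ced)
{
	/* program the hardware comparator 'delta' ticks ahead */
	return 0;
}

static void demo_set_mode(enum clock_event_mode mode,
			  struct clock_event_device *ced)
{
	/* switch the timer between periodic/oneshot/shutdown */
}

static struct clock_event_device demo_ced = {
	.name		= "demo-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 200,
	.set_next_event	= demo_set_next_event,
	.set_mode	= demo_set_mode,
};

static int __init demo_timer_init(void)
{
	/* freq in Hz, then minimum and maximum programmable delta in ticks */
	clockevents_config_and_register(&demo_ced, 1000000, 0xf, 0x7fffffff);
	return 0;
}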
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b10a42bb0165..072bb066bb7d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -23,7 +23,7 @@
23 * NTP timekeeping variables: 23 * NTP timekeeping variables:
24 */ 24 */
25 25
26DEFINE_SPINLOCK(ntp_lock); 26DEFINE_RAW_SPINLOCK(ntp_lock);
27 27
28 28
29/* USER_HZ period (usecs): */ 29/* USER_HZ period (usecs): */
@@ -348,7 +348,7 @@ void ntp_clear(void)
348{ 348{
349 unsigned long flags; 349 unsigned long flags;
350 350
351 spin_lock_irqsave(&ntp_lock, flags); 351 raw_spin_lock_irqsave(&ntp_lock, flags);
352 352
353 time_adjust = 0; /* stop active adjtime() */ 353 time_adjust = 0; /* stop active adjtime() */
354 time_status |= STA_UNSYNC; 354 time_status |= STA_UNSYNC;
@@ -362,7 +362,7 @@ void ntp_clear(void)
362 362
363 /* Clear PPS state variables */ 363 /* Clear PPS state variables */
364 pps_clear(); 364 pps_clear();
365 spin_unlock_irqrestore(&ntp_lock, flags); 365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
366 366
367} 367}
368 368
@@ -372,9 +372,9 @@ u64 ntp_tick_length(void)
372 unsigned long flags; 372 unsigned long flags;
373 s64 ret; 373 s64 ret;
374 374
375 spin_lock_irqsave(&ntp_lock, flags); 375 raw_spin_lock_irqsave(&ntp_lock, flags);
376 ret = tick_length; 376 ret = tick_length;
377 spin_unlock_irqrestore(&ntp_lock, flags); 377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
378 return ret; 378 return ret;
379} 379}
380 380
@@ -395,7 +395,7 @@ int second_overflow(unsigned long secs)
395 int leap = 0; 395 int leap = 0;
396 unsigned long flags; 396 unsigned long flags;
397 397
398 spin_lock_irqsave(&ntp_lock, flags); 398 raw_spin_lock_irqsave(&ntp_lock, flags);
399 399
400 /* 400 /*
401 * Leap second processing. If in leap-insert state at the end of the 401 * Leap second processing. If in leap-insert state at the end of the
@@ -479,7 +479,7 @@ int second_overflow(unsigned long secs)
479 time_adjust = 0; 479 time_adjust = 0;
480 480
481out: 481out:
482 spin_unlock_irqrestore(&ntp_lock, flags); 482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
483 483
484 return leap; 484 return leap;
485} 485}
@@ -672,7 +672,7 @@ int do_adjtimex(struct timex *txc)
672 672
673 getnstimeofday(&ts); 673 getnstimeofday(&ts);
674 674
675 spin_lock_irq(&ntp_lock); 675 raw_spin_lock_irq(&ntp_lock);
676 676
677 if (txc->modes & ADJ_ADJTIME) { 677 if (txc->modes & ADJ_ADJTIME) {
678 long save_adjust = time_adjust; 678 long save_adjust = time_adjust;
@@ -714,7 +714,7 @@ int do_adjtimex(struct timex *txc)
714 /* fill PPS status fields */ 714 /* fill PPS status fields */
715 pps_fill_timex(txc); 715 pps_fill_timex(txc);
716 716
717 spin_unlock_irq(&ntp_lock); 717 raw_spin_unlock_irq(&ntp_lock);
718 718
719 txc->time.tv_sec = ts.tv_sec; 719 txc->time.tv_sec = ts.tv_sec;
720 txc->time.tv_usec = ts.tv_nsec; 720 txc->time.tv_usec = ts.tv_nsec;
@@ -912,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
912 912
913 pts_norm = pps_normalize_ts(*phase_ts); 913 pts_norm = pps_normalize_ts(*phase_ts);
914 914
915 spin_lock_irqsave(&ntp_lock, flags); 915 raw_spin_lock_irqsave(&ntp_lock, flags);
916 916
917 /* clear the error bits, they will be set again if needed */ 917 /* clear the error bits, they will be set again if needed */
918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -925,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
925 * just start the frequency interval */ 925 * just start the frequency interval */
926 if (unlikely(pps_fbase.tv_sec == 0)) { 926 if (unlikely(pps_fbase.tv_sec == 0)) {
927 pps_fbase = *raw_ts; 927 pps_fbase = *raw_ts;
928 spin_unlock_irqrestore(&ntp_lock, flags); 928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
929 return; 929 return;
930 } 930 }
931 931
@@ -940,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
940 time_status |= STA_PPSJITTER; 940 time_status |= STA_PPSJITTER;
941 /* restart the frequency calibration interval */ 941 /* restart the frequency calibration interval */
942 pps_fbase = *raw_ts; 942 pps_fbase = *raw_ts;
943 spin_unlock_irqrestore(&ntp_lock, flags); 943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
944 pr_err("hardpps: PPSJITTER: bad pulse\n"); 944 pr_err("hardpps: PPSJITTER: bad pulse\n");
945 return; 945 return;
946 } 946 }
@@ -957,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
957 957
958 hardpps_update_phase(pts_norm.nsec); 958 hardpps_update_phase(pts_norm.nsec);
959 959
960 spin_unlock_irqrestore(&ntp_lock, flags); 960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
961} 961}
962EXPORT_SYMBOL(hardpps); 962EXPORT_SYMBOL(hardpps);
963 963
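The ntp.c hunks above are a mechanical lock-type conversion: ntp_lock becomes a raw spinlock, so it keeps spinning (rather than turning into a sleeping lock) on preempt-rt configurations. A minimal sketch of the pattern, with hypothetical names:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);
static u64 demo_tick_length;

static u64 demo_read_tick_length(void)
{
	unsigned long flags;
	u64 ret;

	raw_spin_lock_irqsave(&demo_lock, flags);
	ret = demo_tick_length;
	raw_spin_unlock_irqrestore(&demo_lock, flags);
	return ret;
}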
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 314b9ee07edf..a19a39952c1b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -554,6 +554,7 @@ void tick_nohz_idle_enter(void)
554 554
555 local_irq_enable(); 555 local_irq_enable();
556} 556}
557EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
557 558
558/** 559/**
559 * tick_nohz_irq_exit - update next tick event from interrupt exit 560 * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -685,6 +686,7 @@ void tick_nohz_idle_exit(void)
685 686
686 local_irq_enable(); 687 local_irq_enable();
687} 688}
689EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
688 690
689static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 691static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
690{ 692{
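tick_nohz_idle_enter()/tick_nohz_idle_exit() are now exported as well. A hedged sketch of the idle-loop usage these exports make available to arch or modular code; the loop body is illustrative only, not the kernel's idle implementation.

#include <linux/sched.h>
#include <linux/tick.h>

static void demo_cpu_idle(void)
{
	while (1) {
		tick_nohz_idle_enter();	/* stop the periodic tick if idle long enough */
		while (!need_resched())
			cpu_relax();	/* stand-in for the arch low-power wait */
		tick_nohz_idle_exit();	/* restart tick and idle-time accounting */
		schedule_preempt_disabled();
	}
}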
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e35515a875e..9a0bc98fbe1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -138,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
138} 138}
139 139
140/* Timekeeper helper functions. */ 140/* Timekeeper helper functions. */
141
142#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
143u32 (*arch_gettimeoffset)(void);
144
145u32 get_arch_timeoffset(void)
146{
147 if (likely(arch_gettimeoffset))
148 return arch_gettimeoffset();
149 return 0;
150}
151#else
152static inline u32 get_arch_timeoffset(void) { return 0; }
153#endif
154
141static inline s64 timekeeping_get_ns(struct timekeeper *tk) 155static inline s64 timekeeping_get_ns(struct timekeeper *tk)
142{ 156{
143 cycle_t cycle_now, cycle_delta; 157 cycle_t cycle_now, cycle_delta;
@@ -154,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
154 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 168 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
155 nsec >>= tk->shift; 169 nsec >>= tk->shift;
156 170
157 /* If arch requires, add in gettimeoffset() */ 171 /* If arch requires, add in get_arch_timeoffset() */
158 return nsec + arch_gettimeoffset(); 172 return nsec + get_arch_timeoffset();
159} 173}
160 174
161static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 175static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -174,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
174 /* convert delta to nanoseconds. */ 188 /* convert delta to nanoseconds. */
175 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 189 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
176 190
177 /* If arch requires, add in gettimeoffset() */ 191 /* If arch requires, add in get_arch_timeoffset() */
178 return nsec + arch_gettimeoffset(); 192 return nsec + get_arch_timeoffset();
179} 193}
180 194
181static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 195static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
@@ -257,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
257 271
258 tk->xtime_nsec += cycle_delta * tk->mult; 272 tk->xtime_nsec += cycle_delta * tk->mult;
259 273
260 /* If arch requires, add in gettimeoffset() */ 274 /* If arch requires, add in get_arch_timeoffset() */
261 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 275 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
262 276
263 tk_normalize_xtime(tk); 277 tk_normalize_xtime(tk);
264 278
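The timekeeping change wraps the legacy gettimeoffset hook: arch_gettimeoffset is now a function pointer, NULL-checked through get_arch_timeoffset(), and architectures that select CONFIG_ARCH_USES_GETTIMEOFFSET install their callback at timer init. A hedged sketch of that arch side; the names are hypothetical and the pointer assignment is an assumption based on the definition added above.

#include <linux/types.h>
#include <linux/init.h>

extern u32 (*arch_gettimeoffset)(void);	/* defined in kernel/time/timekeeping.c above */

/* Return nanoseconds elapsed since the last timer tick. */
static u32 demo_gettimeoffset(void)
{
	/* read a free-running hardware counter and scale to ns */
	return 0;
}

static void __init demo_timer_init(void)
{
	arch_gettimeoffset = demo_gettimeoffset;
}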
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
19 (imul * n) + (fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^b-1 <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
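timeconst.bc emits the same reciprocal-multiplication constants that timeconst.pl (removed below) used to: converting jiffies to msecs/usecs becomes a multiply, add and shift instead of a divide. A worked check in C against the HZ=250 entry in the old script's canned table (MUL32=0x80000000, ADJ32=0x0, SHR32=29):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t jiffies = 123;		/* with HZ == 250, 1 jiffy == 4 ms */
	uint64_t msecs = (jiffies * 0x80000000ULL + 0x0) >> 29;

	printf("%llu\n", (unsigned long long)msecs);	/* prints 492 == 123 * 4 */
	return 0;
}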
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index 3f42652a6a37..000000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,376 +0,0 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 125,3,
24 '0xc49ba5e4','0x1fbe76c8b4',37,
25 3,125,
26 '0xa2c2aaab','0xaaaa',16,
27 125000,3,
28 '0xc9539b89','0x7fffbce4217d',47,
29 3,125000,
30 ], 32 => [
31 '0xfa000000','0x6000000',27,
32 125,4,
33 '0x83126e98','0xfdf3b645a',36,
34 4,125,
35 '0xf4240000','0x0',17,
36 31250,1,
37 '0x8637bd06','0x3fff79c842fa',46,
38 1,31250,
39 ], 48 => [
40 '0xa6aaaaab','0x6aaaaaa',27,
41 125,6,
42 '0xc49ba5e4','0xfdf3b645a',36,
43 6,125,
44 '0xa2c2aaab','0x15555',17,
45 62500,3,
46 '0xc9539b89','0x3fffbce4217d',46,
47 3,62500,
48 ], 64 => [
49 '0xfa000000','0xe000000',28,
50 125,8,
51 '0x83126e98','0x7ef9db22d',35,
52 8,125,
53 '0xf4240000','0x0',18,
54 15625,1,
55 '0x8637bd06','0x1fff79c842fa',45,
56 1,15625,
57 ], 100 => [
58 '0xa0000000','0x0',28,
59 10,1,
60 '0xcccccccd','0x733333333',35,
61 1,10,
62 '0x9c400000','0x0',18,
63 10000,1,
64 '0xd1b71759','0x1fff2e48e8a7',45,
65 1,10000,
66 ], 122 => [
67 '0x8325c53f','0xfbcda3a',28,
68 500,61,
69 '0xf9db22d1','0x7fbe76c8b',35,
70 61,500,
71 '0x8012e2a0','0x3ef36',18,
72 500000,61,
73 '0xffda4053','0x1ffffbce4217',45,
74 61,500000,
75 ], 128 => [
76 '0xfa000000','0x1e000000',29,
77 125,16,
78 '0x83126e98','0x3f7ced916',34,
79 16,125,
80 '0xf4240000','0x40000',19,
81 15625,2,
82 '0x8637bd06','0xfffbce4217d',44,
83 2,15625,
84 ], 200 => [
85 '0xa0000000','0x0',29,
86 5,1,
87 '0xcccccccd','0x333333333',34,
88 1,5,
89 '0x9c400000','0x0',19,
90 5000,1,
91 '0xd1b71759','0xfff2e48e8a7',44,
92 1,5000,
93 ], 250 => [
94 '0x80000000','0x0',29,
95 4,1,
96 '0x80000000','0x180000000',33,
97 1,4,
98 '0xfa000000','0x0',20,
99 4000,1,
100 '0x83126e98','0x7ff7ced9168',43,
101 1,4000,
102 ], 256 => [
103 '0xfa000000','0x3e000000',30,
104 125,32,
105 '0x83126e98','0x1fbe76c8b',33,
106 32,125,
107 '0xf4240000','0xc0000',20,
108 15625,4,
109 '0x8637bd06','0x7ffde7210be',43,
110 4,15625,
111 ], 300 => [
112 '0xd5555556','0x2aaaaaaa',30,
113 10,3,
114 '0x9999999a','0x1cccccccc',33,
115 3,10,
116 '0xd0555556','0xaaaaa',20,
117 10000,3,
118 '0x9d495183','0x7ffcb923a29',43,
119 3,10000,
120 ], 512 => [
121 '0xfa000000','0x7e000000',31,
122 125,64,
123 '0x83126e98','0xfdf3b645',32,
124 64,125,
125 '0xf4240000','0x1c0000',21,
126 15625,8,
127 '0x8637bd06','0x3ffef39085f',42,
128 8,15625,
129 ], 1000 => [
130 '0x80000000','0x0',31,
131 1,1,
132 '0x80000000','0x0',31,
133 1,1,
134 '0xfa000000','0x0',22,
135 1000,1,
136 '0x83126e98','0x1ff7ced9168',41,
137 1,1000,
138 ], 1024 => [
139 '0xfa000000','0xfe000000',32,
140 125,128,
141 '0x83126e98','0x7ef9db22',31,
142 128,125,
143 '0xf4240000','0x3c0000',22,
144 15625,16,
145 '0x8637bd06','0x1fff79c842f',41,
146 16,15625,
147 ], 1200 => [
148 '0xd5555556','0xd5555555',32,
149 5,6,
150 '0x9999999a','0x66666666',31,
151 6,5,
152 '0xd0555556','0x2aaaaa',22,
153 2500,3,
154 '0x9d495183','0x1ffcb923a29',41,
155 3,2500,
156 ]
157);
158
159$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
160
161sub bint($)
162{
163 my($x) = @_;
164 return Math::BigInt->new($x);
165}
166
167#
168# Constants for division by reciprocal multiplication.
169# (bits, numerator, denominator)
170#
171sub fmul($$$)
172{
173 my ($b,$n,$d) = @_;
174
175 $n = bint($n);
176 $d = bint($d);
177
178 return scalar (($n << $b)+$d-bint(1))/$d;
179}
180
181sub fadj($$$)
182{
183 my($b,$n,$d) = @_;
184
185 $n = bint($n);
186 $d = bint($d);
187
188 $d = $d/bgcd($n, $d);
189 return scalar (($d-bint(1)) << $b)/$d;
190}
191
192sub fmuls($$$) {
193 my($b,$n,$d) = @_;
194 my($s,$m);
195 my($thres) = bint(1) << ($b-1);
196
197 $n = bint($n);
198 $d = bint($d);
199
200 for ($s = 0; 1; $s++) {
201 $m = fmul($s,$n,$d);
202 return $s if ($m >= $thres);
203 }
204 return 0;
205}
206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
216# Provides mul, adj, and shr factors for a specific
217# (bit, time, hz) combination
218sub muladj($$$) {
219 my($b, $t, $hz) = @_;
220 my $s = fmuls($b, $t, $hz);
221 my $m = fmul($s, $t, $hz);
222 my $a = fadj($s, $t, $hz);
223 return (bignum_hex($m), bignum_hex($a), $s);
224}
225
226# Provides numerator, denominator values
227sub numden($$) {
228 my($n, $d) = @_;
229 my $g = bgcd($n, $d);
230 return ($n/$g, $d/$g);
231}
232
233# All values for a specific (time, hz) combo
234sub conversions($$) {
235 my ($t, $hz) = @_;
236 my @val = ();
237
238 # HZ_TO_xx
239 push(@val, muladj(32, $t, $hz));
240 push(@val, numden($t, $hz));
241
242 # xx_TO_HZ
243 push(@val, muladj(32, $hz, $t));
244 push(@val, numden($hz, $t));
245
246 return @val;
247}
248
249sub compute_values($) {
250 my($hz) = @_;
251 my @val = ();
252 my $s, $m, $a, $g;
253
254 if (!$has_bigint) {
255 die "$0: HZ == $hz not canned and ".
256 "Math::BigInt not available\n";
257 }
258
259 # MSEC conversions
260 push(@val, conversions(1000, $hz));
261
262 # USEC conversions
263 push(@val, conversions(1000000, $hz));
264
265 return @val;
266}
267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
281sub output($@)
282{
283 my($hz, @val) = @_;
284 my $pfx, $bit, $suf, $s, $m, $a;
285
286 print "/* Automatically generated by kernel/timeconst.pl */\n";
287 print "/* Conversion constants for HZ == $hz */\n";
288 print "\n";
289 print "#ifndef KERNEL_TIMECONST_H\n";
290 print "#define KERNEL_TIMECONST_H\n";
291 print "\n";
292
293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
295
296 print "\n";
297 print "#if HZ != $hz\n";
298 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
299 print "#endif\n";
300 print "\n";
301
302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
303 'HZ_TO_USEC','USEC_TO_HZ') {
304 foreach $bit (32) {
305 foreach $suf ('MUL', 'ADJ', 'SHR') {
306 outputval("${pfx}_$suf$bit", shift(@val));
307 }
308 }
309 foreach $suf ('NUM', 'DEN') {
310 outputval("${pfx}_$suf", shift(@val));
311 }
312 }
313
314 print "\n";
315 print "#endif /* KERNEL_TIMECONST_H */\n";
316}
317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
335($hz) = @ARGV;
336
337# Use this to generate the %canned_values structure
338if ($hz eq '--can') {
339 shift(@ARGV);
340 @hzlist = sort {$a <=> $b} (@ARGV);
341
342 print "# Precomputed values for systems without Math::BigInt\n";
343 print "# Generated by:\n";
344 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
345 print "\%canned_values = (\n";
346 my $pf = "\t";
347 foreach $hz (@hzlist) {
348 my @values = compute_values($hz);
349 print "$pf$hz => [\n";
350 while (scalar(@values)) {
351 my $bit;
352 foreach $bit (32) {
353 my $m = shift(@values);
354 my $a = shift(@values);
355 my $s = shift(@values);
356 print "\t\t", perlvals($m,$a,$s), ",\n";
357 }
358 my $n = shift(@values);
359 my $d = shift(@values);
360 print "\t\t", perlvals($n,$d), ",\n";
361 }
362 print "\t]";
363 $pf = ', ';
364 }
365 print "\n);\n";
366} else {
367 $hz += 0; # Force to number
368 if ($hz < 1) {
369 die "Usage: $0 HZ\n";
370 }
371
372 $cv = $canned_values{$hz};
373 @val = defined($cv) ? @$cv : compute_values($hz);
374 output($hz, @val);
375}
376exit 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b516a8e19d51..fc382d6e2765 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -81,21 +81,6 @@ config EVENT_TRACING
81 select CONTEXT_SWITCH_TRACER 81 select CONTEXT_SWITCH_TRACER
82 bool 82 bool
83 83
84config EVENT_POWER_TRACING_DEPRECATED
85 depends on EVENT_TRACING
86 bool "Deprecated power event trace API, to be removed"
87 default y
88 help
89 Provides old power event types:
90 C-state/idle accounting events:
91 power:power_start
92 power:power_end
93 and old cpufreq accounting event:
94 power:power_frequency
95 This is for userspace compatibility
96 and will vanish after 5 kernel iterations,
97 namely 3.1.
98
99config CONTEXT_SWITCH_TRACER 84config CONTEXT_SWITCH_TRACER
100 bool 85 bool
101 86
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 71259e2b6b61..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
743} 749}
744 750
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
775} 781}
776 782
777static void blk_add_trace_bio_complete(void *ignore, 783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
778 struct request_queue *q, struct bio *bio,
779 int error)
780{ 784{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
782} 802}
783 803
784static void blk_add_trace_bio_backmerge(void *ignore, 804static void blk_add_trace_bio_backmerge(void *ignore,
785 struct request_queue *q, 805 struct request_queue *q,
806 struct request *rq,
786 struct bio *bio) 807 struct bio *bio)
787{ 808{
788 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); 809 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
790 811
791static void blk_add_trace_bio_frontmerge(void *ignore, 812static void blk_add_trace_bio_frontmerge(void *ignore,
792 struct request_queue *q, 813 struct request_queue *q,
814 struct request *rq,
793 struct bio *bio) 815 struct bio *bio)
794{ 816{
795 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); 817 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e6effd0c40a9..6893d5a2bf08 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -762,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
762{ 762{
763 struct ftrace_profile *rec; 763 struct ftrace_profile *rec;
764 struct hlist_head *hhd; 764 struct hlist_head *hhd;
765 struct hlist_node *n;
766 unsigned long key; 765 unsigned long key;
767 766
768 key = hash_long(ip, ftrace_profile_bits); 767 key = hash_long(ip, ftrace_profile_bits);
@@ -771,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
771 if (hlist_empty(hhd)) 770 if (hlist_empty(hhd))
772 return NULL; 771 return NULL;
773 772
774 hlist_for_each_entry_rcu(rec, n, hhd, node) { 773 hlist_for_each_entry_rcu(rec, hhd, node) {
775 if (rec->ip == ip) 774 if (rec->ip == ip)
776 return rec; 775 return rec;
777 } 776 }
@@ -1133,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1133 unsigned long key; 1132 unsigned long key;
1134 struct ftrace_func_entry *entry; 1133 struct ftrace_func_entry *entry;
1135 struct hlist_head *hhd; 1134 struct hlist_head *hhd;
1136 struct hlist_node *n;
1137 1135
1138 if (ftrace_hash_empty(hash)) 1136 if (ftrace_hash_empty(hash))
1139 return NULL; 1137 return NULL;
@@ -1145,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1145 1143
1146 hhd = &hash->buckets[key]; 1144 hhd = &hash->buckets[key];
1147 1145
1148 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1146 hlist_for_each_entry_rcu(entry, hhd, hlist) {
1149 if (entry->ip == ip) 1147 if (entry->ip == ip)
1150 return entry; 1148 return entry;
1151 } 1149 }
@@ -1202,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash,
1202static void ftrace_hash_clear(struct ftrace_hash *hash) 1200static void ftrace_hash_clear(struct ftrace_hash *hash)
1203{ 1201{
1204 struct hlist_head *hhd; 1202 struct hlist_head *hhd;
1205 struct hlist_node *tp, *tn; 1203 struct hlist_node *tn;
1206 struct ftrace_func_entry *entry; 1204 struct ftrace_func_entry *entry;
1207 int size = 1 << hash->size_bits; 1205 int size = 1 << hash->size_bits;
1208 int i; 1206 int i;
@@ -1212,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1212 1210
1213 for (i = 0; i < size; i++) { 1211 for (i = 0; i < size; i++) {
1214 hhd = &hash->buckets[i]; 1212 hhd = &hash->buckets[i];
1215 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1213 hlist_for_each_entry_safe(entry, tn, hhd, hlist)
1216 free_hash_entry(hash, entry); 1214 free_hash_entry(hash, entry);
1217 } 1215 }
1218 FTRACE_WARN_ON(hash->count); 1216 FTRACE_WARN_ON(hash->count);
@@ -1275,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1275{ 1273{
1276 struct ftrace_func_entry *entry; 1274 struct ftrace_func_entry *entry;
1277 struct ftrace_hash *new_hash; 1275 struct ftrace_hash *new_hash;
1278 struct hlist_node *tp;
1279 int size; 1276 int size;
1280 int ret; 1277 int ret;
1281 int i; 1278 int i;
@@ -1290,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1290 1287
1291 size = 1 << hash->size_bits; 1288 size = 1 << hash->size_bits;
1292 for (i = 0; i < size; i++) { 1289 for (i = 0; i < size; i++) {
1293 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1290 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
1294 ret = add_hash_entry(new_hash, entry->ip); 1291 ret = add_hash_entry(new_hash, entry->ip);
1295 if (ret < 0) 1292 if (ret < 0)
1296 goto free_hash; 1293 goto free_hash;
@@ -1316,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1316 struct ftrace_hash **dst, struct ftrace_hash *src) 1313 struct ftrace_hash **dst, struct ftrace_hash *src)
1317{ 1314{
1318 struct ftrace_func_entry *entry; 1315 struct ftrace_func_entry *entry;
1319 struct hlist_node *tp, *tn; 1316 struct hlist_node *tn;
1320 struct hlist_head *hhd; 1317 struct hlist_head *hhd;
1321 struct ftrace_hash *old_hash; 1318 struct ftrace_hash *old_hash;
1322 struct ftrace_hash *new_hash; 1319 struct ftrace_hash *new_hash;
@@ -1362,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1362 size = 1 << src->size_bits; 1359 size = 1 << src->size_bits;
1363 for (i = 0; i < size; i++) { 1360 for (i = 0; i < size; i++) {
1364 hhd = &src->buckets[i]; 1361 hhd = &src->buckets[i];
1365 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1362 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1366 if (bits > 0) 1363 if (bits > 0)
1367 key = hash_long(entry->ip, bits); 1364 key = hash_long(entry->ip, bits);
1368 else 1365 else
@@ -2901,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2901{ 2898{
2902 struct ftrace_func_probe *entry; 2899 struct ftrace_func_probe *entry;
2903 struct hlist_head *hhd; 2900 struct hlist_head *hhd;
2904 struct hlist_node *n;
2905 unsigned long key; 2901 unsigned long key;
2906 2902
2907 key = hash_long(ip, FTRACE_HASH_BITS); 2903 key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2917,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2917 * on the hash. rcu_read_lock is too dangerous here. 2913 * on the hash. rcu_read_lock is too dangerous here.
2918 */ 2914 */
2919 preempt_disable_notrace(); 2915 preempt_disable_notrace();
2920 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2916 hlist_for_each_entry_rcu(entry, hhd, node) {
2921 if (entry->ip == ip) 2917 if (entry->ip == ip)
2922 entry->ops->func(ip, parent_ip, &entry->data); 2918 entry->ops->func(ip, parent_ip, &entry->data);
2923 } 2919 }
@@ -3068,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3068 void *data, int flags) 3064 void *data, int flags)
3069{ 3065{
3070 struct ftrace_func_probe *entry; 3066 struct ftrace_func_probe *entry;
3071 struct hlist_node *n, *tmp; 3067 struct hlist_node *tmp;
3072 char str[KSYM_SYMBOL_LEN]; 3068 char str[KSYM_SYMBOL_LEN];
3073 int type = MATCH_FULL; 3069 int type = MATCH_FULL;
3074 int i, len = 0; 3070 int i, len = 0;
@@ -3091,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3091 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3087 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3092 struct hlist_head *hhd = &ftrace_func_hash[i]; 3088 struct hlist_head *hhd = &ftrace_func_hash[i];
3093 3089
3094 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3090 hlist_for_each_entry_safe(entry, tmp, hhd, node) {
3095 3091
3096 /* break up if statements for readability */ 3092 /* break up if statements for readability */
3097 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3093 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7244acde77b0..6989df2ba194 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -178,7 +178,7 @@ void tracing_off_permanent(void)
178#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 178#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
179#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 179#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
180 180
181#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 181#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
182# define RB_FORCE_8BYTE_ALIGNMENT 0 182# define RB_FORCE_8BYTE_ALIGNMENT 0
183# define RB_ARCH_ALIGNMENT RB_ALIGNMENT 183# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
184#else 184#else
@@ -186,6 +186,8 @@ void tracing_off_permanent(void)
186# define RB_ARCH_ALIGNMENT 8U 186# define RB_ARCH_ALIGNMENT 8U
187#endif 187#endif
188 188
189#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
190
189/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 191/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
190#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 192#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
191 193
@@ -334,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
334struct buffer_data_page { 336struct buffer_data_page {
335 u64 time_stamp; /* page time stamp */ 337 u64 time_stamp; /* page time stamp */
336 local_t commit; /* write committed index */ 338 local_t commit; /* write committed index */
337 unsigned char data[]; /* data of buffer page */ 339 unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
338}; 340};
339 341
340/* 342/*
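The ring-buffer hunk adds RB_ALIGN_DATA so that, on architectures that select CONFIG_HAVE_64BIT_ALIGNED_ACCESS, the flexible data[] array starts on an 8-byte boundary. A condensed sketch of what the annotated struct expands to on such an architecture (field types simplified, names hypothetical):

#include <linux/types.h>
#include <linux/compiler.h>

struct demo_data_page {
	u64		time_stamp;		/* page time stamp */
	long		commit;			/* stands in for local_t here */
	unsigned char	data[] __aligned(8);	/* RB_ARCH_ALIGNMENT == 8U */
};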
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc7..697e88d13907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -739,12 +739,11 @@ static int task_state_char(unsigned long state)
739struct trace_event *ftrace_find_event(int type) 739struct trace_event *ftrace_find_event(int type)
740{ 740{
741 struct trace_event *event; 741 struct trace_event *event;
742 struct hlist_node *n;
743 unsigned key; 742 unsigned key;
744 743
745 key = type & (EVENT_HASHSIZE - 1); 744 key = type & (EVENT_HASHSIZE - 1);
746 745
747 hlist_for_each_entry(event, n, &event_hash[key], node) { 746 hlist_for_each_entry(event, &event_hash[key], node) {
748 if (event->type == type) 747 if (event->type == type)
749 return event; 748 return event;
750 } 749 }
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabfa..0c05a4592047 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
192static struct tracepoint_entry *get_tracepoint(const char *name) 192static struct tracepoint_entry *get_tracepoint(const char *name)
193{ 193{
194 struct hlist_head *head; 194 struct hlist_head *head;
195 struct hlist_node *node;
196 struct tracepoint_entry *e; 195 struct tracepoint_entry *e;
197 u32 hash = jhash(name, strlen(name), 0); 196 u32 hash = jhash(name, strlen(name), 0);
198 197
199 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 198 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
200 hlist_for_each_entry(e, node, head, hlist) { 199 hlist_for_each_entry(e, head, hlist) {
201 if (!strcmp(name, e->name)) 200 if (!strcmp(name, e->name))
202 return e; 201 return e;
203 } 202 }
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
211static struct tracepoint_entry *add_tracepoint(const char *name) 210static struct tracepoint_entry *add_tracepoint(const char *name)
212{ 211{
213 struct hlist_head *head; 212 struct hlist_head *head;
214 struct hlist_node *node;
215 struct tracepoint_entry *e; 213 struct tracepoint_entry *e;
216 size_t name_len = strlen(name) + 1; 214 size_t name_len = strlen(name) + 1;
217 u32 hash = jhash(name, name_len-1, 0); 215 u32 hash = jhash(name, name_len-1, 0);
218 216
219 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 217 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
220 hlist_for_each_entry(e, node, head, hlist) { 218 hlist_for_each_entry(e, head, hlist) {
221 if (!strcmp(name, e->name)) { 219 if (!strcmp(name, e->name)) {
222 printk(KERN_NOTICE 220 printk(KERN_NOTICE
223 "tracepoint %s busy\n", name); 221 "tracepoint %s busy\n", name);
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1fb..394f70b17162 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
34void fire_user_return_notifiers(void) 34void fire_user_return_notifiers(void)
35{ 35{
36 struct user_return_notifier *urn; 36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2; 37 struct hlist_node *tmp2;
38 struct hlist_head *head; 38 struct hlist_head *head;
39 39
40 head = &get_cpu_var(return_notifier_list); 40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) 41 hlist_for_each_entry_safe(urn, tmp2, head, link)
42 urn->on_user_return(urn); 42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list); 43 put_cpu_var(return_notifier_list);
44} 44}
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e53a5f..e81978e8c03b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -47,9 +47,7 @@ struct user_namespace init_user_ns = {
47 .count = 4294967295U, 47 .count = 4294967295U,
48 }, 48 },
49 }, 49 },
50 .kref = { 50 .count = ATOMIC_INIT(3),
51 .refcount = ATOMIC_INIT(3),
52 },
53 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
54 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
@@ -107,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up)
107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 105static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
108{ 106{
109 struct user_struct *user; 107 struct user_struct *user;
110 struct hlist_node *h;
111 108
112 hlist_for_each_entry(user, h, hashent, uidhash_node) { 109 hlist_for_each_entry(user, hashent, uidhash_node) {
113 if (uid_eq(user->uid, uid)) { 110 if (uid_eq(user->uid, uid)) {
114 atomic_inc(&user->__count); 111 atomic_inc(&user->__count);
115 return user; 112 return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c42fbc4..b14f4d342043 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/projid.h> 23#include <linux/projid.h>
24#include <linux/fs_struct.h>
24 25
25static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
26 27
@@ -78,7 +79,7 @@ int create_user_ns(struct cred *new)
78 return ret; 79 return ret;
79 } 80 }
80 81
81 kref_init(&ns->kref); 82 atomic_set(&ns->count, 1);
82 /* Leave the new->user_ns reference with the new user namespace. */ 83 /* Leave the new->user_ns reference with the new user namespace. */
83 ns->parent = parent_ns; 84 ns->parent = parent_ns;
84 ns->owner = owner; 85 ns->owner = owner;
@@ -104,15 +105,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
104 return create_user_ns(cred); 105 return create_user_ns(cred);
105} 106}
106 107
107void free_user_ns(struct kref *kref) 108void free_user_ns(struct user_namespace *ns)
108{ 109{
109 struct user_namespace *parent, *ns = 110 struct user_namespace *parent;
110 container_of(kref, struct user_namespace, kref);
111 111
112 parent = ns->parent; 112 do {
113 proc_free_inum(ns->proc_inum); 113 parent = ns->parent;
114 kmem_cache_free(user_ns_cachep, ns); 114 proc_free_inum(ns->proc_inum);
115 put_user_ns(parent); 115 kmem_cache_free(user_ns_cachep, ns);
116 ns = parent;
117 } while (atomic_dec_and_test(&parent->count));
116} 118}
117EXPORT_SYMBOL(free_user_ns); 119EXPORT_SYMBOL(free_user_ns);
118 120
@@ -519,6 +521,42 @@ struct seq_operations proc_projid_seq_operations = {
519 .show = projid_m_show, 521 .show = projid_m_show,
520}; 522};
521 523
524static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
525{
526 u32 upper_first, lower_first, upper_last, lower_last;
527 unsigned idx;
528
529 upper_first = extent->first;
530 lower_first = extent->lower_first;
531 upper_last = upper_first + extent->count - 1;
532 lower_last = lower_first + extent->count - 1;
533
534 for (idx = 0; idx < new_map->nr_extents; idx++) {
535 u32 prev_upper_first, prev_lower_first;
536 u32 prev_upper_last, prev_lower_last;
537 struct uid_gid_extent *prev;
538
539 prev = &new_map->extent[idx];
540
541 prev_upper_first = prev->first;
542 prev_lower_first = prev->lower_first;
543 prev_upper_last = prev_upper_first + prev->count - 1;
544 prev_lower_last = prev_lower_first + prev->count - 1;
545
546 /* Does the upper range intersect a previous extent? */
547 if ((prev_upper_first <= upper_last) &&
548 (prev_upper_last >= upper_first))
549 return true;
550
551 /* Does the lower range intersect a previous extent? */
552 if ((prev_lower_first <= lower_last) &&
553 (prev_lower_last >= lower_first))
554 return true;
555 }
556 return false;
557}
558
559
522static DEFINE_MUTEX(id_map_mutex); 560static DEFINE_MUTEX(id_map_mutex);
523 561
524static ssize_t map_write(struct file *file, const char __user *buf, 562static ssize_t map_write(struct file *file, const char __user *buf,
@@ -531,7 +569,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
531 struct user_namespace *ns = seq->private; 569 struct user_namespace *ns = seq->private;
532 struct uid_gid_map new_map; 570 struct uid_gid_map new_map;
533 unsigned idx; 571 unsigned idx;
534 struct uid_gid_extent *extent, *last = NULL; 572 struct uid_gid_extent *extent = NULL;
535 unsigned long page = 0; 573 unsigned long page = 0;
536 char *kbuf, *pos, *next_line; 574 char *kbuf, *pos, *next_line;
537 ssize_t ret = -EINVAL; 575 ssize_t ret = -EINVAL;
@@ -634,14 +672,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
634 if ((extent->lower_first + extent->count) <= extent->lower_first) 672 if ((extent->lower_first + extent->count) <= extent->lower_first)
635 goto out; 673 goto out;
636 674
637 /* For now only accept extents that are strictly in order */ 675 /* Do the ranges in extent overlap any previous extents? */
638 if (last && 676 if (mappings_overlap(&new_map, extent))
639 (((last->first + last->count) > extent->first) ||
640 ((last->lower_first + last->count) > extent->lower_first)))
641 goto out; 677 goto out;
642 678
643 new_map.nr_extents++; 679 new_map.nr_extents++;
644 last = extent;
645 680
646 /* Fail if the file contains too many extents */ 681 /* Fail if the file contains too many extents */
647 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && 682 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
@@ -803,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
803 if (atomic_read(&current->mm->mm_users) > 1) 838 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL; 839 return -EINVAL;
805 840
841 if (current->fs->users != 1)
842 return -EINVAL;
843
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 844 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM; 845 return -EPERM;
808 846
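The new mappings_overlap() helper above drops the old "extents must be strictly in order" rule and instead rejects any extent whose upper or lower ID range intersects a previously accepted one, which is why map_write() no longer needs the last pointer. The core of the check is a plain closed-interval intersection test; a standalone sketch with illustrative names, not kernel code:

#include <stdbool.h>
#include <stdint.h>

/* Closed ranges [a_first, a_last] and [b_first, b_last] intersect
 * iff each one starts no later than the other one ends.
 * mappings_overlap() applies this twice per previous extent: once for
 * the upper (in-namespace) IDs and once for the lower (parent) IDs. */
static bool ranges_intersect(uint32_t a_first, uint32_t a_last,
                             uint32_t b_first, uint32_t b_last)
{
        return a_first <= b_last && b_first <= a_last;
}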
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e8c485..a47fc5de3113 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void)
30/* 30/*
31 * Clone a new ns copying an original utsname, setting refcount to 1 31 * Clone a new ns copying an original utsname, setting refcount to 1
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
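The comment fix above records that clone_uts_ns() reports allocation failure as ERR_PTR(-ENOMEM) rather than NULL. For reference, a hedged sketch of how a caller in the same file would consume that convention (the caller name is made up; only the error handling is shown):

#include <linux/err.h>
#include <linux/utsname.h>
#include <linux/user_namespace.h>

/* With the ERR_PTR convention the errno is encoded in the pointer
 * itself, so the caller tests IS_ERR() and recovers the code with
 * PTR_ERR() instead of mapping NULL to a guessed -ENOMEM. */
static int install_cloned_ns(struct user_namespace *user_ns,
                             struct uts_namespace *old_ns)
{
        struct uts_namespace *ns = clone_uts_ns(user_ns, old_ns);

        if (IS_ERR(ns))
                return PTR_ERR(ns);
        /* ... hand ns over to the caller's nsproxy ... */
        return 0;
}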
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d820..4f69f9a5e221 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17 17
18#ifdef CONFIG_PROC_SYSCTL
19
18static void *get_uts(ctl_table *table, int write) 20static void *get_uts(ctl_table *table, int write)
19{ 21{
20 char *which = table->data; 22 char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
38 up_write(&uts_sem); 40 up_write(&uts_sem);
39} 41}
40 42
41#ifdef CONFIG_PROC_SYSCTL
42/* 43/*
43 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
44 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
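Moving the #ifdef CONFIG_PROC_SYSCTL guard up so it also covers get_uts()/put_uts() follows the usual pattern for static helpers whose only callers are themselves conditional: guard helper and caller together so configurations without the option do not define unused functions. A trivial sketch with placeholder names:

#ifdef CONFIG_FOO                       /* placeholder option */
static int foo_helper(void)             /* only used below */
{
        return 42;
}

static int foo_proc_handler(void)
{
        return foo_helper();
}
#endif /* CONFIG_FOO */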
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 27689422aa92..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -113,9 +113,9 @@ static int get_softlockup_thresh(void)
113 * resolution, and we don't need to waste time with a big divide when 113 * resolution, and we don't need to waste time with a big divide when
114 * 2^30ns == 1.074s. 114 * 2^30ns == 1.074s.
115 */ 115 */
116static unsigned long get_timestamp(int this_cpu) 116static unsigned long get_timestamp(void)
117{ 117{
118 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 118 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
119} 119}
120 120
121static void set_sample_period(void) 121static void set_sample_period(void)
@@ -133,9 +133,7 @@ static void set_sample_period(void)
133/* Commands for resetting the watchdog */ 133/* Commands for resetting the watchdog */
134static void __touch_watchdog(void) 134static void __touch_watchdog(void)
135{ 135{
136 int this_cpu = smp_processor_id(); 136 __this_cpu_write(watchdog_touch_ts, get_timestamp());
137
138 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
139} 137}
140 138
141void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
@@ -196,7 +194,7 @@ static int is_hardlockup(void)
196 194
197static int is_softlockup(unsigned long touch_ts) 195static int is_softlockup(unsigned long touch_ts)
198{ 196{
199 unsigned long now = get_timestamp(smp_processor_id()); 197 unsigned long now = get_timestamp();
200 198
201 /* Warn about unreasonable delays: */ 199 /* Warn about unreasonable delays: */
202 if (time_after(now, touch_ts + get_softlockup_thresh())) 200 if (time_after(now, touch_ts + get_softlockup_thresh()))
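The watchdog change swaps cpu_clock(this_cpu) for local_clock(), which reads the current CPU's clock without the caller fetching smp_processor_id() first; the >> 30 trick is unchanged. A sketch of that conversion on its own (helper name is illustrative):

/* 2^30 ns ~= 1.074 s, so a right shift by 30 turns a nanosecond
 * timestamp into coarse "seconds" that are good enough for the
 * softlockup threshold comparison, without a 64-bit divide. */
static unsigned long coarse_seconds(unsigned long long ns)
{
        return (unsigned long)(ns >> 30);       /* 2^30 ~= 10^9 */
}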
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f4feacad3812..55fac5b991b7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,8 +251,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
251 for ((pool) = &std_worker_pools(cpu)[0]; \ 251 for ((pool) = &std_worker_pools(cpu)[0]; \
252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) 252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
253 253
254#define for_each_busy_worker(worker, i, pos, pool) \ 254#define for_each_busy_worker(worker, i, pool) \
255 hash_for_each(pool->busy_hash, i, pos, worker, hentry) 255 hash_for_each(pool->busy_hash, i, worker, hentry)
256 256
257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
258 unsigned int sw) 258 unsigned int sw)
@@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool)
457 int ret; 457 int ret;
458 458
459 mutex_lock(&worker_pool_idr_mutex); 459 mutex_lock(&worker_pool_idr_mutex);
460 idr_pre_get(&worker_pool_idr, GFP_KERNEL); 460 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
461 ret = idr_get_new(&worker_pool_idr, pool, &pool->id); 461 if (ret >= 0)
462 pool->id = ret;
462 mutex_unlock(&worker_pool_idr_mutex); 463 mutex_unlock(&worker_pool_idr_mutex);
463 464
464 return ret; 465 return ret < 0 ? ret : 0;
465} 466}
466 467
467/* 468/*
@@ -909,9 +910,8 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
909 struct work_struct *work) 910 struct work_struct *work)
910{ 911{
911 struct worker *worker; 912 struct worker *worker;
912 struct hlist_node *tmp;
913 913
914 hash_for_each_possible(pool->busy_hash, worker, tmp, hentry, 914 hash_for_each_possible(pool->busy_hash, worker, hentry,
915 (unsigned long)work) 915 (unsigned long)work)
916 if (worker->current_work == work && 916 if (worker->current_work == work &&
917 worker->current_func == work->func) 917 worker->current_func == work->func)
@@ -1626,7 +1626,6 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1626static void rebind_workers(struct worker_pool *pool) 1626static void rebind_workers(struct worker_pool *pool)
1627{ 1627{
1628 struct worker *worker, *n; 1628 struct worker *worker, *n;
1629 struct hlist_node *pos;
1630 int i; 1629 int i;
1631 1630
1632 lockdep_assert_held(&pool->assoc_mutex); 1631 lockdep_assert_held(&pool->assoc_mutex);
@@ -1648,7 +1647,7 @@ static void rebind_workers(struct worker_pool *pool)
1648 } 1647 }
1649 1648
1650 /* rebind busy workers */ 1649 /* rebind busy workers */
1651 for_each_busy_worker(worker, i, pos, pool) { 1650 for_each_busy_worker(worker, i, pool) {
1652 struct work_struct *rebind_work = &worker->rebind_work; 1651 struct work_struct *rebind_work = &worker->rebind_work;
1653 struct workqueue_struct *wq; 1652 struct workqueue_struct *wq;
1654 1653
@@ -3423,7 +3422,6 @@ static void wq_unbind_fn(struct work_struct *work)
3423 int cpu = smp_processor_id(); 3422 int cpu = smp_processor_id();
3424 struct worker_pool *pool; 3423 struct worker_pool *pool;
3425 struct worker *worker; 3424 struct worker *worker;
3426 struct hlist_node *pos;
3427 int i; 3425 int i;
3428 3426
3429 for_each_std_worker_pool(pool, cpu) { 3427 for_each_std_worker_pool(pool, cpu) {
@@ -3442,7 +3440,7 @@ static void wq_unbind_fn(struct work_struct *work)
3442 list_for_each_entry(worker, &pool->idle_list, entry) 3440 list_for_each_entry(worker, &pool->idle_list, entry)
3443 worker->flags |= WORKER_UNBOUND; 3441 worker->flags |= WORKER_UNBOUND;
3444 3442
3445 for_each_busy_worker(worker, i, pos, pool) 3443 for_each_busy_worker(worker, i, pool)
3446 worker->flags |= WORKER_UNBOUND; 3444 worker->flags |= WORKER_UNBOUND;
3447 3445
3448 pool->flags |= POOL_DISASSOCIATED; 3446 pool->flags |= POOL_DISASSOCIATED;
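The worker_pool_assign_id() hunk converts the old two-step idr_pre_get()/idr_get_new() sequence into a single idr_alloc() call, which returns the newly allocated id (>= 0) or a negative errno; the remaining hunks only track the hash_for_each()/hash_for_each_possible() API dropping its struct hlist_node cursor argument. A hedged sketch of the allocation pattern, with an illustrative wrapper name and locking elided:

#include <linux/idr.h>
#include <linux/gfp.h>

/* Allocate an id for @obj starting at 0 with no upper bound (end == 0),
 * store it in *out_id on success, and map idr_alloc()'s "id or -errno"
 * return onto the 0/-errno convention used by the caller above. */
static int assign_id(struct idr *idr, void *obj, int *out_id)
{
        int id = idr_alloc(idr, obj, 0, 0, GFP_KERNEL);

        if (id < 0)
                return id;              /* e.g. -ENOMEM or -ENOSPC */
        *out_id = id;
        return 0;
}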