path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 77
-rw-r--r--  kernel/acct.c | 10
-rw-r--r--  kernel/audit.c | 144
-rw-r--r--  kernel/audit.h | 11
-rw-r--r--  kernel/audit_watch.c | 5
-rw-r--r--  kernel/auditfilter.c | 202
-rw-r--r--  kernel/auditsc.c | 510
-rw-r--r--  kernel/cgroup.c | 332
-rw-r--r--  kernel/cgroup_freezer.c | 8
-rw-r--r--  kernel/cpu.c | 4
-rw-r--r--  kernel/cred.c | 10
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 33
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 33
-rw-r--r--  kernel/events/core.c | 81
-rw-r--r--  kernel/events/uprobes.c | 353
-rw-r--r--  kernel/exit.c | 100
-rw-r--r--  kernel/fork.c | 46
-rw-r--r--  kernel/irq/irqdomain.c | 33
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  kernel/kexec.c | 1
-rw-r--r--  kernel/kmod.c | 7
-rw-r--r--  kernel/kthread.c | 1
-rw-r--r--  kernel/modsign_pubkey.c | 113
-rw-r--r--  kernel/module-internal.h | 14
-rw-r--r--  kernel/module.c | 149
-rw-r--r--  kernel/module_signing.c | 249
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pid_namespace.c | 21
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 2
-rw-r--r--  kernel/power/qos.c | 1
-rw-r--r--  kernel/printk.c | 1
-rw-r--r--  kernel/ptrace.c | 3
-rw-r--r--  kernel/rcutree.c | 21
-rw-r--r--  kernel/rcutree.h | 6
-rw-r--r--  kernel/resource.c | 50
-rw-r--r--  kernel/sched/core.c | 71
-rw-r--r--  kernel/signal.c | 3
-rw-r--r--  kernel/srcu.c | 4
-rw-r--r--  kernel/sys.c | 29
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/taskstats.c | 39
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/alarmtimer.c | 118
-rw-r--r--  kernel/time/clockevents.c | 24
-rw-r--r--  kernel/time/jiffies.c | 32
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 119
-rw-r--r--  kernel/timer.c | 10
-rw-r--r--  kernel/trace/trace.c | 11
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_functions.c | 15
-rw-r--r--  kernel/tsacct.c | 12
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/user_namespace.c | 128
-rw-r--r--  kernel/workqueue.c | 1217
60 files changed, 2846 insertions(+), 1692 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 5404911eaee9..0dfeca4324ee 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -130,3 +131,79 @@ quiet_cmd_timeconst = TIMEC $@
 targets += timeconst.h
 $(obj)/timeconst.h: $(src)/timeconst.pl FORCE
 	$(call if_changed,timeconst)
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+#
+# Pull the signing certificate and any extra certificates into the kernel
+#
+extra_certificates:
+	touch $@
+
+kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+sign_key_with_hash :=
+ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
+sign_key_with_hash := -sha1
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
+sign_key_with_hash := -sha224
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
+sign_key_with_hash := -sha256
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
+sign_key_with_hash := -sha384
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
+sign_key_with_hash := -sha512
+endif
+ifeq ($(sign_key_with_hash),)
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+	@echo "###"
+	@echo "### Now generating an X.509 key pair to be used for signing modules."
+	@echo "###"
+	@echo "### If this takes a long time, you might wish to run rngd in the"
+	@echo "### background to keep the supply of entropy topped up.  It"
+	@echo "### needs to be run as root, and should use a hardware random"
+	@echo "### number generator if one is available, eg:"
+	@echo "###"
+	@echo "###     rngd -r /dev/hwrandom"
+	@echo "###"
+	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
+		-x509 -config x509.genkey \
+		-outform DER -out signing_key.x509 \
+		-keyout signing_key.priv
+	@echo "###"
+	@echo "### Key pair generated."
+	@echo "###"
+
+x509.genkey:
+	@echo Generating X.509 key generation config
+	@echo  >x509.genkey "[ req ]"
+	@echo >>x509.genkey "default_bits = 4096"
+	@echo >>x509.genkey "distinguished_name = req_distinguished_name"
+	@echo >>x509.genkey "prompt = no"
+	@echo >>x509.genkey "string_mask = utf8only"
+	@echo >>x509.genkey "x509_extensions = myexts"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ req_distinguished_name ]"
+	@echo >>x509.genkey "O = Magrathea"
+	@echo >>x509.genkey "CN = Glacier signing key"
+	@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ myexts ]"
+	@echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+	@echo >>x509.genkey "keyUsage=digitalSignature"
+	@echo >>x509.genkey "subjectKeyIdentifier=hash"
+	@echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b0..051e071a06e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	}
 }
 
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt;
@@ -201,7 +201,7 @@ static int acct_on(char *name)
 	struct bsd_acct_struct *acct = NULL;
 
 	/* Difference from BSD - they don't do O_APPEND */
-	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		return -EPERM;
 
 	if (name) {
-		char *tmp = getname(name);
+		struct filename *tmp = getname(name);
 		if (IS_ERR(tmp))
 			return (PTR_ERR(tmp));
 		error = acct_on(tmp);
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	do_div(elapsed, AHZ);
 	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
-	ac.ac_uid = orig_cred->uid;
-	ac.ac_gid = orig_cred->gid;
+	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
 #if ACCT_VERSION==2
 	ac.ac_ahz = AHZ;
 #endif
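
Note: the acct.c hunks above stop writing raw cred uids/gids into the on-disk accounting record and instead translate the kernel-internal kuid_t/kgid_t through the accounting file opener's user namespace. The following is a minimal sketch, not taken from the patch, of that conversion pattern; the helper names are hypothetical and init_user_ns is used purely for illustration.

/*
 * Sketch of kuid_t <-> numeric uid conversion as used by the acct/audit
 * changes in this series. Helper names are illustrative only.
 */
#include <linux/uidgid.h>
#include <linux/cred.h>

static uid_t example_uid_for_disk(void)
{
	kuid_t kuid = current_uid();	/* namespace-neutral kernel uid */

	/* Map back to a numeric uid as seen from init_user_ns; ids with no
	 * mapping come back as the overflow uid instead of failing. */
	return from_kuid_munged(&init_user_ns, kuid);
}

static int example_uid_from_user(uid_t val, kuid_t *out)
{
	kuid_t kuid = make_kuid(current_user_ns(), val);

	if (!uid_valid(kuid))		/* no mapping in this namespace */
		return -EINVAL;
	*out = kuid;
	return 0;
}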
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c7..40414e9143db 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,6 +61,7 @@
61#include <linux/netlink.h> 61#include <linux/netlink.h>
62#include <linux/freezer.h> 62#include <linux/freezer.h>
63#include <linux/tty.h> 63#include <linux/tty.h>
64#include <linux/pid_namespace.h>
64 65
65#include "audit.h" 66#include "audit.h"
66 67
@@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
87 88
88/* 89/*
89 * If audit records are to be written to the netlink socket, audit_pid 90 * If audit records are to be written to the netlink socket, audit_pid
90 * contains the pid of the auditd process and audit_nlk_pid contains 91 * contains the pid of the auditd process and audit_nlk_portid contains
91 * the pid to use to send netlink messages to that process. 92 * the portid to use to send netlink messages to that process.
92 */ 93 */
93int audit_pid; 94int audit_pid;
94static int audit_nlk_pid; 95static int audit_nlk_portid;
95 96
96/* If audit_rate_limit is non-zero, limit the rate of sending audit records 97/* If audit_rate_limit is non-zero, limit the rate of sending audit records
97 * to that number per second. This prevents DoS attacks, but results in 98 * to that number per second. This prevents DoS attacks, but results in
@@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ;
104static int audit_backlog_wait_overflow = 0; 105static int audit_backlog_wait_overflow = 0;
105 106
106/* The identity of the user shutting down the audit system. */ 107/* The identity of the user shutting down the audit system. */
107uid_t audit_sig_uid = -1; 108kuid_t audit_sig_uid = INVALID_UID;
108pid_t audit_sig_pid = -1; 109pid_t audit_sig_pid = -1;
109u32 audit_sig_sid = 0; 110u32 audit_sig_sid = 0;
110 111
@@ -264,7 +265,7 @@ void audit_log_lost(const char *message)
264} 265}
265 266
266static int audit_log_config_change(char *function_name, int new, int old, 267static int audit_log_config_change(char *function_name, int new, int old,
267 uid_t loginuid, u32 sessionid, u32 sid, 268 kuid_t loginuid, u32 sessionid, u32 sid,
268 int allow_changes) 269 int allow_changes)
269{ 270{
270 struct audit_buffer *ab; 271 struct audit_buffer *ab;
@@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
272 273
273 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
274 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 275 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
275 old, loginuid, sessionid); 276 old, from_kuid(&init_user_ns, loginuid), sessionid);
276 if (sid) { 277 if (sid) {
277 char *ctx = NULL; 278 char *ctx = NULL;
278 u32 len; 279 u32 len;
@@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
292} 293}
293 294
294static int audit_do_config_change(char *function_name, int *to_change, 295static int audit_do_config_change(char *function_name, int *to_change,
295 int new, uid_t loginuid, u32 sessionid, 296 int new, kuid_t loginuid, u32 sessionid,
296 u32 sid) 297 u32 sid)
297{ 298{
298 int allow_changes, rc = 0, old = *to_change; 299 int allow_changes, rc = 0, old = *to_change;
@@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change,
319 return rc; 320 return rc;
320} 321}
321 322
322static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, 323static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid,
323 u32 sid) 324 u32 sid)
324{ 325{
325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 326 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
326 limit, loginuid, sessionid, sid); 327 limit, loginuid, sessionid, sid);
327} 328}
328 329
329static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, 330static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid,
330 u32 sid) 331 u32 sid)
331{ 332{
332 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 333 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
333 limit, loginuid, sessionid, sid); 334 limit, loginuid, sessionid, sid);
334} 335}
335 336
336static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) 337static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
337{ 338{
338 int rc; 339 int rc;
339 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 340 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
348 return rc; 349 return rc;
349} 350}
350 351
351static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) 352static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid)
352{ 353{
353 if (state != AUDIT_FAIL_SILENT 354 if (state != AUDIT_FAIL_SILENT
354 && state != AUDIT_FAIL_PRINTK 355 && state != AUDIT_FAIL_PRINTK
@@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
401 int err; 402 int err;
402 /* take a reference in case we can't send it and we want to hold it */ 403 /* take a reference in case we can't send it and we want to hold it */
403 skb_get(skb); 404 skb_get(skb);
404 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 405 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
405 if (err < 0) { 406 if (err < 0) {
406 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 407 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
407 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 408 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy)
467 return 0; 468 return 0;
468} 469}
469 470
470static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
471{
472 struct task_struct *tsk;
473 int err;
474
475 rcu_read_lock();
476 tsk = find_task_by_vpid(pid);
477 if (!tsk) {
478 rcu_read_unlock();
479 return -ESRCH;
480 }
481 get_task_struct(tsk);
482 rcu_read_unlock();
483 err = tty_audit_push_task(tsk, loginuid, sessionid);
484 put_task_struct(tsk);
485 return err;
486}
487
488int audit_send_list(void *_dest) 471int audit_send_list(void *_dest)
489{ 472{
490 struct audit_netlink_list *dest = _dest; 473 struct audit_netlink_list *dest = _dest;
@@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
588{ 571{
589 int err = 0; 572 int err = 0;
590 573
574 /* Only support the initial namespaces for now. */
575 if ((current_user_ns() != &init_user_ns) ||
576 (task_active_pid_ns(current) != &init_pid_ns))
577 return -EPERM;
578
591 switch (msg_type) { 579 switch (msg_type) {
592 case AUDIT_GET: 580 case AUDIT_GET:
593 case AUDIT_LIST: 581 case AUDIT_LIST:
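
Note: the hunk above restricts audit netlink configuration to tasks in the initial user and pid namespaces. A minimal standalone sketch of that guard follows; example_ns_ok() is a hypothetical wrapper used only for illustration.

#include <linux/user_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>

static int example_ns_ok(void)
{
	/* Only the initial namespaces may configure auditing for now. */
	if ((current_user_ns() != &init_user_ns) ||
	    (task_active_pid_ns(current) != &init_pid_ns))
		return -EPERM;
	return 0;
}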
@@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
619} 607}
620 608
621static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 609static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
622 u32 pid, u32 uid, uid_t auid, u32 ses, 610 kuid_t auid, u32 ses, u32 sid)
623 u32 sid)
624{ 611{
625 int rc = 0; 612 int rc = 0;
626 char *ctx = NULL; 613 char *ctx = NULL;
@@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
633 620
634 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 621 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
635 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", 622 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
636 pid, uid, auid, ses); 623 task_tgid_vnr(current),
624 from_kuid(&init_user_ns, current_uid()),
625 from_kuid(&init_user_ns, auid), ses);
637 if (sid) { 626 if (sid) {
638 rc = security_secid_to_secctx(sid, &ctx, &len); 627 rc = security_secid_to_secctx(sid, &ctx, &len);
639 if (rc) 628 if (rc)
@@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
649 638
650static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 639static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
651{ 640{
652 u32 uid, pid, seq, sid; 641 u32 seq, sid;
653 void *data; 642 void *data;
654 struct audit_status *status_get, status_set; 643 struct audit_status *status_get, status_set;
655 int err; 644 int err;
656 struct audit_buffer *ab; 645 struct audit_buffer *ab;
657 u16 msg_type = nlh->nlmsg_type; 646 u16 msg_type = nlh->nlmsg_type;
658 uid_t loginuid; /* loginuid of sender */ 647 kuid_t loginuid; /* loginuid of sender */
659 u32 sessionid; 648 u32 sessionid;
660 struct audit_sig_info *sig_data; 649 struct audit_sig_info *sig_data;
661 char *ctx = NULL; 650 char *ctx = NULL;
@@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
675 return err; 664 return err;
676 } 665 }
677 666
678 pid = NETLINK_CREDS(skb)->pid;
679 uid = NETLINK_CREDS(skb)->uid;
680 loginuid = audit_get_loginuid(current); 667 loginuid = audit_get_loginuid(current);
681 sessionid = audit_get_sessionid(current); 668 sessionid = audit_get_sessionid(current);
682 security_task_getsecid(current, &sid); 669 security_task_getsecid(current, &sid);
@@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
692 status_set.backlog_limit = audit_backlog_limit; 679 status_set.backlog_limit = audit_backlog_limit;
693 status_set.lost = atomic_read(&audit_lost); 680 status_set.lost = atomic_read(&audit_lost);
694 status_set.backlog = skb_queue_len(&audit_skb_queue); 681 status_set.backlog = skb_queue_len(&audit_skb_queue);
695 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, 682 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
696 &status_set, sizeof(status_set)); 683 &status_set, sizeof(status_set));
697 break; 684 break;
698 case AUDIT_SET: 685 case AUDIT_SET:
@@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
720 sessionid, sid, 1); 707 sessionid, sid, 1);
721 708
722 audit_pid = new_pid; 709 audit_pid = new_pid;
723 audit_nlk_pid = NETLINK_CB(skb).pid; 710 audit_nlk_portid = NETLINK_CB(skb).portid;
724 } 711 }
725 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 712 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
726 err = audit_set_rate_limit(status_get->rate_limit, 713 err = audit_set_rate_limit(status_get->rate_limit,
@@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
738 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 725 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
739 return 0; 726 return 0;
740 727
741 err = audit_filter_user(&NETLINK_CB(skb)); 728 err = audit_filter_user();
742 if (err == 1) { 729 if (err == 1) {
743 err = 0; 730 err = 0;
744 if (msg_type == AUDIT_USER_TTY) { 731 if (msg_type == AUDIT_USER_TTY) {
745 err = audit_prepare_user_tty(pid, loginuid, 732 err = tty_audit_push_task(current, loginuid,
746 sessionid); 733 sessionid);
747 if (err) 734 if (err)
748 break; 735 break;
749 } 736 }
750 audit_log_common_recv_msg(&ab, msg_type, pid, uid, 737 audit_log_common_recv_msg(&ab, msg_type,
751 loginuid, sessionid, sid); 738 loginuid, sessionid, sid);
752 739
753 if (msg_type != AUDIT_USER_TTY) 740 if (msg_type != AUDIT_USER_TTY)
@@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
763 size--; 750 size--;
764 audit_log_n_untrustedstring(ab, data, size); 751 audit_log_n_untrustedstring(ab, data, size);
765 } 752 }
766 audit_set_pid(ab, pid); 753 audit_set_pid(ab, NETLINK_CB(skb).portid);
767 audit_log_end(ab); 754 audit_log_end(ab);
768 } 755 }
769 break; 756 break;
@@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
772 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 759 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
773 return -EINVAL; 760 return -EINVAL;
774 if (audit_enabled == AUDIT_LOCKED) { 761 if (audit_enabled == AUDIT_LOCKED) {
775 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 762 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
776 uid, loginuid, sessionid, sid); 763 loginuid, sessionid, sid);
777 764
778 audit_log_format(ab, " audit_enabled=%d res=0", 765 audit_log_format(ab, " audit_enabled=%d res=0",
779 audit_enabled); 766 audit_enabled);
@@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
782 } 769 }
783 /* fallthrough */ 770 /* fallthrough */
784 case AUDIT_LIST: 771 case AUDIT_LIST:
785 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, 772 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
786 uid, seq, data, nlmsg_len(nlh), 773 seq, data, nlmsg_len(nlh),
787 loginuid, sessionid, sid); 774 loginuid, sessionid, sid);
788 break; 775 break;
789 case AUDIT_ADD_RULE: 776 case AUDIT_ADD_RULE:
@@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
791 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 778 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
792 return -EINVAL; 779 return -EINVAL;
793 if (audit_enabled == AUDIT_LOCKED) { 780 if (audit_enabled == AUDIT_LOCKED) {
794 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 781 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
795 uid, loginuid, sessionid, sid); 782 loginuid, sessionid, sid);
796 783
797 audit_log_format(ab, " audit_enabled=%d res=0", 784 audit_log_format(ab, " audit_enabled=%d res=0",
798 audit_enabled); 785 audit_enabled);
@@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
801 } 788 }
802 /* fallthrough */ 789 /* fallthrough */
803 case AUDIT_LIST_RULES: 790 case AUDIT_LIST_RULES:
804 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, 791 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
805 uid, seq, data, nlmsg_len(nlh), 792 seq, data, nlmsg_len(nlh),
806 loginuid, sessionid, sid); 793 loginuid, sessionid, sid);
807 break; 794 break;
808 case AUDIT_TRIM: 795 case AUDIT_TRIM:
809 audit_trim_trees(); 796 audit_trim_trees();
810 797
811 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 798 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
812 uid, loginuid, sessionid, sid); 799 loginuid, sessionid, sid);
813 800
814 audit_log_format(ab, " op=trim res=1"); 801 audit_log_format(ab, " op=trim res=1");
815 audit_log_end(ab); 802 audit_log_end(ab);
@@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
840 /* OK, here comes... */ 827 /* OK, here comes... */
841 err = audit_tag_tree(old, new); 828 err = audit_tag_tree(old, new);
842 829
843 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 830 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
844 uid, loginuid, sessionid, sid); 831 loginuid, sessionid, sid);
845 832
846 audit_log_format(ab, " op=make_equiv old="); 833 audit_log_format(ab, " op=make_equiv old=");
847 audit_log_untrustedstring(ab, old); 834 audit_log_untrustedstring(ab, old);
@@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
866 security_release_secctx(ctx, len); 853 security_release_secctx(ctx, len);
867 return -ENOMEM; 854 return -ENOMEM;
868 } 855 }
869 sig_data->uid = audit_sig_uid; 856 sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
870 sig_data->pid = audit_sig_pid; 857 sig_data->pid = audit_sig_pid;
871 if (audit_sig_sid) { 858 if (audit_sig_sid) {
872 memcpy(sig_data->ctx, ctx, len); 859 memcpy(sig_data->ctx, ctx, len);
873 security_release_secctx(ctx, len); 860 security_release_secctx(ctx, len);
874 } 861 }
875 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 862 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
876 0, 0, sig_data, sizeof(*sig_data) + len); 863 0, 0, sig_data, sizeof(*sig_data) + len);
877 kfree(sig_data); 864 kfree(sig_data);
878 break; 865 break;
879 case AUDIT_TTY_GET: { 866 case AUDIT_TTY_GET: {
880 struct audit_tty_status s; 867 struct audit_tty_status s;
881 struct task_struct *tsk; 868 struct task_struct *tsk = current;
882 unsigned long flags; 869
883 870 spin_lock_irq(&tsk->sighand->siglock);
884 rcu_read_lock(); 871 s.enabled = tsk->signal->audit_tty != 0;
885 tsk = find_task_by_vpid(pid); 872 spin_unlock_irq(&tsk->sighand->siglock);
886 if (tsk && lock_task_sighand(tsk, &flags)) { 873
887 s.enabled = tsk->signal->audit_tty != 0; 874 audit_send_reply(NETLINK_CB(skb).portid, seq,
888 unlock_task_sighand(tsk, &flags); 875 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
889 } else
890 err = -ESRCH;
891 rcu_read_unlock();
892
893 if (!err)
894 audit_send_reply(NETLINK_CB(skb).pid, seq,
895 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 876 break;
897 } 877 }
898 case AUDIT_TTY_SET: { 878 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 879 struct audit_tty_status *s;
900 struct task_struct *tsk; 880 struct task_struct *tsk = current;
901 unsigned long flags;
902 881
903 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 882 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
904 return -EINVAL; 883 return -EINVAL;
905 s = data; 884 s = data;
906 if (s->enabled != 0 && s->enabled != 1) 885 if (s->enabled != 0 && s->enabled != 1)
907 return -EINVAL; 886 return -EINVAL;
908 rcu_read_lock(); 887
909 tsk = find_task_by_vpid(pid); 888 spin_lock_irq(&tsk->sighand->siglock);
910 if (tsk && lock_task_sighand(tsk, &flags)) { 889 tsk->signal->audit_tty = s->enabled != 0;
911 tsk->signal->audit_tty = s->enabled != 0; 890 spin_unlock_irq(&tsk->sighand->siglock);
912 unlock_task_sighand(tsk, &flags);
913 } else
914 err = -ESRCH;
915 rcu_read_unlock();
916 break; 891 break;
917 } 892 }
918 default: 893 default:
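
Note: after the AUDIT_TTY_GET/AUDIT_TTY_SET rework above, both branches act on current rather than resolving an arbitrary pid, so the flag is read and written under the caller's own siglock. A minimal sketch of that pattern, with a hypothetical helper name:

#include <linux/sched.h>
#include <linux/spinlock.h>

static void example_set_audit_tty(int enabled)
{
	struct task_struct *tsk = current;

	spin_lock_irq(&tsk->sighand->siglock);
	tsk->signal->audit_tty = enabled != 0;
	spin_unlock_irq(&tsk->sighand->siglock);
}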
@@ -971,8 +946,7 @@ static int __init audit_init(void)
971 946
972 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 947 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
973 audit_default ? "enabled" : "disabled"); 948 audit_default ? "enabled" : "disabled");
974 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 949 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg);
975 THIS_MODULE, &cfg);
976 if (!audit_sock) 950 if (!audit_sock)
977 audit_panic("cannot initialize netlink socket"); 951 audit_panic("cannot initialize netlink socket");
978 else 952 else
@@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)
1466 1440
1467 ab = audit_log_start(current->audit_context, GFP_KERNEL, 1441 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1468 AUDIT_ANOM_LINK); 1442 AUDIT_ANOM_LINK);
1443 if (!ab)
1444 return;
1469 audit_log_format(ab, "op=%s action=denied", operation); 1445 audit_log_format(ab, "op=%s action=denied", operation);
1470 audit_log_format(ab, " pid=%d comm=", current->pid); 1446 audit_log_format(ab, " pid=%d comm=", current->pid);
1471 audit_log_untrustedstring(ab, current->comm); 1447 audit_log_untrustedstring(ab, current->comm);
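
Note: the audit_log_link_denied() hunk adds the missing NULL check after audit_log_start(), which returns NULL when auditing is disabled or a buffer cannot be allocated. A minimal sketch of that defensive pattern, with an illustrative message and helper name:

#include <linux/audit.h>
#include <linux/sched.h>
#include <linux/gfp.h>

static void example_log_denied(const char *operation)
{
	struct audit_buffer *ab;

	ab = audit_log_start(current->audit_context, GFP_KERNEL,
			     AUDIT_ANOM_LINK);
	if (!ab)	/* auditing disabled or allocation failed */
		return;
	audit_log_format(ab, "op=%s action=denied", operation);
	audit_log_end(ab);
}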
diff --git a/kernel/audit.h b/kernel/audit.h
index 816766803371..d51cba868e1b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino)
 	return (ino & (AUDIT_INODE_BUCKETS-1));
 }
 
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
 extern int audit_match_class(int class, unsigned syscall);
 extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
-				    int *dirlen);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
 extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     const void *payload, int size);
@@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *);
 extern char *audit_unpack_string(void **, size_t *, size_t);
 
 extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
 extern u32 audit_sig_sid;
 
 #ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 3823281401b5..9a9ae6e3d290 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 		struct audit_buffer *ab;
 		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
 		audit_log_format(ab, "auid=%u ses=%u op=",
-				 audit_get_loginuid(current),
+				 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 				 audit_get_sessionid(current));
 		audit_log_string(ab, op);
 		audit_log_format(ab, " path=");
@@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,
 	/* Run all of the watches on this parent looking for the one that
 	 * matches the given dname */
 	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
-		if (audit_compare_dname_path(dname, owatch->path, NULL))
+		if (audit_compare_dname_path(dname, owatch->path,
+					     AUDIT_NAME_FULL))
 			continue;
 
 		/* If the update involves invalidating rules, do the inode-based
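
Note: audit_update_watch() now passes AUDIT_NAME_FULL so audit_compare_dname_path() computes the parent length itself. A small sketch of how that comparison is used; the path value is hypothetical and chosen only to illustrate the matching rule.

#include "audit.h"	/* audit_compare_dname_path(), AUDIT_NAME_FULL */

static int example_watch_match(const char *dname)
{
	/* "/var/log/wtmp" is an illustrative watch path; parent_len() yields
	 * 9 (the "/var/log/" prefix), so only the final component "wtmp" is
	 * compared against dname. Return value 0 from the helper means the
	 * names match. */
	return audit_compare_dname_path(dname, "/var/log/wtmp",
					AUDIT_NAME_FULL) == 0;
}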
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd206..7f19f23d38a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
342 342
343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
344 f->val = rule->values[i]; 344 f->val = rule->values[i];
345 f->uid = INVALID_UID;
346 f->gid = INVALID_GID;
345 347
346 err = -EINVAL; 348 err = -EINVAL;
347 if (f->op == Audit_bad) 349 if (f->op == Audit_bad)
@@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
350 switch(f->type) { 352 switch(f->type) {
351 default: 353 default:
352 goto exit_free; 354 goto exit_free;
353 case AUDIT_PID:
354 case AUDIT_UID: 355 case AUDIT_UID:
355 case AUDIT_EUID: 356 case AUDIT_EUID:
356 case AUDIT_SUID: 357 case AUDIT_SUID:
357 case AUDIT_FSUID: 358 case AUDIT_FSUID:
359 case AUDIT_LOGINUID:
360 /* bit ops not implemented for uid comparisons */
361 if (f->op == Audit_bitmask || f->op == Audit_bittest)
362 goto exit_free;
363
364 f->uid = make_kuid(current_user_ns(), f->val);
365 if (!uid_valid(f->uid))
366 goto exit_free;
367 break;
358 case AUDIT_GID: 368 case AUDIT_GID:
359 case AUDIT_EGID: 369 case AUDIT_EGID:
360 case AUDIT_SGID: 370 case AUDIT_SGID:
361 case AUDIT_FSGID: 371 case AUDIT_FSGID:
362 case AUDIT_LOGINUID: 372 /* bit ops not implemented for gid comparisons */
373 if (f->op == Audit_bitmask || f->op == Audit_bittest)
374 goto exit_free;
375
376 f->gid = make_kgid(current_user_ns(), f->val);
377 if (!gid_valid(f->gid))
378 goto exit_free;
379 break;
380 case AUDIT_PID:
363 case AUDIT_PERS: 381 case AUDIT_PERS:
364 case AUDIT_MSGTYPE: 382 case AUDIT_MSGTYPE:
365 case AUDIT_PPID: 383 case AUDIT_PPID:
@@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
437 455
438 f->type = data->fields[i]; 456 f->type = data->fields[i];
439 f->val = data->values[i]; 457 f->val = data->values[i];
458 f->uid = INVALID_UID;
459 f->gid = INVALID_GID;
440 f->lsm_str = NULL; 460 f->lsm_str = NULL;
441 f->lsm_rule = NULL; 461 f->lsm_rule = NULL;
442 switch(f->type) { 462 switch(f->type) {
443 case AUDIT_PID:
444 case AUDIT_UID: 463 case AUDIT_UID:
445 case AUDIT_EUID: 464 case AUDIT_EUID:
446 case AUDIT_SUID: 465 case AUDIT_SUID:
447 case AUDIT_FSUID: 466 case AUDIT_FSUID:
467 case AUDIT_LOGINUID:
468 case AUDIT_OBJ_UID:
469 /* bit ops not implemented for uid comparisons */
470 if (f->op == Audit_bitmask || f->op == Audit_bittest)
471 goto exit_free;
472
473 f->uid = make_kuid(current_user_ns(), f->val);
474 if (!uid_valid(f->uid))
475 goto exit_free;
476 break;
448 case AUDIT_GID: 477 case AUDIT_GID:
449 case AUDIT_EGID: 478 case AUDIT_EGID:
450 case AUDIT_SGID: 479 case AUDIT_SGID:
451 case AUDIT_FSGID: 480 case AUDIT_FSGID:
452 case AUDIT_LOGINUID: 481 case AUDIT_OBJ_GID:
482 /* bit ops not implemented for gid comparisons */
483 if (f->op == Audit_bitmask || f->op == Audit_bittest)
484 goto exit_free;
485
486 f->gid = make_kgid(current_user_ns(), f->val);
487 if (!gid_valid(f->gid))
488 goto exit_free;
489 break;
490 case AUDIT_PID:
453 case AUDIT_PERS: 491 case AUDIT_PERS:
454 case AUDIT_MSGTYPE: 492 case AUDIT_MSGTYPE:
455 case AUDIT_PPID: 493 case AUDIT_PPID:
@@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
461 case AUDIT_ARG1: 499 case AUDIT_ARG1:
462 case AUDIT_ARG2: 500 case AUDIT_ARG2:
463 case AUDIT_ARG3: 501 case AUDIT_ARG3:
464 case AUDIT_OBJ_UID:
465 case AUDIT_OBJ_GID:
466 break; 502 break;
467 case AUDIT_ARCH: 503 case AUDIT_ARCH:
468 entry->rule.arch_f = f; 504 entry->rule.arch_f = f;
@@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
707 if (strcmp(a->filterkey, b->filterkey)) 743 if (strcmp(a->filterkey, b->filterkey))
708 return 1; 744 return 1;
709 break; 745 break;
746 case AUDIT_UID:
747 case AUDIT_EUID:
748 case AUDIT_SUID:
749 case AUDIT_FSUID:
750 case AUDIT_LOGINUID:
751 case AUDIT_OBJ_UID:
752 if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
753 return 1;
754 break;
755 case AUDIT_GID:
756 case AUDIT_EGID:
757 case AUDIT_SGID:
758 case AUDIT_FSGID:
759 case AUDIT_OBJ_GID:
760 if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
761 return 1;
762 break;
710 default: 763 default:
711 if (a->fields[i].val != b->fields[i].val) 764 if (a->fields[i].val != b->fields[i].val)
712 return 1; 765 return 1;
@@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1056} 1109}
1057 1110
1058/* Log rule additions and removals */ 1111/* Log rule additions and removals */
1059static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, 1112static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1060 char *action, struct audit_krule *rule, 1113 char *action, struct audit_krule *rule,
1061 int res) 1114 int res)
1062{ 1115{
@@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1068 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1121 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1069 if (!ab) 1122 if (!ab)
1070 return; 1123 return;
1071 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); 1124 audit_log_format(ab, "auid=%u ses=%u",
1125 from_kuid(&init_user_ns, loginuid), sessionid);
1072 if (sid) { 1126 if (sid) {
1073 char *ctx = NULL; 1127 char *ctx = NULL;
1074 u32 len; 1128 u32 len;
@@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1098 * @sessionid: sessionid for netlink audit message 1152 * @sessionid: sessionid for netlink audit message
1099 * @sid: SE Linux Security ID of sender 1153 * @sid: SE Linux Security ID of sender
1100 */ 1154 */
1101int audit_receive_filter(int type, int pid, int uid, int seq, void *data, 1155int audit_receive_filter(int type, int pid, int seq, void *data,
1102 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) 1156 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)
1103{ 1157{
1104 struct task_struct *tsk; 1158 struct task_struct *tsk;
1105 struct audit_netlink_list *dest; 1159 struct audit_netlink_list *dest;
@@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right)
1198 } 1252 }
1199} 1253}
1200 1254
1201/* Compare given dentry name with last component in given path, 1255int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
1202 * return of 0 indicates a match. */
1203int audit_compare_dname_path(const char *dname, const char *path,
1204 int *dirlen)
1205{ 1256{
1206 int dlen, plen; 1257 switch (op) {
1207 const char *p; 1258 case Audit_equal:
1259 return uid_eq(left, right);
1260 case Audit_not_equal:
1261 return !uid_eq(left, right);
1262 case Audit_lt:
1263 return uid_lt(left, right);
1264 case Audit_le:
1265 return uid_lte(left, right);
1266 case Audit_gt:
1267 return uid_gt(left, right);
1268 case Audit_ge:
1269 return uid_gte(left, right);
1270 case Audit_bitmask:
1271 case Audit_bittest:
1272 default:
1273 BUG();
1274 return 0;
1275 }
1276}
1208 1277
1209 if (!dname || !path) 1278int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
1210 return 1; 1279{
1280 switch (op) {
1281 case Audit_equal:
1282 return gid_eq(left, right);
1283 case Audit_not_equal:
1284 return !gid_eq(left, right);
1285 case Audit_lt:
1286 return gid_lt(left, right);
1287 case Audit_le:
1288 return gid_lte(left, right);
1289 case Audit_gt:
1290 return gid_gt(left, right);
1291 case Audit_ge:
1292 return gid_gte(left, right);
1293 case Audit_bitmask:
1294 case Audit_bittest:
1295 default:
1296 BUG();
1297 return 0;
1298 }
1299}
1300
1301/**
1302 * parent_len - find the length of the parent portion of a pathname
1303 * @path: pathname of which to determine length
1304 */
1305int parent_len(const char *path)
1306{
1307 int plen;
1308 const char *p;
1211 1309
1212 dlen = strlen(dname);
1213 plen = strlen(path); 1310 plen = strlen(path);
1214 if (plen < dlen) 1311
1215 return 1; 1312 if (plen == 0)
1313 return plen;
1216 1314
1217 /* disregard trailing slashes */ 1315 /* disregard trailing slashes */
1218 p = path + plen - 1; 1316 p = path + plen - 1;
1219 while ((*p == '/') && (p > path)) 1317 while ((*p == '/') && (p > path))
1220 p--; 1318 p--;
1221 1319
1222 /* find last path component */ 1320 /* walk backward until we find the next slash or hit beginning */
1223 p = p - dlen + 1; 1321 while ((*p != '/') && (p > path))
1224 if (p < path) 1322 p--;
1323
1324 /* did we find a slash? Then increment to include it in path */
1325 if (*p == '/')
1326 p++;
1327
1328 return p - path;
1329}
1330
1331/**
1332 * audit_compare_dname_path - compare given dentry name with last component in
1333 * given path. Return of 0 indicates a match.
1334 * @dname: dentry name that we're comparing
1335 * @path: full pathname that we're comparing
1336 * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
1337 * here indicates that we must compute this value.
1338 */
1339int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
1340{
1341 int dlen, pathlen;
1342 const char *p;
1343
1344 dlen = strlen(dname);
1345 pathlen = strlen(path);
1346 if (pathlen < dlen)
1225 return 1; 1347 return 1;
1226 else if (p > path) {
1227 if (*--p != '/')
1228 return 1;
1229 else
1230 p++;
1231 }
1232 1348
1233 /* return length of path's directory component */ 1349 parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
1234 if (dirlen) 1350 if (pathlen - parentlen != dlen)
1235 *dirlen = p - path; 1351 return 1;
1352
1353 p = path + parentlen;
1354
1236 return strncmp(p, dname, dlen); 1355 return strncmp(p, dname, dlen);
1237} 1356}
1238 1357
1239static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1358static int audit_filter_user_rules(struct audit_krule *rule,
1240 struct audit_krule *rule,
1241 enum audit_state *state) 1359 enum audit_state *state)
1242{ 1360{
1243 int i; 1361 int i;
@@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1249 1367
1250 switch (f->type) { 1368 switch (f->type) {
1251 case AUDIT_PID: 1369 case AUDIT_PID:
1252 result = audit_comparator(cb->creds.pid, f->op, f->val); 1370 result = audit_comparator(task_pid_vnr(current), f->op, f->val);
1253 break; 1371 break;
1254 case AUDIT_UID: 1372 case AUDIT_UID:
1255 result = audit_comparator(cb->creds.uid, f->op, f->val); 1373 result = audit_uid_comparator(current_uid(), f->op, f->uid);
1256 break; 1374 break;
1257 case AUDIT_GID: 1375 case AUDIT_GID:
1258 result = audit_comparator(cb->creds.gid, f->op, f->val); 1376 result = audit_gid_comparator(current_gid(), f->op, f->gid);
1259 break; 1377 break;
1260 case AUDIT_LOGINUID: 1378 case AUDIT_LOGINUID:
1261 result = audit_comparator(audit_get_loginuid(current), 1379 result = audit_uid_comparator(audit_get_loginuid(current),
1262 f->op, f->val); 1380 f->op, f->uid);
1263 break; 1381 break;
1264 case AUDIT_SUBJ_USER: 1382 case AUDIT_SUBJ_USER:
1265 case AUDIT_SUBJ_ROLE: 1383 case AUDIT_SUBJ_ROLE:
@@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1287 return 1; 1405 return 1;
1288} 1406}
1289 1407
1290int audit_filter_user(struct netlink_skb_parms *cb) 1408int audit_filter_user(void)
1291{ 1409{
1292 enum audit_state state = AUDIT_DISABLED; 1410 enum audit_state state = AUDIT_DISABLED;
1293 struct audit_entry *e; 1411 struct audit_entry *e;
@@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb)
1295 1413
1296 rcu_read_lock(); 1414 rcu_read_lock();
1297 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1415 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1298 if (audit_filter_user_rules(cb, &e->rule, &state)) { 1416 if (audit_filter_user_rules(&e->rule, &state)) {
1299 if (state == AUDIT_DISABLED) 1417 if (state == AUDIT_DISABLED)
1300 ret = 0; 1418 ret = 0;
1301 break; 1419 break;
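
Note: the auditfilter.c changes above split the old audit_compare_dname_path() into parent_len(), which finds the length of the parent-directory portion of a path, and a comparison that only looks at the final component. A short sketch, with hypothetical path values, of what parent_len() returns:

#include "audit.h"	/* parent_len() */

static void example_parent_len(void)
{
	int a = parent_len("/etc/passwd");	/* 5: the "/etc/" prefix */
	int b = parent_len("/etc/rc.d/");	/* 5: trailing '/' ignored */
	int c = parent_len("passwd");		/* 0: no parent component */

	(void)a; (void)b; (void)c;
}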
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b96415527b8..2f186ed80c40 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -81,9 +81,6 @@
81 * a name dynamically and also add those to the list anchored by names_list. */ 81 * a name dynamically and also add those to the list anchored by names_list. */
82#define AUDIT_NAMES 5 82#define AUDIT_NAMES 5
83 83
84/* Indicates that audit should log the full pathname. */
85#define AUDIT_NAME_FULL -1
86
87/* no execve audit message should be longer than this (userspace limits) */ 84/* no execve audit message should be longer than this (userspace limits) */
88#define MAX_EXECVE_AUDIT_LEN 7500 85#define MAX_EXECVE_AUDIT_LEN 7500
89 86
@@ -106,27 +103,29 @@ struct audit_cap_data {
106 * we don't let putname() free it (instead we free all of the saved 103 * we don't let putname() free it (instead we free all of the saved
107 * pointers at syscall exit time). 104 * pointers at syscall exit time).
108 * 105 *
109 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 106 * Further, in fs/namei.c:path_lookup() we store the inode and device.
107 */
110struct audit_names { 108struct audit_names {
111 struct list_head list; /* audit_context->names_list */ 109 struct list_head list; /* audit_context->names_list */
112 const char *name; 110 struct filename *name;
113 unsigned long ino; 111 unsigned long ino;
114 dev_t dev; 112 dev_t dev;
115 umode_t mode; 113 umode_t mode;
116 uid_t uid; 114 kuid_t uid;
117 gid_t gid; 115 kgid_t gid;
118 dev_t rdev; 116 dev_t rdev;
119 u32 osid; 117 u32 osid;
120 struct audit_cap_data fcap; 118 struct audit_cap_data fcap;
121 unsigned int fcap_ver; 119 unsigned int fcap_ver;
122 int name_len; /* number of name's characters to log */ 120 int name_len; /* number of name's characters to log */
123 bool name_put; /* call __putname() for this name */ 121 unsigned char type; /* record type */
122 bool name_put; /* call __putname() for this name */
124 /* 123 /*
125 * This was an allocated audit_names and not from the array of 124 * This was an allocated audit_names and not from the array of
126 * names allocated in the task audit context. Thus this name 125 * names allocated in the task audit context. Thus this name
127 * should be freed on syscall exit 126 * should be freed on syscall exit
128 */ 127 */
129 bool should_free; 128 bool should_free;
130}; 129};
131 130
132struct audit_aux_data { 131struct audit_aux_data {
@@ -149,8 +148,8 @@ struct audit_aux_data_execve {
149struct audit_aux_data_pids { 148struct audit_aux_data_pids {
150 struct audit_aux_data d; 149 struct audit_aux_data d;
151 pid_t target_pid[AUDIT_AUX_PIDS]; 150 pid_t target_pid[AUDIT_AUX_PIDS];
152 uid_t target_auid[AUDIT_AUX_PIDS]; 151 kuid_t target_auid[AUDIT_AUX_PIDS];
153 uid_t target_uid[AUDIT_AUX_PIDS]; 152 kuid_t target_uid[AUDIT_AUX_PIDS];
154 unsigned int target_sessionid[AUDIT_AUX_PIDS]; 153 unsigned int target_sessionid[AUDIT_AUX_PIDS];
155 u32 target_sid[AUDIT_AUX_PIDS]; 154 u32 target_sid[AUDIT_AUX_PIDS];
156 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; 155 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -208,14 +207,14 @@ struct audit_context {
208 size_t sockaddr_len; 207 size_t sockaddr_len;
209 /* Save things to print about task_struct */ 208 /* Save things to print about task_struct */
210 pid_t pid, ppid; 209 pid_t pid, ppid;
211 uid_t uid, euid, suid, fsuid; 210 kuid_t uid, euid, suid, fsuid;
212 gid_t gid, egid, sgid, fsgid; 211 kgid_t gid, egid, sgid, fsgid;
213 unsigned long personality; 212 unsigned long personality;
214 int arch; 213 int arch;
215 214
216 pid_t target_pid; 215 pid_t target_pid;
217 uid_t target_auid; 216 kuid_t target_auid;
218 uid_t target_uid; 217 kuid_t target_uid;
219 unsigned int target_sessionid; 218 unsigned int target_sessionid;
220 u32 target_sid; 219 u32 target_sid;
221 char target_comm[TASK_COMM_LEN]; 220 char target_comm[TASK_COMM_LEN];
@@ -231,8 +230,8 @@ struct audit_context {
231 long args[6]; 230 long args[6];
232 } socketcall; 231 } socketcall;
233 struct { 232 struct {
234 uid_t uid; 233 kuid_t uid;
235 gid_t gid; 234 kgid_t gid;
236 umode_t mode; 235 umode_t mode;
237 u32 osid; 236 u32 osid;
238 int has_perm; 237 int has_perm;
@@ -464,37 +463,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
464 return 0; 463 return 0;
465} 464}
466 465
467static int audit_compare_id(uid_t uid1, 466static int audit_compare_uid(kuid_t uid,
468 struct audit_names *name, 467 struct audit_names *name,
469 unsigned long name_offset, 468 struct audit_field *f,
470 struct audit_field *f, 469 struct audit_context *ctx)
471 struct audit_context *ctx)
472{ 470{
473 struct audit_names *n; 471 struct audit_names *n;
474 unsigned long addr;
475 uid_t uid2;
476 int rc; 472 int rc;
477 473
478 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
479
480 if (name) { 474 if (name) {
481 addr = (unsigned long)name; 475 rc = audit_uid_comparator(uid, f->op, name->uid);
482 addr += name_offset;
483
484 uid2 = *(uid_t *)addr;
485 rc = audit_comparator(uid1, f->op, uid2);
486 if (rc) 476 if (rc)
487 return rc; 477 return rc;
488 } 478 }
489 479
490 if (ctx) { 480 if (ctx) {
491 list_for_each_entry(n, &ctx->names_list, list) { 481 list_for_each_entry(n, &ctx->names_list, list) {
492 addr = (unsigned long)n; 482 rc = audit_uid_comparator(uid, f->op, n->uid);
493 addr += name_offset; 483 if (rc)
494 484 return rc;
495 uid2 = *(uid_t *)addr; 485 }
486 }
487 return 0;
488}
496 489
497 rc = audit_comparator(uid1, f->op, uid2); 490static int audit_compare_gid(kgid_t gid,
491 struct audit_names *name,
492 struct audit_field *f,
493 struct audit_context *ctx)
494{
495 struct audit_names *n;
496 int rc;
497
498 if (name) {
499 rc = audit_gid_comparator(gid, f->op, name->gid);
500 if (rc)
501 return rc;
502 }
503
504 if (ctx) {
505 list_for_each_entry(n, &ctx->names_list, list) {
506 rc = audit_gid_comparator(gid, f->op, n->gid);
498 if (rc) 507 if (rc)
499 return rc; 508 return rc;
500 } 509 }
@@ -511,80 +520,62 @@ static int audit_field_compare(struct task_struct *tsk,
511 switch (f->val) { 520 switch (f->val) {
512 /* process to file object comparisons */ 521 /* process to file object comparisons */
513 case AUDIT_COMPARE_UID_TO_OBJ_UID: 522 case AUDIT_COMPARE_UID_TO_OBJ_UID:
514 return audit_compare_id(cred->uid, 523 return audit_compare_uid(cred->uid, name, f, ctx);
515 name, offsetof(struct audit_names, uid),
516 f, ctx);
517 case AUDIT_COMPARE_GID_TO_OBJ_GID: 524 case AUDIT_COMPARE_GID_TO_OBJ_GID:
518 return audit_compare_id(cred->gid, 525 return audit_compare_gid(cred->gid, name, f, ctx);
519 name, offsetof(struct audit_names, gid),
520 f, ctx);
521 case AUDIT_COMPARE_EUID_TO_OBJ_UID: 526 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
522 return audit_compare_id(cred->euid, 527 return audit_compare_uid(cred->euid, name, f, ctx);
523 name, offsetof(struct audit_names, uid),
524 f, ctx);
525 case AUDIT_COMPARE_EGID_TO_OBJ_GID: 528 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
526 return audit_compare_id(cred->egid, 529 return audit_compare_gid(cred->egid, name, f, ctx);
527 name, offsetof(struct audit_names, gid),
528 f, ctx);
529 case AUDIT_COMPARE_AUID_TO_OBJ_UID: 530 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
530 return audit_compare_id(tsk->loginuid, 531 return audit_compare_uid(tsk->loginuid, name, f, ctx);
531 name, offsetof(struct audit_names, uid),
532 f, ctx);
533 case AUDIT_COMPARE_SUID_TO_OBJ_UID: 532 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
534 return audit_compare_id(cred->suid, 533 return audit_compare_uid(cred->suid, name, f, ctx);
535 name, offsetof(struct audit_names, uid),
536 f, ctx);
537 case AUDIT_COMPARE_SGID_TO_OBJ_GID: 534 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
538 return audit_compare_id(cred->sgid, 535 return audit_compare_gid(cred->sgid, name, f, ctx);
539 name, offsetof(struct audit_names, gid),
540 f, ctx);
541 case AUDIT_COMPARE_FSUID_TO_OBJ_UID: 536 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
542 return audit_compare_id(cred->fsuid, 537 return audit_compare_uid(cred->fsuid, name, f, ctx);
543 name, offsetof(struct audit_names, uid),
544 f, ctx);
545 case AUDIT_COMPARE_FSGID_TO_OBJ_GID: 538 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
546 return audit_compare_id(cred->fsgid, 539 return audit_compare_gid(cred->fsgid, name, f, ctx);
547 name, offsetof(struct audit_names, gid),
548 f, ctx);
549 /* uid comparisons */ 540 /* uid comparisons */
550 case AUDIT_COMPARE_UID_TO_AUID: 541 case AUDIT_COMPARE_UID_TO_AUID:
551 return audit_comparator(cred->uid, f->op, tsk->loginuid); 542 return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
552 case AUDIT_COMPARE_UID_TO_EUID: 543 case AUDIT_COMPARE_UID_TO_EUID:
553 return audit_comparator(cred->uid, f->op, cred->euid); 544 return audit_uid_comparator(cred->uid, f->op, cred->euid);
554 case AUDIT_COMPARE_UID_TO_SUID: 545 case AUDIT_COMPARE_UID_TO_SUID:
555 return audit_comparator(cred->uid, f->op, cred->suid); 546 return audit_uid_comparator(cred->uid, f->op, cred->suid);
556 case AUDIT_COMPARE_UID_TO_FSUID: 547 case AUDIT_COMPARE_UID_TO_FSUID:
557 return audit_comparator(cred->uid, f->op, cred->fsuid); 548 return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
558 /* auid comparisons */ 549 /* auid comparisons */
559 case AUDIT_COMPARE_AUID_TO_EUID: 550 case AUDIT_COMPARE_AUID_TO_EUID:
560 return audit_comparator(tsk->loginuid, f->op, cred->euid); 551 return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
561 case AUDIT_COMPARE_AUID_TO_SUID: 552 case AUDIT_COMPARE_AUID_TO_SUID:
562 return audit_comparator(tsk->loginuid, f->op, cred->suid); 553 return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
563 case AUDIT_COMPARE_AUID_TO_FSUID: 554 case AUDIT_COMPARE_AUID_TO_FSUID:
564 return audit_comparator(tsk->loginuid, f->op, cred->fsuid); 555 return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
565 /* euid comparisons */ 556 /* euid comparisons */
566 case AUDIT_COMPARE_EUID_TO_SUID: 557 case AUDIT_COMPARE_EUID_TO_SUID:
567 return audit_comparator(cred->euid, f->op, cred->suid); 558 return audit_uid_comparator(cred->euid, f->op, cred->suid);
568 case AUDIT_COMPARE_EUID_TO_FSUID: 559 case AUDIT_COMPARE_EUID_TO_FSUID:
569 return audit_comparator(cred->euid, f->op, cred->fsuid); 560 return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
570 /* suid comparisons */ 561 /* suid comparisons */
571 case AUDIT_COMPARE_SUID_TO_FSUID: 562 case AUDIT_COMPARE_SUID_TO_FSUID:
572 return audit_comparator(cred->suid, f->op, cred->fsuid); 563 return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
573 /* gid comparisons */ 564 /* gid comparisons */
574 case AUDIT_COMPARE_GID_TO_EGID: 565 case AUDIT_COMPARE_GID_TO_EGID:
575 return audit_comparator(cred->gid, f->op, cred->egid); 566 return audit_gid_comparator(cred->gid, f->op, cred->egid);
576 case AUDIT_COMPARE_GID_TO_SGID: 567 case AUDIT_COMPARE_GID_TO_SGID:
577 return audit_comparator(cred->gid, f->op, cred->sgid); 568 return audit_gid_comparator(cred->gid, f->op, cred->sgid);
578 case AUDIT_COMPARE_GID_TO_FSGID: 569 case AUDIT_COMPARE_GID_TO_FSGID:
579 return audit_comparator(cred->gid, f->op, cred->fsgid); 570 return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
580 /* egid comparisons */ 571 /* egid comparisons */
581 case AUDIT_COMPARE_EGID_TO_SGID: 572 case AUDIT_COMPARE_EGID_TO_SGID:
582 return audit_comparator(cred->egid, f->op, cred->sgid); 573 return audit_gid_comparator(cred->egid, f->op, cred->sgid);
583 case AUDIT_COMPARE_EGID_TO_FSGID: 574 case AUDIT_COMPARE_EGID_TO_FSGID:
584 return audit_comparator(cred->egid, f->op, cred->fsgid); 575 return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
585 /* sgid comparison */ 576 /* sgid comparison */
586 case AUDIT_COMPARE_SGID_TO_FSGID: 577 case AUDIT_COMPARE_SGID_TO_FSGID:
587 return audit_comparator(cred->sgid, f->op, cred->fsgid); 578 return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
588 default: 579 default:
589 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); 580 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
590 return 0; 581 return 0;
@@ -630,28 +621,28 @@ static int audit_filter_rules(struct task_struct *tsk,
630 } 621 }
631 break; 622 break;
632 case AUDIT_UID: 623 case AUDIT_UID:
633 result = audit_comparator(cred->uid, f->op, f->val); 624 result = audit_uid_comparator(cred->uid, f->op, f->uid);
634 break; 625 break;
635 case AUDIT_EUID: 626 case AUDIT_EUID:
636 result = audit_comparator(cred->euid, f->op, f->val); 627 result = audit_uid_comparator(cred->euid, f->op, f->uid);
637 break; 628 break;
638 case AUDIT_SUID: 629 case AUDIT_SUID:
639 result = audit_comparator(cred->suid, f->op, f->val); 630 result = audit_uid_comparator(cred->suid, f->op, f->uid);
640 break; 631 break;
641 case AUDIT_FSUID: 632 case AUDIT_FSUID:
642 result = audit_comparator(cred->fsuid, f->op, f->val); 633 result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
643 break; 634 break;
644 case AUDIT_GID: 635 case AUDIT_GID:
645 result = audit_comparator(cred->gid, f->op, f->val); 636 result = audit_gid_comparator(cred->gid, f->op, f->gid);
646 break; 637 break;
647 case AUDIT_EGID: 638 case AUDIT_EGID:
648 result = audit_comparator(cred->egid, f->op, f->val); 639 result = audit_gid_comparator(cred->egid, f->op, f->gid);
649 break; 640 break;
650 case AUDIT_SGID: 641 case AUDIT_SGID:
651 result = audit_comparator(cred->sgid, f->op, f->val); 642 result = audit_gid_comparator(cred->sgid, f->op, f->gid);
652 break; 643 break;
653 case AUDIT_FSGID: 644 case AUDIT_FSGID:
654 result = audit_comparator(cred->fsgid, f->op, f->val); 645 result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
655 break; 646 break;
656 case AUDIT_PERS: 647 case AUDIT_PERS:
657 result = audit_comparator(tsk->personality, f->op, f->val); 648 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -717,10 +708,10 @@ static int audit_filter_rules(struct task_struct *tsk,
717 break; 708 break;
718 case AUDIT_OBJ_UID: 709 case AUDIT_OBJ_UID:
719 if (name) { 710 if (name) {
720 result = audit_comparator(name->uid, f->op, f->val); 711 result = audit_uid_comparator(name->uid, f->op, f->uid);
721 } else if (ctx) { 712 } else if (ctx) {
722 list_for_each_entry(n, &ctx->names_list, list) { 713 list_for_each_entry(n, &ctx->names_list, list) {
723 if (audit_comparator(n->uid, f->op, f->val)) { 714 if (audit_uid_comparator(n->uid, f->op, f->uid)) {
724 ++result; 715 ++result;
725 break; 716 break;
726 } 717 }
@@ -729,10 +720,10 @@ static int audit_filter_rules(struct task_struct *tsk,
729 break; 720 break;
730 case AUDIT_OBJ_GID: 721 case AUDIT_OBJ_GID:
731 if (name) { 722 if (name) {
732 result = audit_comparator(name->gid, f->op, f->val); 723 result = audit_gid_comparator(name->gid, f->op, f->gid);
733 } else if (ctx) { 724 } else if (ctx) {
734 list_for_each_entry(n, &ctx->names_list, list) { 725 list_for_each_entry(n, &ctx->names_list, list) {
735 if (audit_comparator(n->gid, f->op, f->val)) { 726 if (audit_gid_comparator(n->gid, f->op, f->gid)) {
736 ++result; 727 ++result;
737 break; 728 break;
738 } 729 }
@@ -750,7 +741,7 @@ static int audit_filter_rules(struct task_struct *tsk,
750 case AUDIT_LOGINUID: 741 case AUDIT_LOGINUID:
751 result = 0; 742 result = 0;
752 if (ctx) 743 if (ctx)
753 result = audit_comparator(tsk->loginuid, f->op, f->val); 744 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
754 break; 745 break;
755 case AUDIT_SUBJ_USER: 746 case AUDIT_SUBJ_USER:
756 case AUDIT_SUBJ_ROLE: 747 case AUDIT_SUBJ_ROLE:
@@ -1006,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context)
1006 context->ino_count); 997 context->ino_count);
1007 list_for_each_entry(n, &context->names_list, list) { 998 list_for_each_entry(n, &context->names_list, list) {
1008 printk(KERN_ERR "names[%d] = %p = %s\n", i, 999 printk(KERN_ERR "names[%d] = %p = %s\n", i,
1009 n->name, n->name ?: "(null)"); 1000 n->name, n->name->name ?: "(null)");
1010 } 1001 }
1011 dump_stack(); 1002 dump_stack();
1012 return; 1003 return;
@@ -1154,13 +1145,43 @@ error_path:
1154 1145
1155EXPORT_SYMBOL(audit_log_task_context); 1146EXPORT_SYMBOL(audit_log_task_context);
1156 1147
1157static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1148void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1158{ 1149{
1150 const struct cred *cred;
1159 char name[sizeof(tsk->comm)]; 1151 char name[sizeof(tsk->comm)];
1160 struct mm_struct *mm = tsk->mm; 1152 struct mm_struct *mm = tsk->mm;
1161 struct vm_area_struct *vma; 1153 char *tty;
1154
1155 if (!ab)
1156 return;
1162 1157
1163 /* tsk == current */ 1158 /* tsk == current */
1159 cred = current_cred();
1160
1161 spin_lock_irq(&tsk->sighand->siglock);
1162 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1163 tty = tsk->signal->tty->name;
1164 else
1165 tty = "(none)";
1166 spin_unlock_irq(&tsk->sighand->siglock);
1167
1168
1169 audit_log_format(ab,
1170 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1171 " euid=%u suid=%u fsuid=%u"
1172 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1173 sys_getppid(),
1174 tsk->pid,
1175 from_kuid(&init_user_ns, tsk->loginuid),
1176 from_kuid(&init_user_ns, cred->uid),
1177 from_kgid(&init_user_ns, cred->gid),
1178 from_kuid(&init_user_ns, cred->euid),
1179 from_kuid(&init_user_ns, cred->suid),
1180 from_kuid(&init_user_ns, cred->fsuid),
1181 from_kgid(&init_user_ns, cred->egid),
1182 from_kgid(&init_user_ns, cred->sgid),
1183 from_kgid(&init_user_ns, cred->fsgid),
1184 tsk->sessionid, tty);
1164 1185
1165 get_task_comm(name, tsk); 1186 get_task_comm(name, tsk);
1166 audit_log_format(ab, " comm="); 1187 audit_log_format(ab, " comm=");
@@ -1168,23 +1189,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
1168 1189
1169 if (mm) { 1190 if (mm) {
1170 down_read(&mm->mmap_sem); 1191 down_read(&mm->mmap_sem);
1171 vma = mm->mmap; 1192 if (mm->exe_file)
1172 while (vma) { 1193 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1173 if ((vma->vm_flags & VM_EXECUTABLE) &&
1174 vma->vm_file) {
1175 audit_log_d_path(ab, " exe=",
1176 &vma->vm_file->f_path);
1177 break;
1178 }
1179 vma = vma->vm_next;
1180 }
1181 up_read(&mm->mmap_sem); 1194 up_read(&mm->mmap_sem);
1182 } 1195 }
1183 audit_log_task_context(ab); 1196 audit_log_task_context(ab);
1184} 1197}
1185 1198
1199EXPORT_SYMBOL(audit_log_task_info);
1200
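
audit_log_task_info() now gathers the credentials itself and converts each kuid_t/kgid_t back to a number with from_kuid()/from_kgid() against init_user_ns before logging. A toy user-space model of that translation, with an invented userns_model structure standing in for the kernel's uid_gid_map (the extent layout and names here are assumptions for illustration only):

#include <stdio.h>

typedef struct { unsigned int val; } xkuid_t;           /* stand-in for kuid_t */

struct uid_extent { unsigned int first, lower_first, count; };
struct userns_model { struct uid_extent map[4]; int nr; };

/* translate a kernel-wide id into the number this namespace would see */
static unsigned int model_from_kuid(const struct userns_model *ns, xkuid_t kuid)
{
	for (int i = 0; i < ns->nr; i++) {
		const struct uid_extent *e = &ns->map[i];

		if (kuid.val >= e->lower_first &&
		    kuid.val < e->lower_first + e->count)
			return e->first + (kuid.val - e->lower_first);
	}
	return (unsigned int)-1;                /* unmapped: overflow uid */
}

int main(void)
{
	/* one extent: in-namespace uids 0..999 backed by host uids 100000.. */
	struct userns_model container = { .map = { { 0, 100000, 1000 } }, .nr = 1 };
	xkuid_t task_uid = { 100042 };

	printf("uid inside ns: %u\n", model_from_kuid(&container, task_uid)); /* 42 */
	return 0;
}

Converting through from_kuid(&init_user_ns, ...) keeps audit records in the initial namespace's numbering even when the task runs inside a user namespace with a shifted mapping.
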
1186static int audit_log_pid_context(struct audit_context *context, pid_t pid, 1201static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1187 uid_t auid, uid_t uid, unsigned int sessionid, 1202 kuid_t auid, kuid_t uid, unsigned int sessionid,
1188 u32 sid, char *comm) 1203 u32 sid, char *comm)
1189{ 1204{
1190 struct audit_buffer *ab; 1205 struct audit_buffer *ab;
@@ -1196,8 +1211,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1196 if (!ab) 1211 if (!ab)
1197 return rc; 1212 return rc;
1198 1213
1199 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, 1214 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
1200 uid, sessionid); 1215 from_kuid(&init_user_ns, auid),
1216 from_kuid(&init_user_ns, uid), sessionid);
1201 if (security_secid_to_secctx(sid, &ctx, &len)) { 1217 if (security_secid_to_secctx(sid, &ctx, &len)) {
1202 audit_log_format(ab, " obj=(none)"); 1218 audit_log_format(ab, " obj=(none)");
1203 rc = 1; 1219 rc = 1;
@@ -1447,7 +1463,9 @@ static void show_special(struct audit_context *context, int *call_panic)
1447 u32 osid = context->ipc.osid; 1463 u32 osid = context->ipc.osid;
1448 1464
1449 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", 1465 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1450 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1466 from_kuid(&init_user_ns, context->ipc.uid),
1467 from_kgid(&init_user_ns, context->ipc.gid),
1468 context->ipc.mode);
1451 if (osid) { 1469 if (osid) {
1452 char *ctx = NULL; 1470 char *ctx = NULL;
1453 u32 len; 1471 u32 len;
@@ -1536,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1536 case AUDIT_NAME_FULL: 1554 case AUDIT_NAME_FULL:
1537 /* log the full path */ 1555 /* log the full path */
1538 audit_log_format(ab, " name="); 1556 audit_log_format(ab, " name=");
1539 audit_log_untrustedstring(ab, n->name); 1557 audit_log_untrustedstring(ab, n->name->name);
1540 break; 1558 break;
1541 case 0: 1559 case 0:
1542 /* name was specified as a relative path and the 1560 /* name was specified as a relative path and the
@@ -1546,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1546 default: 1564 default:
1547 /* log the name's directory component */ 1565 /* log the name's directory component */
1548 audit_log_format(ab, " name="); 1566 audit_log_format(ab, " name=");
1549 audit_log_n_untrustedstring(ab, n->name, 1567 audit_log_n_untrustedstring(ab, n->name->name,
1550 n->name_len); 1568 n->name_len);
1551 } 1569 }
1552 } else 1570 } else
@@ -1560,8 +1578,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1560 MAJOR(n->dev), 1578 MAJOR(n->dev),
1561 MINOR(n->dev), 1579 MINOR(n->dev),
1562 n->mode, 1580 n->mode,
1563 n->uid, 1581 from_kuid(&init_user_ns, n->uid),
1564 n->gid, 1582 from_kgid(&init_user_ns, n->gid),
1565 MAJOR(n->rdev), 1583 MAJOR(n->rdev),
1566 MINOR(n->rdev)); 1584 MINOR(n->rdev));
1567 } 1585 }
@@ -1585,26 +1603,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
1585 1603
1586static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1604static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1587{ 1605{
1588 const struct cred *cred;
1589 int i, call_panic = 0; 1606 int i, call_panic = 0;
1590 struct audit_buffer *ab; 1607 struct audit_buffer *ab;
1591 struct audit_aux_data *aux; 1608 struct audit_aux_data *aux;
1592 const char *tty;
1593 struct audit_names *n; 1609 struct audit_names *n;
1594 1610
1595 /* tsk == current */ 1611 /* tsk == current */
1596 context->pid = tsk->pid;
1597 if (!context->ppid)
1598 context->ppid = sys_getppid();
1599 cred = current_cred();
1600 context->uid = cred->uid;
1601 context->gid = cred->gid;
1602 context->euid = cred->euid;
1603 context->suid = cred->suid;
1604 context->fsuid = cred->fsuid;
1605 context->egid = cred->egid;
1606 context->sgid = cred->sgid;
1607 context->fsgid = cred->fsgid;
1608 context->personality = tsk->personality; 1612 context->personality = tsk->personality;
1609 1613
1610 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1614 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1619,32 +1623,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1619 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1623 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1620 context->return_code); 1624 context->return_code);
1621 1625
1622 spin_lock_irq(&tsk->sighand->siglock);
1623 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1624 tty = tsk->signal->tty->name;
1625 else
1626 tty = "(none)";
1627 spin_unlock_irq(&tsk->sighand->siglock);
1628
1629 audit_log_format(ab, 1626 audit_log_format(ab,
1630 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1627 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
1631 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1628 context->argv[0],
1632 " euid=%u suid=%u fsuid=%u" 1629 context->argv[1],
1633 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", 1630 context->argv[2],
1634 context->argv[0], 1631 context->argv[3],
1635 context->argv[1], 1632 context->name_count);
1636 context->argv[2],
1637 context->argv[3],
1638 context->name_count,
1639 context->ppid,
1640 context->pid,
1641 tsk->loginuid,
1642 context->uid,
1643 context->gid,
1644 context->euid, context->suid, context->fsuid,
1645 context->egid, context->sgid, context->fsgid, tty,
1646 tsk->sessionid);
1647
1648 1633
1649 audit_log_task_info(ab, tsk); 1634 audit_log_task_info(ab, tsk);
1650 audit_log_key(ab, context->filterkey); 1635 audit_log_key(ab, context->filterkey);
@@ -2009,7 +1994,8 @@ retry:
2009#endif 1994#endif
2010} 1995}
2011 1996
2012static struct audit_names *audit_alloc_name(struct audit_context *context) 1997static struct audit_names *audit_alloc_name(struct audit_context *context,
1998 unsigned char type)
2013{ 1999{
2014 struct audit_names *aname; 2000 struct audit_names *aname;
2015 2001
@@ -2024,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
2024 } 2010 }
2025 2011
2026 aname->ino = (unsigned long)-1; 2012 aname->ino = (unsigned long)-1;
2013 aname->type = type;
2027 list_add_tail(&aname->list, &context->names_list); 2014 list_add_tail(&aname->list, &context->names_list);
2028 2015
2029 context->name_count++; 2016 context->name_count++;
@@ -2034,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
2034} 2021}
2035 2022
2036/** 2023/**
2024 * audit_reusename - fill out filename with info from existing entry
2025 * @uptr: userland ptr to pathname
2026 *
2027 * Search the audit_names list for the current audit context. If there is an
2028 * existing entry with a matching "uptr" then return the filename
2029 * associated with that audit_name. If not, return NULL.
2030 */
2031struct filename *
2032__audit_reusename(const __user char *uptr)
2033{
2034 struct audit_context *context = current->audit_context;
2035 struct audit_names *n;
2036
2037 list_for_each_entry(n, &context->names_list, list) {
2038 if (!n->name)
2039 continue;
2040 if (n->name->uptr == uptr)
2041 return n->name;
2042 }
2043 return NULL;
2044}
2045
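
__audit_reusename() lets getname() hand back an already-recorded struct filename when the same userland pointer is looked up twice within one syscall, instead of copying the path in again. A self-contained user-space sketch of the caching idea, with made-up filename_rec/get_name() names; like the loop above, the lookup keys on the saved userland pointer, not on the string contents:

#include <stdio.h>
#include <stdlib.h>

struct filename_rec {
	const char *uptr;               /* userland pointer we were built from */
	char name[64];                  /* copied-in pathname */
	struct filename_rec *next;
};

static struct filename_rec *names_list;

static struct filename_rec *reuse_name(const char *uptr)
{
	for (struct filename_rec *n = names_list; n; n = n->next)
		if (n->uptr == uptr)    /* pointer identity, not a strcmp() */
			return n;
	return NULL;
}

static struct filename_rec *get_name(const char *uptr)
{
	struct filename_rec *n = reuse_name(uptr);

	if (n)
		return n;               /* second lookup of the same arg: cached */
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;
	n->uptr = uptr;
	snprintf(n->name, sizeof(n->name), "%s", uptr); /* the "copy_from_user" */
	n->next = names_list;
	names_list = n;
	return n;
}

int main(void)
{
	const char *user_path = "/tmp/demo";
	struct filename_rec *a = get_name(user_path);
	struct filename_rec *b = get_name(user_path);

	printf("reused: %s\n", a == b ? "yes" : "no");  /* prints "yes" */
	return 0;
}
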
2046/**
2037 * audit_getname - add a name to the list 2047 * audit_getname - add a name to the list
2038 * @name: name to add 2048 * @name: name to add
2039 * 2049 *
2040 * Add a name to the list of audit names for this context. 2050 * Add a name to the list of audit names for this context.
2041 * Called from fs/namei.c:getname(). 2051 * Called from fs/namei.c:getname().
2042 */ 2052 */
2043void __audit_getname(const char *name) 2053void __audit_getname(struct filename *name)
2044{ 2054{
2045 struct audit_context *context = current->audit_context; 2055 struct audit_context *context = current->audit_context;
2046 struct audit_names *n; 2056 struct audit_names *n;
@@ -2054,13 +2064,19 @@ void __audit_getname(const char *name)
2054 return; 2064 return;
2055 } 2065 }
2056 2066
2057 n = audit_alloc_name(context); 2067#if AUDIT_DEBUG
2068 /* The filename _must_ have a populated ->name */
2069 BUG_ON(!name->name);
2070#endif
2071
2072 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
2058 if (!n) 2073 if (!n)
2059 return; 2074 return;
2060 2075
2061 n->name = name; 2076 n->name = name;
2062 n->name_len = AUDIT_NAME_FULL; 2077 n->name_len = AUDIT_NAME_FULL;
2063 n->name_put = true; 2078 n->name_put = true;
2079 name->aname = n;
2064 2080
2065 if (!context->pwd.dentry) 2081 if (!context->pwd.dentry)
2066 get_fs_pwd(current->fs, &context->pwd); 2082 get_fs_pwd(current->fs, &context->pwd);
@@ -2073,7 +2089,7 @@ void __audit_getname(const char *name)
2073 * then we delay the putname until syscall exit. 2089 * then we delay the putname until syscall exit.
2074 * Called from include/linux/fs.h:putname(). 2090 * Called from include/linux/fs.h:putname().
2075 */ 2091 */
2076void audit_putname(const char *name) 2092void audit_putname(struct filename *name)
2077{ 2093{
2078 struct audit_context *context = current->audit_context; 2094 struct audit_context *context = current->audit_context;
2079 2095
@@ -2088,7 +2104,7 @@ void audit_putname(const char *name)
2088 2104
2089 list_for_each_entry(n, &context->names_list, list) 2105 list_for_each_entry(n, &context->names_list, list)
2090 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2106 printk(KERN_ERR "name[%d] = %p = %s\n", i,
2091 n->name, n->name ?: "(null)"); 2107 n->name, n->name->name ?: "(null)");
2092 } 2108 }
2093#endif 2109#endif
2094 __putname(name); 2110 __putname(name);
@@ -2102,8 +2118,8 @@ void audit_putname(const char *name)
2102 " put_count=%d\n", 2118 " put_count=%d\n",
2103 __FILE__, __LINE__, 2119 __FILE__, __LINE__,
2104 context->serial, context->major, 2120 context->serial, context->major,
2105 context->in_syscall, name, context->name_count, 2121 context->in_syscall, name->name,
2106 context->put_count); 2122 context->name_count, context->put_count);
2107 dump_stack(); 2123 dump_stack();
2108 } 2124 }
2109 } 2125 }
@@ -2146,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
2146} 2162}
2147 2163
2148/** 2164/**
2149 * audit_inode - store the inode and device from a lookup 2165 * __audit_inode - store the inode and device from a lookup
2150 * @name: name being audited 2166 * @name: name being audited
2151 * @dentry: dentry being audited 2167 * @dentry: dentry being audited
2152 * 2168 * @parent: does this dentry represent the parent?
2153 * Called from fs/namei.c:path_lookup().
2154 */ 2169 */
2155void __audit_inode(const char *name, const struct dentry *dentry) 2170void __audit_inode(struct filename *name, const struct dentry *dentry,
2171 unsigned int parent)
2156{ 2172{
2157 struct audit_context *context = current->audit_context; 2173 struct audit_context *context = current->audit_context;
2158 const struct inode *inode = dentry->d_inode; 2174 const struct inode *inode = dentry->d_inode;
@@ -2161,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2161 if (!context->in_syscall) 2177 if (!context->in_syscall)
2162 return; 2178 return;
2163 2179
2180 if (!name)
2181 goto out_alloc;
2182
2183#if AUDIT_DEBUG
2184 /* The struct filename _must_ have a populated ->name */
2185 BUG_ON(!name->name);
2186#endif
2187 /*
2188 * If we have a pointer to an audit_names entry already, then we can
2189 * just use it directly if the type is correct.
2190 */
2191 n = name->aname;
2192 if (n) {
2193 if (parent) {
2194 if (n->type == AUDIT_TYPE_PARENT ||
2195 n->type == AUDIT_TYPE_UNKNOWN)
2196 goto out;
2197 } else {
2198 if (n->type != AUDIT_TYPE_PARENT)
2199 goto out;
2200 }
2201 }
2202
2164 list_for_each_entry_reverse(n, &context->names_list, list) { 2203 list_for_each_entry_reverse(n, &context->names_list, list) {
2165 if (n->name && (n->name == name)) 2204 /* does the name pointer match? */
2166 goto out; 2205 if (!n->name || n->name->name != name->name)
2206 continue;
2207
2208 /* match the correct record type */
2209 if (parent) {
2210 if (n->type == AUDIT_TYPE_PARENT ||
2211 n->type == AUDIT_TYPE_UNKNOWN)
2212 goto out;
2213 } else {
2214 if (n->type != AUDIT_TYPE_PARENT)
2215 goto out;
2216 }
2167 } 2217 }
2168 2218
2169 /* unable to find the name from a previous getname() */ 2219out_alloc:
2170 n = audit_alloc_name(context); 2220 /* unable to find the name from a previous getname(). Allocate a new
2221 * anonymous entry.
2222 */
2223 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
2171 if (!n) 2224 if (!n)
2172 return; 2225 return;
2173out: 2226out:
2227 if (parent) {
2228 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
2229 n->type = AUDIT_TYPE_PARENT;
2230 } else {
2231 n->name_len = AUDIT_NAME_FULL;
2232 n->type = AUDIT_TYPE_NORMAL;
2233 }
2174 handle_path(dentry); 2234 handle_path(dentry);
2175 audit_copy_inode(n, dentry, inode); 2235 audit_copy_inode(n, dentry, inode);
2176} 2236}
2177 2237
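
When __audit_inode() is told the dentry is a parent, it records only the directory component of the saved name via parent_len(). A rough user-space sketch of what such a helper computes, under the assumption that it returns the length of the leading directory part with trailing slashes on the final component ignored; this mirrors the idea, not the kernel's exact implementation:

#include <stdio.h>
#include <string.h>

static size_t parent_component_len(const char *path)
{
	size_t len = strlen(path);

	/* ignore trailing slashes on the final component */
	while (len > 1 && path[len - 1] == '/')
		len--;
	/* strip the final component itself */
	while (len > 0 && path[len - 1] != '/')
		len--;
	return len;     /* 0 means "no parent directory in the string" */
}

int main(void)
{
	printf("%zu\n", parent_component_len("/etc/passwd"));   /* 5 -> "/etc/" */
	printf("%zu\n", parent_component_len("dir/file/"));     /* 4 -> "dir/"  */
	printf("%zu\n", parent_component_len("file"));          /* 0           */
	return 0;
}
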
2178/** 2238/**
2179 * audit_inode_child - collect inode info for created/removed objects 2239 * __audit_inode_child - collect inode info for created/removed objects
2180 * @dentry: dentry being audited
2181 * @parent: inode of dentry parent 2240 * @parent: inode of dentry parent
2241 * @dentry: dentry being audited
2242 * @type: AUDIT_TYPE_* value that we're looking for
2182 * 2243 *
2183 * For syscalls that create or remove filesystem objects, audit_inode 2244 * For syscalls that create or remove filesystem objects, audit_inode
2184 * can only collect information for the filesystem object's parent. 2245 * can only collect information for the filesystem object's parent.
@@ -2188,15 +2249,14 @@ out:
2188 * must be hooked prior, in order to capture the target inode during 2249 * must be hooked prior, in order to capture the target inode during
2189 * unsuccessful attempts. 2250 * unsuccessful attempts.
2190 */ 2251 */
2191void __audit_inode_child(const struct dentry *dentry, 2252void __audit_inode_child(const struct inode *parent,
2192 const struct inode *parent) 2253 const struct dentry *dentry,
2254 const unsigned char type)
2193{ 2255{
2194 struct audit_context *context = current->audit_context; 2256 struct audit_context *context = current->audit_context;
2195 const char *found_parent = NULL, *found_child = NULL;
2196 const struct inode *inode = dentry->d_inode; 2257 const struct inode *inode = dentry->d_inode;
2197 const char *dname = dentry->d_name.name; 2258 const char *dname = dentry->d_name.name;
2198 struct audit_names *n; 2259 struct audit_names *n, *found_parent = NULL, *found_child = NULL;
2199 int dirlen = 0;
2200 2260
2201 if (!context->in_syscall) 2261 if (!context->in_syscall)
2202 return; 2262 return;
@@ -2204,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry,
2204 if (inode) 2264 if (inode)
2205 handle_one(inode); 2265 handle_one(inode);
2206 2266
2207 /* parent is more likely, look for it first */ 2267 /* look for a parent entry first */
2208 list_for_each_entry(n, &context->names_list, list) { 2268 list_for_each_entry(n, &context->names_list, list) {
2209 if (!n->name) 2269 if (!n->name || n->type != AUDIT_TYPE_PARENT)
2210 continue; 2270 continue;
2211 2271
2212 if (n->ino == parent->i_ino && 2272 if (n->ino == parent->i_ino &&
2213 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2273 !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
2214 n->name_len = dirlen; /* update parent data in place */ 2274 found_parent = n;
2215 found_parent = n->name; 2275 break;
2216 goto add_names;
2217 } 2276 }
2218 } 2277 }
2219 2278
2220 /* no matching parent, look for matching child */ 2279 /* is there a matching child entry? */
2221 list_for_each_entry(n, &context->names_list, list) { 2280 list_for_each_entry(n, &context->names_list, list) {
2222 if (!n->name) 2281 /* can only match entries that have a name */
2282 if (!n->name || n->type != type)
2223 continue; 2283 continue;
2224 2284
2225 /* strcmp() is the more likely scenario */ 2285 /* if we found a parent, make sure this one is a child of it */
2226 if (!strcmp(dname, n->name) || 2286 if (found_parent && (n->name != found_parent->name))
2227 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2287 continue;
2228 if (inode) 2288
2229 audit_copy_inode(n, NULL, inode); 2289 if (!strcmp(dname, n->name->name) ||
2230 else 2290 !audit_compare_dname_path(dname, n->name->name,
2231 n->ino = (unsigned long)-1; 2291 found_parent ?
2232 found_child = n->name; 2292 found_parent->name_len :
2233 goto add_names; 2293 AUDIT_NAME_FULL)) {
2294 found_child = n;
2295 break;
2234 } 2296 }
2235 } 2297 }
2236 2298
2237add_names:
2238 if (!found_parent) { 2299 if (!found_parent) {
2239 n = audit_alloc_name(context); 2300 /* create a new, "anonymous" parent record */
2301 n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
2240 if (!n) 2302 if (!n)
2241 return; 2303 return;
2242 audit_copy_inode(n, NULL, parent); 2304 audit_copy_inode(n, NULL, parent);
2243 } 2305 }
2244 2306
2245 if (!found_child) { 2307 if (!found_child) {
2246 n = audit_alloc_name(context); 2308 found_child = audit_alloc_name(context, type);
2247 if (!n) 2309 if (!found_child)
2248 return; 2310 return;
2249 2311
2250 /* Re-use the name belonging to the slot for a matching parent 2312 /* Re-use the name belonging to the slot for a matching parent
2251 * directory. All names for this context are relinquished in 2313 * directory. All names for this context are relinquished in
2252 * audit_free_names() */ 2314 * audit_free_names() */
2253 if (found_parent) { 2315 if (found_parent) {
2254 n->name = found_parent; 2316 found_child->name = found_parent->name;
2255 n->name_len = AUDIT_NAME_FULL; 2317 found_child->name_len = AUDIT_NAME_FULL;
2256 /* don't call __putname() */ 2318 /* don't call __putname() */
2257 n->name_put = false; 2319 found_child->name_put = false;
2258 } 2320 }
2259
2260 if (inode)
2261 audit_copy_inode(n, NULL, inode);
2262 } 2321 }
2322 if (inode)
2323 audit_copy_inode(found_child, dentry, inode);
2324 else
2325 found_child->ino = (unsigned long)-1;
2263} 2326}
2264EXPORT_SYMBOL_GPL(__audit_inode_child); 2327EXPORT_SYMBOL_GPL(__audit_inode_child);
2265 2328
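
The child lookup in __audit_inode_child() matches a recorded pathname against the dentry name either by a plain strcmp() or, when a parent length is known, by comparing only the final component. A simplified stand-in for that audit_compare_dname_path()-style test; the helper below is invented for illustration and returns 0 on a match, following the strcmp() convention:

#include <stdio.h>
#include <string.h>

static int dname_matches_path(const char *dname, const char *path,
			      size_t parentlen)
{
	const char *last;

	if (parentlen) {
		last = path + parentlen;    /* caller knows where the parent ends */
	} else {
		const char *slash = strrchr(path, '/');

		last = slash ? slash + 1 : path;
	}
	return strcmp(dname, last);
}

int main(void)
{
	printf("%d\n", dname_matches_path("passwd", "/etc/passwd", 5)); /* 0: match */
	printf("%d\n", dname_matches_path("shadow", "/etc/passwd", 0)); /* non-zero */
	return 0;
}
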
@@ -2299,14 +2362,14 @@ static atomic_t session_id = ATOMIC_INIT(0);
2299 * 2362 *
2300 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2363 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2301 */ 2364 */
2302int audit_set_loginuid(uid_t loginuid) 2365int audit_set_loginuid(kuid_t loginuid)
2303{ 2366{
2304 struct task_struct *task = current; 2367 struct task_struct *task = current;
2305 struct audit_context *context = task->audit_context; 2368 struct audit_context *context = task->audit_context;
2306 unsigned int sessionid; 2369 unsigned int sessionid;
2307 2370
2308#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE 2371#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2309 if (task->loginuid != -1) 2372 if (uid_valid(task->loginuid))
2310 return -EPERM; 2373 return -EPERM;
2311#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ 2374#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2312 if (!capable(CAP_AUDIT_CONTROL)) 2375 if (!capable(CAP_AUDIT_CONTROL))
@@ -2322,8 +2385,10 @@ int audit_set_loginuid(uid_t loginuid)
2322 audit_log_format(ab, "login pid=%d uid=%u " 2385 audit_log_format(ab, "login pid=%d uid=%u "
2323 "old auid=%u new auid=%u" 2386 "old auid=%u new auid=%u"
2324 " old ses=%u new ses=%u", 2387 " old ses=%u new ses=%u",
2325 task->pid, task_uid(task), 2388 task->pid,
2326 task->loginuid, loginuid, 2389 from_kuid(&init_user_ns, task_uid(task)),
2390 from_kuid(&init_user_ns, task->loginuid),
2391 from_kuid(&init_user_ns, loginuid),
2327 task->sessionid, sessionid); 2392 task->sessionid, sessionid);
2328 audit_log_end(ab); 2393 audit_log_end(ab);
2329 } 2394 }
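
With CONFIG_AUDIT_LOGINUID_IMMUTABLE, the guard above changes from a "!= -1" test to uid_valid(), since -1 is no longer a value the opaque kuid_t can be compared against directly. A minimal illustration of the resulting "set once" rule, using a simplified xuid_t stand-in and the usual (unsigned int)-1 sentinel (these are assumptions for the sketch, not the kernel types):

#include <errno.h>
#include <stdio.h>

typedef struct { unsigned int val; } xuid_t;            /* stand-in for kuid_t */

static int xuid_valid(xuid_t u) { return u.val != (unsigned int)-1; }

static xuid_t loginuid = { (unsigned int)-1 };          /* "not yet set" */

static int set_loginuid_immutable(xuid_t new)
{
	if (xuid_valid(loginuid))
		return -EPERM;          /* already set once: refuse */
	loginuid = new;
	return 0;
}

int main(void)
{
	printf("%d\n", set_loginuid_immutable((xuid_t){ 1000 }));   /* 0 */
	printf("%d\n", set_loginuid_immutable((xuid_t){ 1001 }));   /* -1 (EPERM) */
	return 0;
}
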
@@ -2546,12 +2611,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
2546 struct audit_aux_data_pids *axp; 2611 struct audit_aux_data_pids *axp;
2547 struct task_struct *tsk = current; 2612 struct task_struct *tsk = current;
2548 struct audit_context *ctx = tsk->audit_context; 2613 struct audit_context *ctx = tsk->audit_context;
2549 uid_t uid = current_uid(), t_uid = task_uid(t); 2614 kuid_t uid = current_uid(), t_uid = task_uid(t);
2550 2615
2551 if (audit_pid && t->tgid == audit_pid) { 2616 if (audit_pid && t->tgid == audit_pid) {
2552 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2617 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2553 audit_sig_pid = tsk->pid; 2618 audit_sig_pid = tsk->pid;
2554 if (tsk->loginuid != -1) 2619 if (uid_valid(tsk->loginuid))
2555 audit_sig_uid = tsk->loginuid; 2620 audit_sig_uid = tsk->loginuid;
2556 else 2621 else
2557 audit_sig_uid = uid; 2622 audit_sig_uid = uid;
@@ -2672,8 +2737,8 @@ void __audit_mmap_fd(int fd, int flags)
2672 2737
2673static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) 2738static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2674{ 2739{
2675 uid_t auid, uid; 2740 kuid_t auid, uid;
2676 gid_t gid; 2741 kgid_t gid;
2677 unsigned int sessionid; 2742 unsigned int sessionid;
2678 2743
2679 auid = audit_get_loginuid(current); 2744 auid = audit_get_loginuid(current);
@@ -2681,7 +2746,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2681 current_uid_gid(&uid, &gid); 2746 current_uid_gid(&uid, &gid);
2682 2747
2683 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2748 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2684 auid, uid, gid, sessionid); 2749 from_kuid(&init_user_ns, auid),
2750 from_kuid(&init_user_ns, uid),
2751 from_kgid(&init_user_ns, gid),
2752 sessionid);
2685 audit_log_task_context(ab); 2753 audit_log_task_context(ab);
2686 audit_log_format(ab, " pid=%d comm=", current->pid); 2754 audit_log_format(ab, " pid=%d comm=", current->pid);
2687 audit_log_untrustedstring(ab, current->comm); 2755 audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..13774b3b39aa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
88 88
89/* 89/*
90 * Generate an array of cgroup subsystem pointers. At boot time, this is 90 * Generate an array of cgroup subsystem pointers. At boot time, this is
 91 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are 91 * populated with the built-in subsystems, and modular subsystems are
92 * registered after that. The mutable section of this array is protected by 92 * registered after that. The mutable section of this array is protected by
93 * cgroup_mutex. 93 * cgroup_mutex.
94 */ 94 */
95#define SUBSYS(_x) &_x ## _subsys, 95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
96static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
97#include <linux/cgroup_subsys.h> 98#include <linux/cgroup_subsys.h>
98}; 99};
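
The new SUBSYS() definition places each built-in subsystem at its _subsys_id slot with a designated initializer, so slots reserved for modular subsystems simply stay NULL until those modules register. A compilable illustration of the pattern, with invented subsystem names standing in for the real controllers:

#include <stdio.h>

struct subsys { const char *name; };

static struct subsys cpu_like = { "cpu-like" };
static struct subsys mem_like = { "mem-like" };

enum { cpu_like_id, mem_like_id, modular_id, SUBSYS_COUNT };

#define SUBSYS(x) [x##_id] = &x,
static struct subsys *subsys[SUBSYS_COUNT] = {
	SUBSYS(cpu_like)
	SUBSYS(mem_like)
	/* modular_id intentionally left NULL until registration time */
};
#undef SUBSYS

int main(void)
{
	for (int i = 0; i < SUBSYS_COUNT; i++)
		printf("slot %d: %s\n", i,
		       subsys[i] ? subsys[i]->name : "(not loaded)");
	return 0;
}

With positional initializers the array order had to match the enum exactly; the designated form makes each slot explicit and tolerates gaps for the not-yet-loaded modular controllers.
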
@@ -111,13 +112,13 @@ struct cgroupfs_root {
111 * The bitmask of subsystems intended to be attached to this 112 * The bitmask of subsystems intended to be attached to this
112 * hierarchy 113 * hierarchy
113 */ 114 */
114 unsigned long subsys_bits; 115 unsigned long subsys_mask;
115 116
116 /* Unique id for this hierarchy. */ 117 /* Unique id for this hierarchy. */
117 int hierarchy_id; 118 int hierarchy_id;
118 119
119 /* The bitmask of subsystems currently attached to this hierarchy */ 120 /* The bitmask of subsystems currently attached to this hierarchy */
120 unsigned long actual_subsys_bits; 121 unsigned long actual_subsys_mask;
121 122
122 /* A list running through the attached subsystems */ 123 /* A list running through the attached subsystems */
123 struct list_head subsys_list; 124 struct list_head subsys_list;
@@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
276 277
277/* bits in struct cgroupfs_root flags field */ 278/* bits in struct cgroupfs_root flags field */
278enum { 279enum {
279 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 280 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
281 ROOT_XATTR, /* supports extended attributes */
280}; 282};
281 283
282static int cgroup_is_releasable(const struct cgroup *cgrp) 284static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set(
556 * won't change, so no need for locking. 558 * won't change, so no need for locking.
557 */ 559 */
558 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 560 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
559 if (root->subsys_bits & (1UL << i)) { 561 if (root->subsys_mask & (1UL << i)) {
560 /* Subsystem is in this hierarchy. So we want 562 /* Subsystem is in this hierarchy. So we want
561 * the subsystem state from the new 563 * the subsystem state from the new
562 * cgroup */ 564 * cgroup */
@@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
824static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 826static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
825static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 827static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
826static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 828static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
827static int cgroup_populate_dir(struct cgroup *cgrp); 829static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
830 unsigned long subsys_mask);
828static const struct inode_operations cgroup_dir_inode_operations; 831static const struct inode_operations cgroup_dir_inode_operations;
829static const struct file_operations proc_cgroupstats_operations; 832static const struct file_operations proc_cgroupstats_operations;
830 833
@@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
912 */ 915 */
913 BUG_ON(!list_empty(&cgrp->pidlists)); 916 BUG_ON(!list_empty(&cgrp->pidlists));
914 917
918 simple_xattrs_free(&cgrp->xattrs);
919
915 kfree_rcu(cgrp, rcu_head); 920 kfree_rcu(cgrp, rcu_head);
916 } else { 921 } else {
917 struct cfent *cfe = __d_cfe(dentry); 922 struct cfent *cfe = __d_cfe(dentry);
918 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 923 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
924 struct cftype *cft = cfe->type;
919 925
920 WARN_ONCE(!list_empty(&cfe->node) && 926 WARN_ONCE(!list_empty(&cfe->node) &&
921 cgrp != &cgrp->root->top_cgroup, 927 cgrp != &cgrp->root->top_cgroup,
922 "cfe still linked for %s\n", cfe->type->name); 928 "cfe still linked for %s\n", cfe->type->name);
923 kfree(cfe); 929 kfree(cfe);
930 simple_xattrs_free(&cft->xattrs);
924 } 931 }
925 iput(inode); 932 iput(inode);
926} 933}
@@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
963 return -ENOENT; 970 return -ENOENT;
964} 971}
965 972
966static void cgroup_clear_directory(struct dentry *dir) 973/**
974 * cgroup_clear_directory - selective removal of base and subsystem files
975 * @dir: directory containing the files
976 * @base_files: true if the base files should be removed
977 * @subsys_mask: mask of the subsystem ids whose files should be removed
978 */
979static void cgroup_clear_directory(struct dentry *dir, bool base_files,
980 unsigned long subsys_mask)
967{ 981{
968 struct cgroup *cgrp = __d_cgrp(dir); 982 struct cgroup *cgrp = __d_cgrp(dir);
983 struct cgroup_subsys *ss;
969 984
970 while (!list_empty(&cgrp->files)) 985 for_each_subsys(cgrp->root, ss) {
971 cgroup_rm_file(cgrp, NULL); 986 struct cftype_set *set;
987 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue;
989 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts);
991 }
992 if (base_files) {
993 while (!list_empty(&cgrp->files))
994 cgroup_rm_file(cgrp, NULL);
995 }
972} 996}
973 997
974/* 998/*
@@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir)
977static void cgroup_d_remove_dir(struct dentry *dentry) 1001static void cgroup_d_remove_dir(struct dentry *dentry)
978{ 1002{
979 struct dentry *parent; 1003 struct dentry *parent;
1004 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
980 1005
981 cgroup_clear_directory(dentry); 1006 cgroup_clear_directory(dentry, true, root->subsys_mask);
982 1007
983 parent = dentry->d_parent; 1008 parent = dentry->d_parent;
984 spin_lock(&parent->d_lock); 1009 spin_lock(&parent->d_lock);
@@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1022 * returns an error, no reference counts are touched. 1047 * returns an error, no reference counts are touched.
1023 */ 1048 */
1024static int rebind_subsystems(struct cgroupfs_root *root, 1049static int rebind_subsystems(struct cgroupfs_root *root,
1025 unsigned long final_bits) 1050 unsigned long final_subsys_mask)
1026{ 1051{
1027 unsigned long added_bits, removed_bits; 1052 unsigned long added_mask, removed_mask;
1028 struct cgroup *cgrp = &root->top_cgroup; 1053 struct cgroup *cgrp = &root->top_cgroup;
1029 int i; 1054 int i;
1030 1055
1031 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1056 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1032 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1057 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1033 1058
1034 removed_bits = root->actual_subsys_bits & ~final_bits; 1059 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1035 added_bits = final_bits & ~root->actual_subsys_bits; 1060 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1036 /* Check that any added subsystems are currently free */ 1061 /* Check that any added subsystems are currently free */
1037 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1038 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1039 struct cgroup_subsys *ss = subsys[i]; 1064 struct cgroup_subsys *ss = subsys[i];
1040 if (!(bit & added_bits)) 1065 if (!(bit & added_mask))
1041 continue; 1066 continue;
1042 /* 1067 /*
1043 * Nobody should tell us to do a subsys that doesn't exist: 1068 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1087 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1063 struct cgroup_subsys *ss = subsys[i]; 1088 struct cgroup_subsys *ss = subsys[i];
1064 unsigned long bit = 1UL << i; 1089 unsigned long bit = 1UL << i;
1065 if (bit & added_bits) { 1090 if (bit & added_mask) {
1066 /* We're binding this subsystem to this hierarchy */ 1091 /* We're binding this subsystem to this hierarchy */
1067 BUG_ON(ss == NULL); 1092 BUG_ON(ss == NULL);
1068 BUG_ON(cgrp->subsys[i]); 1093 BUG_ON(cgrp->subsys[i]);
@@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1075 if (ss->bind) 1100 if (ss->bind)
1076 ss->bind(cgrp); 1101 ss->bind(cgrp);
1077 /* refcount was already taken, and we're keeping it */ 1102 /* refcount was already taken, and we're keeping it */
1078 } else if (bit & removed_bits) { 1103 } else if (bit & removed_mask) {
1079 /* We're removing this subsystem */ 1104 /* We're removing this subsystem */
1080 BUG_ON(ss == NULL); 1105 BUG_ON(ss == NULL);
1081 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1106 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
@@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1088 list_move(&ss->sibling, &rootnode.subsys_list); 1113 list_move(&ss->sibling, &rootnode.subsys_list);
1089 /* subsystem is now free - drop reference on module */ 1114 /* subsystem is now free - drop reference on module */
1090 module_put(ss->module); 1115 module_put(ss->module);
1091 } else if (bit & final_bits) { 1116 } else if (bit & final_subsys_mask) {
1092 /* Subsystem state should already exist */ 1117 /* Subsystem state should already exist */
1093 BUG_ON(ss == NULL); 1118 BUG_ON(ss == NULL);
1094 BUG_ON(!cgrp->subsys[i]); 1119 BUG_ON(!cgrp->subsys[i]);
@@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1105 BUG_ON(cgrp->subsys[i]); 1130 BUG_ON(cgrp->subsys[i]);
1106 } 1131 }
1107 } 1132 }
1108 root->subsys_bits = root->actual_subsys_bits = final_bits; 1133 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1109 synchronize_rcu(); 1134 synchronize_rcu();
1110 1135
1111 return 0; 1136 return 0;
@@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1121 seq_printf(seq, ",%s", ss->name); 1146 seq_printf(seq, ",%s", ss->name);
1122 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1147 if (test_bit(ROOT_NOPREFIX, &root->flags))
1123 seq_puts(seq, ",noprefix"); 1148 seq_puts(seq, ",noprefix");
1149 if (test_bit(ROOT_XATTR, &root->flags))
1150 seq_puts(seq, ",xattr");
1124 if (strlen(root->release_agent_path)) 1151 if (strlen(root->release_agent_path))
1125 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1126 if (clone_children(&root->top_cgroup)) 1153 if (clone_children(&root->top_cgroup))
@@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1132} 1159}
1133 1160
1134struct cgroup_sb_opts { 1161struct cgroup_sb_opts {
1135 unsigned long subsys_bits; 1162 unsigned long subsys_mask;
1136 unsigned long flags; 1163 unsigned long flags;
1137 char *release_agent; 1164 char *release_agent;
1138 bool clone_children; 1165 bool clone_children;
@@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1189 opts->clone_children = true; 1216 opts->clone_children = true;
1190 continue; 1217 continue;
1191 } 1218 }
1219 if (!strcmp(token, "xattr")) {
1220 set_bit(ROOT_XATTR, &opts->flags);
1221 continue;
1222 }
1192 if (!strncmp(token, "release_agent=", 14)) { 1223 if (!strncmp(token, "release_agent=", 14)) {
1193 /* Specifying two release agents is forbidden */ 1224 /* Specifying two release agents is forbidden */
1194 if (opts->release_agent) 1225 if (opts->release_agent)
@@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1237 /* Mutually exclusive option 'all' + subsystem name */ 1268 /* Mutually exclusive option 'all' + subsystem name */
1238 if (all_ss) 1269 if (all_ss)
1239 return -EINVAL; 1270 return -EINVAL;
1240 set_bit(i, &opts->subsys_bits); 1271 set_bit(i, &opts->subsys_mask);
1241 one_ss = true; 1272 one_ss = true;
1242 1273
1243 break; 1274 break;
@@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1258 continue; 1289 continue;
1259 if (ss->disabled) 1290 if (ss->disabled)
1260 continue; 1291 continue;
1261 set_bit(i, &opts->subsys_bits); 1292 set_bit(i, &opts->subsys_mask);
1262 } 1293 }
1263 } 1294 }
1264 1295
@@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1270 * the cpuset subsystem. 1301 * the cpuset subsystem.
1271 */ 1302 */
1272 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1303 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1273 (opts->subsys_bits & mask)) 1304 (opts->subsys_mask & mask))
1274 return -EINVAL; 1305 return -EINVAL;
1275 1306
1276 1307
1277 /* Can't specify "none" and some subsystems */ 1308 /* Can't specify "none" and some subsystems */
1278 if (opts->subsys_bits && opts->none) 1309 if (opts->subsys_mask && opts->none)
1279 return -EINVAL; 1310 return -EINVAL;
1280 1311
1281 /* 1312 /*
1282 * We either have to specify by name or by subsystems. (So all 1313 * We either have to specify by name or by subsystems. (So all
1283 * empty hierarchies must have a name). 1314 * empty hierarchies must have a name).
1284 */ 1315 */
1285 if (!opts->subsys_bits && !opts->name) 1316 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1317 return -EINVAL;
1287 1318
1288 /* 1319 /*
@@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1291 * take duplicate reference counts on a subsystem that's already used, 1322 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case. 1323 * but rebind_subsystems handles this case.
1293 */ 1324 */
1294 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1325 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1295 unsigned long bit = 1UL << i; 1326 unsigned long bit = 1UL << i;
1296 1327
1297 if (!(bit & opts->subsys_bits)) 1328 if (!(bit & opts->subsys_mask))
1298 continue; 1329 continue;
1299 if (!try_module_get(subsys[i]->module)) { 1330 if (!try_module_get(subsys[i]->module)) {
1300 module_pin_failed = true; 1331 module_pin_failed = true;
@@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1307 * raced with a module_delete call, and to the user this is 1338 * raced with a module_delete call, and to the user this is
1308 * essentially a "subsystem doesn't exist" case. 1339 * essentially a "subsystem doesn't exist" case.
1309 */ 1340 */
1310 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { 1341 for (i--; i >= 0; i--) {
1311 /* drop refcounts only on the ones we took */ 1342 /* drop refcounts only on the ones we took */
1312 unsigned long bit = 1UL << i; 1343 unsigned long bit = 1UL << i;
1313 1344
1314 if (!(bit & opts->subsys_bits)) 1345 if (!(bit & opts->subsys_mask))
1315 continue; 1346 continue;
1316 module_put(subsys[i]->module); 1347 module_put(subsys[i]->module);
1317 } 1348 }
@@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1321 return 0; 1352 return 0;
1322} 1353}
1323 1354
1324static void drop_parsed_module_refcounts(unsigned long subsys_bits) 1355static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1325{ 1356{
1326 int i; 1357 int i;
1327 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1358 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1328 unsigned long bit = 1UL << i; 1359 unsigned long bit = 1UL << i;
1329 1360
1330 if (!(bit & subsys_bits)) 1361 if (!(bit & subsys_mask))
1331 continue; 1362 continue;
1332 module_put(subsys[i]->module); 1363 module_put(subsys[i]->module);
1333 } 1364 }
@@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1339 struct cgroupfs_root *root = sb->s_fs_info; 1370 struct cgroupfs_root *root = sb->s_fs_info;
1340 struct cgroup *cgrp = &root->top_cgroup; 1371 struct cgroup *cgrp = &root->top_cgroup;
1341 struct cgroup_sb_opts opts; 1372 struct cgroup_sb_opts opts;
1373 unsigned long added_mask, removed_mask;
1342 1374
1343 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1375 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1344 mutex_lock(&cgroup_mutex); 1376 mutex_lock(&cgroup_mutex);
@@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1350 goto out_unlock; 1382 goto out_unlock;
1351 1383
1352 /* See feature-removal-schedule.txt */ 1384 /* See feature-removal-schedule.txt */
1353 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) 1385 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1354 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1386 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1355 task_tgid_nr(current), current->comm); 1387 task_tgid_nr(current), current->comm);
1356 1388
1389 added_mask = opts.subsys_mask & ~root->subsys_mask;
1390 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1391
1357 /* Don't allow flags or name to change at remount */ 1392 /* Don't allow flags or name to change at remount */
1358 if (opts.flags != root->flags || 1393 if (opts.flags != root->flags ||
1359 (opts.name && strcmp(opts.name, root->name))) { 1394 (opts.name && strcmp(opts.name, root->name))) {
1360 ret = -EINVAL; 1395 ret = -EINVAL;
1361 drop_parsed_module_refcounts(opts.subsys_bits); 1396 drop_parsed_module_refcounts(opts.subsys_mask);
1362 goto out_unlock; 1397 goto out_unlock;
1363 } 1398 }
1364 1399
1365 ret = rebind_subsystems(root, opts.subsys_bits); 1400 ret = rebind_subsystems(root, opts.subsys_mask);
1366 if (ret) { 1401 if (ret) {
1367 drop_parsed_module_refcounts(opts.subsys_bits); 1402 drop_parsed_module_refcounts(opts.subsys_mask);
1368 goto out_unlock; 1403 goto out_unlock;
1369 } 1404 }
1370 1405
1371 /* clear out any existing files and repopulate subsystem files */ 1406 /* clear out any existing files and repopulate subsystem files */
1372 cgroup_clear_directory(cgrp->dentry); 1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1373 cgroup_populate_dir(cgrp); 1408 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask);
1374 1410
1375 if (opts.release_agent) 1411 if (opts.release_agent)
1376 strcpy(root->release_agent_path, opts.release_agent); 1412 strcpy(root->release_agent_path, opts.release_agent);
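
The remount path now derives two masks, the subsystems being added (requested but not currently attached) and the ones being removed (attached but no longer requested), and then clears and repopulates only those files rather than wiping the whole directory. The mask arithmetic in isolation, with example bit patterns chosen arbitrarily:

#include <stdio.h>

int main(void)
{
	unsigned long current_mask = 0x05;      /* subsystems 0 and 2 attached */
	unsigned long requested    = 0x06;      /* remount asks for 1 and 2    */

	unsigned long added_mask   = requested    & ~current_mask;     /* 0x02 */
	unsigned long removed_mask = current_mask & ~requested;        /* 0x01 */

	printf("add %#lx, remove %#lx\n", added_mask, removed_mask);
	return 0;
}
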
@@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 mutex_init(&cgrp->pidlist_mutex); 1437 mutex_init(&cgrp->pidlist_mutex);
1402 INIT_LIST_HEAD(&cgrp->event_list); 1438 INIT_LIST_HEAD(&cgrp->event_list);
1403 spin_lock_init(&cgrp->event_list_lock); 1439 spin_lock_init(&cgrp->event_list_lock);
1440 simple_xattrs_init(&cgrp->xattrs);
1404} 1441}
1405 1442
1406static void init_cgroup_root(struct cgroupfs_root *root) 1443static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1455 * If we asked for subsystems (or explicitly for no 1492 * If we asked for subsystems (or explicitly for no
1456 * subsystems) then they must match 1493 * subsystems) then they must match
1457 */ 1494 */
1458 if ((opts->subsys_bits || opts->none) 1495 if ((opts->subsys_mask || opts->none)
1459 && (opts->subsys_bits != root->subsys_bits)) 1496 && (opts->subsys_mask != root->subsys_mask))
1460 return 0; 1497 return 0;
1461 1498
1462 return 1; 1499 return 1;
@@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1466{ 1503{
1467 struct cgroupfs_root *root; 1504 struct cgroupfs_root *root;
1468 1505
1469 if (!opts->subsys_bits && !opts->none) 1506 if (!opts->subsys_mask && !opts->none)
1470 return NULL; 1507 return NULL;
1471 1508
1472 root = kzalloc(sizeof(*root), GFP_KERNEL); 1509 root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1479 } 1516 }
1480 init_cgroup_root(root); 1517 init_cgroup_root(root);
1481 1518
1482 root->subsys_bits = opts->subsys_bits; 1519 root->subsys_mask = opts->subsys_mask;
1483 root->flags = opts->flags; 1520 root->flags = opts->flags;
1484 if (opts->release_agent) 1521 if (opts->release_agent)
1485 strcpy(root->release_agent_path, opts->release_agent); 1522 strcpy(root->release_agent_path, opts->release_agent);
@@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1511 if (!opts->new_root) 1548 if (!opts->new_root)
1512 return -EINVAL; 1549 return -EINVAL;
1513 1550
1514 BUG_ON(!opts->subsys_bits && !opts->none); 1551 BUG_ON(!opts->subsys_mask && !opts->none);
1515 1552
1516 ret = set_anon_super(sb, NULL); 1553 ret = set_anon_super(sb, NULL);
1517 if (ret) 1554 if (ret)
@@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1629 if (ret) 1666 if (ret)
1630 goto unlock_drop; 1667 goto unlock_drop;
1631 1668
1632 ret = rebind_subsystems(root, root->subsys_bits); 1669 ret = rebind_subsystems(root, root->subsys_mask);
1633 if (ret == -EBUSY) { 1670 if (ret == -EBUSY) {
1634 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
1635 goto unlock_drop; 1672 goto unlock_drop;
@@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1669 BUG_ON(root->number_of_cgroups != 1); 1706 BUG_ON(root->number_of_cgroups != 1);
1670 1707
1671 cred = override_creds(&init_cred); 1708 cred = override_creds(&init_cred);
1672 cgroup_populate_dir(root_cgrp); 1709 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1673 revert_creds(cred); 1710 revert_creds(cred);
1674 mutex_unlock(&cgroup_root_mutex); 1711 mutex_unlock(&cgroup_root_mutex);
1675 mutex_unlock(&cgroup_mutex); 1712 mutex_unlock(&cgroup_mutex);
@@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1681 */ 1718 */
1682 cgroup_drop_root(opts.new_root); 1719 cgroup_drop_root(opts.new_root);
1683 /* no subsys rebinding, so refcounts don't change */ 1720 /* no subsys rebinding, so refcounts don't change */
1684 drop_parsed_module_refcounts(opts.subsys_bits); 1721 drop_parsed_module_refcounts(opts.subsys_mask);
1685 } 1722 }
1686 1723
1687 kfree(opts.release_agent); 1724 kfree(opts.release_agent);
@@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1695 drop_new_super: 1732 drop_new_super:
1696 deactivate_locked_super(sb); 1733 deactivate_locked_super(sb);
1697 drop_modules: 1734 drop_modules:
1698 drop_parsed_module_refcounts(opts.subsys_bits); 1735 drop_parsed_module_refcounts(opts.subsys_mask);
1699 out_err: 1736 out_err:
1700 kfree(opts.release_agent); 1737 kfree(opts.release_agent);
1701 kfree(opts.name); 1738 kfree(opts.name);
@@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1745 mutex_unlock(&cgroup_root_mutex); 1782 mutex_unlock(&cgroup_root_mutex);
1746 mutex_unlock(&cgroup_mutex); 1783 mutex_unlock(&cgroup_mutex);
1747 1784
1785 simple_xattrs_free(&cgrp->xattrs);
1786
1748 kill_litter_super(sb); 1787 kill_litter_super(sb);
1749 cgroup_drop_root(root); 1788 cgroup_drop_root(root);
1750} 1789}
@@ -2551,6 +2590,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2551 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2590 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552} 2591}
2553 2592
2593static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2594{
2595 if (S_ISDIR(dentry->d_inode->i_mode))
2596 return &__d_cgrp(dentry)->xattrs;
2597 else
2598 return &__d_cft(dentry)->xattrs;
2599}
2600
2601static inline int xattr_enabled(struct dentry *dentry)
2602{
2603 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2604 return test_bit(ROOT_XATTR, &root->flags);
2605}
2606
2607static bool is_valid_xattr(const char *name)
2608{
2609 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2610 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2611 return true;
2612 return false;
2613}
2614
2615static int cgroup_setxattr(struct dentry *dentry, const char *name,
2616 const void *val, size_t size, int flags)
2617{
2618 if (!xattr_enabled(dentry))
2619 return -EOPNOTSUPP;
2620 if (!is_valid_xattr(name))
2621 return -EINVAL;
2622 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2623}
2624
2625static int cgroup_removexattr(struct dentry *dentry, const char *name)
2626{
2627 if (!xattr_enabled(dentry))
2628 return -EOPNOTSUPP;
2629 if (!is_valid_xattr(name))
2630 return -EINVAL;
2631 return simple_xattr_remove(__d_xattrs(dentry), name);
2632}
2633
2634static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2635 void *buf, size_t size)
2636{
2637 if (!xattr_enabled(dentry))
2638 return -EOPNOTSUPP;
2639 if (!is_valid_xattr(name))
2640 return -EINVAL;
2641 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2642}
2643
2644static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2645{
2646 if (!xattr_enabled(dentry))
2647 return -EOPNOTSUPP;
2648 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2649}
2650
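
With these handlers, a hierarchy mounted with the "xattr" option accepts only trusted.* and security.* attributes; other names return -EINVAL, and without the mount option every call fails with -EOPNOTSUPP. A hedged user-space example of what this enables, assuming a hypothetical mount point, a kernel carrying this patch, and the CAP_SYS_ADMIN privilege that trusted.* attributes require:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/tasks";         /* hypothetical */
	const char *val  = "backup-tier";
	char buf[64];
	ssize_t n;

	if (setxattr(path, "trusted.owner", val, strlen(val), 0) != 0) {
		perror("setxattr");     /* EOPNOTSUPP unless mounted with ",xattr" */
		return 1;
	}

	n = getxattr(path, "trusted.owner", buf, sizeof(buf) - 1);
	if (n >= 0) {
		buf[n] = '\0';
		printf("trusted.owner = %s\n", buf);
	}
	return 0;
}
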
2554static const struct file_operations cgroup_file_operations = { 2651static const struct file_operations cgroup_file_operations = {
2555 .read = cgroup_file_read, 2652 .read = cgroup_file_read,
2556 .write = cgroup_file_write, 2653 .write = cgroup_file_write,
@@ -2559,11 +2656,22 @@ static const struct file_operations cgroup_file_operations = {
2559 .release = cgroup_file_release, 2656 .release = cgroup_file_release,
2560}; 2657};
2561 2658
2659static const struct inode_operations cgroup_file_inode_operations = {
2660 .setxattr = cgroup_setxattr,
2661 .getxattr = cgroup_getxattr,
2662 .listxattr = cgroup_listxattr,
2663 .removexattr = cgroup_removexattr,
2664};
2665
2562static const struct inode_operations cgroup_dir_inode_operations = { 2666static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = cgroup_lookup, 2667 .lookup = cgroup_lookup,
2564 .mkdir = cgroup_mkdir, 2668 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir, 2669 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename, 2670 .rename = cgroup_rename,
2671 .setxattr = cgroup_setxattr,
2672 .getxattr = cgroup_getxattr,
2673 .listxattr = cgroup_listxattr,
2674 .removexattr = cgroup_removexattr,
2567}; 2675};
2568 2676
2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2677static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2611,6 +2719,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2611 } else if (S_ISREG(mode)) { 2719 } else if (S_ISREG(mode)) {
2612 inode->i_size = 0; 2720 inode->i_size = 0;
2613 inode->i_fop = &cgroup_file_operations; 2721 inode->i_fop = &cgroup_file_operations;
2722 inode->i_op = &cgroup_file_inode_operations;
2614 } 2723 }
2615 d_instantiate(dentry, inode); 2724 d_instantiate(dentry, inode);
2616 dget(dentry); /* Extra count - pin the dentry in core */ 2725 dget(dentry); /* Extra count - pin the dentry in core */
@@ -2671,7 +2780,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2671} 2780}
2672 2781
2673static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2782static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2674 const struct cftype *cft) 2783 struct cftype *cft)
2675{ 2784{
2676 struct dentry *dir = cgrp->dentry; 2785 struct dentry *dir = cgrp->dentry;
2677 struct cgroup *parent = __d_cgrp(dir); 2786 struct cgroup *parent = __d_cgrp(dir);
@@ -2681,6 +2790,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2681 umode_t mode; 2790 umode_t mode;
2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2791 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683 2792
2793 simple_xattrs_init(&cft->xattrs);
2794
2684 /* does @cft->flags tell us to skip creation on @cgrp? */ 2795 /* does @cft->flags tell us to skip creation on @cgrp? */
2685 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2796 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2686 return 0; 2797 return 0;
@@ -2721,9 +2832,9 @@ out:
2721} 2832}
2722 2833
2723static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2834static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2724 const struct cftype cfts[], bool is_add) 2835 struct cftype cfts[], bool is_add)
2725{ 2836{
2726 const struct cftype *cft; 2837 struct cftype *cft;
2727 int err, ret = 0; 2838 int err, ret = 0;
2728 2839
2729 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2840 for (cft = cfts; cft->name[0] != '\0'; cft++) {
@@ -2757,7 +2868,7 @@ static void cgroup_cfts_prepare(void)
2757} 2868}
2758 2869
2759static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2870static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2760 const struct cftype *cfts, bool is_add) 2871 struct cftype *cfts, bool is_add)
2761 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2872 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2762{ 2873{
2763 LIST_HEAD(pending); 2874 LIST_HEAD(pending);
@@ -2808,7 +2919,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2808 * function currently returns 0 as long as @cfts registration is successful 2919 * function currently returns 0 as long as @cfts registration is successful
2809 * even if some file creation attempts on existing cgroups fail. 2920 * even if some file creation attempts on existing cgroups fail.
2810 */ 2921 */
2811int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2922int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2812{ 2923{
2813 struct cftype_set *set; 2924 struct cftype_set *set;
2814 2925
@@ -2838,7 +2949,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2838 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2949 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2839 * registered with @ss. 2950 * registered with @ss.
2840 */ 2951 */
2841int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) 2952int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2842{ 2953{
2843 struct cftype_set *set; 2954 struct cftype_set *set;
2844 2955
@@ -3843,18 +3954,29 @@ static struct cftype files[] = {
3843 { } /* terminate */ 3954 { } /* terminate */
3844}; 3955};
3845 3956
3846static int cgroup_populate_dir(struct cgroup *cgrp) 3957/**
 3958 * cgroup_populate_dir - selectively create files in a directory
3959 * @cgrp: target cgroup
3960 * @base_files: true if the base files should be added
3961 * @subsys_mask: mask of the subsystem ids whose files should be added
3962 */
3963static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3964 unsigned long subsys_mask)
3847{ 3965{
3848 int err; 3966 int err;
3849 struct cgroup_subsys *ss; 3967 struct cgroup_subsys *ss;
3850 3968
3851 err = cgroup_addrm_files(cgrp, NULL, files, true); 3969 if (base_files) {
3852 if (err < 0) 3970 err = cgroup_addrm_files(cgrp, NULL, files, true);
3853 return err; 3971 if (err < 0)
3972 return err;
3973 }
3854 3974
3855 /* process cftsets of each subsystem */ 3975 /* process cftsets of each subsystem */
3856 for_each_subsys(cgrp->root, ss) { 3976 for_each_subsys(cgrp->root, ss) {
3857 struct cftype_set *set; 3977 struct cftype_set *set;
3978 if (!test_bit(ss->subsys_id, &subsys_mask))
3979 continue;
3858 3980
3859 list_for_each_entry(set, &ss->cftsets, node) 3981 list_for_each_entry(set, &ss->cftsets, node)
3860 cgroup_addrm_files(cgrp, ss, set->cfts, true); 3982 cgroup_addrm_files(cgrp, ss, set->cfts, true);
@@ -3954,8 +4076,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3954 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4076 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3955 4077
3956 for_each_subsys(root, ss) { 4078 for_each_subsys(root, ss) {
3957 struct cgroup_subsys_state *css = ss->create(cgrp); 4079 struct cgroup_subsys_state *css;
3958 4080
4081 css = ss->create(cgrp);
3959 if (IS_ERR(css)) { 4082 if (IS_ERR(css)) {
3960 err = PTR_ERR(css); 4083 err = PTR_ERR(css);
3961 goto err_destroy; 4084 goto err_destroy;
@@ -3969,6 +4092,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3969 /* At error, ->destroy() callback has to free assigned ID. */ 4092 /* At error, ->destroy() callback has to free assigned ID. */
3970 if (clone_children(parent) && ss->post_clone) 4093 if (clone_children(parent) && ss->post_clone)
3971 ss->post_clone(cgrp); 4094 ss->post_clone(cgrp);
4095
4096 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4097 parent->parent) {
4098 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4099 current->comm, current->pid, ss->name);
4100 if (!strcmp(ss->name, "memory"))
4101 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4102 ss->warned_broken_hierarchy = true;
4103 }
3972 } 4104 }
3973 4105
3974 list_add(&cgrp->sibling, &cgrp->parent->children); 4106 list_add(&cgrp->sibling, &cgrp->parent->children);
@@ -3988,7 +4120,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3988 4120
3989 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4121 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
3990 4122
3991 err = cgroup_populate_dir(cgrp); 4123 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
3992 /* If err < 0, we have a half-filled directory - oh well ;) */ 4124 /* If err < 0, we have a half-filled directory - oh well ;) */
3993 4125
3994 mutex_unlock(&cgroup_mutex); 4126 mutex_unlock(&cgroup_mutex);
@@ -4321,8 +4453,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4321 * since cgroup_init_subsys will have already taken care of it. 4453 * since cgroup_init_subsys will have already taken care of it.
4322 */ 4454 */
4323 if (ss->module == NULL) { 4455 if (ss->module == NULL) {
4324 /* a few sanity checks */ 4456 /* a sanity check */
4325 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4326 BUG_ON(subsys[ss->subsys_id] != ss); 4457 BUG_ON(subsys[ss->subsys_id] != ss);
4327 return 0; 4458 return 0;
4328 } 4459 }
@@ -4330,24 +4461,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4330 /* init base cftset */ 4461 /* init base cftset */
4331 cgroup_init_cftsets(ss); 4462 cgroup_init_cftsets(ss);
4332 4463
4333 /*
4334 * need to register a subsys id before anything else - for example,
4335 * init_cgroup_css needs it.
4336 */
4337 mutex_lock(&cgroup_mutex); 4464 mutex_lock(&cgroup_mutex);
4338 /* find the first empty slot in the array */ 4465 subsys[ss->subsys_id] = ss;
4339 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4340 if (subsys[i] == NULL)
4341 break;
4342 }
4343 if (i == CGROUP_SUBSYS_COUNT) {
4344 /* maximum number of subsystems already registered! */
4345 mutex_unlock(&cgroup_mutex);
4346 return -EBUSY;
4347 }
4348 /* assign ourselves the subsys_id */
4349 ss->subsys_id = i;
4350 subsys[i] = ss;
4351 4466
4352 /* 4467 /*
4353 * no ss->create seems to need anything important in the ss struct, so 4468 * no ss->create seems to need anything important in the ss struct, so
@@ -4356,7 +4471,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4356 css = ss->create(dummytop); 4471 css = ss->create(dummytop);
4357 if (IS_ERR(css)) { 4472 if (IS_ERR(css)) {
4358 /* failure case - need to deassign the subsys[] slot. */ 4473 /* failure case - need to deassign the subsys[] slot. */
4359 subsys[i] = NULL; 4474 subsys[ss->subsys_id] = NULL;
4360 mutex_unlock(&cgroup_mutex); 4475 mutex_unlock(&cgroup_mutex);
4361 return PTR_ERR(css); 4476 return PTR_ERR(css);
4362 } 4477 }
@@ -4372,7 +4487,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4372 if (ret) { 4487 if (ret) {
4373 dummytop->subsys[ss->subsys_id] = NULL; 4488 dummytop->subsys[ss->subsys_id] = NULL;
4374 ss->destroy(dummytop); 4489 ss->destroy(dummytop);
4375 subsys[i] = NULL; 4490 subsys[ss->subsys_id] = NULL;
4376 mutex_unlock(&cgroup_mutex); 4491 mutex_unlock(&cgroup_mutex);
4377 return ret; 4492 return ret;
4378 } 4493 }
@@ -4439,7 +4554,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4439 4554
4440 mutex_lock(&cgroup_mutex); 4555 mutex_lock(&cgroup_mutex);
4441 /* deassign the subsys_id */ 4556 /* deassign the subsys_id */
4442 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4443 subsys[ss->subsys_id] = NULL; 4557 subsys[ss->subsys_id] = NULL;
4444 4558
4445 /* remove subsystem from rootnode's list of subsystems */ 4559 /* remove subsystem from rootnode's list of subsystems */
@@ -4502,10 +4616,13 @@ int __init cgroup_init_early(void)
4502 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4616 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4503 INIT_HLIST_HEAD(&css_set_table[i]); 4617 INIT_HLIST_HEAD(&css_set_table[i]);
4504 4618
4505 /* at bootup time, we don't worry about modular subsystems */ 4619 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4506 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4507 struct cgroup_subsys *ss = subsys[i]; 4620 struct cgroup_subsys *ss = subsys[i];
4508 4621
4622 /* at bootup time, we don't worry about modular subsystems */
4623 if (!ss || ss->module)
4624 continue;
4625
4509 BUG_ON(!ss->name); 4626 BUG_ON(!ss->name);
4510 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4627 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4511 BUG_ON(!ss->create); 4628 BUG_ON(!ss->create);
@@ -4538,9 +4655,12 @@ int __init cgroup_init(void)
4538 if (err) 4655 if (err)
4539 return err; 4656 return err;
4540 4657
4541 /* at bootup time, we don't worry about modular subsystems */ 4658 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4542 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4543 struct cgroup_subsys *ss = subsys[i]; 4659 struct cgroup_subsys *ss = subsys[i];
4660
4661 /* at bootup time, we don't worry about modular subsystems */
4662 if (!ss || ss->module)
4663 continue;
4544 if (!ss->early_init) 4664 if (!ss->early_init)
4545 cgroup_init_subsys(ss); 4665 cgroup_init_subsys(ss);
4546 if (ss->use_id) 4666 if (ss->use_id)
@@ -4735,13 +4855,16 @@ void cgroup_fork_callbacks(struct task_struct *child)
4735{ 4855{
4736 if (need_forkexit_callback) { 4856 if (need_forkexit_callback) {
4737 int i; 4857 int i;
4738 /* 4858 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4739 * forkexit callbacks are only supported for builtin
4740 * subsystems, and the builtin section of the subsys array is
4741 * immutable, so we don't need to lock the subsys array here.
4742 */
4743 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4744 struct cgroup_subsys *ss = subsys[i]; 4859 struct cgroup_subsys *ss = subsys[i];
4860
4861 /*
4862 * forkexit callbacks are only supported for
4863 * builtin subsystems.
4864 */
4865 if (!ss || ss->module)
4866 continue;
4867
4745 if (ss->fork) 4868 if (ss->fork)
4746 ss->fork(child); 4869 ss->fork(child);
4747 } 4870 }
@@ -4846,12 +4969,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4846 tsk->cgroups = &init_css_set; 4969 tsk->cgroups = &init_css_set;
4847 4970
4848 if (run_callbacks && need_forkexit_callback) { 4971 if (run_callbacks && need_forkexit_callback) {
4849 /* 4972 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4850 * modular subsystems can't use callbacks, so no need to lock
4851 * the subsys array
4852 */
4853 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4854 struct cgroup_subsys *ss = subsys[i]; 4973 struct cgroup_subsys *ss = subsys[i];
4974
4975 /* modular subsystems can't use callbacks */
4976 if (!ss || ss->module)
4977 continue;
4978
4855 if (ss->exit) { 4979 if (ss->exit) {
4856 struct cgroup *old_cgrp = 4980 struct cgroup *old_cgrp =
4857 rcu_dereference_raw(cg->subsys[i])->cgroup; 4981 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5037,13 +5161,17 @@ static int __init cgroup_disable(char *str)
5037 while ((token = strsep(&str, ",")) != NULL) { 5161 while ((token = strsep(&str, ",")) != NULL) {
5038 if (!*token) 5162 if (!*token)
5039 continue; 5163 continue;
5040 /* 5164 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5041 * cgroup_disable, being at boot time, can't know about module
5042 * subsystems, so we don't worry about them.
5043 */
5044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5045 struct cgroup_subsys *ss = subsys[i]; 5165 struct cgroup_subsys *ss = subsys[i];
5046 5166
5167 /*
5168 * cgroup_disable, being at boot time, can't
5169 * know about module subsystems, so we don't
5170 * worry about them.
5171 */
5172 if (!ss || ss->module)
5173 continue;
5174
5047 if (!strcmp(token, ss->name)) { 5175 if (!strcmp(token, ss->name)) {
5048 ss->disabled = 1; 5176 ss->disabled = 1;
5049 printk(KERN_INFO "Disabling %s control group" 5177 printk(KERN_INFO "Disabling %s control group"
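Note on the cgroup.c hunks above: they drop the CGROUP_BUILTIN_SUBSYS_COUNT bound and instead walk the whole subsys[] array, skipping empty slots and modular subsystems with "if (!ss || ss->module) continue;", which also lets cgroup_load_subsys() index the table directly by ss->subsys_id instead of hunting for a free slot. A minimal userspace sketch of that "iterate the full table, skip unregistered or modular entries" pattern follows; the types and names here are illustrative only, not the kernel's.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SUBSYS_COUNT 8

struct subsys {
        const char *name;
        bool module;            /* true if provided by a loadable module */
};

/* Sparse registration table: NULL slots are unregistered subsystems. */
static struct subsys *subsys_table[SUBSYS_COUNT];

static void for_each_builtin(void (*fn)(struct subsys *))
{
        for (int i = 0; i < SUBSYS_COUNT; i++) {
                struct subsys *ss = subsys_table[i];

                /* Mirror the kernel pattern: skip holes and modular entries. */
                if (!ss || ss->module)
                        continue;
                fn(ss);
        }
}

static void show(struct subsys *ss)
{
        printf("builtin subsystem: %s\n", ss->name);
}

int main(void)
{
        static struct subsys cpu = { "cpu", false };
        static struct subsys net = { "net_prio", true };

        subsys_table[0] = &cpu;
        subsys_table[3] = &net;         /* modular: skipped by the loop */

        for_each_builtin(show);
        return 0;
}

Keeping the array fixed-size and indexed by subsys_id is what makes the "find the first empty slot" search removed in the hunk unnecessary.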
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 3649fc6b3eaa..b1724ce98981 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = {
373 .can_attach = freezer_can_attach, 373 .can_attach = freezer_can_attach,
374 .fork = freezer_fork, 374 .fork = freezer_fork,
375 .base_cftypes = files, 375 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
376}; 384};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f560598807c1..42bd331ee0ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,6 +80,10 @@ void put_online_cpus(void)
80 if (cpu_hotplug.active_writer == current) 80 if (cpu_hotplug.active_writer == current)
81 return; 81 return;
82 mutex_lock(&cpu_hotplug.lock); 82 mutex_lock(&cpu_hotplug.lock);
83
84 if (WARN_ON(!cpu_hotplug.refcount))
85 cpu_hotplug.refcount++; /* try to fix things up */
86
83 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
84 wake_up_process(cpu_hotplug.active_writer); 88 wake_up_process(cpu_hotplug.active_writer);
85 mutex_unlock(&cpu_hotplug.lock); 89 mutex_unlock(&cpu_hotplug.lock);
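The put_online_cpus() change guards against a refcount underflow: if the count is already zero, a warning fires and the count is bumped before the decrement so it can never go negative. A hedged userspace sketch of that defensive "fix up, then decrement" pattern, with a plain mutex and fprintf standing in for the hotplug lock and WARN_ON:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;

/* Illustrative only: mirrors the "warn, repair, then decrement" pattern. */
static void put_ref(void)
{
        pthread_mutex_lock(&lock);

        if (!refcount) {
                /* Unbalanced put: warn and repair so we never go negative. */
                fprintf(stderr, "warning: refcount underflow, fixing up\n");
                refcount++;
        }

        if (!--refcount)
                printf("last reference dropped\n");

        pthread_mutex_unlock(&lock);
}

int main(void)
{
        refcount = 1;
        put_ref();      /* balanced put */
        put_ref();      /* unbalanced put: triggers the warning path */
        return 0;
}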
diff --git a/kernel/cred.c b/kernel/cred.c
index de728ac50d82..48cea3da6d05 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
799 atomic_read(&cred->usage), 799 atomic_read(&cred->usage),
800 read_cred_subscribers(cred)); 800 read_cred_subscribers(cred));
801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", 801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
802 cred->uid, cred->euid, cred->suid, cred->fsuid); 802 from_kuid_munged(&init_user_ns, cred->uid),
803 from_kuid_munged(&init_user_ns, cred->euid),
804 from_kuid_munged(&init_user_ns, cred->suid),
805 from_kuid_munged(&init_user_ns, cred->fsuid));
803 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", 806 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
804 cred->gid, cred->egid, cred->sgid, cred->fsgid); 807 from_kgid_munged(&init_user_ns, cred->gid),
808 from_kgid_munged(&init_user_ns, cred->egid),
809 from_kgid_munged(&init_user_ns, cred->sgid),
810 from_kgid_munged(&init_user_ns, cred->fsgid));
805#ifdef CONFIG_SECURITY 811#ifdef CONFIG_SECURITY
806 printk(KERN_ERR "CRED: ->security is %p\n", cred->security); 812 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
807 if ((unsigned long) cred->security >= PAGE_SIZE && 813 if ((unsigned long) cred->security >= PAGE_SIZE &&
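The cred.c hunk stops printing raw kuid_t/kgid_t values and converts them into the initial user namespace with from_kuid_munged()/from_kgid_munged() first. A rough userspace sketch of the underlying idea, translating an internal id into a namespace-relative value and falling back to an overflow id when no mapping exists; the mapping structure and function name here are hypothetical, not the kernel API:

#include <stdio.h>

#define OVERFLOW_UID 65534      /* reported when an id has no mapping */

/* Hypothetical mapping: namespace uid = internal uid - base, if in range. */
struct user_ns_sketch {
        unsigned int base;      /* first internal id owned by this namespace */
        unsigned int count;     /* number of ids mapped */
};

static unsigned int from_kuid_munged_sketch(const struct user_ns_sketch *ns,
                                            unsigned int internal_uid)
{
        if (internal_uid >= ns->base && internal_uid < ns->base + ns->count)
                return internal_uid - ns->base;
        return OVERFLOW_UID;    /* no mapping: report the overflow id */
}

int main(void)
{
        struct user_ns_sketch init_ns = { .base = 0, .count = 100000 };

        printf("uid 1000    -> %u\n", from_kuid_munged_sketch(&init_ns, 1000));
        printf("uid 4000000 -> %u\n", from_kuid_munged_sketch(&init_ns, 4000000));
        return 0;
}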
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0557f24c6bca..9a61738cefc8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
672{ 672{
673 struct kgdb_state kgdb_var; 673 struct kgdb_state kgdb_var;
674 struct kgdb_state *ks = &kgdb_var; 674 struct kgdb_state *ks = &kgdb_var;
675 int ret = 0;
676
677 if (arch_kgdb_ops.enable_nmi)
678 arch_kgdb_ops.enable_nmi(0);
675 679
676 ks->cpu = raw_smp_processor_id(); 680 ks->cpu = raw_smp_processor_id();
677 ks->ex_vector = evector; 681 ks->ex_vector = evector;
@@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
681 ks->linux_regs = regs; 685 ks->linux_regs = regs;
682 686
683 if (kgdb_reenter_check(ks)) 687 if (kgdb_reenter_check(ks))
684 return 0; /* Ouch, double exception ! */ 688 goto out; /* Ouch, double exception ! */
685 if (kgdb_info[ks->cpu].enter_kgdb != 0) 689 if (kgdb_info[ks->cpu].enter_kgdb != 0)
686 return 0; 690 goto out;
687 691
688 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); 692 ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
693out:
694 if (arch_kgdb_ops.enable_nmi)
695 arch_kgdb_ops.enable_nmi(1);
696 return ret;
689} 697}
690 698
699/*
700 * GDB places a breakpoint at this function to know dynamically
701 * loaded objects. It's not defined static so that only one instance with this
702 * name exists in the kernel.
703 */
704
705static int module_event(struct notifier_block *self, unsigned long val,
706 void *data)
707{
708 return 0;
709}
710
711static struct notifier_block dbg_module_load_nb = {
712 .notifier_call = module_event,
713};
714
691int kgdb_nmicallback(int cpu, void *regs) 715int kgdb_nmicallback(int cpu, void *regs)
692{ 716{
693#ifdef CONFIG_SMP 717#ifdef CONFIG_SMP
@@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void)
816 kgdb_arch_init(); 840 kgdb_arch_init();
817 if (!dbg_is_early) 841 if (!dbg_is_early)
818 kgdb_arch_late(); 842 kgdb_arch_late();
843 register_module_notifier(&dbg_module_load_nb);
819 register_reboot_notifier(&dbg_reboot_notifier); 844 register_reboot_notifier(&dbg_reboot_notifier);
820 atomic_notifier_chain_register(&panic_notifier_list, 845 atomic_notifier_chain_register(&panic_notifier_list,
821 &kgdb_panic_event_nb); 846 &kgdb_panic_event_nb);
@@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void)
839 if (kgdb_io_module_registered) { 864 if (kgdb_io_module_registered) {
840 kgdb_io_module_registered = 0; 865 kgdb_io_module_registered = 0;
841 unregister_reboot_notifier(&dbg_reboot_notifier); 866 unregister_reboot_notifier(&dbg_reboot_notifier);
867 unregister_module_notifier(&dbg_module_load_nb);
842 atomic_notifier_chain_unregister(&panic_notifier_list, 868 atomic_notifier_chain_unregister(&panic_notifier_list,
843 &kgdb_panic_event_nb); 869 &kgdb_panic_event_nb);
844 kgdb_arch_exit(); 870 kgdb_arch_exit();
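kgdb_handle_exception() now disables NMI entry on the way in and re-enables it on every exit path by funnelling the early returns through a single out: label. A small userspace sketch of that single-exit cleanup pattern (names and the bool state are illustrative, not the arch_kgdb_ops hook):

#include <stdbool.h>
#include <stdio.h>

static bool nmi_enabled = true;

static void enable_nmi(bool on)
{
        nmi_enabled = on;
        printf("nmi %s\n", on ? "enabled" : "disabled");
}

/* All early exits funnel through "out" so disable/enable stays balanced. */
static int handle_exception(bool reentered, bool already_in_debugger)
{
        int ret = 0;

        enable_nmi(false);

        if (reentered)
                goto out;       /* double exception */
        if (already_in_debugger)
                goto out;

        ret = 1;                /* pretend we entered the debugger core */
out:
        enable_nmi(true);
        return ret;
}

int main(void)
{
        printf("ret=%d\n", handle_exception(true, false));
        printf("ret=%d\n", handle_exception(false, false));
        return 0;
}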
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 07c9bbb94a0b..b03e0e814e43 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)
129 } 129 }
130 /* Now the inactive tasks */ 130 /* Now the inactive tasks */
131 kdb_do_each_thread(g, p) { 131 kdb_do_each_thread(g, p) {
132 if (KDB_FLAG(CMD_INTERRUPT))
133 return 0;
132 if (task_curr(p)) 134 if (task_curr(p))
133 continue; 135 continue;
134 if (kdb_bt1(p, mask, argcount, btaprompt)) 136 if (kdb_bt1(p, mask, argcount, btaprompt))
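The kdb_bt change adds an interrupt check inside the walk over inactive tasks so a long backtrace-all can be cut short. A tiny sketch of polling a "command interrupted" flag inside a long loop; here SIGINT stands in for the keyboard path that sets KDB_FLAG(CMD_INTERRUPT):

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t cmd_interrupt;

static void on_int(int sig)
{
        (void)sig;
        cmd_interrupt = 1;      /* set from the interrupt path */
}

int main(void)
{
        signal(SIGINT, on_int);

        for (long i = 0; i < 100000000L; i++) {
                if (cmd_interrupt) {    /* bail out early, like the kdb check */
                        printf("interrupted at %ld\n", i);
                        return 0;
                }
                /* ... per-task work would go here ... */
        }
        printf("completed\n");
        return 0;
}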
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 0a69d2adc4f3..14ff4849262c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
552{ 552{
553 int diag; 553 int diag;
554 int linecount; 554 int linecount;
555 int colcount;
555 int logging, saved_loglevel = 0; 556 int logging, saved_loglevel = 0;
556 int saved_trap_printk; 557 int saved_trap_printk;
557 int got_printf_lock = 0; 558 int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
584 if (diag || linecount <= 1) 585 if (diag || linecount <= 1)
585 linecount = 24; 586 linecount = 24;
586 587
588 diag = kdbgetintenv("COLUMNS", &colcount);
589 if (diag || colcount <= 1)
590 colcount = 80;
591
587 diag = kdbgetintenv("LOGGING", &logging); 592 diag = kdbgetintenv("LOGGING", &logging);
588 if (diag) 593 if (diag)
589 logging = 0; 594 logging = 0;
@@ -690,7 +695,7 @@ kdb_printit:
690 gdbstub_msg_write(kdb_buffer, retlen); 695 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 696 } else {
692 if (dbg_io_ops && !dbg_io_ops->is_console) { 697 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 698 len = retlen;
694 cp = kdb_buffer; 699 cp = kdb_buffer;
695 while (len--) { 700 while (len--) {
696 dbg_io_ops->write_char(*cp); 701 dbg_io_ops->write_char(*cp);
@@ -709,11 +714,29 @@ kdb_printit:
709 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
710 } 715 }
711 716
712 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) 717 if (KDB_STATE(PAGER)) {
713 kdb_nextline++; 718 /*
719 * Check printed string to decide how to bump the
720 * kdb_nextline to control when the more prompt should
721 * show up.
722 */
723 int got = 0;
724 len = retlen;
725 while (len--) {
726 if (kdb_buffer[len] == '\n') {
727 kdb_nextline++;
728 got = 0;
729 } else if (kdb_buffer[len] == '\r') {
730 got = 0;
731 } else {
732 got++;
733 }
734 }
735 kdb_nextline += got / (colcount + 1);
736 }
714 737
715 /* check for having reached the LINES number of printed lines */ 738 /* check for having reached the LINES number of printed lines */
716 if (kdb_nextline == linecount) { 739 if (kdb_nextline >= linecount) {
717 char buf1[16] = ""; 740 char buf1[16] = "";
718 741
719 /* Watch out for recursion here. Any routine that calls 742 /* Watch out for recursion here. Any routine that calls
@@ -765,7 +788,7 @@ kdb_printit:
765 kdb_grepping_flag = 0; 788 kdb_grepping_flag = 0;
766 kdb_printf("\n"); 789 kdb_printf("\n");
767 } else if (buf1[0] == ' ') { 790 } else if (buf1[0] == ' ') {
768 kdb_printf("\n"); 791 kdb_printf("\r");
769 suspend_grep = 1; /* for this recursion */ 792 suspend_grep = 1; /* for this recursion */
770 } else if (buf1[0] == '\n') { 793 } else if (buf1[0] == '\n') {
771 kdb_nextline = linecount - 1; 794 kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 31df1706b9a9..4d5f8d5612f3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -21,6 +21,7 @@
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/utsname.h> 22#include <linux/utsname.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/atomic.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
26#include <linux/init.h> 27#include <linux/init.h>
@@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)
2100 } 2101 }
2101 if (!lines--) 2102 if (!lines--)
2102 break; 2103 break;
2104 if (KDB_FLAG(CMD_INTERRUPT))
2105 return 0;
2103 2106
2104 kdb_printf("%.*s\n", (int)len - 1, buf); 2107 kdb_printf("%.*s\n", (int)len - 1, buf);
2105 } 2108 }
@@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv)
2107 return 0; 2110 return 0;
2108} 2111}
2109#endif /* CONFIG_PRINTK */ 2112#endif /* CONFIG_PRINTK */
2113
2114/* Make sure we balance enable/disable calls, must disable first. */
2115static atomic_t kdb_nmi_disabled;
2116
2117static int kdb_disable_nmi(int argc, const char *argv[])
2118{
2119 if (atomic_read(&kdb_nmi_disabled))
2120 return 0;
2121 atomic_set(&kdb_nmi_disabled, 1);
2122 arch_kgdb_ops.enable_nmi(0);
2123 return 0;
2124}
2125
2126static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
2127{
2128 if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
2129 return -EINVAL;
2130 arch_kgdb_ops.enable_nmi(1);
2131 return 0;
2132}
2133
2134static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
2135 .set = kdb_param_enable_nmi,
2136};
2137module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
2138
2110/* 2139/*
2111 * kdb_cpu - This function implements the 'cpu' command. 2140 * kdb_cpu - This function implements the 'cpu' command.
2112 * cpu [<cpunum>] 2141 * cpu [<cpunum>]
@@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void)
2851 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2880 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2852 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2881 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2853#endif 2882#endif
2883 if (arch_kgdb_ops.enable_nmi) {
2884 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
2885 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
2886 }
2854 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2887 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2855 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2888 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2856 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2889 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd15593c7f54..dbccf83c134d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -471,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
471{ 471{
472 struct perf_cgroup *cgrp; 472 struct perf_cgroup *cgrp;
473 struct cgroup_subsys_state *css; 473 struct cgroup_subsys_state *css;
474 struct file *file; 474 struct fd f = fdget(fd);
475 int ret = 0, fput_needed; 475 int ret = 0;
476 476
477 file = fget_light(fd, &fput_needed); 477 if (!f.file)
478 if (!file)
479 return -EBADF; 478 return -EBADF;
480 479
481 css = cgroup_css_from_dir(file, perf_subsys_id); 480 css = cgroup_css_from_dir(f.file, perf_subsys_id);
482 if (IS_ERR(css)) { 481 if (IS_ERR(css)) {
483 ret = PTR_ERR(css); 482 ret = PTR_ERR(css);
484 goto out; 483 goto out;
@@ -504,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
504 ret = -EINVAL; 503 ret = -EINVAL;
505 } 504 }
506out: 505out:
507 fput_light(file, fput_needed); 506 fdput(f);
508 return ret; 507 return ret;
509} 508}
510 509
@@ -3237,21 +3236,18 @@ unlock:
3237 3236
3238static const struct file_operations perf_fops; 3237static const struct file_operations perf_fops;
3239 3238
3240static struct file *perf_fget_light(int fd, int *fput_needed) 3239static inline int perf_fget_light(int fd, struct fd *p)
3241{ 3240{
3242 struct file *file; 3241 struct fd f = fdget(fd);
3243 3242 if (!f.file)
3244 file = fget_light(fd, fput_needed); 3243 return -EBADF;
3245 if (!file)
3246 return ERR_PTR(-EBADF);
3247 3244
3248 if (file->f_op != &perf_fops) { 3245 if (f.file->f_op != &perf_fops) {
3249 fput_light(file, *fput_needed); 3246 fdput(f);
3250 *fput_needed = 0; 3247 return -EBADF;
3251 return ERR_PTR(-EBADF);
3252 } 3248 }
3253 3249 *p = f;
3254 return file; 3250 return 0;
3255} 3251}
3256 3252
3257static int perf_event_set_output(struct perf_event *event, 3253static int perf_event_set_output(struct perf_event *event,
@@ -3283,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3283 3279
3284 case PERF_EVENT_IOC_SET_OUTPUT: 3280 case PERF_EVENT_IOC_SET_OUTPUT:
3285 { 3281 {
3286 struct file *output_file = NULL;
3287 struct perf_event *output_event = NULL;
3288 int fput_needed = 0;
3289 int ret; 3282 int ret;
3290
3291 if (arg != -1) { 3283 if (arg != -1) {
3292 output_file = perf_fget_light(arg, &fput_needed); 3284 struct perf_event *output_event;
3293 if (IS_ERR(output_file)) 3285 struct fd output;
3294 return PTR_ERR(output_file); 3286 ret = perf_fget_light(arg, &output);
3295 output_event = output_file->private_data; 3287 if (ret)
3288 return ret;
3289 output_event = output.file->private_data;
3290 ret = perf_event_set_output(event, output_event);
3291 fdput(output);
3292 } else {
3293 ret = perf_event_set_output(event, NULL);
3296 } 3294 }
3297
3298 ret = perf_event_set_output(event, output_event);
3299 if (output_event)
3300 fput_light(output_file, fput_needed);
3301
3302 return ret; 3295 return ret;
3303 } 3296 }
3304 3297
@@ -3681,7 +3674,7 @@ unlock:
3681 atomic_inc(&event->mmap_count); 3674 atomic_inc(&event->mmap_count);
3682 mutex_unlock(&event->mmap_mutex); 3675 mutex_unlock(&event->mmap_mutex);
3683 3676
3684 vma->vm_flags |= VM_RESERVED; 3677 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3685 vma->vm_ops = &perf_mmap_vmops; 3678 vma->vm_ops = &perf_mmap_vmops;
3686 3679
3687 return ret; 3680 return ret;
@@ -6446,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open,
6446 struct perf_event_attr attr; 6439 struct perf_event_attr attr;
6447 struct perf_event_context *ctx; 6440 struct perf_event_context *ctx;
6448 struct file *event_file = NULL; 6441 struct file *event_file = NULL;
6449 struct file *group_file = NULL; 6442 struct fd group = {NULL, 0};
6450 struct task_struct *task = NULL; 6443 struct task_struct *task = NULL;
6451 struct pmu *pmu; 6444 struct pmu *pmu;
6452 int event_fd; 6445 int event_fd;
6453 int move_group = 0; 6446 int move_group = 0;
6454 int fput_needed = 0;
6455 int err; 6447 int err;
6456 6448
6457 /* for future expandability... */ 6449 /* for future expandability... */
@@ -6481,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open,
6481 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 6473 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6482 return -EINVAL; 6474 return -EINVAL;
6483 6475
6484 event_fd = get_unused_fd_flags(O_RDWR); 6476 event_fd = get_unused_fd();
6485 if (event_fd < 0) 6477 if (event_fd < 0)
6486 return event_fd; 6478 return event_fd;
6487 6479
6488 if (group_fd != -1) { 6480 if (group_fd != -1) {
6489 group_file = perf_fget_light(group_fd, &fput_needed); 6481 err = perf_fget_light(group_fd, &group);
6490 if (IS_ERR(group_file)) { 6482 if (err)
6491 err = PTR_ERR(group_file);
6492 goto err_fd; 6483 goto err_fd;
6493 } 6484 group_leader = group.file->private_data;
6494 group_leader = group_file->private_data;
6495 if (flags & PERF_FLAG_FD_OUTPUT) 6485 if (flags & PERF_FLAG_FD_OUTPUT)
6496 output_event = group_leader; 6486 output_event = group_leader;
6497 if (flags & PERF_FLAG_FD_NO_GROUP) 6487 if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6667,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open,
6667 * of the group leader will find the pointer to itself in 6657 * of the group leader will find the pointer to itself in
6668 * perf_group_detach(). 6658 * perf_group_detach().
6669 */ 6659 */
6670 fput_light(group_file, fput_needed); 6660 fdput(group);
6671 fd_install(event_fd, event_file); 6661 fd_install(event_fd, event_file);
6672 return event_fd; 6662 return event_fd;
6673 6663
@@ -6681,7 +6671,7 @@ err_task:
6681 if (task) 6671 if (task)
6682 put_task_struct(task); 6672 put_task_struct(task);
6683err_group_fd: 6673err_group_fd:
6684 fput_light(group_file, fput_needed); 6674 fdput(group);
6685err_fd: 6675err_fd:
6686 put_unused_fd(event_fd); 6676 put_unused_fd(event_fd);
6687 return err; 6677 return err;
@@ -7506,5 +7496,12 @@ struct cgroup_subsys perf_subsys = {
7506 .destroy = perf_cgroup_destroy, 7496 .destroy = perf_cgroup_destroy,
7507 .exit = perf_cgroup_exit, 7497 .exit = perf_cgroup_exit,
7508 .attach = perf_cgroup_attach, 7498 .attach = perf_cgroup_attach,
7499
7500 /*
7501 * perf_event cgroup doesn't handle nesting correctly.
7502 * ctx->nr_cgroups adjustments should be propagated through the
7503 * cgroup hierarchy. Fix it and remove the following.
7504 */
7505 .broken_hierarchy = true,
7509}; 7506};
7510#endif /* CONFIG_CGROUP_PERF */ 7507#endif /* CONFIG_CGROUP_PERF */
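The perf changes convert fget_light()/fput_light() plus a separate fput_needed flag into the struct fd / fdget() / fdput() API, so the "do we need to drop a reference" bit travels inside the handle itself. A userspace sketch of that idea with a hypothetical handle type; the real struct fd packs the flag into the low bits rather than a separate field:

#include <stdbool.h>
#include <stdio.h>

struct file { int refcount; };

/* Hypothetical handle: the file pointer plus "must we put it" in one object. */
struct fd_sketch {
        struct file *file;
        bool need_put;
};

static struct fd_sketch fdget_sketch(struct file *f, bool shared_table)
{
        struct fd_sketch out = { .file = f, .need_put = false };

        /* Only take (and later drop) a reference when the table is shared. */
        if (f && shared_table) {
                f->refcount++;
                out.need_put = true;
        }
        return out;
}

static void fdput_sketch(struct fd_sketch fd)
{
        if (fd.need_put)
                fd.file->refcount--;
}

int main(void)
{
        struct file f = { .refcount = 1 };
        struct fd_sketch fd = fdget_sketch(&f, true);

        if (!fd.file)
                return 1;
        /* ... use fd.file ... */
        fdput_sketch(fd);

        printf("refcount back to %d\n", f.refcount);
        return 0;
}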
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 912ef48d28ab..5cc4e7e42e68 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -78,15 +78,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
78 */ 78 */
79static atomic_t uprobe_events = ATOMIC_INIT(0); 79static atomic_t uprobe_events = ATOMIC_INIT(0);
80 80
81/* Have a copy of original instruction */
82#define UPROBE_COPY_INSN 0
 83/* Don't run handlers while the first register / last unregister is in progress */
84#define UPROBE_RUN_HANDLER 1
85/* Can skip singlestep */
86#define UPROBE_SKIP_SSTEP 2
87
81struct uprobe { 88struct uprobe {
82 struct rb_node rb_node; /* node in the rb tree */ 89 struct rb_node rb_node; /* node in the rb tree */
83 atomic_t ref; 90 atomic_t ref;
84 struct rw_semaphore consumer_rwsem; 91 struct rw_semaphore consumer_rwsem;
92 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
85 struct list_head pending_list; 93 struct list_head pending_list;
86 struct uprobe_consumer *consumers; 94 struct uprobe_consumer *consumers;
87 struct inode *inode; /* Also hold a ref to inode */ 95 struct inode *inode; /* Also hold a ref to inode */
88 loff_t offset; 96 loff_t offset;
89 int flags; 97 unsigned long flags;
90 struct arch_uprobe arch; 98 struct arch_uprobe arch;
91}; 99};
92 100
@@ -100,17 +108,12 @@ struct uprobe {
100 */ 108 */
101static bool valid_vma(struct vm_area_struct *vma, bool is_register) 109static bool valid_vma(struct vm_area_struct *vma, bool is_register)
102{ 110{
103 if (!vma->vm_file) 111 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
104 return false;
105
106 if (!is_register)
107 return true;
108 112
109 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) 113 if (is_register)
110 == (VM_READ|VM_EXEC)) 114 flags |= VM_WRITE;
111 return true;
112 115
113 return false; 116 return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
114} 117}
115 118
116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) 119static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
@@ -141,10 +144,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
141 spinlock_t *ptl; 144 spinlock_t *ptl;
142 pte_t *ptep; 145 pte_t *ptep;
143 int err; 146 int err;
147 /* For mmu_notifiers */
148 const unsigned long mmun_start = addr;
149 const unsigned long mmun_end = addr + PAGE_SIZE;
144 150
145 /* For try_to_free_swap() and munlock_vma_page() below */ 151 /* For try_to_free_swap() and munlock_vma_page() below */
146 lock_page(page); 152 lock_page(page);
147 153
154 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
148 err = -EAGAIN; 155 err = -EAGAIN;
149 ptep = page_check_address(page, mm, addr, &ptl, 0); 156 ptep = page_check_address(page, mm, addr, &ptl, 0);
150 if (!ptep) 157 if (!ptep)
@@ -173,6 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
173 180
174 err = 0; 181 err = 0;
175 unlock: 182 unlock:
183 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
176 unlock_page(page); 184 unlock_page(page);
177 return err; 185 return err;
178} 186}
@@ -188,19 +196,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
188 return *insn == UPROBE_SWBP_INSN; 196 return *insn == UPROBE_SWBP_INSN;
189} 197}
190 198
199static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
200{
201 void *kaddr = kmap_atomic(page);
202 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
203 kunmap_atomic(kaddr);
204}
205
206static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
207{
208 uprobe_opcode_t old_opcode;
209 bool is_swbp;
210
211 copy_opcode(page, vaddr, &old_opcode);
212 is_swbp = is_swbp_insn(&old_opcode);
213
214 if (is_swbp_insn(new_opcode)) {
215 if (is_swbp) /* register: already installed? */
216 return 0;
217 } else {
218 if (!is_swbp) /* unregister: was it changed by us? */
219 return 0;
220 }
221
222 return 1;
223}
224
191/* 225/*
192 * NOTE: 226 * NOTE:
193 * Expect the breakpoint instruction to be the smallest size instruction for 227 * Expect the breakpoint instruction to be the smallest size instruction for
194 * the architecture. If an arch has variable length instruction and the 228 * the architecture. If an arch has variable length instruction and the
195 * breakpoint instruction is not of the smallest length instruction 229 * breakpoint instruction is not of the smallest length instruction
196 * supported by that architecture then we need to modify read_opcode / 230 * supported by that architecture then we need to modify is_swbp_at_addr and
197 * write_opcode accordingly. This would never be a problem for archs that 231 * write_opcode accordingly. This would never be a problem for archs that
198 * have fixed length instructions. 232 * have fixed length instructions.
199 */ 233 */
200 234
201/* 235/*
202 * write_opcode - write the opcode at a given virtual address. 236 * write_opcode - write the opcode at a given virtual address.
203 * @auprobe: arch breakpointing information.
204 * @mm: the probed process address space. 237 * @mm: the probed process address space.
205 * @vaddr: the virtual address to store the opcode. 238 * @vaddr: the virtual address to store the opcode.
206 * @opcode: opcode to be written at @vaddr. 239 * @opcode: opcode to be written at @vaddr.
@@ -211,8 +244,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
211 * For mm @mm, write the opcode at @vaddr. 244 * For mm @mm, write the opcode at @vaddr.
212 * Return 0 (success) or a negative errno. 245 * Return 0 (success) or a negative errno.
213 */ 246 */
214static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, 247static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
215 unsigned long vaddr, uprobe_opcode_t opcode) 248 uprobe_opcode_t opcode)
216{ 249{
217 struct page *old_page, *new_page; 250 struct page *old_page, *new_page;
218 void *vaddr_old, *vaddr_new; 251 void *vaddr_old, *vaddr_new;
@@ -221,10 +254,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
221 254
222retry: 255retry:
223 /* Read the page with vaddr into memory */ 256 /* Read the page with vaddr into memory */
224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 257 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
225 if (ret <= 0) 258 if (ret <= 0)
226 return ret; 259 return ret;
227 260
261 ret = verify_opcode(old_page, vaddr, &opcode);
262 if (ret <= 0)
263 goto put_old;
264
228 ret = -ENOMEM; 265 ret = -ENOMEM;
229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 266 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
230 if (!new_page) 267 if (!new_page)
@@ -259,63 +296,6 @@ put_old:
259} 296}
260 297
261/** 298/**
262 * read_opcode - read the opcode at a given virtual address.
263 * @mm: the probed process address space.
264 * @vaddr: the virtual address to read the opcode.
265 * @opcode: location to store the read opcode.
266 *
267 * Called with mm->mmap_sem held (for read and with a reference to
268 * mm.
269 *
270 * For mm @mm, read the opcode at @vaddr and store it in @opcode.
271 * Return 0 (success) or a negative errno.
272 */
273static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
274{
275 struct page *page;
276 void *vaddr_new;
277 int ret;
278
279 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
280 if (ret <= 0)
281 return ret;
282
283 vaddr_new = kmap_atomic(page);
284 vaddr &= ~PAGE_MASK;
285 memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
286 kunmap_atomic(vaddr_new);
287
288 put_page(page);
289
290 return 0;
291}
292
293static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
294{
295 uprobe_opcode_t opcode;
296 int result;
297
298 if (current->mm == mm) {
299 pagefault_disable();
300 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
301 sizeof(opcode));
302 pagefault_enable();
303
304 if (likely(result == 0))
305 goto out;
306 }
307
308 result = read_opcode(mm, vaddr, &opcode);
309 if (result)
310 return result;
311out:
312 if (is_swbp_insn(&opcode))
313 return 1;
314
315 return 0;
316}
317
318/**
319 * set_swbp - store breakpoint at a given address. 299 * set_swbp - store breakpoint at a given address.
320 * @auprobe: arch specific probepoint information. 300 * @auprobe: arch specific probepoint information.
321 * @mm: the probed process address space. 301 * @mm: the probed process address space.
@@ -326,18 +306,7 @@ out:
326 */ 306 */
327int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 307int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
328{ 308{
329 int result; 309 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
330 /*
331 * See the comment near uprobes_hash().
332 */
333 result = is_swbp_at_addr(mm, vaddr);
334 if (result == 1)
335 return 0;
336
337 if (result)
338 return result;
339
340 return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
341} 310}
342 311
343/** 312/**
@@ -352,16 +321,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
352int __weak 321int __weak
353set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 322set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
354{ 323{
355 int result; 324 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
356
357 result = is_swbp_at_addr(mm, vaddr);
358 if (!result)
359 return -EINVAL;
360
361 if (result != 1)
362 return result;
363
364 return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
365} 325}
366 326
367static int match_uprobe(struct uprobe *l, struct uprobe *r) 327static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -468,7 +428,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
468 spin_unlock(&uprobes_treelock); 428 spin_unlock(&uprobes_treelock);
469 429
470 /* For now assume that the instruction need not be single-stepped */ 430 /* For now assume that the instruction need not be single-stepped */
471 uprobe->flags |= UPROBE_SKIP_SSTEP; 431 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
472 432
473 return u; 433 return u;
474} 434}
@@ -490,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
490 uprobe->inode = igrab(inode); 450 uprobe->inode = igrab(inode);
491 uprobe->offset = offset; 451 uprobe->offset = offset;
492 init_rwsem(&uprobe->consumer_rwsem); 452 init_rwsem(&uprobe->consumer_rwsem);
453 mutex_init(&uprobe->copy_mutex);
493 454
494 /* add to uprobes_tree, sorted on inode:offset */ 455 /* add to uprobes_tree, sorted on inode:offset */
495 cur_uprobe = insert_uprobe(uprobe); 456 cur_uprobe = insert_uprobe(uprobe);
@@ -510,7 +471,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
510{ 471{
511 struct uprobe_consumer *uc; 472 struct uprobe_consumer *uc;
512 473
513 if (!(uprobe->flags & UPROBE_RUN_HANDLER)) 474 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
514 return; 475 return;
515 476
516 down_read(&uprobe->consumer_rwsem); 477 down_read(&uprobe->consumer_rwsem);
@@ -616,29 +577,43 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
616 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 577 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
617} 578}
618 579
619/* 580static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
620 * How mm->uprobes_state.count gets updated 581 struct mm_struct *mm, unsigned long vaddr)
621 * uprobe_mmap() increments the count if 582{
622 * - it successfully adds a breakpoint. 583 int ret = 0;
623 * - it cannot add a breakpoint, but sees that there is a underlying 584
624 * breakpoint (via a is_swbp_at_addr()). 585 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
625 * 586 return ret;
626 * uprobe_munmap() decrements the count if 587
627 * - it sees a underlying breakpoint, (via is_swbp_at_addr) 588 mutex_lock(&uprobe->copy_mutex);
628 * (Subsequent uprobe_unregister wouldnt find the breakpoint 589 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
629 * unless a uprobe_mmap kicks in, since the old vma would be 590 goto out;
630 * dropped just after uprobe_munmap.) 591
631 * 592 ret = copy_insn(uprobe, file);
632 * uprobe_register increments the count if: 593 if (ret)
633 * - it successfully adds a breakpoint. 594 goto out;
634 * 595
635 * uprobe_unregister decrements the count if: 596 ret = -ENOTSUPP;
636 * - it sees a underlying breakpoint and removes successfully. 597 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
637 * (via is_swbp_at_addr) 598 goto out;
638 * (Subsequent uprobe_munmap wouldnt find the breakpoint 599
639 * since there is no underlying breakpoint after the 600 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
640 * breakpoint removal.) 601 if (ret)
641 */ 602 goto out;
603
604 /* write_opcode() assumes we don't cross page boundary */
605 BUG_ON((uprobe->offset & ~PAGE_MASK) +
606 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
607
608 smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
609 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
610
611 out:
612 mutex_unlock(&uprobe->copy_mutex);
613
614 return ret;
615}
616
642static int 617static int
643install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 618install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
644 struct vm_area_struct *vma, unsigned long vaddr) 619 struct vm_area_struct *vma, unsigned long vaddr)
@@ -656,24 +631,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
656 if (!uprobe->consumers) 631 if (!uprobe->consumers)
657 return 0; 632 return 0;
658 633
659 if (!(uprobe->flags & UPROBE_COPY_INSN)) { 634 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
660 ret = copy_insn(uprobe, vma->vm_file); 635 if (ret)
661 if (ret) 636 return ret;
662 return ret;
663
664 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
665 return -ENOTSUPP;
666
667 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
668 if (ret)
669 return ret;
670
671 /* write_opcode() assumes we don't cross page boundary */
672 BUG_ON((uprobe->offset & ~PAGE_MASK) +
673 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
674
675 uprobe->flags |= UPROBE_COPY_INSN;
676 }
677 637
678 /* 638 /*
679 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), 639 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
@@ -692,15 +652,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
692 return ret; 652 return ret;
693} 653}
694 654
695static void 655static int
696remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 656remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
697{ 657{
698 /* can happen if uprobe_register() fails */ 658 /* can happen if uprobe_register() fails */
699 if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) 659 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
700 return; 660 return 0;
701 661
702 set_bit(MMF_RECALC_UPROBES, &mm->flags); 662 set_bit(MMF_RECALC_UPROBES, &mm->flags);
703 set_orig_insn(&uprobe->arch, mm, vaddr); 663 return set_orig_insn(&uprobe->arch, mm, vaddr);
704} 664}
705 665
706/* 666/*
@@ -735,7 +695,6 @@ static struct map_info *
735build_map_info(struct address_space *mapping, loff_t offset, bool is_register) 695build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
736{ 696{
737 unsigned long pgoff = offset >> PAGE_SHIFT; 697 unsigned long pgoff = offset >> PAGE_SHIFT;
738 struct prio_tree_iter iter;
739 struct vm_area_struct *vma; 698 struct vm_area_struct *vma;
740 struct map_info *curr = NULL; 699 struct map_info *curr = NULL;
741 struct map_info *prev = NULL; 700 struct map_info *prev = NULL;
@@ -744,7 +703,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
744 703
745 again: 704 again:
746 mutex_lock(&mapping->i_mmap_mutex); 705 mutex_lock(&mapping->i_mmap_mutex);
747 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 706 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
748 if (!valid_vma(vma, is_register)) 707 if (!valid_vma(vma, is_register))
749 continue; 708 continue;
750 709
@@ -816,7 +775,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
816 struct mm_struct *mm = info->mm; 775 struct mm_struct *mm = info->mm;
817 struct vm_area_struct *vma; 776 struct vm_area_struct *vma;
818 777
819 if (err) 778 if (err && is_register)
820 goto free; 779 goto free;
821 780
822 down_write(&mm->mmap_sem); 781 down_write(&mm->mmap_sem);
@@ -832,7 +791,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
832 if (is_register) 791 if (is_register)
833 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 792 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
834 else 793 else
835 remove_breakpoint(uprobe, mm, info->vaddr); 794 err |= remove_breakpoint(uprobe, mm, info->vaddr);
836 795
837 unlock: 796 unlock:
838 up_write(&mm->mmap_sem); 797 up_write(&mm->mmap_sem);
@@ -889,13 +848,15 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
889 mutex_lock(uprobes_hash(inode)); 848 mutex_lock(uprobes_hash(inode));
890 uprobe = alloc_uprobe(inode, offset); 849 uprobe = alloc_uprobe(inode, offset);
891 850
892 if (uprobe && !consumer_add(uprobe, uc)) { 851 if (!uprobe) {
852 ret = -ENOMEM;
853 } else if (!consumer_add(uprobe, uc)) {
893 ret = __uprobe_register(uprobe); 854 ret = __uprobe_register(uprobe);
894 if (ret) { 855 if (ret) {
895 uprobe->consumers = NULL; 856 uprobe->consumers = NULL;
896 __uprobe_unregister(uprobe); 857 __uprobe_unregister(uprobe);
897 } else { 858 } else {
898 uprobe->flags |= UPROBE_RUN_HANDLER; 859 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
899 } 860 }
900 } 861 }
901 862
@@ -928,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
928 if (consumer_del(uprobe, uc)) { 889 if (consumer_del(uprobe, uc)) {
929 if (!uprobe->consumers) { 890 if (!uprobe->consumers) {
930 __uprobe_unregister(uprobe); 891 __uprobe_unregister(uprobe);
931 uprobe->flags &= ~UPROBE_RUN_HANDLER; 892 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
932 } 893 }
933 } 894 }
934 895
@@ -1389,10 +1350,11 @@ bool uprobe_deny_signal(void)
1389 */ 1350 */
1390static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) 1351static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1391{ 1352{
1392 if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) 1353 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1393 return true; 1354 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1394 1355 return true;
1395 uprobe->flags &= ~UPROBE_SKIP_SSTEP; 1356 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1357 }
1396 return false; 1358 return false;
1397} 1359}
1398 1360
@@ -1415,6 +1377,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1415 clear_bit(MMF_HAS_UPROBES, &mm->flags); 1377 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1416} 1378}
1417 1379
1380static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1381{
1382 struct page *page;
1383 uprobe_opcode_t opcode;
1384 int result;
1385
1386 pagefault_disable();
1387 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1388 sizeof(opcode));
1389 pagefault_enable();
1390
1391 if (likely(result == 0))
1392 goto out;
1393
1394 result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1395 if (result < 0)
1396 return result;
1397
1398 copy_opcode(page, vaddr, &opcode);
1399 put_page(page);
1400 out:
1401 return is_swbp_insn(&opcode);
1402}
1403
1418static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1404static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1419{ 1405{
1420 struct mm_struct *mm = current->mm; 1406 struct mm_struct *mm = current->mm;
@@ -1485,38 +1471,41 @@ static void handle_swbp(struct pt_regs *regs)
1485 } 1471 }
1486 return; 1472 return;
1487 } 1473 }
1474 /*
1475 * TODO: move copy_insn/etc into _register and remove this hack.
1476 * After we hit the bp, _unregister + _register can install the
1477 * new and not-yet-analyzed uprobe at the same address, restart.
1478 */
1479 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1480 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1481 goto restart;
1488 1482
1489 utask = current->utask; 1483 utask = current->utask;
1490 if (!utask) { 1484 if (!utask) {
1491 utask = add_utask(); 1485 utask = add_utask();
1492 /* Cannot allocate; re-execute the instruction. */ 1486 /* Cannot allocate; re-execute the instruction. */
1493 if (!utask) 1487 if (!utask)
1494 goto cleanup_ret; 1488 goto restart;
1495 } 1489 }
1496 utask->active_uprobe = uprobe; 1490
1497 handler_chain(uprobe, regs); 1491 handler_chain(uprobe, regs);
1498 if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) 1492 if (can_skip_sstep(uprobe, regs))
1499 goto cleanup_ret; 1493 goto out;
1500 1494
1501 utask->state = UTASK_SSTEP;
1502 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1495 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1503 arch_uprobe_enable_step(&uprobe->arch); 1496 arch_uprobe_enable_step(&uprobe->arch);
1497 utask->active_uprobe = uprobe;
1498 utask->state = UTASK_SSTEP;
1504 return; 1499 return;
1505 } 1500 }
1506 1501
1507cleanup_ret: 1502restart:
1508 if (utask) { 1503 /*
1509 utask->active_uprobe = NULL; 1504 * cannot singlestep; cannot skip instruction;
1510 utask->state = UTASK_RUNNING; 1505 * re-execute the instruction.
1511 } 1506 */
1512 if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) 1507 instruction_pointer_set(regs, bp_vaddr);
1513 1508out:
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519
1520 put_uprobe(uprobe); 1509 put_uprobe(uprobe);
1521} 1510}
1522 1511
@@ -1548,13 +1537,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1548} 1537}
1549 1538
1550/* 1539/*
1551 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on 1540 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1552 * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and 1541 * allows the thread to return from interrupt. After that handle_swbp()
1553 * allows the thread to return from interrupt. 1542 * sets utask->active_uprobe.
1554 * 1543 *
1555 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and 1544 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1556 * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from 1545 * and allows the thread to return from interrupt.
1557 * interrupt.
1558 * 1546 *
1559 * While returning to userspace, thread notices the TIF_UPROBE flag and calls 1547 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1560 * uprobe_notify_resume(). 1548 * uprobe_notify_resume().
@@ -1563,11 +1551,13 @@ void uprobe_notify_resume(struct pt_regs *regs)
1563{ 1551{
1564 struct uprobe_task *utask; 1552 struct uprobe_task *utask;
1565 1553
1554 clear_thread_flag(TIF_UPROBE);
1555
1566 utask = current->utask; 1556 utask = current->utask;
1567 if (!utask || utask->state == UTASK_BP_HIT) 1557 if (utask && utask->active_uprobe)
1568 handle_swbp(regs);
1569 else
1570 handle_singlestep(utask, regs); 1558 handle_singlestep(utask, regs);
1559 else
1560 handle_swbp(regs);
1571} 1561}
1572 1562
1573/* 1563/*
@@ -1576,17 +1566,10 @@ void uprobe_notify_resume(struct pt_regs *regs)
1576 */ 1566 */
1577int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1567int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1578{ 1568{
1579 struct uprobe_task *utask;
1580
1581 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) 1569 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
1582 return 0; 1570 return 0;
1583 1571
1584 utask = current->utask;
1585 if (utask)
1586 utask->state = UTASK_BP_HIT;
1587
1588 set_thread_flag(TIF_UPROBE); 1572 set_thread_flag(TIF_UPROBE);
1589
1590 return 1; 1573 return 1;
1591} 1574}
1592 1575
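The uprobes rework moves per-uprobe state from an int manipulated with |= and &= to named bit numbers accessed through test_bit()/set_bit()/clear_bit() on an unsigned long, making the individual flag updates atomic with respect to each other. A userspace approximation of that pattern using GCC/Clang atomic builtins; the kernel helpers have different names and stronger guarantees, so treat this as a sketch only:

#include <stdio.h>

/* Bit numbers, mirroring UPROBE_COPY_INSN / UPROBE_RUN_HANDLER / ... */
enum { COPY_INSN = 0, RUN_HANDLER = 1, SKIP_SSTEP = 2 };

static unsigned long flags;

static void set_bit_sketch(int nr, unsigned long *addr)
{
        __atomic_fetch_or(addr, 1UL << nr, __ATOMIC_SEQ_CST);
}

static void clear_bit_sketch(int nr, unsigned long *addr)
{
        __atomic_fetch_and(addr, ~(1UL << nr), __ATOMIC_SEQ_CST);
}

static int test_bit_sketch(int nr, const unsigned long *addr)
{
        return (__atomic_load_n(addr, __ATOMIC_SEQ_CST) >> nr) & 1;
}

int main(void)
{
        set_bit_sketch(SKIP_SSTEP, &flags);     /* default, as in insert_uprobe() */
        set_bit_sketch(RUN_HANDLER, &flags);

        if (test_bit_sketch(RUN_HANDLER, &flags))
                printf("handlers enabled\n");

        clear_bit_sketch(RUN_HANDLER, &flags);
        printf("flags now %#lx\n", flags);
        return 0;
}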
diff --git a/kernel/exit.c b/kernel/exit.c
index f65345f9e5bb..346616c0092c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -457,108 +457,13 @@ void daemonize(const char *name, ...)
457 /* Become as one with the init task */ 457 /* Become as one with the init task */
458 458
459 daemonize_fs_struct(); 459 daemonize_fs_struct();
460 exit_files(current); 460 daemonize_descriptors();
461 current->files = init_task.files;
462 atomic_inc(&current->files->count);
463 461
464 reparent_to_kthreadd(); 462 reparent_to_kthreadd();
465} 463}
466 464
467EXPORT_SYMBOL(daemonize); 465EXPORT_SYMBOL(daemonize);
468 466
469static void close_files(struct files_struct * files)
470{
471 int i, j;
472 struct fdtable *fdt;
473
474 j = 0;
475
476 /*
477 * It is safe to dereference the fd table without RCU or
478 * ->file_lock because this is the last reference to the
479 * files structure. But use RCU to shut RCU-lockdep up.
480 */
481 rcu_read_lock();
482 fdt = files_fdtable(files);
483 rcu_read_unlock();
484 for (;;) {
485 unsigned long set;
486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds)
488 break;
489 set = fdt->open_fds[j++];
490 while (set) {
491 if (set & 1) {
492 struct file * file = xchg(&fdt->fd[i], NULL);
493 if (file) {
494 filp_close(file, files);
495 cond_resched();
496 }
497 }
498 i++;
499 set >>= 1;
500 }
501 }
502}
503
504struct files_struct *get_files_struct(struct task_struct *task)
505{
506 struct files_struct *files;
507
508 task_lock(task);
509 files = task->files;
510 if (files)
511 atomic_inc(&files->count);
512 task_unlock(task);
513
514 return files;
515}
516
517void put_files_struct(struct files_struct *files)
518{
519 struct fdtable *fdt;
520
521 if (atomic_dec_and_test(&files->count)) {
522 close_files(files);
523 /*
524 * Free the fd and fdset arrays if we expanded them.
525 * If the fdtable was embedded, pass files for freeing
526 * at the end of the RCU grace period. Otherwise,
527 * you can free files immediately.
528 */
529 rcu_read_lock();
530 fdt = files_fdtable(files);
531 if (fdt != &files->fdtab)
532 kmem_cache_free(files_cachep, files);
533 free_fdtable(fdt);
534 rcu_read_unlock();
535 }
536}
537
538void reset_files_struct(struct files_struct *files)
539{
540 struct task_struct *tsk = current;
541 struct files_struct *old;
542
543 old = tsk->files;
544 task_lock(tsk);
545 tsk->files = files;
546 task_unlock(tsk);
547 put_files_struct(old);
548}
549
550void exit_files(struct task_struct *tsk)
551{
552 struct files_struct * files = tsk->files;
553
554 if (files) {
555 task_lock(tsk);
556 tsk->files = NULL;
557 task_unlock(tsk);
558 put_files_struct(files);
559 }
560}
561
562#ifdef CONFIG_MM_OWNER 467#ifdef CONFIG_MM_OWNER
563/* 468/*
564 * A task is exiting. If it owned this mm, find a new owner for the mm. 469 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -1046,6 +951,9 @@ void do_exit(long code)
1046 if (tsk->splice_pipe) 951 if (tsk->splice_pipe)
1047 __free_pipe_info(tsk->splice_pipe); 952 __free_pipe_info(tsk->splice_pipe);
1048 953
954 if (tsk->task_frag.page)
955 put_page(tsk->task_frag.page);
956
1049 validate_creds_for_do_exit(tsk); 957 validate_creds_for_do_exit(tsk);
1050 958
1051 preempt_disable(); 959 preempt_disable();
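The block removed from exit.c above (close_files() and its helpers now live with the fd-table code) walked the open-fd bitmap one long at a time and closed every descriptor whose bit was set. A small userspace sketch of that bitmap walk, simplified: no RCU, no fdtable indirection, printf in place of filp_close():

#include <stdio.h>

#define BITS_PER_LONG_SKETCH (8 * (int)sizeof(unsigned long))
#define MAX_FDS 128

static unsigned long open_fds[MAX_FDS / BITS_PER_LONG_SKETCH];

static void mark_open(int fd)
{
        open_fds[fd / BITS_PER_LONG_SKETCH] |= 1UL << (fd % BITS_PER_LONG_SKETCH);
}

/* Walk the bitmap word by word, acting on each set bit, as close_files() did. */
static void close_all(void)
{
        int j = 0;

        for (;;) {
                int i = j * BITS_PER_LONG_SKETCH;
                unsigned long set;

                if (i >= MAX_FDS)
                        break;
                set = open_fds[j++];
                while (set) {
                        if (set & 1)
                                printf("closing fd %d\n", i);
                        i++;
                        set >>= 1;
                }
        }
}

int main(void)
{
        mark_open(0);
        mark_open(3);
        mark_open(65);
        close_all();
        return 0;
}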
diff --git a/kernel/fork.c b/kernel/fork.c
index 5a0e74d89a5a..8b20ab7d3aa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330 tsk->btrace_seq = 0; 330 tsk->btrace_seq = 0;
331#endif 331#endif
332 tsk->splice_pipe = NULL; 332 tsk->splice_pipe = NULL;
333 tsk->task_frag.page = NULL;
333 334
334 account_kernel_stack(ti, 1); 335 account_kernel_stack(ti, 1);
335 336
@@ -422,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
422 mapping->i_mmap_writable++; 423 mapping->i_mmap_writable++;
423 flush_dcache_mmap_lock(mapping); 424 flush_dcache_mmap_lock(mapping);
424 /* insert tmp into the share list, just after mpnt */ 425 /* insert tmp into the share list, just after mpnt */
425 vma_prio_tree_add(tmp, mpnt); 426 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
427 vma_nonlinear_insert(tmp,
428 &mapping->i_mmap_nonlinear);
429 else
430 vma_interval_tree_insert_after(tmp, mpnt,
431 &mapping->i_mmap);
426 flush_dcache_mmap_unlock(mapping); 432 flush_dcache_mmap_unlock(mapping);
427 mutex_unlock(&mapping->i_mmap_mutex); 433 mutex_unlock(&mapping->i_mmap_mutex);
428 } 434 }
@@ -621,26 +627,6 @@ void mmput(struct mm_struct *mm)
621} 627}
622EXPORT_SYMBOL_GPL(mmput); 628EXPORT_SYMBOL_GPL(mmput);
623 629
624/*
625 * We added or removed a vma mapping the executable. The vmas are only mapped
626 * during exec and are not mapped with the mmap system call.
627 * Callers must hold down_write() on the mm's mmap_sem for these
628 */
629void added_exe_file_vma(struct mm_struct *mm)
630{
631 mm->num_exe_file_vmas++;
632}
633
634void removed_exe_file_vma(struct mm_struct *mm)
635{
636 mm->num_exe_file_vmas--;
637 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
638 fput(mm->exe_file);
639 mm->exe_file = NULL;
640 }
641
642}
643
644void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 630void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
645{ 631{
646 if (new_exe_file) 632 if (new_exe_file)
@@ -648,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
648 if (mm->exe_file) 634 if (mm->exe_file)
649 fput(mm->exe_file); 635 fput(mm->exe_file);
650 mm->exe_file = new_exe_file; 636 mm->exe_file = new_exe_file;
651 mm->num_exe_file_vmas = 0;
652} 637}
653 638
654struct file *get_mm_exe_file(struct mm_struct *mm) 639struct file *get_mm_exe_file(struct mm_struct *mm)
655{ 640{
656 struct file *exe_file; 641 struct file *exe_file;
657 642
658 /* We need mmap_sem to protect against races with removal of 643 /* We need mmap_sem to protect against races with removal of exe_file */
659 * VM_EXECUTABLE vmas */
660 down_read(&mm->mmap_sem); 644 down_read(&mm->mmap_sem);
661 exe_file = mm->exe_file; 645 exe_file = mm->exe_file;
662 if (exe_file) 646 if (exe_file)
@@ -1077,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1077 init_rwsem(&sig->group_rwsem); 1061 init_rwsem(&sig->group_rwsem);
1078#endif 1062#endif
1079 1063
1080 sig->oom_adj = current->signal->oom_adj;
1081 sig->oom_score_adj = current->signal->oom_score_adj; 1064 sig->oom_score_adj = current->signal->oom_score_adj;
1082 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1065 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1083 1066
@@ -1601,7 +1584,7 @@ long do_fork(unsigned long clone_flags,
1601 * requested, no event is reported; otherwise, report if the event 1584 * requested, no event is reported; otherwise, report if the event
1602 * for the type of forking is enabled. 1585 * for the type of forking is enabled.
1603 */ 1586 */
1604 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { 1587 if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {
1605 if (clone_flags & CLONE_VFORK) 1588 if (clone_flags & CLONE_VFORK)
1606 trace = PTRACE_EVENT_VFORK; 1589 trace = PTRACE_EVENT_VFORK;
1607 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1590 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1651,6 +1634,17 @@ long do_fork(unsigned long clone_flags,
1651 return nr; 1634 return nr;
1652} 1635}
1653 1636
1637#ifdef CONFIG_GENERIC_KERNEL_THREAD
1638/*
1639 * Create a kernel thread.
1640 */
1641pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1642{
1643 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL,
1644 (unsigned long)arg, NULL, NULL);
1645}
1646#endif
1647
1654#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1648#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1655#define ARCH_MIN_MMSTRUCT_ALIGN 0 1649#define ARCH_MIN_MMSTRUCT_ALIGN 0
1656#endif 1650#endif
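
Under CONFIG_GENERIC_KERNEL_THREAD the new kernel_thread() simply funnels the function and argument through do_fork() with CLONE_VM | CLONE_UNTRACED. The following sketch of a built-in caller is purely illustrative (all names are invented, and kthread_run() remains the preferred interface for drivers); it only assumes the signature added in this hunk.

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>

/* Hypothetical built-in user of kernel_thread(); the thread function's
 * return value becomes the thread's exit code. */
static int demo_thread_fn(void *data)
{
	pr_info("demo thread pid %d, arg \"%s\"\n",
		task_pid_nr(current), (const char *)data);
	return 0;
}

static int __init demo_thread_init(void)
{
	pid_t pid = kernel_thread(demo_thread_fn, "hello", CLONE_FS | CLONE_FILES);

	return pid < 0 ? pid : 0;
}
early_initcall(demo_thread_init);
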
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 49a77727db42..4e69e24d3d7d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
148 * @host_data: Controller private data pointer 148 * @host_data: Controller private data pointer
149 * 149 *
150 * Allocates a legacy irq_domain if irq_base is positive or a linear 150 * Allocates a legacy irq_domain if irq_base is positive or a linear
151 * domain otherwise. 151 * domain otherwise. For the legacy domain, IRQ descriptors will also
152 * be allocated.
152 * 153 *
153 * This is intended to implement the expected behaviour for most 154 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should 155 * interrupt controllers which is that a linear mapping should
@@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
162 const struct irq_domain_ops *ops, 163 const struct irq_domain_ops *ops,
163 void *host_data) 164 void *host_data)
164{ 165{
165 if (first_irq > 0) 166 if (first_irq > 0) {
166 return irq_domain_add_legacy(of_node, size, first_irq, 0, 167 int irq_base;
168
169 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
170 /*
171 * Set the descriptor allocator to search for a
172 * 1-to-1 mapping, such as irq_alloc_desc_at().
173 * Use of_node_to_nid() which is defined to
174 * numa_node_id() on platforms that have no custom
175 * implementation.
176 */
177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node));
179 if (irq_base < 0) {
180 WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq);
182 irq_base = first_irq;
183 }
184 } else
185 irq_base = first_irq;
186
187 return irq_domain_add_legacy(of_node, size, irq_base, 0,
167 ops, host_data); 188 ops, host_data);
168 else 189 }
169 return irq_domain_add_linear(of_node, size, ops, host_data); 190
191 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data);
170} 193}
171 194
172/** 195/**
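
With this change a driver can hand irq_domain_add_simple() a non-zero first_irq and rely on it to allocate the legacy descriptors itself under SPARSE_IRQ. A rough usage sketch from a hypothetical interrupt-controller driver follows; the demo_* names are assumptions, and dummy_irq_chip merely stands in for a real chip implementation.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int demo_irq_map(struct irq_domain *d, unsigned int virq,
			irq_hw_number_t hw)
{
	/* hypothetical chip: level-triggered, handled by handle_level_irq */
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops demo_irq_ops = {
	.map	= demo_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};

/* first_irq > 0: legacy domain, descriptors allocated as in the hunk above;
 * first_irq == 0: plain linear domain. */
static struct irq_domain *demo_setup_domain(struct device_node *np,
					    unsigned int nr_irqs,
					    unsigned int first_irq)
{
	return irq_domain_add_simple(np, nr_irqs, first_irq,
				     &demo_irq_ops, NULL);
}
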
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 43049192b5ec..60f48fa0fd0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
118 key->timeout = rl; 118 key->timeout = rl;
119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); 119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
120} 120}
121EXPORT_SYMBOL_GPL(jump_label_rate_limit);
121 122
122static int addr_conflict(struct jump_entry *entry, void *start, void *end) 123static int addr_conflict(struct jump_entry *entry, void *start, void *end)
123{ 124{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0668d58d6413..5e4bd7864c5d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,6 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 24#include <linux/utsname.h>
26#include <linux/numa.h> 25#include <linux/numa.h>
27#include <linux/suspend.h> 26#include <linux/suspend.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6f99aead66c6..1c317e386831 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,7 @@
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h>
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41 42
42#include <trace/events/module.h> 43#include <trace/events/module.h>
@@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data)
221 retval = kernel_execve(sub_info->path, 222 retval = kernel_execve(sub_info->path,
222 (const char *const *)sub_info->argv, 223 (const char *const *)sub_info->argv,
223 (const char *const *)sub_info->envp); 224 (const char *const *)sub_info->envp);
225 if (!retval)
226 return 0;
224 227
225 /* Exec failed? */ 228 /* Exec failed? */
226fail: 229fail:
227 sub_info->retval = retval; 230 sub_info->retval = retval;
228 return 0; 231 do_exit(0);
229} 232}
230 233
231static int call_helper(void *data) 234static int call_helper(void *data)
@@ -292,7 +295,7 @@ static int wait_for_helper(void *data)
292 } 295 }
293 296
294 umh_complete(sub_info); 297 umh_complete(sub_info);
295 return 0; 298 do_exit(0);
296} 299}
297 300
298/* This is run by khelper thread */ 301/* This is run by khelper thread */
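
The helper threads above now terminate with do_exit(0) instead of returning, so a stale return value can no longer be mistaken for the exec result. For context, a hedged sketch of how a caller typically drives this path; the helper path and arguments are purely illustrative.

#include <linux/kmod.h>

/* Illustrative call: run /sbin/demo-helper and wait only until exec has
 * succeeded or failed (UMH_WAIT_EXEC), not until the program exits. */
static int demo_run_helper(void)
{
	char *argv[] = { "/sbin/demo-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}
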
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 146a6fa96825..29fb60caecb5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20 21
21static DEFINE_SPINLOCK(kthread_create_lock); 22static DEFINE_SPINLOCK(kthread_create_lock);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
new file mode 100644
index 000000000000..4646eb2c3820
--- /dev/null
+++ b/kernel/modsign_pubkey.c
@@ -0,0 +1,113 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n"
24 "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n"
27 "modsign_certificate_list_end:"
28 );
29
30/*
31 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
32 * if modsign.pub changes.
33 */
34static __initdata const char annoy_ccache[] = __TIME__ "foo";
35
36/*
37 * Load the compiled-in keys
38 */
39static __init int module_verify_init(void)
40{
41 pr_notice("Initialise module verification\n");
42
43 modsign_keyring = key_alloc(&key_type_keyring, ".module_sign",
44 KUIDT_INIT(0), KGIDT_INIT(0),
45 current_cred(),
46 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
47 KEY_USR_VIEW | KEY_USR_READ,
48 KEY_ALLOC_NOT_IN_QUOTA);
49 if (IS_ERR(modsign_keyring))
50 panic("Can't allocate module signing keyring\n");
51
52 if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
53 panic("Can't instantiate module signing keyring\n");
54
55 return 0;
56}
57
58/*
59 * Must be initialised before we try and load the keys into the keyring.
60 */
61device_initcall(module_verify_init);
62
63/*
64 * Load the compiled-in keys
65 */
66static __init int load_module_signing_keys(void)
67{
68 key_ref_t key;
69 const u8 *p, *end;
70 size_t plen;
71
72 pr_notice("Loading module verification certificates\n");
73
74 end = modsign_certificate_list_end;
75 p = modsign_certificate_list;
76 while (p < end) {
77 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
78 * than 256 bytes in size.
79 */
80 if (end - p < 4)
81 goto dodgy_cert;
82 if (p[0] != 0x30 &&
83 p[1] != 0x82)
84 goto dodgy_cert;
85 plen = (p[2] << 8) | p[3];
86 plen += 4;
87 if (plen > end - p)
88 goto dodgy_cert;
89
90 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
91 "asymmetric",
92 NULL,
93 p,
94 plen,
95 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
96 KEY_USR_VIEW,
97 KEY_ALLOC_NOT_IN_QUOTA);
98 if (IS_ERR(key))
99 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
100 PTR_ERR(key));
101 else
102 pr_notice("MODSIGN: Loaded cert '%s'\n",
103 key_ref_to_ptr(key)->description);
104 p += plen;
105 }
106
107 return 0;
108
109dodgy_cert:
110 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
111 return 0;
112}
113late_initcall(load_module_signing_keys);
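
load_module_signing_keys() walks the concatenated DER blob by reading each certificate's SEQUENCE header: tag 0x30, long-form length byte 0x82, then a big-endian 16-bit length, so each certificate occupies length + 4 bytes. A standalone userspace sketch of the same walk over a file of concatenated certificates, purely illustrative (it rejects the blob unless both header bytes match):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Walk a blob of concatenated DER certificates and print each one's size,
 * mirroring the header arithmetic in load_module_signing_keys(). */
static int walk_certs(const unsigned char *p, size_t len)
{
	int n = 0;

	while (len) {
		size_t plen;

		if (len < 4 || p[0] != 0x30 || p[1] != 0x82)
			return -1;		/* not a long-form SEQUENCE */
		plen = ((size_t)p[2] << 8 | p[3]) + 4;
		if (plen > len)
			return -1;		/* truncated certificate */
		printf("cert %d: %zu bytes\n", ++n, plen);
		p += plen;
		len -= plen;
	}
	return n;
}

int main(int argc, char **argv)
{
	FILE *f;
	unsigned char *buf;
	long size;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 1;
	fseek(f, 0, SEEK_END);
	size = ftell(f);
	rewind(f);
	buf = malloc(size);
	if (!buf || fread(buf, 1, size, f) != (size_t)size)
		return 1;
	printf("%d certificate(s)\n", walk_certs(buf, size));
	free(buf);
	fclose(f);
	return 0;
}
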
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 000000000000..24f9247b7d02
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,14 @@
1/* Module internals
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 4edbd9c11aca..6085f5ef88ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -58,6 +58,8 @@
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h> 60#include <linux/bsearch.h>
61#include <linux/fips.h>
62#include "module-internal.h"
61 63
62#define CREATE_TRACE_POINTS 64#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 65#include <trace/events/module.h>
@@ -102,6 +104,43 @@ static LIST_HEAD(modules);
102struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ 104struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
103#endif /* CONFIG_KGDB_KDB */ 105#endif /* CONFIG_KGDB_KDB */
104 106
107#ifdef CONFIG_MODULE_SIG
108#ifdef CONFIG_MODULE_SIG_FORCE
109static bool sig_enforce = true;
110#else
111static bool sig_enforce = false;
112
113static int param_set_bool_enable_only(const char *val,
114 const struct kernel_param *kp)
115{
116 int err;
117 bool test;
118 struct kernel_param dummy_kp = *kp;
119
120 dummy_kp.arg = &test;
121
122 err = param_set_bool(val, &dummy_kp);
123 if (err)
124 return err;
125
126 /* Don't let them unset it once it's set! */
127 if (!test && sig_enforce)
128 return -EROFS;
129
130 if (test)
131 sig_enforce = true;
132 return 0;
133}
134
135static const struct kernel_param_ops param_ops_bool_enable_only = {
136 .set = param_set_bool_enable_only,
137 .get = param_get_bool,
138};
139#define param_check_bool_enable_only param_check_bool
140
141module_param(sig_enforce, bool_enable_only, 0644);
142#endif /* !CONFIG_MODULE_SIG_FORCE */
143#endif /* CONFIG_MODULE_SIG */
105 144
106/* Block module loading/unloading? */ 145/* Block module loading/unloading? */
107int modules_disabled = 0; 146int modules_disabled = 0;
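
The bool_enable_only parameter type lets sig_enforce be switched on at runtime but never switched back off. The same write-once pattern can be reused by any module through module_param_cb(); the sketch below is a hedged illustration with invented demo_* names, not part of this patch.

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/moduleparam.h>

static bool demo_lockdown;

/* Accept "1"/"y" once; reject any attempt to clear the flag afterwards. */
static int demo_set_enable_only(const char *val, const struct kernel_param *kp)
{
	bool new_val;
	int err = strtobool(val, &new_val);

	if (err)
		return err;
	if (!new_val && *(bool *)kp->arg)
		return -EROFS;		/* already enabled: refuse to unset */
	if (new_val)
		*(bool *)kp->arg = true;
	return 0;
}

static const struct kernel_param_ops demo_enable_only_ops = {
	.set = demo_set_enable_only,
	.get = param_get_bool,
};
module_param_cb(demo_lockdown, &demo_enable_only_ops, &demo_lockdown, 0644);
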
@@ -136,6 +175,7 @@ struct load_info {
136 unsigned long symoffs, stroffs; 175 unsigned long symoffs, stroffs;
137 struct _ddebug *debug; 176 struct _ddebug *debug;
138 unsigned int num_debug; 177 unsigned int num_debug;
178 bool sig_ok;
139 struct { 179 struct {
140 unsigned int sym, str, mod, vers, info, pcpu; 180 unsigned int sym, str, mod, vers, info, pcpu;
141 } index; 181 } index;
@@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1949 return ret; 1989 return ret;
1950} 1990}
1951 1991
1952int __weak apply_relocate(Elf_Shdr *sechdrs,
1953 const char *strtab,
1954 unsigned int symindex,
1955 unsigned int relsec,
1956 struct module *me)
1957{
1958 pr_err("module %s: REL relocation unsupported\n", me->name);
1959 return -ENOEXEC;
1960}
1961
1962int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1963 const char *strtab,
1964 unsigned int symindex,
1965 unsigned int relsec,
1966 struct module *me)
1967{
1968 pr_err("module %s: RELA relocation unsupported\n", me->name);
1969 return -ENOEXEC;
1970}
1971
1972static int apply_relocations(struct module *mod, const struct load_info *info) 1992static int apply_relocations(struct module *mod, const struct load_info *info)
1973{ 1993{
1974 unsigned int i; 1994 unsigned int i;
@@ -2399,7 +2419,44 @@ static inline void kmemleak_load_module(const struct module *mod,
2399} 2419}
2400#endif 2420#endif
2401 2421
2402/* Sets info->hdr and info->len. */ 2422#ifdef CONFIG_MODULE_SIG
2423static int module_sig_check(struct load_info *info,
2424 const void *mod, unsigned long *_len)
2425{
2426 int err = -ENOKEY;
2427 unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 unsigned long len = *_len;
2429
2430 if (len > markerlen &&
2431 memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */
2433 *_len -= markerlen;
2434 err = mod_verify_sig(mod, _len);
2435 }
2436
2437 if (!err) {
2438 info->sig_ok = true;
2439 return 0;
2440 }
2441
2442 /* Not having a signature is only an error if we're strict. */
2443 if (err < 0 && fips_enabled)
2444 panic("Module verification failed with error %d in FIPS mode\n",
2445 err);
2446 if (err == -ENOKEY && !sig_enforce)
2447 err = 0;
2448
2449 return err;
2450}
2451#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info,
2453 void *mod, unsigned long *len)
2454{
2455 return 0;
2456}
2457#endif /* !CONFIG_MODULE_SIG */
2458
2459/* Sets info->hdr, info->len and info->sig_ok. */
2403static int copy_and_check(struct load_info *info, 2460static int copy_and_check(struct load_info *info,
2404 const void __user *umod, unsigned long len, 2461 const void __user *umod, unsigned long len,
2405 const char __user *uargs) 2462 const char __user *uargs)
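
module_sig_check() only treats the image as signed when it ends with the MODULE_SIG_STRING marker; everything before the marker, minus the signature block, is what gets verified. A quick userspace check for the marker on a .ko file, illustrative only and assuming MODULE_SIG_STRING is "~Module signature appended~\n":

#include <stdio.h>
#include <string.h>

#define MODULE_SIG_STRING "~Module signature appended~\n"

/* Return 1 if the file ends with the module-signature marker, 0 if not. */
static int has_sig_marker(const char *path)
{
	char tail[sizeof(MODULE_SIG_STRING)];
	size_t marker_len = sizeof(MODULE_SIG_STRING) - 1;
	FILE *f = fopen(path, "rb");

	if (!f)
		return 0;
	if (fseek(f, -(long)marker_len, SEEK_END) ||
	    fread(tail, 1, marker_len, f) != marker_len) {
		fclose(f);
		return 0;
	}
	fclose(f);
	return memcmp(tail, MODULE_SIG_STRING, marker_len) == 0;
}

int main(int argc, char **argv)
{
	if (argc != 2)
		return 2;
	printf("%s: %s\n", argv[1],
	       has_sig_marker(argv[1]) ? "signature marker present"
				       : "no signature marker");
	return 0;
}
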
@@ -2419,6 +2476,10 @@ static int copy_and_check(struct load_info *info,
2419 goto free_hdr; 2476 goto free_hdr;
2420 } 2477 }
2421 2478
2479 err = module_sig_check(info, hdr, &len);
2480 if (err)
2481 goto free_hdr;
2482
2422 /* Sanity checks against insmoding binaries or wrong arch, 2483 /* Sanity checks against insmoding binaries or wrong arch,
2423 weird elf version */ 2484 weird elf version */
2424 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2485 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
@@ -2730,6 +2791,10 @@ static int check_module_license_and_versions(struct module *mod)
2730 if (strcmp(mod->name, "driverloader") == 0) 2791 if (strcmp(mod->name, "driverloader") == 0)
2731 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2792 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2732 2793
2794 /* lve claims to be GPL but upstream won't provide source */
2795 if (strcmp(mod->name, "lve") == 0)
2796 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2797
2733#ifdef CONFIG_MODVERSIONS 2798#ifdef CONFIG_MODVERSIONS
2734 if ((mod->num_syms && !mod->crcs) 2799 if ((mod->num_syms && !mod->crcs)
2735 || (mod->num_gpl_syms && !mod->gpl_crcs) 2800 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2861,6 +2926,20 @@ static int post_relocation(struct module *mod, const struct load_info *info)
2861 return module_finalize(info->hdr, info->sechdrs, mod); 2926 return module_finalize(info->hdr, info->sechdrs, mod);
2862} 2927}
2863 2928
2929/* Is this module of this name done loading? No locks held. */
2930static bool finished_loading(const char *name)
2931{
2932 struct module *mod;
2933 bool ret;
2934
2935 mutex_lock(&module_mutex);
2936 mod = find_module(name);
2937 ret = !mod || mod->state != MODULE_STATE_COMING;
2938 mutex_unlock(&module_mutex);
2939
2940 return ret;
2941}
2942
2864/* Allocate and load the module: note that size of section 0 is always 2943/* Allocate and load the module: note that size of section 0 is always
2865 zero, and we rely on this for optional sections. */ 2944 zero, and we rely on this for optional sections. */
2866static struct module *load_module(void __user *umod, 2945static struct module *load_module(void __user *umod,
@@ -2868,7 +2947,7 @@ static struct module *load_module(void __user *umod,
2868 const char __user *uargs) 2947 const char __user *uargs)
2869{ 2948{
2870 struct load_info info = { NULL, }; 2949 struct load_info info = { NULL, };
2871 struct module *mod; 2950 struct module *mod, *old;
2872 long err; 2951 long err;
2873 2952
2874 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 2953 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2886,6 +2965,12 @@ static struct module *load_module(void __user *umod,
2886 goto free_copy; 2965 goto free_copy;
2887 } 2966 }
2888 2967
2968#ifdef CONFIG_MODULE_SIG
2969 mod->sig_ok = info.sig_ok;
2970 if (!mod->sig_ok)
2971 add_taint_module(mod, TAINT_FORCED_MODULE);
2972#endif
2973
2889 /* Now module is in final location, initialize linked lists, etc. */ 2974 /* Now module is in final location, initialize linked lists, etc. */
2890 err = module_unload_init(mod); 2975 err = module_unload_init(mod);
2891 if (err) 2976 if (err)
@@ -2934,8 +3019,18 @@ static struct module *load_module(void __user *umod,
2934 * function to insert in a way safe to concurrent readers. 3019 * function to insert in a way safe to concurrent readers.
2935 * The mutex protects against concurrent writers. 3020 * The mutex protects against concurrent writers.
2936 */ 3021 */
3022again:
2937 mutex_lock(&module_mutex); 3023 mutex_lock(&module_mutex);
2938 if (find_module(mod->name)) { 3024 if ((old = find_module(mod->name)) != NULL) {
3025 if (old->state == MODULE_STATE_COMING) {
3026 /* Wait in case it fails to load. */
3027 mutex_unlock(&module_mutex);
3028 err = wait_event_interruptible(module_wq,
3029 finished_loading(mod->name));
3030 if (err)
3031 goto free_arch_cleanup;
3032 goto again;
3033 }
2939 err = -EEXIST; 3034 err = -EEXIST;
2940 goto unlock; 3035 goto unlock;
2941 } 3036 }
@@ -2975,7 +3070,7 @@ static struct module *load_module(void __user *umod,
2975 /* Unlink carefully: kallsyms could be walking list. */ 3070 /* Unlink carefully: kallsyms could be walking list. */
2976 list_del_rcu(&mod->list); 3071 list_del_rcu(&mod->list);
2977 module_bug_cleanup(mod); 3072 module_bug_cleanup(mod);
2978 3073 wake_up_all(&module_wq);
2979 ddebug: 3074 ddebug:
2980 dynamic_debug_remove(info.debug); 3075 dynamic_debug_remove(info.debug);
2981 unlock: 3076 unlock:
@@ -3050,7 +3145,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
3050 blocking_notifier_call_chain(&module_notify_list, 3145 blocking_notifier_call_chain(&module_notify_list,
3051 MODULE_STATE_GOING, mod); 3146 MODULE_STATE_GOING, mod);
3052 free_module(mod); 3147 free_module(mod);
3053 wake_up(&module_wq); 3148 wake_up_all(&module_wq);
3054 return ret; 3149 return ret;
3055 } 3150 }
3056 if (ret > 0) { 3151 if (ret > 0) {
@@ -3062,9 +3157,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
3062 dump_stack(); 3157 dump_stack();
3063 } 3158 }
3064 3159
3065 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 3160 /* Now it's a first class citizen! */
3066 mod->state = MODULE_STATE_LIVE; 3161 mod->state = MODULE_STATE_LIVE;
3067 wake_up(&module_wq);
3068 blocking_notifier_call_chain(&module_notify_list, 3162 blocking_notifier_call_chain(&module_notify_list,
3069 MODULE_STATE_LIVE, mod); 3163 MODULE_STATE_LIVE, mod);
3070 3164
@@ -3087,6 +3181,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
3087 mod->init_ro_size = 0; 3181 mod->init_ro_size = 0;
3088 mod->init_text_size = 0; 3182 mod->init_text_size = 0;
3089 mutex_unlock(&module_mutex); 3183 mutex_unlock(&module_mutex);
3184 wake_up_all(&module_wq);
3090 3185
3091 return 0; 3186 return 0;
3092} 3187}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 000000000000..d492a23df99c
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,249 @@
1/* Module signature checker
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19/*
20 * Module signature information block.
21 *
22 * The constituents of the signature section are, in order:
23 *
24 * - Signer's name
25 * - Key identifier
26 * - Signature data
27 * - Information block
28 */
29struct module_signature {
30 enum pkey_algo algo : 8; /* Public-key crypto algorithm */
31 enum pkey_hash_algo hash : 8; /* Digest algorithm */
32 enum pkey_id_type id_type : 8; /* Key identifier type */
33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */
37};
38
39/*
40 * Digest the module contents.
41 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
43 const void *mod,
44 unsigned long modlen)
45{
46 struct public_key_signature *pks;
47 struct crypto_shash *tfm;
48 struct shash_desc *desc;
49 size_t digest_size, desc_size;
50 int ret;
51
52 pr_devel("==>%s()\n", __func__);
53
54 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be.
56 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
58 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60
61 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
62 digest_size = crypto_shash_digestsize(tfm);
63
64 /* We allocate the hash operational data storage on the end of our
65 * context data and the digest output buffer on the end of that.
66 */
67 ret = -ENOMEM;
68 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
69 if (!pks)
70 goto error_no_pks;
71
72 pks->pkey_hash_algo = hash;
73 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
74 pks->digest_size = digest_size;
75
76 desc = (void *)pks + sizeof(*pks);
77 desc->tfm = tfm;
78 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
79
80 ret = crypto_shash_init(desc);
81 if (ret < 0)
82 goto error;
83
84 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
85 if (ret < 0)
86 goto error;
87
88 crypto_free_shash(tfm);
89 pr_devel("<==%s() = ok\n", __func__);
90 return pks;
91
92error:
93 kfree(pks);
94error_no_pks:
95 crypto_free_shash(tfm);
96 pr_devel("<==%s() = %d\n", __func__, ret);
97 return ERR_PTR(ret);
98}
99
100/*
101 * Extract an MPI array from the signature data. This represents the actual
102 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
103 * size of the MPI in bytes.
104 *
105 * RSA signatures only have one MPI, so currently we only read one.
106 */
107static int mod_extract_mpi_array(struct public_key_signature *pks,
108 const void *data, size_t len)
109{
110 size_t nbytes;
111 MPI mpi;
112
113 if (len < 3)
114 return -EBADMSG;
115 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
116 data += 2;
117 len -= 2;
118 if (len != nbytes)
119 return -EBADMSG;
120
121 mpi = mpi_read_raw_data(data, nbytes);
122 if (!mpi)
123 return -ENOMEM;
124 pks->mpi[0] = mpi;
125 pks->nr_mpi = 1;
126 return 0;
127}
128
129/*
130 * Request an asymmetric key.
131 */
132static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
133 const u8 *key_id, size_t key_id_len)
134{
135 key_ref_t key;
136 size_t i;
137 char *id, *q;
138
139 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
140
141 /* Construct an identifier. */
142 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
143 if (!id)
144 return ERR_PTR(-ENOKEY);
145
146 memcpy(id, signer, signer_len);
147
148 q = id + signer_len;
149 *q++ = ':';
150 *q++ = ' ';
151 for (i = 0; i < key_id_len; i++) {
152 *q++ = hex_asc[*key_id >> 4];
153 *q++ = hex_asc[*key_id++ & 0x0f];
154 }
155
156 *q = 0;
157
158 pr_debug("Look up: \"%s\"\n", id);
159
160 key = keyring_search(make_key_ref(modsign_keyring, 1),
161 &key_type_asymmetric, id);
162 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n",
164 id, PTR_ERR(key));
165 kfree(id);
166
167 if (IS_ERR(key)) {
168 switch (PTR_ERR(key)) {
169 /* Hide some search errors */
170 case -EACCES:
171 case -ENOTDIR:
172 case -EAGAIN:
173 return ERR_PTR(-ENOKEY);
174 default:
175 return ERR_CAST(key);
176 }
177 }
178
179 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
180 return key_ref_to_ptr(key);
181}
182
183/*
184 * Verify the signature on a module.
185 */
186int mod_verify_sig(const void *mod, unsigned long *_modlen)
187{
188 struct public_key_signature *pks;
189 struct module_signature ms;
190 struct key *key;
191 const void *sig;
192 size_t modlen = *_modlen, sig_len;
193 int ret;
194
195 pr_devel("==>%s(,%lu)\n", __func__, modlen);
196
197 if (modlen <= sizeof(ms))
198 return -EBADMSG;
199
200 memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
201 modlen -= sizeof(ms);
202
203 sig_len = be32_to_cpu(ms.sig_len);
204 if (sig_len >= modlen)
205 return -EBADMSG;
206 modlen -= sig_len;
207 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
208 return -EBADMSG;
209 modlen -= (size_t)ms.signer_len + ms.key_id_len;
210
211 *_modlen = modlen;
212 sig = mod + modlen;
213
214 /* For the moment, only support RSA and X.509 identifiers */
215 if (ms.algo != PKEY_ALGO_RSA ||
216 ms.id_type != PKEY_ID_X509)
217 return -ENOPKG;
218
219 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash])
221 return -ENOPKG;
222
223 key = request_asymmetric_key(sig, ms.signer_len,
224 sig + ms.signer_len, ms.key_id_len);
225 if (IS_ERR(key))
226 return PTR_ERR(key);
227
228 pks = mod_make_digest(ms.hash, mod, modlen);
229 if (IS_ERR(pks)) {
230 ret = PTR_ERR(pks);
231 goto error_put_key;
232 }
233
234 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
235 sig_len);
236 if (ret < 0)
237 goto error_free_pks;
238
239 ret = verify_signature(key, pks);
240 pr_devel("verify_signature() = %d\n", ret);
241
242error_free_pks:
243 mpi_free(pks->rsa.s);
244 kfree(pks);
245error_put_key:
246 key_put(key);
247 pr_devel("<==%s() = %d\n", __func__, ret);
248 return ret;
249}
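
mod_verify_sig() peels the trailer off in a fixed order: struct module_signature last, preceded by sig_len bytes of signature data, preceded by key_id_len and signer_len bytes of identifiers. A userspace sketch that recovers those offsets from a signed module image (struct layout copied from the file above, no crypto, and the MODULE_SIG_STRING marker is assumed to be stripped already):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl() for the big-endian sig_len */

/* Layout of the trailer, as defined in kernel/module_signing.c above. */
struct module_signature {
	uint8_t  algo;		/* public-key crypto algorithm */
	uint8_t  hash;		/* digest algorithm */
	uint8_t  id_type;	/* key identifier type */
	uint8_t  signer_len;	/* length of signer's name */
	uint8_t  key_id_len;	/* length of key identifier */
	uint8_t  __pad[3];
	uint32_t sig_len;	/* length of signature data (big-endian) */
};

/* Given a whole signed module of 'len' bytes (marker already stripped),
 * report where the signed payload ends. Returns 0 on success. */
static int split_signed_module(const unsigned char *mod, size_t len)
{
	struct module_signature ms;
	size_t sig_len, info_len;

	if (len <= sizeof(ms))
		return -1;
	memcpy(&ms, mod + len - sizeof(ms), sizeof(ms));
	len -= sizeof(ms);

	sig_len = ntohl(ms.sig_len);
	info_len = (size_t)ms.signer_len + ms.key_id_len + sig_len;
	if (info_len >= len)
		return -1;

	printf("payload: %zu bytes, signer: %u, key id: %u, signature: %zu\n",
	       len - info_len, ms.signer_len, ms.key_id_len, sig_len);
	return 0;
}

int main(int argc, char **argv)
{
	static unsigned char buf[1 << 24];
	size_t len;
	FILE *f;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 2;
	len = fread(buf, 1, sizeof(buf), f);
	fclose(f);
	return split_signed_module(buf, len) ? 1 : 0;
}
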
diff --git a/kernel/pid.c b/kernel/pid.c
index e86b291ad834..aebd4f5aaf41 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
479 } 479 }
480 return nr; 480 return nr;
481} 481}
482EXPORT_SYMBOL_GPL(pid_nr_ns);
482 483
483pid_t pid_vnr(struct pid *pid) 484pid_t pid_vnr(struct pid *pid)
484{ 485{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6144bab8fd8e..eb00be205811 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h> 18#include <linux/reboot.h>
19#include <linux/export.h>
19 20
20#define BITS_PER_PAGE (PAGE_SIZE*8) 21#define BITS_PER_PAGE (PAGE_SIZE*8)
21 22
@@ -132,18 +133,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
132 return create_pid_namespace(old_ns); 133 return create_pid_namespace(old_ns);
133} 134}
134 135
135void free_pid_ns(struct kref *kref) 136static void free_pid_ns(struct kref *kref)
136{ 137{
137 struct pid_namespace *ns, *parent; 138 struct pid_namespace *ns;
138 139
139 ns = container_of(kref, struct pid_namespace, kref); 140 ns = container_of(kref, struct pid_namespace, kref);
140
141 parent = ns->parent;
142 destroy_pid_namespace(ns); 141 destroy_pid_namespace(ns);
142}
143 143
144 if (parent != NULL) 144void put_pid_ns(struct pid_namespace *ns)
145 put_pid_ns(parent); 145{
146 struct pid_namespace *parent;
147
148 while (ns != &init_pid_ns) {
149 parent = ns->parent;
150 if (!kref_put(&ns->kref, free_pid_ns))
151 break;
152 ns = parent;
153 }
146} 154}
155EXPORT_SYMBOL_GPL(put_pid_ns);
147 156
148void zap_pid_ns_processes(struct pid_namespace *pid_ns) 157void zap_pid_ns_processes(struct pid_namespace *pid_ns)
149{ 158{
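
put_pid_ns() now walks up the namespace chain iteratively: dropping the last reference on a child releases the reference that child held on its parent, and so on, without recursion. A toy userspace model of the same pattern; the types and names are invented for illustration and the static root stands in for init_pid_ns, which is never freed.

#include <stdio.h>
#include <stdlib.h>

/* Toy model: each level's parent pointer carries one reference; the root is
 * static and never freed, like init_pid_ns. */
struct toy_ns {
	int refcount;
	struct toy_ns *parent;
};

static struct toy_ns root_ns = { .refcount = 1, .parent = NULL };

static void put_toy_ns(struct toy_ns *ns)
{
	while (ns != &root_ns) {
		struct toy_ns *parent = ns->parent;

		if (--ns->refcount)	/* someone else still holds it */
			break;
		free(ns);		/* last ref gone: release this level... */
		ns = parent;		/* ...and drop the ref it held on its parent */
	}
}

static struct toy_ns *new_toy_ns(struct toy_ns *parent)
{
	struct toy_ns *ns = calloc(1, sizeof(*ns));

	ns->refcount = 1;	/* the reference handed back to the caller */
	ns->parent = parent;
	return ns;
}

int main(void)
{
	struct toy_ns *level1 = new_toy_ns(&root_ns);
	struct toy_ns *level2 = new_toy_ns(level1);	/* level2 now owns the ref on level1 */

	put_toy_ns(level2);	/* frees level2, then level1; the static root survives */
	printf("root refcount: %d\n", root_ns.refcount);	/* still 1 */
	return 0;
}
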
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a70518c9d82f..5dfdc9ea180b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS
263 bool 263 bool
264 depends on PM 264 depends on PM
265 265
266config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS
269
266config PM_GENERIC_DOMAINS_RUNTIME 270config PM_GENERIC_DOMAINS_RUNTIME
267 def_bool y 271 def_bool y
268 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 272 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index d52359374e85..68197a4e8fc9 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int __init pm_sysrq_init(void)
41{ 41{
42 register_sysrq_key('o', &sysrq_poweroff_op); 42 register_sysrq_key('o', &sysrq_poweroff_op);
43 return 0; 43 return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 19db29f67558..87da817f9e13 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only)
79 79
80 /* 80 /*
81 * We need to retry, but first give the freezing tasks some 81 * We need to retry, but first give the freezing tasks some
82 * time to enter the regrigerator. 82 * time to enter the refrigerator.
83 */ 83 */
84 msleep(10); 84 msleep(10);
85 } 85 }
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 6a031e684026..846bd42c7ed1 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
139 default: 139 default:
140 /* runtime check for not using enum */ 140 /* runtime check for not using enum */
141 BUG(); 141 BUG();
142 return PM_QOS_DEFAULT_VALUE;
142 } 143 }
143} 144}
144 145
diff --git a/kernel/printk.c b/kernel/printk.c
index 66a2ea37b576..2d607f4d1797 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1890 switch (action) { 1890 switch (action) {
1891 case CPU_ONLINE: 1891 case CPU_ONLINE:
1892 case CPU_DEAD: 1892 case CPU_DEAD:
1893 case CPU_DYING:
1894 case CPU_DOWN_FAILED: 1893 case CPU_DOWN_FAILED:
1895 case CPU_UP_CANCELED: 1894 case CPU_UP_CANCELED:
1896 console_lock(); 1895 console_lock();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a232bb59d93f..1f5e55dda955 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE); 180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181} 181}
182 182
183int __ptrace_may_access(struct task_struct *task, unsigned int mode) 183/* Returns 0 on success, -errno on denial. */
184static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
184{ 185{
185 const struct cred *cred = current_cred(), *tcred; 186 const struct cred *cred = current_cred(), *tcred;
186 187
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4fb2376ddf06..74df86bd9204 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
77 .name = #sname, \ 78 .name = #sname, \
78} 79}
79 80
@@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1197 raw_spin_unlock_irq(&rnp->lock); 1198 raw_spin_unlock_irq(&rnp->lock);
1198 1199
1199 /* Exclude any concurrent CPU-hotplug operations. */ 1200 /* Exclude any concurrent CPU-hotplug operations. */
1200 get_online_cpus(); 1201 mutex_lock(&rsp->onoff_mutex);
1201 1202
1202 /* 1203 /*
1203 * Set the quiescent-state-needed bits in all the rcu_node 1204 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1234 cond_resched(); 1235 cond_resched();
1235 } 1236 }
1236 1237
1237 put_online_cpus(); 1238 mutex_unlock(&rsp->onoff_mutex);
1238 return 1; 1239 return 1;
1239} 1240}
1240 1241
@@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1700 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1701 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1701 1702
1702 /* Exclude any attempts to start a new grace period. */ 1703 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex);
1703 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1705 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1704 1706
1705 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
@@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1744 init_callback_list(rdp); 1746 init_callback_list(rdp);
1745 /* Disallow further callbacks on this CPU. */ 1747 /* Disallow further callbacks on this CPU. */
1746 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 1748 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1749 mutex_unlock(&rsp->onoff_mutex);
1747} 1750}
1748 1751
1749#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1752#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2648 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2651 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2649 struct rcu_node *rnp = rcu_get_root(rsp); 2652 struct rcu_node *rnp = rcu_get_root(rsp);
2650 2653
2654 /* Exclude new grace periods. */
2655 mutex_lock(&rsp->onoff_mutex);
2656
2651 /* Set up local state, ensuring consistent view of global state. */ 2657 /* Set up local state, ensuring consistent view of global state. */
2652 raw_spin_lock_irqsave(&rnp->lock, flags); 2658 raw_spin_lock_irqsave(&rnp->lock, flags);
2653 rdp->beenonline = 1; /* We have now been online. */ 2659 rdp->beenonline = 1; /* We have now been online. */
@@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2662 rcu_prepare_for_idle_init(cpu); 2668 rcu_prepare_for_idle_init(cpu);
2663 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2669 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2664 2670
2665 /*
2666 * A new grace period might start here. If so, we won't be part
2667 * of it, but that is OK, as we are currently in a quiescent state.
2668 */
2669
2670 /* Exclude any attempts to start a new GP on large systems. */
2671 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
2672
2673 /* Add CPU to rcu_node bitmasks. */ 2671 /* Add CPU to rcu_node bitmasks. */
2674 rnp = rdp->mynode; 2672 rnp = rdp->mynode;
2675 mask = rdp->grpmask; 2673 mask = rdp->grpmask;
@@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2693 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2691 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2694 rnp = rnp->parent; 2692 rnp = rnp->parent;
2695 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 2693 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2694 local_irq_restore(flags);
2696 2695
2697 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2696 mutex_unlock(&rsp->onoff_mutex);
2698} 2697}
2699 2698
2700static void __cpuinit rcu_prepare_cpu(int cpu) 2699static void __cpuinit rcu_prepare_cpu(int cpu)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5faf05d68326..a240f032848e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -394,11 +394,17 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 394 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 395 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 396 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */
398
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400
397 struct mutex barrier_mutex; /* Guards barrier fields. */ 401 struct mutex barrier_mutex; /* Guards barrier fields. */
398 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 402 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
399 struct completion barrier_completion; /* Wake at barrier end. */ 403 struct completion barrier_completion; /* Wake at barrier end. */
400 unsigned long n_barrier_done; /* ++ at start and end of */ 404 unsigned long n_barrier_done; /* ++ at start and end of */
401 /* _rcu_barrier(). */ 405 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */
407
402 unsigned long jiffies_force_qs; /* Time at which to invoke */ 408 unsigned long jiffies_force_qs; /* Time at which to invoke */
403 /* force_quiescent_state(). */ 409 /* force_quiescent_state(). */
404 unsigned long n_force_qs; /* Number of calls to */ 410 unsigned long n_force_qs; /* Number of calls to */
diff --git a/kernel/resource.c b/kernel/resource.c
index 34d45886ee84..73f35d4b30b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,
763 struct resource *parent = root; 763 struct resource *parent = root;
764 struct resource *conflict; 764 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
766 struct resource *next_res = NULL;
766 767
767 if (!res) 768 if (!res)
768 return; 769 return;
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,
772 res->end = end; 773 res->end = end;
773 res->flags = IORESOURCE_BUSY; 774 res->flags = IORESOURCE_BUSY;
774 775
775 conflict = __request_resource(parent, res); 776 while (1) {
776 if (!conflict)
777 return;
778 777
779 /* failed, split and try again */ 778 conflict = __request_resource(parent, res);
780 kfree(res); 779 if (!conflict) {
780 if (!next_res)
781 break;
782 res = next_res;
783 next_res = NULL;
784 continue;
785 }
781 786
782 /* conflict covered whole area */ 787 /* conflict covered whole area */
783 if (conflict->start <= start && conflict->end >= end) 788 if (conflict->start <= res->start &&
784 return; 789 conflict->end >= res->end) {
790 kfree(res);
791 WARN_ON(next_res);
792 break;
793 }
794
795 /* failed, split and try again */
796 if (conflict->start > res->start) {
797 end = res->end;
798 res->end = conflict->start - 1;
799 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res),
801 GFP_ATOMIC);
802 if (!next_res) {
803 kfree(res);
804 break;
805 }
806 next_res->name = name;
807 next_res->start = conflict->end + 1;
808 next_res->end = end;
809 next_res->flags = IORESOURCE_BUSY;
810 }
811 } else {
812 res->start = conflict->end + 1;
813 }
814 }
785 815
786 if (conflict->start > start)
787 __reserve_region_with_split(root, start, conflict->start-1, name);
788 if (conflict->end < end)
789 __reserve_region_with_split(root, conflict->end+1, end, name);
790} 816}
791 817
792void __init reserve_region_with_split(struct resource *root, 818void __init reserve_region_with_split(struct resource *root,
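
The rewrite replaces the two recursive calls with a loop that keeps at most one deferred right-hand piece in next_res, reserving the left piece of a split immediately and retrying the remainder. A standalone model of that control flow; the hard-coded conflict range and the names are purely illustrative.

#include <stdio.h>

struct range { unsigned long start, end; };

/* Toy conflict checker standing in for __request_resource(): returns the
 * first reserved range overlapping [start,end], or NULL if the request fits. */
static const struct range *find_conflict(unsigned long start, unsigned long end)
{
	static const struct range reserved[] = { { 0x140, 0x17f } };
	for (unsigned i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
		if (reserved[i].start <= end && reserved[i].end >= start)
			return &reserved[i];
	return NULL;
}

int main(void)
{
	struct range res = { 0x100, 0x1ff }, next = { 1, 0 };
	int have_next = 0;

	for (;;) {
		const struct range *c = find_conflict(res.start, res.end);

		if (!c) {
			printf("reserved [%#lx-%#lx]\n", res.start, res.end);
			if (!have_next)
				break;
			res = next;		/* now place the deferred right piece */
			have_next = 0;
			continue;
		}
		if (c->start <= res.start && c->end >= res.end)
			break;			/* conflict covers everything */
		if (c->start > res.start) {
			unsigned long end = res.end;

			res.end = c->start - 1;	/* keep the left piece for now */
			if (c->end < end) {	/* stash the right piece */
				next.start = c->end + 1;
				next.end = end;
				have_next = 1;
			}
		} else {
			res.start = c->end + 1;	/* only a right piece remains */
		}
	}
	return 0;
}
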
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c17747236438..2d8927fda712 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -505,7 +505,7 @@ static inline void init_hrtick(void)
505#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
506 506
507#ifndef tsk_is_polling 507#ifndef tsk_is_polling
508#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 508#define tsk_is_polling(t) 0
509#endif 509#endif
510 510
511void resched_task(struct task_struct *p) 511void resched_task(struct task_struct *p)
@@ -6122,6 +6122,17 @@ static void sched_init_numa(void)
6122 * numbers. 6122 * numbers.
6123 */ 6123 */
6124 6124
6125 /*
6126 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6127 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6128 * the array will contain less then 'level' members. This could be
 6128 * the array will contain less than 'level' members. This could be
6129 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6130 * in other functions.
6131 *
6132 * We reset it to 'level' at the end of this function.
6133 */
6134 sched_domains_numa_levels = 0;
6135
6125 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6136 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6126 if (!sched_domains_numa_masks) 6137 if (!sched_domains_numa_masks)
6127 return; 6138 return;
@@ -6176,11 +6187,68 @@ static void sched_init_numa(void)
6176 } 6187 }
6177 6188
6178 sched_domain_topology = tl; 6189 sched_domain_topology = tl;
6190
6191 sched_domains_numa_levels = level;
6192}
6193
6194static void sched_domains_numa_masks_set(int cpu)
6195{
6196 int i, j;
6197 int node = cpu_to_node(cpu);
6198
6199 for (i = 0; i < sched_domains_numa_levels; i++) {
6200 for (j = 0; j < nr_node_ids; j++) {
6201 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6202 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6203 }
6204 }
6205}
6206
6207static void sched_domains_numa_masks_clear(int cpu)
6208{
6209 int i, j;
6210 for (i = 0; i < sched_domains_numa_levels; i++) {
6211 for (j = 0; j < nr_node_ids; j++)
6212 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6213 }
6214}
6215
6216/*
6217 * Update sched_domains_numa_masks[level][node] array when new cpus
6218 * are onlined.
6219 */
6220static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6221 unsigned long action,
6222 void *hcpu)
6223{
6224 int cpu = (long)hcpu;
6225
6226 switch (action & ~CPU_TASKS_FROZEN) {
6227 case CPU_ONLINE:
6228 sched_domains_numa_masks_set(cpu);
6229 break;
6230
6231 case CPU_DEAD:
6232 sched_domains_numa_masks_clear(cpu);
6233 break;
6234
6235 default:
6236 return NOTIFY_DONE;
6237 }
6238
6239 return NOTIFY_OK;
6179} 6240}
6180#else 6241#else
6181static inline void sched_init_numa(void) 6242static inline void sched_init_numa(void)
6182{ 6243{
6183} 6244}
6245
6246static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6247 unsigned long action,
6248 void *hcpu)
6249{
6250 return 0;
6251}
6184#endif /* CONFIG_NUMA */ 6252#endif /* CONFIG_NUMA */
6185 6253
6186static int __sdt_alloc(const struct cpumask *cpu_map) 6254static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6629,6 +6697,7 @@ void __init sched_init_smp(void)
6629 mutex_unlock(&sched_domains_mutex); 6697 mutex_unlock(&sched_domains_mutex);
6630 put_online_cpus(); 6698 put_online_cpus();
6631 6699
6700 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6632 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6701 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6633 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6702 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6634 6703
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c681f11b7d2..0af8868525d6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/tty.h> 18#include <linux/tty.h>
19#include <linux/binfmts.h> 19#include <linux/binfmts.h>
20#include <linux/coredump.h>
20#include <linux/security.h> 21#include <linux/security.h>
21#include <linux/syscalls.h> 22#include <linux/syscalls.h>
22#include <linux/ptrace.h> 23#include <linux/ptrace.h>
@@ -2359,7 +2360,7 @@ relock:
2359 * first and our do_group_exit call below will use 2360 * first and our do_group_exit call below will use
2360 * that value and ignore the one we pass it. 2361 * that value and ignore the one we pass it.
2361 */ 2362 */
2362 do_coredump(info->si_signo, info->si_signo, regs); 2363 do_coredump(info, regs);
2363 } 2364 }
2364 2365
2365 /* 2366 /*
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2095be3318d5..97c465ebd844 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
379 rcu_batch_queue(&sp->batch_queue, head); 379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) { 380 if (!sp->running) {
381 sp->running = true; 381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0); 382 schedule_delayed_work(&sp->work, 0);
383 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags); 384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385} 385}
@@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
631 } 631 }
632 632
633 if (pending) 633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); 634 schedule_delayed_work(&sp->work, SRCU_INTERVAL);
635} 635}
636 636
637/* 637/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 241507f23eca..e6e0ece5f6a0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
368void kernel_restart(char *cmd) 368void kernel_restart(char *cmd)
369{ 369{
370 kernel_restart_prepare(cmd); 370 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus();
371 if (!cmd) 372 if (!cmd)
372 printk(KERN_EMERG "Restarting system.\n"); 373 printk(KERN_EMERG "Restarting system.\n");
373 else 374 else
@@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem);
1264 * Work around broken programs that cannot handle "Linux 3.0". 1265 * Work around broken programs that cannot handle "Linux 3.0".
1265 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1266 */ 1267 */
1267static int override_release(char __user *release, int len) 1268static int override_release(char __user *release, size_t len)
1268{ 1269{
1269 int ret = 0; 1270 int ret = 0;
1270 char buf[65];
1271 1271
1272 if (current->personality & UNAME26) { 1272 if (current->personality & UNAME26) {
1273 char *rest = UTS_RELEASE; 1273 const char *rest = UTS_RELEASE;
1274 char buf[65] = { 0 };
1274 int ndots = 0; 1275 int ndots = 0;
1275 unsigned v; 1276 unsigned v;
1277 size_t copy;
1276 1278
1277 while (*rest) { 1279 while (*rest) {
1278 if (*rest == '.' && ++ndots >= 3) 1280 if (*rest == '.' && ++ndots >= 3)
@@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len)
1282 rest++; 1284 rest++;
1283 } 1285 }
1284 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1285 snprintf(buf, len, "2.6.%u%s", v, rest); 1287 copy = clamp_t(size_t, len, 1, sizeof(buf));
1286 ret = copy_to_user(release, buf, len); 1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
1289 ret = copy_to_user(release, buf, copy + 1);
1287 } 1290 }
1288 return ret; 1291 return ret;
1289} 1292}
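
With UNAME26 personality the reported release maps 3.x to 2.6.(40+x), and the new code bounds the copy with clamp_t()/scnprintf() instead of trusting the caller's length. A userspace model of just the mapping (the digit scan is reproduced from the surrounding function; the bound handling and copy_to_user() are not modelled):

#include <stdio.h>
#include <ctype.h>

/* Userspace model of the UNAME26 mapping: "3.x.y-extra" -> "2.6.(40+x)-extra".
 * version_code mimics LINUX_VERSION_CODE, i.e. (a << 16) + (b << 8) + c. */
static void map_release(char *out, size_t len,
			const char *release, unsigned version_code)
{
	const char *rest = release;
	int ndots = 0;
	unsigned v = ((version_code >> 8) & 0xff) + 40;

	while (*rest) {
		if (*rest == '.' && ++ndots >= 3)
			break;
		if (!isdigit((unsigned char)*rest) && *rest != '.')
			break;
		rest++;
	}
	snprintf(out, len, "2.6.%u%s", v, rest);
}

int main(void)
{
	char buf[65];

	map_release(buf, sizeof(buf), "3.7.0-rc1", (3 << 16) + (7 << 8));
	printf("%s\n", buf);	/* prints 2.6.47-rc1 */
	return 0;
}
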
@@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1791#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1793{
1791 struct file *exe_file; 1794 struct fd exe;
1792 struct dentry *dentry; 1795 struct dentry *dentry;
1793 int err; 1796 int err;
1794 1797
1795 exe_file = fget(fd); 1798 exe = fdget(fd);
1796 if (!exe_file) 1799 if (!exe.file)
1797 return -EBADF; 1800 return -EBADF;
1798 1801
1799 dentry = exe_file->f_path.dentry; 1802 dentry = exe.file->f_path.dentry;
1800 1803
1801 /* 1804 /*
1802 * Because the original mm->exe_file points to executable file, make 1805 * Because the original mm->exe_file points to executable file, make
@@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1805 */ 1808 */
1806 err = -EACCES; 1809 err = -EACCES;
1807 if (!S_ISREG(dentry->d_inode->i_mode) || 1810 if (!S_ISREG(dentry->d_inode->i_mode) ||
1808 exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1809 goto exit; 1812 goto exit;
1810 1813
1811 err = inode_permission(dentry->d_inode, MAY_EXEC); 1814 err = inode_permission(dentry->d_inode, MAY_EXEC);
@@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1839 goto exit_unlock; 1842 goto exit_unlock;
1840 1843
1841 err = 0; 1844 err = 0;
1842 set_mm_exe_file(mm, exe_file); 1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1843exit_unlock: 1846exit_unlock:
1844 up_write(&mm->mmap_sem); 1847 up_write(&mm->mmap_sem);
1845 1848
1846exit: 1849exit:
1847 fput(exe_file); 1850 fdput(exe);
1848 return err; 1851 return err;
1849} 1852}
1850 1853
@@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void)
2204 return -ENOMEM; 2207 return -ENOMEM;
2205 } 2208 }
2206 2209
2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
2208 NULL, argv_cleanup, NULL); 2211 NULL, argv_cleanup, NULL);
2209 if (ret == -ENOMEM) 2212 if (ret == -ENOMEM)
2210 argv_free(argv); 2213 argv_free(argv);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 81c7b1a1a307..26f65eaa01f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,12 @@
97extern int sysctl_overcommit_memory; 97extern int sysctl_overcommit_memory;
98extern int sysctl_overcommit_ratio; 98extern int sysctl_overcommit_ratio;
99extern int max_threads; 99extern int max_threads;
100extern int core_uses_pid;
101extern int suid_dumpable; 100extern int suid_dumpable;
101#ifdef CONFIG_COREDUMP
102extern int core_uses_pid;
102extern char core_pattern[]; 103extern char core_pattern[];
103extern unsigned int core_pipe_limit; 104extern unsigned int core_pipe_limit;
105#endif
104extern int pid_max; 106extern int pid_max;
105extern int min_free_kbytes; 107extern int min_free_kbytes;
106extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
177 179
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, 180static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos); 181 void __user *buffer, size_t *lenp, loff_t *ppos);
182#ifdef CONFIG_COREDUMP
180static int proc_dostring_coredump(struct ctl_table *table, int write, 183static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos); 184 void __user *buffer, size_t *lenp, loff_t *ppos);
185#endif
182 186
183#ifdef CONFIG_MAGIC_SYSRQ 187#ifdef CONFIG_MAGIC_SYSRQ
184/* Note: sysrq code uses it's own private copy */ 188/* Note: sysrq code uses it's own private copy */
@@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = {
404 .mode = 0644, 408 .mode = 0644,
405 .proc_handler = proc_dointvec, 409 .proc_handler = proc_dointvec,
406 }, 410 },
411#ifdef CONFIG_COREDUMP
407 { 412 {
408 .procname = "core_uses_pid", 413 .procname = "core_uses_pid",
409 .data = &core_uses_pid, 414 .data = &core_uses_pid,
@@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = {
425 .mode = 0644, 430 .mode = 0644,
426 .proc_handler = proc_dointvec, 431 .proc_handler = proc_dointvec,
427 }, 432 },
433#endif
428#ifdef CONFIG_PROC_SYSCTL 434#ifdef CONFIG_PROC_SYSCTL
429 { 435 {
430 .procname = "tainted", 436 .procname = "tainted",
@@ -1543,8 +1549,7 @@ static struct ctl_table fs_table[] = {
1543}; 1549};
1544 1550
1545static struct ctl_table debug_table[] = { 1551static struct ctl_table debug_table[] = {
1546#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1552#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
1547 defined(CONFIG_S390) || defined(CONFIG_TILE)
1548 { 1553 {
1549 .procname = "exception-trace", 1554 .procname = "exception-trace",
1550 .data = &show_unhandled_signals, 1555 .data = &show_unhandled_signals,
@@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2036 2041
2037static void validate_coredump_safety(void) 2042static void validate_coredump_safety(void)
2038{ 2043{
2044#ifdef CONFIG_COREDUMP
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2045 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') { 2046 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2047 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\ 2048 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n"); 2049 "core dump path required.\n");
2044 } 2050 }
2051#endif
2045} 2052}
2046 2053
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, 2054static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
@@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2053 return error; 2060 return error;
2054} 2061}
2055 2062
2063#ifdef CONFIG_COREDUMP
2056static int proc_dostring_coredump(struct ctl_table *table, int write, 2064static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos) 2065 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{ 2066{
@@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
2061 validate_coredump_safety(); 2069 validate_coredump_safety();
2062 return error; 2070 return error;
2063} 2071}
2072#endif
2064 2073
2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2074static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2066 void __user *buffer, 2075 void __user *buffer,
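
The sysctl.c changes above compile the coredump knobs out when CONFIG_COREDUMP is unset and keep validate_coredump_safety()'s rule that, with suid_dumpable=2, core_pattern must name an absolute path or a pipe handler. A small standalone sketch of that check, using illustrative constants rather than the kernel's headers:

#include <stdbool.h>
#include <stdio.h>

#define SUID_DUMPABLE_SAFE 2	/* the documented "safe" sysctl value */

static bool core_pattern_is_safe(int suid_dumpable, const char *core_pattern)
{
	if (suid_dumpable != SUID_DUMPABLE_SAFE)
		return true;
	/* safe only if dumps go to an absolute path or a pipe handler */
	return core_pattern[0] == '/' || core_pattern[0] == '|';
}

int main(void)
{
	printf("%d\n", core_pattern_is_safe(2, "core"));          /* 0: unsafe */
	printf("%d\n", core_pattern_is_safe(2, "|/bin/handler")); /* 1: safe   */
	return 0;
}
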
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d0a32796550f..145bb4d3bd4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
27#include <linux/cgroup.h> 27#include <linux/cgroup.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/pid_namespace.h>
30#include <net/genetlink.h> 31#include <net/genetlink.h>
31#include <linux/atomic.h> 32#include <linux/atomic.h>
32 33
@@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb,
174 up_write(&listeners->sem); 175 up_write(&listeners->sem);
175} 176}
176 177
177static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 178static void fill_stats(struct user_namespace *user_ns,
179 struct pid_namespace *pid_ns,
180 struct task_struct *tsk, struct taskstats *stats)
178{ 181{
179 memset(stats, 0, sizeof(*stats)); 182 memset(stats, 0, sizeof(*stats));
180 /* 183 /*
@@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
190 stats->version = TASKSTATS_VERSION; 193 stats->version = TASKSTATS_VERSION;
191 stats->nvcsw = tsk->nvcsw; 194 stats->nvcsw = tsk->nvcsw;
192 stats->nivcsw = tsk->nivcsw; 195 stats->nivcsw = tsk->nivcsw;
193 bacct_add_tsk(stats, tsk); 196 bacct_add_tsk(user_ns, pid_ns, stats, tsk);
194 197
195 /* fill in extended acct fields */ 198 /* fill in extended acct fields */
196 xacct_add_tsk(stats, tsk); 199 xacct_add_tsk(stats, tsk);
@@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
207 rcu_read_unlock(); 210 rcu_read_unlock();
208 if (!tsk) 211 if (!tsk)
209 return -ESRCH; 212 return -ESRCH;
210 fill_stats(tsk, stats); 213 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
211 put_task_struct(tsk); 214 put_task_struct(tsk);
212 return 0; 215 return 0;
213} 216}
@@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
291 if (!cpumask_subset(mask, cpu_possible_mask)) 294 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 295 return -EINVAL;
293 296
297 if (current_user_ns() != &init_user_ns)
298 return -EINVAL;
299
300 if (task_active_pid_ns(current) != &init_pid_ns)
301 return -EINVAL;
302
294 if (isadd == REGISTER) { 303 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 304 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), 305 s = kmalloc_node(sizeof(struct listener),
@@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
415 struct nlattr *na; 424 struct nlattr *na;
416 size_t size; 425 size_t size;
417 u32 fd; 426 u32 fd;
418 struct file *file; 427 struct fd f;
419 int fput_needed;
420 428
421 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 429 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
422 if (!na) 430 if (!na)
423 return -EINVAL; 431 return -EINVAL;
424 432
425 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 433 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
426 file = fget_light(fd, &fput_needed); 434 f = fdget(fd);
427 if (!file) 435 if (!f.file)
428 return 0; 436 return 0;
429 437
430 size = nla_total_size(sizeof(struct cgroupstats)); 438 size = nla_total_size(sizeof(struct cgroupstats));
@@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 445 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 446 sizeof(struct cgroupstats));
439 if (na == NULL) { 447 if (na == NULL) {
448 nlmsg_free(rep_skb);
440 rc = -EMSGSIZE; 449 rc = -EMSGSIZE;
441 goto err; 450 goto err;
442 } 451 }
@@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
444 stats = nla_data(na); 453 stats = nla_data(na);
445 memset(stats, 0, sizeof(*stats)); 454 memset(stats, 0, sizeof(*stats));
446 455
447 rc = cgroupstats_build(stats, file->f_dentry); 456 rc = cgroupstats_build(stats, f.file->f_dentry);
448 if (rc < 0) { 457 if (rc < 0) {
449 nlmsg_free(rep_skb); 458 nlmsg_free(rep_skb);
450 goto err; 459 goto err;
@@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
453 rc = send_reply(rep_skb, info); 462 rc = send_reply(rep_skb, info);
454 463
455err: 464err:
456 fput_light(file, fput_needed); 465 fdput(f);
457 return rc; 466 return rc;
458} 467}
459 468
@@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
467 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 476 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
468 if (rc < 0) 477 if (rc < 0)
469 goto out; 478 goto out;
470 rc = add_del_listener(info->snd_pid, mask, REGISTER); 479 rc = add_del_listener(info->snd_portid, mask, REGISTER);
471out: 480out:
472 free_cpumask_var(mask); 481 free_cpumask_var(mask);
473 return rc; 482 return rc;
@@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
483 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 492 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
484 if (rc < 0) 493 if (rc < 0)
485 goto out; 494 goto out;
486 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 495 rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
487out: 496out:
488 free_cpumask_var(mask); 497 free_cpumask_var(mask);
489 return rc; 498 return rc;
@@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
631 if (rc < 0) 640 if (rc < 0)
632 return; 641 return;
633 642
634 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 643 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
644 task_pid_nr_ns(tsk, &init_pid_ns));
635 if (!stats) 645 if (!stats)
636 goto err; 646 goto err;
637 647
638 fill_stats(tsk, stats); 648 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
639 649
640 /* 650 /*
641 * Doesn't matter if tsk is the leader or the last group member leaving 651 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
643 if (!is_thread_group || !group_dead) 653 if (!is_thread_group || !group_dead)
644 goto send; 654 goto send;
645 655
646 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 656 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
657 task_tgid_nr_ns(tsk, &init_pid_ns));
647 if (!stats) 658 if (!stats)
648 goto err; 659 goto err;
649 660
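
Besides making the stats fill namespace-aware, the taskstats hunks above replace the fget_light()/fput_needed pair with struct fd and fdget()/fdput(), which carry the "does this reference need dropping" state together with the file pointer. A rough userspace analogue of that bundling; the fdesc type and helpers are invented for illustration only:

#include <stdio.h>

struct fdesc {
	FILE *file;
	int need_close;
};

static struct fdesc fdesc_get(const char *path)
{
	struct fdesc f = { fopen(path, "r"), 1 };

	if (!f.file)
		f.need_close = 0;
	return f;
}

static void fdesc_put(struct fdesc f)
{
	if (f.need_close)
		fclose(f.file);
}

int main(void)
{
	struct fdesc f = fdesc_get("/proc/self/status");

	if (!f.file)
		return 1;
	/* ... use f.file; exactly one cleanup path ... */
	fdesc_put(f);
	return 0;
}
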
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
30#include <linux/export.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
16config GENERIC_TIME_VSYSCALL 16config GENERIC_TIME_VSYSCALL
17 bool 17 bool
18 18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
19# ktime_t scalar 64bit nsec representation 23# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR 24config KTIME_SCALAR
21 bool 25 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void); 40 ktime_t (*gettime)(void);
42 clockid_t base_clockid; 41 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE]; 42} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
46static ktime_t freezer_delta; 45static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock); 46static DEFINE_SPINLOCK(freezer_delta_lock);
48 47
48static struct wakeup_source *ws;
49
49#ifdef CONFIG_RTC_CLASS 50#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */ 51/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer; 52static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
130 * @base: pointer to the base where the timer is being run 131 * @base: pointer to the base where the timer is being run
131 * @alarm: pointer to alarm being enqueued. 132 * @alarm: pointer to alarm being enqueued.
132 * 133 *
133 * Adds alarm to a alarm_base timerqueue and if necessary sets 134 * Adds alarm to a alarm_base timerqueue
134 * an hrtimer to run.
135 * 135 *
136 * Must hold base->lock when calling. 136 * Must hold base->lock when calling.
137 */ 137 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 139{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
140 timerqueue_add(&base->timerqueue, &alarm->node); 143 timerqueue_add(&base->timerqueue, &alarm->node);
141 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 144 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
142
143 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
144 hrtimer_try_to_cancel(&base->timer);
145 hrtimer_start(&base->timer, alarm->node.expires,
146 HRTIMER_MODE_ABS);
147 }
148} 145}
149 146
150/** 147/**
151 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue 148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
152 * @base: pointer to the base where the timer is running 149 * @base: pointer to the base where the timer is running
153 * @alarm: pointer to alarm being removed 150 * @alarm: pointer to alarm being removed
154 * 151 *
155 * Removes alarm to a alarm_base timerqueue and if necessary sets 152 * Removes alarm to a alarm_base timerqueue
156 * a new timer to run.
157 * 153 *
158 * Must hold base->lock when calling. 154 * Must hold base->lock when calling.
159 */ 155 */
160static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) 156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
161{ 157{
162 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
163
164 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
165 return; 159 return;
166 160
167 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
168 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
169
170 if (next == &alarm->node) {
171 hrtimer_try_to_cancel(&base->timer);
172 next = timerqueue_getnext(&base->timerqueue);
173 if (!next)
174 return;
175 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
176 }
177} 163}
178 164
179 165
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
188 */ 174 */
189static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
190{ 176{
191 struct alarm_base *base = container_of(timer, struct alarm_base, timer); 177 struct alarm *alarm = container_of(timer, struct alarm, timer);
192 struct timerqueue_node *next; 178 struct alarm_base *base = &alarm_bases[alarm->type];
193 unsigned long flags; 179 unsigned long flags;
194 ktime_t now;
195 int ret = HRTIMER_NORESTART; 180 int ret = HRTIMER_NORESTART;
196 int restart = ALARMTIMER_NORESTART; 181 int restart = ALARMTIMER_NORESTART;
197 182
198 spin_lock_irqsave(&base->lock, flags); 183 spin_lock_irqsave(&base->lock, flags);
199 now = base->gettime(); 184 alarmtimer_dequeue(base, alarm);
200 while ((next = timerqueue_getnext(&base->timerqueue))) { 185 spin_unlock_irqrestore(&base->lock, flags);
201 struct alarm *alarm;
202 ktime_t expired = next->expires;
203
204 if (expired.tv64 > now.tv64)
205 break;
206
207 alarm = container_of(next, struct alarm, node);
208
209 timerqueue_del(&base->timerqueue, &alarm->node);
210 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
211
212 alarm->state |= ALARMTIMER_STATE_CALLBACK;
213 spin_unlock_irqrestore(&base->lock, flags);
214 if (alarm->function)
215 restart = alarm->function(alarm, now);
216 spin_lock_irqsave(&base->lock, flags);
217 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
218 186
219 if (restart != ALARMTIMER_NORESTART) { 187 if (alarm->function)
220 timerqueue_add(&base->timerqueue, &alarm->node); 188 restart = alarm->function(alarm, base->gettime());
221 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
222 }
223 }
224 189
225 if (next) { 190 spin_lock_irqsave(&base->lock, flags);
226 hrtimer_set_expires(&base->timer, next->expires); 191 if (restart != ALARMTIMER_NORESTART) {
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
193 alarmtimer_enqueue(base, alarm);
227 ret = HRTIMER_RESTART; 194 ret = HRTIMER_RESTART;
228 } 195 }
229 spin_unlock_irqrestore(&base->lock, flags); 196 spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
250 unsigned long flags; 217 unsigned long flags;
251 struct rtc_device *rtc; 218 struct rtc_device *rtc;
252 int i; 219 int i;
220 int ret;
253 221
254 spin_lock_irqsave(&freezer_delta_lock, flags); 222 spin_lock_irqsave(&freezer_delta_lock, flags);
255 min = freezer_delta; 223 min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
279 if (min.tv64 == 0) 247 if (min.tv64 == 0)
280 return 0; 248 return 0;
281 249
282 /* XXX - Should we enforce a minimum sleep time? */ 250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
283 WARN_ON(min.tv64 < NSEC_PER_SEC); 251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
252 return -EBUSY;
253 }
284 254
285 /* Setup an rtc timer to fire that far in the future */ 255 /* Setup an rtc timer to fire that far in the future */
286 rtc_timer_cancel(rtc, &rtctimer); 256 rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
288 now = rtc_tm_to_ktime(tm); 258 now = rtc_tm_to_ktime(tm);
289 now = ktime_add(now, min); 259 now = ktime_add(now, min);
290 260
291 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 261 /* Set alarm, if in the past reject suspend briefly to handle */
292 262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
293 return 0; 263 if (ret < 0)
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
294} 266}
295#else 267#else
296static int alarmtimer_suspend(struct device *dev) 268static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
324 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
325{ 297{
326 timerqueue_init(&alarm->node); 298 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
327 alarm->function = function; 302 alarm->function = function;
328 alarm->type = type; 303 alarm->type = type;
329 alarm->state = ALARMTIMER_STATE_INACTIVE; 304 alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
334 * @alarm: ptr to alarm to set 309 * @alarm: ptr to alarm to set
335 * @start: time to run the alarm 310 * @start: time to run the alarm
336 */ 311 */
337void alarm_start(struct alarm *alarm, ktime_t start) 312int alarm_start(struct alarm *alarm, ktime_t start)
338{ 313{
339 struct alarm_base *base = &alarm_bases[alarm->type]; 314 struct alarm_base *base = &alarm_bases[alarm->type];
340 unsigned long flags; 315 unsigned long flags;
316 int ret;
341 317
342 spin_lock_irqsave(&base->lock, flags); 318 spin_lock_irqsave(&base->lock, flags);
343 if (alarmtimer_active(alarm))
344 alarmtimer_remove(base, alarm);
345 alarm->node.expires = start; 319 alarm->node.expires = start;
346 alarmtimer_enqueue(base, alarm); 320 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires,
322 HRTIMER_MODE_ABS);
347 spin_unlock_irqrestore(&base->lock, flags); 323 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
348} 325}
349 326
350/** 327/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
358{ 335{
359 struct alarm_base *base = &alarm_bases[alarm->type]; 336 struct alarm_base *base = &alarm_bases[alarm->type];
360 unsigned long flags; 337 unsigned long flags;
361 int ret = -1; 338 int ret;
362 spin_lock_irqsave(&base->lock, flags);
363
364 if (alarmtimer_callback_running(alarm))
365 goto out;
366 339
367 if (alarmtimer_is_queued(alarm)) { 340 spin_lock_irqsave(&base->lock, flags);
368 alarmtimer_remove(base, alarm); 341 ret = hrtimer_try_to_cancel(&alarm->timer);
369 ret = 1; 342 if (ret >= 0)
370 } else 343 alarmtimer_dequeue(base, alarm);
371 ret = 0;
372out:
373 spin_unlock_irqrestore(&base->lock, flags); 344 spin_unlock_irqrestore(&base->lock, flags);
374 return ret; 345 return ret;
375} 346}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
802 for (i = 0; i < ALARM_NUMTYPE; i++) { 773 for (i = 0; i < ALARM_NUMTYPE; i++) {
803 timerqueue_init_head(&alarm_bases[i].timerqueue); 774 timerqueue_init_head(&alarm_bases[i].timerqueue);
804 spin_lock_init(&alarm_bases[i].lock); 775 spin_lock_init(&alarm_bases[i].lock);
805 hrtimer_init(&alarm_bases[i].timer,
806 alarm_bases[i].base_clockid,
807 HRTIMER_MODE_ABS);
808 alarm_bases[i].timer.function = alarmtimer_fired;
809 } 776 }
810 777
811 error = alarmtimer_rtc_interface_setup(); 778 error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
821 error = PTR_ERR(pdev); 788 error = PTR_ERR(pdev);
822 goto out_drv; 789 goto out_drv;
823 } 790 }
791 ws = wakeup_source_register("alarmtimer");
824 return 0; 792 return 0;
825 793
826out_drv: 794out_drv:
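
The alarmtimer rework above gives every struct alarm its own hrtimer instead of multiplexing one hrtimer per alarm_base, so arming or cancelling one alarm never has to reprogram a timer shared with the others. A loose userspace analogue using one POSIX timer per alarm (illustrative only; older glibc needs -lrt):

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

struct alarm_demo {
	timer_t timer;		/* one timer per alarm, like alarm->timer above */
	const char *name;
};

static void fired(union sigval sv)
{
	struct alarm_demo *a = sv.sival_ptr;

	printf("%s fired\n", a->name);
}

static int alarm_demo_start(struct alarm_demo *a, long ms)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_THREAD,
		.sigev_notify_function = fired,
		.sigev_value.sival_ptr = a,
	};
	struct itimerspec its = {
		.it_value = { .tv_sec = ms / 1000,
			      .tv_nsec = (ms % 1000) * 1000000L },
	};

	if (timer_create(CLOCK_MONOTONIC, &sev, &a->timer))
		return -1;
	return timer_settime(a->timer, 0, &its, NULL);
}

int main(void)
{
	struct alarm_demo a = { .name = "a" }, b = { .name = "b" };

	alarm_demo_start(&a, 100);
	alarm_demo_start(&b, 200);	/* independent of a's timer */
	sleep(1);
	return 0;
}
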
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 7e1ce012a851..30b6de0d977c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old,
397 local_irq_restore(flags); 397 local_irq_restore(flags);
398} 398}
399 399
400/**
401 * clockevents_suspend - suspend clock devices
402 */
403void clockevents_suspend(void)
404{
405 struct clock_event_device *dev;
406
407 list_for_each_entry_reverse(dev, &clockevent_devices, list)
408 if (dev->suspend)
409 dev->suspend(dev);
410}
411
412/**
413 * clockevents_resume - resume clock devices
414 */
415void clockevents_resume(void)
416{
417 struct clock_event_device *dev;
418
419 list_for_each_entry(dev, &clockevent_devices, list)
420 if (dev->resume)
421 dev->resume(dev);
422}
423
400#ifdef CONFIG_GENERIC_CLOCKEVENTS 424#ifdef CONFIG_GENERIC_CLOCKEVENTS
401/** 425/**
402 * clockevents_notify - notification about relevant events 426 * clockevents_notify - notification about relevant events
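
clockevents_suspend() above walks the device list in reverse and clockevents_resume() walks it forward, the usual "tear down in reverse registration order" convention. A trivial sketch of that ordering; the device names are made up:

#include <stdio.h>

struct dev { const char *name; };

int main(void)
{
	struct dev devs[] = { {"broadcast"}, {"percpu0"}, {"percpu1"} };
	int n = sizeof(devs) / sizeof(devs[0]);

	for (int i = n - 1; i >= 0; i--)	/* clockevents_suspend() order */
		printf("suspend %s\n", devs[i].name);
	for (int i = 0; i < n; i++)		/* clockevents_resume() order */
		printf("resume  %s\n", devs[i].name);
	return 0;
}
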
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 46da0537c10b..6629bf7b5285 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) 40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
95{ 95{
96 return &clocksource_jiffies; 96 return &clocksource_jiffies;
97} 97}
98
99struct clocksource refined_jiffies;
100
101int register_refined_jiffies(long cycles_per_second)
102{
103 u64 nsec_per_tick, shift_hz;
104 long cycles_per_tick;
105
106
107
108 refined_jiffies = clocksource_jiffies;
109 refined_jiffies.name = "refined-jiffies";
110 refined_jiffies.rating++;
111
112 /* Calc cycles per tick */
113 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
114 /* shift_hz stores hz<<8 for extra accuracy */
115 shift_hz = (u64)cycles_per_second << 8;
116 shift_hz += cycles_per_tick/2;
117 do_div(shift_hz, cycles_per_tick);
118 /* Calculate nsec_per_tick using shift_hz */
119 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
120 nsec_per_tick += (u32)shift_hz/2;
121 do_div(nsec_per_tick, (u32)shift_hz);
122
123 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
124
125 clocksource_register(&refined_jiffies);
126 return 0;
127}
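
register_refined_jiffies() above refines the jiffies clocksource mult from a measured tick rate, carrying eight extra bits of precision in shift_hz. A standalone worked example of the same arithmetic, assuming JIFFIES_SHIFT is 8 and using the classic 1193182 Hz PIT with HZ=100 purely as illustrative inputs:

#include <stdint.h>
#include <stdio.h>

#define HZ		100
#define NSEC_PER_SEC	1000000000ULL
#define JIFFIES_SHIFT	8		/* assumed shift of the jiffies clocksource */

int main(void)
{
	uint64_t cycles_per_second = 1193182;	/* illustrative: i8254 PIT rate */
	uint64_t cycles_per_tick, shift_hz, nsec_per_tick, mult;

	/* cycles per tick, rounded to nearest */
	cycles_per_tick = (cycles_per_second + HZ / 2) / HZ;
	/* shift_hz keeps hz << 8 for eight extra bits of precision */
	shift_hz = (cycles_per_second << 8) + cycles_per_tick / 2;
	shift_hz /= cycles_per_tick;
	/* nanoseconds per tick, measured against the shifted hz */
	nsec_per_tick = (NSEC_PER_SEC << 8) + (uint32_t)shift_hz / 2;
	nsec_per_tick /= (uint32_t)shift_hz;

	mult = (uint64_t)(uint32_t)nsec_per_tick << JIFFIES_SHIFT;
	printf("cycles/tick = %llu, refined mult = %llu\n",
	       (unsigned long long)cycles_per_tick,
	       (unsigned long long)mult);
	return 0;
}
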
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f423bdd035c2..a40260885265 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
835 */ 835 */
836 if (ts->tick_stopped) { 836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog(); 837 touch_softlockup_watchdog();
838 if (idle_cpu(cpu)) 838 if (is_idle_task(current))
839 ts->idle_jiffies++; 839 ts->idle_jiffies++;
840 } 840 }
841 update_process_times(user_mode(regs)); 841 update_process_times(user_mode(regs));
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d3b91e75cecd..e424970bb562 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
13#include <linux/percpu.h> 14#include <linux/percpu.h>
@@ -21,61 +22,6 @@
21#include <linux/tick.h> 22#include <linux/tick.h>
22#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
23 24
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
30 /* The shift value of the current clocksource. */
31 u32 shift;
32 /* Number of clock cycles in one NTP interval. */
33 cycle_t cycle_interval;
34 /* Number of clock shifted nano seconds in one NTP interval. */
35 u64 xtime_interval;
36 /* shifted nano seconds left over when rounding cycle_interval */
37 s64 xtime_remainder;
38 /* Raw nano seconds accumulated per NTP interval. */
39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
45
46 /* Difference between accumulated time and NTP time in ntp
47 * shifted nano seconds. */
48 s64 ntp_error;
49 /* Shift conversion between clock shifted nano seconds and
50 * ntp shifted nano seconds. */
51 u32 ntp_error_shift;
52
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* Offset clock monotonic -> clock realtime */
69 ktime_t offs_real;
70 /* time spent in suspend */
71 struct timespec total_sleep_time;
72 /* Offset clock monotonic -> clock boottime */
73 ktime_t offs_boot;
74 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
75 struct timespec raw_time;
76 /* Seqlock for all timekeeper values */
77 seqlock_t lock;
78};
79 25
80static struct timekeeper timekeeper; 26static struct timekeeper timekeeper;
81 27
@@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
96 } 42 }
97} 43}
98 44
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 45static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 46{
110 tk->xtime_sec = ts->tv_sec; 47 tk->xtime_sec = ts->tv_sec;
@@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
246/* must hold write on timekeeper.lock */ 183/* must hold write on timekeeper.lock */
247static void timekeeping_update(struct timekeeper *tk, bool clearntp) 184static void timekeeping_update(struct timekeeper *tk, bool clearntp)
248{ 185{
249 struct timespec xt;
250
251 if (clearntp) { 186 if (clearntp) {
252 tk->ntp_error = 0; 187 tk->ntp_error = 0;
253 ntp_clear(); 188 ntp_clear();
254 } 189 }
255 xt = tk_xtime(tk); 190 update_vsyscall(tk);
256 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
257} 191}
258 192
259/** 193/**
@@ -776,6 +710,7 @@ static void timekeeping_resume(void)
776 710
777 read_persistent_clock(&ts); 711 read_persistent_clock(&ts);
778 712
713 clockevents_resume();
779 clocksource_resume(); 714 clocksource_resume();
780 715
781 write_seqlock_irqsave(&tk->lock, flags); 716 write_seqlock_irqsave(&tk->lock, flags);
@@ -835,6 +770,7 @@ static int timekeeping_suspend(void)
835 770
836 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 771 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
837 clocksource_suspend(); 772 clocksource_suspend();
773 clockevents_suspend();
838 774
839 return 0; 775 return 0;
840} 776}
@@ -1111,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1111 accumulate_nsecs_to_secs(tk); 1047 accumulate_nsecs_to_secs(tk);
1112 1048
1113 /* Accumulate raw time */ 1049 /* Accumulate raw time */
1114 raw_nsecs = tk->raw_interval << shift; 1050 raw_nsecs = (u64)tk->raw_interval << shift;
1115 raw_nsecs += tk->raw_time.tv_nsec; 1051 raw_nsecs += tk->raw_time.tv_nsec;
1116 if (raw_nsecs >= NSEC_PER_SEC) { 1052 if (raw_nsecs >= NSEC_PER_SEC) {
1117 u64 raw_secs = raw_nsecs; 1053 u64 raw_secs = raw_nsecs;
@@ -1128,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1128 return offset; 1064 return offset;
1129} 1065}
1130 1066
1067#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1068static inline void old_vsyscall_fixup(struct timekeeper *tk)
1069{
1070 s64 remainder;
1071
1072 /*
1073 * Store only full nanoseconds into xtime_nsec after rounding
1074 * it up and add the remainder to the error difference.
1075 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1076 * by truncating the remainder in vsyscalls. However, it causes
1077 * additional work to be done in timekeeping_adjust(). Once
1078 * the vsyscall implementations are converted to use xtime_nsec
1079 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1080 * users are removed, this can be killed.
1081 */
1082 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1083 tk->xtime_nsec -= remainder;
1084 tk->xtime_nsec += 1ULL << tk->shift;
1085 tk->ntp_error += remainder << tk->ntp_error_shift;
1086
1087}
1088#else
1089#define old_vsyscall_fixup(tk)
1090#endif
1091
1092
1093
1131/** 1094/**
1132 * update_wall_time - Uses the current clocksource to increment the wall time 1095 * update_wall_time - Uses the current clocksource to increment the wall time
1133 * 1096 *
@@ -1139,7 +1102,6 @@ static void update_wall_time(void)
1139 cycle_t offset; 1102 cycle_t offset;
1140 int shift = 0, maxshift; 1103 int shift = 0, maxshift;
1141 unsigned long flags; 1104 unsigned long flags;
1142 s64 remainder;
1143 1105
1144 write_seqlock_irqsave(&tk->lock, flags); 1106 write_seqlock_irqsave(&tk->lock, flags);
1145 1107
@@ -1181,20 +1143,11 @@ static void update_wall_time(void)
1181 /* correct the clock when NTP error is too big */ 1143 /* correct the clock when NTP error is too big */
1182 timekeeping_adjust(tk, offset); 1144 timekeeping_adjust(tk, offset);
1183 1145
1184
1185 /* 1146 /*
1186 * Store only full nanoseconds into xtime_nsec after rounding 1147 * XXX This can be killed once everyone converts
1187 * it up and add the remainder to the error difference. 1148 * to the new update_vsyscall.
1188 * XXX - This is necessary to avoid small 1ns inconsistnecies caused 1149 */
1189 * by truncating the remainder in vsyscalls. However, it causes 1150 old_vsyscall_fixup(tk);
1190 * additional work to be done in timekeeping_adjust(). Once
1191 * the vsyscall implementations are converted to use xtime_nsec
1192 * (shifted nanoseconds), this can be killed.
1193 */
1194 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1195 tk->xtime_nsec -= remainder;
1196 tk->xtime_nsec += 1ULL << tk->shift;
1197 tk->ntp_error += remainder << tk->ntp_error_shift;
1198 1151
1199 /* 1152 /*
1200 * Finally, make sure that after the rounding 1153 * Finally, make sure that after the rounding
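
The timekeeping changes above move the old remainder rounding into old_vsyscall_fixup(), used only under GENERIC_TIME_VSYSCALL_OLD: xtime_nsec holds clocksource-shifted nanoseconds, the legacy vsyscall wants whole nanoseconds, so the sub-nanosecond remainder is rounded up and accounted through ntp_error. A small arithmetic sketch with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t shift = 10;			/* illustrative clocksource shift */
	uint64_t xtime_nsec = (123456789ULL << shift) + 700; /* shifted ns + partial ns */
	uint32_t ntp_error_shift = 2;		/* illustrative */
	int64_t ntp_error = 0;

	uint64_t remainder = xtime_nsec & ((1ULL << shift) - 1);
	xtime_nsec -= remainder;		/* drop the partial nanosecond */
	xtime_nsec += 1ULL << shift;		/* ...and round up to a full one */
	ntp_error += (int64_t)remainder << ntp_error_shift; /* remember the error */

	printf("whole ns now: %llu\n",
	       (unsigned long long)(xtime_nsec >> shift));	/* 123456790 */
	printf("ntp_error:    %lld\n", (long long)ntp_error);
	return 0;
}
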
diff --git a/kernel/timer.c b/kernel/timer.c
index d5de1b2292aa..367d00858482 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);
63#define TVR_SIZE (1 << TVR_BITS) 63#define TVR_SIZE (1 << TVR_BITS)
64#define TVN_MASK (TVN_SIZE - 1) 64#define TVN_MASK (TVN_SIZE - 1)
65#define TVR_MASK (TVR_SIZE - 1) 65#define TVR_MASK (TVR_SIZE - 1)
66#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
66 67
67struct tvec { 68struct tvec {
68 struct list_head vec[TVN_SIZE]; 69 struct list_head vec[TVN_SIZE];
@@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
359 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 360 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
360 } else { 361 } else {
361 int i; 362 int i;
362 /* If the timeout is larger than 0xffffffff on 64-bit 363 /* If the timeout is larger than MAX_TVAL (on 64-bit
363 * architectures then we use the maximum timeout: 364 * architectures or with CONFIG_BASE_SMALL=1) then we
365 * use the maximum timeout.
364 */ 366 */
365 if (idx > 0xffffffffUL) { 367 if (idx > MAX_TVAL) {
366 idx = 0xffffffffUL; 368 idx = MAX_TVAL;
367 expires = idx + base->timer_jiffies; 369 expires = idx + base->timer_jiffies;
368 } 370 }
369 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 371 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
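
The timer.c hunk above replaces the hard-coded 0xffffffff clamp with MAX_TVAL, which follows the wheel geometry (TVR_BITS + 4*TVN_BITS index bits). A quick worked example using the usual bit widths (8/6 by default, 6/4 with CONFIG_BASE_SMALL=1); the widths are quoted from memory, so treat them as assumptions:

#include <stdio.h>

static unsigned long long max_tval(int tvr_bits, int tvn_bits)
{
	return (1ULL << (tvr_bits + 4 * tvn_bits)) - 1;
}

int main(void)
{
	/* default: TVR_BITS=8, TVN_BITS=6 -> the old 0xffffffff clamp */
	printf("default:    %#llx\n", max_tval(8, 6));
	/* CONFIG_BASE_SMALL=1: TVR_BITS=6, TVN_BITS=4 -> far smaller wheel */
	printf("BASE_SMALL: %#llx\n", max_tval(6, 4));
	return 0;
}
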
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ec5c1dab629..31e4f55773f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2061,7 +2061,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2061 seq_puts(m, "# -----------------\n"); 2061 seq_puts(m, "# -----------------\n");
2062 seq_printf(m, "# | task: %.16s-%d " 2062 seq_printf(m, "# | task: %.16s-%d "
2063 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 2063 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
2064 data->comm, data->pid, data->uid, data->nice, 2064 data->comm, data->pid,
2065 from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
2065 data->policy, data->rt_priority); 2066 data->policy, data->rt_priority);
2066 seq_puts(m, "# -----------------\n"); 2067 seq_puts(m, "# -----------------\n");
2067 2068
@@ -4199,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
4199 buf->private = 0; 4200 buf->private = 0;
4200} 4201}
4201 4202
4202static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
4203 struct pipe_buffer *buf)
4204{
4205 return 1;
4206}
4207
4208static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 4203static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
4209 struct pipe_buffer *buf) 4204 struct pipe_buffer *buf)
4210{ 4205{
@@ -4220,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
4220 .unmap = generic_pipe_buf_unmap, 4215 .unmap = generic_pipe_buf_unmap,
4221 .confirm = generic_pipe_buf_confirm, 4216 .confirm = generic_pipe_buf_confirm,
4222 .release = buffer_pipe_buf_release, 4217 .release = buffer_pipe_buf_release,
4223 .steal = buffer_pipe_buf_steal, 4218 .steal = generic_pipe_buf_steal,
4224 .get = buffer_pipe_buf_get, 4219 .get = buffer_pipe_buf_get,
4225}; 4220};
4226 4221
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 63a2da0b9a6e..c15f528c1af4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -147,7 +147,7 @@ struct trace_array_cpu {
147 unsigned long skipped_entries; 147 unsigned long skipped_entries;
148 cycle_t preempt_timestamp; 148 cycle_t preempt_timestamp;
149 pid_t pid; 149 pid_t pid;
150 uid_t uid; 150 kuid_t uid;
151 char comm[TASK_COMM_LEN]; 151 char comm[TASK_COMM_LEN];
152}; 152};
153 153
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 483162a9f908..507a7a9630bf 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,7 +13,6 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18 17
19#include "trace.h" 18#include "trace.h"
@@ -76,10 +75,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
76 preempt_enable_notrace(); 75 preempt_enable_notrace();
77} 76}
78 77
79/* Our two options */ 78/* Our option */
80enum { 79enum {
81 TRACE_FUNC_OPT_STACK = 0x1, 80 TRACE_FUNC_OPT_STACK = 0x1,
82 TRACE_FUNC_OPT_PSTORE = 0x2,
83}; 81};
84 82
85static struct tracer_flags func_flags; 83static struct tracer_flags func_flags;
@@ -109,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
109 disabled = atomic_inc_return(&data->disabled); 107 disabled = atomic_inc_return(&data->disabled);
110 108
111 if (likely(disabled == 1)) { 109 if (likely(disabled == 1)) {
112 /*
113 * So far tracing doesn't support multiple buffers, so
114 * we make an explicit call for now.
115 */
116 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
117 pstore_ftrace_call(ip, parent_ip);
118 pc = preempt_count(); 110 pc = preempt_count();
119 trace_function(tr, ip, parent_ip, flags, pc); 111 trace_function(tr, ip, parent_ip, flags, pc);
120 } 112 }
@@ -181,9 +173,6 @@ static struct tracer_opt func_opts[] = {
181#ifdef CONFIG_STACKTRACE 173#ifdef CONFIG_STACKTRACE
182 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 174 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
183#endif 175#endif
184#ifdef CONFIG_PSTORE_FTRACE
185 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
186#endif
187 { } /* Always set a last empty entry */ 176 { } /* Always set a last empty entry */
188}; 177};
189 178
@@ -236,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
236 } 225 }
237 226
238 break; 227 break;
239 case TRACE_FUNC_OPT_PSTORE:
240 break;
241 default: 228 default:
242 return -EINVAL; 229 return -EINVAL;
243 } 230 }
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 23b4d784ebdd..625df0b44690 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,7 +26,9 @@
26/* 26/*
27 * fill in basic accounting fields 27 * fill in basic accounting fields
28 */ 28 */
29void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 29void bacct_add_tsk(struct user_namespace *user_ns,
30 struct pid_namespace *pid_ns,
31 struct taskstats *stats, struct task_struct *tsk)
30{ 32{
31 const struct cred *tcred; 33 const struct cred *tcred;
32 struct timespec uptime, ts; 34 struct timespec uptime, ts;
@@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
55 stats->ac_flag |= AXSIG; 57 stats->ac_flag |= AXSIG;
56 stats->ac_nice = task_nice(tsk); 58 stats->ac_nice = task_nice(tsk);
57 stats->ac_sched = tsk->policy; 59 stats->ac_sched = tsk->policy;
58 stats->ac_pid = tsk->pid; 60 stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
59 rcu_read_lock(); 61 rcu_read_lock();
60 tcred = __task_cred(tsk); 62 tcred = __task_cred(tsk);
61 stats->ac_uid = tcred->uid; 63 stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
62 stats->ac_gid = tcred->gid; 64 stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
63 stats->ac_ppid = pid_alive(tsk) ? 65 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
65 rcu_read_unlock(); 67 rcu_read_unlock();
66 stats->ac_utime = cputime_to_usecs(tsk->utime); 68 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_usecs(tsk->stime); 69 stats->ac_stime = cputime_to_usecs(tsk->stime);
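
bacct_add_tsk() above now reports ids through from_kuid_munged()/from_kgid_munged() and pids through task_*_nr_ns(), i.e. relative to the reader's namespaces, falling back to an overflow id when no mapping exists. A self-contained sketch of the "munged" translation idea, with an invented single-extent map:

#include <stdio.h>

#define OVERFLOW_UID 65534	/* the usual "nobody" overflow id */

struct id_extent { unsigned first, lower_first, count; };

static long map_id_up(const struct id_extent *map, unsigned kuid)
{
	if (kuid >= map->lower_first && kuid - map->lower_first < map->count)
		return map->first + (kuid - map->lower_first);
	return -1;				/* no mapping */
}

static unsigned kuid_munged(const struct id_extent *map, unsigned kuid)
{
	long uid = map_id_up(map, kuid);

	return uid < 0 ? OVERFLOW_UID : (unsigned)uid;
}

int main(void)
{
	/* container maps its uids 0..999 onto kernel uids 100000..100999 */
	struct id_extent map = { .first = 0, .lower_first = 100000, .count = 1000 };

	printf("%u\n", kuid_munged(&map, 100123));	/* 123   */
	printf("%u\n", kuid_munged(&map, 5));		/* 65534 */
	return 0;
}
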
diff --git a/kernel/user.c b/kernel/user.c
index b815fefbe76f..750acffbe9ec 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -38,6 +38,14 @@ struct user_namespace init_user_ns = {
38 .count = 4294967295U, 38 .count = 4294967295U,
39 }, 39 },
40 }, 40 },
41 .projid_map = {
42 .nr_extents = 1,
43 .extent[0] = {
44 .first = 0,
45 .lower_first = 0,
46 .count = 4294967295U,
47 },
48 },
41 .kref = { 49 .kref = {
42 .refcount = ATOMIC_INIT(3), 50 .refcount = ATOMIC_INIT(3),
43 }, 51 },
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86602316422d..456a6b9fba34 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -19,6 +19,7 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/uaccess.h> 20#include <linux/uaccess.h>
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/projid.h>
22 23
23static struct kmem_cache *user_ns_cachep __read_mostly; 24static struct kmem_cache *user_ns_cachep __read_mostly;
24 25
@@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
295} 296}
296EXPORT_SYMBOL(from_kgid_munged); 297EXPORT_SYMBOL(from_kgid_munged);
297 298
299/**
300 * make_kprojid - Map a user-namespace projid pair into a kprojid.
301 * @ns: User namespace that the projid is in
302 * @projid: Project identifier
303 *
304 * Maps a user-namespace projid pair into a kernel internal kprojid,
305 * and returns that kprojid.
306 *
307 * When there is no mapping defined for the user-namespace projid
308 * pair INVALID_PROJID is returned. Callers are expected to test
309 * for and handle INVALID_PROJID being returned. INVALID_PROJID
310 * may be tested for using projid_valid().
311 */
312kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
313{
314	/* Map the projid to a global kernel projid */
315 return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
316}
317EXPORT_SYMBOL(make_kprojid);
318
319/**
320 * from_kprojid - Create a projid from a kprojid user-namespace pair.
321 * @targ: The user namespace we want a projid in.
322 * @kprojid: The kernel internal project identifier to start with.
323 *
324 * Map @kprojid into the user-namespace specified by @targ and
325 * return the resulting projid.
326 *
327 * There is always a mapping into the initial user_namespace.
328 *
329 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
330 */
331projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
332{
333	/* Map the projid from a global kernel projid */
334 return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
335}
336EXPORT_SYMBOL(from_kprojid);
337
338/**
339 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
340 * @targ: The user namespace we want a projid in.
341 * @kprojid: The kernel internal projid to start with.
342 *
343 * Map @kprojid into the user-namespace specified by @targ and
344 * return the resulting projid.
345 *
346 * There is always a mapping into the initial user_namespace.
347 *
348 * Unlike from_kprojid from_kprojid_munged never fails and always
349 * returns a valid projid. This makes from_kprojid_munged
350 * appropriate for use in syscalls like stat, where
351 * failing the system call and failing to provide a valid projid are
352 * not an option.
353 *
354 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
355 */
356projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
357{
358 projid_t projid;
359 projid = from_kprojid(targ, kprojid);
360
361 if (projid == (projid_t) -1)
362 projid = OVERFLOW_PROJID;
363 return projid;
364}
365EXPORT_SYMBOL(from_kprojid_munged);
366
367
298static int uid_m_show(struct seq_file *seq, void *v) 368static int uid_m_show(struct seq_file *seq, void *v)
299{ 369{
300 struct user_namespace *ns = seq->private; 370 struct user_namespace *ns = seq->private;
@@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v)
337 return 0; 407 return 0;
338} 408}
339 409
410static int projid_m_show(struct seq_file *seq, void *v)
411{
412 struct user_namespace *ns = seq->private;
413 struct uid_gid_extent *extent = v;
414 struct user_namespace *lower_ns;
415 projid_t lower;
416
417 lower_ns = seq_user_ns(seq);
418 if ((lower_ns == ns) && lower_ns->parent)
419 lower_ns = lower_ns->parent;
420
421 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
422
423 seq_printf(seq, "%10u %10u %10u\n",
424 extent->first,
425 lower,
426 extent->count);
427
428 return 0;
429}
430
340static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 431static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
341{ 432{
342 struct uid_gid_extent *extent = NULL; 433 struct uid_gid_extent *extent = NULL;
@@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
362 return m_start(seq, ppos, &ns->gid_map); 453 return m_start(seq, ppos, &ns->gid_map);
363} 454}
364 455
456static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
457{
458 struct user_namespace *ns = seq->private;
459
460 return m_start(seq, ppos, &ns->projid_map);
461}
462
365static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 463static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
366{ 464{
367 (*pos)++; 465 (*pos)++;
@@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = {
387 .show = gid_m_show, 485 .show = gid_m_show,
388}; 486};
389 487
488struct seq_operations proc_projid_seq_operations = {
489 .start = projid_m_start,
490 .stop = m_stop,
491 .next = m_next,
492 .show = projid_m_show,
493};
494
390static DEFINE_MUTEX(id_map_mutex); 495static DEFINE_MUTEX(id_map_mutex);
391 496
392static ssize_t map_write(struct file *file, const char __user *buf, 497static ssize_t map_write(struct file *file, const char __user *buf,
@@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
434 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 539 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
435 * over the user namespace in order to set the id mapping. 540 * over the user namespace in order to set the id mapping.
436 */ 541 */
437 if (!ns_capable(ns, cap_setid)) 542 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
438 goto out; 543 goto out;
439 544
440 /* Get a buffer */ 545 /* Get a buffer */
@@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
584 &ns->gid_map, &ns->parent->gid_map); 689 &ns->gid_map, &ns->parent->gid_map);
585} 690}
586 691
692ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
693{
694 struct seq_file *seq = file->private_data;
695 struct user_namespace *ns = seq->private;
696 struct user_namespace *seq_ns = seq_user_ns(seq);
697
698 if (!ns->parent)
699 return -EPERM;
700
701 if ((seq_ns != ns) && (seq_ns != ns->parent))
702 return -EPERM;
703
704 /* Anyone can set any valid project id no capability needed */
705 return map_write(file, buf, size, ppos, -1,
706 &ns->projid_map, &ns->parent->projid_map);
707}
708
587static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
588 struct uid_gid_map *new_map) 710 struct uid_gid_map *new_map)
589{ 711{
712 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid))
714 return true;
715
590 /* Allow the specified ids if we have the appropriate capability 716 /* Allow the specified ids if we have the appropriate capability
591 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 717 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
592 */ 718 */
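
The projid_map additions above mirror the uid/gid maps: the per-namespace map is shown and parsed as "first lower-first count" extents, and, per proc_projid_map_write(), writing it needs no extra capability. A tiny sketch that just prints a map in the same three-column format; the extent values are illustrative:

#include <stdio.h>

struct extent { unsigned first, lower_first, count; };

int main(void)
{
	struct extent projid_map[] = {
		{ 0, 100000, 1000 },	/* projids 0..999 -> 100000..100999 */
	};
	unsigned n = sizeof(projid_map) / sizeof(projid_map[0]);

	for (unsigned i = 0; i < n; i++)	/* same format as projid_m_show() */
		printf("%10u %10u %10u\n", projid_map[i].first,
		       projid_map[i].lower_first, projid_map[i].count);
	return 0;
}
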
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c5a79e2134c..d951daa0ca9a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -58,7 +58,7 @@ enum {
58 * be executing on any CPU. The gcwq behaves as an unbound one. 58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 * 59 *
60 * Note that DISASSOCIATED can be flipped only while holding 60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding 61 * assoc_mutex of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress. 62 * state while create_worker() is in progress.
63 */ 63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ 64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
@@ -73,11 +73,10 @@ enum {
73 WORKER_DIE = 1 << 1, /* die die die */ 73 WORKER_DIE = 1 << 1, /* die die die */
74 WORKER_IDLE = 1 << 2, /* is idle */ 74 WORKER_IDLE = 1 << 2, /* is idle */
75 WORKER_PREP = 1 << 3, /* preparing to run works */ 75 WORKER_PREP = 1 << 3, /* preparing to run works */
76 WORKER_REBIND = 1 << 5, /* mom is home, come back */
77 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
78 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
79 78
80 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
81 WORKER_CPU_INTENSIVE, 80 WORKER_CPU_INTENSIVE,
82 81
83 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
@@ -126,7 +125,6 @@ enum {
126 125
127struct global_cwq; 126struct global_cwq;
128struct worker_pool; 127struct worker_pool;
129struct idle_rebind;
130 128
131/* 129/*
132 * The poor guys doing the actual heavy lifting. All on-duty workers 130 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -150,7 +148,6 @@ struct worker {
150 int id; /* I: worker id */ 148 int id; /* I: worker id */
151 149
152 /* for rebinding worker to CPU */ 150 /* for rebinding worker to CPU */
153 struct idle_rebind *idle_rebind; /* L: for idle worker */
154 struct work_struct rebind_work; /* L: for busy worker */ 151 struct work_struct rebind_work; /* L: for busy worker */
155}; 152};
156 153
@@ -160,13 +157,15 @@ struct worker_pool {
160 157
161 struct list_head worklist; /* L: list of pending works */ 158 struct list_head worklist; /* L: list of pending works */
162 int nr_workers; /* L: total number of workers */ 159 int nr_workers; /* L: total number of workers */
160
161 /* nr_idle includes the ones off idle_list for rebinding */
163 int nr_idle; /* L: currently idle ones */ 162 int nr_idle; /* L: currently idle ones */
164 163
165 struct list_head idle_list; /* X: list of idle workers */ 164 struct list_head idle_list; /* X: list of idle workers */
166 struct timer_list idle_timer; /* L: worker idle timeout */ 165 struct timer_list idle_timer; /* L: worker idle timeout */
167 struct timer_list mayday_timer; /* L: SOS timer for workers */ 166 struct timer_list mayday_timer; /* L: SOS timer for workers */
168 167
169 struct mutex manager_mutex; /* mutex manager should hold */ 168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
170 struct ida worker_ida; /* L: for worker IDs */ 169 struct ida worker_ida; /* L: for worker IDs */
171}; 170};
172 171
@@ -184,9 +183,8 @@ struct global_cwq {
184 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
185 /* L: hash of busy workers */ 184 /* L: hash of busy workers */
186 185
187 struct worker_pool pools[2]; /* normal and highpri pools */ 186 struct worker_pool pools[NR_WORKER_POOLS];
188 187 /* normal and highpri pools */
189 wait_queue_head_t rebind_hold; /* rebind hold wait */
190} ____cacheline_aligned_in_smp; 188} ____cacheline_aligned_in_smp;
191 189
192/* 190/*
@@ -269,17 +267,15 @@ struct workqueue_struct {
269}; 267};
270 268
271struct workqueue_struct *system_wq __read_mostly; 269struct workqueue_struct *system_wq __read_mostly;
272struct workqueue_struct *system_long_wq __read_mostly;
273struct workqueue_struct *system_nrt_wq __read_mostly;
274struct workqueue_struct *system_unbound_wq __read_mostly;
275struct workqueue_struct *system_freezable_wq __read_mostly;
276struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
277EXPORT_SYMBOL_GPL(system_wq); 270EXPORT_SYMBOL_GPL(system_wq);
271struct workqueue_struct *system_highpri_wq __read_mostly;
272EXPORT_SYMBOL_GPL(system_highpri_wq);
273struct workqueue_struct *system_long_wq __read_mostly;
278EXPORT_SYMBOL_GPL(system_long_wq); 274EXPORT_SYMBOL_GPL(system_long_wq);
279EXPORT_SYMBOL_GPL(system_nrt_wq); 275struct workqueue_struct *system_unbound_wq __read_mostly;
280EXPORT_SYMBOL_GPL(system_unbound_wq); 276EXPORT_SYMBOL_GPL(system_unbound_wq);
277struct workqueue_struct *system_freezable_wq __read_mostly;
281EXPORT_SYMBOL_GPL(system_freezable_wq); 278EXPORT_SYMBOL_GPL(system_freezable_wq);
282EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
283 279
284#define CREATE_TRACE_POINTS 280#define CREATE_TRACE_POINTS
285#include <trace/events/workqueue.h> 281#include <trace/events/workqueue.h>
@@ -534,18 +530,24 @@ static int work_next_color(int color)
534} 530}
535 531
536/* 532/*
537 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the 533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
538 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is 534 * contain the pointer to the queued cwq. Once execution starts, the flag
539 * cleared and the work data contains the cpu number it was last on. 535 * is cleared and the high bits contain OFFQ flags and CPU number.
540 * 536 *
541 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the 537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
542 * cwq, cpu or clear work->data. These functions should only be 538 * and clear_work_data() can be used to set the cwq, cpu or clear
543 * called while the work is owned - ie. while the PENDING bit is set. 539 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set.
544 * 541 *
545 * get_work_[g]cwq() can be used to obtain the gcwq or cwq 542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
546 * corresponding to a work. gcwq is available once the work has been 543 * a work. gcwq is available once the work has been queued anywhere after
547 * queued anywhere after initialization. cwq is available only from 544 * initialization until it is sync canceled. cwq is available only while
548 * queueing until execution starts. 545 * the work item is queued.
546 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set
549 * but stay off timer and worklist for arbitrarily long and nobody should
550 * try to steal the PENDING bit.
549 */ 551 */
550static inline void set_work_data(struct work_struct *work, unsigned long data, 552static inline void set_work_data(struct work_struct *work, unsigned long data,
551 unsigned long flags) 553 unsigned long flags)
@@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work,
562 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
563} 565}
564 566
565static void set_work_cpu(struct work_struct *work, unsigned int cpu) 567static void set_work_cpu_and_clear_pending(struct work_struct *work,
568 unsigned int cpu)
566{ 569{
567 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); 570 /*
571 * The following wmb is paired with the implied mb in
572 * test_and_set_bit(PENDING) and ensures all updates to @work made
573 * here are visible to and precede any updates by the next PENDING
574 * owner.
575 */
576 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
568} 578}
569 579
570static void clear_work_data(struct work_struct *work) 580static void clear_work_data(struct work_struct *work)
571{ 581{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */
572 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 583 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
573} 584}
574 585
@@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
591 return ((struct cpu_workqueue_struct *) 602 return ((struct cpu_workqueue_struct *)
592 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
593 604
594 cpu = data >> WORK_STRUCT_FLAG_BITS; 605 cpu = data >> WORK_OFFQ_CPU_SHIFT;
595 if (cpu == WORK_CPU_NONE) 606 if (cpu == WORK_CPU_NONE)
596 return NULL; 607 return NULL;
597 608
@@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
599 return get_gcwq(cpu); 610 return get_gcwq(cpu);
600} 611}
601 612
613static void mark_work_canceling(struct work_struct *work)
614{
615 struct global_cwq *gcwq = get_work_gcwq(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
619 WORK_STRUCT_PENDING);
620}
621
622static bool work_is_canceling(struct work_struct *work)
623{
624 unsigned long data = atomic_long_read(&work->data);
625
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
627}
628
602/* 629/*
603 * Policy functions. These define the policies on how the global worker 630 * Policy functions. These define the policies on how the global worker
604 * pools are managed. Unless noted otherwise, these functions assume that 631 * pools are managed. Unless noted otherwise, these functions assume that
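
The workqueue hunks above repurpose work->data while a work item is off-queue: the low bits carry flags such as PENDING and the new OFFQ_CANCELING, and the bits above WORK_OFFQ_CPU_SHIFT record the last CPU, which is how mark_work_canceling() and get_work_gcwq() cooperate. A standalone sketch of that packing with illustrative bit positions, not the kernel's real constants:

#include <stdio.h>

#define WORK_STRUCT_PENDING	(1UL << 0)	/* illustrative bit positions */
#define WORK_OFFQ_CANCELING	(1UL << 4)
#define WORK_OFFQ_CPU_SHIFT	5

int main(void)
{
	unsigned long cpu = 3;
	unsigned long data;

	/* mark_work_canceling(): keep PENDING, record cpu, set CANCELING */
	data = (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING |
	       WORK_STRUCT_PENDING;
	printf("cpu       = %lu\n", data >> WORK_OFFQ_CPU_SHIFT);
	printf("canceling = %d\n", !!(data & WORK_OFFQ_CANCELING));

	/* set_work_cpu_and_clear_pending(): flags dropped, cpu kept */
	data = cpu << WORK_OFFQ_CPU_SHIFT;
	printf("pending   = %d\n", !!(data & WORK_STRUCT_PENDING));
	return 0;
}
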
@@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool)
657 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 684 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
658 int nr_busy = pool->nr_workers - nr_idle; 685 int nr_busy = pool->nr_workers - nr_idle;
659 686
687 /*
688 * nr_idle and idle_list may disagree if idle rebinding is in
689 * progress. Never return %true if idle_list is empty.
690 */
691 if (list_empty(&pool->idle_list))
692 return false;
693
660 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 694 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
661} 695}
662 696
@@ -903,6 +937,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
903} 937}
904 938
905/** 939/**
940 * move_linked_works - move linked works to a list
941 * @work: start of series of works to be scheduled
942 * @head: target list to append @work to
 943 * @nextp: out parameter for nested worklist walking
944 *
945 * Schedule linked works starting from @work to @head. Work series to
946 * be scheduled starts at @work and includes any consecutive work with
947 * WORK_STRUCT_LINKED set in its predecessor.
948 *
949 * If @nextp is not NULL, it's updated to point to the next work of
950 * the last scheduled work. This allows move_linked_works() to be
951 * nested inside outer list_for_each_entry_safe().
952 *
953 * CONTEXT:
954 * spin_lock_irq(gcwq->lock).
955 */
956static void move_linked_works(struct work_struct *work, struct list_head *head,
957 struct work_struct **nextp)
958{
959 struct work_struct *n;
960
961 /*
962 * Linked worklist will always end before the end of the list,
963 * use NULL for list head.
964 */
965 list_for_each_entry_safe_from(work, n, NULL, entry) {
966 list_move_tail(&work->entry, head);
967 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
968 break;
969 }
970
971 /*
972 * If we're already inside safe list traversal and have moved
973 * multiple works to the scheduled queue, the next position
974 * needs to be updated.
975 */
976 if (nextp)
977 *nextp = n;
978}
979
980static void cwq_activate_delayed_work(struct work_struct *work)
981{
982 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
983
984 trace_workqueue_activate_work(work);
985 move_linked_works(work, &cwq->pool->worklist, NULL);
986 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
987 cwq->nr_active++;
988}
989
990static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
991{
992 struct work_struct *work = list_first_entry(&cwq->delayed_works,
993 struct work_struct, entry);
994
995 cwq_activate_delayed_work(work);
996}
997
998/**
999 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1000 * @cwq: cwq of interest
1001 * @color: color of work which left the queue
1002 *
 1003 * A work item has either completed or been removed from the pending queue;
1004 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1005 *
1006 * CONTEXT:
1007 * spin_lock_irq(gcwq->lock).
1008 */
1009static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1010{
1011 /* ignore uncolored works */
1012 if (color == WORK_NO_COLOR)
1013 return;
1014
1015 cwq->nr_in_flight[color]--;
1016
1017 cwq->nr_active--;
1018 if (!list_empty(&cwq->delayed_works)) {
1019 /* one down, submit a delayed one */
1020 if (cwq->nr_active < cwq->max_active)
1021 cwq_activate_first_delayed(cwq);
1022 }
1023
1024 /* is flush in progress and are we at the flushing tip? */
1025 if (likely(cwq->flush_color != color))
1026 return;
1027
1028 /* are there still in-flight works? */
1029 if (cwq->nr_in_flight[color])
1030 return;
1031
1032 /* this cwq is done, clear flush_color */
1033 cwq->flush_color = -1;
1034
1035 /*
1036 * If this was the last cwq, wake up the first flusher. It
1037 * will handle the rest.
1038 */
1039 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1040 complete(&cwq->wq->first_flusher->done);
1041}
1042
1043/**
1044 * try_to_grab_pending - steal work item from worklist and disable irq
1045 * @work: work item to steal
1046 * @is_dwork: @work is a delayed_work
1047 * @flags: place to store irq state
1048 *
1049 * Try to grab PENDING bit of @work. This function can handle @work in any
1050 * stable state - idle, on timer or on worklist. Return values are
1051 *
1052 * 1 if @work was pending and we successfully stole PENDING
1053 * 0 if @work was idle and we claimed PENDING
1054 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1055 * -ENOENT if someone else is canceling @work, this state may persist
1056 * for arbitrarily long
1057 *
1058 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
1059 * interrupted while holding PENDING and @work off queue, irq must be
1060 * disabled on entry. This, combined with delayed_work->timer being
 1061 * irqsafe, ensures that we return -EAGAIN for a finite short period of time.
1062 *
1063 * On successful return, >= 0, irq is disabled and the caller is
1064 * responsible for releasing it using local_irq_restore(*@flags).
1065 *
1066 * This function is safe to call from any context including IRQ handler.
1067 */
1068static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1069 unsigned long *flags)
1070{
1071 struct global_cwq *gcwq;
1072
1073 local_irq_save(*flags);
1074
1075 /* try to steal the timer if it exists */
1076 if (is_dwork) {
1077 struct delayed_work *dwork = to_delayed_work(work);
1078
1079 /*
1080 * dwork->timer is irqsafe. If del_timer() fails, it's
1081 * guaranteed that the timer is not queued anywhere and not
1082 * running on the local CPU.
1083 */
1084 if (likely(del_timer(&dwork->timer)))
1085 return 1;
1086 }
1087
1088 /* try to claim PENDING the normal way */
1089 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1090 return 0;
1091
1092 /*
1093 * The queueing is in progress, or it is already queued. Try to
1094 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1095 */
1096 gcwq = get_work_gcwq(work);
1097 if (!gcwq)
1098 goto fail;
1099
1100 spin_lock(&gcwq->lock);
1101 if (!list_empty(&work->entry)) {
1102 /*
1103 * This work is queued, but perhaps we locked the wrong gcwq.
1104 * In that case we must see the new value after rmb(), see
1105 * insert_work()->wmb().
1106 */
1107 smp_rmb();
1108 if (gcwq == get_work_gcwq(work)) {
1109 debug_work_deactivate(work);
1110
1111 /*
1112 * A delayed work item cannot be grabbed directly
1113 * because it might have linked NO_COLOR work items
1114 * which, if left on the delayed_list, will confuse
 1115 * cwq->nr_active management later on and cause a
1116 * stall. Make sure the work item is activated
1117 * before grabbing.
1118 */
1119 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1120 cwq_activate_delayed_work(work);
1121
1122 list_del_init(&work->entry);
1123 cwq_dec_nr_in_flight(get_work_cwq(work),
1124 get_work_color(work));
1125
1126 spin_unlock(&gcwq->lock);
1127 return 1;
1128 }
1129 }
1130 spin_unlock(&gcwq->lock);
1131fail:
1132 local_irq_restore(*flags);
1133 if (work_is_canceling(work))
1134 return -ENOENT;
1135 cpu_relax();
1136 return -EAGAIN;
1137}
1138
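
For reference, a condensed sketch of how the return contract documented above is meant to be consumed; it mirrors the internal callers added further down (mod_delayed_work_on() and __cancel_work_timer()). The function name and the give-up-on-cancel policy here are illustrative only, not part of the patch.

        /* Sketch only: honoring the try_to_grab_pending() contract. */
        static bool example_regrab_and_requeue(struct workqueue_struct *wq,
                                               struct delayed_work *dwork,
                                               unsigned long delay)
        {
                unsigned long flags;
                int ret;

                do {
                        /* -EAGAIN: the PENDING owner is transient, retry */
                        ret = try_to_grab_pending(&dwork->work, true, &flags);
                } while (ret == -EAGAIN);

                if (ret < 0)            /* -ENOENT: someone else is canceling */
                        return false;

                /* 0 or 1: we own PENDING and IRQs are off; requeue, release */
                __queue_delayed_work(WORK_CPU_UNBOUND, wq, dwork, delay);
                local_irq_restore(flags);
                return true;
        }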
1139/**
906 * insert_work - insert a work into gcwq 1140 * insert_work - insert a work into gcwq
907 * @cwq: cwq @work belongs to 1141 * @cwq: cwq @work belongs to
908 * @work: work to insert 1142 * @work: work to insert
@@ -982,7 +1216,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
982 struct cpu_workqueue_struct *cwq; 1216 struct cpu_workqueue_struct *cwq;
983 struct list_head *worklist; 1217 struct list_head *worklist;
984 unsigned int work_flags; 1218 unsigned int work_flags;
985 unsigned long flags; 1219 unsigned int req_cpu = cpu;
1220
1221 /*
1222 * While a work item is PENDING && off queue, a task trying to
1223 * steal the PENDING will busy-loop waiting for it to either get
1224 * queued or lose PENDING. Grabbing PENDING and queueing should
1225 * happen with IRQ disabled.
1226 */
1227 WARN_ON_ONCE(!irqs_disabled());
986 1228
987 debug_work_activate(work); 1229 debug_work_activate(work);
988 1230
@@ -995,21 +1237,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
995 if (!(wq->flags & WQ_UNBOUND)) { 1237 if (!(wq->flags & WQ_UNBOUND)) {
996 struct global_cwq *last_gcwq; 1238 struct global_cwq *last_gcwq;
997 1239
998 if (unlikely(cpu == WORK_CPU_UNBOUND)) 1240 if (cpu == WORK_CPU_UNBOUND)
999 cpu = raw_smp_processor_id(); 1241 cpu = raw_smp_processor_id();
1000 1242
1001 /* 1243 /*
1002 * It's multi cpu. If @wq is non-reentrant and @work 1244 * It's multi cpu. If @work was previously on a different
1003 * was previously on a different cpu, it might still 1245 * cpu, it might still be running there, in which case the
1004 * be running there, in which case the work needs to 1246 * work needs to be queued on that cpu to guarantee
1005 * be queued on that cpu to guarantee non-reentrance. 1247 * non-reentrancy.
1006 */ 1248 */
1007 gcwq = get_gcwq(cpu); 1249 gcwq = get_gcwq(cpu);
1008 if (wq->flags & WQ_NON_REENTRANT && 1250 last_gcwq = get_work_gcwq(work);
1009 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { 1251
1252 if (last_gcwq && last_gcwq != gcwq) {
1010 struct worker *worker; 1253 struct worker *worker;
1011 1254
1012 spin_lock_irqsave(&last_gcwq->lock, flags); 1255 spin_lock(&last_gcwq->lock);
1013 1256
1014 worker = find_worker_executing_work(last_gcwq, work); 1257 worker = find_worker_executing_work(last_gcwq, work);
1015 1258
@@ -1017,22 +1260,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1017 gcwq = last_gcwq; 1260 gcwq = last_gcwq;
1018 else { 1261 else {
1019 /* meh... not running there, queue here */ 1262 /* meh... not running there, queue here */
1020 spin_unlock_irqrestore(&last_gcwq->lock, flags); 1263 spin_unlock(&last_gcwq->lock);
1021 spin_lock_irqsave(&gcwq->lock, flags); 1264 spin_lock(&gcwq->lock);
1022 } 1265 }
1023 } else 1266 } else {
1024 spin_lock_irqsave(&gcwq->lock, flags); 1267 spin_lock(&gcwq->lock);
1268 }
1025 } else { 1269 } else {
1026 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1270 gcwq = get_gcwq(WORK_CPU_UNBOUND);
1027 spin_lock_irqsave(&gcwq->lock, flags); 1271 spin_lock(&gcwq->lock);
1028 } 1272 }
1029 1273
1030 /* gcwq determined, get cwq and queue */ 1274 /* gcwq determined, get cwq and queue */
1031 cwq = get_cwq(gcwq->cpu, wq); 1275 cwq = get_cwq(gcwq->cpu, wq);
1032 trace_workqueue_queue_work(cpu, cwq, work); 1276 trace_workqueue_queue_work(req_cpu, cwq, work);
1033 1277
1034 if (WARN_ON(!list_empty(&work->entry))) { 1278 if (WARN_ON(!list_empty(&work->entry))) {
1035 spin_unlock_irqrestore(&gcwq->lock, flags); 1279 spin_unlock(&gcwq->lock);
1036 return; 1280 return;
1037 } 1281 }
1038 1282
@@ -1050,79 +1294,110 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1050 1294
1051 insert_work(cwq, work, worklist, work_flags); 1295 insert_work(cwq, work, worklist, work_flags);
1052 1296
1053 spin_unlock_irqrestore(&gcwq->lock, flags); 1297 spin_unlock(&gcwq->lock);
1054} 1298}
1055 1299
1056/** 1300/**
1057 * queue_work - queue work on a workqueue 1301 * queue_work_on - queue work on specific cpu
1302 * @cpu: CPU number to execute work on
1058 * @wq: workqueue to use 1303 * @wq: workqueue to use
1059 * @work: work to queue 1304 * @work: work to queue
1060 * 1305 *
1061 * Returns 0 if @work was already on a queue, non-zero otherwise. 1306 * Returns %false if @work was already on a queue, %true otherwise.
1062 * 1307 *
1063 * We queue the work to the CPU on which it was submitted, but if the CPU dies 1308 * We queue the work to a specific CPU, the caller must ensure it
1064 * it can be processed by another CPU. 1309 * can't go away.
1065 */ 1310 */
1066int queue_work(struct workqueue_struct *wq, struct work_struct *work) 1311bool queue_work_on(int cpu, struct workqueue_struct *wq,
1312 struct work_struct *work)
1067{ 1313{
1068 int ret; 1314 bool ret = false;
1315 unsigned long flags;
1069 1316
1070 ret = queue_work_on(get_cpu(), wq, work); 1317 local_irq_save(flags);
1071 put_cpu(); 1318
1319 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1320 __queue_work(cpu, wq, work);
1321 ret = true;
1322 }
1072 1323
1324 local_irq_restore(flags);
1073 return ret; 1325 return ret;
1074} 1326}
1075EXPORT_SYMBOL_GPL(queue_work); 1327EXPORT_SYMBOL_GPL(queue_work_on);
1076 1328
1077/** 1329/**
1078 * queue_work_on - queue work on specific cpu 1330 * queue_work - queue work on a workqueue
1079 * @cpu: CPU number to execute work on
1080 * @wq: workqueue to use 1331 * @wq: workqueue to use
1081 * @work: work to queue 1332 * @work: work to queue
1082 * 1333 *
1083 * Returns 0 if @work was already on a queue, non-zero otherwise. 1334 * Returns %false if @work was already on a queue, %true otherwise.
1084 * 1335 *
1085 * We queue the work to a specific CPU, the caller must ensure it 1336 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1086 * can't go away. 1337 * it can be processed by another CPU.
1087 */ 1338 */
1088int 1339bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1089queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1090{ 1340{
1091 int ret = 0; 1341 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1092
1093 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1094 __queue_work(cpu, wq, work);
1095 ret = 1;
1096 }
1097 return ret;
1098} 1342}
1099EXPORT_SYMBOL_GPL(queue_work_on); 1343EXPORT_SYMBOL_GPL(queue_work);
1100 1344
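
A minimal caller-side sketch of the two entry points above; the driver-side names (frob_fn, frob_work, kick_frob) are hypothetical.

        #include <linux/workqueue.h>
        #include <linux/printk.h>

        static void frob_fn(struct work_struct *work)
        {
                /* runs later, in process context, on a kworker thread */
        }
        static DECLARE_WORK(frob_work, frob_fn);

        static void kick_frob(int cpu)
        {
                bool queued;

                if (cpu >= 0)
                        /* bound variant: the caller must keep @cpu online */
                        queued = queue_work_on(cpu, system_wq, &frob_work);
                else
                        /* WORK_CPU_UNBOUND: bound to the local CPU at queue time */
                        queued = queue_work(system_wq, &frob_work);

                if (!queued)
                        pr_debug("frob_work was already pending\n");
        }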
1101static void delayed_work_timer_fn(unsigned long __data) 1345void delayed_work_timer_fn(unsigned long __data)
1102{ 1346{
1103 struct delayed_work *dwork = (struct delayed_work *)__data; 1347 struct delayed_work *dwork = (struct delayed_work *)__data;
1104 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); 1348 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1105 1349
1106 __queue_work(smp_processor_id(), cwq->wq, &dwork->work); 1350 /* should have been called from irqsafe timer with irq already off */
1351 __queue_work(dwork->cpu, cwq->wq, &dwork->work);
1107} 1352}
1353EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
1108 1354
1109/** 1355static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1110 * queue_delayed_work - queue work on a workqueue after delay 1356 struct delayed_work *dwork, unsigned long delay)
1111 * @wq: workqueue to use
1112 * @dwork: delayable work to queue
1113 * @delay: number of jiffies to wait before queueing
1114 *
1115 * Returns 0 if @work was already on a queue, non-zero otherwise.
1116 */
1117int queue_delayed_work(struct workqueue_struct *wq,
1118 struct delayed_work *dwork, unsigned long delay)
1119{ 1357{
1120 if (delay == 0) 1358 struct timer_list *timer = &dwork->timer;
1121 return queue_work(wq, &dwork->work); 1359 struct work_struct *work = &dwork->work;
1360 unsigned int lcpu;
1361
1362 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1363 timer->data != (unsigned long)dwork);
1364 BUG_ON(timer_pending(timer));
1365 BUG_ON(!list_empty(&work->entry));
1366
1367 timer_stats_timer_set_start_info(&dwork->timer);
1368
1369 /*
1370 * This stores cwq for the moment, for the timer_fn. Note that the
1371 * work's gcwq is preserved to allow reentrance detection for
1372 * delayed works.
1373 */
1374 if (!(wq->flags & WQ_UNBOUND)) {
1375 struct global_cwq *gcwq = get_work_gcwq(work);
1122 1376
1123 return queue_delayed_work_on(-1, wq, dwork, delay); 1377 /*
1378 * If we cannot get the last gcwq from @work directly,
1379 * select the last CPU such that it avoids unnecessarily
1380 * triggering non-reentrancy check in __queue_work().
1381 */
1382 lcpu = cpu;
1383 if (gcwq)
1384 lcpu = gcwq->cpu;
1385 if (lcpu == WORK_CPU_UNBOUND)
1386 lcpu = raw_smp_processor_id();
1387 } else {
1388 lcpu = WORK_CPU_UNBOUND;
1389 }
1390
1391 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1392
1393 dwork->cpu = cpu;
1394 timer->expires = jiffies + delay;
1395
1396 if (unlikely(cpu != WORK_CPU_UNBOUND))
1397 add_timer_on(timer, cpu);
1398 else
1399 add_timer(timer);
1124} 1400}
1125EXPORT_SYMBOL_GPL(queue_delayed_work);
1126 1401
1127/** 1402/**
1128 * queue_delayed_work_on - queue work on specific CPU after delay 1403 * queue_delayed_work_on - queue work on specific CPU after delay
@@ -1131,53 +1406,100 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
1131 * @dwork: work to queue 1406 * @dwork: work to queue
1132 * @delay: number of jiffies to wait before queueing 1407 * @delay: number of jiffies to wait before queueing
1133 * 1408 *
1134 * Returns 0 if @work was already on a queue, non-zero otherwise. 1409 * Returns %false if @work was already on a queue, %true otherwise. If
1410 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1411 * execution.
1135 */ 1412 */
1136int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 1413bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1137 struct delayed_work *dwork, unsigned long delay) 1414 struct delayed_work *dwork, unsigned long delay)
1138{ 1415{
1139 int ret = 0;
1140 struct timer_list *timer = &dwork->timer;
1141 struct work_struct *work = &dwork->work; 1416 struct work_struct *work = &dwork->work;
1417 bool ret = false;
1418 unsigned long flags;
1142 1419
1143 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1420 if (!delay)
1144 unsigned int lcpu; 1421 return queue_work_on(cpu, wq, &dwork->work);
1145 1422
1146 BUG_ON(timer_pending(timer)); 1423 /* read the comment in __queue_work() */
1147 BUG_ON(!list_empty(&work->entry)); 1424 local_irq_save(flags);
1148 1425
1149 timer_stats_timer_set_start_info(&dwork->timer); 1426 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1427 __queue_delayed_work(cpu, wq, dwork, delay);
1428 ret = true;
1429 }
1150 1430
1151 /* 1431 local_irq_restore(flags);
1152 * This stores cwq for the moment, for the timer_fn. 1432 return ret;
1153 * Note that the work's gcwq is preserved to allow 1433}
1154 * reentrance detection for delayed works. 1434EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1155 */
1156 if (!(wq->flags & WQ_UNBOUND)) {
1157 struct global_cwq *gcwq = get_work_gcwq(work);
1158 1435
1159 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) 1436/**
1160 lcpu = gcwq->cpu; 1437 * queue_delayed_work - queue work on a workqueue after delay
1161 else 1438 * @wq: workqueue to use
1162 lcpu = raw_smp_processor_id(); 1439 * @dwork: delayable work to queue
1163 } else 1440 * @delay: number of jiffies to wait before queueing
1164 lcpu = WORK_CPU_UNBOUND; 1441 *
1442 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1443 */
1444bool queue_delayed_work(struct workqueue_struct *wq,
1445 struct delayed_work *dwork, unsigned long delay)
1446{
1447 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1448}
1449EXPORT_SYMBOL_GPL(queue_delayed_work);
1165 1450
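
A caller-side sketch of the delayed variants. DECLARE_DELAYED_WORK()/INIT_DELAYED_WORK() are what install delayed_work_timer_fn, which the WARN_ON_ONCE() in __queue_delayed_work() above checks for. The polling names below are hypothetical.

        #include <linux/workqueue.h>
        #include <linux/jiffies.h>

        static void poll_fn(struct work_struct *work)
        {
                /* to_delayed_work() recovers the containing delayed_work */
                struct delayed_work *dwork = to_delayed_work(work);

                /* ... poll the device, then re-arm for the next round ... */
                queue_delayed_work(system_wq, dwork, msecs_to_jiffies(100));
        }
        static DECLARE_DELAYED_WORK(poll_work, poll_fn);

        static void start_polling(void)
        {
                /* first expiry roughly 100ms from now */
                queue_delayed_work(system_wq, &poll_work, msecs_to_jiffies(100));
        }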
1166 set_work_cwq(work, get_cwq(lcpu, wq), 0); 1451/**
1452 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1453 * @cpu: CPU number to execute work on
1454 * @wq: workqueue to use
1455 * @dwork: work to queue
1456 * @delay: number of jiffies to wait before queueing
1457 *
1458 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1459 * modify @dwork's timer so that it expires after @delay. If @delay is
1460 * zero, @work is guaranteed to be scheduled immediately regardless of its
1461 * current state.
1462 *
1463 * Returns %false if @dwork was idle and queued, %true if @dwork was
1464 * pending and its timer was modified.
1465 *
1466 * This function is safe to call from any context including IRQ handler.
1467 * See try_to_grab_pending() for details.
1468 */
1469bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1470 struct delayed_work *dwork, unsigned long delay)
1471{
1472 unsigned long flags;
1473 int ret;
1167 1474
1168 timer->expires = jiffies + delay; 1475 do {
1169 timer->data = (unsigned long)dwork; 1476 ret = try_to_grab_pending(&dwork->work, true, &flags);
1170 timer->function = delayed_work_timer_fn; 1477 } while (unlikely(ret == -EAGAIN));
1171 1478
1172 if (unlikely(cpu >= 0)) 1479 if (likely(ret >= 0)) {
1173 add_timer_on(timer, cpu); 1480 __queue_delayed_work(cpu, wq, dwork, delay);
1174 else 1481 local_irq_restore(flags);
1175 add_timer(timer);
1176 ret = 1;
1177 } 1482 }
1483
1484 /* -ENOENT from try_to_grab_pending() becomes %true */
1178 return ret; 1485 return ret;
1179} 1486}
1180EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1487EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1488
1489/**
1490 * mod_delayed_work - modify delay of or queue a delayed work
1491 * @wq: workqueue to use
1492 * @dwork: work to queue
1493 * @delay: number of jiffies to wait before queueing
1494 *
1495 * mod_delayed_work_on() on local CPU.
1496 */
1497bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1498 unsigned long delay)
1499{
1500 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1501}
1502EXPORT_SYMBOL_GPL(mod_delayed_work);
1181 1503
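
The typical use for mod_delayed_work() is debouncing: every event pushes the deadline out, so only the last event of a burst results in an execution. A hypothetical sketch:

        #include <linux/workqueue.h>
        #include <linux/jiffies.h>

        static void writeback_fn(struct work_struct *work)
        {
                /* flush whatever state accumulated since the last burst */
        }
        static DECLARE_DELAYED_WORK(writeback_work, writeback_fn);

        static void note_dirty(void)
        {
                /*
                 * Queues writeback_work if it is idle, otherwise just pushes
                 * its timer back. Safe from IRQ context per the comment above.
                 */
                mod_delayed_work(system_wq, &writeback_work, msecs_to_jiffies(50));
        }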
1182/** 1504/**
1183 * worker_enter_idle - enter idle state 1505 * worker_enter_idle - enter idle state
@@ -1305,37 +1627,21 @@ __acquires(&gcwq->lock)
1305 } 1627 }
1306} 1628}
1307 1629
1308struct idle_rebind {
1309 int cnt; /* # workers to be rebound */
1310 struct completion done; /* all workers rebound */
1311};
1312
1313/* 1630/*
1314 * Rebind an idle @worker to its CPU. During CPU onlining, this has to 1631 * Rebind an idle @worker to its CPU. worker_thread() will test
1315 * happen synchronously for idle workers. worker_thread() will test 1632 * list_empty(@worker->entry) before leaving idle and call this function.
1316 * %WORKER_REBIND before leaving idle and call this function.
1317 */ 1633 */
1318static void idle_worker_rebind(struct worker *worker) 1634static void idle_worker_rebind(struct worker *worker)
1319{ 1635{
1320 struct global_cwq *gcwq = worker->pool->gcwq; 1636 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1637
1322 /* CPU must be online at this point */ 1638 /* CPU may go down again in between, clear UNBOUND only on success */
1323 WARN_ON(!worker_maybe_bind_and_lock(worker)); 1639 if (worker_maybe_bind_and_lock(worker))
1324 if (!--worker->idle_rebind->cnt) 1640 worker_clr_flags(worker, WORKER_UNBOUND);
1325 complete(&worker->idle_rebind->done);
1326 spin_unlock_irq(&worker->pool->gcwq->lock);
1327 1641
1328 /* we did our part, wait for rebind_workers() to finish up */ 1642 /* rebind complete, become available again */
1329 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); 1643 list_add(&worker->entry, &worker->pool->idle_list);
1330 1644 spin_unlock_irq(&gcwq->lock);
1331 /*
1332 * rebind_workers() shouldn't finish until all workers passed the
1333 * above WORKER_REBIND wait. Tell it when done.
1334 */
1335 spin_lock_irq(&worker->pool->gcwq->lock);
1336 if (!--worker->idle_rebind->cnt)
1337 complete(&worker->idle_rebind->done);
1338 spin_unlock_irq(&worker->pool->gcwq->lock);
1339} 1645}
1340 1646
1341/* 1647/*
@@ -1349,16 +1655,8 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1349 struct worker *worker = container_of(work, struct worker, rebind_work); 1655 struct worker *worker = container_of(work, struct worker, rebind_work);
1350 struct global_cwq *gcwq = worker->pool->gcwq; 1656 struct global_cwq *gcwq = worker->pool->gcwq;
1351 1657
1352 worker_maybe_bind_and_lock(worker); 1658 if (worker_maybe_bind_and_lock(worker))
1353 1659 worker_clr_flags(worker, WORKER_UNBOUND);
1354 /*
1355 * %WORKER_REBIND must be cleared even if the above binding failed;
1356 * otherwise, we may confuse the next CPU_UP cycle or oops / get
1357 * stuck by calling idle_worker_rebind() prematurely. If CPU went
1358 * down again inbetween, %WORKER_UNBOUND would be set, so clearing
1359 * %WORKER_REBIND is always safe.
1360 */
1361 worker_clr_flags(worker, WORKER_REBIND);
1362 1660
1363 spin_unlock_irq(&gcwq->lock); 1661 spin_unlock_irq(&gcwq->lock);
1364} 1662}
@@ -1370,123 +1668,74 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1370 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1668 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1371 * is different for idle and busy ones. 1669 * is different for idle and busy ones.
1372 * 1670 *
1373 * The idle ones should be rebound synchronously and idle rebinding should 1671 * Idle ones will be removed from the idle_list and woken up. They will
1374 * be complete before any worker starts executing work items with 1672 * add themselves back after completing rebind. This ensures that the
1375 * concurrency management enabled; otherwise, scheduler may oops trying to 1673 * idle_list doesn't contain any unbound workers when re-bound busy workers
1376 * wake up non-local idle worker from wq_worker_sleeping(). 1674 * try to perform local wake-ups for concurrency management.
1377 * 1675 *
1378 * This is achieved by repeatedly requesting rebinding until all idle 1676 * Busy workers can rebind after they finish their current work items.
1379 * workers are known to have been rebound under @gcwq->lock and holding all 1677 * Queueing the rebind work item at the head of the scheduled list is
1380 * idle workers from becoming busy until idle rebinding is complete. 1678 * enough. Note that nr_running will be properly bumped as busy workers
1679 * rebind.
1381 * 1680 *
1382 * Once idle workers are rebound, busy workers can be rebound as they 1681 * On return, all non-manager workers are scheduled for rebind - see
1383 * finish executing their current work items. Queueing the rebind work at 1682 * manage_workers() for the manager special case. Any idle worker
1384 * the head of their scheduled lists is enough. Note that nr_running will 1683 * including the manager will not appear on @idle_list until rebind is
1385 * be properly bumped as busy workers rebind. 1684 * complete, making local wake-ups safe.
1386 *
1387 * On return, all workers are guaranteed to either be bound or have rebind
1388 * work item scheduled.
1389 */ 1685 */
1390static void rebind_workers(struct global_cwq *gcwq) 1686static void rebind_workers(struct global_cwq *gcwq)
1391 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1392{ 1687{
1393 struct idle_rebind idle_rebind;
1394 struct worker_pool *pool; 1688 struct worker_pool *pool;
1395 struct worker *worker; 1689 struct worker *worker, *n;
1396 struct hlist_node *pos; 1690 struct hlist_node *pos;
1397 int i; 1691 int i;
1398 1692
1399 lockdep_assert_held(&gcwq->lock); 1693 lockdep_assert_held(&gcwq->lock);
1400 1694
1401 for_each_worker_pool(pool, gcwq) 1695 for_each_worker_pool(pool, gcwq)
1402 lockdep_assert_held(&pool->manager_mutex); 1696 lockdep_assert_held(&pool->assoc_mutex);
1403 1697
1404 /* 1698 /* dequeue and kick idle ones */
1405 * Rebind idle workers. Interlocked both ways. We wait for
1406 * workers to rebind via @idle_rebind.done. Workers will wait for
1407 * us to finish up by watching %WORKER_REBIND.
1408 */
1409 init_completion(&idle_rebind.done);
1410retry:
1411 idle_rebind.cnt = 1;
1412 INIT_COMPLETION(idle_rebind.done);
1413
1414 /* set REBIND and kick idle ones, we'll wait for these later */
1415 for_each_worker_pool(pool, gcwq) { 1699 for_each_worker_pool(pool, gcwq) {
1416 list_for_each_entry(worker, &pool->idle_list, entry) { 1700 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1417 unsigned long worker_flags = worker->flags; 1701 /*
1418 1702 * idle workers should be off @pool->idle_list
1419 if (worker->flags & WORKER_REBIND) 1703 * until rebind is complete to avoid receiving
1420 continue; 1704 * premature local wake-ups.
1421 1705 */
1422 /* morph UNBOUND to REBIND atomically */ 1706 list_del_init(&worker->entry);
1423 worker_flags &= ~WORKER_UNBOUND;
1424 worker_flags |= WORKER_REBIND;
1425 ACCESS_ONCE(worker->flags) = worker_flags;
1426
1427 idle_rebind.cnt++;
1428 worker->idle_rebind = &idle_rebind;
1429 1707
1430 /* worker_thread() will call idle_worker_rebind() */ 1708 /*
1709 * worker_thread() will see the above dequeuing
1710 * and call idle_worker_rebind().
1711 */
1431 wake_up_process(worker->task); 1712 wake_up_process(worker->task);
1432 } 1713 }
1433 } 1714 }
1434 1715
1435 if (--idle_rebind.cnt) { 1716 /* rebind busy workers */
1436 spin_unlock_irq(&gcwq->lock);
1437 wait_for_completion(&idle_rebind.done);
1438 spin_lock_irq(&gcwq->lock);
1439 /* busy ones might have become idle while waiting, retry */
1440 goto retry;
1441 }
1442
1443 /* all idle workers are rebound, rebind busy workers */
1444 for_each_busy_worker(worker, i, pos, gcwq) { 1717 for_each_busy_worker(worker, i, pos, gcwq) {
1445 struct work_struct *rebind_work = &worker->rebind_work; 1718 struct work_struct *rebind_work = &worker->rebind_work;
1446 unsigned long worker_flags = worker->flags; 1719 struct workqueue_struct *wq;
1447
1448 /* morph UNBOUND to REBIND atomically */
1449 worker_flags &= ~WORKER_UNBOUND;
1450 worker_flags |= WORKER_REBIND;
1451 ACCESS_ONCE(worker->flags) = worker_flags;
1452 1720
1453 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, 1721 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1454 work_data_bits(rebind_work))) 1722 work_data_bits(rebind_work)))
1455 continue; 1723 continue;
1456 1724
1457 /* wq doesn't matter, use the default one */
1458 debug_work_activate(rebind_work); 1725 debug_work_activate(rebind_work);
1459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1460 worker->scheduled.next,
1461 work_color_to_flags(WORK_NO_COLOR));
1462 }
1463
1464 /*
1465 * All idle workers are rebound and waiting for %WORKER_REBIND to
1466 * be cleared inside idle_worker_rebind(). Clear and release.
1467 * Clearing %WORKER_REBIND from this foreign context is safe
1468 * because these workers are still guaranteed to be idle.
1469 *
1470 * We need to make sure all idle workers passed WORKER_REBIND wait
1471 * in idle_worker_rebind() before returning; otherwise, workers can
1472 * get stuck at the wait if hotplug cycle repeats.
1473 */
1474 idle_rebind.cnt = 1;
1475 INIT_COMPLETION(idle_rebind.done);
1476
1477 for_each_worker_pool(pool, gcwq) {
1478 list_for_each_entry(worker, &pool->idle_list, entry) {
1479 worker->flags &= ~WORKER_REBIND;
1480 idle_rebind.cnt++;
1481 }
1482 }
1483 1726
1484 wake_up_all(&gcwq->rebind_hold); 1727 /*
1728 * wq doesn't really matter but let's keep @worker->pool
1729 * and @cwq->pool consistent for sanity.
1730 */
1731 if (worker_pool_pri(worker->pool))
1732 wq = system_highpri_wq;
1733 else
1734 wq = system_wq;
1485 1735
1486 if (--idle_rebind.cnt) { 1736 insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
1487 spin_unlock_irq(&gcwq->lock); 1737 worker->scheduled.next,
1488 wait_for_completion(&idle_rebind.done); 1738 work_color_to_flags(WORK_NO_COLOR));
1489 spin_lock_irq(&gcwq->lock);
1490 } 1739 }
1491} 1740}
1492 1741
@@ -1844,22 +2093,22 @@ static bool manage_workers(struct worker *worker)
1844 * grab %POOL_MANAGING_WORKERS to achieve this because that can 2093 * grab %POOL_MANAGING_WORKERS to achieve this because that can
1845 * lead to idle worker depletion (all become busy thinking someone 2094 * lead to idle worker depletion (all become busy thinking someone
1846 * else is managing) which in turn can result in deadlock under 2095 * else is managing) which in turn can result in deadlock under
1847 * extreme circumstances. Use @pool->manager_mutex to synchronize 2096 * extreme circumstances. Use @pool->assoc_mutex to synchronize
1848 * manager against CPU hotplug. 2097 * manager against CPU hotplug.
1849 * 2098 *
1850 * manager_mutex would always be free unless CPU hotplug is in 2099 * assoc_mutex would always be free unless CPU hotplug is in
1851 * progress. trylock first without dropping @gcwq->lock. 2100 * progress. trylock first without dropping @gcwq->lock.
1852 */ 2101 */
1853 if (unlikely(!mutex_trylock(&pool->manager_mutex))) { 2102 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
1854 spin_unlock_irq(&pool->gcwq->lock); 2103 spin_unlock_irq(&pool->gcwq->lock);
1855 mutex_lock(&pool->manager_mutex); 2104 mutex_lock(&pool->assoc_mutex);
1856 /* 2105 /*
1857 * CPU hotplug could have happened while we were waiting 2106 * CPU hotplug could have happened while we were waiting
1858 * for manager_mutex. Hotplug itself can't handle us 2107 * for assoc_mutex. Hotplug itself can't handle us
1859 * because manager isn't either on idle or busy list, and 2108 * because manager isn't either on idle or busy list, and
1860 * @gcwq's state and ours could have deviated. 2109 * @gcwq's state and ours could have deviated.
1861 * 2110 *
1862 * As hotplug is now excluded via manager_mutex, we can 2111 * As hotplug is now excluded via assoc_mutex, we can
1863 * simply try to bind. It will succeed or fail depending 2112 * simply try to bind. It will succeed or fail depending
1864 * on @gcwq's current state. Try it and adjust 2113 * on @gcwq's current state. Try it and adjust
1865 * %WORKER_UNBOUND accordingly. 2114 * %WORKER_UNBOUND accordingly.
@@ -1882,112 +2131,11 @@ static bool manage_workers(struct worker *worker)
1882 ret |= maybe_create_worker(pool); 2131 ret |= maybe_create_worker(pool);
1883 2132
1884 pool->flags &= ~POOL_MANAGING_WORKERS; 2133 pool->flags &= ~POOL_MANAGING_WORKERS;
1885 mutex_unlock(&pool->manager_mutex); 2134 mutex_unlock(&pool->assoc_mutex);
1886 return ret; 2135 return ret;
1887} 2136}
1888 2137
1889/** 2138/**
1890 * move_linked_works - move linked works to a list
1891 * @work: start of series of works to be scheduled
1892 * @head: target list to append @work to
1893 * @nextp: out parameter for nested worklist walking
1894 *
1895 * Schedule linked works starting from @work to @head. Work series to
1896 * be scheduled starts at @work and includes any consecutive work with
1897 * WORK_STRUCT_LINKED set in its predecessor.
1898 *
1899 * If @nextp is not NULL, it's updated to point to the next work of
1900 * the last scheduled work. This allows move_linked_works() to be
1901 * nested inside outer list_for_each_entry_safe().
1902 *
1903 * CONTEXT:
1904 * spin_lock_irq(gcwq->lock).
1905 */
1906static void move_linked_works(struct work_struct *work, struct list_head *head,
1907 struct work_struct **nextp)
1908{
1909 struct work_struct *n;
1910
1911 /*
1912 * Linked worklist will always end before the end of the list,
1913 * use NULL for list head.
1914 */
1915 list_for_each_entry_safe_from(work, n, NULL, entry) {
1916 list_move_tail(&work->entry, head);
1917 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1918 break;
1919 }
1920
1921 /*
1922 * If we're already inside safe list traversal and have moved
1923 * multiple works to the scheduled queue, the next position
1924 * needs to be updated.
1925 */
1926 if (nextp)
1927 *nextp = n;
1928}
1929
1930static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1931{
1932 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1933 struct work_struct, entry);
1934
1935 trace_workqueue_activate_work(work);
1936 move_linked_works(work, &cwq->pool->worklist, NULL);
1937 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1938 cwq->nr_active++;
1939}
1940
1941/**
1942 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1943 * @cwq: cwq of interest
1944 * @color: color of work which left the queue
1945 * @delayed: for a delayed work
1946 *
1947 * A work item has either completed or been removed from the pending queue;
1948 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1949 *
1950 * CONTEXT:
1951 * spin_lock_irq(gcwq->lock).
1952 */
1953static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1954 bool delayed)
1955{
1956 /* ignore uncolored works */
1957 if (color == WORK_NO_COLOR)
1958 return;
1959
1960 cwq->nr_in_flight[color]--;
1961
1962 if (!delayed) {
1963 cwq->nr_active--;
1964 if (!list_empty(&cwq->delayed_works)) {
1965 /* one down, submit a delayed one */
1966 if (cwq->nr_active < cwq->max_active)
1967 cwq_activate_first_delayed(cwq);
1968 }
1969 }
1970
1971 /* is flush in progress and are we at the flushing tip? */
1972 if (likely(cwq->flush_color != color))
1973 return;
1974
1975 /* are there still in-flight works? */
1976 if (cwq->nr_in_flight[color])
1977 return;
1978
1979 /* this cwq is done, clear flush_color */
1980 cwq->flush_color = -1;
1981
1982 /*
1983 * If this was the last cwq, wake up the first flusher. It
1984 * will handle the rest.
1985 */
1986 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1987 complete(&cwq->wq->first_flusher->done);
1988}
1989
1990/**
1991 * process_one_work - process single work 2139 * process_one_work - process single work
1992 * @worker: self 2140 * @worker: self
1993 * @work: work to process 2141 * @work: work to process
@@ -2030,7 +2178,7 @@ __acquires(&gcwq->lock)
2030 * necessary to avoid spurious warnings from rescuers servicing the 2178 * necessary to avoid spurious warnings from rescuers servicing the
2031 * unbound or a disassociated gcwq. 2179 * unbound or a disassociated gcwq.
2032 */ 2180 */
2033 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && 2181 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2034 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2182 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2035 raw_smp_processor_id() != gcwq->cpu); 2183 raw_smp_processor_id() != gcwq->cpu);
2036 2184
@@ -2046,15 +2194,13 @@ __acquires(&gcwq->lock)
2046 return; 2194 return;
2047 } 2195 }
2048 2196
2049 /* claim and process */ 2197 /* claim and dequeue */
2050 debug_work_deactivate(work); 2198 debug_work_deactivate(work);
2051 hlist_add_head(&worker->hentry, bwh); 2199 hlist_add_head(&worker->hentry, bwh);
2052 worker->current_work = work; 2200 worker->current_work = work;
2053 worker->current_cwq = cwq; 2201 worker->current_cwq = cwq;
2054 work_color = get_work_color(work); 2202 work_color = get_work_color(work);
2055 2203
2056 /* record the current cpu number in the work data and dequeue */
2057 set_work_cpu(work, gcwq->cpu);
2058 list_del_init(&work->entry); 2204 list_del_init(&work->entry);
2059 2205
2060 /* 2206 /*
@@ -2071,9 +2217,16 @@ __acquires(&gcwq->lock)
2071 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2217 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2072 wake_up_worker(pool); 2218 wake_up_worker(pool);
2073 2219
2220 /*
2221 * Record the last CPU and clear PENDING which should be the last
2222 * update to @work. Also, do this inside @gcwq->lock so that
2223 * PENDING and queued state changes happen together while IRQ is
2224 * disabled.
2225 */
2226 set_work_cpu_and_clear_pending(work, gcwq->cpu);
2227
2074 spin_unlock_irq(&gcwq->lock); 2228 spin_unlock_irq(&gcwq->lock);
2075 2229
2076 work_clear_pending(work);
2077 lock_map_acquire_read(&cwq->wq->lockdep_map); 2230 lock_map_acquire_read(&cwq->wq->lockdep_map);
2078 lock_map_acquire(&lockdep_map); 2231 lock_map_acquire(&lockdep_map);
2079 trace_workqueue_execute_start(work); 2232 trace_workqueue_execute_start(work);
@@ -2087,11 +2240,9 @@ __acquires(&gcwq->lock)
2087 lock_map_release(&cwq->wq->lockdep_map); 2240 lock_map_release(&cwq->wq->lockdep_map);
2088 2241
2089 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2242 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2090 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 2243 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2091 "%s/0x%08x/%d\n", 2244 " last function: %pf\n",
2092 current->comm, preempt_count(), task_pid_nr(current)); 2245 current->comm, preempt_count(), task_pid_nr(current), f);
2093 printk(KERN_ERR " last function: ");
2094 print_symbol("%s\n", (unsigned long)f);
2095 debug_show_held_locks(current); 2246 debug_show_held_locks(current);
2096 dump_stack(); 2247 dump_stack();
2097 } 2248 }
@@ -2106,7 +2257,7 @@ __acquires(&gcwq->lock)
2106 hlist_del_init(&worker->hentry); 2257 hlist_del_init(&worker->hentry);
2107 worker->current_work = NULL; 2258 worker->current_work = NULL;
2108 worker->current_cwq = NULL; 2259 worker->current_cwq = NULL;
2109 cwq_dec_nr_in_flight(cwq, work_color, false); 2260 cwq_dec_nr_in_flight(cwq, work_color);
2110} 2261}
2111 2262
2112/** 2263/**
@@ -2151,18 +2302,17 @@ static int worker_thread(void *__worker)
2151woke_up: 2302woke_up:
2152 spin_lock_irq(&gcwq->lock); 2303 spin_lock_irq(&gcwq->lock);
2153 2304
2154 /* 2305 /* we are off idle list if destruction or rebind is requested */
2155 * DIE can be set only while idle and REBIND set while busy has 2306 if (unlikely(list_empty(&worker->entry))) {
2156 * @worker->rebind_work scheduled. Checking here is enough.
2157 */
2158 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
2159 spin_unlock_irq(&gcwq->lock); 2307 spin_unlock_irq(&gcwq->lock);
2160 2308
2309 /* if DIE is set, destruction is requested */
2161 if (worker->flags & WORKER_DIE) { 2310 if (worker->flags & WORKER_DIE) {
2162 worker->task->flags &= ~PF_WQ_WORKER; 2311 worker->task->flags &= ~PF_WQ_WORKER;
2163 return 0; 2312 return 0;
2164 } 2313 }
2165 2314
2315 /* otherwise, rebind */
2166 idle_worker_rebind(worker); 2316 idle_worker_rebind(worker);
2167 goto woke_up; 2317 goto woke_up;
2168 } 2318 }
@@ -2645,8 +2795,8 @@ reflush:
2645 2795
2646 if (++flush_cnt == 10 || 2796 if (++flush_cnt == 10 ||
2647 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2797 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2648 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", 2798 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
2649 wq->name, flush_cnt); 2799 wq->name, flush_cnt);
2650 goto reflush; 2800 goto reflush;
2651 } 2801 }
2652 2802
@@ -2657,8 +2807,7 @@ reflush:
2657} 2807}
2658EXPORT_SYMBOL_GPL(drain_workqueue); 2808EXPORT_SYMBOL_GPL(drain_workqueue);
2659 2809
2660static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2810static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2661 bool wait_executing)
2662{ 2811{
2663 struct worker *worker = NULL; 2812 struct worker *worker = NULL;
2664 struct global_cwq *gcwq; 2813 struct global_cwq *gcwq;
@@ -2680,13 +2829,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2680 cwq = get_work_cwq(work); 2829 cwq = get_work_cwq(work);
2681 if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) 2830 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2682 goto already_gone; 2831 goto already_gone;
2683 } else if (wait_executing) { 2832 } else {
2684 worker = find_worker_executing_work(gcwq, work); 2833 worker = find_worker_executing_work(gcwq, work);
2685 if (!worker) 2834 if (!worker)
2686 goto already_gone; 2835 goto already_gone;
2687 cwq = worker->current_cwq; 2836 cwq = worker->current_cwq;
2688 } else 2837 }
2689 goto already_gone;
2690 2838
2691 insert_wq_barrier(cwq, barr, work, worker); 2839 insert_wq_barrier(cwq, barr, work, worker);
2692 spin_unlock_irq(&gcwq->lock); 2840 spin_unlock_irq(&gcwq->lock);
@@ -2713,15 +2861,8 @@ already_gone:
2713 * flush_work - wait for a work to finish executing the last queueing instance 2861 * flush_work - wait for a work to finish executing the last queueing instance
2714 * @work: the work to flush 2862 * @work: the work to flush
2715 * 2863 *
2716 * Wait until @work has finished execution. This function considers 2864 * Wait until @work has finished execution. @work is guaranteed to be idle
2717 * only the last queueing instance of @work. If @work has been 2865 * on return if it hasn't been requeued since flush started.
2718 * enqueued across different CPUs on a non-reentrant workqueue or on
2719 * multiple workqueues, @work might still be executing on return on
2720 * some of the CPUs from earlier queueing.
2721 *
2722 * If @work was queued only on a non-reentrant, ordered or unbound
2723 * workqueue, @work is guaranteed to be idle on return if it hasn't
2724 * been requeued since flush started.
2725 * 2866 *
2726 * RETURNS: 2867 * RETURNS:
2727 * %true if flush_work() waited for the work to finish execution, 2868 * %true if flush_work() waited for the work to finish execution,
@@ -2734,140 +2875,36 @@ bool flush_work(struct work_struct *work)
2734 lock_map_acquire(&work->lockdep_map); 2875 lock_map_acquire(&work->lockdep_map);
2735 lock_map_release(&work->lockdep_map); 2876 lock_map_release(&work->lockdep_map);
2736 2877
2737 if (start_flush_work(work, &barr, true)) { 2878 if (start_flush_work(work, &barr)) {
2738 wait_for_completion(&barr.done); 2879 wait_for_completion(&barr.done);
2739 destroy_work_on_stack(&barr.work); 2880 destroy_work_on_stack(&barr.work);
2740 return true; 2881 return true;
2741 } else 2882 } else {
2742 return false;
2743}
2744EXPORT_SYMBOL_GPL(flush_work);
2745
2746static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2747{
2748 struct wq_barrier barr;
2749 struct worker *worker;
2750
2751 spin_lock_irq(&gcwq->lock);
2752
2753 worker = find_worker_executing_work(gcwq, work);
2754 if (unlikely(worker))
2755 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2756
2757 spin_unlock_irq(&gcwq->lock);
2758
2759 if (unlikely(worker)) {
2760 wait_for_completion(&barr.done);
2761 destroy_work_on_stack(&barr.work);
2762 return true;
2763 } else
2764 return false; 2883 return false;
2765}
2766
2767static bool wait_on_work(struct work_struct *work)
2768{
2769 bool ret = false;
2770 int cpu;
2771
2772 might_sleep();
2773
2774 lock_map_acquire(&work->lockdep_map);
2775 lock_map_release(&work->lockdep_map);
2776
2777 for_each_gcwq_cpu(cpu)
2778 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2779 return ret;
2780}
2781
2782/**
2783 * flush_work_sync - wait until a work has finished execution
2784 * @work: the work to flush
2785 *
2786 * Wait until @work has finished execution. On return, it's
2787 * guaranteed that all queueing instances of @work which happened
2788 * before this function is called are finished. In other words, if
2789 * @work hasn't been requeued since this function was called, @work is
2790 * guaranteed to be idle on return.
2791 *
2792 * RETURNS:
2793 * %true if flush_work_sync() waited for the work to finish execution,
2794 * %false if it was already idle.
2795 */
2796bool flush_work_sync(struct work_struct *work)
2797{
2798 struct wq_barrier barr;
2799 bool pending, waited;
2800
2801 /* we'll wait for executions separately, queue barr only if pending */
2802 pending = start_flush_work(work, &barr, false);
2803
2804 /* wait for executions to finish */
2805 waited = wait_on_work(work);
2806
2807 /* wait for the pending one */
2808 if (pending) {
2809 wait_for_completion(&barr.done);
2810 destroy_work_on_stack(&barr.work);
2811 } 2884 }
2812
2813 return pending || waited;
2814}
2815EXPORT_SYMBOL_GPL(flush_work_sync);
2816
2817/*
2818 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2819 * so this work can't be re-armed in any way.
2820 */
2821static int try_to_grab_pending(struct work_struct *work)
2822{
2823 struct global_cwq *gcwq;
2824 int ret = -1;
2825
2826 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2827 return 0;
2828
2829 /*
2830 * The queueing is in progress, or it is already queued. Try to
2831 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2832 */
2833 gcwq = get_work_gcwq(work);
2834 if (!gcwq)
2835 return ret;
2836
2837 spin_lock_irq(&gcwq->lock);
2838 if (!list_empty(&work->entry)) {
2839 /*
2840 * This work is queued, but perhaps we locked the wrong gcwq.
2841 * In that case we must see the new value after rmb(), see
2842 * insert_work()->wmb().
2843 */
2844 smp_rmb();
2845 if (gcwq == get_work_gcwq(work)) {
2846 debug_work_deactivate(work);
2847 list_del_init(&work->entry);
2848 cwq_dec_nr_in_flight(get_work_cwq(work),
2849 get_work_color(work),
2850 *work_data_bits(work) & WORK_STRUCT_DELAYED);
2851 ret = 1;
2852 }
2853 }
2854 spin_unlock_irq(&gcwq->lock);
2855
2856 return ret;
2857} 2885}
2886EXPORT_SYMBOL_GPL(flush_work);
2858 2887
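
As documented above, flush_work() now only considers the last queueing instance, so a caller that needs its update to have been processed can simply queue and then flush. The names below are hypothetical and this must run in sleepable context.

        #include <linux/workqueue.h>

        static void publish_fn(struct work_struct *work)
        {
                /* push cached state out to its consumers */
        }
        static DECLARE_WORK(publish_work, publish_fn);

        static void publish_and_wait(void)
        {
                queue_work(system_wq, &publish_work);

                /* returns %true here (it waited); %false if already idle */
                flush_work(&publish_work);
        }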
2859static bool __cancel_work_timer(struct work_struct *work, 2888static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2860 struct timer_list* timer)
2861{ 2889{
2890 unsigned long flags;
2862 int ret; 2891 int ret;
2863 2892
2864 do { 2893 do {
2865 ret = (timer && likely(del_timer(timer))); 2894 ret = try_to_grab_pending(work, is_dwork, &flags);
2866 if (!ret) 2895 /*
2867 ret = try_to_grab_pending(work); 2896 * If someone else is canceling, wait for the same event it
2868 wait_on_work(work); 2897 * would be waiting for before retrying.
2898 */
2899 if (unlikely(ret == -ENOENT))
2900 flush_work(work);
2869 } while (unlikely(ret < 0)); 2901 } while (unlikely(ret < 0));
2870 2902
2903 /* tell other tasks trying to grab @work to back off */
2904 mark_work_canceling(work);
2905 local_irq_restore(flags);
2906
2907 flush_work(work);
2871 clear_work_data(work); 2908 clear_work_data(work);
2872 return ret; 2909 return ret;
2873} 2910}
@@ -2892,7 +2929,7 @@ static bool __cancel_work_timer(struct work_struct *work,
2892 */ 2929 */
2893bool cancel_work_sync(struct work_struct *work) 2930bool cancel_work_sync(struct work_struct *work)
2894{ 2931{
2895 return __cancel_work_timer(work, NULL); 2932 return __cancel_work_timer(work, false);
2896} 2933}
2897EXPORT_SYMBOL_GPL(cancel_work_sync); 2934EXPORT_SYMBOL_GPL(cancel_work_sync);
2898 2935
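
The usual caller of __cancel_work_timer() via cancel_work_sync() is a teardown path that must guarantee the work item can no longer touch resources about to be freed; a hypothetical driver sketch:

        #include <linux/workqueue.h>
        #include <linux/io.h>

        struct foo_dev {
                struct work_struct event_work;
                void __iomem *regs;
        };

        static void foo_remove(struct foo_dev *foo)
        {
                /*
                 * Sleeps until event_work is neither pending nor running and
                 * cannot re-arm itself; only then is it safe to tear down the
                 * resources the handler uses.
                 */
                cancel_work_sync(&foo->event_work);
                iounmap(foo->regs);
        }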
@@ -2910,33 +2947,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2910 */ 2947 */
2911bool flush_delayed_work(struct delayed_work *dwork) 2948bool flush_delayed_work(struct delayed_work *dwork)
2912{ 2949{
2950 local_irq_disable();
2913 if (del_timer_sync(&dwork->timer)) 2951 if (del_timer_sync(&dwork->timer))
2914 __queue_work(raw_smp_processor_id(), 2952 __queue_work(dwork->cpu,
2915 get_work_cwq(&dwork->work)->wq, &dwork->work); 2953 get_work_cwq(&dwork->work)->wq, &dwork->work);
2954 local_irq_enable();
2916 return flush_work(&dwork->work); 2955 return flush_work(&dwork->work);
2917} 2956}
2918EXPORT_SYMBOL(flush_delayed_work); 2957EXPORT_SYMBOL(flush_delayed_work);
2919 2958
2920/** 2959/**
2921 * flush_delayed_work_sync - wait for a dwork to finish 2960 * cancel_delayed_work - cancel a delayed work
2922 * @dwork: the delayed work to flush 2961 * @dwork: delayed_work to cancel
2923 * 2962 *
2924 * Delayed timer is cancelled and the pending work is queued for 2963 * Kill off a pending delayed_work. Returns %true if @dwork was pending
2925 * execution immediately. Other than timer handling, its behavior 2964 * and canceled; %false if it wasn't pending. Note that the work callback
2926 * is identical to flush_work_sync(). 2965 * function may still be running on return, unless it returns %true and the
2966 * work doesn't re-arm itself. Explicitly flush or use
2967 * cancel_delayed_work_sync() to wait on it.
2927 * 2968 *
2928 * RETURNS: 2969 * This function is safe to call from any context including IRQ handler.
2929 * %true if flush_work_sync() waited for the work to finish execution,
2930 * %false if it was already idle.
2931 */ 2970 */
2932bool flush_delayed_work_sync(struct delayed_work *dwork) 2971bool cancel_delayed_work(struct delayed_work *dwork)
2933{ 2972{
2934 if (del_timer_sync(&dwork->timer)) 2973 unsigned long flags;
2935 __queue_work(raw_smp_processor_id(), 2974 int ret;
2936 get_work_cwq(&dwork->work)->wq, &dwork->work); 2975
2937 return flush_work_sync(&dwork->work); 2976 do {
2977 ret = try_to_grab_pending(&dwork->work, true, &flags);
2978 } while (unlikely(ret == -EAGAIN));
2979
2980 if (unlikely(ret < 0))
2981 return false;
2982
2983 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
2984 local_irq_restore(flags);
2985 return true;
2938} 2986}
2939EXPORT_SYMBOL(flush_delayed_work_sync); 2987EXPORT_SYMBOL(cancel_delayed_work);
2940 2988
2941/** 2989/**
2942 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 2990 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
@@ -2949,54 +2997,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
2949 */ 2997 */
2950bool cancel_delayed_work_sync(struct delayed_work *dwork) 2998bool cancel_delayed_work_sync(struct delayed_work *dwork)
2951{ 2999{
2952 return __cancel_work_timer(&dwork->work, &dwork->timer); 3000 return __cancel_work_timer(&dwork->work, true);
2953} 3001}
2954EXPORT_SYMBOL(cancel_delayed_work_sync); 3002EXPORT_SYMBOL(cancel_delayed_work_sync);
2955 3003
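
A sketch contrasting the two cancel flavours: the new cancel_delayed_work() is callable from IRQ context and does not wait, while cancel_delayed_work_sync() sleeps until any running callback has finished. All names below are hypothetical.

        #include <linux/workqueue.h>

        static void timeout_fn(struct work_struct *work)
        {
                /* handle a request that never completed */
        }
        static DECLARE_DELAYED_WORK(timeout_work, timeout_fn);

        static void request_done_in_irq(void)   /* e.g. from the completion IRQ */
        {
                /* does not wait; timeout_fn may still be running right now */
                cancel_delayed_work(&timeout_work);
        }

        static void device_teardown(void)       /* sleepable context */
        {
                /* also waits for a running timeout_fn and blocks re-arming */
                cancel_delayed_work_sync(&timeout_work);
        }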
2956/** 3004/**
2957 * schedule_work - put work task in global workqueue
2958 * @work: job to be done
2959 *
2960 * Returns zero if @work was already on the kernel-global workqueue and
2961 * non-zero otherwise.
2962 *
2963 * This puts a job in the kernel-global workqueue if it was not already
2964 * queued and leaves it in the same position on the kernel-global
2965 * workqueue otherwise.
2966 */
2967int schedule_work(struct work_struct *work)
2968{
2969 return queue_work(system_wq, work);
2970}
2971EXPORT_SYMBOL(schedule_work);
2972
2973/*
2974 * schedule_work_on - put work task on a specific cpu 3005 * schedule_work_on - put work task on a specific cpu
2975 * @cpu: cpu to put the work task on 3006 * @cpu: cpu to put the work task on
2976 * @work: job to be done 3007 * @work: job to be done
2977 * 3008 *
2978 * This puts a job on a specific cpu 3009 * This puts a job on a specific cpu
2979 */ 3010 */
2980int schedule_work_on(int cpu, struct work_struct *work) 3011bool schedule_work_on(int cpu, struct work_struct *work)
2981{ 3012{
2982 return queue_work_on(cpu, system_wq, work); 3013 return queue_work_on(cpu, system_wq, work);
2983} 3014}
2984EXPORT_SYMBOL(schedule_work_on); 3015EXPORT_SYMBOL(schedule_work_on);
2985 3016
2986/** 3017/**
2987 * schedule_delayed_work - put work task in global workqueue after delay 3018 * schedule_work - put work task in global workqueue
2988 * @dwork: job to be done 3019 * @work: job to be done
2989 * @delay: number of jiffies to wait or 0 for immediate execution
2990 * 3020 *
2991 * After waiting for a given time this puts a job in the kernel-global 3021 * Returns %false if @work was already on the kernel-global workqueue and
2992 * workqueue. 3022 * %true otherwise.
3023 *
3024 * This puts a job in the kernel-global workqueue if it was not already
3025 * queued and leaves it in the same position on the kernel-global
3026 * workqueue otherwise.
2993 */ 3027 */
2994int schedule_delayed_work(struct delayed_work *dwork, 3028bool schedule_work(struct work_struct *work)
2995 unsigned long delay)
2996{ 3029{
2997 return queue_delayed_work(system_wq, dwork, delay); 3030 return queue_work(system_wq, work);
2998} 3031}
2999EXPORT_SYMBOL(schedule_delayed_work); 3032EXPORT_SYMBOL(schedule_work);
3000 3033
3001/** 3034/**
3002 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 3035 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
@@ -3007,14 +3040,28 @@ EXPORT_SYMBOL(schedule_delayed_work);
3007 * After waiting for a given time this puts a job in the kernel-global 3040 * After waiting for a given time this puts a job in the kernel-global
3008 * workqueue on the specified CPU. 3041 * workqueue on the specified CPU.
3009 */ 3042 */
3010int schedule_delayed_work_on(int cpu, 3043bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
3011 struct delayed_work *dwork, unsigned long delay) 3044 unsigned long delay)
3012{ 3045{
3013 return queue_delayed_work_on(cpu, system_wq, dwork, delay); 3046 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
3014} 3047}
3015EXPORT_SYMBOL(schedule_delayed_work_on); 3048EXPORT_SYMBOL(schedule_delayed_work_on);
3016 3049
3017/** 3050/**
3051 * schedule_delayed_work - put work task in global workqueue after delay
3052 * @dwork: job to be done
3053 * @delay: number of jiffies to wait or 0 for immediate execution
3054 *
3055 * After waiting for a given time this puts a job in the kernel-global
3056 * workqueue.
3057 */
3058bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
3059{
3060 return queue_delayed_work(system_wq, dwork, delay);
3061}
3062EXPORT_SYMBOL(schedule_delayed_work);
3063
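
The schedule_*() wrappers above are simply the system_wq shorthands; the classic use is deferring work from an interrupt handler to process context. A hypothetical sketch:

        #include <linux/interrupt.h>
        #include <linux/workqueue.h>

        static void bottom_half_fn(struct work_struct *work)
        {
                /* sleepable follow-up processing runs here */
        }
        static DECLARE_WORK(bottom_half_work, bottom_half_fn);

        static irqreturn_t demo_irq_handler(int irq, void *dev_id)
        {
                /* ack the hardware, then defer; same as queue_work(system_wq, ...) */
                schedule_work(&bottom_half_work);
                return IRQ_HANDLED;
        }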
3064/**
3018 * schedule_on_each_cpu - execute a function synchronously on each online CPU 3065 * schedule_on_each_cpu - execute a function synchronously on each online CPU
3019 * @func: the function to call 3066 * @func: the function to call
3020 * 3067 *
@@ -3161,9 +3208,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
         int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
 
         if (max_active < 1 || max_active > lim)
-                printk(KERN_WARNING "workqueue: max_active %d requested for %s "
-                       "is out of range, clamping between %d and %d\n",
-                       max_active, name, 1, lim);
+                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+                        max_active, name, 1, lim);
 
         return clamp_val(max_active, 1, lim);
 }
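Folding the split printk(KERN_WARNING ...) into one pr_warn() call keeps the warning text greppable as a single string. pr_warn() also honours a per-file pr_fmt prefix when one is defined before the includes; a small sketch under that assumption (the "mydrv: " prefix and wrapper function are made up, not part of workqueue.c):

#define pr_fmt(fmt) "mydrv: " fmt

#include <linux/printk.h>

/* illustrative helper only */
static void warn_out_of_range(int max_active, const char *name, int lim)
{
        /* prints e.g. "mydrv: max_active 600 requested for foo is out of range ..." */
        pr_warn("max_active %d requested for %s is out of range, clamping between %d and %d\n",
                max_active, name, 1, lim);
}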
@@ -3319,6 +3365,26 @@ void destroy_workqueue(struct workqueue_struct *wq)
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 
 /**
+ * cwq_set_max_active - adjust max_active of a cwq
+ * @cwq: target cpu_workqueue_struct
+ * @max_active: new max_active value.
+ *
+ * Set @cwq->max_active to @max_active and activate delayed works if
+ * increased.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+{
+        cwq->max_active = max_active;
+
+        while (!list_empty(&cwq->delayed_works) &&
+               cwq->nr_active < cwq->max_active)
+                cwq_activate_first_delayed(cwq);
+}
+
+/**
  * workqueue_set_max_active - adjust max_active of a workqueue
  * @wq: target workqueue
  * @max_active: new max_active value.
@@ -3345,7 +3411,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 
                 if (!(wq->flags & WQ_FREEZABLE) ||
                     !(gcwq->flags & GCWQ_FREEZING))
-                        get_cwq(gcwq->cpu, wq)->max_active = max_active;
+                        cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
 
                 spin_unlock_irq(&gcwq->lock);
         }
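cwq_set_max_active() factors out "raise the limit, then pull in any delayed work" so that workqueue_set_max_active() above and thaw_workqueues() further down share one implementation. The externally visible API is unchanged; a hedged sketch of how a driver might use it (the workqueue name, limits and error handling are illustrative):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;  /* hypothetical example queue */

static int my_setup(void)
{
        /* start conservatively: at most one active work item per CPU */
        my_wq = alloc_workqueue("my_wq", 0, 1);
        if (!my_wq)
                return -ENOMEM;

        /* later: allow more concurrency; items parked on the delayed list
         * are activated as soon as the limit is raised */
        workqueue_set_max_active(my_wq, 4);
        return 0;
}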
@@ -3440,23 +3506,23 @@ EXPORT_SYMBOL_GPL(work_busy);
  */
 
 /* claim manager positions of all pools */
-static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
+static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
 {
         struct worker_pool *pool;
 
         for_each_worker_pool(pool, gcwq)
-                mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+                mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
         spin_lock_irq(&gcwq->lock);
 }
 
 /* release manager positions */
-static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
 {
         struct worker_pool *pool;
 
         spin_unlock_irq(&gcwq->lock);
         for_each_worker_pool(pool, gcwq)
-                mutex_unlock(&pool->manager_mutex);
+                mutex_unlock(&pool->assoc_mutex);
 }
 
 static void gcwq_unbind_fn(struct work_struct *work)
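The manager_mutex to assoc_mutex rename reflects what the lock now protects: the association between workers and their gcwq across CPU hotplug, not just manager election. Note the mutex_lock_nested() call, which gives each per-pool mutex of a gcwq its own lockdep subclass so that taking them all in a fixed order does not trigger a false recursive-locking report. A generic sketch of that pattern, with invented names rather than the workqueue structures:

#include <linux/mutex.h>

#define NR_POOLS        2       /* illustrative pool count */

struct demo_pool {
        struct mutex lock;
};

static struct demo_pool pools[NR_POOLS];

static void claim_all_pools(void)
{
        int i;

        /* same lock class, distinct subclass per slot, fixed ascending order */
        for (i = 0; i < NR_POOLS; i++)
                mutex_lock_nested(&pools[i].lock, i);
}

static void release_all_pools(void)
{
        int i;

        for (i = NR_POOLS - 1; i >= 0; i--)
                mutex_unlock(&pools[i].lock);
}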
@@ -3469,7 +3535,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
         BUG_ON(gcwq->cpu != smp_processor_id());
 
-        gcwq_claim_management_and_lock(gcwq);
+        gcwq_claim_assoc_and_lock(gcwq);
 
         /*
          * We've claimed all manager positions.  Make all workers unbound
@@ -3486,7 +3552,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
         gcwq->flags |= GCWQ_DISASSOCIATED;
 
-        gcwq_release_management_and_unlock(gcwq);
+        gcwq_release_assoc_and_unlock(gcwq);
 
         /*
          * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3514,7 +3580,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
  * Workqueues should be brought up before normal priority CPU notifiers.
  * This will be registered high priority CPU notifier.
  */
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                                                unsigned long action,
                                                void *hcpu)
 {
@@ -3542,10 +3608,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 
         case CPU_DOWN_FAILED:
         case CPU_ONLINE:
-                gcwq_claim_management_and_lock(gcwq);
+                gcwq_claim_assoc_and_lock(gcwq);
                 gcwq->flags &= ~GCWQ_DISASSOCIATED;
                 rebind_workers(gcwq);
-                gcwq_release_management_and_unlock(gcwq);
+                gcwq_release_assoc_and_unlock(gcwq);
                 break;
         }
         return NOTIFY_OK;
@@ -3555,7 +3621,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
  * Workqueues should be brought down after normal priority CPU notifiers.
  * This will be registered as low priority CPU notifier.
  */
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
                                                  unsigned long action,
                                                  void *hcpu)
 {
@@ -3566,7 +3632,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
         case CPU_DOWN_PREPARE:
                 /* unbinding should happen on the local CPU */
                 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-                schedule_work_on(cpu, &unbind_work);
+                queue_work_on(cpu, system_highpri_wq, &unbind_work);
                 flush_work(&unbind_work);
                 break;
         }
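Queueing the unbind work on system_highpri_wq rather than via schedule_work_on() keeps CPU-down handling from waiting behind ordinary per-cpu work. The on-stack work + queue_work_on() + flush_work() combination is the usual way to run a function synchronously on a chosen CPU; an illustrative sketch (the probe function and helper are hypothetical, not from this patch):

#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

/* hypothetical example function */
static void probe_fn(struct work_struct *work)
{
        pr_info("running on cpu %d\n", smp_processor_id());
}

static void run_probe_on(int cpu)
{
        struct work_struct w;

        INIT_WORK_ONSTACK(&w, probe_fn);
        queue_work_on(cpu, system_highpri_wq, &w);
        flush_work(&w);                 /* wait for probe_fn() to finish */
        destroy_work_on_stack(&w);
}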
@@ -3735,11 +3801,7 @@ void thaw_workqueues(void)
                                 continue;
 
                         /* restore max_active and repopulate worklist */
-                        cwq->max_active = wq->saved_max_active;
-
-                        while (!list_empty(&cwq->delayed_works) &&
-                               cwq->nr_active < cwq->max_active)
-                                cwq_activate_first_delayed(cwq);
+                        cwq_set_max_active(cwq, wq->saved_max_active);
                 }
 
                 for_each_worker_pool(pool, gcwq)
@@ -3759,8 +3821,12 @@ static int __init init_workqueues(void)
         unsigned int cpu;
         int i;
 
+        /* make sure we have enough bits for OFFQ CPU number */
+        BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+                     WORK_CPU_LAST);
+
         cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-        cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+        hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
         /* initialize gcwqs */
         for_each_gcwq_cpu(cpu) {
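The new BUILD_BUG_ON() turns the assumption "the bits above WORK_OFFQ_CPU_SHIFT can hold any CPU number" into a compile-time failure instead of silent truncation at runtime. The same construct is useful wherever a size or layout invariant must hold; a small generic sketch (the flag/index split and the 24-bit limit are invented for illustration):

#include <linux/bitops.h>
#include <linux/bug.h>

#define ID_FLAG_BITS    8       /* hypothetical: low 8 bits carry flags */

static inline unsigned long make_id(unsigned long index)
{
        /* refuse to compile if the index field would be narrower than 24 bits */
        BUILD_BUG_ON(BITS_PER_LONG - ID_FLAG_BITS < 24);

        return index << ID_FLAG_BITS;
}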
@@ -3786,11 +3852,9 @@ static int __init init_workqueues(void)
                         setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
                                     (unsigned long)pool);
 
-                        mutex_init(&pool->manager_mutex);
+                        mutex_init(&pool->assoc_mutex);
                         ida_init(&pool->worker_ida);
                 }
-
-                init_waitqueue_head(&gcwq->rebind_hold);
         }
 
         /* create the initial worker */
@@ -3813,17 +3877,14 @@ static int __init init_workqueues(void)
         }
 
         system_wq = alloc_workqueue("events", 0, 0);
+        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
         system_long_wq = alloc_workqueue("events_long", 0, 0);
-        system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
         system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                             WQ_UNBOUND_MAX_ACTIVE);
         system_freezable_wq = alloc_workqueue("events_freezable",
                                               WQ_FREEZABLE, 0);
-        system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
-                                                  WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
-        BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
-               !system_unbound_wq || !system_freezable_wq ||
-               !system_nrt_freezable_wq);
+        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+               !system_unbound_wq || !system_freezable_wq);
         return 0;
 }
 early_initcall(init_workqueues);
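With this pass init_workqueues() gains system_highpri_wq and drops the separate non-reentrant system workqueues, and the BUG_ON() makes boot fail loudly if any of the remaining system workqueues cannot be allocated. Latency-sensitive work can target system_highpri_wq directly; a hedged sketch (the work item and handler are illustrative, not from this patch):

#include <linux/workqueue.h>

/* hypothetical handler */
static void urgent_fn(struct work_struct *work)
{
        /* serviced by a high-priority kworker ahead of normal work */
}

static DECLARE_WORK(urgent_work, urgent_fn);

static void kick_urgent(void)
{
        /* still bound to the local CPU, but handled by the elevated-nice pool */
        queue_work(system_highpri_wq, &urgent_work);
}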