aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2012-01-19 15:56:50 -0500
committerH. Peter Anvin <hpa@linux.intel.com>2012-01-19 15:56:50 -0500
commit282f445a779ed76fca9884fe377bf56a3088b208 (patch)
treed9abcf526baee0100672851e0a8894c19e762a39 /kernel
parent68f30fbee19cc67849b9fa8e153ede70758afe81 (diff)
parent90a4c0f51e8e44111a926be6f4c87af3938a79c3 (diff)
Merge remote-tracking branch 'linus/master' into x86/urgent
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c42
-rw-r--r--kernel/async.c2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/audit.h6
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c751
-rw-r--r--kernel/capability.c80
-rw-r--r--kernel/cgroup.c423
-rw-r--r--kernel/cgroup_freezer.c77
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/cpuset.c105
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/events/core.c15
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c15
-rw-r--r--kernel/fork.c26
-rw-r--r--kernel/freezer.c203
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdomain.c15
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/jump_label.c2
-rw-r--r--kernel/kexec.c29
-rw-r--r--kernel/kmod.c27
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c27
-rw-r--r--kernel/module.c205
-rw-r--r--kernel/panic.c26
-rw-r--r--kernel/params.c38
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/power/hibernate.c92
-rw-r--r--kernel/power/main.c10
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c77
-rw-r--r--kernel/power/snapshot.c6
-rw-r--r--kernel/power/suspend.c12
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/user.c184
-rw-r--r--kernel/printk.c10
-rw-r--r--kernel/ptrace.c14
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/rtmutex-tester.c37
-rw-r--r--kernel/sched/core.c82
-rw-r--r--kernel/sched/fair.c10
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c74
-rw-r--r--kernel/sys.c121
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/clocksource.c37
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c715
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_filter.c283
-rw-r--r--kernel/trace/trace_stack.c30
-rw-r--r--kernel/workqueue.c32
61 files changed, 2466 insertions, 1599 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f70396e5a24b..2d9de86b7e76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg
23endif 23endif
24 24
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/
26 27
27obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 29obj-$(CONFIG_PROFILING) += profile.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
52obj-$(CONFIG_UID16) += uid16.o 53obj-$(CONFIG_UID16) += uid16.o
53obj-$(CONFIG_MODULES) += module.o 54obj-$(CONFIG_MODULES) += module.o
54obj-$(CONFIG_KALLSYMS) += kallsyms.o 55obj-$(CONFIG_KALLSYMS) += kallsyms.o
55obj-$(CONFIG_PM) += power/
56obj-$(CONFIG_FREEZER) += power/
57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 56obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
58obj-$(CONFIG_KEXEC) += kexec.o 57obj-$(CONFIG_KEXEC) += kexec.o
59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 203dfead2e06..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 volatile int active; 87 int active;
88 volatile int needcheck; 88 unsigned long needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
92 struct list_head list; 91 struct list_head list;
93}; 92};
94 93
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
96static LIST_HEAD(acct_list); 95static LIST_HEAD(acct_list);
97 96
98/* 97/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
108 * Check the amount of free space and suspend/resume accordingly. 98 * Check the amount of free space and suspend/resume accordingly.
109 */ 99 */
110static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 100static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
112 struct kstatfs sbuf; 102 struct kstatfs sbuf;
113 int res; 103 int res;
114 int act; 104 int act;
115 sector_t resume; 105 u64 resume;
116 sector_t suspend; 106 u64 suspend;
117 107
118 spin_lock(&acct_lock); 108 spin_lock(&acct_lock);
119 res = acct->active; 109 res = acct->active;
120 if (!file || !acct->needcheck) 110 if (!file || time_is_before_jiffies(acct->needcheck))
121 goto out; 111 goto out;
122 spin_unlock(&acct_lock); 112 spin_unlock(&acct_lock);
123 113
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
127 suspend = sbuf.f_blocks * SUSPEND; 117 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 118 resume = sbuf.f_blocks * RESUME;
129 119
130 sector_div(suspend, 100); 120 do_div(suspend, 100);
131 sector_div(resume, 100); 121 do_div(resume, 100);
132 122
133 if (sbuf.f_bavail <= suspend) 123 if (sbuf.f_bavail <= suspend)
134 act = -1; 124 act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
160 } 150 }
161 } 151 }
162 152
163 del_timer(&acct->timer); 153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
167 res = acct->active; 154 res = acct->active;
168out: 155out:
169 spin_unlock(&acct_lock); 156 spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
185 if (acct->file) { 172 if (acct->file) {
186 old_acct = acct->file; 173 old_acct = acct->file;
187 old_ns = acct->ns; 174 old_ns = acct->ns;
188 del_timer(&acct->timer);
189 acct->active = 0; 175 acct->active = 0;
190 acct->needcheck = 0;
191 acct->file = NULL; 176 acct->file = NULL;
192 acct->ns = NULL; 177 acct->ns = NULL;
193 list_del(&acct->list); 178 list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
195 if (file) { 180 if (file) {
196 acct->file = file; 181 acct->file = file;
197 acct->ns = ns; 182 acct->ns = ns;
198 acct->needcheck = 0; 183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
199 acct->active = 1; 184 acct->active = 1;
200 list_add(&acct->list, &acct_list); 185 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
205 } 186 }
206 if (old_acct) { 187 if (old_acct) {
207 mnt_unpin(old_acct->f_path.mnt); 188 mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
334 spin_lock(&acct_lock); 315 spin_lock(&acct_lock);
335restart: 316restart:
336 list_for_each_entry(acct, &acct_list, list) 317 list_for_each_entry(acct, &acct_list, list)
337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { 318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
338 acct_file_reopen(acct, NULL, NULL); 319 acct_file_reopen(acct, NULL, NULL);
339 goto restart; 320 goto restart;
340 } 321 }
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
348 if (acct == NULL) 329 if (acct == NULL)
349 return; 330 return;
350 331
351 del_timer_sync(&acct->timer);
352 spin_lock(&acct_lock); 332 spin_lock(&acct_lock);
353 if (acct->file != NULL) 333 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL); 334 acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
498 * Fill the accounting struct with the needed info as recorded 478 * Fill the accounting struct with the needed info as recorded
499 * by the different kernel functions. 479 * by the different kernel functions.
500 */ 480 */
501 memset((caddr_t)&ac, 0, sizeof(acct_t)); 481 memset(&ac, 0, sizeof(acct_t));
502 482
503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fefe..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
78 78
79static atomic_t entry_count; 79static atomic_t entry_count;
80 80
81extern int initcall_debug;
82
83 81
84/* 82/*
85 * MUST be called with the lock held! 83 * MUST be called with the lock held!
diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a45..bb0eb5bb9a0a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
601 case AUDIT_TTY_SET: 601 case AUDIT_TTY_SET:
602 case AUDIT_TRIM: 602 case AUDIT_TRIM:
603 case AUDIT_MAKE_EQUIV: 603 case AUDIT_MAKE_EQUIV:
604 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 604 if (!capable(CAP_AUDIT_CONTROL))
605 err = -EPERM; 605 err = -EPERM;
606 break; 606 break;
607 case AUDIT_USER: 607 case AUDIT_USER:
608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
610 if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) 610 if (!capable(CAP_AUDIT_WRITE))
611 err = -EPERM; 611 err = -EPERM;
612 break; 612 break;
613 default: /* bad msg */ 613 default: /* bad msg */
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
631 } 631 }
632 632
633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
634 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", 634 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
635 pid, uid, auid, ses); 635 pid, uid, auid, ses);
636 if (sid) { 636 if (sid) {
637 rc = security_secid_to_secctx(sid, &ctx, &len); 637 rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1260 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1262 if (!avail) 1262 if (!avail)
1263 goto out; 1263 goto out_va_end;
1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1265 } 1265 }
1266 va_end(args2);
1267 if (len > 0) 1266 if (len > 0)
1268 skb_put(skb, len); 1267 skb_put(skb, len);
1268out_va_end:
1269 va_end(args2);
1269out: 1270out:
1270 return; 1271 return;
1271} 1272}
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1422 char *p, *pathname; 1423 char *p, *pathname;
1423 1424
1424 if (prefix) 1425 if (prefix)
1425 audit_log_format(ab, " %s", prefix); 1426 audit_log_format(ab, "%s", prefix);
1426 1427
1427 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1428 /* We will allow 11 spaces for ' (deleted)' to be appended */
1428 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1429 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2c..816766803371 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context. 36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can 37 * No syscall-specific audit records can
38 * be generated. */ 38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, 39 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall 40 * and fill it in at syscall
45 * entry time. This makes a full 41 * entry time. This makes a full
46 * syscall record available if some 42 * syscall record available if some
47 * other part of the kernel decides it 43 * other part of the kernel decides it
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d678..a6c3f1abd206 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
235 switch(listnr) { 235 switch(listnr) {
236 default: 236 default:
237 goto exit_err; 237 goto exit_err;
238 case AUDIT_FILTER_USER:
239 case AUDIT_FILTER_TYPE:
240#ifdef CONFIG_AUDITSYSCALL 238#ifdef CONFIG_AUDITSYSCALL
241 case AUDIT_FILTER_ENTRY: 239 case AUDIT_FILTER_ENTRY:
240 if (rule->action == AUDIT_ALWAYS)
241 goto exit_err;
242 case AUDIT_FILTER_EXIT: 242 case AUDIT_FILTER_EXIT:
243 case AUDIT_FILTER_TASK: 243 case AUDIT_FILTER_TASK:
244#endif 244#endif
245 case AUDIT_FILTER_USER:
246 case AUDIT_FILTER_TYPE:
245 ; 247 ;
246 } 248 }
247 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 249 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
385 goto exit_free; 387 goto exit_free;
386 break; 388 break;
387 case AUDIT_FILETYPE: 389 case AUDIT_FILETYPE:
388 if ((f->val & ~S_IFMT) > S_IFMT) 390 if (f->val & ~S_IFMT)
389 goto exit_free; 391 goto exit_free;
390 break; 392 break;
391 case AUDIT_INODE: 393 case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
459 case AUDIT_ARG1: 461 case AUDIT_ARG1:
460 case AUDIT_ARG2: 462 case AUDIT_ARG2:
461 case AUDIT_ARG3: 463 case AUDIT_ARG3:
464 case AUDIT_OBJ_UID:
465 case AUDIT_OBJ_GID:
462 break; 466 break;
463 case AUDIT_ARCH: 467 case AUDIT_ARCH:
464 entry->rule.arch_f = f; 468 entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
522 goto exit_free; 526 goto exit_free;
523 break; 527 break;
524 case AUDIT_FILTERKEY: 528 case AUDIT_FILTERKEY:
525 err = -EINVAL;
526 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) 529 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
527 goto exit_free; 530 goto exit_free;
528 str = audit_unpack_string(&bufp, &remain, f->val); 531 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
536 goto exit_free; 539 goto exit_free;
537 break; 540 break;
538 case AUDIT_FILETYPE: 541 case AUDIT_FILETYPE:
539 if ((f->val & ~S_IFMT) > S_IFMT) 542 if (f->val & ~S_IFMT)
543 goto exit_free;
544 break;
545 case AUDIT_FIELD_COMPARE:
546 if (f->val > AUDIT_MAX_FIELD_COMPARE)
540 goto exit_free; 547 goto exit_free;
541 break; 548 break;
542 default: 549 default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47b7fc1ea893..caaea6e944f8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,9 +70,15 @@
70 70
71#include "audit.h" 71#include "audit.h"
72 72
73/* flags stating the success for a syscall */
74#define AUDITSC_INVALID 0
75#define AUDITSC_SUCCESS 1
76#define AUDITSC_FAILURE 2
77
73/* AUDIT_NAMES is the number of slots we reserve in the audit_context 78/* AUDIT_NAMES is the number of slots we reserve in the audit_context
74 * for saving names from getname(). */ 79 * for saving names from getname(). If we get more names we will allocate
75#define AUDIT_NAMES 20 80 * a name dynamically and also add those to the list anchored by names_list. */
81#define AUDIT_NAMES 5
76 82
77/* Indicates that audit should log the full pathname. */ 83/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1 84#define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
101 * 107 *
102 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 108 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
103struct audit_names { 109struct audit_names {
110 struct list_head list; /* audit_context->names_list */
104 const char *name; 111 const char *name;
105 int name_len; /* number of name's characters to log */
106 unsigned name_put; /* call __putname() for this name */
107 unsigned long ino; 112 unsigned long ino;
108 dev_t dev; 113 dev_t dev;
109 umode_t mode; 114 umode_t mode;
@@ -113,6 +118,14 @@ struct audit_names {
113 u32 osid; 118 u32 osid;
114 struct audit_cap_data fcap; 119 struct audit_cap_data fcap;
115 unsigned int fcap_ver; 120 unsigned int fcap_ver;
121 int name_len; /* number of name's characters to log */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
116}; 129};
117 130
118struct audit_aux_data { 131struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
174 long return_code;/* syscall return code */ 187 long return_code;/* syscall return code */
175 u64 prio; 188 u64 prio;
176 int return_valid; /* return code is valid */ 189 int return_valid; /* return code is valid */
177 int name_count; 190 /*
178 struct audit_names names[AUDIT_NAMES]; 191 * The names_list is the list of all audit_names collected during this
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
179 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
180 struct path pwd; 202 struct path pwd;
181 struct audit_context *previous; /* For nested syscalls */ 203 struct audit_context *previous; /* For nested syscalls */
@@ -210,12 +232,12 @@ struct audit_context {
210 struct { 232 struct {
211 uid_t uid; 233 uid_t uid;
212 gid_t gid; 234 gid_t gid;
213 mode_t mode; 235 umode_t mode;
214 u32 osid; 236 u32 osid;
215 int has_perm; 237 int has_perm;
216 uid_t perm_uid; 238 uid_t perm_uid;
217 gid_t perm_gid; 239 gid_t perm_gid;
218 mode_t perm_mode; 240 umode_t perm_mode;
219 unsigned long qbytes; 241 unsigned long qbytes;
220 } ipc; 242 } ipc;
221 struct { 243 struct {
@@ -234,7 +256,7 @@ struct audit_context {
234 } mq_sendrecv; 256 } mq_sendrecv;
235 struct { 257 struct {
236 int oflag; 258 int oflag;
237 mode_t mode; 259 umode_t mode;
238 struct mq_attr attr; 260 struct mq_attr attr;
239 } mq_open; 261 } mq_open;
240 struct { 262 struct {
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
305 } 327 }
306} 328}
307 329
308static int audit_match_filetype(struct audit_context *ctx, int which) 330static int audit_match_filetype(struct audit_context *ctx, int val)
309{ 331{
310 unsigned index = which & ~S_IFMT; 332 struct audit_names *n;
311 mode_t mode = which & S_IFMT; 333 umode_t mode = (umode_t)val;
312 334
313 if (unlikely(!ctx)) 335 if (unlikely(!ctx))
314 return 0; 336 return 0;
315 337
316 if (index >= ctx->name_count) 338 list_for_each_entry(n, &ctx->names_list, list) {
317 return 0; 339 if ((n->ino != -1) &&
318 if (ctx->names[index].ino == -1) 340 ((n->mode & S_IFMT) == mode))
319 return 0; 341 return 1;
320 if ((ctx->names[index].mode ^ mode) & S_IFMT) 342 }
321 return 0; 343
322 return 1; 344 return 0;
323} 345}
324 346
325/* 347/*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
441 return 0; 463 return 0;
442} 464}
443 465
466static int audit_compare_id(uid_t uid1,
467 struct audit_names *name,
468 unsigned long name_offset,
469 struct audit_field *f,
470 struct audit_context *ctx)
471{
472 struct audit_names *n;
473 unsigned long addr;
474 uid_t uid2;
475 int rc;
476
477 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
478
479 if (name) {
480 addr = (unsigned long)name;
481 addr += name_offset;
482
483 uid2 = *(uid_t *)addr;
484 rc = audit_comparator(uid1, f->op, uid2);
485 if (rc)
486 return rc;
487 }
488
489 if (ctx) {
490 list_for_each_entry(n, &ctx->names_list, list) {
491 addr = (unsigned long)n;
492 addr += name_offset;
493
494 uid2 = *(uid_t *)addr;
495
496 rc = audit_comparator(uid1, f->op, uid2);
497 if (rc)
498 return rc;
499 }
500 }
501 return 0;
502}
503
504static int audit_field_compare(struct task_struct *tsk,
505 const struct cred *cred,
506 struct audit_field *f,
507 struct audit_context *ctx,
508 struct audit_names *name)
509{
510 switch (f->val) {
511 /* process to file object comparisons */
512 case AUDIT_COMPARE_UID_TO_OBJ_UID:
513 return audit_compare_id(cred->uid,
514 name, offsetof(struct audit_names, uid),
515 f, ctx);
516 case AUDIT_COMPARE_GID_TO_OBJ_GID:
517 return audit_compare_id(cred->gid,
518 name, offsetof(struct audit_names, gid),
519 f, ctx);
520 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
521 return audit_compare_id(cred->euid,
522 name, offsetof(struct audit_names, uid),
523 f, ctx);
524 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
525 return audit_compare_id(cred->egid,
526 name, offsetof(struct audit_names, gid),
527 f, ctx);
528 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
529 return audit_compare_id(tsk->loginuid,
530 name, offsetof(struct audit_names, uid),
531 f, ctx);
532 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
533 return audit_compare_id(cred->suid,
534 name, offsetof(struct audit_names, uid),
535 f, ctx);
536 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
537 return audit_compare_id(cred->sgid,
538 name, offsetof(struct audit_names, gid),
539 f, ctx);
540 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
541 return audit_compare_id(cred->fsuid,
542 name, offsetof(struct audit_names, uid),
543 f, ctx);
544 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
545 return audit_compare_id(cred->fsgid,
546 name, offsetof(struct audit_names, gid),
547 f, ctx);
548 /* uid comparisons */
549 case AUDIT_COMPARE_UID_TO_AUID:
550 return audit_comparator(cred->uid, f->op, tsk->loginuid);
551 case AUDIT_COMPARE_UID_TO_EUID:
552 return audit_comparator(cred->uid, f->op, cred->euid);
553 case AUDIT_COMPARE_UID_TO_SUID:
554 return audit_comparator(cred->uid, f->op, cred->suid);
555 case AUDIT_COMPARE_UID_TO_FSUID:
556 return audit_comparator(cred->uid, f->op, cred->fsuid);
557 /* auid comparisons */
558 case AUDIT_COMPARE_AUID_TO_EUID:
559 return audit_comparator(tsk->loginuid, f->op, cred->euid);
560 case AUDIT_COMPARE_AUID_TO_SUID:
561 return audit_comparator(tsk->loginuid, f->op, cred->suid);
562 case AUDIT_COMPARE_AUID_TO_FSUID:
563 return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
564 /* euid comparisons */
565 case AUDIT_COMPARE_EUID_TO_SUID:
566 return audit_comparator(cred->euid, f->op, cred->suid);
567 case AUDIT_COMPARE_EUID_TO_FSUID:
568 return audit_comparator(cred->euid, f->op, cred->fsuid);
569 /* suid comparisons */
570 case AUDIT_COMPARE_SUID_TO_FSUID:
571 return audit_comparator(cred->suid, f->op, cred->fsuid);
572 /* gid comparisons */
573 case AUDIT_COMPARE_GID_TO_EGID:
574 return audit_comparator(cred->gid, f->op, cred->egid);
575 case AUDIT_COMPARE_GID_TO_SGID:
576 return audit_comparator(cred->gid, f->op, cred->sgid);
577 case AUDIT_COMPARE_GID_TO_FSGID:
578 return audit_comparator(cred->gid, f->op, cred->fsgid);
579 /* egid comparisons */
580 case AUDIT_COMPARE_EGID_TO_SGID:
581 return audit_comparator(cred->egid, f->op, cred->sgid);
582 case AUDIT_COMPARE_EGID_TO_FSGID:
583 return audit_comparator(cred->egid, f->op, cred->fsgid);
584 /* sgid comparison */
585 case AUDIT_COMPARE_SGID_TO_FSGID:
586 return audit_comparator(cred->sgid, f->op, cred->fsgid);
587 default:
588 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
589 return 0;
590 }
591 return 0;
592}
593
444/* Determine if any context name data matches a rule's watch data */ 594/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 595/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. 596 * otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
457 bool task_creation) 607 bool task_creation)
458{ 608{
459 const struct cred *cred; 609 const struct cred *cred;
460 int i, j, need_sid = 1; 610 int i, need_sid = 1;
461 u32 sid; 611 u32 sid;
462 612
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); 613 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464 614
465 for (i = 0; i < rule->field_count; i++) { 615 for (i = 0; i < rule->field_count; i++) {
466 struct audit_field *f = &rule->fields[i]; 616 struct audit_field *f = &rule->fields[i];
617 struct audit_names *n;
467 int result = 0; 618 int result = 0;
468 619
469 switch (f->type) { 620 switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
522 } 673 }
523 break; 674 break;
524 case AUDIT_DEVMAJOR: 675 case AUDIT_DEVMAJOR:
525 if (name) 676 if (name) {
526 result = audit_comparator(MAJOR(name->dev), 677 if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
527 f->op, f->val); 678 audit_comparator(MAJOR(name->rdev), f->op, f->val))
528 else if (ctx) { 679 ++result;
529 for (j = 0; j < ctx->name_count; j++) { 680 } else if (ctx) {
530 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 681 list_for_each_entry(n, &ctx->names_list, list) {
682 if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
683 audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
531 ++result; 684 ++result;
532 break; 685 break;
533 } 686 }
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
535 } 688 }
536 break; 689 break;
537 case AUDIT_DEVMINOR: 690 case AUDIT_DEVMINOR:
538 if (name) 691 if (name) {
539 result = audit_comparator(MINOR(name->dev), 692 if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
540 f->op, f->val); 693 audit_comparator(MINOR(name->rdev), f->op, f->val))
541 else if (ctx) { 694 ++result;
542 for (j = 0; j < ctx->name_count; j++) { 695 } else if (ctx) {
543 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 696 list_for_each_entry(n, &ctx->names_list, list) {
697 if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
698 audit_comparator(MINOR(n->rdev), f->op, f->val)) {
544 ++result; 699 ++result;
545 break; 700 break;
546 } 701 }
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
551 if (name) 706 if (name)
552 result = (name->ino == f->val); 707 result = (name->ino == f->val);
553 else if (ctx) { 708 else if (ctx) {
554 for (j = 0; j < ctx->name_count; j++) { 709 list_for_each_entry(n, &ctx->names_list, list) {
555 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { 710 if (audit_comparator(n->ino, f->op, f->val)) {
711 ++result;
712 break;
713 }
714 }
715 }
716 break;
717 case AUDIT_OBJ_UID:
718 if (name) {
719 result = audit_comparator(name->uid, f->op, f->val);
720 } else if (ctx) {
721 list_for_each_entry(n, &ctx->names_list, list) {
722 if (audit_comparator(n->uid, f->op, f->val)) {
723 ++result;
724 break;
725 }
726 }
727 }
728 break;
729 case AUDIT_OBJ_GID:
730 if (name) {
731 result = audit_comparator(name->gid, f->op, f->val);
732 } else if (ctx) {
733 list_for_each_entry(n, &ctx->names_list, list) {
734 if (audit_comparator(n->gid, f->op, f->val)) {
556 ++result; 735 ++result;
557 break; 736 break;
558 } 737 }
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
607 name->osid, f->type, f->op, 786 name->osid, f->type, f->op,
608 f->lsm_rule, ctx); 787 f->lsm_rule, ctx);
609 } else if (ctx) { 788 } else if (ctx) {
610 for (j = 0; j < ctx->name_count; j++) { 789 list_for_each_entry(n, &ctx->names_list, list) {
611 if (security_audit_rule_match( 790 if (security_audit_rule_match(n->osid, f->type,
612 ctx->names[j].osid, 791 f->op, f->lsm_rule,
613 f->type, f->op, 792 ctx)) {
614 f->lsm_rule, ctx)) {
615 ++result; 793 ++result;
616 break; 794 break;
617 } 795 }
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
643 case AUDIT_FILETYPE: 821 case AUDIT_FILETYPE:
644 result = audit_match_filetype(ctx, f->val); 822 result = audit_match_filetype(ctx, f->val);
645 break; 823 break;
824 case AUDIT_FIELD_COMPARE:
825 result = audit_field_compare(tsk, cred, f, ctx, name);
826 break;
646 } 827 }
647
648 if (!result) 828 if (!result)
649 return 0; 829 return 0;
650 } 830 }
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
722 return AUDIT_BUILD_CONTEXT; 902 return AUDIT_BUILD_CONTEXT;
723} 903}
724 904
725/* At syscall exit time, this filter is called if any audit_names[] have been 905/*
906 * Given an audit_name check the inode hash table to see if they match.
907 * Called holding the rcu read lock to protect the use of audit_inode_hash
908 */
909static int audit_filter_inode_name(struct task_struct *tsk,
910 struct audit_names *n,
911 struct audit_context *ctx) {
912 int word, bit;
913 int h = audit_hash_ino((u32)n->ino);
914 struct list_head *list = &audit_inode_hash[h];
915 struct audit_entry *e;
916 enum audit_state state;
917
918 word = AUDIT_WORD(ctx->major);
919 bit = AUDIT_BIT(ctx->major);
920
921 if (list_empty(list))
922 return 0;
923
924 list_for_each_entry_rcu(e, list, list) {
925 if ((e->rule.mask[word] & bit) == bit &&
926 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
927 ctx->current_state = state;
928 return 1;
929 }
930 }
931
932 return 0;
933}
934
935/* At syscall exit time, this filter is called if any audit_names have been
726 * collected during syscall processing. We only check rules in sublists at hash 936 * collected during syscall processing. We only check rules in sublists at hash
727 * buckets applicable to the inode numbers in audit_names[]. 937 * buckets applicable to the inode numbers in audit_names.
728 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 938 * Regarding audit_state, same rules apply as for audit_filter_syscall().
729 */ 939 */
730void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) 940void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
731{ 941{
732 int i; 942 struct audit_names *n;
733 struct audit_entry *e;
734 enum audit_state state;
735 943
736 if (audit_pid && tsk->tgid == audit_pid) 944 if (audit_pid && tsk->tgid == audit_pid)
737 return; 945 return;
738 946
739 rcu_read_lock(); 947 rcu_read_lock();
740 for (i = 0; i < ctx->name_count; i++) {
741 int word = AUDIT_WORD(ctx->major);
742 int bit = AUDIT_BIT(ctx->major);
743 struct audit_names *n = &ctx->names[i];
744 int h = audit_hash_ino((u32)n->ino);
745 struct list_head *list = &audit_inode_hash[h];
746
747 if (list_empty(list))
748 continue;
749 948
750 list_for_each_entry_rcu(e, list, list) { 949 list_for_each_entry(n, &ctx->names_list, list) {
751 if ((e->rule.mask[word] & bit) == bit && 950 if (audit_filter_inode_name(tsk, n, ctx))
752 audit_filter_rules(tsk, &e->rule, ctx, n, 951 break;
753 &state, false)) {
754 rcu_read_unlock();
755 ctx->current_state = state;
756 return;
757 }
758 }
759 } 952 }
760 rcu_read_unlock(); 953 rcu_read_unlock();
761} 954}
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
766{ 959{
767 struct audit_context *context = tsk->audit_context; 960 struct audit_context *context = tsk->audit_context;
768 961
769 if (likely(!context)) 962 if (!context)
770 return NULL; 963 return NULL;
771 context->return_valid = return_valid; 964 context->return_valid = return_valid;
772 965
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
799 992
800static inline void audit_free_names(struct audit_context *context) 993static inline void audit_free_names(struct audit_context *context)
801{ 994{
802 int i; 995 struct audit_names *n, *next;
803 996
804#if AUDIT_DEBUG == 2 997#if AUDIT_DEBUG == 2
805 if (context->put_count + context->ino_count != context->name_count) { 998 if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
810 context->serial, context->major, context->in_syscall, 1003 context->serial, context->major, context->in_syscall,
811 context->name_count, context->put_count, 1004 context->name_count, context->put_count,
812 context->ino_count); 1005 context->ino_count);
813 for (i = 0; i < context->name_count; i++) { 1006 list_for_each_entry(n, &context->names_list, list) {
814 printk(KERN_ERR "names[%d] = %p = %s\n", i, 1007 printk(KERN_ERR "names[%d] = %p = %s\n", i,
815 context->names[i].name, 1008 n->name, n->name ?: "(null)");
816 context->names[i].name ?: "(null)");
817 } 1009 }
818 dump_stack(); 1010 dump_stack();
819 return; 1011 return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
824 context->ino_count = 0; 1016 context->ino_count = 0;
825#endif 1017#endif
826 1018
827 for (i = 0; i < context->name_count; i++) { 1019 list_for_each_entry_safe(n, next, &context->names_list, list) {
828 if (context->names[i].name && context->names[i].name_put) 1020 list_del(&n->list);
829 __putname(context->names[i].name); 1021 if (n->name && n->name_put)
1022 __putname(n->name);
1023 if (n->should_free)
1024 kfree(n);
830 } 1025 }
831 context->name_count = 0; 1026 context->name_count = 0;
832 path_put(&context->pwd); 1027 path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
864 return NULL; 1059 return NULL;
865 audit_zero_context(context, state); 1060 audit_zero_context(context, state);
866 INIT_LIST_HEAD(&context->killed_trees); 1061 INIT_LIST_HEAD(&context->killed_trees);
1062 INIT_LIST_HEAD(&context->names_list);
867 return context; 1063 return context;
868} 1064}
869 1065
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
886 return 0; /* Return if not auditing. */ 1082 return 0; /* Return if not auditing. */
887 1083
888 state = audit_filter_task(tsk, &key); 1084 state = audit_filter_task(tsk, &key);
889 if (likely(state == AUDIT_DISABLED)) 1085 if (state == AUDIT_DISABLED)
890 return 0; 1086 return 0;
891 1087
892 if (!(context = audit_alloc_context(state))) { 1088 if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
975 while (vma) { 1171 while (vma) {
976 if ((vma->vm_flags & VM_EXECUTABLE) && 1172 if ((vma->vm_flags & VM_EXECUTABLE) &&
977 vma->vm_file) { 1173 vma->vm_file) {
978 audit_log_d_path(ab, "exe=", 1174 audit_log_d_path(ab, " exe=",
979 &vma->vm_file->f_path); 1175 &vma->vm_file->f_path);
980 break; 1176 break;
981 } 1177 }
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
1166 struct audit_buffer **ab, 1362 struct audit_buffer **ab,
1167 struct audit_aux_data_execve *axi) 1363 struct audit_aux_data_execve *axi)
1168{ 1364{
1169 int i; 1365 int i, len;
1170 size_t len, len_sent = 0; 1366 size_t len_sent = 0;
1171 const char __user *p; 1367 const char __user *p;
1172 char *buf; 1368 char *buf;
1173 1369
@@ -1249,7 +1445,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1249 case AUDIT_IPC: { 1445 case AUDIT_IPC: {
1250 u32 osid = context->ipc.osid; 1446 u32 osid = context->ipc.osid;
1251 1447
1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", 1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1253 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1449 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1254 if (osid) { 1450 if (osid) {
1255 char *ctx = NULL; 1451 char *ctx = NULL;
@@ -1267,7 +1463,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1267 ab = audit_log_start(context, GFP_KERNEL, 1463 ab = audit_log_start(context, GFP_KERNEL,
1268 AUDIT_IPC_SET_PERM); 1464 AUDIT_IPC_SET_PERM);
1269 audit_log_format(ab, 1465 audit_log_format(ab,
1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o", 1466 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1271 context->ipc.qbytes, 1467 context->ipc.qbytes,
1272 context->ipc.perm_uid, 1468 context->ipc.perm_uid,
1273 context->ipc.perm_gid, 1469 context->ipc.perm_gid,
@@ -1278,7 +1474,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1278 break; } 1474 break; }
1279 case AUDIT_MQ_OPEN: { 1475 case AUDIT_MQ_OPEN: {
1280 audit_log_format(ab, 1476 audit_log_format(ab,
1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " 1477 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
1282 "mq_msgsize=%ld mq_curmsgs=%ld", 1478 "mq_msgsize=%ld mq_curmsgs=%ld",
1283 context->mq_open.oflag, context->mq_open.mode, 1479 context->mq_open.oflag, context->mq_open.mode,
1284 context->mq_open.attr.mq_flags, 1480 context->mq_open.attr.mq_flags,
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
1324 audit_log_end(ab); 1520 audit_log_end(ab);
1325} 1521}
1326 1522
1523static void audit_log_name(struct audit_context *context, struct audit_names *n,
1524 int record_num, int *call_panic)
1525{
1526 struct audit_buffer *ab;
1527 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1528 if (!ab)
1529 return; /* audit_panic has been called */
1530
1531 audit_log_format(ab, "item=%d", record_num);
1532
1533 if (n->name) {
1534 switch (n->name_len) {
1535 case AUDIT_NAME_FULL:
1536 /* log the full path */
1537 audit_log_format(ab, " name=");
1538 audit_log_untrustedstring(ab, n->name);
1539 break;
1540 case 0:
1541 /* name was specified as a relative path and the
1542 * directory component is the cwd */
1543 audit_log_d_path(ab, " name=", &context->pwd);
1544 break;
1545 default:
1546 /* log the name's directory component */
1547 audit_log_format(ab, " name=");
1548 audit_log_n_untrustedstring(ab, n->name,
1549 n->name_len);
1550 }
1551 } else
1552 audit_log_format(ab, " name=(null)");
1553
1554 if (n->ino != (unsigned long)-1) {
1555 audit_log_format(ab, " inode=%lu"
1556 " dev=%02x:%02x mode=%#ho"
1557 " ouid=%u ogid=%u rdev=%02x:%02x",
1558 n->ino,
1559 MAJOR(n->dev),
1560 MINOR(n->dev),
1561 n->mode,
1562 n->uid,
1563 n->gid,
1564 MAJOR(n->rdev),
1565 MINOR(n->rdev));
1566 }
1567 if (n->osid != 0) {
1568 char *ctx = NULL;
1569 u32 len;
1570 if (security_secid_to_secctx(
1571 n->osid, &ctx, &len)) {
1572 audit_log_format(ab, " osid=%u", n->osid);
1573 *call_panic = 2;
1574 } else {
1575 audit_log_format(ab, " obj=%s", ctx);
1576 security_release_secctx(ctx, len);
1577 }
1578 }
1579
1580 audit_log_fcaps(ab, n);
1581
1582 audit_log_end(ab);
1583}
1584
1327static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1585static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1328{ 1586{
1329 const struct cred *cred; 1587 const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1331 struct audit_buffer *ab; 1589 struct audit_buffer *ab;
1332 struct audit_aux_data *aux; 1590 struct audit_aux_data *aux;
1333 const char *tty; 1591 const char *tty;
1592 struct audit_names *n;
1334 1593
1335 /* tsk == current */ 1594 /* tsk == current */
1336 context->pid = tsk->pid; 1595 context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1466 if (context->pwd.dentry && context->pwd.mnt) { 1725 if (context->pwd.dentry && context->pwd.mnt) {
1467 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); 1726 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
1468 if (ab) { 1727 if (ab) {
1469 audit_log_d_path(ab, "cwd=", &context->pwd); 1728 audit_log_d_path(ab, " cwd=", &context->pwd);
1470 audit_log_end(ab); 1729 audit_log_end(ab);
1471 } 1730 }
1472 } 1731 }
1473 for (i = 0; i < context->name_count; i++) {
1474 struct audit_names *n = &context->names[i];
1475 1732
1476 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 1733 i = 0;
1477 if (!ab) 1734 list_for_each_entry(n, &context->names_list, list)
1478 continue; /* audit_panic has been called */ 1735 audit_log_name(context, n, i++, &call_panic);
1479
1480 audit_log_format(ab, "item=%d", i);
1481
1482 if (n->name) {
1483 switch(n->name_len) {
1484 case AUDIT_NAME_FULL:
1485 /* log the full path */
1486 audit_log_format(ab, " name=");
1487 audit_log_untrustedstring(ab, n->name);
1488 break;
1489 case 0:
1490 /* name was specified as a relative path and the
1491 * directory component is the cwd */
1492 audit_log_d_path(ab, "name=", &context->pwd);
1493 break;
1494 default:
1495 /* log the name's directory component */
1496 audit_log_format(ab, " name=");
1497 audit_log_n_untrustedstring(ab, n->name,
1498 n->name_len);
1499 }
1500 } else
1501 audit_log_format(ab, " name=(null)");
1502
1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o"
1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino,
1508 MAJOR(n->dev),
1509 MINOR(n->dev),
1510 n->mode,
1511 n->uid,
1512 n->gid,
1513 MAJOR(n->rdev),
1514 MINOR(n->rdev));
1515 }
1516 if (n->osid != 0) {
1517 char *ctx = NULL;
1518 u32 len;
1519 if (security_secid_to_secctx(
1520 n->osid, &ctx, &len)) {
1521 audit_log_format(ab, " osid=%u", n->osid);
1522 call_panic = 2;
1523 } else {
1524 audit_log_format(ab, " obj=%s", ctx);
1525 security_release_secctx(ctx, len);
1526 }
1527 }
1528
1529 audit_log_fcaps(ab, n);
1530
1531 audit_log_end(ab);
1532 }
1533 1736
1534 /* Send end of event record to help user space know we are finished */ 1737 /* Send end of event record to help user space know we are finished */
1535 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1738 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1545 * 1748 *
1546 * Called from copy_process and do_exit 1749 * Called from copy_process and do_exit
1547 */ 1750 */
1548void audit_free(struct task_struct *tsk) 1751void __audit_free(struct task_struct *tsk)
1549{ 1752{
1550 struct audit_context *context; 1753 struct audit_context *context;
1551 1754
1552 context = audit_get_context(tsk, 0, 0); 1755 context = audit_get_context(tsk, 0, 0);
1553 if (likely(!context)) 1756 if (!context)
1554 return; 1757 return;
1555 1758
1556 /* Check for system calls that do not go through the exit 1759 /* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
1583 * will only be written if another part of the kernel requests that it 1786 * will only be written if another part of the kernel requests that it
1584 * be written). 1787 * be written).
1585 */ 1788 */
1586void audit_syscall_entry(int arch, int major, 1789void __audit_syscall_entry(int arch, int major,
1587 unsigned long a1, unsigned long a2, 1790 unsigned long a1, unsigned long a2,
1588 unsigned long a3, unsigned long a4) 1791 unsigned long a3, unsigned long a4)
1589{ 1792{
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
1591 struct audit_context *context = tsk->audit_context; 1794 struct audit_context *context = tsk->audit_context;
1592 enum audit_state state; 1795 enum audit_state state;
1593 1796
1594 if (unlikely(!context)) 1797 if (!context)
1595 return; 1798 return;
1596 1799
1597 /* 1800 /*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
1648 context->prio = 0; 1851 context->prio = 0;
1649 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1852 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1650 } 1853 }
1651 if (likely(state == AUDIT_DISABLED)) 1854 if (state == AUDIT_DISABLED)
1652 return; 1855 return;
1653 1856
1654 context->serial = 0; 1857 context->serial = 0;
@@ -1658,30 +1861,9 @@ void audit_syscall_entry(int arch, int major,
1658 context->ppid = 0; 1861 context->ppid = 0;
1659} 1862}
1660 1863
1661void audit_finish_fork(struct task_struct *child)
1662{
1663 struct audit_context *ctx = current->audit_context;
1664 struct audit_context *p = child->audit_context;
1665 if (!p || !ctx)
1666 return;
1667 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1668 return;
1669 p->arch = ctx->arch;
1670 p->major = ctx->major;
1671 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1672 p->ctime = ctx->ctime;
1673 p->dummy = ctx->dummy;
1674 p->in_syscall = ctx->in_syscall;
1675 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1676 p->ppid = current->pid;
1677 p->prio = ctx->prio;
1678 p->current_state = ctx->current_state;
1679}
1680
1681/** 1864/**
1682 * audit_syscall_exit - deallocate audit context after a system call 1865 * audit_syscall_exit - deallocate audit context after a system call
1683 * @valid: success/failure flag 1866 * @pt_regs: syscall registers
1684 * @return_code: syscall return value
1685 * 1867 *
1686 * Tear down after system call. If the audit context has been marked as 1868 * Tear down after system call. If the audit context has been marked as
1687 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1869 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1689,14 +1871,18 @@ void audit_finish_fork(struct task_struct *child)
1689 * message), then write out the syscall information. In call cases, 1871 * message), then write out the syscall information. In call cases,
1690 * free the names stored from getname(). 1872 * free the names stored from getname().
1691 */ 1873 */
1692void audit_syscall_exit(int valid, long return_code) 1874void __audit_syscall_exit(int success, long return_code)
1693{ 1875{
1694 struct task_struct *tsk = current; 1876 struct task_struct *tsk = current;
1695 struct audit_context *context; 1877 struct audit_context *context;
1696 1878
1697 context = audit_get_context(tsk, valid, return_code); 1879 if (success)
1880 success = AUDITSC_SUCCESS;
1881 else
1882 success = AUDITSC_FAILURE;
1698 1883
1699 if (likely(!context)) 1884 context = audit_get_context(tsk, success, return_code);
1885 if (!context)
1700 return; 1886 return;
1701 1887
1702 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1888 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2007,30 @@ retry:
1821#endif 2007#endif
1822} 2008}
1823 2009
2010static struct audit_names *audit_alloc_name(struct audit_context *context)
2011{
2012 struct audit_names *aname;
2013
2014 if (context->name_count < AUDIT_NAMES) {
2015 aname = &context->preallocated_names[context->name_count];
2016 memset(aname, 0, sizeof(*aname));
2017 } else {
2018 aname = kzalloc(sizeof(*aname), GFP_NOFS);
2019 if (!aname)
2020 return NULL;
2021 aname->should_free = true;
2022 }
2023
2024 aname->ino = (unsigned long)-1;
2025 list_add_tail(&aname->list, &context->names_list);
2026
2027 context->name_count++;
2028#if AUDIT_DEBUG
2029 context->ino_count++;
2030#endif
2031 return aname;
2032}
2033
1824/** 2034/**
1825 * audit_getname - add a name to the list 2035 * audit_getname - add a name to the list
1826 * @name: name to add 2036 * @name: name to add
@@ -1831,9 +2041,7 @@ retry:
1831void __audit_getname(const char *name) 2041void __audit_getname(const char *name)
1832{ 2042{
1833 struct audit_context *context = current->audit_context; 2043 struct audit_context *context = current->audit_context;
1834 2044 struct audit_names *n;
1835 if (IS_ERR(name) || !name)
1836 return;
1837 2045
1838 if (!context->in_syscall) { 2046 if (!context->in_syscall) {
1839#if AUDIT_DEBUG == 2 2047#if AUDIT_DEBUG == 2
@@ -1843,13 +2051,15 @@ void __audit_getname(const char *name)
1843#endif 2051#endif
1844 return; 2052 return;
1845 } 2053 }
1846 BUG_ON(context->name_count >= AUDIT_NAMES); 2054
1847 context->names[context->name_count].name = name; 2055 n = audit_alloc_name(context);
1848 context->names[context->name_count].name_len = AUDIT_NAME_FULL; 2056 if (!n)
1849 context->names[context->name_count].name_put = 1; 2057 return;
1850 context->names[context->name_count].ino = (unsigned long)-1; 2058
1851 context->names[context->name_count].osid = 0; 2059 n->name = name;
1852 ++context->name_count; 2060 n->name_len = AUDIT_NAME_FULL;
2061 n->name_put = true;
2062
1853 if (!context->pwd.dentry) 2063 if (!context->pwd.dentry)
1854 get_fs_pwd(current->fs, &context->pwd); 2064 get_fs_pwd(current->fs, &context->pwd);
1855} 2065}
@@ -1871,12 +2081,13 @@ void audit_putname(const char *name)
1871 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 2081 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
1872 __FILE__, __LINE__, context->serial, name); 2082 __FILE__, __LINE__, context->serial, name);
1873 if (context->name_count) { 2083 if (context->name_count) {
2084 struct audit_names *n;
1874 int i; 2085 int i;
1875 for (i = 0; i < context->name_count; i++) 2086
2087 list_for_each_entry(n, &context->names_list, list)
1876 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2088 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1877 context->names[i].name, 2089 n->name, n->name ?: "(null)");
1878 context->names[i].name ?: "(null)"); 2090 }
1879 }
1880#endif 2091#endif
1881 __putname(name); 2092 __putname(name);
1882 } 2093 }
@@ -1897,39 +2108,11 @@ void audit_putname(const char *name)
1897#endif 2108#endif
1898} 2109}
1899 2110
1900static int audit_inc_name_count(struct audit_context *context,
1901 const struct inode *inode)
1902{
1903 if (context->name_count >= AUDIT_NAMES) {
1904 if (inode)
1905 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1906 "dev=%02x:%02x, inode=%lu\n",
1907 MAJOR(inode->i_sb->s_dev),
1908 MINOR(inode->i_sb->s_dev),
1909 inode->i_ino);
1910
1911 else
1912 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1913 return 1;
1914 }
1915 context->name_count++;
1916#if AUDIT_DEBUG
1917 context->ino_count++;
1918#endif
1919 return 0;
1920}
1921
1922
1923static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) 2111static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1924{ 2112{
1925 struct cpu_vfs_cap_data caps; 2113 struct cpu_vfs_cap_data caps;
1926 int rc; 2114 int rc;
1927 2115
1928 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1929 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1930 name->fcap.fE = 0;
1931 name->fcap_ver = 0;
1932
1933 if (!dentry) 2116 if (!dentry)
1934 return 0; 2117 return 0;
1935 2118
@@ -1969,30 +2152,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
1969 */ 2152 */
1970void __audit_inode(const char *name, const struct dentry *dentry) 2153void __audit_inode(const char *name, const struct dentry *dentry)
1971{ 2154{
1972 int idx;
1973 struct audit_context *context = current->audit_context; 2155 struct audit_context *context = current->audit_context;
1974 const struct inode *inode = dentry->d_inode; 2156 const struct inode *inode = dentry->d_inode;
2157 struct audit_names *n;
1975 2158
1976 if (!context->in_syscall) 2159 if (!context->in_syscall)
1977 return; 2160 return;
1978 if (context->name_count 2161
1979 && context->names[context->name_count-1].name 2162 list_for_each_entry_reverse(n, &context->names_list, list) {
1980 && context->names[context->name_count-1].name == name) 2163 if (n->name && (n->name == name))
1981 idx = context->name_count - 1; 2164 goto out;
1982 else if (context->name_count > 1
1983 && context->names[context->name_count-2].name
1984 && context->names[context->name_count-2].name == name)
1985 idx = context->name_count - 2;
1986 else {
1987 /* FIXME: how much do we care about inodes that have no
1988 * associated name? */
1989 if (audit_inc_name_count(context, inode))
1990 return;
1991 idx = context->name_count - 1;
1992 context->names[idx].name = NULL;
1993 } 2165 }
2166
2167 /* unable to find the name from a previous getname() */
2168 n = audit_alloc_name(context);
2169 if (!n)
2170 return;
2171out:
1994 handle_path(dentry); 2172 handle_path(dentry);
1995 audit_copy_inode(&context->names[idx], dentry, inode); 2173 audit_copy_inode(n, dentry, inode);
1996} 2174}
1997 2175
1998/** 2176/**
@@ -2011,11 +2189,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2011void __audit_inode_child(const struct dentry *dentry, 2189void __audit_inode_child(const struct dentry *dentry,
2012 const struct inode *parent) 2190 const struct inode *parent)
2013{ 2191{
2014 int idx;
2015 struct audit_context *context = current->audit_context; 2192 struct audit_context *context = current->audit_context;
2016 const char *found_parent = NULL, *found_child = NULL; 2193 const char *found_parent = NULL, *found_child = NULL;
2017 const struct inode *inode = dentry->d_inode; 2194 const struct inode *inode = dentry->d_inode;
2018 const char *dname = dentry->d_name.name; 2195 const char *dname = dentry->d_name.name;
2196 struct audit_names *n;
2019 int dirlen = 0; 2197 int dirlen = 0;
2020 2198
2021 if (!context->in_syscall) 2199 if (!context->in_syscall)
@@ -2025,9 +2203,7 @@ void __audit_inode_child(const struct dentry *dentry,
2025 handle_one(inode); 2203 handle_one(inode);
2026 2204
2027 /* parent is more likely, look for it first */ 2205 /* parent is more likely, look for it first */
2028 for (idx = 0; idx < context->name_count; idx++) { 2206 list_for_each_entry(n, &context->names_list, list) {
2029 struct audit_names *n = &context->names[idx];
2030
2031 if (!n->name) 2207 if (!n->name)
2032 continue; 2208 continue;
2033 2209
@@ -2040,9 +2216,7 @@ void __audit_inode_child(const struct dentry *dentry,
2040 } 2216 }
2041 2217
2042 /* no matching parent, look for matching child */ 2218 /* no matching parent, look for matching child */
2043 for (idx = 0; idx < context->name_count; idx++) { 2219 list_for_each_entry(n, &context->names_list, list) {
2044 struct audit_names *n = &context->names[idx];
2045
2046 if (!n->name) 2220 if (!n->name)
2047 continue; 2221 continue;
2048 2222
@@ -2060,34 +2234,29 @@ void __audit_inode_child(const struct dentry *dentry,
2060 2234
2061add_names: 2235add_names:
2062 if (!found_parent) { 2236 if (!found_parent) {
2063 if (audit_inc_name_count(context, parent)) 2237 n = audit_alloc_name(context);
2238 if (!n)
2064 return; 2239 return;
2065 idx = context->name_count - 1; 2240 audit_copy_inode(n, NULL, parent);
2066 context->names[idx].name = NULL;
2067 audit_copy_inode(&context->names[idx], NULL, parent);
2068 } 2241 }
2069 2242
2070 if (!found_child) { 2243 if (!found_child) {
2071 if (audit_inc_name_count(context, inode)) 2244 n = audit_alloc_name(context);
2245 if (!n)
2072 return; 2246 return;
2073 idx = context->name_count - 1;
2074 2247
2075 /* Re-use the name belonging to the slot for a matching parent 2248 /* Re-use the name belonging to the slot for a matching parent
2076 * directory. All names for this context are relinquished in 2249 * directory. All names for this context are relinquished in
2077 * audit_free_names() */ 2250 * audit_free_names() */
2078 if (found_parent) { 2251 if (found_parent) {
2079 context->names[idx].name = found_parent; 2252 n->name = found_parent;
2080 context->names[idx].name_len = AUDIT_NAME_FULL; 2253 n->name_len = AUDIT_NAME_FULL;
2081 /* don't call __putname() */ 2254 /* don't call __putname() */
2082 context->names[idx].name_put = 0; 2255 n->name_put = false;
2083 } else {
2084 context->names[idx].name = NULL;
2085 } 2256 }
2086 2257
2087 if (inode) 2258 if (inode)
2088 audit_copy_inode(&context->names[idx], NULL, inode); 2259 audit_copy_inode(n, NULL, inode);
2089 else
2090 context->names[idx].ino = (unsigned long)-1;
2091 } 2260 }
2092} 2261}
2093EXPORT_SYMBOL_GPL(__audit_inode_child); 2262EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2290,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
2121static atomic_t session_id = ATOMIC_INIT(0); 2290static atomic_t session_id = ATOMIC_INIT(0);
2122 2291
2123/** 2292/**
2124 * audit_set_loginuid - set a task's audit_context loginuid 2293 * audit_set_loginuid - set current task's audit_context loginuid
2125 * @task: task whose audit context is being modified
2126 * @loginuid: loginuid value 2294 * @loginuid: loginuid value
2127 * 2295 *
2128 * Returns 0. 2296 * Returns 0.
2129 * 2297 *
2130 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2298 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2131 */ 2299 */
2132int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 2300int audit_set_loginuid(uid_t loginuid)
2133{ 2301{
2134 unsigned int sessionid = atomic_inc_return(&session_id); 2302 struct task_struct *task = current;
2135 struct audit_context *context = task->audit_context; 2303 struct audit_context *context = task->audit_context;
2304 unsigned int sessionid;
2305
2306#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2307 if (task->loginuid != -1)
2308 return -EPERM;
2309#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2310 if (!capable(CAP_AUDIT_CONTROL))
2311 return -EPERM;
2312#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2136 2313
2314 sessionid = atomic_inc_return(&session_id);
2137 if (context && context->in_syscall) { 2315 if (context && context->in_syscall) {
2138 struct audit_buffer *ab; 2316 struct audit_buffer *ab;
2139 2317
@@ -2160,7 +2338,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2160 * @attr: queue attributes 2338 * @attr: queue attributes
2161 * 2339 *
2162 */ 2340 */
2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2341void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
2164{ 2342{
2165 struct audit_context *context = current->audit_context; 2343 struct audit_context *context = current->audit_context;
2166 2344
@@ -2260,7 +2438,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2260 * 2438 *
2261 * Called only after audit_ipc_obj(). 2439 * Called only after audit_ipc_obj().
2262 */ 2440 */
2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2441void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
2264{ 2442{
2265 struct audit_context *context = current->audit_context; 2443 struct audit_context *context = current->audit_context;
2266 2444
@@ -2271,14 +2449,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
2271 context->ipc.has_perm = 1; 2449 context->ipc.has_perm = 1;
2272} 2450}
2273 2451
2274int audit_bprm(struct linux_binprm *bprm) 2452int __audit_bprm(struct linux_binprm *bprm)
2275{ 2453{
2276 struct audit_aux_data_execve *ax; 2454 struct audit_aux_data_execve *ax;
2277 struct audit_context *context = current->audit_context; 2455 struct audit_context *context = current->audit_context;
2278 2456
2279 if (likely(!audit_enabled || !context || context->dummy))
2280 return 0;
2281
2282 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2457 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2283 if (!ax) 2458 if (!ax)
2284 return -ENOMEM; 2459 return -ENOMEM;
@@ -2299,13 +2474,10 @@ int audit_bprm(struct linux_binprm *bprm)
2299 * @args: args array 2474 * @args: args array
2300 * 2475 *
2301 */ 2476 */
2302void audit_socketcall(int nargs, unsigned long *args) 2477void __audit_socketcall(int nargs, unsigned long *args)
2303{ 2478{
2304 struct audit_context *context = current->audit_context; 2479 struct audit_context *context = current->audit_context;
2305 2480
2306 if (likely(!context || context->dummy))
2307 return;
2308
2309 context->type = AUDIT_SOCKETCALL; 2481 context->type = AUDIT_SOCKETCALL;
2310 context->socketcall.nargs = nargs; 2482 context->socketcall.nargs = nargs;
2311 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2483 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2503,10 @@ void __audit_fd_pair(int fd1, int fd2)
2331 * 2503 *
2332 * Returns 0 for success or NULL context or < 0 on error. 2504 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2505 */
2334int audit_sockaddr(int len, void *a) 2506int __audit_sockaddr(int len, void *a)
2335{ 2507{
2336 struct audit_context *context = current->audit_context; 2508 struct audit_context *context = current->audit_context;
2337 2509
2338 if (likely(!context || context->dummy))
2339 return 0;
2340
2341 if (!context->sockaddr) { 2510 if (!context->sockaddr) {
2342 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); 2511 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2343 if (!p) 2512 if (!p)
@@ -2499,6 +2668,25 @@ void __audit_mmap_fd(int fd, int flags)
2499 context->type = AUDIT_MMAP; 2668 context->type = AUDIT_MMAP;
2500} 2669}
2501 2670
2671static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2672{
2673 uid_t auid, uid;
2674 gid_t gid;
2675 unsigned int sessionid;
2676
2677 auid = audit_get_loginuid(current);
2678 sessionid = audit_get_sessionid(current);
2679 current_uid_gid(&uid, &gid);
2680
2681 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2682 auid, uid, gid, sessionid);
2683 audit_log_task_context(ab);
2684 audit_log_format(ab, " pid=%d comm=", current->pid);
2685 audit_log_untrustedstring(ab, current->comm);
2686 audit_log_format(ab, " reason=");
2687 audit_log_string(ab, reason);
2688 audit_log_format(ab, " sig=%ld", signr);
2689}
2502/** 2690/**
2503 * audit_core_dumps - record information about processes that end abnormally 2691 * audit_core_dumps - record information about processes that end abnormally
2504 * @signr: signal value 2692 * @signr: signal value
@@ -2509,10 +2697,6 @@ void __audit_mmap_fd(int fd, int flags)
2509void audit_core_dumps(long signr) 2697void audit_core_dumps(long signr)
2510{ 2698{
2511 struct audit_buffer *ab; 2699 struct audit_buffer *ab;
2512 u32 sid;
2513 uid_t auid = audit_get_loginuid(current), uid;
2514 gid_t gid;
2515 unsigned int sessionid = audit_get_sessionid(current);
2516 2700
2517 if (!audit_enabled) 2701 if (!audit_enabled)
2518 return; 2702 return;
@@ -2521,24 +2705,17 @@ void audit_core_dumps(long signr)
2521 return; 2705 return;
2522 2706
2523 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2707 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2524 current_uid_gid(&uid, &gid); 2708 audit_log_abend(ab, "memory violation", signr);
2525 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2709 audit_log_end(ab);
2526 auid, uid, gid, sessionid); 2710}
2527 security_task_getsecid(current, &sid);
2528 if (sid) {
2529 char *ctx = NULL;
2530 u32 len;
2531 2711
2532 if (security_secid_to_secctx(sid, &ctx, &len)) 2712void __audit_seccomp(unsigned long syscall)
2533 audit_log_format(ab, " ssid=%u", sid); 2713{
2534 else { 2714 struct audit_buffer *ab;
2535 audit_log_format(ab, " subj=%s", ctx); 2715
2536 security_release_secctx(ctx, len); 2716 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2537 } 2717 audit_log_abend(ab, "seccomp", SIGKILL);
2538 } 2718 audit_log_format(ab, " syscall=%ld", syscall);
2539 audit_log_format(ab, " pid=%d comm=", current->pid);
2540 audit_log_untrustedstring(ab, current->comm);
2541 audit_log_format(ab, " sig=%ld", signr);
2542 audit_log_end(ab); 2719 audit_log_end(ab);
2543} 2720}
2544 2721
diff --git a/kernel/capability.c b/kernel/capability.c
index b463871a4e69..3f1adb6c6470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,74 +287,84 @@ error:
287} 287}
288 288
289/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns 290 * has_ns_capability - Does a task have a capability in a specific user ns
291 * @t: The task in question 291 * @t: The task in question
292 * @ns: target user namespace
292 * @cap: The capability to be tested for 293 * @cap: The capability to be tested for
293 * 294 *
294 * Return true if the specified task has the given superior capability 295 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not. 296 * currently in effect to the specified user namespace, false if not.
296 * 297 *
297 * Note that this does not set PF_SUPERPRIV on the task. 298 * Note that this does not set PF_SUPERPRIV on the task.
298 */ 299 */
299bool has_capability(struct task_struct *t, int cap) 300bool has_ns_capability(struct task_struct *t,
301 struct user_namespace *ns, int cap)
300{ 302{
301 int ret = security_real_capable(t, &init_user_ns, cap); 303 int ret;
304
305 rcu_read_lock();
306 ret = security_capable(__task_cred(t), ns, cap);
307 rcu_read_unlock();
302 308
303 return (ret == 0); 309 return (ret == 0);
304} 310}
305 311
306/** 312/**
307 * has_capability - Does a task have a capability in a specific user ns 313 * has_capability - Does a task have a capability in init_user_ns
308 * @t: The task in question 314 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for 315 * @cap: The capability to be tested for
311 * 316 *
312 * Return true if the specified task has the given superior capability 317 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not. 318 * currently in effect to the initial user namespace, false if not.
314 * 319 *
315 * Note that this does not set PF_SUPERPRIV on the task. 320 * Note that this does not set PF_SUPERPRIV on the task.
316 */ 321 */
317bool has_ns_capability(struct task_struct *t, 322bool has_capability(struct task_struct *t, int cap)
318 struct user_namespace *ns, int cap)
319{ 323{
320 int ret = security_real_capable(t, ns, cap); 324 return has_ns_capability(t, &init_user_ns, cap);
321
322 return (ret == 0);
323} 325}
324 326
325/** 327/**
326 * has_capability_noaudit - Does a task have a capability (unaudited) 328 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
329 * in a specific user ns.
327 * @t: The task in question 330 * @t: The task in question
331 * @ns: target user namespace
328 * @cap: The capability to be tested for 332 * @cap: The capability to be tested for
329 * 333 *
330 * Return true if the specified task has the given superior capability 334 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an 335 * currently in effect to the specified user namespace, false if not.
332 * audit message for the check. 336 * Do not write an audit message for the check.
333 * 337 *
334 * Note that this does not set PF_SUPERPRIV on the task. 338 * Note that this does not set PF_SUPERPRIV on the task.
335 */ 339 */
336bool has_capability_noaudit(struct task_struct *t, int cap) 340bool has_ns_capability_noaudit(struct task_struct *t,
341 struct user_namespace *ns, int cap)
337{ 342{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap); 343 int ret;
344
345 rcu_read_lock();
346 ret = security_capable_noaudit(__task_cred(t), ns, cap);
347 rcu_read_unlock();
339 348
340 return (ret == 0); 349 return (ret == 0);
341} 350}
342 351
343/** 352/**
344 * capable - Determine if the current task has a superior capability in effect 353 * has_capability_noaudit - Does a task have a capability (unaudited) in the
354 * initial user ns
355 * @t: The task in question
345 * @cap: The capability to be tested for 356 * @cap: The capability to be tested for
346 * 357 *
347 * Return true if the current task has the given superior capability currently 358 * Return true if the specified task has the given superior capability
348 * available for use, false if not. 359 * currently in effect to init_user_ns, false if not. Don't write an
360 * audit message for the check.
349 * 361 *
350 * This sets PF_SUPERPRIV on the task if the capability is available on the 362 * Note that this does not set PF_SUPERPRIV on the task.
351 * assumption that it's about to be used.
352 */ 363 */
353bool capable(int cap) 364bool has_capability_noaudit(struct task_struct *t, int cap)
354{ 365{
355 return ns_capable(&init_user_ns, cap); 366 return has_ns_capability_noaudit(t, &init_user_ns, cap);
356} 367}
357EXPORT_SYMBOL(capable);
358 368
359/** 369/**
360 * ns_capable - Determine if the current task has a superior capability in effect 370 * ns_capable - Determine if the current task has a superior capability in effect
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
374 BUG(); 384 BUG();
375 } 385 }
376 386
377 if (security_capable(ns, current_cred(), cap) == 0) { 387 if (security_capable(current_cred(), ns, cap) == 0) {
378 current->flags |= PF_SUPERPRIV; 388 current->flags |= PF_SUPERPRIV;
379 return true; 389 return true;
380 } 390 }
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap)
383EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
384 394
385/** 395/**
386 * task_ns_capable - Determine whether current task has a superior 396 * capable - Determine if the current task has a superior capability in effect
387 * capability targeted at a specific task's user namespace. 397 * @cap: The capability to be tested for
388 * @t: The task whose user namespace is targeted. 398 *
389 * @cap: The capability in question. 399 * Return true if the current task has the given superior capability currently
400 * available for use, false if not.
390 * 401 *
391 * Return true if it does, false otherwise. 402 * This sets PF_SUPERPRIV on the task if the capability is available on the
403 * assumption that it's about to be used.
392 */ 404 */
393bool task_ns_capable(struct task_struct *t, int cap) 405bool capable(int cap)
394{ 406{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 407 return ns_capable(&init_user_ns, cap);
396} 408}
397EXPORT_SYMBOL(task_ns_capable); 409EXPORT_SYMBOL(capable);
398 410
399/** 411/**
400 * nsown_capable - Check superior capability to one's own user_ns 412 * nsown_capable - Check superior capability to one's own user_ns
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a184470cf9b5..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 * -> cgroup_mkdir. 777 * -> cgroup_mkdir.
761 */ 778 */
762 779
763static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 780static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
764static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 781static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
765static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 782static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
766static int cgroup_populate_dir(struct cgroup *cgrp); 783static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
775static int alloc_css_id(struct cgroup_subsys *ss, 792static int alloc_css_id(struct cgroup_subsys *ss,
776 struct cgroup *parent, struct cgroup *child); 793 struct cgroup *parent, struct cgroup *child);
777 794
778static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 795static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
779{ 796{
780 struct inode *inode = new_inode(sb); 797 struct inode *inode = new_inode(sb);
781 798
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1038 return 0; 1056 return 0;
1039} 1057}
1040 1058
1041static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1059static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1042{ 1060{
1043 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2060 read_unlock(&tasklist_lock); 2148 read_unlock(&tasklist_lock);
2061 2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 oldcg = tsk->cgroups;
2102 get_css_set(oldcg);
2103 task_unlock(tsk);
2104 /* see if the new one for us is already in the list? */
2105 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2106 /* was already there, nothing to do. */
2107 put_css_set(oldcg);
2108 } else {
2109 /* we don't already have it. get new one. */
2110 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2111 put_css_set(oldcg);
2112 if (retval) 2181 if (retval)
2113 goto out_list_teardown; 2182 goto out_list_teardown;
2114 } 2183 }
2115 } 2184 }
2116 2185
2117 /* 2186 /*
2118 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2119 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2120 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2121 * the commit point.
2122 */ 2190 */
2123 for_each_subsys(root, ss) {
2124 if (ss->pre_attach)
2125 ss->pre_attach(cgrp);
2126 }
2127 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2128 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2129 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2130 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2131 if (cgrp == oldcgrp)
2132 continue;
2133 /* if the thread is PF_EXITING, it can just get skipped. */
2134 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2135 if (retval == 0) {
2136 /* attach each task to each subsystem */
2137 for_each_subsys(root, ss) {
2138 if (ss->attach_task)
2139 ss->attach_task(cgrp, tsk);
2140 }
2141 } else {
2142 BUG_ON(retval != -ESRCH);
2143 }
2144 } 2195 }
2145 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2146 2197
2147 /* 2198 /*
2148 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2149 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2150 * being moved, this call will need to be reworked to communicate that.
2151 */ 2200 */
2152 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2153 if (ss->attach) 2202 if (ss->attach)
2154 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2155 } 2204 }
2156 2205
2157 /* 2206 /*
@@ -2171,20 +2220,12 @@ out_cancel_attach:
2171 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2172 if (retval) { 2221 if (retval) {
2173 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2174 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2175 if (cancel_failed_ss && ss->cancel_attach)
2176 ss->cancel_attach(ss, cgrp, leader);
2177 break; 2224 break;
2178 }
2179 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2180 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2181 } 2227 }
2182 } 2228 }
2183 /* clean up the array of referenced threads in the group. */
2184 for (i = 0; i < group_size; i++) {
2185 tsk = flex_array_get_ptr(group, i);
2186 put_task_struct(tsk);
2187 }
2188out_free_group_list: 2229out_free_group_list:
2189 flex_array_free(group); 2230 flex_array_free(group);
2190 return retval; 2231 return retval;
@@ -2192,8 +2233,8 @@ out_free_group_list:
2192 2233
2193/* 2234/*
2194 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2195 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2196 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2197 */ 2238 */
2198static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2199{ 2240{
@@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2220 * detect it later. 2261 * detect it later.
2221 */ 2262 */
2222 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2223 } else if (tsk->flags & PF_EXITING) {
2224 /* optimization for the single-task-only case */
2225 rcu_read_unlock();
2226 cgroup_unlock();
2227 return -ESRCH;
2228 } 2264 }
2229
2230 /* 2265 /*
2231 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2232 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2249 get_task_struct(tsk); 2284 get_task_struct(tsk);
2250 } 2285 }
2251 2286
2252 if (threadgroup) { 2287 threadgroup_lock(tsk);
2253 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2254 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2255 threadgroup_fork_write_unlock(tsk); 2291 else
2256 } else {
2257 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2258 } 2293
2294 threadgroup_unlock(tsk);
2295
2259 put_task_struct(tsk); 2296 put_task_struct(tsk);
2260 cgroup_unlock(); 2297 cgroup_unlock();
2261 return ret; 2298 return ret;
@@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2306 return -EINVAL; 2343 return -EINVAL;
2307 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2308 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2309 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2310 cgroup_unlock(); 2349 cgroup_unlock();
2311 return 0; 2350 return 0;
2312} 2351}
@@ -2585,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
2585 return __d_cft(file->f_dentry); 2624 return __d_cft(file->f_dentry);
2586} 2625}
2587 2626
2588static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2627static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2589 struct super_block *sb) 2628 struct super_block *sb)
2590{ 2629{
2591 struct inode *inode; 2630 struct inode *inode;
@@ -2626,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2626 * @mode: mode to set on new directory. 2665 * @mode: mode to set on new directory.
2627 */ 2666 */
2628static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2667static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2629 mode_t mode) 2668 umode_t mode)
2630{ 2669{
2631 struct dentry *parent; 2670 struct dentry *parent;
2632 int error = 0; 2671 int error = 0;
@@ -2653,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2653 * returns S_IRUGO if it has only a read handler 2692 * returns S_IRUGO if it has only a read handler
2654 * returns S_IWUSR if it has only a write hander 2693 * returns S_IWUSR if it has only a write hander
2655 */ 2694 */
2656static mode_t cgroup_file_mode(const struct cftype *cft) 2695static umode_t cgroup_file_mode(const struct cftype *cft)
2657{ 2696{
2658 mode_t mode = 0; 2697 umode_t mode = 0;
2659 2698
2660 if (cft->mode) 2699 if (cft->mode)
2661 return cft->mode; 2700 return cft->mode;
@@ -2678,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2678 struct dentry *dir = cgrp->dentry; 2717 struct dentry *dir = cgrp->dentry;
2679 struct dentry *dentry; 2718 struct dentry *dentry;
2680 int error; 2719 int error;
2681 mode_t mode; 2720 umode_t mode;
2682 2721
2683 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2684 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2789} 2828}
2790 2829
2791void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2792{ 2832{
2793 /* 2833 /*
2794 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2828} 2868}
2829 2869
2830void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2831{ 2872{
2832 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2833} 2874}
@@ -3752,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3752 * Must be called with the mutex on the parent inode held 3793 * Must be called with the mutex on the parent inode held
3753 */ 3794 */
3754static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3795static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3755 mode_t mode) 3796 umode_t mode)
3756{ 3797{
3757 struct cgroup *cgrp; 3798 struct cgroup *cgrp;
3758 struct cgroupfs_root *root = parent->root; 3799 struct cgroupfs_root *root = parent->root;
@@ -3846,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3846 return err; 3887 return err;
3847} 3888}
3848 3889
3849static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3890static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3850{ 3891{
3851 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3892 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3852 3893
@@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4491 * 4532 *
4492 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4493 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4494 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4495 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4496 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4497 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
4541 * Outside the pointer validity we also need to process the css_set
4542 * inheritance between threadgoup_change_begin() and
4543 * threadgoup_change_end(), this way there is no leak in any process
4544 * wide migration performed by cgroup_attach_proc() that could otherwise
4545 * miss a thread because it is too early or too late in the fork stage.
4498 * 4546 *
4499 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4500 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4501 */ 4549 */
4502void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4503{ 4551{
4504 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4505 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4506 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4507 task_unlock(current);
4508 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4509} 4561}
4510 4562
@@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4546{ 4598{
4547 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4548 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4549 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4550 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
4609 * against cgroup_exit() setting child->cgroup to
4610 * init_css_set.
4611 */
4551 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4552 task_unlock(child); 4613 }
4553 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4554 } 4615 }
4555} 4616}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 213c0351dad8..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51static inline int __cgroup_freezing_or_frozen(struct task_struct *task) 51bool cgroup_freezing(struct task_struct *task)
52{ 52{
53 enum freezer_state state = task_freezer(task)->state; 53 enum freezer_state state;
54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); 54 bool ret;
55}
56 55
57int cgroup_freezing_or_frozen(struct task_struct *task) 56 rcu_read_lock();
58{ 57 state = task_freezer(task)->state;
59 int result; 58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
60 task_lock(task); 59 rcu_read_unlock();
61 result = __cgroup_freezing_or_frozen(task); 60
62 task_unlock(task); 61 return ret;
63 return result;
64} 62}
65 63
66/* 64/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
102 * freezer_can_attach(): 100 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach) 101 * cgroup_mutex (held by caller of can_attach)
104 * 102 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock 104 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing) 105 * sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
130 * write_lock css_set_lock (cgroup iterator start) 125 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock 126 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start) 127 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
134 * sighand->siglock 129 * sighand->siglock
135 */ 130 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
150static void freezer_destroy(struct cgroup_subsys *ss, 145static void freezer_destroy(struct cgroup_subsys *ss,
151 struct cgroup *cgroup) 146 struct cgroup *cgroup)
152{ 147{
153 kfree(cgroup_freezer(cgroup)); 148 struct freezer *freezer = cgroup_freezer(cgroup);
149
150 if (freezer->state != CGROUP_THAWED)
151 atomic_dec(&system_freezing_cnt);
152 kfree(freezer);
154} 153}
155 154
156/* task is frozen or will freeze immediately when next it gets woken */ 155/* task is frozen or will freeze immediately when next it gets woken */
@@ -167,13 +166,17 @@ static bool is_task_frozen_enough(struct task_struct *task)
167 */ 166 */
168static int freezer_can_attach(struct cgroup_subsys *ss, 167static int freezer_can_attach(struct cgroup_subsys *ss,
169 struct cgroup *new_cgroup, 168 struct cgroup *new_cgroup,
170 struct task_struct *task) 169 struct cgroup_taskset *tset)
171{ 170{
172 struct freezer *freezer; 171 struct freezer *freezer;
172 struct task_struct *task;
173 173
174 /* 174 /*
175 * Anything frozen can't move or be moved to/from. 175 * Anything frozen can't move or be moved to/from.
176 */ 176 */
177 cgroup_taskset_for_each(task, new_cgroup, tset)
178 if (cgroup_freezing(task))
179 return -EBUSY;
177 180
178 freezer = cgroup_freezer(new_cgroup); 181 freezer = cgroup_freezer(new_cgroup);
179 if (freezer->state != CGROUP_THAWED) 182 if (freezer->state != CGROUP_THAWED)
@@ -182,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
182 return 0; 185 return 0;
183} 186}
184 187
185static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
186{
187 rcu_read_lock();
188 if (__cgroup_freezing_or_frozen(tsk)) {
189 rcu_read_unlock();
190 return -EBUSY;
191 }
192 rcu_read_unlock();
193 return 0;
194}
195
196static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
197{ 189{
198 struct freezer *freezer; 190 struct freezer *freezer;
@@ -220,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
220 212
221 /* Locking avoids race with FREEZING -> THAWED transitions. */ 213 /* Locking avoids race with FREEZING -> THAWED transitions. */
222 if (freezer->state == CGROUP_FREEZING) 214 if (freezer->state == CGROUP_FREEZING)
223 freeze_task(task, true); 215 freeze_task(task);
224 spin_unlock_irq(&freezer->lock); 216 spin_unlock_irq(&freezer->lock);
225} 217}
226 218
@@ -238,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
238 cgroup_iter_start(cgroup, &it); 230 cgroup_iter_start(cgroup, &it);
239 while ((task = cgroup_iter_next(cgroup, &it))) { 231 while ((task = cgroup_iter_next(cgroup, &it))) {
240 ntotal++; 232 ntotal++;
241 if (is_task_frozen_enough(task)) 233 if (freezing(task) && is_task_frozen_enough(task))
242 nfrozen++; 234 nfrozen++;
243 } 235 }
244 236
@@ -286,10 +278,9 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
286 struct task_struct *task; 278 struct task_struct *task;
287 unsigned int num_cant_freeze_now = 0; 279 unsigned int num_cant_freeze_now = 0;
288 280
289 freezer->state = CGROUP_FREEZING;
290 cgroup_iter_start(cgroup, &it); 281 cgroup_iter_start(cgroup, &it);
291 while ((task = cgroup_iter_next(cgroup, &it))) { 282 while ((task = cgroup_iter_next(cgroup, &it))) {
292 if (!freeze_task(task, true)) 283 if (!freeze_task(task))
293 continue; 284 continue;
294 if (is_task_frozen_enough(task)) 285 if (is_task_frozen_enough(task))
295 continue; 286 continue;
@@ -307,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
307 struct task_struct *task; 298 struct task_struct *task;
308 299
309 cgroup_iter_start(cgroup, &it); 300 cgroup_iter_start(cgroup, &it);
310 while ((task = cgroup_iter_next(cgroup, &it))) { 301 while ((task = cgroup_iter_next(cgroup, &it)))
311 thaw_process(task); 302 __thaw_task(task);
312 }
313 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
314
315 freezer->state = CGROUP_THAWED;
316} 304}
317 305
318static int freezer_change_state(struct cgroup *cgroup, 306static int freezer_change_state(struct cgroup *cgroup,
@@ -326,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
326 spin_lock_irq(&freezer->lock); 314 spin_lock_irq(&freezer->lock);
327 315
328 update_if_frozen(cgroup, freezer); 316 update_if_frozen(cgroup, freezer);
329 if (goal_state == freezer->state)
330 goto out;
331 317
332 switch (goal_state) { 318 switch (goal_state) {
333 case CGROUP_THAWED: 319 case CGROUP_THAWED:
320 if (freezer->state != CGROUP_THAWED)
321 atomic_dec(&system_freezing_cnt);
322 freezer->state = CGROUP_THAWED;
334 unfreeze_cgroup(cgroup, freezer); 323 unfreeze_cgroup(cgroup, freezer);
335 break; 324 break;
336 case CGROUP_FROZEN: 325 case CGROUP_FROZEN:
326 if (freezer->state == CGROUP_THAWED)
327 atomic_inc(&system_freezing_cnt);
328 freezer->state = CGROUP_FREEZING;
337 retval = try_to_freeze_cgroup(cgroup, freezer); 329 retval = try_to_freeze_cgroup(cgroup, freezer);
338 break; 330 break;
339 default: 331 default:
340 BUG(); 332 BUG();
341 } 333 }
342out: 334
343 spin_unlock_irq(&freezer->lock); 335 spin_unlock_irq(&freezer->lock);
344 336
345 return retval; 337 return retval;
@@ -388,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
388 .populate = freezer_populate, 380 .populate = freezer_populate,
389 .subsys_id = freezer_subsys_id, 381 .subsys_id = freezer_subsys_id,
390 .can_attach = freezer_can_attach, 382 .can_attach = freezer_can_attach,
391 .can_attach_task = freezer_can_attach_task,
392 .pre_attach = NULL,
393 .attach_task = NULL,
394 .attach = NULL,
395 .fork = freezer_fork, 383 .fork = freezer_fork,
396 .exit = NULL,
397}; 384};
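The can_attach conversion above follows the new cgroup_taskset API, which hands each callback the whole set of migrating tasks instead of invoking a per-task hook. For a subsystem author the resulting shape is roughly the following; this is an illustrative sketch with a hypothetical subsystem and predicate, not code from this patch:

static int mysubsys_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			       struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* veto the whole migration if any task in the set is unsuitable */
	cgroup_taskset_for_each(task, cgrp, tset) {
		if (task_is_unsuitable(task))	/* hypothetical check */
			return -EBUSY;
	}
	return 0;
}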
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5ca38d5d238a..2060c6e57027 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -470,7 +470,7 @@ out:
470 cpu_maps_update_done(); 470 cpu_maps_update_done();
471} 471}
472 472
473static int alloc_frozen_cpus(void) 473static int __init alloc_frozen_cpus(void)
474{ 474{
475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
476 return -ENOMEM; 476 return -ENOMEM;
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
543} 543}
544 544
545 545
546int cpu_hotplug_pm_sync_init(void) 546static int __init cpu_hotplug_pm_sync_init(void)
547{ 547{
548 pm_notifier(cpu_hotplug_pm_callback, 0); 548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0; 549 return 0;
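Both functions above run exactly once from boot-time initcalls, so marking them __init lets their text be discarded with the rest of .init after boot. The usual pairing, sketched with a hypothetical function name:

static int __init my_pm_sync_init(void)
{
	/* one-shot setup; this code is freed once boot completes */
	return 0;
}
core_initcall(my_pm_sync_init);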
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b1712dba587..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1389,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
1389 return val; 1389 return val;
1390} 1390}
1391 1391
1392/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1393static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1394 struct task_struct *tsk)
1395{
1396 struct cpuset *cs = cgroup_cs(cont);
1397
1398 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1399 return -ENOSPC;
1400
1401 /*
1402 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1403 * cannot change their cpu affinity and isolating such threads by their
1404 * set of allowed nodes is unnecessary. Thus, cpusets are not
1405 * applicable for such threads. This prevents checking for success of
1406 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1407 * be changed.
1408 */
1409 if (tsk->flags & PF_THREAD_BOUND)
1410 return -EINVAL;
1411
1412 return 0;
1413}
1414
1415static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1416{
1417 return security_task_setscheduler(task);
1418}
1419
1420/* 1392/*
1421 * Protected by cgroup_lock. The nodemasks must be stored globally because 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because
1422 * dynamically allocating them is not allowed in pre_attach, and they must 1394 * dynamically allocating them is not allowed in can_attach, and they must
1423 * persist among pre_attach, attach_task, and attach. 1395 * persist until attach.
1424 */ 1396 */
1425static cpumask_var_t cpus_attach; 1397static cpumask_var_t cpus_attach;
1426static nodemask_t cpuset_attach_nodemask_from; 1398static nodemask_t cpuset_attach_nodemask_from;
1427static nodemask_t cpuset_attach_nodemask_to; 1399static nodemask_t cpuset_attach_nodemask_to;
1428 1400
1429/* Set-up work for before attaching each task. */ 1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1430static void cpuset_pre_attach(struct cgroup *cont) 1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1431{ 1404{
1432 struct cpuset *cs = cgroup_cs(cont); 1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413 /*
1414 * Kthreads bound to specific cpus cannot be moved to a new
1415 * cpuset; we cannot change their cpu affinity and
1416 * isolating such threads by their set of allowed nodes is
1417 * unnecessary. Thus, cpusets are not applicable for such
1418 * threads. This prevents checking for success of
1419 * set_cpus_allowed_ptr() on all attached tasks before
1420 * cpus_allowed may be changed.
1421 */
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1433 1427
1428 /* prepare for attach */
1434 if (cs == &top_cpuset) 1429 if (cs == &top_cpuset)
1435 cpumask_copy(cpus_attach, cpu_possible_mask); 1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1436 else 1431 else
1437 guarantee_online_cpus(cs, cpus_attach); 1432 guarantee_online_cpus(cs, cpus_attach);
1438 1433
1439 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1440}
1441
1442/* Per-thread attachment work. */
1443static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1444{
1445 int err;
1446 struct cpuset *cs = cgroup_cs(cont);
1447 1435
1448 /* 1436 return 0;
1449 * can_attach beforehand should guarantee that this doesn't fail.
1450 * TODO: have a better way to handle failure here
1451 */
1452 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1453 WARN_ON_ONCE(err);
1454
1455 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1456 cpuset_update_task_spread_flag(cs, tsk);
1457} 1437}
1458 1438
1459static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1460 struct cgroup *oldcont, struct task_struct *tsk) 1440 struct cgroup_taskset *tset)
1461{ 1441{
1462 struct mm_struct *mm; 1442 struct mm_struct *mm;
1463 struct cpuset *cs = cgroup_cs(cont); 1443 struct task_struct *task;
1464 struct cpuset *oldcs = cgroup_cs(oldcont); 1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450 /*
1451 * can_attach beforehand should guarantee that this doesn't
1452 * fail. TODO: have a better way to handle failure here
1453 */
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1465 1459
1466 /* 1460 /*
1467 * Change mm, possibly for multiple threads in a threadgroup. This is 1461 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1469,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1469 */ 1463 */
1470 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1471 cpuset_attach_nodemask_to = cs->mems_allowed; 1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1472 mm = get_task_mm(tsk); 1466 mm = get_task_mm(leader);
1473 if (mm) { 1467 if (mm) {
1474 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1475 if (is_memory_migrate(cs)) 1469 if (is_memory_migrate(cs))
@@ -1925,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
1925 .create = cpuset_create, 1919 .create = cpuset_create,
1926 .destroy = cpuset_destroy, 1920 .destroy = cpuset_destroy,
1927 .can_attach = cpuset_can_attach, 1921 .can_attach = cpuset_can_attach,
1928 .can_attach_task = cpuset_can_attach_task,
1929 .pre_attach = cpuset_pre_attach,
1930 .attach_task = cpuset_attach_task,
1931 .attach = cpuset_attach, 1922 .attach = cpuset_attach,
1932 .populate = cpuset_populate, 1923 .populate = cpuset_populate,
1933 .post_clone = cpuset_post_clone, 1924 .post_clone = cpuset_post_clone,
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1982 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod)); 1985 kdb_printf("%4ld ", module_refcount(mod));
1986#endif 1986#endif
1987 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 890eb02c2f21..a8f4ac001a00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -6941,10 +6941,13 @@ static int __perf_cgroup_move(void *info)
6941 return 0; 6941 return 0;
6942} 6942}
6943 6943
6944static void 6944static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
6945perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6945 struct cgroup_taskset *tset)
6946{ 6946{
6947 task_function_call(task, __perf_cgroup_move, task); 6947 struct task_struct *task;
6948
6949 cgroup_taskset_for_each(task, cgrp, tset)
6950 task_function_call(task, __perf_cgroup_move, task);
6948} 6951}
6949 6952
6950static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 6953static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -6958,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
6958 if (!(task->flags & PF_EXITING)) 6961 if (!(task->flags & PF_EXITING))
6959 return; 6962 return;
6960 6963
6961 perf_cgroup_attach_task(cgrp, task); 6964 task_function_call(task, __perf_cgroup_move, task);
6962} 6965}
6963 6966
6964struct cgroup_subsys perf_subsys = { 6967struct cgroup_subsys perf_subsys = {
@@ -6967,6 +6970,6 @@ struct cgroup_subsys perf_subsys = {
6967 .create = perf_cgroup_create, 6970 .create = perf_cgroup_create,
6968 .destroy = perf_cgroup_destroy, 6971 .destroy = perf_cgroup_destroy,
6969 .exit = perf_cgroup_exit, 6972 .exit = perf_cgroup_exit,
6970 .attach_task = perf_cgroup_attach_task, 6973 .attach = perf_cgroup_attach,
6971}; 6974};
6972#endif /* CONFIG_CGROUP_PERF */ 6975#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 7f3011c6b57f..6ddaba43fb7a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
diff --git a/kernel/exit.c b/kernel/exit.c
index d579a459309d..294b1709170d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk)
679 tsk->mm = NULL; 680 tsk->mm = NULL;
680 up_read(&mm->mmap_sem); 681 up_read(&mm->mmap_sem);
681 enter_lazy_tlb(mm, current); 682 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 task_unlock(tsk); 683 task_unlock(tsk);
685 mm_update_next_owner(mm); 684 mm_update_next_owner(mm);
686 mmput(mm); 685 mmput(mm);
@@ -888,7 +887,7 @@ static void check_stack_usage(void)
888static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
889#endif 888#endif
890 889
891NORET_TYPE void do_exit(long code) 890void do_exit(long code)
892{ 891{
893 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
894 int group_dead; 893 int group_dead;
@@ -965,8 +964,7 @@ NORET_TYPE void do_exit(long code)
965 acct_collect(code, group_dead); 964 acct_collect(code, group_dead);
966 if (group_dead) 965 if (group_dead)
967 tty_audit_exit(); 966 tty_audit_exit();
968 if (unlikely(tsk->audit_context)) 967 audit_free(tsk);
969 audit_free(tsk);
970 968
971 tsk->exit_code = code; 969 tsk->exit_code = code;
972 taskstats_exit(tsk, group_dead); 970 taskstats_exit(tsk, group_dead);
@@ -1037,9 +1035,12 @@ NORET_TYPE void do_exit(long code)
1037 validate_creds_for_do_exit(tsk); 1035 validate_creds_for_do_exit(tsk);
1038 1036
1039 preempt_disable(); 1037 preempt_disable();
1038 if (tsk->nr_dirtied)
1039 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1040 exit_rcu(); 1040 exit_rcu();
1041 /* causes final put_task_struct in finish_task_switch(). */ 1041 /* causes final put_task_struct in finish_task_switch(). */
1042 tsk->state = TASK_DEAD; 1042 tsk->state = TASK_DEAD;
1043 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
1043 schedule(); 1044 schedule();
1044 BUG(); 1045 BUG();
1045 /* Avoid "noreturn function does return". */ 1046 /* Avoid "noreturn function does return". */
@@ -1049,7 +1050,7 @@ NORET_TYPE void do_exit(long code)
1049 1050
1050EXPORT_SYMBOL_GPL(do_exit); 1051EXPORT_SYMBOL_GPL(do_exit);
1051 1052
1052NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1053void complete_and_exit(struct completion *comp, long code)
1053{ 1054{
1054 if (comp) 1055 if (comp)
1055 complete(comp); 1056 complete(comp);
@@ -1068,7 +1069,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1068 * Take down every thread in the group. This is called by fatal signals 1069 * Take down every thread in the group. This is called by fatal signals
1069 * as well as by sys_exit_group (below). 1070 * as well as by sys_exit_group (below).
1070 */ 1071 */
1071NORET_TYPE void 1072void
1072do_group_exit(int exit_code) 1073do_group_exit(int exit_code)
1073{ 1074{
1074 struct signal_struct *sig = current->signal; 1075 struct signal_struct *sig = current->signal;
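Dropping NORET_TYPE from the definitions above does not lose the noreturn property; the treewide cleanup in this merge expresses it on the prototypes instead, roughly in this style (shown for illustration, matching the corresponding sched.h declarations):

extern void do_exit(long error_code) __noreturn;
extern void do_group_exit(int exit_code) __noreturn;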
diff --git a/kernel/fork.c b/kernel/fork.c
index b058c5820ecd..051f090d40c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
76 76
77#include <trace/events/sched.h> 77#include <trace/events/sched.h>
78 78
79#define CREATE_TRACE_POINTS
80#include <trace/events/task.h>
81
79/* 82/*
80 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
81 */ 84 */
@@ -870,6 +873,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
870{ 873{
871#ifdef CONFIG_BLOCK 874#ifdef CONFIG_BLOCK
872 struct io_context *ioc = current->io_context; 875 struct io_context *ioc = current->io_context;
876 struct io_context *new_ioc;
873 877
874 if (!ioc) 878 if (!ioc)
875 return 0; 879 return 0;
@@ -881,11 +885,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
881 if (unlikely(!tsk->io_context)) 885 if (unlikely(!tsk->io_context))
882 return -ENOMEM; 886 return -ENOMEM;
883 } else if (ioprio_valid(ioc->ioprio)) { 887 } else if (ioprio_valid(ioc->ioprio)) {
884 tsk->io_context = alloc_io_context(GFP_KERNEL, -1); 888 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
885 if (unlikely(!tsk->io_context)) 889 if (unlikely(!new_ioc))
886 return -ENOMEM; 890 return -ENOMEM;
887 891
888 tsk->io_context->ioprio = ioc->ioprio; 892 new_ioc->ioprio = ioc->ioprio;
893 put_io_context(new_ioc, NULL);
889 } 894 }
890#endif 895#endif
891 return 0; 896 return 0;
@@ -972,7 +977,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
972 sched_autogroup_fork(sig); 977 sched_autogroup_fork(sig);
973 978
974#ifdef CONFIG_CGROUPS 979#ifdef CONFIG_CGROUPS
975 init_rwsem(&sig->threadgroup_fork_lock); 980 init_rwsem(&sig->group_rwsem);
976#endif 981#endif
977 982
978 sig->oom_adj = current->signal->oom_adj; 983 sig->oom_adj = current->signal->oom_adj;
@@ -992,7 +997,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
992 new_flags |= PF_FORKNOEXEC; 997 new_flags |= PF_FORKNOEXEC;
993 new_flags |= PF_STARTING; 998 new_flags |= PF_STARTING;
994 p->flags = new_flags; 999 p->flags = new_flags;
995 clear_freeze_flag(p);
996} 1000}
997 1001
998SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1002SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1154,7 +1158,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1154 p->io_context = NULL; 1158 p->io_context = NULL;
1155 p->audit_context = NULL; 1159 p->audit_context = NULL;
1156 if (clone_flags & CLONE_THREAD) 1160 if (clone_flags & CLONE_THREAD)
1157 threadgroup_fork_read_lock(current); 1161 threadgroup_change_begin(current);
1158 cgroup_fork(p); 1162 cgroup_fork(p);
1159#ifdef CONFIG_NUMA 1163#ifdef CONFIG_NUMA
1160 p->mempolicy = mpol_dup(p->mempolicy); 1164 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1292,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1292 1296
1293 p->nr_dirtied = 0; 1297 p->nr_dirtied = 0;
1294 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1299 p->dirty_paused_when = 0;
1295 1300
1296 /* 1301 /*
1297 * Ok, make it visible to the rest of the system. 1302 * Ok, make it visible to the rest of the system.
@@ -1369,8 +1374,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1369 proc_fork_connector(p); 1374 proc_fork_connector(p);
1370 cgroup_post_fork(p); 1375 cgroup_post_fork(p);
1371 if (clone_flags & CLONE_THREAD) 1376 if (clone_flags & CLONE_THREAD)
1372 threadgroup_fork_read_unlock(current); 1377 threadgroup_change_end(current);
1373 perf_event_fork(p); 1378 perf_event_fork(p);
1379
1380 trace_task_newtask(p, clone_flags);
1381
1374 return p; 1382 return p;
1375 1383
1376bad_fork_free_pid: 1384bad_fork_free_pid:
@@ -1404,7 +1412,7 @@ bad_fork_cleanup_policy:
1404bad_fork_cleanup_cgroup: 1412bad_fork_cleanup_cgroup:
1405#endif 1413#endif
1406 if (clone_flags & CLONE_THREAD) 1414 if (clone_flags & CLONE_THREAD)
1407 threadgroup_fork_read_unlock(current); 1415 threadgroup_change_end(current);
1408 cgroup_exit(p, cgroup_callbacks_done); 1416 cgroup_exit(p, cgroup_callbacks_done);
1409 delayacct_tsk_free(p); 1417 delayacct_tsk_free(p);
1410 module_put(task_thread_info(p)->exec_domain->module); 1418 module_put(task_thread_info(p)->exec_domain->module);
@@ -1519,8 +1527,6 @@ long do_fork(unsigned long clone_flags,
1519 init_completion(&vfork); 1527 init_completion(&vfork);
1520 } 1528 }
1521 1529
1522 audit_finish_fork(p);
1523
1524 /* 1530 /*
1525 * We set PF_STARTING at creation in case tracing wants to 1531 * We set PF_STARTING at creation in case tracing wants to
1526 * use this to distinguish a fully live task from one that 1532 * use this to distinguish a fully live task from one that
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7be56c534397..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -9,101 +9,114 @@
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
12 13
13/* 14/* total number of freezing conditions in effect */
14 * freezing is complete, mark current process as frozen 15atomic_t system_freezing_cnt = ATOMIC_INIT(0);
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
15 */ 33 */
16static inline void frozen_process(void) 34bool freezing_slow_path(struct task_struct *p)
17{ 35{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 36 if (p->flags & PF_NOFREEZE)
19 current->flags |= PF_FROZEN; 37 return false;
20 smp_wmb(); 38
21 } 39 if (pm_nosig_freezing || cgroup_freezing(p))
22 clear_freeze_flag(current); 40 return true;
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
23} 46}
47EXPORT_SYMBOL(freezing_slow_path);
24 48
25/* Refrigerator is place where frozen processes are stored :-). */ 49/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void) 50bool __refrigerator(bool check_kthr_stop)
27{ 51{
28 /* Hmm, should we be allowed to suspend when there are realtime 52 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */ 53 processes around? */
30 long save; 54 bool was_frozen = false;
55 long save = current->state;
31 56
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 57 pr_debug("%s entered refrigerator\n", current->comm);
42 58
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
50 for (;;) { 59 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE); 60 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current)) 61
62 spin_lock_irq(&freezer_lock);
63 current->flags |= PF_FROZEN;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68
69 if (!(current->flags & PF_FROZEN))
53 break; 70 break;
71 was_frozen = true;
54 schedule(); 72 schedule();
55 } 73 }
56 74
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
60 pr_debug("%s left refrigerator\n", current->comm); 75 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save); 76
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83
84 return was_frozen;
62} 85}
63EXPORT_SYMBOL(refrigerator); 86EXPORT_SYMBOL(__refrigerator);
64 87
65static void fake_signal_wake_up(struct task_struct *p) 88static void fake_signal_wake_up(struct task_struct *p)
66{ 89{
67 unsigned long flags; 90 unsigned long flags;
68 91
69 spin_lock_irqsave(&p->sighand->siglock, flags); 92 if (lock_task_sighand(p, &flags)) {
70 signal_wake_up(p, 0); 93 signal_wake_up(p, 0);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 94 unlock_task_sighand(p, &flags);
95 }
72} 96}
73 97
74/** 98/**
75 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
76 * @p: task to send the request to 100 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the 101 *
78 * PF_FREEZER_NOSIG flag unset 102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
79 * Return value: 'false', if @sig_only is set and the task has 103 * flag and either sending a fake signal to it or waking it up, depending
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise 104 * on whether it has %PF_FREEZER_NOSIG set.
81 * 105 *
82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and 106 * RETURNS:
83 * either sending a fake signal to it or waking it up, depending on whether 107 * %false, if @p is not freezing or already frozen; %true, otherwise
84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
86 * TIF_FREEZE flag will not be set.
87 */ 108 */
88bool freeze_task(struct task_struct *p, bool sig_only) 109bool freeze_task(struct task_struct *p)
89{ 110{
90 /* 111 unsigned long flags;
91 * We first check if the task is freezing and next if it has already 112
92 * been frozen to avoid the race with frozen_process() which first marks 113 spin_lock_irqsave(&freezer_lock, flags);
93 * the task as frozen and next clears its TIF_FREEZE. 114 if (!freezing(p) || frozen(p)) {
94 */ 115 spin_unlock_irqrestore(&freezer_lock, flags);
95 if (!freezing(p)) { 116 return false;
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
104 } 117 }
105 118
106 if (should_send_signal(p)) { 119 if (!(p->flags & PF_KTHREAD)) {
107 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
108 /* 121 /*
109 * fake_signal_wake_up() goes through p's scheduler 122 * fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
111 * TASK_RUNNING transition can't race with task state 124 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks(). 125 * testing in try_to_freeze_tasks().
113 */ 126 */
114 } else if (sig_only) {
115 return false;
116 } else { 127 } else {
117 wake_up_state(p, TASK_INTERRUPTIBLE); 128 wake_up_state(p, TASK_INTERRUPTIBLE);
118 } 129 }
119 130
131 spin_unlock_irqrestore(&freezer_lock, flags);
120 return true; 132 return true;
121} 133}
122 134
123void cancel_freezing(struct task_struct *p) 135void __thaw_task(struct task_struct *p)
124{ 136{
125 unsigned long flags; 137 unsigned long flags;
126 138
127 if (freezing(p)) { 139 /*
128 pr_debug(" clean up: %s\n", p->comm); 140 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
129 clear_freeze_flag(p); 141 * be visible to @p as waking up implies wmb. Waking up inside
130 spin_lock_irqsave(&p->sighand->siglock, flags); 142 * freezer_lock also prevents wakeups from leaking outside
131 recalc_sigpending_and_wake(p); 143 * refrigerator.
132 spin_unlock_irqrestore(&p->sighand->siglock, flags); 144 */
133 } 145 spin_lock_irqsave(&freezer_lock, flags);
134} 146 if (frozen(p))
135 147 wake_up_process(p);
136static int __thaw_process(struct task_struct *p) 148 spin_unlock_irqrestore(&freezer_lock, flags);
137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144} 149}
145 150
146/* 151/**
147 * Wake up a frozen process 152 * set_freezable - make %current freezable
148 * 153 *
149 * task_lock() is needed to prevent the race with refrigerator() which may 154 * Mark %current freezable and enter refrigerator if necessary.
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
154 */ 155 */
155int thaw_process(struct task_struct *p) 156bool set_freezable(void)
156{ 157{
157 task_lock(p); 158 might_sleep();
158 if (__thaw_process(p) == 1) { 159
159 task_unlock(p); 160 /*
160 wake_up_process(p); 161 * Modify flags while holding freezer_lock. This ensures the
161 return 1; 162 * freezer notices that we aren't frozen yet or the freezing
162 } 163 * condition is visible to try_to_freeze() below.
163 task_unlock(p); 164 */
164 return 0; 165 spin_lock_irq(&freezer_lock);
166 current->flags &= ~PF_NOFREEZE;
167 spin_unlock_irq(&freezer_lock);
168
169 return try_to_freeze();
165} 170}
166EXPORT_SYMBOL(thaw_process); 171EXPORT_SYMBOL(set_freezable);
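freezing_slow_path() is only reached when at least one freezing condition exists; the common case is short-circuited by the new system_freezing_cnt counter. The fast-path inline from this series' freezer.h, reproduced here for illustration:

static inline bool freezing(struct task_struct *p)
{
	if (likely(!atomic_read(&system_freezing_cnt)))
		return false;
	return freezing_slow_path(p);
}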
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372d..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern int noirqdebug; 18extern bool noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..1f9e26526b69 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
135 return -EINVAL; 135 return -EINVAL;
136 if (intsize < 1) 136 if (intsize < 1)
137 return -EINVAL; 137 return -EINVAL;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
139 (intspec[0] >= d->hwirq_base + d->nr_irq)))
140 return -EINVAL;
138 141
139 *out_hwirq = intspec[0]; 142 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE; 143 *out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 146 return 0;
144} 147}
145 148
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 149/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 150 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 151 */
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 180}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 181EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 182#endif /* CONFIG_OF_IRQ */
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1da999f5e746..a9a9dbe49fea 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1292,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
1292 * and to set up the interrupt handler in the right order. 1292 * and to set up the interrupt handler in the right order.
1293 * 1293 *
1294 * If you want to set up a threaded irq handler for your device 1294 * If you want to set up a threaded irq handler for your device
1295 * then you need to supply @handler and @thread_fn. @handler ist 1295 * then you need to supply @handler and @thread_fn. @handler is
1296 * still called in hard interrupt context and has to check 1296 * still called in hard interrupt context and has to check
1297 * whether the interrupt originates from the device. If yes it 1297 * whether the interrupt originates from the device. If yes it
1298 * needs to disable the interrupt on the device and return 1298 * needs to disable the interrupt on the device and return
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dc813a948be2..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
325 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
326} 326}
327 327
328int noirqdebug __read_mostly; 328bool noirqdebug __read_mostly;
329 329
330int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
331{ 331{
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 30c3c7708132..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key)
71 atomic_inc(&key->enabled); 71 atomic_inc(&key->enabled);
72 jump_label_unlock(); 72 jump_label_unlock();
73} 73}
74EXPORT_SYMBOL_GPL(jump_label_inc);
74 75
75static void __jump_label_dec(struct jump_label_key *key, 76static void __jump_label_dec(struct jump_label_key *key,
76 unsigned long rate_limit, struct delayed_work *work) 77 unsigned long rate_limit, struct delayed_work *work)
@@ -86,6 +87,7 @@ static void __jump_label_dec(struct jump_label_key *key,
86 87
87 jump_label_unlock(); 88 jump_label_unlock();
88} 89}
90EXPORT_SYMBOL_GPL(jump_label_dec);
89 91
90static void jump_label_update_timeout(struct work_struct *work) 92static void jump_label_update_timeout(struct work_struct *work)
91{ 93{
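The two new exports make jump labels usable from module code. A minimal consumer, using the jump_label_key API as it stands at this point (hypothetical key and helpers, illustrative only):

static struct jump_label_key my_key;	/* branch starts disabled */

static void my_fast_path(void)
{
	if (static_branch(&my_key))	/* a NOP until the key is enabled */
		handle_rare_case();	/* hypothetical slow path */
}

static void my_feature_set(bool on)
{
	if (on)
		jump_label_inc(&my_key);	/* patch branch sites in */
	else
		jump_label_dec(&my_key);
}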
diff --git a/kernel/kexec.c b/kernel/kexec.c
index dc7bc0829286..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1094 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1095 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1096 1095
1097 kmsg_dump(KMSG_DUMP_KEXEC);
1098
1099 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1100 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1101 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1132{ 1129{
1133 int ret = 0; 1130 int ret = 0;
1134 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1135 1134
1136 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1137 1136
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
1141 } 1140 }
1142 start = crashk_res.start; 1141 start = crashk_res.start;
1143 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1144 1148
1145 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1146 ret = -EINVAL; 1150 if (!ram_res) {
1147 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1148 ret = 0;
1149 goto unlock; 1152 goto unlock;
1150 } 1153 }
1151 1154
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
1157 1160
1158 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1159 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1160 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1170
1171 insert_resource(&iomem_resource, ram_res);
1161 crash_unmap_reserved_pages(); 1172 crash_unmap_reserved_pages();
1162 1173
1163unlock: 1174unlock:
@@ -1523,7 +1534,7 @@ int kernel_kexec(void)
1523 1534
1524#ifdef CONFIG_KEXEC_JUMP 1535#ifdef CONFIG_KEXEC_JUMP
1525 if (kexec_image->preserve_context) { 1536 if (kexec_image->preserve_context) {
1526 mutex_lock(&pm_mutex); 1537 lock_system_sleep();
1527 pm_prepare_console(); 1538 pm_prepare_console();
1528 error = freeze_processes(); 1539 error = freeze_processes();
1529 if (error) { 1540 if (error) {
@@ -1576,7 +1587,7 @@ int kernel_kexec(void)
1576 thaw_processes(); 1587 thaw_processes();
1577 Restore_console: 1588 Restore_console:
1578 pm_restore_console(); 1589 pm_restore_console();
1579 mutex_unlock(&pm_mutex); 1590 unlock_system_sleep();
1580 } 1591 }
1581#endif 1592#endif
1582 1593
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b6..a0a88543934e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <trace/events/module.h> 42#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 52static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock); 53static DEFINE_SPINLOCK(umh_sysctl_lock);
54static DECLARE_RWSEM(umhelper_sem);
53 55
54#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
55 57
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 277 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
276 * (used for preventing user land processes from being created after the user 278 * (used for preventing user land processes from being created after the user
277 * land has been frozen during a system-wide hibernation or suspend operation). 279 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write.
278 */ 281 */
279static int usermodehelper_disabled = 1; 282static int usermodehelper_disabled = 1;
280 283
@@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1;
282static atomic_t running_helpers = ATOMIC_INIT(0); 285static atomic_t running_helpers = ATOMIC_INIT(0);
283 286
284/* 287/*
285 * Wait queue head used by usermodehelper_pm_callback() to wait for all running 288 * Wait queue head used by usermodehelper_disable() to wait for all running
286 * helpers to finish. 289 * helpers to finish.
287 */ 290 */
288static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
289 292
290/* 293/*
291 * Time to wait for running_helpers to become zero before the setting of 294 * Time to wait for running_helpers to become zero before the setting of
292 * usermodehelper_disabled in usermodehelper_pm_callback() fails 295 * usermodehelper_disabled in usermodehelper_disable() fails
293 */ 296 */
294#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 297#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
295 298
299void read_lock_usermodehelper(void)
300{
301 down_read(&umhelper_sem);
302}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
304
305void read_unlock_usermodehelper(void)
306{
307 up_read(&umhelper_sem);
308}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
310
296/** 311/**
297 * usermodehelper_disable - prevent new helpers from being started 312 * usermodehelper_disable - prevent new helpers from being started
298 */ 313 */
@@ -300,8 +315,10 @@ int usermodehelper_disable(void)
300{ 315{
301 long retval; 316 long retval;
302 317
318 down_write(&umhelper_sem);
303 usermodehelper_disabled = 1; 319 usermodehelper_disabled = 1;
304 smp_mb(); 320 up_write(&umhelper_sem);
321
305 /* 322 /*
306 * From now on call_usermodehelper_exec() won't start any new 323 * From now on call_usermodehelper_exec() won't start any new
307 * helpers, so it is sufficient if running_helpers turns out to 324 * helpers, so it is sufficient if running_helpers turns out to
@@ -314,7 +331,9 @@ int usermodehelper_disable(void)
314 if (retval) 331 if (retval)
315 return 0; 332 return 0;
316 333
334 down_write(&umhelper_sem);
317 usermodehelper_disabled = 0; 335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
318 return -EAGAIN; 337 return -EAGAIN;
319} 338}
320 339
@@ -323,7 +342,9 @@ int usermodehelper_disable(void)
323 */ 342 */
324void usermodehelper_enable(void) 343void usermodehelper_enable(void)
325{ 344{
345 down_write(&umhelper_sem);
326 usermodehelper_disabled = 0; 346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
327} 348}
328 349
329/** 350/**
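The new read_lock_usermodehelper()/read_unlock_usermodehelper() pair lets a caller test usermodehelper_is_disabled() and issue its request without racing against usermodehelper_disable(); the firmware loader is the intended user. A sketch of the caller pattern (the helper invocation is hypothetical):

static int request_helper_safely(void)
{
	int err;

	read_lock_usermodehelper();
	if (usermodehelper_is_disabled())
		err = -EBUSY;		/* fail fast instead of hanging */
	else
		err = run_my_helper();	/* hypothetical UMH-backed call */
	read_unlock_usermodehelper();

	return err;
}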
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..95dd7212e610 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2199{
2200 char buf[32]; 2200 char buf[32];
2201 int buf_size; 2201 size_t buf_size;
2202 2202
2203 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6d216a92639..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
59EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
60 60
61/** 61/**
62 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 *
65 * kthread_should_stop() for freezable kthreads, which will enter
66 * refrigerator if necessary. This function is safe from kthread_stop() /
67 * freezer deadlock and freezable kthreads should use this function instead
68 * of calling try_to_freeze() directly.
69 */
70bool kthread_freezable_should_stop(bool *was_frozen)
71{
72 bool frozen = false;
73
74 might_sleep();
75
76 if (unlikely(freezing(current)))
77 frozen = __refrigerator(true);
78
79 if (was_frozen)
80 *was_frozen = frozen;
81
82 return kthread_should_stop();
83}
84EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
85
86/**
62 * kthread_data - return data value specified on kthread creation 87 * kthread_data - return data value specified on kthread creation
63 * @task: kthread task in question 88 * @task: kthread task in question
64 * 89 *
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
257 set_cpus_allowed_ptr(tsk, cpu_all_mask); 282 set_cpus_allowed_ptr(tsk, cpu_all_mask);
258 set_mems_allowed(node_states[N_HIGH_MEMORY]); 283 set_mems_allowed(node_states[N_HIGH_MEMORY]);
259 284
260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 285 current->flags |= PF_NOFREEZE;
261 286
262 for (;;) { 287 for (;;) {
263 set_current_state(TASK_INTERRUPTIBLE); 288 set_current_state(TASK_INTERRUPTIBLE);
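Together with set_freezable() from kernel/freezer.c above, the new helper gives freezable kthreads a single loop condition that handles both freezing and stopping. The expected main-loop shape (illustrative sketch; names are hypothetical):

static int my_worker(void *unused)
{
	set_freezable();	/* kthreads start PF_NOFREEZE; opt in here */

	while (!kthread_freezable_should_stop(NULL)) {
		/* ... one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}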
diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1e..2c932760fd33 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
62#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 63#include <trace/events/module.h>
64 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
71#ifndef ARCH_SHF_SMALL 65#ifndef ARCH_SHF_SMALL
72#define ARCH_SHF_SMALL 0 66#define ARCH_SHF_SMALL 0
73#endif 67#endif
@@ -138,7 +132,6 @@ struct load_info {
138 unsigned long len; 132 unsigned long len;
139 Elf_Shdr *sechdrs; 133 Elf_Shdr *sechdrs;
140 char *secstrings, *strtab; 134 char *secstrings, *strtab;
141 unsigned long *strmap;
142 unsigned long symoffs, stroffs; 135 unsigned long symoffs, stroffs;
143 struct _ddebug *debug; 136 struct _ddebug *debug;
144 unsigned int num_debug; 137 unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
410 return fsa.sym; 403 return fsa.sym;
411 } 404 }
412 405
413 DEBUGP("Failed to find symbol %s\n", name); 406 pr_debug("Failed to find symbol %s\n", name);
414 return NULL; 407 return NULL;
415} 408}
416EXPORT_SYMBOL_GPL(find_symbol); 409EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
600 593
601 list_for_each_entry(use, &b->source_list, source_list) { 594 list_for_each_entry(use, &b->source_list, source_list) {
602 if (use->source == a) { 595 if (use->source == a) {
603 DEBUGP("%s uses %s!\n", a->name, b->name); 596 pr_debug("%s uses %s!\n", a->name, b->name);
604 return 1; 597 return 1;
605 } 598 }
606 } 599 }
607 DEBUGP("%s does not use %s!\n", a->name, b->name); 600 pr_debug("%s does not use %s!\n", a->name, b->name);
608 return 0; 601 return 0;
609} 602}
610 603
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
619{ 612{
620 struct module_use *use; 613 struct module_use *use;
621 614
622 DEBUGP("Allocating new usage for %s.\n", a->name); 615 pr_debug("Allocating new usage for %s.\n", a->name);
623 use = kmalloc(sizeof(*use), GFP_ATOMIC); 616 use = kmalloc(sizeof(*use), GFP_ATOMIC);
624 if (!use) { 617 if (!use) {
625 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 618 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
663 mutex_lock(&module_mutex); 656 mutex_lock(&module_mutex);
664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 657 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
665 struct module *i = use->target; 658 struct module *i = use->target;
666 DEBUGP("%s unusing %s\n", mod->name, i->name); 659 pr_debug("%s unusing %s\n", mod->name, i->name);
667 module_put(i); 660 module_put(i);
668 list_del(&use->source_list); 661 list_del(&use->source_list);
669 list_del(&use->target_list); 662 list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
726 } 719 }
727} 720}
728 721
729unsigned int module_refcount(struct module *mod) 722unsigned long module_refcount(struct module *mod)
730{ 723{
731 unsigned int incs = 0, decs = 0; 724 unsigned long incs = 0, decs = 0;
732 int cpu; 725 int cpu;
733 726
734 for_each_possible_cpu(cpu) 727 for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
761 /* Since we might sleep for some time, release the mutex first */ 754 /* Since we might sleep for some time, release the mutex first */
762 mutex_unlock(&module_mutex); 755 mutex_unlock(&module_mutex);
763 for (;;) { 756 for (;;) {
764 DEBUGP("Looking at refcount...\n"); 757 pr_debug("Looking at refcount...\n");
765 set_current_state(TASK_UNINTERRUPTIBLE); 758 set_current_state(TASK_UNINTERRUPTIBLE);
766 if (module_refcount(mod) == 0) 759 if (module_refcount(mod) == 0)
767 break; 760 break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
804 if (mod->state != MODULE_STATE_LIVE) { 797 if (mod->state != MODULE_STATE_LIVE) {
805 /* FIXME: if (force), slam module count and wake up 798 /* FIXME: if (force), slam module count and wake up
806 waiter --RR */ 799 waiter --RR */
807 DEBUGP("%s already dying\n", mod->name); 800 pr_debug("%s already dying\n", mod->name);
808 ret = -EBUSY; 801 ret = -EBUSY;
809 goto out; 802 goto out;
810 } 803 }
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
854 struct module_use *use; 847 struct module_use *use;
855 int printed_something = 0; 848 int printed_something = 0;
856 849
857 seq_printf(m, " %u ", module_refcount(mod)); 850 seq_printf(m, " %lu ", module_refcount(mod));
858 851
859 /* Always include a trailing , so userspace can differentiate 852 /* Always include a trailing , so userspace can differentiate
860 between this and the old multi-field proc format. */ 853 between this and the old multi-field proc format. */
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
904static ssize_t show_refcnt(struct module_attribute *mattr, 897static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module_kobject *mk, char *buffer) 898 struct module_kobject *mk, char *buffer)
906{ 899{
907 return sprintf(buffer, "%u\n", module_refcount(mk->mod)); 900 return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
908} 901}
909 902
910static struct module_attribute refcnt = { 903static struct module_attribute modinfo_refcnt =
911 .attr = { .name = "refcnt", .mode = 0444 }, 904 __ATTR(refcnt, 0444, show_refcnt, NULL);
912 .show = show_refcnt,
913};
914 905
915void module_put(struct module *module) 906void module_put(struct module *module)
916{ 907{
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
951} 942}
952#endif /* CONFIG_MODULE_UNLOAD */ 943#endif /* CONFIG_MODULE_UNLOAD */
953 944
945static size_t module_flags_taint(struct module *mod, char *buf)
946{
947 size_t l = 0;
948
949 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
950 buf[l++] = 'P';
951 if (mod->taints & (1 << TAINT_OOT_MODULE))
952 buf[l++] = 'O';
953 if (mod->taints & (1 << TAINT_FORCED_MODULE))
954 buf[l++] = 'F';
955 if (mod->taints & (1 << TAINT_CRAP))
956 buf[l++] = 'C';
957 /*
958 * TAINT_FORCED_RMMOD: could be added.
959 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
960 * apply to modules.
961 */
962 return l;
963}
964
954static ssize_t show_initstate(struct module_attribute *mattr, 965static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module_kobject *mk, char *buffer) 966 struct module_kobject *mk, char *buffer)
956{ 967{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
970 return sprintf(buffer, "%s\n", state); 981 return sprintf(buffer, "%s\n", state);
971} 982}
972 983
973static struct module_attribute initstate = { 984static struct module_attribute modinfo_initstate =
974 .attr = { .name = "initstate", .mode = 0444 }, 985 __ATTR(initstate, 0444, show_initstate, NULL);
975 .show = show_initstate,
976};
977 986
978static ssize_t store_uevent(struct module_attribute *mattr, 987static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk, 988 struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
986 return count; 995 return count;
987} 996}
988 997
989struct module_attribute module_uevent = { 998struct module_attribute module_uevent =
990 .attr = { .name = "uevent", .mode = 0200 }, 999 __ATTR(uevent, 0200, NULL, store_uevent);
991 .store = store_uevent, 1000
992}; 1001static ssize_t show_coresize(struct module_attribute *mattr,
1002 struct module_kobject *mk, char *buffer)
1003{
1004 return sprintf(buffer, "%u\n", mk->mod->core_size);
1005}
1006
1007static struct module_attribute modinfo_coresize =
1008 __ATTR(coresize, 0444, show_coresize, NULL);
1009
1010static ssize_t show_initsize(struct module_attribute *mattr,
1011 struct module_kobject *mk, char *buffer)
1012{
1013 return sprintf(buffer, "%u\n", mk->mod->init_size);
1014}
1015
1016static struct module_attribute modinfo_initsize =
1017 __ATTR(initsize, 0444, show_initsize, NULL);
1018
1019static ssize_t show_taint(struct module_attribute *mattr,
1020 struct module_kobject *mk, char *buffer)
1021{
1022 size_t l;
1023
1024 l = module_flags_taint(mk->mod, buffer);
1025 buffer[l++] = '\n';
1026 return l;
1027}
1028
1029static struct module_attribute modinfo_taint =
1030 __ATTR(taint, 0444, show_taint, NULL);
993 1031
994static struct module_attribute *modinfo_attrs[] = { 1032static struct module_attribute *modinfo_attrs[] = {
1033 &module_uevent,
995 &modinfo_version, 1034 &modinfo_version,
996 &modinfo_srcversion, 1035 &modinfo_srcversion,
997 &initstate, 1036 &modinfo_initstate,
998 &module_uevent, 1037 &modinfo_coresize,
1038 &modinfo_initsize,
1039 &modinfo_taint,
999#ifdef CONFIG_MODULE_UNLOAD 1040#ifdef CONFIG_MODULE_UNLOAD
1000 &refcnt, 1041 &modinfo_refcnt,
1001#endif 1042#endif
1002 NULL, 1043 NULL,
1003}; 1044};
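
After this hunk every loaded module exposes coresize, initsize and taint under /sys/module/<name>/, alongside the existing refcnt and initstate. A small userspace sketch (hypothetical helper, not part of this patch) that reads the new coresize attribute:

        #include <stdio.h>

        /* Print the core allocation size of a loaded module. */
        static int print_coresize(const char *mod)
        {
                char path[128];
                unsigned long size;
                FILE *f;

                snprintf(path, sizeof(path), "/sys/module/%s/coresize", mod);
                f = fopen(path, "r");
                if (!f)
                        return -1;
                if (fscanf(f, "%lu", &size) == 1)
                        printf("%s: core %lu bytes\n", mod, size);
                fclose(f);
                return 0;
        }

        int main(void)
        {
                return print_coresize("ext4");  /* "ext4" is just an example */
        }
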
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
1057 1098
1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1099 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 1100 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 1101 pr_debug("Found checksum %lX vs module %lX\n",
1061 maybe_relocated(*crc, crc_owner), versions[i].crc); 1102 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 1103 goto bad_version;
1063 } 1104 }
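
From here on the patch converts module.c's private DEBUGP() macro to pr_debug(), which compiles away unless DEBUG is defined and can be toggled at runtime under CONFIG_DYNAMIC_DEBUG. The conventional companion, sketched below, is a pr_fmt() definition before the first #include so every message is prefixed consistently (crc_a/crc_b stand in for the real arguments):

        #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
        #include <linux/kernel.h>

        pr_debug("Found checksum %lX vs module %lX\n", crc_a, crc_b);

        /* With dynamic debug, messages from this file can be enabled at
         * runtime (assuming debugfs is mounted in the usual place):
         *   echo 'file module.c +p' > /sys/kernel/debug/dynamic_debug/control
         */
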
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1834 case SHN_COMMON: 1875 case SHN_COMMON:
1835 /* We compiled with -fno-common. These are not 1876 /* We compiled with -fno-common. These are not
1836 supposed to happen. */ 1877 supposed to happen. */
1837 DEBUGP("Common symbol: %s\n", name); 1878 pr_debug("Common symbol: %s\n", name);
1838 printk("%s: please compile with -fno-common\n", 1879 printk("%s: please compile with -fno-common\n",
1839 mod->name); 1880 mod->name);
1840 ret = -ENOEXEC; 1881 ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1842 1883
1843 case SHN_ABS: 1884 case SHN_ABS:
1844 /* Don't need to do anything */ 1885 /* Don't need to do anything */
1845 DEBUGP("Absolute symbol: 0x%08lx\n", 1886 pr_debug("Absolute symbol: 0x%08lx\n",
1846 (long)sym[i].st_value); 1887 (long)sym[i].st_value);
1847 break; 1888 break;
1848 1889
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1966 for (i = 0; i < info->hdr->e_shnum; i++) 2007 for (i = 0; i < info->hdr->e_shnum; i++)
1967 info->sechdrs[i].sh_entsize = ~0UL; 2008 info->sechdrs[i].sh_entsize = ~0UL;
1968 2009
1969 DEBUGP("Core section allocation order:\n"); 2010 pr_debug("Core section allocation order:\n");
1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2011 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1971 for (i = 0; i < info->hdr->e_shnum; ++i) { 2012 for (i = 0; i < info->hdr->e_shnum; ++i) {
1972 Elf_Shdr *s = &info->sechdrs[i]; 2013 Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1978 || strstarts(sname, ".init")) 2019 || strstarts(sname, ".init"))
1979 continue; 2020 continue;
1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 2021 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1981 DEBUGP("\t%s\n", name); 2022 pr_debug("\t%s\n", sname);
1982 } 2023 }
1983 switch (m) { 2024 switch (m) {
1984 case 0: /* executable */ 2025 case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1995 } 2036 }
1996 } 2037 }
1997 2038
1998 DEBUGP("Init section allocation order:\n"); 2039 pr_debug("Init section allocation order:\n");
1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2040 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2000 for (i = 0; i < info->hdr->e_shnum; ++i) { 2041 for (i = 0; i < info->hdr->e_shnum; ++i) {
2001 Elf_Shdr *s = &info->sechdrs[i]; 2042 Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2008 continue; 2049 continue;
2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2050 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2010 | INIT_OFFSET_MASK); 2051 | INIT_OFFSET_MASK);
2011 DEBUGP("\t%s\n", sname); 2052 pr_debug("\t%s\n", sname);
2012 } 2053 }
2013 switch (m) { 2054 switch (m) {
2014 case 0: /* executable */ 2055 case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2178 return true; 2219 return true;
2179} 2220}
2180 2221
2222/*
2223 * We only allocate and copy the strings needed by the parts of symtab
2224 * we keep. This is simple, but has the effect of making multiple
2225 * copies of duplicates. We could be more sophisticated, see
2226 * linux-kernel thread starting with
2227 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2228 */
2181static void layout_symtab(struct module *mod, struct load_info *info) 2229static void layout_symtab(struct module *mod, struct load_info *info)
2182{ 2230{
2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2231 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2184 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2232 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2185 const Elf_Sym *src; 2233 const Elf_Sym *src;
2186 unsigned int i, nsrc, ndst; 2234 unsigned int i, nsrc, ndst, strtab_size;
2187 2235
2188 /* Put symbol section at end of init part of module. */ 2236 /* Put symbol section at end of init part of module. */
2189 symsect->sh_flags |= SHF_ALLOC; 2237 symsect->sh_flags |= SHF_ALLOC;
2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2238 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2191 info->index.sym) | INIT_OFFSET_MASK; 2239 info->index.sym) | INIT_OFFSET_MASK;
2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); 2240 pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
2193 2241
2194 src = (void *)info->hdr + symsect->sh_offset; 2242 src = (void *)info->hdr + symsect->sh_offset;
2195 nsrc = symsect->sh_size / sizeof(*src); 2243 nsrc = symsect->sh_size / sizeof(*src);
2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2198 unsigned int j = src->st_name;
2199 2244
2200 while (!__test_and_set_bit(j, info->strmap) 2245 /* Compute total space required for the core symbols' strtab. */
2201 && info->strtab[j]) 2246 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
2202 ++j; 2247 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2203 ++ndst; 2248 strtab_size += strlen(&info->strtab[src->st_name]) + 1;
2249 ndst++;
2204 } 2250 }
2205 2251
2206 /* Append room for core symbols at end of core part. */ 2252 /* Append room for core symbols at end of core part. */
2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2253 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2254 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2255 mod->core_size += strtab_size;
2209 2256
2210 /* Put string table section at end of init part of module. */ 2257 /* Put string table section at end of init part of module. */
2211 strsect->sh_flags |= SHF_ALLOC; 2258 strsect->sh_flags |= SHF_ALLOC;
2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2259 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2213 info->index.str) | INIT_OFFSET_MASK; 2260 info->index.str) | INIT_OFFSET_MASK;
2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); 2261 pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2220} 2262}
2221 2263
2222static void add_kallsyms(struct module *mod, const struct load_info *info) 2264static void add_kallsyms(struct module *mod, const struct load_info *info)
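
The rewritten layout_symtab() drops the strmap bitmap that previously tracked which string-table bytes were referenced. It now sizes the core string table directly by summing the lengths of the strings of the symbols it keeps; as the new comment notes, duplicate names are stored more than once in exchange for the simpler bookkeeping. The sizing logic in isolation (sketch):

        /* Index 0 of an ELF string table is the mandatory empty string,
         * hence the initial size of 1.
         */
        for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
                if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
                        strtab_size += strlen(&info->strtab[src->st_name]) + 1;
                        ndst++;
                }
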
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2279 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2238 2280
2239 mod->core_symtab = dst = mod->module_core + info->symoffs; 2281 mod->core_symtab = dst = mod->module_core + info->symoffs;
2282 mod->core_strtab = s = mod->module_core + info->stroffs;
2240 src = mod->symtab; 2283 src = mod->symtab;
2241 *dst = *src; 2284 *dst = *src;
2285 *s++ = 0;
2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2286 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2287 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2244 continue; 2288 continue;
2289
2245 dst[ndst] = *src; 2290 dst[ndst] = *src;
2246 dst[ndst].st_name = bitmap_weight(info->strmap, 2291 dst[ndst++].st_name = s - mod->core_strtab;
2247 dst[ndst].st_name); 2292 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2248 ++ndst;
2249 } 2293 }
2250 mod->core_num_syms = ndst; 2294 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2256} 2295}
2257#else 2296#else
2258static inline void layout_symtab(struct module *mod, struct load_info *info) 2297static inline void layout_symtab(struct module *mod, struct load_info *info)
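
add_kallsyms() mirrors the change: instead of replaying the bitmap to compact the string table, it copies each kept symbol's name with strlcpy() and uses the return value (the length of the source string) to advance the write cursor past the copy and its terminating NUL. The string handling of the copy loop, reduced to a sketch:

        s = mod->core_strtab;
        *s++ = 0;                                   /* index 0: empty string */

        /* per kept symbol: */
        dst[ndst].st_name = s - mod->core_strtab;   /* offset of the new copy */
        s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;

st_name offsets are assigned as the strings are laid down, so the old bitmap_weight() recomputation disappears as well.
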
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
2621 mod->module_init = ptr; 2660 mod->module_init = ptr;
2622 2661
2623 /* Transfer each section which specifies SHF_ALLOC */ 2662 /* Transfer each section which specifies SHF_ALLOC */
2624 DEBUGP("final section addresses:\n"); 2663 pr_debug("final section addresses:\n");
2625 for (i = 0; i < info->hdr->e_shnum; i++) { 2664 for (i = 0; i < info->hdr->e_shnum; i++) {
2626 void *dest; 2665 void *dest;
2627 Elf_Shdr *shdr = &info->sechdrs[i]; 2666 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
2639 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2678 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2640 /* Update sh_addr to point to copy in image. */ 2679 /* Update sh_addr to point to copy in image. */
2641 shdr->sh_addr = (unsigned long)dest; 2680 shdr->sh_addr = (unsigned long)dest;
2642 DEBUGP("\t0x%lx %s\n", 2681 pr_debug("\t0x%lx %s\n",
2643 shdr->sh_addr, info->secstrings + shdr->sh_name); 2682 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
2644 } 2683 }
2645 2684
2646 return 0; 2685 return 0;
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
2742 this is done generically; there doesn't appear to be any 2781 this is done generically; there doesn't appear to be any
2743 special cases for the architectures. */ 2782 special cases for the architectures. */
2744 layout_sections(mod, info); 2783 layout_sections(mod, info);
2745
2746 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2747 * sizeof(long), GFP_KERNEL);
2748 if (!info->strmap) {
2749 err = -ENOMEM;
2750 goto free_percpu;
2751 }
2752 layout_symtab(mod, info); 2784 layout_symtab(mod, info);
2753 2785
2754 /* Allocate and move to the final place */ 2786 /* Allocate and move to the final place */
2755 err = move_module(mod, info); 2787 err = move_module(mod, info);
2756 if (err) 2788 if (err)
2757 goto free_strmap; 2789 goto free_percpu;
2758 2790
2759 /* Module has been copied to its final place now: return it. */ 2791 /* Module has been copied to its final place now: return it. */
2760 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2792 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2761 kmemleak_load_module(mod, info); 2793 kmemleak_load_module(mod, info);
2762 return mod; 2794 return mod;
2763 2795
2764free_strmap:
2765 kfree(info->strmap);
2766free_percpu: 2796free_percpu:
2767 percpu_modfree(mod); 2797 percpu_modfree(mod);
2768out: 2798out:
@@ -2772,7 +2802,6 @@ out:
2772/* mod is no longer valid after this! */ 2802/* mod is no longer valid after this! */
2773static void module_deallocate(struct module *mod, struct load_info *info) 2803static void module_deallocate(struct module *mod, struct load_info *info)
2774{ 2804{
2775 kfree(info->strmap);
2776 percpu_modfree(mod); 2805 percpu_modfree(mod);
2777 module_free(mod, mod->module_init); 2806 module_free(mod, mod->module_init);
2778 module_free(mod, mod->module_core); 2807 module_free(mod, mod->module_core);
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod,
2811 struct module *mod; 2840 struct module *mod;
2812 long err; 2841 long err;
2813 2842
2814 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2843 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
2815 umod, len, uargs); 2844 umod, len, uargs);
2816 2845
2817 /* Copy in the blobs from userspace, check they are vaguely sane. */ 2846 /* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod,
2902 if (err < 0) 2931 if (err < 0)
2903 goto unlink; 2932 goto unlink;
2904 2933
2905 /* Get rid of temporary copy and strmap. */ 2934 /* Get rid of temporary copy. */
2906 kfree(info.strmap);
2907 free_copy(&info); 2935 free_copy(&info);
2908 2936
2909 /* Done! */ 2937 /* Done! */
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
3256 mod->state == MODULE_STATE_GOING || 3284 mod->state == MODULE_STATE_GOING ||
3257 mod->state == MODULE_STATE_COMING) { 3285 mod->state == MODULE_STATE_COMING) {
3258 buf[bx++] = '('; 3286 buf[bx++] = '(';
3259 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3287 bx += module_flags_taint(mod, buf + bx);
3260 buf[bx++] = 'P';
3261 else if (mod->taints & (1 << TAINT_OOT_MODULE))
3262 buf[bx++] = 'O';
3263 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3264 buf[bx++] = 'F';
3265 if (mod->taints & (1 << TAINT_CRAP))
3266 buf[bx++] = 'C';
3267 /*
3268 * TAINT_FORCED_RMMOD: could be added.
3269 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3270 * apply to modules.
3271 */
3272
3273 /* Show a - for module-is-being-unloaded */ 3288 /* Show a - for module-is-being-unloaded */
3274 if (mod->state == MODULE_STATE_GOING) 3289 if (mod->state == MODULE_STATE_GOING)
3275 buf[bx++] = '-'; 3290 buf[bx++] = '-';
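
Beyond the deduplication, routing /proc/modules through module_flags_taint() carries a subtle behavioral fix. Compare the two versions (sketch):

        /* Old /proc/modules logic: */
        if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
                buf[bx++] = 'P';
        else if (mod->taints & (1 << TAINT_OOT_MODULE))
                buf[bx++] = 'O';

        /* module_flags_taint() tests each bit independently, so a module
         * that is both proprietary and out-of-tree now reads "PO"
         * instead of just "P".
         */
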
diff --git a/kernel/panic.c b/kernel/panic.c
index 3458469eb7c3..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
53 * Stop ourselves in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
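
panic_smp_self_stop() is a __weak symbol, so an architecture can replace the default busy-wait simply by providing a non-weak definition; the linker prefers the strong symbol. A hedged sketch of what such an override might look like (arch_halt_cpu() is a stand-in for whatever low-power wait the architecture provides):

        /* arch/<arch>/kernel/process.c -- hypothetical override */
        void panic_smp_self_stop(void)
        {
                local_irq_disable();
                while (1)
                        arch_halt_cpu();    /* stand-in low-power wait */
        }
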
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
84 * stop themselves or wait until they are stopped by the first CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
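
The panic_lock trylock is the classic first-CPU-wins gate: the spinlock is taken once and deliberately never released, so any other CPU that panics concurrently fails the trylock and parks itself in panic_smp_self_stop() instead of interleaving a second panic over the first. Reduced to its skeleton:

        static DEFINE_SPINLOCK(panic_lock);

        if (!spin_trylock(&panic_lock))
                panic_smp_self_stop();      /* losing CPUs stop here forever */
        /* winner proceeds; the lock is intentionally never unlocked */
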
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93f..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
34/* Protects all parameters, and incidentally kmalloced_param list. */ 28/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock); 29static DEFINE_MUTEX(param_lock);
36 30
@@ -105,7 +99,7 @@ static int parse_one(char *param,
105 /* No one handled NULL, so do it here. */ 99 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool) 100 if (!val && params[i].ops->set != param_set_bool)
107 return -EINVAL; 101 return -EINVAL;
108 DEBUGP("They are equal! Calling %p\n", 102 pr_debug("They are equal! Calling %p\n",
109 params[i].ops->set); 103 params[i].ops->set);
110 mutex_lock(&param_lock); 104 mutex_lock(&param_lock);
111 err = params[i].ops->set(val, &params[i]); 105 err = params[i].ops->set(val, &params[i]);
@@ -115,11 +109,11 @@ static int parse_one(char *param,
115 } 109 }
116 110
117 if (handle_unknown) { 111 if (handle_unknown) {
118 DEBUGP("Unknown argument: calling %p\n", handle_unknown); 112 pr_debug("Unknown argument: calling %p\n", handle_unknown);
119 return handle_unknown(param, val); 113 return handle_unknown(param, val);
120 } 114 }
121 115
122 DEBUGP("Unknown argument `%s'\n", param); 116 pr_debug("Unknown argument `%s'\n", param);
123 return -ENOENT; 117 return -ENOENT;
124} 118}
125 119
@@ -184,7 +178,7 @@ int parse_args(const char *name,
184{ 178{
185 char *param, *val; 179 char *param, *val;
186 180
187 DEBUGP("Parsing ARGS: %s\n", args); 181 pr_debug("Parsing ARGS: %s\n", args);
188 182
189 /* Chew leading spaces */ 183 /* Chew leading spaces */
190 args = skip_spaces(args); 184 args = skip_spaces(args);
@@ -369,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
369}; 363};
370EXPORT_SYMBOL(param_ops_invbool); 364EXPORT_SYMBOL(param_ops_invbool);
371 365
366int param_set_bint(const char *val, const struct kernel_param *kp)
367{
368 struct kernel_param boolkp;
369 bool v;
370 int ret;
371
372 /* Match bool exactly, by re-using it. */
373 boolkp = *kp;
374 boolkp.arg = &v;
375 boolkp.flags |= KPARAM_ISBOOL;
376
377 ret = param_set_bool(val, &boolkp);
378 if (ret == 0)
379 *(int *)kp->arg = v;
380 return ret;
381}
382EXPORT_SYMBOL(param_set_bint);
383
384struct kernel_param_ops param_ops_bint = {
385 .set = param_set_bint,
386 .get = param_get_int,
387};
388EXPORT_SYMBOL(param_ops_bint);
389
372/* We break the rule and mangle the string. */ 390/* We break the rule and mangle the string. */
373static int param_array(const char *name, 391static int param_array(const char *name,
374 const char *val, 392 const char *val,
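
The new 'bint' parameter type parses like a bool (it reuses param_set_bool(), so it accepts y/n/Y/N/1/0 and rejects anything else) but stores the result in a plain int, letting existing int-typed flags gain strict bool syntax without a type change. Usage sketch (hypothetical module parameter):

        static int use_feature = 1;

        /* Parsed with bool semantics, stored as an int. */
        module_param(use_feature, bint, 0444);
        MODULE_PARM_DESC(use_feature, "enable the feature");
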
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
204 * is volatile in a living namespace anyway and code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
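
The resulting /proc/sys/kernel/ns_last_pid file reads and writes the pid namespace of the calling process, which is what checkpoint/restore tools need to recreate tasks with predetermined pids: writing N makes the next fork() in that namespace try pid N+1, subject to the race noted in kernel/pid.c above. A userspace sketch, assuming CAP_SYS_ADMIN in the target namespace:

        #include <stdio.h>
        #include <stdlib.h>

        /* Steer the current pid namespace's allocator. */
        static int set_ns_last_pid(int last)
        {
                FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");

                if (!f)
                        return -1;
                fprintf(f, "%d", last);
                return fclose(f);
        }

        int main(int argc, char **argv)
        {
                return (argc > 1) ? set_ns_last_pid(atoi(argv[1])) : -1;
        }
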
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a6b0503574ee..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -43,8 +43,6 @@ int in_suspend __nosavedata;
43enum { 43enum {
44 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
45 HIBERNATION_PLATFORM, 45 HIBERNATION_PLATFORM,
46 HIBERNATION_TEST,
47 HIBERNATION_TESTPROC,
48 HIBERNATION_SHUTDOWN, 46 HIBERNATION_SHUTDOWN,
49 HIBERNATION_REBOOT, 47 HIBERNATION_REBOOT,
50 /* keep last */ 48 /* keep last */
@@ -55,7 +53,7 @@ enum {
55 53
56static int hibernation_mode = HIBERNATION_SHUTDOWN; 54static int hibernation_mode = HIBERNATION_SHUTDOWN;
57 55
58static bool freezer_test_done; 56bool freezer_test_done;
59 57
60static const struct platform_hibernation_ops *hibernation_ops; 58static const struct platform_hibernation_ops *hibernation_ops;
61 59
@@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
71 WARN_ON(1); 69 WARN_ON(1);
72 return; 70 return;
73 } 71 }
74 mutex_lock(&pm_mutex); 72 lock_system_sleep();
75 hibernation_ops = ops; 73 hibernation_ops = ops;
76 if (ops) 74 if (ops)
77 hibernation_mode = HIBERNATION_PLATFORM; 75 hibernation_mode = HIBERNATION_PLATFORM;
78 else if (hibernation_mode == HIBERNATION_PLATFORM) 76 else if (hibernation_mode == HIBERNATION_PLATFORM)
79 hibernation_mode = HIBERNATION_SHUTDOWN; 77 hibernation_mode = HIBERNATION_SHUTDOWN;
80 78
81 mutex_unlock(&pm_mutex); 79 unlock_system_sleep();
82} 80}
83 81
84static bool entering_platform_hibernation; 82static bool entering_platform_hibernation;
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)
96 mdelay(5000); 94 mdelay(5000);
97} 95}
98 96
99static int hibernation_testmode(int mode)
100{
101 if (hibernation_mode == mode) {
102 hibernation_debug_sleep();
103 return 1;
104 }
105 return 0;
106}
107
108static int hibernation_test(int level) 97static int hibernation_test(int level)
109{ 98{
110 if (pm_test_level == level) { 99 if (pm_test_level == level) {
@@ -114,7 +103,6 @@ static int hibernation_test(int level)
114 return 0; 103 return 0;
115} 104}
116#else /* !CONFIG_PM_DEBUG */ 105#else /* !CONFIG_PM_DEBUG */
117static int hibernation_testmode(int mode) { return 0; }
118static int hibernation_test(int level) { return 0; } 106static int hibernation_test(int level) { return 0; }
119#endif /* !CONFIG_PM_DEBUG */ 107#endif /* !CONFIG_PM_DEBUG */
120 108
@@ -278,8 +266,7 @@ static int create_image(int platform_mode)
278 goto Platform_finish; 266 goto Platform_finish;
279 267
280 error = disable_nonboot_cpus(); 268 error = disable_nonboot_cpus();
281 if (error || hibernation_test(TEST_CPUS) 269 if (error || hibernation_test(TEST_CPUS))
282 || hibernation_testmode(HIBERNATION_TEST))
283 goto Enable_cpus; 270 goto Enable_cpus;
284 271
285 local_irq_disable(); 272 local_irq_disable();
@@ -333,7 +320,7 @@ static int create_image(int platform_mode)
333 */ 320 */
334int hibernation_snapshot(int platform_mode) 321int hibernation_snapshot(int platform_mode)
335{ 322{
336 pm_message_t msg = PMSG_RECOVER; 323 pm_message_t msg;
337 int error; 324 int error;
338 325
339 error = platform_begin(platform_mode); 326 error = platform_begin(platform_mode);
@@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode)
349 if (error) 336 if (error)
350 goto Cleanup; 337 goto Cleanup;
351 338
352 if (hibernation_test(TEST_FREEZER) || 339 if (hibernation_test(TEST_FREEZER)) {
353 hibernation_testmode(HIBERNATION_TESTPROC)) {
354 340
355 /* 341 /*
356 * Indicate to the caller that we are returning due to a 342 * Indicate to the caller that we are returning due to a
@@ -362,26 +348,26 @@ int hibernation_snapshot(int platform_mode)
362 348
363 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
364 if (error) { 350 if (error) {
365 dpm_complete(msg); 351 dpm_complete(PMSG_RECOVER);
366 goto Cleanup; 352 goto Cleanup;
367 } 353 }
368 354
369 suspend_console(); 355 suspend_console();
370 pm_restrict_gfp_mask(); 356 pm_restrict_gfp_mask();
357
371 error = dpm_suspend(PMSG_FREEZE); 358 error = dpm_suspend(PMSG_FREEZE);
372 if (error)
373 goto Recover_platform;
374 359
375 if (hibernation_test(TEST_DEVICES)) 360 if (error || hibernation_test(TEST_DEVICES))
376 goto Recover_platform; 361 platform_recover(platform_mode);
362 else
363 error = create_image(platform_mode);
377 364
378 error = create_image(platform_mode);
379 /* 365 /*
380 * Control returns here (1) after the image has been created or the 366 * In the case that we call create_image() above, the control
367 * returns here (1) after the image has been created or the
381 * image creation has failed and (2) after a successful restore. 368 * image creation has failed and (2) after a successful restore.
382 */ 369 */
383 370
384 Resume_devices:
385 /* We may need to release the preallocated image pages here. */ 371 /* We may need to release the preallocated image pages here. */
386 if (error || !in_suspend) 372 if (error || !in_suspend)
387 swsusp_free(); 373 swsusp_free();
@@ -399,10 +385,6 @@ int hibernation_snapshot(int platform_mode)
399 platform_end(platform_mode); 385 platform_end(platform_mode);
400 return error; 386 return error;
401 387
402 Recover_platform:
403 platform_recover(platform_mode);
404 goto Resume_devices;
405
406 Cleanup: 388 Cleanup:
407 swsusp_free(); 389 swsusp_free();
408 goto Close; 390 goto Close;
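
The net effect on hibernation_snapshot()'s error handling: the Recover_platform label and its backward goto disappear, and the device-test and suspend-failure cases fold into one forward-flowing branch. The resulting shape, as a sketch:

        error = dpm_suspend(PMSG_FREEZE);
        if (error || hibernation_test(TEST_DEVICES))
                platform_recover(platform_mode);    /* old Recover_platform */
        else
                error = create_image(platform_mode);
        /* both paths fall through to resuming devices */
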
@@ -590,9 +572,6 @@ int hibernation_platform_enter(void)
590static void power_down(void) 572static void power_down(void)
591{ 573{
592 switch (hibernation_mode) { 574 switch (hibernation_mode) {
593 case HIBERNATION_TEST:
594 case HIBERNATION_TESTPROC:
595 break;
596 case HIBERNATION_REBOOT: 575 case HIBERNATION_REBOOT:
597 kernel_restart(NULL); 576 kernel_restart(NULL);
598 break; 577 break;
@@ -611,17 +590,6 @@ static void power_down(void)
611 while(1); 590 while(1);
612} 591}
613 592
614static int prepare_processes(void)
615{
616 int error = 0;
617
618 if (freeze_processes()) {
619 error = -EBUSY;
620 thaw_processes();
621 }
622 return error;
623}
624
625/** 593/**
626 * hibernate - Carry out system hibernation, including saving the image. 594 * hibernate - Carry out system hibernation, including saving the image.
627 */ 595 */
@@ -629,7 +597,7 @@ int hibernate(void)
629{ 597{
630 int error; 598 int error;
631 599
632 mutex_lock(&pm_mutex); 600 lock_system_sleep();
633 /* The snapshot device should not be opened while we're running */ 601 /* The snapshot device should not be opened while we're running */
634 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 602 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
635 error = -EBUSY; 603 error = -EBUSY;
@@ -654,7 +622,7 @@ int hibernate(void)
654 sys_sync(); 622 sys_sync();
655 printk("done.\n"); 623 printk("done.\n");
656 624
657 error = prepare_processes(); 625 error = freeze_processes();
658 if (error) 626 if (error)
659 goto Finish; 627 goto Finish;
660 628
@@ -697,7 +665,7 @@ int hibernate(void)
697 pm_restore_console(); 665 pm_restore_console();
698 atomic_inc(&snapshot_device_available); 666 atomic_inc(&snapshot_device_available);
699 Unlock: 667 Unlock:
700 mutex_unlock(&pm_mutex); 668 unlock_system_sleep();
701 return error; 669 return error;
702} 670}
703 671
@@ -811,11 +779,13 @@ static int software_resume(void)
811 goto close_finish; 779 goto close_finish;
812 780
813 error = create_basic_memory_bitmaps(); 781 error = create_basic_memory_bitmaps();
814 if (error) 782 if (error) {
783 usermodehelper_enable();
815 goto close_finish; 784 goto close_finish;
785 }
816 786
817 pr_debug("PM: Preparing processes for restore.\n"); 787 pr_debug("PM: Preparing processes for restore.\n");
818 error = prepare_processes(); 788 error = freeze_processes();
819 if (error) { 789 if (error) {
820 swsusp_close(FMODE_READ); 790 swsusp_close(FMODE_READ);
821 goto Done; 791 goto Done;
@@ -855,8 +825,6 @@ static const char * const hibernation_modes[] = {
855 [HIBERNATION_PLATFORM] = "platform", 825 [HIBERNATION_PLATFORM] = "platform",
856 [HIBERNATION_SHUTDOWN] = "shutdown", 826 [HIBERNATION_SHUTDOWN] = "shutdown",
857 [HIBERNATION_REBOOT] = "reboot", 827 [HIBERNATION_REBOOT] = "reboot",
858 [HIBERNATION_TEST] = "test",
859 [HIBERNATION_TESTPROC] = "testproc",
860}; 828};
861 829
862/* 830/*
@@ -865,17 +833,15 @@ static const char * const hibernation_modes[] = {
865 * Hibernation can be handled in several ways. There are a few different ways 833 * Hibernation can be handled in several ways. There are a few different ways
866 * to put the system into the sleep state: using the platform driver (e.g. ACPI 834 * to put the system into the sleep state: using the platform driver (e.g. ACPI
867 * or other hibernation_ops), powering it off or rebooting it (for testing 835 * or other hibernation_ops), powering it off or rebooting it (for testing
868 * mostly), or using one of the two available test modes. 836 * mostly).
869 * 837 *
870 * The sysfs file /sys/power/disk provides an interface for selecting the 838 * The sysfs file /sys/power/disk provides an interface for selecting the
871 * hibernation mode to use. Reading from this file causes the available modes 839 * hibernation mode to use. Reading from this file causes the available modes
872 * to be printed. There are 5 modes that can be supported: 840 * to be printed. There are 3 modes that can be supported:
873 * 841 *
874 * 'platform' 842 * 'platform'
875 * 'shutdown' 843 * 'shutdown'
876 * 'reboot' 844 * 'reboot'
877 * 'test'
878 * 'testproc'
879 * 845 *
880 * If a platform hibernation driver is in use, 'platform' will be supported 846 * If a platform hibernation driver is in use, 'platform' will be supported
881 * and will be used by default. Otherwise, 'shutdown' will be used by default. 847 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -899,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
899 switch (i) { 865 switch (i) {
900 case HIBERNATION_SHUTDOWN: 866 case HIBERNATION_SHUTDOWN:
901 case HIBERNATION_REBOOT: 867 case HIBERNATION_REBOOT:
902 case HIBERNATION_TEST:
903 case HIBERNATION_TESTPROC:
904 break; 868 break;
905 case HIBERNATION_PLATFORM: 869 case HIBERNATION_PLATFORM:
906 if (hibernation_ops) 870 if (hibernation_ops)
@@ -929,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
929 p = memchr(buf, '\n', n); 893 p = memchr(buf, '\n', n);
930 len = p ? p - buf : n; 894 len = p ? p - buf : n;
931 895
932 mutex_lock(&pm_mutex); 896 lock_system_sleep();
933 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 897 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
934 if (len == strlen(hibernation_modes[i]) 898 if (len == strlen(hibernation_modes[i])
935 && !strncmp(buf, hibernation_modes[i], len)) { 899 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -941,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
941 switch (mode) { 905 switch (mode) {
942 case HIBERNATION_SHUTDOWN: 906 case HIBERNATION_SHUTDOWN:
943 case HIBERNATION_REBOOT: 907 case HIBERNATION_REBOOT:
944 case HIBERNATION_TEST:
945 case HIBERNATION_TESTPROC:
946 hibernation_mode = mode; 908 hibernation_mode = mode;
947 break; 909 break;
948 case HIBERNATION_PLATFORM: 910 case HIBERNATION_PLATFORM:
@@ -957,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
957 if (!error) 919 if (!error)
958 pr_debug("PM: Hibernation mode set to '%s'\n", 920 pr_debug("PM: Hibernation mode set to '%s'\n",
959 hibernation_modes[mode]); 921 hibernation_modes[mode]);
960 mutex_unlock(&pm_mutex); 922 unlock_system_sleep();
961 return error ? error : n; 923 return error ? error : n;
962} 924}
963 925
@@ -984,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
984 if (maj != MAJOR(res) || min != MINOR(res)) 946 if (maj != MAJOR(res) || min != MINOR(res))
985 goto out; 947 goto out;
986 948
987 mutex_lock(&pm_mutex); 949 lock_system_sleep();
988 swsusp_resume_device = res; 950 swsusp_resume_device = res;
989 mutex_unlock(&pm_mutex); 951 unlock_system_sleep();
990 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 952 printk(KERN_INFO "PM: Starting manual resume from disk\n");
991 noresume = 0; 953 noresume = 0;
992 software_resume(); 954 software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 36e0f0903c32..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
116 p = memchr(buf, '\n', n); 116 p = memchr(buf, '\n', n);
117 len = p ? p - buf : n; 117 len = p ? p - buf : n;
118 118
119 mutex_lock(&pm_mutex); 119 lock_system_sleep();
120 120
121 level = TEST_FIRST; 121 level = TEST_FIRST;
122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
126 break; 126 break;
127 } 127 }
128 128
129 mutex_unlock(&pm_mutex); 129 unlock_system_sleep();
130 130
131 return error ? error : n; 131 return error ? error : n;
132} 132}
@@ -240,7 +240,7 @@ struct kobject *power_kobj;
240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
241 * 'disk' (Suspend-to-Disk). 241 * 'disk' (Suspend-to-Disk).
242 * 242 *
243 * store() accepts one of those strings, translates it into the 243 * store() accepts one of those strings, translates it into the
244 * proper enumerated value, and initiates a suspend transition. 244 * proper enumerated value, and initiates a suspend transition.
245 */ 245 */
246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
282 /* First, check if we are requested to hibernate */ 282 /* First, check if we are requested to hibernate */
283 if (len == 4 && !strncmp(buf, "disk", len)) { 283 if (len == 4 && !strncmp(buf, "disk", len)) {
284 error = hibernate(); 284 error = hibernate();
285 goto Exit; 285 goto Exit;
286 } 286 }
287 287
288#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 23a2db1ec442..0c4defe6d3b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
53extern int hibernation_snapshot(int platform_mode); 55extern int hibernation_snapshot(int platform_mode);
54extern int hibernation_restore(int platform_mode); 56extern int hibernation_restore(int platform_mode);
55extern int hibernation_platform_enter(void); 57extern int hibernation_platform_enter(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index addbbe5531bc..77274c9ba2f1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezable(struct task_struct * p) 25static int try_to_freeze_tasks(bool user_only)
26{
27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) ||
29 (p->exit_state != 0))
30 return 0;
31 return 1;
32}
33
34static int try_to_freeze_tasks(bool sig_only)
35{ 26{
36 struct task_struct *g, *p; 27 struct task_struct *g, *p;
37 unsigned long end_time; 28 unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
46 37
47 end_time = jiffies + TIMEOUT; 38 end_time = jiffies + TIMEOUT;
48 39
49 if (!sig_only) 40 if (!user_only)
50 freeze_workqueues_begin(); 41 freeze_workqueues_begin();
51 42
52 while (true) { 43 while (true) {
53 todo = 0; 44 todo = 0;
54 read_lock(&tasklist_lock); 45 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 46 do_each_thread(g, p) {
56 if (frozen(p) || !freezable(p)) 47 if (p == current || !freeze_task(p))
57 continue;
58
59 if (!freeze_task(p, sig_only))
60 continue; 48 continue;
61 49
62 /* 50 /*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
77 } while_each_thread(g, p); 65 } while_each_thread(g, p);
78 read_unlock(&tasklist_lock); 66 read_unlock(&tasklist_lock);
79 67
80 if (!sig_only) { 68 if (!user_only) {
81 wq_busy = freeze_workqueues_busy(); 69 wq_busy = freeze_workqueues_busy();
82 todo += wq_busy; 70 todo += wq_busy;
83 } 71 }
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
103 elapsed_csecs = elapsed_csecs64; 91 elapsed_csecs = elapsed_csecs64;
104 92
105 if (todo) { 93 if (todo) {
106 /* This does not unfreeze processes that are already frozen
107 * (we have slightly ugly calling convention in that respect,
108 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests.
110 */
111 printk("\n"); 94 printk("\n");
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 95 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 96 "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
115 elapsed_csecs / 100, elapsed_csecs % 100, 98 elapsed_csecs / 100, elapsed_csecs % 100,
116 todo - wq_busy, wq_busy); 99 todo - wq_busy, wq_busy);
117 100
118 thaw_workqueues();
119
120 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 102 do_each_thread(g, p) {
122 task_lock(p); 103 if (!wakeup && !freezer_should_skip(p) &&
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 104 p != current && freezing(p) && !frozen(p))
124 sched_show_task(p); 105 sched_show_task(p);
125 cancel_freezing(p);
126 task_unlock(p);
127 } while_each_thread(g, p); 106 } while_each_thread(g, p);
128 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
129 } else { 108 } else {
@@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only)
136 115
137/** 116/**
138 * freeze_processes - Signal user space processes to enter the refrigerator. 117 * freeze_processes - Signal user space processes to enter the refrigerator.
118 *
119 * On success, returns 0. On failure, -errno and system is fully thawed.
139 */ 120 */
140int freeze_processes(void) 121int freeze_processes(void)
141{ 122{
142 int error; 123 int error;
143 124
125 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt);
127
144 printk("Freezing user space processes ... "); 128 printk("Freezing user space processes ... ");
129 pm_freezing = true;
145 error = try_to_freeze_tasks(true); 130 error = try_to_freeze_tasks(true);
146 if (!error) { 131 if (!error) {
147 printk("done."); 132 printk("done.");
@@ -150,17 +135,22 @@ int freeze_processes(void)
150 printk("\n"); 135 printk("\n");
151 BUG_ON(in_atomic()); 136 BUG_ON(in_atomic());
152 137
138 if (error)
139 thaw_processes();
153 return error; 140 return error;
154} 141}
155 142
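
The key contract change, stated in the new kernel-doc: on failure both freeze_processes() and freeze_kernel_threads() leave the system fully thawed. That is what allowed hibernate() and software_resume() above to drop their prepare_processes() wrapper; callers now reduce to:

        error = freeze_processes();
        if (error)
                goto Finish;    /* nothing to undo: everything was thawed */
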
156/** 143/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. 144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 *
146 * On success, returns 0. On failure, -errno and system is fully thawed.
158 */ 147 */
159int freeze_kernel_threads(void) 148int freeze_kernel_threads(void)
160{ 149{
161 int error; 150 int error;
162 151
163 printk("Freezing remaining freezable tasks ... "); 152 printk("Freezing remaining freezable tasks ... ");
153 pm_nosig_freezing = true;
164 error = try_to_freeze_tasks(false); 154 error = try_to_freeze_tasks(false);
165 if (!error) 155 if (!error)
166 printk("done."); 156 printk("done.");
@@ -168,37 +158,32 @@ int freeze_kernel_threads(void)
168 printk("\n"); 158 printk("\n");
169 BUG_ON(in_atomic()); 159 BUG_ON(in_atomic());
170 160
161 if (error)
162 thaw_processes();
171 return error; 163 return error;
172} 164}
173 165
174static void thaw_tasks(bool nosig_only) 166void thaw_processes(void)
175{ 167{
176 struct task_struct *g, *p; 168 struct task_struct *g, *p;
177 169
178 read_lock(&tasklist_lock); 170 if (pm_freezing)
179 do_each_thread(g, p) { 171 atomic_dec(&system_freezing_cnt);
180 if (!freezable(p)) 172 pm_freezing = false;
181 continue; 173 pm_nosig_freezing = false;
182 174
183 if (nosig_only && should_send_signal(p)) 175 oom_killer_enable();
184 continue; 176
177 printk("Restarting tasks ... ");
185 178
186 if (cgroup_freezing_or_frozen(p)) 179 thaw_workqueues();
187 continue;
188 180
189 thaw_process(p); 181 read_lock(&tasklist_lock);
182 do_each_thread(g, p) {
183 __thaw_task(p);
190 } while_each_thread(g, p); 184 } while_each_thread(g, p);
191 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
192}
193 186
194void thaw_processes(void)
195{
196 oom_killer_enable();
197
198 printk("Restarting tasks ... ");
199 thaw_workqueues();
200 thaw_tasks(true);
201 thaw_tasks(false);
202 schedule(); 187 schedule();
203 printk("done.\n"); 188 printk("done.\n");
204} 189}
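
thaw_processes() can now walk every task unconditionally because the per-task decisions moved into the freezer core: __thaw_task() is a no-op for tasks that were never frozen, and the pm_freezing/pm_nosig_freezing globals plus system_freezing_cnt gate whether freezing(p) reports true at all. The fast path, roughly as it reads in include/linux/freezer.h after this series (sketch):

        static inline bool freezing(struct task_struct *p)
        {
                if (likely(!atomic_read(&system_freezing_cnt)))
                        return false;           /* nobody is freezing */
                return freezing_slow_path(p);   /* consults pm_freezing etc. */
        }
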
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c1441392..1cf88900ec4f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
858 PageReserved(page)) 858 PageReserved(page))
859 return NULL; 859 return NULL;
860 860
861 if (page_is_guard(page))
862 return NULL;
863
861 return page; 864 return page;
862} 865}
863 866
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
920 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 923 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
921 return NULL; 924 return NULL;
922 925
926 if (page_is_guard(page))
927 return NULL;
928
923 return page; 929 return page;
924} 930}
925 931
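
Guard pages are the unmapped filler pages inserted by CONFIG_DEBUG_PAGEALLOC's debug-guardpage feature; reading one while copying the hibernation image would fault, so both the highmem and lowmem saveable-page checks now skip them. The predicate itself is roughly (sketch; the exact form depends on the mm debug configuration):

        static inline bool page_is_guard(struct page *page)
        {
                return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
        }
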
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4953dc054c53..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
45 mutex_lock(&pm_mutex); 45 lock_system_sleep();
46 suspend_ops = ops; 46 suspend_ops = ops;
47 mutex_unlock(&pm_mutex); 47 unlock_system_sleep();
48} 48}
49EXPORT_SYMBOL_GPL(suspend_set_ops); 49EXPORT_SYMBOL_GPL(suspend_set_ops);
50 50
@@ -106,13 +106,11 @@ static int suspend_prepare(void)
106 goto Finish; 106 goto Finish;
107 107
108 error = suspend_freeze_processes(); 108 error = suspend_freeze_processes();
109 if (error) { 109 if (!error)
110 suspend_stats.failed_freeze++;
111 dpm_save_failed_step(SUSPEND_FREEZE);
112 } else
113 return 0; 110 return 0;
114 111
115 suspend_thaw_processes(); 112 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE);
116 usermodehelper_enable(); 114 usermodehelper_enable();
117 Finish: 115 Finish:
118 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11a594c4ba25..8742fd013a94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
22#include <linux/bio.h> 21#include <linux/bio.h>
23#include <linux/blkdev.h> 22#include <linux/blkdev.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -774,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
774 773
775 pr_debug("PM: Free swap pages: %u\n", free_swap); 774 pr_debug("PM: Free swap pages: %u\n", free_swap);
776 775
777 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? 776 required = PAGES_FOR_IO + nr_pages;
778 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
779 return free_swap > required; 777 return free_swap > required;
780} 778}
781 779
@@ -803,10 +801,12 @@ int swsusp_write(unsigned int flags)
803 printk(KERN_ERR "PM: Cannot get swap writer\n"); 801 printk(KERN_ERR "PM: Cannot get swap writer\n");
804 return error; 802 return error;
805 } 803 }
806 if (!enough_swap(pages, flags)) { 804 if (flags & SF_NOCOMPRESS_MODE) {
807 printk(KERN_ERR "PM: Not enough free swap\n"); 805 if (!enough_swap(pages, flags)) {
808 error = -ENOSPC; 806 printk(KERN_ERR "PM: Not enough free swap\n");
809 goto out_finish; 807 error = -ENOSPC;
808 goto out_finish;
809 }
810 } 810 }
811 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 811 memset(&snapshot, 0, sizeof(struct snapshot_handle));
812 error = snapshot_read_next(&snapshot); 812 error = snapshot_read_next(&snapshot);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b88..6b1ab7a88522 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -21,6 +21,7 @@
21#include <linux/swapops.h> 21#include <linux/swapops.h>
22#include <linux/pm.h> 22#include <linux/pm.h>
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/compat.h>
24#include <linux/console.h> 25#include <linux/console.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -30,28 +31,6 @@
30 31
31#include "power.h" 32#include "power.h"
32 33
33/*
34 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
35 * will be removed in the future. They are only preserved here for
36 * compatibility with existing userland utilities.
37 */
38#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
39#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
40
41#define PMOPS_PREPARE 1
42#define PMOPS_ENTER 2
43#define PMOPS_FINISH 3
44
45/*
46 * NOTE: The following ioctl definitions are wrong and have been replaced with
47 * correct ones. They are only preserved here for compatibility with existing
48 * userland utilities and will be removed in the future.
49 */
50#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
51#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
52#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
53#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
54
55 34
56#define SNAPSHOT_MINOR 231 35#define SNAPSHOT_MINOR 231
57 36
@@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
71 struct snapshot_data *data; 50 struct snapshot_data *data;
72 int error; 51 int error;
73 52
74 mutex_lock(&pm_mutex); 53 lock_system_sleep();
75 54
76 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 55 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
77 error = -EBUSY; 56 error = -EBUSY;
@@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
123 data->platform_support = 0; 102 data->platform_support = 0;
124 103
125 Unlock: 104 Unlock:
126 mutex_unlock(&pm_mutex); 105 unlock_system_sleep();
127 106
128 return error; 107 return error;
129} 108}
@@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
132{ 111{
133 struct snapshot_data *data; 112 struct snapshot_data *data;
134 113
135 mutex_lock(&pm_mutex); 114 lock_system_sleep();
136 115
137 swsusp_free(); 116 swsusp_free();
138 free_basic_memory_bitmaps(); 117 free_basic_memory_bitmaps();
@@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
146 PM_POST_HIBERNATION : PM_POST_RESTORE); 125 PM_POST_HIBERNATION : PM_POST_RESTORE);
147 atomic_inc(&snapshot_device_available); 126 atomic_inc(&snapshot_device_available);
148 127
149 mutex_unlock(&pm_mutex); 128 unlock_system_sleep();
150 129
151 return 0; 130 return 0;
152} 131}
@@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
158 ssize_t res; 137 ssize_t res;
159 loff_t pg_offp = *offp & ~PAGE_MASK; 138 loff_t pg_offp = *offp & ~PAGE_MASK;
160 139
161 mutex_lock(&pm_mutex); 140 lock_system_sleep();
162 141
163 data = filp->private_data; 142 data = filp->private_data;
164 if (!data->ready) { 143 if (!data->ready) {
@@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
179 *offp += res; 158 *offp += res;
180 159
181 Unlock: 160 Unlock:
182 mutex_unlock(&pm_mutex); 161 unlock_system_sleep();
183 162
184 return res; 163 return res;
185} 164}
@@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
191 ssize_t res; 170 ssize_t res;
192 loff_t pg_offp = *offp & ~PAGE_MASK; 171 loff_t pg_offp = *offp & ~PAGE_MASK;
193 172
194 mutex_lock(&pm_mutex); 173 lock_system_sleep();
195 174
196 data = filp->private_data; 175 data = filp->private_data;
197 176
@@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
208 if (res > 0) 187 if (res > 0)
209 *offp += res; 188 *offp += res;
210unlock: 189unlock:
211 mutex_unlock(&pm_mutex); 190 unlock_system_sleep();
212 191
213 return res; 192 return res;
214} 193}
215 194
216static void snapshot_deprecated_ioctl(unsigned int cmd)
217{
218 if (printk_ratelimit())
219 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
220 "be removed soon, update your suspend-to-disk "
221 "utilities\n",
222 __builtin_return_address(0), cmd);
223}
224
225static long snapshot_ioctl(struct file *filp, unsigned int cmd, 195static long snapshot_ioctl(struct file *filp, unsigned int cmd,
226 unsigned long arg) 196 unsigned long arg)
227{ 197{
@@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
257 break; 227 break;
258 228
259 error = freeze_processes(); 229 error = freeze_processes();
260 if (error) { 230 if (error)
261 thaw_processes();
262 usermodehelper_enable(); 231 usermodehelper_enable();
263 } 232 else
264 if (!error)
265 data->frozen = 1; 233 data->frozen = 1;
266 break; 234 break;
267 235
@@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
274 data->frozen = 0; 242 data->frozen = 0;
275 break; 243 break;
276 244
277 case SNAPSHOT_ATOMIC_SNAPSHOT:
278 snapshot_deprecated_ioctl(cmd);
279 case SNAPSHOT_CREATE_IMAGE: 245 case SNAPSHOT_CREATE_IMAGE:
280 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 246 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
281 error = -EPERM; 247 error = -EPERM;
@@ -283,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
283 } 249 }
284 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
285 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
286 if (!error) 252 if (!error) {
287 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
288 if (!error) 254 if (!error && !freezer_test_done)
289 data->ready = 1; 255 data->ready = 1;
256 if (freezer_test_done) {
257 freezer_test_done = false;
258 thaw_processes();
259 }
260 }
290 break; 261 break;
291 262
292 case SNAPSHOT_ATOMIC_RESTORE: 263 case SNAPSHOT_ATOMIC_RESTORE:
@@ -305,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
305 data->ready = 0; 276 data->ready = 0;
306 break; 277 break;
307 278
308 case SNAPSHOT_SET_IMAGE_SIZE:
309 snapshot_deprecated_ioctl(cmd);
310 case SNAPSHOT_PREF_IMAGE_SIZE: 279 case SNAPSHOT_PREF_IMAGE_SIZE:
311 image_size = arg; 280 image_size = arg;
312 break; 281 break;
@@ -321,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 error = put_user(size, (loff_t __user *)arg); 290 error = put_user(size, (loff_t __user *)arg);
322 break; 291 break;
323 292
324 case SNAPSHOT_AVAIL_SWAP:
325 snapshot_deprecated_ioctl(cmd);
326 case SNAPSHOT_AVAIL_SWAP_SIZE: 293 case SNAPSHOT_AVAIL_SWAP_SIZE:
327 size = count_swap_pages(data->swap, 1); 294 size = count_swap_pages(data->swap, 1);
328 size <<= PAGE_SHIFT; 295 size <<= PAGE_SHIFT;
329 error = put_user(size, (loff_t __user *)arg); 296 error = put_user(size, (loff_t __user *)arg);
330 break; 297 break;
331 298
332 case SNAPSHOT_GET_SWAP_PAGE:
333 snapshot_deprecated_ioctl(cmd);
334 case SNAPSHOT_ALLOC_SWAP_PAGE: 299 case SNAPSHOT_ALLOC_SWAP_PAGE:
335 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 300 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
336 error = -ENODEV; 301 error = -ENODEV;
@@ -353,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
353 free_all_swap_pages(data->swap); 318 free_all_swap_pages(data->swap);
354 break; 319 break;
355 320
356 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
357 snapshot_deprecated_ioctl(cmd);
358 if (!swsusp_swap_in_use()) {
359 /*
360 * User space encodes device types as two-byte values,
361 * so we need to recode them
362 */
363 if (old_decode_dev(arg)) {
364 data->swap = swap_type_of(old_decode_dev(arg),
365 0, NULL);
366 if (data->swap < 0)
367 error = -ENODEV;
368 } else {
369 data->swap = -1;
370 error = -EINVAL;
371 }
372 } else {
373 error = -EPERM;
374 }
375 break;
376
377 case SNAPSHOT_S2RAM: 321 case SNAPSHOT_S2RAM:
378 if (!data->frozen) { 322 if (!data->frozen) {
379 error = -EPERM; 323 error = -EPERM;
@@ -396,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
396 error = hibernation_platform_enter(); 340 error = hibernation_platform_enter();
397 break; 341 break;
398 342
399 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
400 snapshot_deprecated_ioctl(cmd);
401 error = -EINVAL;
402
403 switch (arg) {
404
405 case PMOPS_PREPARE:
406 data->platform_support = 1;
407 error = 0;
408 break;
409
410 case PMOPS_ENTER:
411 if (data->platform_support)
412 error = hibernation_platform_enter();
413 break;
414
415 case PMOPS_FINISH:
416 if (data->platform_support)
417 error = 0;
418 break;
419
420 default:
421 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
422
423 }
424 break;
425
426 case SNAPSHOT_SET_SWAP_AREA: 343 case SNAPSHOT_SET_SWAP_AREA:
427 if (swsusp_swap_in_use()) { 344 if (swsusp_swap_in_use()) {
428 error = -EPERM; 345 error = -EPERM;
@@ -464,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
464 return error; 381 return error;
465} 382}
466 383
384#ifdef CONFIG_COMPAT
385
386struct compat_resume_swap_area {
387 compat_loff_t offset;
388 u32 dev;
389} __packed;
390
391static long
392snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
393{
394 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
395
396 switch (cmd) {
397 case SNAPSHOT_GET_IMAGE_SIZE:
398 case SNAPSHOT_AVAIL_SWAP_SIZE:
399 case SNAPSHOT_ALLOC_SWAP_PAGE: {
400 compat_loff_t __user *uoffset = compat_ptr(arg);
401 loff_t offset;
402 mm_segment_t old_fs;
403 int err;
404
405 old_fs = get_fs();
406 set_fs(KERNEL_DS);
407 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
408 set_fs(old_fs);
409 if (!err && put_user(offset, uoffset))
410 err = -EFAULT;
411 return err;
412 }
413
414 case SNAPSHOT_CREATE_IMAGE:
415 return snapshot_ioctl(file, cmd,
416 (unsigned long) compat_ptr(arg));
417
418 case SNAPSHOT_SET_SWAP_AREA: {
419 struct compat_resume_swap_area __user *u_swap_area =
420 compat_ptr(arg);
421 struct resume_swap_area swap_area;
422 mm_segment_t old_fs;
423 int err;
424
425 err = get_user(swap_area.offset, &u_swap_area->offset);
426 err |= get_user(swap_area.dev, &u_swap_area->dev);
427 if (err)
428 return -EFAULT;
429 old_fs = get_fs();
430 set_fs(KERNEL_DS);
431 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
432 (unsigned long) &swap_area);
433 set_fs(old_fs);
434 return err;
435 }
436
437 default:
438 return snapshot_ioctl(file, cmd, arg);
439 }
440}
441
442#endif /* CONFIG_COMPAT */
443
467static const struct file_operations snapshot_fops = { 444static const struct file_operations snapshot_fops = {
468 .open = snapshot_open, 445 .open = snapshot_open,
469 .release = snapshot_release, 446 .release = snapshot_release,
@@ -471,6 +448,9 @@ static const struct file_operations snapshot_fops = {
471 .write = snapshot_write, 448 .write = snapshot_write,
472 .llseek = no_llseek, 449 .llseek = no_llseek,
473 .unlocked_ioctl = snapshot_ioctl, 450 .unlocked_ioctl = snapshot_ioctl,
451#ifdef CONFIG_COMPAT
452 .compat_ioctl = snapshot_compat_ioctl,
453#endif
474}; 454};
475 455
476static struct miscdevice snapshot_device = { 456static struct miscdevice snapshot_device = {
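
The new compat handler lets 32-bit userspace drive the 64-bit /dev/snapshot device: loff_t-sized results are produced into a kernel-side buffer under set_fs(KERNEL_DS) and copied out as compat_loff_t. A minimal sketch of a 32-bit caller, assuming the exported linux/suspend_ioctls.h header and an image already created via SNAPSHOT_CREATE_IMAGE (error handling trimmed):

/*
 * Hypothetical 32-bit user of /dev/snapshot; SNAPSHOT_GET_IMAGE_SIZE
 * only succeeds once a snapshot image exists.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
	long long size;	/* wide enough for loff_t via the compat path */
	int fd = open("/dev/snapshot", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, SNAPSHOT_GET_IMAGE_SIZE, &size) == 0)
		printf("image size: %lld bytes\n", size);
	return 0;
}
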
diff --git a/kernel/printk.c b/kernel/printk.c
index 989e4a52da76..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
521 } 521 }
522} 522}
523 523
524static int __read_mostly ignore_loglevel; 524static bool __read_mostly ignore_loglevel;
525 525
526static int __init ignore_loglevel_setup(char *str) 526static int __init ignore_loglevel_setup(char *str)
527{ 527{
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
532} 532}
533 533
534early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); 535module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to " 536
537 "print all kernel messages to the console."); 537 "print all kernel messages to the console.");
538 538
@@ -696,9 +696,9 @@ static void zap_locks(void)
696} 696}
697 697
698#if defined(CONFIG_PRINTK_TIME) 698#if defined(CONFIG_PRINTK_TIME)
699static int printk_time = 1; 699static bool printk_time = 1;
700#else 700#else
701static int printk_time = 0; 701static bool printk_time = 0;
702#endif 702#endif
703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
704 704
@@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1098 return -1; 1098 return -1;
1099} 1099}
1100 1100
1101int console_suspend_enabled = 1; 1101bool console_suspend_enabled = 1;
1102EXPORT_SYMBOL(console_suspend_enabled); 1102EXPORT_SYMBOL(console_suspend_enabled);
1103 1103
1104static int __init console_suspend_disable(char *str) 1104static int __init console_suspend_disable(char *str)
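
These conversions work because module_param() type-checks its variable; once the flag is a real bool, the plain module_param() form suffices and the parser accepts Y/N/1/0. The idiom, sketched with a hypothetical parameter name:

/* Sketch of the bool parameter idiom (hypothetical "verbose" flag). */
#include <linux/module.h>
#include <linux/moduleparam.h>

static bool verbose;
module_param(verbose, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(verbose, "enable verbose output");
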
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 78ab24a7b0e4..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -172,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
172 return ret; 172 return ret;
173} 173}
174 174
175static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
176{
177 if (mode & PTRACE_MODE_NOAUDIT)
178 return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
179 else
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181}
182
175int __ptrace_may_access(struct task_struct *task, unsigned int mode) 183int __ptrace_may_access(struct task_struct *task, unsigned int mode)
176{ 184{
177 const struct cred *cred = current_cred(), *tcred; 185 const struct cred *cred = current_cred(), *tcred;
@@ -198,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
198 cred->gid == tcred->sgid && 206 cred->gid == tcred->sgid &&
199 cred->gid == tcred->gid)) 207 cred->gid == tcred->gid))
200 goto ok; 208 goto ok;
201 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) 209 if (ptrace_has_cap(tcred->user->user_ns, mode))
202 goto ok; 210 goto ok;
203 rcu_read_unlock(); 211 rcu_read_unlock();
204 return -EPERM; 212 return -EPERM;
@@ -207,7 +215,7 @@ ok:
207 smp_rmb(); 215 smp_rmb();
208 if (task->mm) 216 if (task->mm)
209 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
210 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) 218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
211 return -EPERM; 219 return -EPERM;
212 220
213 return security_ptrace_access_check(task, mode); 221 return security_ptrace_access_check(task, mode);
@@ -277,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request,
277 task->ptrace = PT_PTRACED; 285 task->ptrace = PT_PTRACED;
278 if (seize) 286 if (seize)
279 task->ptrace |= PT_SEIZED; 287 task->ptrace |= PT_SEIZED;
280 if (task_ns_capable(task, CAP_SYS_PTRACE)) 288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
281 task->ptrace |= PT_PTRACE_CAP; 289 task->ptrace |= PT_PTRACE_CAP;
282 290
283 __ptrace_link(task, current); 291 __ptrace_link(task, current);
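
ptrace_has_cap() keeps the CAP_SYS_PTRACE check but routes it through has_ns_capability_noaudit() when the caller passes PTRACE_MODE_NOAUDIT, so speculative permission probes do not flood the audit log with denials. A hedged sketch of such a probe (hypothetical helper; real callers hold task_lock() the way ptrace_may_access() does):

/* Sketch only: a read-side permission probe that must stay quiet in
 * the audit log when it fails. */
static int may_peek_task(struct task_struct *task)
{
	return __ptrace_may_access(task,
				   PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
}
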
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d727..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
302 */ 302 */
303static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
304 struct dentry *parent, 304 struct dentry *parent,
305 int mode, 305 umode_t mode,
306 struct rchan_buf *buf, 306 struct rchan_buf *buf,
307 int *is_global) 307 int *is_global)
308{ 308{
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..6d269cce7aa1 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf,
159 return 0; 159 return 0;
160 } 160 }
161 161
162 /* FIXME - make memparse() take const char* args */ 162 *res = memparse(buf, &end);
163 *res = memparse((char *)buf, &end);
164 if (*end != '\0') 163 if (*end != '\0')
165 return -EINVAL; 164 return -EINVAL;
166 165
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 3d9f31cd79e7..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
9#include <linux/kthread.h> 10#include <linux/kthread.h>
10#include <linux/export.h> 11#include <linux/export.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct sys_device sysdev; 30 struct device dev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, 274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, sysdev); 282 td = container_of(dev, struct test_thread_data, dev);
283 tid = td->sysdev.id; 283 tid = td->dev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, 337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, sysdev); 345 td = container_of(dev, struct test_thread_data, dev);
346 tsk = threads[td->sysdev.id]; 346 tsk = threads[td->dev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->sysdev.id].owner); 363 mutexes[td->dev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); 368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct sysdev_class rttest_sysclass = { 371static struct bus_type rttest_subsys = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
373}; 374};
374 375
375static int init_test_thread(int id) 376static int init_test_thread(int id)
376{ 377{
377 thread_data[id].sysdev.cls = &rttest_sysclass; 378 thread_data[id].dev.bus = &rttest_subsys;
378 thread_data[id].sysdev.id = id; 379 thread_data[id].dev.id = id;
379 380
380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
381 if (IS_ERR(threads[id])) 382 if (IS_ERR(threads[id]))
382 return PTR_ERR(threads[id]); 383 return PTR_ERR(threads[id]);
383 384
384 return sysdev_register(&thread_data[id].sysdev); 385 return device_register(&thread_data[id].dev);
385} 386}
386 387
387static int init_rttest(void) 388static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
394 rt_mutex_init(&mutexes[i]); 395 rt_mutex_init(&mutexes[i]);
395 396
396 ret = sysdev_class_register(&rttest_sysclass); 397 ret = subsys_system_register(&rttest_subsys, NULL);
397 if (ret) 398 if (ret)
398 return ret; 399 return ret;
399 400
@@ -401,10 +402,10 @@ static int init_rttest(void)
401 ret = init_test_thread(i); 402 ret = init_test_thread(i);
402 if (ret) 403 if (ret)
403 break; 404 break;
404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); 405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
405 if (ret) 406 if (ret)
406 break; 407 break;
407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); 408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
408 if (ret) 409 if (ret)
409 break; 410 break;
410 } 411 }
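
The rtmutex tester is one of several sysdev users converted here to the regular driver core: a struct bus_type registered with subsys_system_register(), plain struct device instances, and DEVICE_ATTR() in place of SYSDEV_ATTR(). The pattern, condensed into a hypothetical "demo" subsystem:

/* Condensed sketch of the sysdev -> struct device conversion pattern
 * (hypothetical "demo" subsystem and attribute). */
#include <linux/device.h>

static ssize_t status_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	return sprintf(buf, "ok\n");
}
static DEVICE_ATTR(status, 0600, status_show, NULL);

static struct bus_type demo_subsys = {
	.name		= "demo",
	.dev_name	= "demo",
};

static struct device demo_dev = { .id = 0, .bus = &demo_subsys };

static int __init demo_init(void)
{
	int err = subsys_system_register(&demo_subsys, NULL);

	if (!err)
		err = device_register(&demo_dev);
	if (!err)
		err = device_create_file(&demo_dev, &dev_attr_status);
	return err;
}
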
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4dbfd04a2148..df00cb09263e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4330,7 +4330,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4330 goto out_free_cpus_allowed; 4330 goto out_free_cpus_allowed;
4331 } 4331 }
4332 retval = -EPERM; 4332 retval = -EPERM;
4333 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 4333 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4334 goto out_unlock; 4334 goto out_unlock;
4335 4335
4336 retval = security_task_setscheduler(p); 4336 retval = security_task_setscheduler(p);
@@ -5176,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5176static void 5176static void
5177set_table_entry(struct ctl_table *entry, 5177set_table_entry(struct ctl_table *entry,
5178 const char *procname, void *data, int maxlen, 5178 const char *procname, void *data, int maxlen,
5179 mode_t mode, proc_handler *proc_handler) 5179 umode_t mode, proc_handler *proc_handler)
5180{ 5180{
5181 entry->procname = procname; 5181 entry->procname = procname;
5182 entry->data = data; 5182 entry->data = data;
@@ -6675,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6675} 6675}
6676 6676
6677#ifdef CONFIG_SCHED_MC 6677#ifdef CONFIG_SCHED_MC
6678static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 6678static ssize_t sched_mc_power_savings_show(struct device *dev,
6679 struct sysdev_class_attribute *attr, 6679 struct device_attribute *attr,
6680 char *page) 6680 char *buf)
6681{ 6681{
6682 return sprintf(page, "%u\n", sched_mc_power_savings); 6682 return sprintf(buf, "%u\n", sched_mc_power_savings);
6683} 6683}
6684static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 6684static ssize_t sched_mc_power_savings_store(struct device *dev,
6685 struct sysdev_class_attribute *attr, 6685 struct device_attribute *attr,
6686 const char *buf, size_t count) 6686 const char *buf, size_t count)
6687{ 6687{
6688 return sched_power_savings_store(buf, count, 0); 6688 return sched_power_savings_store(buf, count, 0);
6689} 6689}
6690static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 6690static DEVICE_ATTR(sched_mc_power_savings, 0644,
6691 sched_mc_power_savings_show, 6691 sched_mc_power_savings_show,
6692 sched_mc_power_savings_store); 6692 sched_mc_power_savings_store);
6693#endif 6693#endif
6694 6694
6695#ifdef CONFIG_SCHED_SMT 6695#ifdef CONFIG_SCHED_SMT
6696static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 6696static ssize_t sched_smt_power_savings_show(struct device *dev,
6697 struct sysdev_class_attribute *attr, 6697 struct device_attribute *attr,
6698 char *page) 6698 char *buf)
6699{ 6699{
6700 return sprintf(page, "%u\n", sched_smt_power_savings); 6700 return sprintf(buf, "%u\n", sched_smt_power_savings);
6701} 6701}
6702static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 6702static ssize_t sched_smt_power_savings_store(struct device *dev,
6703 struct sysdev_class_attribute *attr, 6703 struct device_attribute *attr,
6704 const char *buf, size_t count) 6704 const char *buf, size_t count)
6705{ 6705{
6706 return sched_power_savings_store(buf, count, 1); 6706 return sched_power_savings_store(buf, count, 1);
6707} 6707}
6708static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 6708static DEVICE_ATTR(sched_smt_power_savings, 0644,
6709 sched_smt_power_savings_show, 6709 sched_smt_power_savings_show,
6710 sched_smt_power_savings_store); 6710 sched_smt_power_savings_store);
6711#endif 6711#endif
6712 6712
6713int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6713int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6714{ 6714{
6715 int err = 0; 6715 int err = 0;
6716 6716
6717#ifdef CONFIG_SCHED_SMT 6717#ifdef CONFIG_SCHED_SMT
6718 if (smt_capable()) 6718 if (smt_capable())
6719 err = sysfs_create_file(&cls->kset.kobj, 6719 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6720 &attr_sched_smt_power_savings.attr);
6721#endif 6720#endif
6722#ifdef CONFIG_SCHED_MC 6721#ifdef CONFIG_SCHED_MC
6723 if (!err && mc_capable()) 6722 if (!err && mc_capable())
6724 err = sysfs_create_file(&cls->kset.kobj, 6723 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6725 &attr_sched_mc_power_savings.attr);
6726#endif 6724#endif
6727 return err; 6725 return err;
6728} 6726}
@@ -7136,10 +7134,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7136 7134
7137#endif 7135#endif
7138 7136
7139#ifdef CONFIG_RT_GROUP_SCHED
7140#else /* !CONFIG_RT_GROUP_SCHED */
7141#endif /* CONFIG_RT_GROUP_SCHED */
7142
7143#ifdef CONFIG_CGROUP_SCHED 7137#ifdef CONFIG_CGROUP_SCHED
7144/* task_group_lock serializes the addition/removal of task groups */ 7138/* task_group_lock serializes the addition/removal of task groups */
7145static DEFINE_SPINLOCK(task_group_lock); 7139static DEFINE_SPINLOCK(task_group_lock);
@@ -7248,9 +7242,6 @@ void sched_move_task(struct task_struct *tsk)
7248} 7242}
7249#endif /* CONFIG_CGROUP_SCHED */ 7243#endif /* CONFIG_CGROUP_SCHED */
7250 7244
7251#ifdef CONFIG_FAIR_GROUP_SCHED
7252#endif
7253
7254#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7245#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7255static unsigned long to_ratio(u64 period, u64 runtime) 7246static unsigned long to_ratio(u64 period, u64 runtime)
7256{ 7247{
@@ -7565,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7565 sched_destroy_group(tg); 7556 sched_destroy_group(tg);
7566} 7557}
7567 7558
7568static int 7559static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7569cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7560 struct cgroup_taskset *tset)
7570{ 7561{
7562 struct task_struct *task;
7563
7564 cgroup_taskset_for_each(task, cgrp, tset) {
7571#ifdef CONFIG_RT_GROUP_SCHED 7565#ifdef CONFIG_RT_GROUP_SCHED
7572 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7566 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7573 return -EINVAL; 7567 return -EINVAL;
7574#else 7568#else
7575 /* We don't support RT-tasks being in separate groups */ 7569 /* We don't support RT-tasks being in separate groups */
7576 if (tsk->sched_class != &fair_sched_class) 7570 if (task->sched_class != &fair_sched_class)
7577 return -EINVAL; 7571 return -EINVAL;
7578#endif 7572#endif
7573 }
7579 return 0; 7574 return 0;
7580} 7575}
7581 7576
7582static void 7577static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7583cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7578 struct cgroup_taskset *tset)
7584{ 7579{
7585 sched_move_task(tsk); 7580 struct task_struct *task;
7581
7582 cgroup_taskset_for_each(task, cgrp, tset)
7583 sched_move_task(task);
7586} 7584}
7587 7585
7588static void 7586static void
@@ -7917,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7917 .name = "cpu", 7915 .name = "cpu",
7918 .create = cpu_cgroup_create, 7916 .create = cpu_cgroup_create,
7919 .destroy = cpu_cgroup_destroy, 7917 .destroy = cpu_cgroup_destroy,
7920 .can_attach_task = cpu_cgroup_can_attach_task, 7918 .can_attach = cpu_cgroup_can_attach,
7921 .attach_task = cpu_cgroup_attach_task, 7919 .attach = cpu_cgroup_attach,
7922 .exit = cpu_cgroup_exit, 7920 .exit = cpu_cgroup_exit,
7923 .populate = cpu_cgroup_populate, 7921 .populate = cpu_cgroup_populate,
7924 .subsys_id = cpu_cgroup_subsys_id, 7922 .subsys_id = cpu_cgroup_subsys_id,
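
The cgroup core now hands attach callbacks a whole cgroup_taskset instead of invoking per-task hooks, so a controller can veto or migrate the entire set in one pass. A hedged sketch of the new callback shape (hypothetical policy):

/* Sketch of a whole-set ->can_attach (hypothetical kthread policy). */
static int demo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, cgrp, tset)
		if (task->flags & PF_KTHREAD)
			return -EINVAL;	/* reject the whole migration */
	return 0;
}
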
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e42de9105f8..84adb2d66cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3130} 3130}
3131 3131
3132#define LBF_ALL_PINNED 0x01 3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_ABORT 0x04 3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count of HAD_BREAKs; overflow sets ABORT */
3136#define LBF_ABORT 0x10
3135 3137
3136/* 3138/*
3137 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -4508,7 +4510,9 @@ redo:
4508 goto out_balanced; 4510 goto out_balanced;
4509 4511
4510 if (lb_flags & LBF_NEED_BREAK) { 4512 if (lb_flags & LBF_NEED_BREAK) {
4511 lb_flags &= ~LBF_NEED_BREAK; 4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4512 goto redo; 4516 goto redo;
4513 } 4517 }
4514 4518
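
The new flag layout turns the break bookkeeping into plain arithmetic: lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK clears NEED_BREAK (0x02) and bumps a two-bit counter held in bits 0x04 and 0x08; after the third break the counter sits at 0x0C, and the fourth increment carries into 0x10, which is exactly LBF_ABORT, so a load balance that keeps breaking eventually gives up. A standalone demo of the carry (not kernel code):

/* Standalone demo of the break-counter carry (not kernel code). */
#include <stdio.h>

#define LBF_NEED_BREAK	0x02
#define LBF_HAD_BREAK	0x04
#define LBF_HAD_BREAKS	0x0C
#define LBF_ABORT	0x10

int main(void)
{
	unsigned int lb_flags = 0;
	int i;

	for (i = 1; i <= 4; i++) {
		lb_flags |= LBF_NEED_BREAK;	/* migration loop set it */
		lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
		printf("break %d: flags=%#x abort=%d\n",
		       i, lb_flags, !!(lb_flags & LBF_ABORT));
	}
	return 0;	/* abort becomes 1 on the 4th break */
}
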
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631d..e8d76c5895ea 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
6 * This defines a simple but solid secure-computing mode. 6 * This defines a simple but solid secure-computing mode.
7 */ 7 */
8 8
9#include <linux/audit.h>
9#include <linux/seccomp.h> 10#include <linux/seccomp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/compat.h> 12#include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
54#ifdef SECCOMP_DEBUG 55#ifdef SECCOMP_DEBUG
55 dump_stack(); 56 dump_stack();
56#endif 57#endif
58 audit_seccomp(this_syscall);
57 do_exit(SIGKILL); 59 do_exit(SIGKILL);
58} 60}
59 61
diff --git a/kernel/signal.c b/kernel/signal.c
index 56ce3a618b28..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h>
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h> 33#include <trace/events/signal.h>
33 34
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1020} 1021}
1021 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{
1035 if (current_user_ns() == task_cred_xxx(t, user_ns))
1036 return;
1037
1038 if (SI_FROMKERNEL(info))
1039 return;
1040
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
1042 current_cred(), info->si_uid);
1043}
1044#else
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1046{
1047 return;
1048}
1049#endif
1050
1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1051static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1023 int group, int from_ancestor_ns) 1052 int group, int from_ancestor_ns)
1024{ 1053{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1088 q->info.si_pid = 0; 1117 q->info.si_pid = 0;
1089 break; 1118 break;
1090 } 1119 }
1120
1121 userns_fixup_signal_uid(&q->info, t);
1122
1091 } else if (!is_si_special(info)) { 1123 } else if (!is_si_special(info)) {
1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1124 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1093 /* 1125 /*
@@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1626 */ 1658 */
1627 rcu_read_lock(); 1659 rcu_read_lock();
1628 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1660 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1629 info.si_uid = __task_cred(tsk)->uid; 1661 info.si_uid = map_cred_ns(__task_cred(tsk),
1662 task_cred_xxx(tsk->parent, user_ns));
1630 rcu_read_unlock(); 1663 rcu_read_unlock();
1631 1664
1632 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1665 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1709 */ 1742 */
1710 rcu_read_lock(); 1743 rcu_read_lock();
1711 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1744 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1712 info.si_uid = __task_cred(tsk)->uid; 1745 info.si_uid = map_cred_ns(__task_cred(tsk),
1746 task_cred_xxx(parent, user_ns));
1713 rcu_read_unlock(); 1747 rcu_read_unlock();
1714 1748
1715 info.si_utime = cputime_to_clock_t(tsk->utime); 1749 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
2125 info->si_signo = signr; 2159 info->si_signo = signr;
2126 info->si_errno = 0; 2160 info->si_errno = 0;
2127 info->si_code = SI_USER; 2161 info->si_code = SI_USER;
2162 rcu_read_lock();
2128 info->si_pid = task_pid_vnr(current->parent); 2163 info->si_pid = task_pid_vnr(current->parent);
2129 info->si_uid = task_uid(current->parent); 2164 info->si_uid = map_cred_ns(__task_cred(current->parent),
2165 current_user_ns());
2166 rcu_read_unlock();
2130 } 2167 }
2131 2168
2132 /* If the (new) signal is now blocked, requeue it. */ 2169 /* If the (new) signal is now blocked, requeue it. */
@@ -2318,6 +2355,27 @@ relock:
2318 return signr; 2355 return signr;
2319} 2356}
2320 2357
2358/**
2359 * block_sigmask - add @ka's signal mask to current->blocked
2360 * @ka: action for @signr
2361 * @signr: signal that has been successfully delivered
2362 *
2363 * This function should be called when a signal has successfully been
2364 * delivered. It adds the mask of signals for @ka to current->blocked
2365 * so that they are blocked during the execution of the signal
2366 * handler. In addition, @signr will be blocked unless %SA_NODEFER is
2367 * set in @ka->sa.sa_flags.
2368 */
2369void block_sigmask(struct k_sigaction *ka, int signr)
2370{
2371 sigset_t blocked;
2372
2373 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2374 if (!(ka->sa.sa_flags & SA_NODEFER))
2375 sigaddset(&blocked, signr);
2376 set_current_blocked(&blocked);
2377}
2378
2321/* 2379/*
2322 * It could be that complete_signal() picked us to notify about the 2380 * It could be that complete_signal() picked us to notify about the
2323 * group-wide signal. Other threads should be notified now to take 2381 * group-wide signal. Other threads should be notified now to take
@@ -2355,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
2355 int group_stop = 0; 2413 int group_stop = 0;
2356 sigset_t unblocked; 2414 sigset_t unblocked;
2357 2415
2416 /*
2417 * @tsk is about to have PF_EXITING set - lock out users which
2418 * expect stable threadgroup.
2419 */
2420 threadgroup_change_begin(tsk);
2421
2358 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2422 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2359 tsk->flags |= PF_EXITING; 2423 tsk->flags |= PF_EXITING;
2424 threadgroup_change_end(tsk);
2360 return; 2425 return;
2361 } 2426 }
2362 2427
@@ -2366,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
2366 * see wants_signal(), do_signal_stop(). 2431 * see wants_signal(), do_signal_stop().
2367 */ 2432 */
2368 tsk->flags |= PF_EXITING; 2433 tsk->flags |= PF_EXITING;
2434
2435 threadgroup_change_end(tsk);
2436
2369 if (!signal_pending(tsk)) 2437 if (!signal_pending(tsk))
2370 goto out; 2438 goto out;
2371 2439
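
block_sigmask() centralizes bookkeeping that every architecture previously duplicated in its handle_signal(): after the signal frame is set up, sa_mask (plus the delivered signal unless SA_NODEFER is set) gets folded into current->blocked through set_current_blocked(). A hedged sketch of the intended arch-side call site (setup_rt_frame() stands in for per-arch frame code):

/* Sketch of the intended arch-side call site; setup_rt_frame() is a
 * hypothetical stand-in for the per-arch frame setup. */
static void handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
			  struct pt_regs *regs)
{
	if (setup_rt_frame(sig, ka, info, regs) < 0)
		return;		/* frame setup failed; arch forces SIGSEGV */

	block_sigmask(ka, sig);	/* block sa_mask (+ sig unless SA_NODEFER) */
}
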
diff --git a/kernel/sys.c b/kernel/sys.c
index ddf8155bf3f8..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1692 return mask; 1692 return mask;
1693} 1693}
1694 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
1695SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1696 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1697{ 1815{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1841 else 1959 else
1842 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1843 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1844 default: 1965 default:
1845 error = -EINVAL; 1966 error = -EINVAL;
1846 break; 1967 break;
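
PR_SET_MM is driven from userspace via prctl(2); it requires CAP_SYS_ADMIN, a kernel built with CONFIG_CHECKPOINT_RESTORE, and an address that passes the VMA sanity checks above. A hedged sketch of a checkpoint/restore-style caller (hypothetical address, minimal error handling):

/* Sketch of a PR_SET_MM caller; the PR_SET_MM* constants come from
 * the linux/prctl.h shipped with a kernel carrying this patch. */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	unsigned long new_start_brk = 0x800000;	/* hypothetical address */

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, new_start_brk, 0, 0))
		perror("PR_SET_MM_START_BRK");
	return 0;
}
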
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
803 .mode = 0644, 803 .mode = 0644,
804 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
805 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
806 { 815 {
807 .procname = "bootloader_type", 816 .procname = "bootloader_type",
808 .data = &bootloader_type, 817 .data = &bootloader_type,
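
The new entry surfaces as /proc/sys/kernel/panic_on_stackoverflow, backed by sysctl_panic_on_stackoverflow, which arch stack-usage checks consult when CONFIG_DEBUG_STACKOVERFLOW is set. Enabling it programmatically is an ordinary procfs write, for instance:

/* Enable panic-on-stack-overflow; assumes the sysctl file exists
 * (CONFIG_DEBUG_STACKOVERFLOW=y) and the caller may write it. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/panic_on_stackoverflow", "w");

	if (!f)
		return 1;
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}
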
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe92..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
25config GENERIC_CLOCKEVENTS_BUILD 25config GENERIC_CLOCKEVENTS_BUILD
26 bool 26 bool
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST 30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool 31 bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6c..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
21 20
22#include "tick-internal.h" 21#include "tick-internal.h"
23 22
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d3ad022136e5..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
26#include <linux/clocksource.h> 27#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -796,8 +796,8 @@ EXPORT_SYMBOL(clocksource_unregister);
796 * Provides sysfs interface for listing current clocksource. 796 * Provides sysfs interface for listing current clocksource.
797 */ 797 */
798static ssize_t 798static ssize_t
799sysfs_show_current_clocksources(struct sys_device *dev, 799sysfs_show_current_clocksources(struct device *dev,
800 struct sysdev_attribute *attr, char *buf) 800 struct device_attribute *attr, char *buf)
801{ 801{
802 ssize_t count = 0; 802 ssize_t count = 0;
803 803
@@ -818,8 +818,8 @@ sysfs_show_current_clocksources(struct sys_device *dev,
818 * Takes input from sysfs interface for manually overriding the default 818 * Takes input from sysfs interface for manually overriding the default
819 * clocksource selection. 819 * clocksource selection.
820 */ 820 */
821static ssize_t sysfs_override_clocksource(struct sys_device *dev, 821static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct sysdev_attribute *attr, 822 struct device_attribute *attr,
823 const char *buf, size_t count) 823 const char *buf, size_t count)
824{ 824{
825 size_t ret = count; 825 size_t ret = count;
@@ -853,8 +853,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
853 * Provides sysfs interface for listing registered clocksources 853 * Provides sysfs interface for listing registered clocksources
854 */ 854 */
855static ssize_t 855static ssize_t
856sysfs_show_available_clocksources(struct sys_device *dev, 856sysfs_show_available_clocksources(struct device *dev,
857 struct sysdev_attribute *attr, 857 struct device_attribute *attr,
858 char *buf) 858 char *buf)
859{ 859{
860 struct clocksource *src; 860 struct clocksource *src;
@@ -883,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
883/* 883/*
884 * Sysfs setup bits: 884 * Sysfs setup bits:
885 */ 885 */
886static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 887 sysfs_override_clocksource);
888 888
889static SYSDEV_ATTR(available_clocksource, 0444, 889static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 890 sysfs_show_available_clocksources, NULL);
891 891
892static struct sysdev_class clocksource_sysclass = { 892static struct bus_type clocksource_subsys = {
893 .name = "clocksource", 893 .name = "clocksource",
894 .dev_name = "clocksource",
894}; 895};
895 896
896static struct sys_device device_clocksource = { 897static struct device device_clocksource = {
897 .id = 0, 898 .id = 0,
898 .cls = &clocksource_sysclass, 899 .bus = &clocksource_subsys,
899}; 900};
900 901
901static int __init init_clocksource_sysfs(void) 902static int __init init_clocksource_sysfs(void)
902{ 903{
903 int error = sysdev_class_register(&clocksource_sysclass); 904 int error = subsys_system_register(&clocksource_subsys, NULL);
904 905
905 if (!error) 906 if (!error)
906 error = sysdev_register(&device_clocksource); 907 error = device_register(&device_clocksource);
907 if (!error) 908 if (!error)
908 error = sysdev_create_file( 909 error = device_create_file(
909 &device_clocksource, 910 &device_clocksource,
910 &attr_current_clocksource); 911 &dev_attr_current_clocksource);
911 if (!error) 912 if (!error)
912 error = sysdev_create_file( 913 error = device_create_file(
913 &device_clocksource, 914 &device_clocksource,
914 &attr_available_clocksource); 915 &dev_attr_available_clocksource);
915 return error; 916 return error;
916} 917}
917 918
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806f..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
402 402
403static struct dentry *blk_create_buf_file_callback(const char *filename, 403static struct dentry *blk_create_buf_file_callback(const char *filename,
404 struct dentry *parent, 404 struct dentry *parent,
405 int mode, 405 umode_t mode,
406 struct rchan_buf *buf, 406 struct rchan_buf *buf,
407 int *is_global) 407 int *is_global)
408{ 408{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1e8943fed1d..683d559a0eef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,11 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/bsearch.h>
25#include <linux/module.h> 26#include <linux/module.h>
26#include <linux/ftrace.h> 27#include <linux/ftrace.h>
27#include <linux/sysctl.h> 28#include <linux/sysctl.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/sort.h>
30#include <linux/list.h> 32#include <linux/list.h>
31#include <linux/hash.h> 33#include <linux/hash.h>
32#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
@@ -947,13 +949,6 @@ struct ftrace_func_probe {
947 struct rcu_head rcu; 949 struct rcu_head rcu;
948}; 950};
949 951
950enum {
951 FTRACE_ENABLE_CALLS = (1 << 0),
952 FTRACE_DISABLE_CALLS = (1 << 1),
953 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
954 FTRACE_START_FUNC_RET = (1 << 3),
955 FTRACE_STOP_FUNC_RET = (1 << 4),
956};
957struct ftrace_func_entry { 952struct ftrace_func_entry {
958 struct hlist_node hlist; 953 struct hlist_node hlist;
959 unsigned long ip; 954 unsigned long ip;
@@ -984,18 +979,19 @@ static struct ftrace_ops global_ops = {
984 .filter_hash = EMPTY_HASH, 979 .filter_hash = EMPTY_HASH,
985}; 980};
986 981
987static struct dyn_ftrace *ftrace_new_addrs;
988
989static DEFINE_MUTEX(ftrace_regex_lock); 982static DEFINE_MUTEX(ftrace_regex_lock);
990 983
991struct ftrace_page { 984struct ftrace_page {
992 struct ftrace_page *next; 985 struct ftrace_page *next;
986 struct dyn_ftrace *records;
993 int index; 987 int index;
994 struct dyn_ftrace records[]; 988 int size;
995}; 989};
996 990
997#define ENTRIES_PER_PAGE \ 991static struct ftrace_page *ftrace_new_pgs;
998 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) 992
993#define ENTRY_SIZE sizeof(struct dyn_ftrace)
994#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
999 995
1000/* estimate from running different kernels */ 996/* estimate from running different kernels */
1001#define NR_TO_INIT 10000 997#define NR_TO_INIT 10000
@@ -1003,7 +999,10 @@ struct ftrace_page {
1003static struct ftrace_page *ftrace_pages_start; 999static struct ftrace_page *ftrace_pages_start;
1004static struct ftrace_page *ftrace_pages; 1000static struct ftrace_page *ftrace_pages;
1005 1001
1006static struct dyn_ftrace *ftrace_free_records; 1002static bool ftrace_hash_empty(struct ftrace_hash *hash)
1003{
1004 return !hash || !hash->count;
1005}
1007 1006
1008static struct ftrace_func_entry * 1007static struct ftrace_func_entry *
1009ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1008ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1013,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1013 struct hlist_head *hhd; 1012 struct hlist_head *hhd;
1014 struct hlist_node *n; 1013 struct hlist_node *n;
1015 1014
1016 if (!hash->count) 1015 if (ftrace_hash_empty(hash))
1017 return NULL; 1016 return NULL;
1018 1017
1019 if (hash->size_bits > 0) 1018 if (hash->size_bits > 0)
@@ -1157,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1157 return NULL; 1156 return NULL;
1158 1157
1159 /* Empty hash? */ 1158 /* Empty hash? */
1160 if (!hash || !hash->count) 1159 if (ftrace_hash_empty(hash))
1161 return new_hash; 1160 return new_hash;
1162 1161
1163 size = 1 << hash->size_bits; 1162 size = 1 << hash->size_bits;
@@ -1282,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1282 filter_hash = rcu_dereference_raw(ops->filter_hash); 1281 filter_hash = rcu_dereference_raw(ops->filter_hash);
1283 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1282 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1284 1283
1285 if ((!filter_hash || !filter_hash->count || 1284 if ((ftrace_hash_empty(filter_hash) ||
1286 ftrace_lookup_ip(filter_hash, ip)) && 1285 ftrace_lookup_ip(filter_hash, ip)) &&
1287 (!notrace_hash || !notrace_hash->count || 1286 (ftrace_hash_empty(notrace_hash) ||
1288 !ftrace_lookup_ip(notrace_hash, ip))) 1287 !ftrace_lookup_ip(notrace_hash, ip)))
1289 ret = 1; 1288 ret = 1;
1290 else 1289 else
@@ -1307,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1307 } \ 1306 } \
1308 } 1307 }
1309 1308
1309
1310static int ftrace_cmp_recs(const void *a, const void *b)
1311{
1312 const struct dyn_ftrace *reca = a;
1313 const struct dyn_ftrace *recb = b;
1314
1315 if (reca->ip > recb->ip)
1316 return 1;
1317 if (reca->ip < recb->ip)
1318 return -1;
1319 return 0;
1320}
1321
1322/**
1323 * ftrace_location - return true if the ip given is a traced location
1324 * @ip: the instruction pointer to check
1325 *
1326 * Returns 1 if the @ip given is a pointer to an ftrace location.
1327 * That is, the instruction that is either a NOP or call to
1328 * the function tracer. It checks the ftrace internal tables to
1329 * determine if the address belongs or not.
1330 */
1331int ftrace_location(unsigned long ip)
1332{
1333 struct ftrace_page *pg;
1334 struct dyn_ftrace *rec;
1335 struct dyn_ftrace key;
1336
1337 key.ip = ip;
1338
1339 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1340 rec = bsearch(&key, pg->records, pg->index,
1341 sizeof(struct dyn_ftrace),
1342 ftrace_cmp_recs);
1343 if (rec)
1344 return 1;
1345 }
1346
1347 return 0;
1348}
1349
1310static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1350static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1311 int filter_hash, 1351 int filter_hash,
1312 bool inc) 1352 bool inc)
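
ftrace_location() assumes each page's records are sorted by ip (note the new linux/sort.h include above), which makes the per-page bsearch() cheap enough for other subsystems to ask whether an address is an mcount call site. A hedged sketch of a prospective caller:

/* Sketch: refuse to patch text that ftrace already manages. */
static int can_patch(unsigned long addr)
{
	if (ftrace_location(addr))
		return -EBUSY;	/* a NOP/call site owned by ftrace */
	return 0;
}
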
@@ -1336,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1336 if (filter_hash) { 1376 if (filter_hash) {
1337 hash = ops->filter_hash; 1377 hash = ops->filter_hash;
1338 other_hash = ops->notrace_hash; 1378 other_hash = ops->notrace_hash;
1339 if (!hash || !hash->count) 1379 if (ftrace_hash_empty(hash))
1340 all = 1; 1380 all = 1;
1341 } else { 1381 } else {
1342 inc = !inc; 1382 inc = !inc;
@@ -1346,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1346 * If the notrace hash has no items, 1386 * If the notrace hash has no items,
1347 * then there's nothing to do. 1387 * then there's nothing to do.
1348 */ 1388 */
1349 if (hash && !hash->count) 1389 if (ftrace_hash_empty(hash))
1350 return; 1390 return;
1351 } 1391 }
1352 1392
@@ -1363,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1363 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1403 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1364 match = 1; 1404 match = 1;
1365 } else { 1405 } else {
1366 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); 1406 in_hash = !!ftrace_lookup_ip(hash, rec->ip);
1367 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); 1407 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1368 1408
1369 /* 1409 /*
1370 * 1410 *
@@ -1372,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1372 if (filter_hash && in_hash && !in_other_hash) 1412 if (filter_hash && in_hash && !in_other_hash)
1373 match = 1; 1413 match = 1;
1374 else if (!filter_hash && in_hash && 1414 else if (!filter_hash && in_hash &&
1375 (in_other_hash || !other_hash->count)) 1415 (in_other_hash || ftrace_hash_empty(other_hash)))
1376 match = 1; 1416 match = 1;
1377 } 1417 }
1378 if (!match) 1418 if (!match)
@@ -1406,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1406 __ftrace_hash_rec_update(ops, filter_hash, 1); 1446 __ftrace_hash_rec_update(ops, filter_hash, 1);
1407} 1447}
1408 1448
1409static void ftrace_free_rec(struct dyn_ftrace *rec)
1410{
1411 rec->freelist = ftrace_free_records;
1412 ftrace_free_records = rec;
1413 rec->flags |= FTRACE_FL_FREE;
1414}
1415
1416static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 1449static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1417{ 1450{
1418 struct dyn_ftrace *rec; 1451 if (ftrace_pages->index == ftrace_pages->size) {
1419 1452 /* We should have allocated enough */
1420 /* First check for freed records */ 1453 if (WARN_ON(!ftrace_pages->next))
1421 if (ftrace_free_records) {
1422 rec = ftrace_free_records;
1423
1424 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1425 FTRACE_WARN_ON_ONCE(1);
1426 ftrace_free_records = NULL;
1427 return NULL; 1454 return NULL;
1428 }
1429
1430 ftrace_free_records = rec->freelist;
1431 memset(rec, 0, sizeof(*rec));
1432 return rec;
1433 }
1434
1435 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1436 if (!ftrace_pages->next) {
1437 /* allocate another page */
1438 ftrace_pages->next =
1439 (void *)get_zeroed_page(GFP_KERNEL);
1440 if (!ftrace_pages->next)
1441 return NULL;
1442 }
1443 ftrace_pages = ftrace_pages->next; 1455 ftrace_pages = ftrace_pages->next;
1444 } 1456 }
1445 1457
@@ -1459,8 +1471,6 @@ ftrace_record_ip(unsigned long ip)
1459 return NULL; 1471 return NULL;
1460 1472
1461 rec->ip = ip; 1473 rec->ip = ip;
1462 rec->newlist = ftrace_new_addrs;
1463 ftrace_new_addrs = rec;
1464 1474
1465 return rec; 1475 return rec;
1466} 1476}
@@ -1475,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1475 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1485 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1476} 1486}
1477 1487
1478static void ftrace_bug(int failed, unsigned long ip) 1488/**
1489 * ftrace_bug - report and shutdown function tracer
1490 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1491 * @ip: The address that failed
1492 *
1493 * The arch code that enables or disables the function tracing
1494 * can call ftrace_bug() when it has detected a problem in
1495 * modifying the code. @failed should be one of either:
1496 * EFAULT - if the problem happens on reading the @ip address
1497 * EINVAL - if what is read at @ip is not what was expected
1498 * EPERM - if the problem happens on writing to the @ip address
1499 */
1500void ftrace_bug(int failed, unsigned long ip)
1479{ 1501{
1480 switch (failed) { 1502 switch (failed) {
1481 case -EFAULT: 1503 case -EFAULT:
@@ -1517,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end)
1517 return 0; 1539 return 0;
1518} 1540}
1519 1541
1520 1542static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1521static int
1522__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1523{ 1543{
1524 unsigned long ftrace_addr;
1525 unsigned long flag = 0UL; 1544 unsigned long flag = 0UL;
1526 1545
1527 ftrace_addr = (unsigned long)FTRACE_ADDR;
1528
1529 /* 1546 /*
1530 * If we are enabling tracing: 1547 * If we are updating calls:
1531 * 1548 *
1532 * If the record has a ref count, then we need to enable it 1549 * If the record has a ref count, then we need to enable it
1533 * because someone is using it. 1550 * because someone is using it.
1534 * 1551 *
1535 * Otherwise we make sure its disabled. 1552 * Otherwise we make sure its disabled.
1536 * 1553 *
1537 * If we are disabling tracing, then disable all records that 1554 * If we are disabling calls, then disable all records that
1538 * are enabled. 1555 * are enabled.
1539 */ 1556 */
1540 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1557 if (enable && (rec->flags & ~FTRACE_FL_MASK))
@@ -1542,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1542 1559
1543 /* If the state of this record hasn't changed, then do nothing */ 1560 /* If the state of this record hasn't changed, then do nothing */
1544 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1561 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1545 return 0; 1562 return FTRACE_UPDATE_IGNORE;
1546 1563
1547 if (flag) { 1564 if (flag) {
1548 rec->flags |= FTRACE_FL_ENABLED; 1565 if (update)
1566 rec->flags |= FTRACE_FL_ENABLED;
1567 return FTRACE_UPDATE_MAKE_CALL;
1568 }
1569
1570 if (update)
1571 rec->flags &= ~FTRACE_FL_ENABLED;
1572
1573 return FTRACE_UPDATE_MAKE_NOP;
1574}
1575
1576/**
1577 * ftrace_update_record - set whether a record is tracing or not
1578 * @rec: the record to update
1579 * @enable: set to 1 if the record is tracing, zero to force disable
1580 *
1581 * The records that represent all functions that can be traced need
1582 * to be updated when tracing has been enabled.
1583 */
1584int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1585{
1586 return ftrace_check_record(rec, enable, 1);
1587}
1588
1589/**
1590 * ftrace_test_record - check if the record has been enabled or not
1591 * @rec: the record to test
1592 * @enable: set to 1 to check if enabled, 0 if it is disabled
1593 *
1594 * The arch code may need to test if a record is already set to
1595 * tracing to determine how to modify the function code that it
1596 * represents.
1597 */
1598int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1599{
1600 return ftrace_check_record(rec, enable, 0);
1601}
1602
1603static int
1604__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1605{
1606 unsigned long ftrace_addr;
1607 int ret;
1608
1609 ftrace_addr = (unsigned long)FTRACE_ADDR;
1610
1611 ret = ftrace_update_record(rec, enable);
1612
1613 switch (ret) {
1614 case FTRACE_UPDATE_IGNORE:
1615 return 0;
1616
1617 case FTRACE_UPDATE_MAKE_CALL:
1549 return ftrace_make_call(rec, ftrace_addr); 1618 return ftrace_make_call(rec, ftrace_addr);
1619
1620 case FTRACE_UPDATE_MAKE_NOP:
1621 return ftrace_make_nop(NULL, rec, ftrace_addr);
1550 } 1622 }
1551 1623
1552 rec->flags &= ~FTRACE_FL_ENABLED; 1624 return -1; /* unknown ftrace bug */
1553 return ftrace_make_nop(NULL, rec, ftrace_addr);
1554} 1625}
1555 1626
1556static void ftrace_replace_code(int enable) 1627static void ftrace_replace_code(int update)
1557{ 1628{
1558 struct dyn_ftrace *rec; 1629 struct dyn_ftrace *rec;
1559 struct ftrace_page *pg; 1630 struct ftrace_page *pg;
@@ -1563,11 +1634,7 @@ static void ftrace_replace_code(int enable)
1563 return; 1634 return;
1564 1635
1565 do_for_each_ftrace_rec(pg, rec) { 1636 do_for_each_ftrace_rec(pg, rec) {
1566 /* Skip over free records */ 1637 failed = __ftrace_replace_code(rec, update);
1567 if (rec->flags & FTRACE_FL_FREE)
1568 continue;
1569
1570 failed = __ftrace_replace_code(rec, enable);
1571 if (failed) { 1638 if (failed) {
1572 ftrace_bug(failed, rec->ip); 1639 ftrace_bug(failed, rec->ip);
1573 /* Stop processing */ 1640 /* Stop processing */
@@ -1576,6 +1643,78 @@ static void ftrace_replace_code(int enable)
1576 } while_for_each_ftrace_rec(); 1643 } while_for_each_ftrace_rec();
1577} 1644}
1578 1645
1646struct ftrace_rec_iter {
1647 struct ftrace_page *pg;
1648 int index;
1649};
1650
1651/**
1652 * ftrace_rec_iter_start - start up iterating over traced functions
1653 *
1654 * Returns an iterator handle that is used to iterate over all
1655 * the records that represent address locations where functions
1656 * are traced.
1657 *
1658 * May return NULL if no records are available.
1659 */
1660struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1661{
1662 /*
1663 * We only use a single iterator.
1664 * Protected by the ftrace_lock mutex.
1665 */
1666 static struct ftrace_rec_iter ftrace_rec_iter;
1667 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1668
1669 iter->pg = ftrace_pages_start;
1670 iter->index = 0;
1671
1672 /* Could have empty pages */
1673 while (iter->pg && !iter->pg->index)
1674 iter->pg = iter->pg->next;
1675
1676 if (!iter->pg)
1677 return NULL;
1678
1679 return iter;
1680}
1681
1682/**
1683 * ftrace_rec_iter_next - get the next record to process.
1684 * @iter: The handle to the iterator.
1685 *
1686 * Returns the next iterator after the given iterator @iter.
1687 */
1688struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1689{
1690 iter->index++;
1691
1692 if (iter->index >= iter->pg->index) {
1693 iter->pg = iter->pg->next;
1694 iter->index = 0;
1695
1696 /* Could have empty pages */
1697 while (iter->pg && !iter->pg->index)
1698 iter->pg = iter->pg->next;
1699 }
1700
1701 if (!iter->pg)
1702 return NULL;
1703
1704 return iter;
1705}
1706
1707/**
1708 * ftrace_rec_iter_record - get the record at the iterator location
1709 * @iter: The current iterator location
1710 *
1711 * Returns the record that the current @iter is at.
1712 */
1713struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1714{
1715 return &iter->pg->records[iter->index];
1716}
1717
1579static int 1718static int
1580ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1719ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1581{ 1720{
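
Together the three helpers give arch code a flat walk over every record without exposing struct ftrace_page; since the iterator is a static singleton, callers must hold ftrace_lock. The intended loop shape, sketched:

/* Sketch of the iteration pattern (caller holds ftrace_lock). */
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;

for (iter = ftrace_rec_iter_start(); iter;
     iter = ftrace_rec_iter_next(iter)) {
	rec = ftrace_rec_iter_record(iter);
	/* e.g. arch code queues a breakpoint at rec->ip here */
}
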
@@ -1617,13 +1756,7 @@ static int __ftrace_modify_code(void *data)
1617{ 1756{
1618 int *command = data; 1757 int *command = data;
1619 1758
1620 /* 1759 if (*command & FTRACE_UPDATE_CALLS)
1621 * Do not call function tracer while we update the code.
1622 * We are in stop machine, no worrying about races.
1623 */
1624 function_trace_stop++;
1625
1626 if (*command & FTRACE_ENABLE_CALLS)
1627 ftrace_replace_code(1); 1760 ftrace_replace_code(1);
1628 else if (*command & FTRACE_DISABLE_CALLS) 1761 else if (*command & FTRACE_DISABLE_CALLS)
1629 ftrace_replace_code(0); 1762 ftrace_replace_code(0);
@@ -1636,21 +1769,33 @@ static int __ftrace_modify_code(void *data)
1636 else if (*command & FTRACE_STOP_FUNC_RET) 1769 else if (*command & FTRACE_STOP_FUNC_RET)
1637 ftrace_disable_ftrace_graph_caller(); 1770 ftrace_disable_ftrace_graph_caller();
1638 1771
1639#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1640 /*
1641 * For archs that call ftrace_test_stop_func(), we must
1642 * wait till after we update all the function callers
1643 * before we update the callback. This keeps different
1644 * ops that record different functions from corrupting
1645 * each other.
1646 */
1647 __ftrace_trace_function = __ftrace_trace_function_delay;
1648#endif
1649 function_trace_stop--;
1650
1651 return 0; 1772 return 0;
1652} 1773}
1653 1774
1775/**
 1776 * ftrace_run_stop_machine - go back to the stop machine method
1777 * @command: The command to tell ftrace what to do
1778 *
 1779 * If an arch needs to fall back to the stop machine method, then
1780 * it can call this function.
1781 */
1782void ftrace_run_stop_machine(int command)
1783{
1784 stop_machine(__ftrace_modify_code, &command, NULL);
1785}
1786
1787/**
 1788 * arch_ftrace_update_code - modify the code to trace or not trace
 1789 * @command: The command that needs to be done
 1790 *
 1791 * Archs can override this function if they do not need to
 1792 * run stop_machine() to modify code.
1793 */
1794void __weak arch_ftrace_update_code(int command)
1795{
1796 ftrace_run_stop_machine(command);
1797}
1798
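
Since arch_ftrace_update_code() is weak, an architecture that can patch text
safely without stopping the machine can override it and keep
ftrace_run_stop_machine() as a fallback. A hypothetical override
(my_arch_patch_text_live() is an invented placeholder, not part of this patch):

	void arch_ftrace_update_code(int command)
	{
		/* try a live-patching scheme first, e.g. breakpoint-based */
		if (my_arch_patch_text_live(command) < 0)
			ftrace_run_stop_machine(command);
	}
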
1654static void ftrace_run_update_code(int command) 1799static void ftrace_run_update_code(int command)
1655{ 1800{
1656 int ret; 1801 int ret;
@@ -1659,8 +1804,31 @@ static void ftrace_run_update_code(int command)
1659 FTRACE_WARN_ON(ret); 1804 FTRACE_WARN_ON(ret);
1660 if (ret) 1805 if (ret)
1661 return; 1806 return;
1807 /*
1808 * Do not call function tracer while we update the code.
1809 * We are in stop machine.
1810 */
1811 function_trace_stop++;
1662 1812
1663 stop_machine(__ftrace_modify_code, &command, NULL); 1813 /*
1814 * By default we use stop_machine() to modify the code.
 1815 * But archs can do whatever they want as long as it
 1816 * is safe. stop_machine() is the safest, but also
 1817 * produces the most overhead.
1818 */
1819 arch_ftrace_update_code(command);
1820
1821#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1822 /*
1823 * For archs that call ftrace_test_stop_func(), we must
1824 * wait till after we update all the function callers
1825 * before we update the callback. This keeps different
1826 * ops that record different functions from corrupting
1827 * each other.
1828 */
1829 __ftrace_trace_function = __ftrace_trace_function_delay;
1830#endif
1831 function_trace_stop--;
1664 1832
1665 ret = ftrace_arch_code_modify_post_process(); 1833 ret = ftrace_arch_code_modify_post_process();
1666 FTRACE_WARN_ON(ret); 1834 FTRACE_WARN_ON(ret);
@@ -1691,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
1691 return -ENODEV; 1859 return -ENODEV;
1692 1860
1693 ftrace_start_up++; 1861 ftrace_start_up++;
1694 command |= FTRACE_ENABLE_CALLS; 1862 command |= FTRACE_UPDATE_CALLS;
1695 1863
1696 /* ops marked global share the filter hashes */ 1864 /* ops marked global share the filter hashes */
1697 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 1865 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1743,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1743 if (ops != &global_ops || !global_start_up) 1911 if (ops != &global_ops || !global_start_up)
1744 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 1912 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1745 1913
1746 if (!ftrace_start_up) 1914 command |= FTRACE_UPDATE_CALLS;
1747 command |= FTRACE_DISABLE_CALLS;
1748 1915
1749 if (saved_ftrace_func != ftrace_trace_function) { 1916 if (saved_ftrace_func != ftrace_trace_function) {
1750 saved_ftrace_func = ftrace_trace_function; 1917 saved_ftrace_func = ftrace_trace_function;
@@ -1766,7 +1933,7 @@ static void ftrace_startup_sysctl(void)
1766 saved_ftrace_func = NULL; 1933 saved_ftrace_func = NULL;
1767 /* ftrace_start_up is true if we want ftrace running */ 1934 /* ftrace_start_up is true if we want ftrace running */
1768 if (ftrace_start_up) 1935 if (ftrace_start_up)
1769 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1936 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
1770} 1937}
1771 1938
1772static void ftrace_shutdown_sysctl(void) 1939static void ftrace_shutdown_sysctl(void)
@@ -1788,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops)
1788 struct ftrace_hash *hash; 1955 struct ftrace_hash *hash;
1789 1956
1790 hash = ops->filter_hash; 1957 hash = ops->filter_hash;
1791 return !!(!hash || !hash->count); 1958 return ftrace_hash_empty(hash);
1792} 1959}
1793 1960
1794static int ftrace_update_code(struct module *mod) 1961static int ftrace_update_code(struct module *mod)
1795{ 1962{
1963 struct ftrace_page *pg;
1796 struct dyn_ftrace *p; 1964 struct dyn_ftrace *p;
1797 cycle_t start, stop; 1965 cycle_t start, stop;
1798 unsigned long ref = 0; 1966 unsigned long ref = 0;
1967 int i;
1799 1968
1800 /* 1969 /*
1801 * When adding a module, we need to check if tracers are 1970 * When adding a module, we need to check if tracers are
@@ -1817,46 +1986,44 @@ static int ftrace_update_code(struct module *mod)
1817 start = ftrace_now(raw_smp_processor_id()); 1986 start = ftrace_now(raw_smp_processor_id());
1818 ftrace_update_cnt = 0; 1987 ftrace_update_cnt = 0;
1819 1988
1820 while (ftrace_new_addrs) { 1989 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
1821 1990
1822 /* If something went wrong, bail without enabling anything */ 1991 for (i = 0; i < pg->index; i++) {
1823 if (unlikely(ftrace_disabled)) 1992 /* If something went wrong, bail without enabling anything */
1824 return -1; 1993 if (unlikely(ftrace_disabled))
1994 return -1;
1825 1995
1826 p = ftrace_new_addrs; 1996 p = &pg->records[i];
1827 ftrace_new_addrs = p->newlist; 1997 p->flags = ref;
1828 p->flags = ref;
1829 1998
1830 /* 1999 /*
1831 * Do the initial record conversion from mcount jump 2000 * Do the initial record conversion from mcount jump
1832 * to the NOP instructions. 2001 * to the NOP instructions.
1833 */ 2002 */
1834 if (!ftrace_code_disable(mod, p)) { 2003 if (!ftrace_code_disable(mod, p))
1835 ftrace_free_rec(p); 2004 break;
1836 /* Game over */
1837 break;
1838 }
1839 2005
1840 ftrace_update_cnt++; 2006 ftrace_update_cnt++;
1841 2007
1842 /* 2008 /*
1843 * If the tracing is enabled, go ahead and enable the record. 2009 * If the tracing is enabled, go ahead and enable the record.
1844 * 2010 *
 1845 * The reason not to enable the record immediately is the 2011 * The reason not to enable the record immediately is the
1846 * inherent check of ftrace_make_nop/ftrace_make_call for 2012 * inherent check of ftrace_make_nop/ftrace_make_call for
 1847 * correct previous instructions. Doing the NOP conversion 2013 * correct previous instructions. Doing the NOP conversion
 1848 * first puts the module into the correct state, thus 2014 * first puts the module into the correct state, thus
1849 * passing the ftrace_make_call check. 2015 * passing the ftrace_make_call check.
1850 */ 2016 */
1851 if (ftrace_start_up && ref) { 2017 if (ftrace_start_up && ref) {
1852 int failed = __ftrace_replace_code(p, 1); 2018 int failed = __ftrace_replace_code(p, 1);
1853 if (failed) { 2019 if (failed)
1854 ftrace_bug(failed, p->ip); 2020 ftrace_bug(failed, p->ip);
1855 ftrace_free_rec(p);
1856 } 2021 }
1857 } 2022 }
1858 } 2023 }
1859 2024
2025 ftrace_new_pgs = NULL;
2026
1860 stop = ftrace_now(raw_smp_processor_id()); 2027 stop = ftrace_now(raw_smp_processor_id());
1861 ftrace_update_time = stop - start; 2028 ftrace_update_time = stop - start;
1862 ftrace_update_tot_cnt += ftrace_update_cnt; 2029 ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -1864,57 +2031,108 @@ static int ftrace_update_code(struct module *mod)
1864 return 0; 2031 return 0;
1865} 2032}
1866 2033
1867static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) 2034static int ftrace_allocate_records(struct ftrace_page *pg, int count)
1868{ 2035{
1869 struct ftrace_page *pg; 2036 int order;
1870 int cnt; 2037 int cnt;
1871 int i;
1872 2038
1873 /* allocate a few pages */ 2039 if (WARN_ON(!count))
1874 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); 2040 return -EINVAL;
1875 if (!ftrace_pages_start) 2041
1876 return -1; 2042 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
1877 2043
1878 /* 2044 /*
1879 * Allocate a few more pages. 2045 * We want to fill as much as possible. No more than a page
1880 * 2046 * may be empty.
1881 * TODO: have some parser search vmlinux before
1882 * final linking to find all calls to ftrace.
1883 * Then we can:
1884 * a) know how many pages to allocate.
1885 * and/or
1886 * b) set up the table then.
1887 *
1888 * The dynamic code is still necessary for
1889 * modules.
1890 */ 2047 */
2048 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2049 order--;
1891 2050
1892 pg = ftrace_pages = ftrace_pages_start; 2051 again:
2052 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
1893 2053
1894 cnt = num_to_init / ENTRIES_PER_PAGE; 2054 if (!pg->records) {
1895 pr_info("ftrace: allocating %ld entries in %d pages\n", 2055 /* if we can't allocate this size, try something smaller */
1896 num_to_init, cnt + 1); 2056 if (!order)
2057 return -ENOMEM;
2058 order >>= 1;
2059 goto again;
2060 }
1897 2061
1898 for (i = 0; i < cnt; i++) { 2062 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
1899 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 2063 pg->size = cnt;
1900 2064
1901 /* If we fail, we'll try later anyway */ 2065 if (cnt > count)
1902 if (!pg->next) 2066 cnt = count;
2067
2068 return cnt;
2069}
2070
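
To make the order arithmetic above concrete, a worked example (the 16-byte
record size is purely illustrative; ENTRY_SIZE is sizeof(struct dyn_ftrace)):

	/*
	 * Assume PAGE_SIZE = 4096 and ENTRY_SIZE = 16, so ENTRIES_PER_PAGE = 256.
	 * For count = 5000:
	 *   DIV_ROUND_UP(5000, 256) = 20, get_count_order(20) = 5 (32 pages)
	 *   order 5: (4096 << 5) / 16 = 8192 >= 5000 + 256, shrink to 4
	 *   order 4: (4096 << 4) / 16 = 4096 <  5000 + 256, stop
	 * The block holds 4096 records, and the caller loops again for
	 * the remaining 904.
	 */
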
2071static struct ftrace_page *
2072ftrace_allocate_pages(unsigned long num_to_init)
2073{
2074 struct ftrace_page *start_pg;
2075 struct ftrace_page *pg;
2076 int order;
2077 int cnt;
2078
2079 if (!num_to_init)
 2080 return NULL;
2081
2082 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2083 if (!pg)
2084 return NULL;
2085
2086 /*
 2087 * Try to allocate as much as possible in one contiguous
2088 * location that fills in all of the space. We want to
2089 * waste as little space as possible.
2090 */
2091 for (;;) {
2092 cnt = ftrace_allocate_records(pg, num_to_init);
2093 if (cnt < 0)
2094 goto free_pages;
2095
2096 num_to_init -= cnt;
2097 if (!num_to_init)
1903 break; 2098 break;
1904 2099
2100 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
2101 if (!pg->next)
2102 goto free_pages;
2103
1905 pg = pg->next; 2104 pg = pg->next;
1906 } 2105 }
1907 2106
1908 return 0; 2107 return start_pg;
2108
2109 free_pages:
2110 while (start_pg) {
2111 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2112 free_pages((unsigned long)pg->records, order);
2113 start_pg = pg->next;
2114 kfree(pg);
2115 pg = start_pg;
2116 }
2117 pr_info("ftrace: FAILED to allocate memory for functions\n");
2118 return NULL;
1909} 2119}
1910 2120
1911enum { 2121static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1912 FTRACE_ITER_FILTER = (1 << 0), 2122{
1913 FTRACE_ITER_NOTRACE = (1 << 1), 2123 int cnt;
1914 FTRACE_ITER_PRINTALL = (1 << 2), 2124
1915 FTRACE_ITER_HASH = (1 << 3), 2125 if (!num_to_init) {
1916 FTRACE_ITER_ENABLED = (1 << 4), 2126 pr_info("ftrace: No functions to be traced?\n");
1917}; 2127 return -1;
2128 }
2129
2130 cnt = num_to_init / ENTRIES_PER_PAGE;
2131 pr_info("ftrace: allocating %ld entries in %d pages\n",
2132 num_to_init, cnt + 1);
2133
2134 return 0;
2135}
1918 2136
1919#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2137#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1920 2138
@@ -1980,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1980 void *p = NULL; 2198 void *p = NULL;
1981 loff_t l; 2199 loff_t l;
1982 2200
2201 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2202 return NULL;
2203
1983 if (iter->func_pos > *pos) 2204 if (iter->func_pos > *pos)
1984 return NULL; 2205 return NULL;
1985 2206
@@ -2023,7 +2244,7 @@ static void *
2023t_next(struct seq_file *m, void *v, loff_t *pos) 2244t_next(struct seq_file *m, void *v, loff_t *pos)
2024{ 2245{
2025 struct ftrace_iterator *iter = m->private; 2246 struct ftrace_iterator *iter = m->private;
2026 struct ftrace_ops *ops = &global_ops; 2247 struct ftrace_ops *ops = iter->ops;
2027 struct dyn_ftrace *rec = NULL; 2248 struct dyn_ftrace *rec = NULL;
2028 2249
2029 if (unlikely(ftrace_disabled)) 2250 if (unlikely(ftrace_disabled))
@@ -2047,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2047 } 2268 }
2048 } else { 2269 } else {
2049 rec = &iter->pg->records[iter->idx++]; 2270 rec = &iter->pg->records[iter->idx++];
2050 if ((rec->flags & FTRACE_FL_FREE) || 2271 if (((iter->flags & FTRACE_ITER_FILTER) &&
2051
2052 ((iter->flags & FTRACE_ITER_FILTER) &&
2053 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2272 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2054 2273
2055 ((iter->flags & FTRACE_ITER_NOTRACE) && 2274 ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2081,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2081static void *t_start(struct seq_file *m, loff_t *pos) 2300static void *t_start(struct seq_file *m, loff_t *pos)
2082{ 2301{
2083 struct ftrace_iterator *iter = m->private; 2302 struct ftrace_iterator *iter = m->private;
2084 struct ftrace_ops *ops = &global_ops; 2303 struct ftrace_ops *ops = iter->ops;
2085 void *p = NULL; 2304 void *p = NULL;
2086 loff_t l; 2305 loff_t l;
2087 2306
@@ -2101,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2101 * off, we can short cut and just print out that all 2320 * off, we can short cut and just print out that all
2102 * functions are enabled. 2321 * functions are enabled.
2103 */ 2322 */
2104 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { 2323 if (iter->flags & FTRACE_ITER_FILTER &&
2324 ftrace_hash_empty(ops->filter_hash)) {
2105 if (*pos > 0) 2325 if (*pos > 0)
2106 return t_hash_start(m, pos); 2326 return t_hash_start(m, pos);
2107 iter->flags |= FTRACE_ITER_PRINTALL; 2327 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2126,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2126 break; 2346 break;
2127 } 2347 }
2128 2348
2129 if (!p) { 2349 if (!p)
2130 if (iter->flags & FTRACE_ITER_FILTER) 2350 return t_hash_start(m, pos);
2131 return t_hash_start(m, pos);
2132
2133 return NULL;
2134 }
2135 2351
2136 return iter; 2352 return iter;
2137} 2353}
@@ -2189,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
2189 return -ENOMEM; 2405 return -ENOMEM;
2190 2406
2191 iter->pg = ftrace_pages_start; 2407 iter->pg = ftrace_pages_start;
2408 iter->ops = &global_ops;
2192 2409
2193 ret = seq_open(file, &show_ftrace_seq_ops); 2410 ret = seq_open(file, &show_ftrace_seq_ops);
2194 if (!ret) { 2411 if (!ret) {
@@ -2217,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2217 2434
2218 iter->pg = ftrace_pages_start; 2435 iter->pg = ftrace_pages_start;
2219 iter->flags = FTRACE_ITER_ENABLED; 2436 iter->flags = FTRACE_ITER_ENABLED;
2437 iter->ops = &global_ops;
2220 2438
2221 ret = seq_open(file, &show_ftrace_seq_ops); 2439 ret = seq_open(file, &show_ftrace_seq_ops);
2222 if (!ret) { 2440 if (!ret) {
@@ -2237,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2237 mutex_unlock(&ftrace_lock); 2455 mutex_unlock(&ftrace_lock);
2238} 2456}
2239 2457
2240static int 2458/**
2459 * ftrace_regex_open - initialize function tracer filter files
2460 * @ops: The ftrace_ops that hold the hash filters
2461 * @flag: The type of filter to process
2462 * @inode: The inode, usually passed in to your open routine
2463 * @file: The file, usually passed in to your open routine
2464 *
2465 * ftrace_regex_open() initializes the filter files for the
2466 * @ops. Depending on @flag it may process the filter hash or
2467 * the notrace hash of @ops. With this called from the open
2468 * routine, you can use ftrace_filter_write() for the write
2469 * routine if @flag has FTRACE_ITER_FILTER set, or
2470 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2471 * ftrace_regex_lseek() should be used as the lseek routine, and
2472 * release must call ftrace_regex_release().
2473 */
2474int
2241ftrace_regex_open(struct ftrace_ops *ops, int flag, 2475ftrace_regex_open(struct ftrace_ops *ops, int flag,
2242 struct inode *inode, struct file *file) 2476 struct inode *inode, struct file *file)
2243{ 2477{
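
With ftrace_regex_open(), ftrace_filter_write() and ftrace_regex_lseek() no
longer static, another tracer can wire up its own filter file. A minimal
sketch, mirroring the stack-tracer hookup later in this series (my_ops is a
placeholder ftrace_ops):

	static int my_filter_open(struct inode *inode, struct file *file)
	{
		return ftrace_regex_open(&my_ops, FTRACE_ITER_FILTER,
					 inode, file);
	}

	static const struct file_operations my_filter_fops = {
		.open    = my_filter_open,
		.read    = seq_read,
		.write   = ftrace_filter_write,
		.llseek  = ftrace_regex_lseek,
		.release = ftrace_regex_release,
	};
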
@@ -2306,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2306static int 2540static int
2307ftrace_filter_open(struct inode *inode, struct file *file) 2541ftrace_filter_open(struct inode *inode, struct file *file)
2308{ 2542{
2309 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, 2543 return ftrace_regex_open(&global_ops,
2310 inode, file); 2544 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2545 inode, file);
2311} 2546}
2312 2547
2313static int 2548static int
@@ -2317,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2317 inode, file); 2552 inode, file);
2318} 2553}
2319 2554
2320static loff_t 2555loff_t
2321ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2556ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2322{ 2557{
2323 loff_t ret; 2558 loff_t ret;
@@ -2426,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff,
2426 goto out_unlock; 2661 goto out_unlock;
2427 2662
2428 do_for_each_ftrace_rec(pg, rec) { 2663 do_for_each_ftrace_rec(pg, rec) {
2429
2430 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2664 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2431 ret = enter_record(hash, rec, not); 2665 ret = enter_record(hash, rec, not);
2432 if (ret < 0) { 2666 if (ret < 0) {
@@ -2871,14 +3105,14 @@ out_unlock:
2871 return ret; 3105 return ret;
2872} 3106}
2873 3107
2874static ssize_t 3108ssize_t
2875ftrace_filter_write(struct file *file, const char __user *ubuf, 3109ftrace_filter_write(struct file *file, const char __user *ubuf,
2876 size_t cnt, loff_t *ppos) 3110 size_t cnt, loff_t *ppos)
2877{ 3111{
2878 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 3112 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
2879} 3113}
2880 3114
2881static ssize_t 3115ssize_t
2882ftrace_notrace_write(struct file *file, const char __user *ubuf, 3116ftrace_notrace_write(struct file *file, const char __user *ubuf,
2883 size_t cnt, loff_t *ppos) 3117 size_t cnt, loff_t *ppos)
2884{ 3118{
@@ -2919,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2919 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2920 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3154 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2921 && ftrace_enabled) 3155 && ftrace_enabled)
2922 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3156 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2923 3157
2924 mutex_unlock(&ftrace_lock); 3158 mutex_unlock(&ftrace_lock);
2925 3159
@@ -3045,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
3045} 3279}
3046#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3047 3281
3048static void __init 3282void __init
3049set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3283ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3050{ 3284{
3051 char *func; 3285 char *func;
3052 3286
@@ -3059,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3059static void __init set_ftrace_early_filters(void) 3293static void __init set_ftrace_early_filters(void)
3060{ 3294{
3061 if (ftrace_filter_buf[0]) 3295 if (ftrace_filter_buf[0])
3062 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); 3296 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
3063 if (ftrace_notrace_buf[0]) 3297 if (ftrace_notrace_buf[0])
3064 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); 3298 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3065#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3299#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3066 if (ftrace_graph_buf[0]) 3300 if (ftrace_graph_buf[0])
3067 set_ftrace_early_graph(ftrace_graph_buf); 3301 set_ftrace_early_graph(ftrace_graph_buf);
3068#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3302#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3069} 3303}
3070 3304
3071static int 3305int ftrace_regex_release(struct inode *inode, struct file *file)
3072ftrace_regex_release(struct inode *inode, struct file *file)
3073{ 3306{
3074 struct seq_file *m = (struct seq_file *)file->private_data; 3307 struct seq_file *m = (struct seq_file *)file->private_data;
3075 struct ftrace_iterator *iter; 3308 struct ftrace_iterator *iter;
@@ -3107,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3107 orig_hash, iter->hash); 3340 orig_hash, iter->hash);
3108 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3341 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3109 && ftrace_enabled) 3342 && ftrace_enabled)
3110 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3343 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3111 3344
3112 mutex_unlock(&ftrace_lock); 3345 mutex_unlock(&ftrace_lock);
3113 } 3346 }
@@ -3270,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3270 3503
3271 do_for_each_ftrace_rec(pg, rec) { 3504 do_for_each_ftrace_rec(pg, rec) {
3272 3505
3273 if (rec->flags & FTRACE_FL_FREE)
3274 continue;
3275
3276 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3506 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3277 /* if it is in the array */ 3507 /* if it is in the array */
3278 exists = false; 3508 exists = false;
@@ -3381,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3381 return 0; 3611 return 0;
3382} 3612}
3383 3613
3614static void ftrace_swap_recs(void *a, void *b, int size)
3615{
3616 struct dyn_ftrace *reca = a;
3617 struct dyn_ftrace *recb = b;
3618 struct dyn_ftrace t;
3619
3620 t = *reca;
3621 *reca = *recb;
3622 *recb = t;
3623}
3624
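
ftrace_swap_recs() is the swap callback handed to lib/sort's sort() below; the
matching ftrace_cmp_recs() comparator is not part of this hunk, but presumably
orders records by ip, along these lines (a sketch, not the verbatim helper):

	static int ftrace_cmp_recs(const void *a, const void *b)
	{
		const struct dyn_ftrace *reca = a;
		const struct dyn_ftrace *recb = b;

		if (reca->ip < recb->ip)
			return -1;
		if (reca->ip > recb->ip)
			return 1;
		return 0;
	}
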
3384static int ftrace_process_locs(struct module *mod, 3625static int ftrace_process_locs(struct module *mod,
3385 unsigned long *start, 3626 unsigned long *start,
3386 unsigned long *end) 3627 unsigned long *end)
3387{ 3628{
3629 struct ftrace_page *pg;
3630 unsigned long count;
3388 unsigned long *p; 3631 unsigned long *p;
3389 unsigned long addr; 3632 unsigned long addr;
3390 unsigned long flags = 0; /* Shut up gcc */ 3633 unsigned long flags = 0; /* Shut up gcc */
3634 int ret = -ENOMEM;
3635
3636 count = end - start;
3637
3638 if (!count)
3639 return 0;
3640
3641 pg = ftrace_allocate_pages(count);
3642 if (!pg)
3643 return -ENOMEM;
3391 3644
3392 mutex_lock(&ftrace_lock); 3645 mutex_lock(&ftrace_lock);
3646
3647 /*
 3648 * The core kernel and each module need their own pages, as
3649 * modules will free them when they are removed.
3650 * Force a new page to be allocated for modules.
3651 */
3652 if (!mod) {
3653 WARN_ON(ftrace_pages || ftrace_pages_start);
3654 /* First initialization */
3655 ftrace_pages = ftrace_pages_start = pg;
3656 } else {
3657 if (!ftrace_pages)
3658 goto out;
3659
3660 if (WARN_ON(ftrace_pages->next)) {
3661 /* Hmm, we have free pages? */
3662 while (ftrace_pages->next)
3663 ftrace_pages = ftrace_pages->next;
3664 }
3665
3666 ftrace_pages->next = pg;
3667 ftrace_pages = pg;
3668 }
3669
3393 p = start; 3670 p = start;
3394 while (p < end) { 3671 while (p < end) {
3395 addr = ftrace_call_adjust(*p++); 3672 addr = ftrace_call_adjust(*p++);
@@ -3401,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod,
3401 */ 3678 */
3402 if (!addr) 3679 if (!addr)
3403 continue; 3680 continue;
3404 ftrace_record_ip(addr); 3681 if (!ftrace_record_ip(addr))
3682 break;
3405 } 3683 }
3406 3684
3685 /* These new locations need to be initialized */
3686 ftrace_new_pgs = pg;
3687
3688 /* Make each individual set of pages sorted by ips */
3689 for (; pg; pg = pg->next)
3690 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3691 ftrace_cmp_recs, ftrace_swap_recs);
3692
3407 /* 3693 /*
3408 * We only need to disable interrupts on start up 3694 * We only need to disable interrupts on start up
3409 * because we are modifying code that an interrupt 3695 * because we are modifying code that an interrupt
@@ -3417,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod,
3417 ftrace_update_code(mod); 3703 ftrace_update_code(mod);
3418 if (!mod) 3704 if (!mod)
3419 local_irq_restore(flags); 3705 local_irq_restore(flags);
3706 ret = 0;
3707 out:
3420 mutex_unlock(&ftrace_lock); 3708 mutex_unlock(&ftrace_lock);
3421 3709
3422 return 0; 3710 return ret;
3423} 3711}
3424 3712
3425#ifdef CONFIG_MODULES 3713#ifdef CONFIG_MODULES
3714
3715#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3716
3426void ftrace_release_mod(struct module *mod) 3717void ftrace_release_mod(struct module *mod)
3427{ 3718{
3428 struct dyn_ftrace *rec; 3719 struct dyn_ftrace *rec;
3720 struct ftrace_page **last_pg;
3429 struct ftrace_page *pg; 3721 struct ftrace_page *pg;
3722 int order;
3430 3723
3431 mutex_lock(&ftrace_lock); 3724 mutex_lock(&ftrace_lock);
3432 3725
3433 if (ftrace_disabled) 3726 if (ftrace_disabled)
3434 goto out_unlock; 3727 goto out_unlock;
3435 3728
3436 do_for_each_ftrace_rec(pg, rec) { 3729 /*
 3730 * Each module has its own ftrace_pages; remove
 3731 * them from the list.
3732 */
3733 last_pg = &ftrace_pages_start;
3734 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3735 rec = &pg->records[0];
3437 if (within_module_core(rec->ip, mod)) { 3736 if (within_module_core(rec->ip, mod)) {
3438 /* 3737 /*
3439 * rec->ip is changed in ftrace_free_rec() 3738 * As core pages are first, the first
3440 * It should not between s and e if record was freed. 3739 * page should never be a module page.
3441 */ 3740 */
3442 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); 3741 if (WARN_ON(pg == ftrace_pages_start))
3443 ftrace_free_rec(rec); 3742 goto out_unlock;
3444 } 3743
3445 } while_for_each_ftrace_rec(); 3744 /* Check if we are deleting the last page */
3745 if (pg == ftrace_pages)
3746 ftrace_pages = next_to_ftrace_page(last_pg);
3747
3748 *last_pg = pg->next;
3749 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3750 free_pages((unsigned long)pg->records, order);
3751 kfree(pg);
3752 } else
3753 last_pg = &pg->next;
3754 }
3446 out_unlock: 3755 out_unlock:
3447 mutex_unlock(&ftrace_lock); 3756 mutex_unlock(&ftrace_lock);
3448} 3757}
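
The last_pg double pointer above is the usual unlink-without-a-prev-pointer
idiom; in generic form (should_remove() is a stand-in for the
within_module_core() test):

	struct ftrace_page **last_pg = &ftrace_pages_start;
	struct ftrace_page *pg;

	for (pg = *last_pg; pg; pg = *last_pg) {
		if (should_remove(pg))
			*last_pg = pg->next;	/* unlink pg, keep last_pg */
		else
			last_pg = &pg->next;	/* advance past pg */
	}
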
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91dc4bc8bf72..a3f1bc5d2a00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4438,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
4438}; 4438};
4439 4439
4440struct dentry *trace_create_file(const char *name, 4440struct dentry *trace_create_file(const char *name,
4441 mode_t mode, 4441 umode_t mode,
4442 struct dentry *parent, 4442 struct dentry *parent,
4443 void *data, 4443 void *data,
4444 const struct file_operations *fops) 4444 const struct file_operations *fops)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2c2657462ac3..b93ecbadad6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
312void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
313int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
314struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
315 mode_t mode, 315 umode_t mode,
316 struct dentry *parent, 316 struct dentry *parent,
317 void *data, 317 void *data,
318 const struct file_operations *fops); 318 const struct file_operations *fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f04cc3136bd3..24aee7127451 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
1738 return -ENOMEM; 1738 return -ENOMEM;
1739} 1739}
1740 1740
1741static int create_filter_start(char *filter_str, bool set_str,
1742 struct filter_parse_state **psp,
1743 struct event_filter **filterp)
1744{
1745 struct event_filter *filter;
1746 struct filter_parse_state *ps = NULL;
1747 int err = 0;
1748
1749 WARN_ON_ONCE(*psp || *filterp);
1750
1751 /* allocate everything, and if any fails, free all and fail */
1752 filter = __alloc_filter();
1753 if (filter && set_str)
1754 err = replace_filter_string(filter, filter_str);
1755
1756 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1757
1758 if (!filter || !ps || err) {
1759 kfree(ps);
1760 __free_filter(filter);
1761 return -ENOMEM;
1762 }
1763
1764 /* we're committed to creating a new filter */
1765 *filterp = filter;
1766 *psp = ps;
1767
1768 parse_init(ps, filter_ops, filter_str);
1769 err = filter_parse(ps);
1770 if (err && set_str)
1771 append_filter_err(ps, filter);
1772 return err;
1773}
1774
1775static void create_filter_finish(struct filter_parse_state *ps)
1776{
1777 if (ps) {
1778 filter_opstack_clear(ps);
1779 postfix_clear(ps);
1780 kfree(ps);
1781 }
1782}
1783
1784/**
 1785 * create_filter - create a filter for an ftrace_event_call
1786 * @call: ftrace_event_call to create a filter for
1787 * @filter_str: filter string
1788 * @set_str: remember @filter_str and enable detailed error in filter
1789 * @filterp: out param for created filter (always updated on return)
1790 *
1791 * Creates a filter for @call with @filter_str. If @set_str is %true,
1792 * @filter_str is copied and recorded in the new filter.
1793 *
1794 * On success, returns 0 and *@filterp points to the new filter. On
1795 * failure, returns -errno and *@filterp may point to %NULL or to a new
1796 * filter. In the latter case, the returned filter contains error
1797 * information if @set_str is %true and the caller is responsible for
1798 * freeing it.
1799 */
1800static int create_filter(struct ftrace_event_call *call,
1801 char *filter_str, bool set_str,
1802 struct event_filter **filterp)
1803{
1804 struct event_filter *filter = NULL;
1805 struct filter_parse_state *ps = NULL;
1806 int err;
1807
1808 err = create_filter_start(filter_str, set_str, &ps, &filter);
1809 if (!err) {
1810 err = replace_preds(call, filter, ps, filter_str, false);
1811 if (err && set_str)
1812 append_filter_err(ps, filter);
1813 }
1814 create_filter_finish(ps);
1815
1816 *filterp = filter;
1817 return err;
1818}
1819
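
A usage sketch of the new helper (the filter string is illustrative). Note the
contract: *filterp is always updated, so even on error the caller ends up
owning whatever was built:

	struct event_filter *filter = NULL;
	int err;

	err = create_filter(call, "common_pid != 0", false, &filter);
	if (err)
		__free_filter(filter);	/* may be non-NULL even on failure */
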
1820/**
1821 * create_system_filter - create a filter for an event_subsystem
1822 * @system: event_subsystem to create a filter for
1823 * @filter_str: filter string
1824 * @filterp: out param for created filter (always updated on return)
1825 *
1826 * Identical to create_filter() except that it creates a subsystem filter
1827 * and always remembers @filter_str.
1828 */
1829static int create_system_filter(struct event_subsystem *system,
1830 char *filter_str, struct event_filter **filterp)
1831{
1832 struct event_filter *filter = NULL;
1833 struct filter_parse_state *ps = NULL;
1834 int err;
1835
1836 err = create_filter_start(filter_str, true, &ps, &filter);
1837 if (!err) {
1838 err = replace_system_preds(system, ps, filter_str);
1839 if (!err) {
1840 /* System filters just show a default message */
1841 kfree(filter->filter_string);
1842 filter->filter_string = NULL;
1843 } else {
1844 append_filter_err(ps, filter);
1845 }
1846 }
1847 create_filter_finish(ps);
1848
1849 *filterp = filter;
1850 return err;
1851}
1852
1741int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1853int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1742{ 1854{
1743 struct filter_parse_state *ps;
1744 struct event_filter *filter; 1855 struct event_filter *filter;
1745 struct event_filter *tmp;
1746 int err = 0; 1856 int err = 0;
1747 1857
1748 mutex_lock(&event_mutex); 1858 mutex_lock(&event_mutex);
@@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1759 goto out_unlock; 1869 goto out_unlock;
1760 } 1870 }
1761 1871
1762 err = -ENOMEM; 1872 err = create_filter(call, filter_string, true, &filter);
1763 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1764 if (!ps)
1765 goto out_unlock;
1766
1767 filter = __alloc_filter();
1768 if (!filter) {
1769 kfree(ps);
1770 goto out_unlock;
1771 }
1772
1773 replace_filter_string(filter, filter_string);
1774
1775 parse_init(ps, filter_ops, filter_string);
1776 err = filter_parse(ps);
1777 if (err) {
1778 append_filter_err(ps, filter);
1779 goto out;
1780 }
1781 1873
1782 err = replace_preds(call, filter, ps, filter_string, false);
1783 if (err) {
1784 filter_disable(call);
1785 append_filter_err(ps, filter);
1786 } else
1787 call->flags |= TRACE_EVENT_FL_FILTERED;
1788out:
1789 /* 1874 /*
1790 * Always swap the call filter with the new filter 1875 * Always swap the call filter with the new filter
1791 * even if there was an error. If there was an error 1876 * even if there was an error. If there was an error
1792 * in the filter, we disable the filter and show the error 1877 * in the filter, we disable the filter and show the error
1793 * string 1878 * string
1794 */ 1879 */
1795 tmp = call->filter; 1880 if (filter) {
1796 rcu_assign_pointer(call->filter, filter); 1881 struct event_filter *tmp = call->filter;
1797 if (tmp) { 1882
1798 /* Make sure the call is done with the filter */ 1883 if (!err)
1799 synchronize_sched(); 1884 call->flags |= TRACE_EVENT_FL_FILTERED;
1800 __free_filter(tmp); 1885 else
1886 filter_disable(call);
1887
1888 rcu_assign_pointer(call->filter, filter);
1889
1890 if (tmp) {
1891 /* Make sure the call is done with the filter */
1892 synchronize_sched();
1893 __free_filter(tmp);
1894 }
1801 } 1895 }
1802 filter_opstack_clear(ps);
1803 postfix_clear(ps);
1804 kfree(ps);
1805out_unlock: 1896out_unlock:
1806 mutex_unlock(&event_mutex); 1897 mutex_unlock(&event_mutex);
1807 1898
@@ -1811,7 +1902,6 @@ out_unlock:
1811int apply_subsystem_event_filter(struct event_subsystem *system, 1902int apply_subsystem_event_filter(struct event_subsystem *system,
1812 char *filter_string) 1903 char *filter_string)
1813{ 1904{
1814 struct filter_parse_state *ps;
1815 struct event_filter *filter; 1905 struct event_filter *filter;
1816 int err = 0; 1906 int err = 0;
1817 1907
@@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1835 goto out_unlock; 1925 goto out_unlock;
1836 } 1926 }
1837 1927
1838 err = -ENOMEM; 1928 err = create_system_filter(system, filter_string, &filter);
1839 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1929 if (filter) {
1840 if (!ps) 1930 /*
1841 goto out_unlock; 1931 * No event actually uses the system filter
1842 1932 * we can free it without synchronize_sched().
1843 filter = __alloc_filter(); 1933 */
1844 if (!filter) 1934 __free_filter(system->filter);
1845 goto out; 1935 system->filter = filter;
1846 1936 }
1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1851 /*
1852 * No event actually uses the system filter
1853 * we can free it without synchronize_sched().
1854 */
1855 __free_filter(system->filter);
1856 system->filter = filter;
1857
1858 parse_init(ps, filter_ops, filter_string);
1859 err = filter_parse(ps);
1860 if (err)
1861 goto err_filter;
1862
1863 err = replace_system_preds(system, ps, filter_string);
1864 if (err)
1865 goto err_filter;
1866
1867out:
1868 filter_opstack_clear(ps);
1869 postfix_clear(ps);
1870 kfree(ps);
1871out_unlock: 1937out_unlock:
1872 mutex_unlock(&event_mutex); 1938 mutex_unlock(&event_mutex);
1873 1939
1874 return err; 1940 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1880} 1941}
1881 1942
1882#ifdef CONFIG_PERF_EVENTS 1943#ifdef CONFIG_PERF_EVENTS
@@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1894{ 1955{
1895 int err; 1956 int err;
1896 struct event_filter *filter; 1957 struct event_filter *filter;
1897 struct filter_parse_state *ps;
1898 struct ftrace_event_call *call; 1958 struct ftrace_event_call *call;
1899 1959
1900 mutex_lock(&event_mutex); 1960 mutex_lock(&event_mutex);
@@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1909 if (event->filter) 1969 if (event->filter)
1910 goto out_unlock; 1970 goto out_unlock;
1911 1971
1912 filter = __alloc_filter(); 1972 err = create_filter(call, filter_str, false, &filter);
1913 if (!filter) {
1914 err = PTR_ERR(filter);
1915 goto out_unlock;
1916 }
1917
1918 err = -ENOMEM;
1919 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1920 if (!ps)
1921 goto free_filter;
1922
1923 parse_init(ps, filter_ops, filter_str);
1924 err = filter_parse(ps);
1925 if (err)
1926 goto free_ps;
1927
1928 err = replace_preds(call, filter, ps, filter_str, false);
1929 if (!err) 1973 if (!err)
1930 event->filter = filter; 1974 event->filter = filter;
1931 1975 else
1932free_ps:
1933 filter_opstack_clear(ps);
1934 postfix_clear(ps);
1935 kfree(ps);
1936
1937free_filter:
1938 if (err)
1939 __free_filter(filter); 1976 __free_filter(filter);
1940 1977
1941out_unlock: 1978out_unlock:
@@ -1954,43 +1991,6 @@ out_unlock:
1954#define CREATE_TRACE_POINTS 1991#define CREATE_TRACE_POINTS
1955#include "trace_events_filter_test.h" 1992#include "trace_events_filter_test.h"
1956 1993
1957static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1958 struct event_filter **pfilter)
1959{
1960 struct event_filter *filter;
1961 struct filter_parse_state *ps;
1962 int err = -ENOMEM;
1963
1964 filter = __alloc_filter();
1965 if (!filter)
1966 goto out;
1967
1968 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1969 if (!ps)
1970 goto free_filter;
1971
1972 parse_init(ps, filter_ops, filter_str);
1973 err = filter_parse(ps);
1974 if (err)
1975 goto free_ps;
1976
1977 err = replace_preds(call, filter, ps, filter_str, false);
1978 if (!err)
1979 *pfilter = filter;
1980
1981 free_ps:
1982 filter_opstack_clear(ps);
1983 postfix_clear(ps);
1984 kfree(ps);
1985
1986 free_filter:
1987 if (err)
1988 __free_filter(filter);
1989
1990 out:
1991 return err;
1992}
1993
1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ 1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1995{ \ 1995{ \
1996 .filter = FILTER, \ 1996 .filter = FILTER, \
@@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
2109 struct test_filter_data_t *d = &test_filter_data[i]; 2109 struct test_filter_data_t *d = &test_filter_data[i];
2110 int err; 2110 int err;
2111 2111
2112 err = test_get_filter(d->filter, &event_ftrace_test_filter, 2112 err = create_filter(&event_ftrace_test_filter, d->filter,
2113 &filter); 2113 false, &filter);
2114 if (err) { 2114 if (err) {
2115 printk(KERN_INFO 2115 printk(KERN_INFO
2116 "Failed to get filter for '%s', err %d\n", 2116 "Failed to get filter for '%s', err %d\n",
2117 d->filter, err); 2117 d->filter, err);
2118 __free_filter(filter);
2118 break; 2119 break;
2119 } 2120 }
2120 2121
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d97..d4545f49242e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16
17#include <asm/setup.h>
18
16#include "trace.h" 19#include "trace.h"
17 20
18#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 136static struct ftrace_ops trace_ops __read_mostly =
134{ 137{
135 .func = stack_trace_call, 138 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
137}; 139};
138 140
139static ssize_t 141static ssize_t
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = {
311 .release = seq_release, 313 .release = seq_release,
312}; 314};
313 315
316static int
317stack_trace_filter_open(struct inode *inode, struct file *file)
318{
319 return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
320 inode, file);
321}
322
323static const struct file_operations stack_trace_filter_fops = {
324 .open = stack_trace_filter_open,
325 .read = seq_read,
326 .write = ftrace_filter_write,
327 .llseek = ftrace_regex_lseek,
328 .release = ftrace_regex_release,
329};
330
314int 331int
315stack_trace_sysctl(struct ctl_table *table, int write, 332stack_trace_sysctl(struct ctl_table *table, int write,
316 void __user *buffer, size_t *lenp, 333 void __user *buffer, size_t *lenp,
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
338 return ret; 355 return ret;
339} 356}
340 357
358static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
359
341static __init int enable_stacktrace(char *str) 360static __init int enable_stacktrace(char *str)
342{ 361{
362 if (strncmp(str, "_filter=", 8) == 0)
363 strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
364
343 stack_tracer_enabled = 1; 365 stack_tracer_enabled = 1;
344 last_stack_tracer_enabled = 1; 366 last_stack_tracer_enabled = 1;
345 return 1; 367 return 1;
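
With the "_filter=" prefix check, the same __setup hook now accepts a filter
from the kernel command line, e.g. (assuming the setup string is "stacktrace",
so "stacktrace_filter=" is matched by prefix):

	stacktrace_filter=kmem_cache_*
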
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void)
358 trace_create_file("stack_trace", 0444, d_tracer, 380 trace_create_file("stack_trace", 0444, d_tracer,
359 NULL, &stack_trace_fops); 381 NULL, &stack_trace_fops);
360 382
383 trace_create_file("stack_trace_filter", 0444, d_tracer,
384 NULL, &stack_trace_filter_fops);
385
386 if (stack_trace_filter_buf[0])
387 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
388
361 if (stack_tracer_enabled) 389 if (stack_tracer_enabled)
362 register_ftrace_function(&trace_ops); 390 register_ftrace_function(&trace_ops);
363 391
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a810..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
242 242
243 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
244 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
246#ifdef CONFIG_LOCKDEP 245#ifdef CONFIG_LOCKDEP
247 struct lockdep_map lockdep_map; 246 struct lockdep_map lockdep_map;
248#endif 247#endif
248 char name[]; /* I: workqueue name */
249}; 249};
250 250
251struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
2954 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
2955} 2955}
2956 2956
2957struct workqueue_struct *__alloc_workqueue_key(const char *name, 2957struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2958 unsigned int flags, 2958 unsigned int flags,
2959 int max_active, 2959 int max_active,
2960 struct lock_class_key *key, 2960 struct lock_class_key *key,
2961 const char *lock_name) 2961 const char *lock_name, ...)
2962{ 2962{
2963 va_list args, args1;
2963 struct workqueue_struct *wq; 2964 struct workqueue_struct *wq;
2964 unsigned int cpu; 2965 unsigned int cpu;
2966 size_t namelen;
2967
2968 /* determine namelen, allocate wq and format name */
2969 va_start(args, lock_name);
2970 va_copy(args1, args);
2971 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
2972
2973 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
2974 if (!wq)
2975 goto err;
2976
2977 vsnprintf(wq->name, namelen, fmt, args1);
2978 va_end(args);
2979 va_end(args1);
2965 2980
2966 /* 2981 /*
2967 * Workqueues which may be used during memory reclaim should 2982 * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2978 flags |= WQ_HIGHPRI; 2993 flags |= WQ_HIGHPRI;
2979 2994
2980 max_active = max_active ?: WQ_DFL_ACTIVE; 2995 max_active = max_active ?: WQ_DFL_ACTIVE;
2981 max_active = wq_clamp_max_active(max_active, flags, name); 2996 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
2986 2997
2998 /* init wq */
2987 wq->flags = flags; 2999 wq->flags = flags;
2988 wq->saved_max_active = max_active; 3000 wq->saved_max_active = max_active;
2989 mutex_init(&wq->flush_mutex); 3001 mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2991 INIT_LIST_HEAD(&wq->flusher_queue); 3003 INIT_LIST_HEAD(&wq->flusher_queue);
2992 INIT_LIST_HEAD(&wq->flusher_overflow); 3004 INIT_LIST_HEAD(&wq->flusher_overflow);
2993 3005
2994 wq->name = name;
2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3006 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2996 INIT_LIST_HEAD(&wq->list); 3007 INIT_LIST_HEAD(&wq->list);
2997 3008
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
3020 if (!rescuer) 3031 if (!rescuer)
3021 goto err; 3032 goto err;
3022 3033
3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); 3034 rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3035 wq->name);
3024 if (IS_ERR(rescuer->task)) 3036 if (IS_ERR(rescuer->task))
3025 goto err; 3037 goto err;
3026 3038
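
With the name formatted into the trailing flexible array, callers of the usual
alloc_workqueue() wrapper can now pass printf-style arguments straight through
to __alloc_workqueue_key(). An illustrative call (the name and id are made up):

	struct workqueue_struct *wq;

	wq = alloc_workqueue("myhost-%d", WQ_MEM_RECLAIM, 1, host_id);
	if (!wq)
		return -ENOMEM;
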