Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/Makefile             |    2
 -rw-r--r--  kernel/auditfilter.c        |   10
 -rw-r--r--  kernel/bounds.c             |   19
 -rw-r--r--  kernel/cgroup.c             |  331
 -rw-r--r--  kernel/cgroup_debug.c       |   20
 -rw-r--r--  kernel/configs.c            |    7
 -rw-r--r--  kernel/cpu.c                |   40
 -rw-r--r--  kernel/cpuset.c             |  380
 -rw-r--r--  kernel/dma.c                |    7
 -rw-r--r--  kernel/exit.c               |   85
 -rw-r--r--  kernel/fork.c               |   54
 -rw-r--r--  kernel/hrtimer.c            |   34
 -rw-r--r--  kernel/irq/devres.c         |    1
 -rw-r--r--  kernel/irq/manage.c         |    1
 -rw-r--r--  kernel/kallsyms.c           |    6
 -rw-r--r--  kernel/kexec.c              |    3
 -rw-r--r--  kernel/kprobes.c            |  349
 -rw-r--r--  kernel/kthread.c            |    2
 -rw-r--r--  kernel/latencytop.c         |    9
 -rw-r--r--  kernel/lockdep_proc.c       |   16
 -rw-r--r--  kernel/marker.c             |    1
 -rw-r--r--  kernel/notifier.c           |   38
 -rw-r--r--  kernel/ns_cgroup.c          |    2
 -rw-r--r--  kernel/nsproxy.c            |   12
 -rw-r--r--  kernel/panic.c              |    8
 -rw-r--r--  kernel/pid_namespace.c      |    2
 -rw-r--r--  kernel/power/console.c      |   27
 -rw-r--r--  kernel/printk.c             |   26
 -rw-r--r--  kernel/profile.c            |    4
 -rw-r--r--  kernel/ptrace.c             |    7
 -rw-r--r--  kernel/rcutorture.c         |    1
 -rw-r--r--  kernel/relay.c              |   37
 -rw-r--r--  kernel/res_counter.c        |   10
 -rw-r--r--  kernel/resource.c           |   10
 -rw-r--r--  kernel/sched.c              |   62
 -rw-r--r--  kernel/sched_debug.c        |    5
 -rw-r--r--  kernel/sys.c                |   58
 -rw-r--r--  kernel/sysctl.c             |  176
 -rw-r--r--  kernel/time.c               |    1
 -rw-r--r--  kernel/time/timer_list.c    |    5
 -rw-r--r--  kernel/time/timer_stats.c   |    5
 -rw-r--r--  kernel/user.c               |   15
 -rw-r--r--  kernel/user_namespace.c     |    2
 -rw-r--r--  kernel/utsname.c            |    1
 -rw-r--r--  kernel/workqueue.c          |   24
 45 files changed, 1198 insertions, 717 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c5f081132a4..188c43223f52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o
 
-obj-$(CONFIG_SYSCTL) += sysctl_check.o
+obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9435d9392df5..0e0bd27e6512 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -267,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
 		return -EINVAL;
 
 	watch = audit_init_watch(path);
-	if (unlikely(IS_ERR(watch)))
+	if (IS_ERR(watch))
 		return PTR_ERR(watch);
 
 	audit_get_watch(watch);
@@ -851,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = audit_init_watch(path);
-	if (unlikely(IS_ERR(new))) {
+	if (IS_ERR(new)) {
 		kfree(path);
 		goto out;
 	}
@@ -992,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent,
 		audit_set_auditable(current->audit_context);
 
 	nwatch = audit_dupe_watch(owatch);
-	if (unlikely(IS_ERR(nwatch))) {
+	if (IS_ERR(nwatch)) {
 		mutex_unlock(&audit_filter_mutex);
 		audit_panic("error updating watch, skipping");
 		return;
@@ -1007,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent,
 		list_del_rcu(&oentry->list);
 
 		nentry = audit_dupe_rule(&oentry->rule, nwatch);
-		if (unlikely(IS_ERR(nentry)))
+		if (IS_ERR(nentry))
 			audit_panic("error updating watch, removing");
 		else {
 			int h = audit_hash_ino((u32)ino);
@@ -1790,7 +1790,7 @@ int audit_update_lsm_rules(void)
 		watch = entry->rule.watch;
 		tree = entry->rule.tree;
 		nentry = audit_dupe_rule(&entry->rule, watch);
-		if (unlikely(IS_ERR(nentry))) {
+		if (IS_ERR(nentry)) {
 			/* save the first error encountered for the
 			 * return value */
 			if (!err)
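The unlikely() wrappers dropped throughout this file were redundant rather than behavioral: IS_ERR() already carries the branch hint internally. A minimal sketch of the <linux/err.h> helpers this relies on (of roughly this kernel vintage; exact constants from memory, treat as an assumption):

/* Sketch of <linux/err.h>: the branch hint lives inside IS_ERR_VALUE(),
 * so wrapping IS_ERR() in another unlikely() adds nothing. */
#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	unlikely((x) >= (unsigned long)-MAX_ERRNO)

static inline long IS_ERR(const void *ptr)
{
	return IS_ERR_VALUE((unsigned long)ptr);
}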
diff --git a/kernel/bounds.c b/kernel/bounds.c
new file mode 100644
index 000000000000..3c5301381837
--- /dev/null
+++ b/kernel/bounds.c
@@ -0,0 +1,19 @@
+/*
+ * Generate definitions needed by the preprocessor.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#define __GENERATING_BOUNDS_H
+/* Include headers that define the enum constants of interest */
+#include <linux/page-flags.h>
+#include <linux/mmzone.h>
+#include <linux/kbuild.h>
+
+void foo(void)
+{
+	/* The enum constants to put into include/linux/bounds.h */
+	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
+	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+	/* End of constants */
+}
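Note that foo() above is never executed; kernel/bounds.c is only compiled to assembly, and the Kbuild rules post-process that output into the generated include/linux/bounds.h. A sketch of the helper it pulls in from <linux/kbuild.h> (the asm template shown is an assumption about this kernel version):

/* Sketch of the <linux/kbuild.h> helper used above: each DEFINE()
 * plants a marker such as "->NR_PAGEFLAGS $23 __NR_PAGEFLAGS" in the
 * compiler's asm output, which a sed rule rewrites into
 * "#define NR_PAGEFLAGS 23" inside the generated bounds.h. */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))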
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d8de051382b..b9d467d83fc1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
 #include <linux/kmod.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
+#include <linux/hash.h>
 
 #include <asm/atomic.h>
 
@@ -118,17 +119,7 @@ static int root_count;
  * be called.
  */
 static int need_forkexit_callback;
-
-/* bits in struct cgroup flags field */
-enum {
-	/* Control Group is dead */
-	CGRP_REMOVED,
-	/* Control Group has previously had a child cgroup or a task,
-	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
-	CGRP_RELEASABLE,
-	/* Control Group requires release notifications to userspace */
-	CGRP_NOTIFY_ON_RELEASE,
-};
+static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link;
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
+/* hash table for cgroup groups. This improves the performance to
+ * find an existing css_set */
+#define CSS_SET_HASH_BITS	7
+#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
+static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+
+static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+{
+	int i;
+	int index;
+	unsigned long tmp = 0UL;
+
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		tmp += (unsigned long)css[i];
+	tmp = (tmp >> 16) ^ tmp;
+
+	index = hash_long(tmp, CSS_SET_HASH_BITS);
+
+	return &css_set_table[index];
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -230,7 +242,7 @@ static int use_task_css_set_links;
 static void unlink_css_set(struct css_set *cg)
 {
 	write_lock(&css_set_lock);
-	list_del(&cg->list);
+	hlist_del(&cg->hlist);
 	css_set_count--;
 	while (!list_empty(&cg->cg_links)) {
 		struct cg_cgroup_link *link;
@@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
- * css_set is suitable. This currently walks a linked-list for
- * simplicity; a later patch will use a hash table for better
- * performance
+ * css_set is suitable.
  *
  * oldcg: the cgroup group that we're using before the cgroup
  * transition
@@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set(
 {
 	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct list_head *l = &init_css_set.list;
+	struct hlist_head *hhead;
+	struct hlist_node *node;
+	struct css_set *cg;
 
 	/* Built the set of subsystem state objects that we want to
 	 * see in the new css_set */
@@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set(
 		}
 	}
 
-	/* Look through existing cgroup groups to find one to reuse */
-	do {
-		struct css_set *cg =
-			list_entry(l, struct css_set, list);
-
+	hhead = css_set_hash(template);
+	hlist_for_each_entry(cg, node, hhead, hlist) {
 		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
 			/* All subsystems matched */
 			return cg;
 		}
-		/* Try the next cgroup group */
-		l = l->next;
-	} while (l != &init_css_set.list);
+	}
 
 	/* No existing cgroup group matched */
 	return NULL;
@@ -404,6 +411,8 @@ static struct css_set *find_css_set(
 	struct list_head tmp_cg_links;
 	struct cg_cgroup_link *link;
 
+	struct hlist_head *hhead;
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	write_lock(&css_set_lock);
@@ -428,6 +437,7 @@ static struct css_set *find_css_set(
 	kref_init(&res->ref);
 	INIT_LIST_HEAD(&res->cg_links);
 	INIT_LIST_HEAD(&res->tasks);
+	INIT_HLIST_NODE(&res->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -467,9 +477,12 @@ static struct css_set *find_css_set(
 
 	BUG_ON(!list_empty(&tmp_cg_links));
 
-	/* Link this cgroup group into the list */
-	list_add(&res->list, &init_css_set.list);
 	css_set_count++;
+
+	/* Add this cgroup group to the hash table */
+	hhead = css_set_hash(res->subsys);
+	hlist_add_head(&res->hlist, hhead);
+
 	write_unlock(&css_set_lock);
 
 	return res;
@@ -948,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *root;
-	struct list_head tmp_cg_links, *l;
+	struct list_head tmp_cg_links;
 	INIT_LIST_HEAD(&tmp_cg_links);
 
 	/* First find the desired set of subsystems */
@@ -990,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		/* New superblock */
 		struct cgroup *cgrp = &root->top_cgroup;
 		struct inode *inode;
+		int i;
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -1034,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
-		l = &init_css_set.list;
-		do {
+		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+			struct hlist_head *hhead = &css_set_table[i];
+			struct hlist_node *node;
 			struct css_set *cg;
-			struct cg_cgroup_link *link;
-			cg = list_entry(l, struct css_set, list);
-			BUG_ON(list_empty(&tmp_cg_links));
-			link = list_entry(tmp_cg_links.next,
-					  struct cg_cgroup_link,
-					  cgrp_link_list);
-			list_del(&link->cgrp_link_list);
-			link->cg = cg;
-			list_add(&link->cgrp_link_list,
-				 &root->top_cgroup.css_sets);
-			list_add(&link->cg_link_list, &cg->cg_links);
-			l = l->next;
-		} while (l != &init_css_set.list);
+
+			hlist_for_each_entry(cg, node, hhead, hlist) {
+				struct cg_cgroup_link *link;
+
+				BUG_ON(list_empty(&tmp_cg_links));
+				link = list_entry(tmp_cg_links.next,
+						  struct cg_cgroup_link,
+						  cgrp_link_list);
+				list_del(&link->cgrp_link_list);
+				link->cg = cg;
+				list_add(&link->cgrp_link_list,
+					 &root->top_cgroup.css_sets);
+				list_add(&link->cg_link_list, &cg->cg_links);
+			}
+		}
 		write_unlock(&css_set_lock);
 
 		free_cg_links(&tmp_cg_links);
@@ -1307,18 +1324,16 @@ enum cgroup_filetype {
 	FILE_DIR,
 	FILE_TASKLIST,
 	FILE_NOTIFY_ON_RELEASE,
-	FILE_RELEASABLE,
 	FILE_RELEASE_AGENT,
 };
 
-static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 				 struct file *file,
 				 const char __user *userbuf,
 				 size_t nbytes, loff_t *unused_ppos)
 {
 	char buffer[64];
 	int retval = 0;
-	u64 val;
 	char *end;
 
 	if (!nbytes)
@@ -1329,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
 		return -EFAULT;
 
 	buffer[nbytes] = 0;	/* nul-terminate */
-
-	/* strip newline if necessary */
-	if (nbytes && (buffer[nbytes-1] == '\n'))
-		buffer[nbytes-1] = 0;
-	val = simple_strtoull(buffer, &end, 0);
-	if (*end)
-		return -EINVAL;
-
-	/* Pass to subsystem */
-	retval = cft->write_uint(cgrp, cft, val);
+	strstrip(buffer);
+	if (cft->write_u64) {
+		u64 val = simple_strtoull(buffer, &end, 0);
+		if (*end)
+			return -EINVAL;
+		retval = cft->write_u64(cgrp, cft, val);
+	} else {
+		s64 val = simple_strtoll(buffer, &end, 0);
+		if (*end)
+			return -EINVAL;
+		retval = cft->write_s64(cgrp, cft, val);
+	}
 	if (!retval)
 		retval = nbytes;
 	return retval;
@@ -1419,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
-	if (cft->write_uint)
-		return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->write_u64 || cft->write_s64)
+		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->trigger) {
+		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+		return ret ? ret : nbytes;
+	}
 	return -EINVAL;
 }
 
-static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
 			       struct file *file,
 			       char __user *buf, size_t nbytes,
 			       loff_t *ppos)
 {
 	char tmp[64];
-	u64 val = cft->read_uint(cgrp, cft);
+	u64 val = cft->read_u64(cgrp, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
+static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
+			       struct file *file,
+			       char __user *buf, size_t nbytes,
+			       loff_t *ppos)
+{
+	char tmp[64];
+	s64 val = cft->read_s64(cgrp, cft);
+	int len = sprintf(tmp, "%lld\n", (long long) val);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
 static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
 				       struct cftype *cft,
 				       struct file *file,
@@ -1490,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 
 	if (cft->read)
 		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
-	if (cft->read_uint)
-		return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->read_u64)
+		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->read_s64)
+		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+	struct cftype *cft;
+	struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+	struct seq_file *sf = cb->state;
+	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct cgroup_seqfile_state *state = m->private;
+	struct cftype *cft = state->cft;
+	if (cft->read_map) {
+		struct cgroup_map_cb cb = {
+			.fill = cgroup_map_add,
+			.state = m,
+		};
+		return cft->read_map(state->cgroup, cft, &cb);
+	}
+	return cft->read_seq_string(state->cgroup, cft, m);
+}
+
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	kfree(seq->private);
+	return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = cgroup_seqfile_release,
+};
+
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
 	int err;
@@ -1507,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	cft = __d_cft(file->f_dentry);
 	if (!cft)
 		return -ENODEV;
-	if (cft->open)
+	if (cft->read_map || cft->read_seq_string) {
+		struct cgroup_seqfile_state *state =
+			kzalloc(sizeof(*state), GFP_USER);
+		if (!state)
+			return -ENOMEM;
+		state->cft = cft;
+		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+		file->f_op = &cgroup_seqfile_operations;
+		err = single_open(file, cgroup_seqfile_show, state);
+		if (err < 0)
+			kfree(state);
+	} else if (cft->open)
 		err = cft->open(inode, file);
 	else
 		err = 0;
@@ -1715,7 +1804,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
  * The tasklist_lock is not held here, as do_each_thread() and
  * while_each_thread() are protected by RCU.
  */
-void cgroup_enable_task_cg_lists(void)
+static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 	write_lock(&css_set_lock);
@@ -1913,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 
 	if (heap->size) {
 		for (i = 0; i < heap->size; i++) {
-			struct task_struct *p = heap->ptrs[i];
+			struct task_struct *q = heap->ptrs[i];
 			if (i == 0) {
-				latest_time = p->start_time;
-				latest_task = p;
+				latest_time = q->start_time;
+				latest_task = q;
 			}
 			/* Process the task per the caller's callback */
-			scan->process_task(p, scan);
-			put_task_struct(p);
+			scan->process_task(q, scan);
+			put_task_struct(q);
 		}
 		/*
 		 * If we had to process any tasks at all, scan again
@@ -2138,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
 	return notify_on_release(cgrp);
 }
 
-static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
-{
-	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -2158,16 +2242,10 @@ static struct cftype files[] = {
 
 	{
 		.name = "notify_on_release",
-		.read_uint = cgroup_read_notify_on_release,
+		.read_u64 = cgroup_read_notify_on_release,
 		.write = cgroup_common_file_write,
 		.private = FILE_NOTIFY_ON_RELEASE,
 	},
-
-	{
-		.name = "releasable",
-		.read_uint = cgroup_read_releasable,
-		.private = FILE_RELEASABLE,
-	}
 };
 
 static struct cftype cft_release_agent = {
@@ -2401,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	return 0;
 }
 
-static void cgroup_init_subsys(struct cgroup_subsys *ss)
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
-	struct list_head *l;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
@@ -2415,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(IS_ERR(css));
 	init_cgroup_css(css, ss, dummytop);
 
-	/* Update all cgroup groups to contain a subsys
+	/* Update the init_css_set to contain a subsys
 	 * pointer to this state - since the subsystem is
-	 * newly registered, all tasks and hence all cgroup
-	 * groups are in the subsystem's top cgroup. */
-	write_lock(&css_set_lock);
-	l = &init_css_set.list;
-	do {
-		struct css_set *cg =
-			list_entry(l, struct css_set, list);
-		cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
-		l = l->next;
-	} while (l != &init_css_set.list);
-	write_unlock(&css_set_lock);
-
-	/* If this subsystem requested that it be notified with fork
-	 * events, we should send it one now for every process in the
-	 * system */
-	if (ss->fork) {
-		struct task_struct *g, *p;
-
-		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
-			ss->fork(ss, p);
-		} while_each_thread(g, p);
-		read_unlock(&tasklist_lock);
-	}
+	 * newly registered, all tasks and hence the
+	 * init_css_set is in the subsystem's top cgroup. */
+	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
 	need_forkexit_callback |= ss->fork || ss->exit;
+	need_mm_owner_callback |= !!ss->mm_owner_changed;
+
+	/* At system boot, before all subsystems have been
+	 * registered, no tasks have been forked, so we don't
+	 * need to invoke fork callbacks here. */
+	BUG_ON(!list_empty(&init_task.tasks));
 
 	ss->active = 1;
 }
@@ -2458,9 +2520,9 @@ int __init cgroup_init_early(void)
 	int i;
 	kref_init(&init_css_set.ref);
 	kref_get(&init_css_set.ref);
-	INIT_LIST_HEAD(&init_css_set.list);
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
+	INIT_HLIST_NODE(&init_css_set.hlist);
 	css_set_count = 1;
 	init_cgroup_root(&rootnode);
 	list_add(&rootnode.root_list, &roots);
@@ -2473,6 +2535,9 @@ int __init cgroup_init_early(void)
 	list_add(&init_css_set_link.cg_link_list,
 		 &init_css_set.cg_links);
 
+	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
+		INIT_HLIST_HEAD(&css_set_table[i]);
+
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 
@@ -2502,7 +2567,7 @@ int __init cgroup_init(void)
 {
 	int err;
 	int i;
-	struct proc_dir_entry *entry;
+	struct hlist_head *hhead;
 
 	err = bdi_init(&cgroup_backing_dev_info);
 	if (err)
@@ -2514,13 +2579,15 @@ int __init cgroup_init(void)
 		cgroup_init_subsys(ss);
 	}
 
+	/* Add init_css_set to the hash table */
+	hhead = css_set_hash(init_css_set.subsys);
+	hlist_add_head(&init_css_set.hlist, hhead);
+
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0)
 		goto out;
 
-	entry = create_proc_entry("cgroups", 0, NULL);
-	if (entry)
-		entry->proc_fops = &proc_cgroupstats_operations;
+	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
 
 out:
 	if (err)
@@ -2683,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
 	}
 }
 
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @p: the new owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not
+ * invoke this routine, since it assigns the mm->owner the first time
+ * and does not change it.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+	struct cgroup *oldcgrp, *newcgrp;
+
+	if (need_mm_owner_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			oldcgrp = task_cgroup(old, ss->subsys_id);
+			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (oldcgrp == newcgrp)
+				continue;
+			if (ss->mm_owner_changed)
+				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+		}
+	}
+}
+#endif /* CONFIG_MM_OWNER */
+
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
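Taken together, the cgroup.c changes above replace the css_set linked list with a 128-bucket hash table and split the old read_uint/write_uint control-file hooks into typed read_u64/read_s64/write_u64/write_s64 handlers (plus seq_file-backed read_map/read_seq_string). A hedged sketch of how a subsystem would expose a boolean file through the new typed hooks; the "example" names are invented for illustration and are not part of this patch:

/* Hypothetical subsystem control file using the typed handlers above. */
static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* report the current flag value */
}

static int example_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	if (val > 1)
		return -EINVAL;	/* cgroup_write_X64() has already parsed val */
	/* ... update subsystem state ... */
	return 0;
}

static struct cftype cft_example = {
	.name = "example_flag",
	.read_u64 = example_read_u64,
	.write_u64 = example_write_u64,
};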
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
 /*
- * kernel/ccontainer_debug.c - Example cgroup subsystem that
+ * kernel/cgroup_debug.c - Example cgroup subsystem that
  * exposes debug info
  *
  * Copyright (C) Google Inc, 2007
@@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 	return count;
 }
 
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
 static struct cftype files[] = {
 	{
 		.name = "cgroup_refcount",
-		.read_uint = cgroup_refcount_read,
+		.read_u64 = cgroup_refcount_read,
 	},
 	{
 		.name = "taskcount",
-		.read_uint = taskcount_read,
+		.read_u64 = taskcount_read,
 	},
 
 	{
 		.name = "current_css_set",
-		.read_uint = current_css_set_read,
+		.read_u64 = current_css_set_read,
 	},
 
 	{
 		.name = "current_css_set_refcount",
-		.read_uint = current_css_set_refcount_read,
+		.read_u64 = current_css_set_refcount_read,
 	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	}
 };
 
 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,12 +79,11 @@ static int __init ikconfig_init(void)
 	struct proc_dir_entry *entry;
 
 	/* create the current config file */
-	entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
-				  &proc_root);
+	entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
+			    &ikconfig_file_ops);
 	if (!entry)
 		return -ENOMEM;
 
-	entry->proc_fops = &ikconfig_file_ops;
 	entry->size = kernel_config_data_size;
 
 	return 0;
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
 
 static void __exit ikconfig_cleanup(void)
 {
-	remove_proc_entry("config.gz", &proc_root);
+	remove_proc_entry("config.gz", NULL);
 }
 
 module_init(ikconfig_init);
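The configs.c conversion is the same proc_create() pattern used for /proc/cgroups above: create_proc_entry() returned an entry whose ->proc_fops the caller filled in afterwards, leaving a window in which the file could be opened with no fops, whereas proc_create() takes the file_operations up front. A sketch of the two idioms side by side (the race rationale is the usual one, not wording from this patch):

/* Old, racy idiom: the entry is visible before ->proc_fops is set. */
entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL);
if (entry)
	entry->proc_fops = &ikconfig_file_ops;	/* an open() can win this race */

/* New idiom: the fops are registered atomically with the entry. */
entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, &ikconfig_file_ops);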
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2011ad8d2697..a98f6ab16ecd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
 	 * an ongoing cpu hotplug operation.
 	 */
 	int refcount;
-	wait_queue_head_t writer_queue;
 } cpu_hotplug;
 
-#define writer_exists() (cpu_hotplug.active_writer != NULL)
-
 void __init cpu_hotplug_init(void)
 {
 	cpu_hotplug.active_writer = NULL;
 	mutex_init(&cpu_hotplug.lock);
 	cpu_hotplug.refcount = 0;
-	init_waitqueue_head(&cpu_hotplug.writer_queue);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
 	if (cpu_hotplug.active_writer == current)
 		return;
 	mutex_lock(&cpu_hotplug.lock);
-	cpu_hotplug.refcount--;
-
-	if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
-		wake_up(&cpu_hotplug.writer_queue);
-
+	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
+		wake_up_process(cpu_hotplug.active_writer);
 	mutex_unlock(&cpu_hotplug.lock);
 
 }
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
  * Note that during a cpu-hotplug operation, the new readers, if any,
  * will be blocked by the cpu_hotplug.lock
  *
- * Since cpu_maps_update_begin is always called after invoking
- * cpu_maps_update_begin, we can be sure that only one writer is active.
+ * Since cpu_hotplug_begin() is always called after invoking
+ * cpu_maps_update_begin(), we can be sure that only one writer is active.
  *
  * Note that theoretically, there is a possibility of a livelock:
  * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
  */
 static void cpu_hotplug_begin(void)
 {
-	DECLARE_WAITQUEUE(wait, current);
-
-	mutex_lock(&cpu_hotplug.lock);
-
 	cpu_hotplug.active_writer = current;
-	add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
-	while (cpu_hotplug.refcount) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+
+	for (;;) {
+		mutex_lock(&cpu_hotplug.lock);
+		if (likely(!cpu_hotplug.refcount))
+			break;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
-		mutex_lock(&cpu_hotplug.lock);
 	}
-	remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
 }
 
 static void cpu_hotplug_done(void)
@@ -136,7 +126,7 @@ static void cpu_hotplug_done(void)
 	mutex_unlock(&cpu_hotplug.lock);
 }
 /* Need to know about CPUs going up/down? */
-int __cpuinit register_cpu_notifier(struct notifier_block *nb)
+int __ref register_cpu_notifier(struct notifier_block *nb)
 {
 	int ret;
 	cpu_maps_update_begin();
@@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 
 EXPORT_SYMBOL(register_cpu_notifier);
 
-void unregister_cpu_notifier(struct notifier_block *nb)
+void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {
 	cpu_maps_update_begin();
 	raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -180,7 +170,7 @@ struct take_cpu_down_param {
 };
 
 /* Take this CPU down. */
-static int take_cpu_down(void *_param)
+static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
 	int err;
@@ -199,7 +189,7 @@ static int take_cpu_down(void *_param)
 }
 
 /* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu, int tasks_frozen)
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
 	struct task_struct *p;
@@ -274,7 +264,7 @@ out_release:
 	return err;
 }
 
-int cpu_down(unsigned int cpu)
+int __ref cpu_down(unsigned int cpu)
 {
 	int err = 0;
 
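The rewritten cpu_hotplug_begin()/put_online_cpus() pair above replaces the waitqueue with a bare wake_up_process() on the recorded active_writer. The ordering is what makes this safe: the writer sets its task state to TASK_UNINTERRUPTIBLE while still holding cpu_hotplug.lock, so a reader's wakeup issued after the unlock but before schedule() is not lost. The generic shape of the idiom, as a sketch (names abbreviated; not a drop-in for the kernel code):

/* Lost-wakeup-free sleep loop, as used by cpu_hotplug_begin(): setting
 * the state under the lock means a wake_up_process() that lands between
 * mutex_unlock() and schedule() flips the task back to runnable, and
 * schedule() returns immediately instead of sleeping forever. */
for (;;) {
	mutex_lock(&lock);
	if (refcount == 0)
		break;			/* exit with the lock held */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	mutex_unlock(&lock);
	schedule();			/* until the last reader wakes us */
}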
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8b35fbd8292f..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEM_HARDWALL,
 	CS_MEMORY_MIGRATE,
 	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
 	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+	return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
 static inline int is_sched_load_balance(const struct cpuset *cs)
 {
 	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -735,7 +741,8 @@ static inline int started_after(void *p1, void *p2)
  * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
  * words, if its mask is not equal to its cpuset's mask).
  */
-int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+static int cpuset_test_cpumask(struct task_struct *tsk,
+			       struct cgroup_scanner *scan)
 {
 	return !cpus_equal(tsk->cpus_allowed,
 			(cgroup_cs(scan->cg))->cpus_allowed);
@@ -752,7 +759,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
  * We don't need to re-check for the cgroup/cpuset membership, since we're
  * holding cgroup_lock() at this point.
  */
-void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+static void cpuset_change_cpumask(struct task_struct *tsk,
+				  struct cgroup_scanner *scan)
 {
 	set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
 }
@@ -941,7 +949,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
-	cpuset_being_rebound = cs;		/* causes mpol_copy() rebind */
+	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
 	fudge = 10;				/* spare mmarray[] slots */
 	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
@@ -992,7 +1000,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	 * rebind the vma mempolicies of each mm in mmarray[] to their
 	 * new cpuset, and release that mm. The mpol_rebind_mm()
 	 * call takes mmap_sem, which we couldn't take while holding
-	 * tasklist_lock. Forks can happen again now - the mpol_copy()
+	 * tasklist_lock. Forks can happen again now - the mpol_dup()
 	 * cpuset_being_rebound check will catch such forks, and rebind
 	 * their vma mempolicies too. Because we still hold the global
 	 * cgroup_mutex, we know that no other rebind effort will
@@ -1023,19 +1031,6 @@ int current_cpuset_is_being_rebound(void)
 	return task_cs(current) == cpuset_being_rebound;
 }
 
-/*
- * Call with cgroup_mutex held.
- */
-
-static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
-{
-	if (simple_strtoul(buf, NULL, 10) != 0)
-		cpuset_memory_pressure_enabled = 1;
-	else
-		cpuset_memory_pressure_enabled = 0;
-	return 0;
-}
-
 static int update_relax_domain_level(struct cpuset *cs, char *buf)
 {
 	int val = simple_strtol(buf, NULL, 10);
@@ -1053,25 +1048,20 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
 
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit:		the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_SCHED_LOAD_BALANCE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
- *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
- * cs:		the cpuset to update
- * buf:		the buffer where we read the 0 or 1
+ * bit:		the bit to update (see cpuset_flagbits_t)
+ * cs:		the cpuset to update
+ * turning_on:	whether the flag is being set or cleared
  *
  * Call with cgroup_mutex held.
  */
 
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+		       int turning_on)
 {
-	int turning_on;
 	struct cpuset trialcs;
 	int err;
 	int cpus_nonempty, balance_flag_changed;
 
-	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
-
 	trialcs = *cs;
 	if (turning_on)
 		set_bit(bit, &trialcs.flags);
@@ -1241,6 +1231,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
+	FILE_MEM_HARDWALL,
 	FILE_SCHED_LOAD_BALANCE,
 	FILE_SCHED_RELAX_DOMAIN_LEVEL,
 	FILE_MEMORY_PRESSURE_ENABLED,
@@ -1265,7 +1256,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
-	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
+	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+	if (!buffer)
 		return -ENOMEM;
 
 	if (copy_from_user(buffer, userbuf, nbytes)) {
@@ -1288,46 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, buffer);
 		break;
+	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+		retval = update_relax_domain_level(cs, buffer);
+		break;
+	default:
+		retval = -EINVAL;
+		goto out2;
+	}
+
+	if (retval == 0)
+		retval = nbytes;
+out2:
+	cgroup_unlock();
+out1:
+	kfree(buffer);
+	return retval;
+}
+
+static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	int retval = 0;
+	struct cpuset *cs = cgroup_cs(cgrp);
+	cpuset_filetype_t type = cft->private;
+
+	cgroup_lock();
+
+	if (cgroup_is_removed(cgrp)) {
+		cgroup_unlock();
+		return -ENODEV;
+	}
+
+	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
-		retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
 		break;
 	case FILE_MEM_EXCLUSIVE:
-		retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
 		break;
-	case FILE_SCHED_LOAD_BALANCE:
-		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+	case FILE_MEM_HARDWALL:
+		retval = update_flag(CS_MEM_HARDWALL, cs, val);
 		break;
-	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
-		retval = update_relax_domain_level(cs, buffer);
+	case FILE_SCHED_LOAD_BALANCE:
+		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
 		break;
 	case FILE_MEMORY_MIGRATE:
-		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
 		break;
 	case FILE_MEMORY_PRESSURE_ENABLED:
-		retval = update_memory_pressure_enabled(cs, buffer);
+		cpuset_memory_pressure_enabled = !!val;
 		break;
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
 	case FILE_SPREAD_PAGE:
-		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		retval = update_flag(CS_SPREAD_PAGE, cs, val);
 		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	case FILE_SPREAD_SLAB:
-		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		retval = update_flag(CS_SPREAD_SLAB, cs, val);
 		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	default:
 		retval = -EINVAL;
-		goto out2;
+		break;
 	}
-
-	if (retval == 0)
-		retval = nbytes;
-out2:
 	cgroup_unlock();
-out1:
-	kfree(buffer);
 	return retval;
 }
 
@@ -1389,33 +1406,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
 	case FILE_MEMLIST:
 		s += cpuset_sprintf_memlist(s, cs);
 		break;
-	case FILE_CPU_EXCLUSIVE:
-		*s++ = is_cpu_exclusive(cs) ? '1' : '0';
-		break;
-	case FILE_MEM_EXCLUSIVE:
-		*s++ = is_mem_exclusive(cs) ? '1' : '0';
-		break;
-	case FILE_SCHED_LOAD_BALANCE:
-		*s++ = is_sched_load_balance(cs) ? '1' : '0';
-		break;
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		s += sprintf(s, "%d", cs->relax_domain_level);
 		break;
-	case FILE_MEMORY_MIGRATE:
-		*s++ = is_memory_migrate(cs) ? '1' : '0';
-		break;
-	case FILE_MEMORY_PRESSURE_ENABLED:
-		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
-		break;
-	case FILE_MEMORY_PRESSURE:
-		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
-		break;
-	case FILE_SPREAD_PAGE:
-		*s++ = is_spread_page(cs) ? '1' : '0';
-		break;
-	case FILE_SPREAD_SLAB:
-		*s++ = is_spread_slab(cs) ? '1' : '0';
-		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1428,121 +1421,137 @@ out:
 	return retval;
 }
 
-
-
+static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+	cpuset_filetype_t type = cft->private;
+	switch (type) {
+	case FILE_CPU_EXCLUSIVE:
+		return is_cpu_exclusive(cs);
+	case FILE_MEM_EXCLUSIVE:
+		return is_mem_exclusive(cs);
+	case FILE_MEM_HARDWALL:
+		return is_mem_hardwall(cs);
+	case FILE_SCHED_LOAD_BALANCE:
+		return is_sched_load_balance(cs);
+	case FILE_MEMORY_MIGRATE:
+		return is_memory_migrate(cs);
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		return cpuset_memory_pressure_enabled;
+	case FILE_MEMORY_PRESSURE:
+		return fmeter_getrate(&cs->fmeter);
+	case FILE_SPREAD_PAGE:
+		return is_spread_page(cs);
+	case FILE_SPREAD_SLAB:
+		return is_spread_slab(cs);
+	default:
+		BUG();
+	}
+}
 
 
 /*
  * for the common functions, 'private' gives the type of file
  */
 
-static struct cftype cft_cpus = {
-	.name = "cpus",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_CPULIST,
-};
-
-static struct cftype cft_mems = {
-	.name = "mems",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_MEMLIST,
-};
-
-static struct cftype cft_cpu_exclusive = {
-	.name = "cpu_exclusive",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_CPU_EXCLUSIVE,
-};
-
-static struct cftype cft_mem_exclusive = {
-	.name = "mem_exclusive",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_MEM_EXCLUSIVE,
-};
-
-static struct cftype cft_sched_load_balance = {
-	.name = "sched_load_balance",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_SCHED_LOAD_BALANCE,
-};
-
-static struct cftype cft_sched_relax_domain_level = {
-	.name = "sched_relax_domain_level",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
-};
-
-static struct cftype cft_memory_migrate = {
-	.name = "memory_migrate",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_MEMORY_MIGRATE,
+static struct cftype files[] = {
+	{
+		.name = "cpus",
+		.read = cpuset_common_file_read,
+		.write = cpuset_common_file_write,
+		.private = FILE_CPULIST,
+	},
+
+	{
+		.name = "mems",
+		.read = cpuset_common_file_read,
+		.write = cpuset_common_file_write,
+		.private = FILE_MEMLIST,
+	},
+
+	{
+		.name = "cpu_exclusive",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_CPU_EXCLUSIVE,
+	},
+
+	{
+		.name = "mem_exclusive",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEM_EXCLUSIVE,
+	},
+
+	{
+		.name = "mem_hardwall",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEM_HARDWALL,
+	},
+
+	{
+		.name = "sched_load_balance",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SCHED_LOAD_BALANCE,
+	},
+
+	{
+		.name = "sched_relax_domain_level",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+	},
+
+	{
+		.name = "memory_migrate",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_MIGRATE,
+	},
+
+	{
+		.name = "memory_pressure",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_PRESSURE,
+	},
+
+	{
+		.name = "memory_spread_page",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SPREAD_PAGE,
+	},
+
+	{
+		.name = "memory_spread_slab",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SPREAD_SLAB,
+	},
 };
 
 static struct cftype cft_memory_pressure_enabled = {
 	.name = "memory_pressure_enabled",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_MEMORY_PRESSURE_ENABLED,
 };
 
-static struct cftype cft_memory_pressure = {
-	.name = "memory_pressure",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_MEMORY_PRESSURE,
-};
-
-static struct cftype cft_spread_page = {
-	.name = "memory_spread_page",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_SPREAD_PAGE,
-};
-
-static struct cftype cft_spread_slab = {
-	.name = "memory_spread_slab",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_SPREAD_SLAB,
-};
-
 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	int err;
 
-	if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss,
-					&cft_sched_relax_domain_level)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
+	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+	if (err)
 		return err;
 	/* memory_pressure_enabled is in root cpuset only */
-	if (err == 0 && !cont->parent)
+	if (!cont->parent)
 		err = cgroup_add_file(cont, ss,
-					&cft_memory_pressure_enabled);
-	return 0;
+				      &cft_memory_pressure_enabled);
+	return err;
 }
 
 /*
@@ -1642,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 	cpuset_update_task_memory_state();
 
 	if (is_sched_load_balance(cs))
-		update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
+		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
 	number_of_cpusets--;
 	kfree(cs);
@@ -1707,7 +1716,8 @@ int __init cpuset_init(void)
  * Called by cgroup_scan_tasks() for each task in a cgroup.
  * Return nonzero to stop the walk through the tasks.
  */
-void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+static void cpuset_do_move_task(struct task_struct *tsk,
+				struct cgroup_scanner *scan)
 {
 	struct cpuset_hotplug_scanner *chsp;
 
@@ -1958,33 +1968,25 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 }
 
 /**
- * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
- * @zl: the zonelist to be checked
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * @nodemask: the nodemask to be checked
  *
- * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
  */
-int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
-	int i;
-
-	for (i = 0; zl->zones[i]; i++) {
-		int nid = zone_to_nid(zl->zones[i]);
-
-		if (node_isset(nid, current->mems_allowed))
-			return 1;
-	}
-	return 0;
+	return nodes_intersects(*nodemask, current->mems_allowed);
 }
 
 /*
- * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset. Call holding callback_mutex.
- * If no ancestor is mem_exclusive (an unusual configuration), then
- * returns the root cpuset.
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset. Call holding
+ * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
 */
-static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-	while (!is_mem_exclusive(cs) && cs->parent)
+	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
 		cs = cs->parent;
 	return cs;
 }
@@ -1998,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1998 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2000 * __GFP_THISNODE is set, yes, we can always allocate. If zone
1999 * z's node is in our task's mems_allowed, yes. If it's not a 2001 * z's node is in our task's mems_allowed, yes. If it's not a
2000 * __GFP_HARDWALL request and this zone's node is in the nearest 2002 * __GFP_HARDWALL request and this zone's node is in the nearest
2001 * mem_exclusive cpuset ancestor to this task's cpuset, yes. 2003 * hardwalled cpuset ancestor to this task's cpuset, yes.
2002 * If the task has been OOM killed and has access to memory reserves 2004 * If the task has been OOM killed and has access to memory reserves
2003 * as specified by the TIF_MEMDIE flag, yes. 2005 * as specified by the TIF_MEMDIE flag, yes.
2004 * Otherwise, no. 2006 * Otherwise, no.
@@ -2021,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2021 * and do not allow allocations outside the current task's cpuset 2023 * and do not allow allocations outside the current task's cpuset
2022 * unless the task has been OOM killed and is marked TIF_MEMDIE. 2024 * unless the task has been OOM killed and is marked TIF_MEMDIE.
2023 * GFP_KERNEL allocations are not so marked, so can escape to the 2025 * GFP_KERNEL allocations are not so marked, so can escape to the
2024 * nearest enclosing mem_exclusive ancestor cpuset. 2026 * nearest enclosing hardwalled ancestor cpuset.
2025 * 2027 *
2026 * Scanning up parent cpusets requires callback_mutex. The 2028 * Scanning up parent cpusets requires callback_mutex. The
2027 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2029 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2044,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2044 * in_interrupt - any node ok (current task context irrelevant) 2046 * in_interrupt - any node ok (current task context irrelevant)
2045 * GFP_ATOMIC - any node ok 2047 * GFP_ATOMIC - any node ok
2046 * TIF_MEMDIE - any node ok 2048 * TIF_MEMDIE - any node ok
2047 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2049 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2048 * GFP_USER - only nodes in current tasks mems allowed ok. 2050 * GFP_USER - only nodes in current tasks mems allowed ok.
2049 * 2051 *
2050 * Rule: 2052 * Rule:
@@ -2081,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2081 mutex_lock(&callback_mutex); 2083 mutex_lock(&callback_mutex);
2082 2084
2083 task_lock(current); 2085 task_lock(current);
2084 cs = nearest_exclusive_ancestor(task_cs(current)); 2086 cs = nearest_hardwall_ancestor(task_cs(current));
2085 task_unlock(current); 2087 task_unlock(current);
2086 2088
2087 allowed = node_isset(node, cs->mems_allowed); 2089 allowed = node_isset(node, cs->mems_allowed);
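
A note on the cpuset_populate() rewrite above: the ten error-checked cgroup_add_file() calls collapse into one cgroup_add_files() over an array named files, which is defined outside this hunk, and only the root cpuset additionally gets memory_pressure_enabled. The sketch below shows the assumed shape of that array, reusing two of the cftype definitions kept above; it is a sketch against the 2.6.25-era signature cgroup_add_files(cgrp, subsys, const struct cftype cft[], int count), not the commit's verbatim code.

    /* Hedged sketch -- the real `files' array lives outside this hunk. */
    static struct cftype files[] = {
            {
                    .name = "memory_spread_page",
                    .read = cpuset_common_file_read,
                    .write = cpuset_common_file_write,
                    .private = FILE_SPREAD_PAGE,
            },
            {
                    .name = "memory_spread_slab",
                    .read = cpuset_common_file_read,
                    .write = cpuset_common_file_write,
                    .private = FILE_SPREAD_SLAB,
            },
            /* ... one entry per cpuset control file ... */
    };

after which the single cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)) call in the hunk registers them all with one error check.
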
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
149 149
150static int __init proc_dma_init(void) 150static int __init proc_dma_init(void)
151{ 151{
152 struct proc_dir_entry *e; 152 proc_create("dma", 0, NULL, &proc_dma_operations);
153
154 e = create_proc_entry("dma", 0, NULL);
155 if (e)
156 e->proc_fops = &proc_dma_operations;
157
158 return 0; 153 return 0;
159} 154}
160 155
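
This dma.c hunk is the first of several identical conversions in this diff (kallsyms.c, latencytop.c, lockdep_proc.c, resource.c and profile.c follow). create_proc_entry() publishes the /proc entry before the caller has assigned ->proc_fops, so a concurrent open() can observe an entry with no operations; proc_create() takes the file_operations up front and closes that window. A minimal sketch of the new idiom, with a hypothetical entry name and fops:

    #include <linux/proc_fs.h>

    static const struct file_operations example_fops;   /* assumed defined elsewhere */

    static int __init example_proc_init(void)
    {
            /* The entry only becomes visible with its fops already set. */
            if (!proc_create("example", 0444, NULL, &example_fops))
                    return -ENOMEM; /* the callers in this diff choose to ignore failure */
            return 0;
    }
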
diff --git a/kernel/exit.c b/kernel/exit.c
index 97f609f574b1..ae0f2c4e452b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
557 557
558EXPORT_SYMBOL_GPL(exit_fs); 558EXPORT_SYMBOL_GPL(exit_fs);
559 559
560#ifdef CONFIG_MM_OWNER
561/*
562 * Task p is exiting and it owned mm; let's find a new owner for it
563 */
564static inline int
565mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
566{
567 /*
568 * If there are other users of the mm and the owner (us) is exiting,
569 * we need to find a new owner to take on the responsibility.
570 */
571 if (!mm)
572 return 0;
573 if (atomic_read(&mm->mm_users) <= 1)
574 return 0;
575 if (mm->owner != p)
576 return 0;
577 return 1;
578}
579
580void mm_update_next_owner(struct mm_struct *mm)
581{
582 struct task_struct *c, *g, *p = current;
583
584retry:
585 if (!mm_need_new_owner(mm, p))
586 return;
587
588 read_lock(&tasklist_lock);
589 /*
590 * Search in the children
591 */
592 list_for_each_entry(c, &p->children, sibling) {
593 if (c->mm == mm)
594 goto assign_new_owner;
595 }
596
597 /*
598 * Search in the siblings
599 */
600 list_for_each_entry(c, &p->parent->children, sibling) {
601 if (c->mm == mm)
602 goto assign_new_owner;
603 }
604
605 /*
606 * Search through everything else. We should not get
607 * here often
608 */
609 do_each_thread(g, c) {
610 if (c->mm == mm)
611 goto assign_new_owner;
612 } while_each_thread(g, c);
613
614 read_unlock(&tasklist_lock);
615 return;
616
617assign_new_owner:
618 BUG_ON(c == p);
619 get_task_struct(c);
620 /*
621 * The task_lock protects c->mm from changing.
622 * We always want mm->owner->mm == mm
623 */
624 task_lock(c);
625 /*
626 * Delay read_unlock() till we have the task_lock()
627 * to ensure that c does not slip away underneath us
628 */
629 read_unlock(&tasklist_lock);
630 if (c->mm != mm) {
631 task_unlock(c);
632 put_task_struct(c);
633 goto retry;
634 }
635 cgroup_mm_owner_callbacks(mm->owner, c);
636 mm->owner = c;
637 task_unlock(c);
638 put_task_struct(c);
639}
640#endif /* CONFIG_MM_OWNER */
641
560/* 642/*
561 * Turn us into a lazy TLB process if we 643 * Turn us into a lazy TLB process if we
562 * aren't already.. 644 * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
596 /* We don't want this task to be frozen prematurely */ 678 /* We don't want this task to be frozen prematurely */
597 clear_freeze_flag(tsk); 679 clear_freeze_flag(tsk);
598 task_unlock(tsk); 680 task_unlock(tsk);
681 mm_update_next_owner(mm);
599 mmput(mm); 682 mmput(mm);
600} 683}
601 684
@@ -967,7 +1050,7 @@ NORET_TYPE void do_exit(long code)
967 proc_exit_connector(tsk); 1050 proc_exit_connector(tsk);
968 exit_notify(tsk, group_dead); 1051 exit_notify(tsk, group_dead);
969#ifdef CONFIG_NUMA 1052#ifdef CONFIG_NUMA
970 mpol_free(tsk->mempolicy); 1053 mpol_put(tsk->mempolicy);
971 tsk->mempolicy = NULL; 1054 tsk->mempolicy = NULL;
972#endif 1055#endif
973#ifdef CONFIG_FUTEX 1056#ifdef CONFIG_FUTEX
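
The subtle part of mm_update_next_owner() above is stabilizing a task found under read_lock(&tasklist_lock): the candidate can exit as soon as that lock is dropped, so the code pins it first and re-validates afterwards. A hedged, condensed restatement of the idiom, where find_candidate() is a hypothetical stand-in for the three scans (children, siblings, then all threads):

    static struct task_struct *pin_next_owner(struct mm_struct *mm)
    {
            struct task_struct *c;

    retry:
            read_lock(&tasklist_lock);
            c = find_candidate(mm);         /* hypothetical: the three scans */
            if (!c) {
                    read_unlock(&tasklist_lock);
                    return NULL;            /* no remaining user of mm */
            }
            get_task_struct(c);             /* pin: c cannot be freed now */
            task_lock(c);                   /* freezes c->mm */
            read_unlock(&tasklist_lock);    /* safe only because c is pinned */
            if (c->mm != mm) {              /* c lost the mm in the window */
                    task_unlock(c);
                    put_task_struct(c);
                    goto retry;             /* run the whole search again */
            }
            return c;                       /* caller sets mm->owner = c */
    }
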
diff --git a/kernel/fork.c b/kernel/fork.c
index c674aa8d3c31..068ffe007529 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -279,7 +279,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
279 if (!tmp) 279 if (!tmp)
280 goto fail_nomem; 280 goto fail_nomem;
281 *tmp = *mpnt; 281 *tmp = *mpnt;
282 pol = mpol_copy(vma_policy(mpnt)); 282 pol = mpol_dup(vma_policy(mpnt));
283 retval = PTR_ERR(pol); 283 retval = PTR_ERR(pol);
284 if (IS_ERR(pol)) 284 if (IS_ERR(pol))
285 goto fail_nomem_policy; 285 goto fail_nomem_policy;
@@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
381 mm->ioctx_list = NULL; 381 mm->ioctx_list = NULL;
382 mm->free_area_cache = TASK_UNMAPPED_BASE; 382 mm->free_area_cache = TASK_UNMAPPED_BASE;
383 mm->cached_hole_size = ~0UL; 383 mm->cached_hole_size = ~0UL;
384 mm_init_cgroup(mm, p); 384 mm_init_owner(mm, p);
385 385
386 if (likely(!mm_alloc_pgd(mm))) { 386 if (likely(!mm_alloc_pgd(mm))) {
387 mm->def_flags = 0; 387 mm->def_flags = 0;
388 return mm; 388 return mm;
389 } 389 }
390 390
391 mm_free_cgroup(mm);
392 free_mm(mm); 391 free_mm(mm);
393 return NULL; 392 return NULL;
394} 393}
@@ -432,13 +431,13 @@ void mmput(struct mm_struct *mm)
432 if (atomic_dec_and_test(&mm->mm_users)) { 431 if (atomic_dec_and_test(&mm->mm_users)) {
433 exit_aio(mm); 432 exit_aio(mm);
434 exit_mmap(mm); 433 exit_mmap(mm);
434 set_mm_exe_file(mm, NULL);
435 if (!list_empty(&mm->mmlist)) { 435 if (!list_empty(&mm->mmlist)) {
436 spin_lock(&mmlist_lock); 436 spin_lock(&mmlist_lock);
437 list_del(&mm->mmlist); 437 list_del(&mm->mmlist);
438 spin_unlock(&mmlist_lock); 438 spin_unlock(&mmlist_lock);
439 } 439 }
440 put_swap_token(mm); 440 put_swap_token(mm);
441 mm_free_cgroup(mm);
442 mmdrop(mm); 441 mmdrop(mm);
443 } 442 }
444} 443}
@@ -545,6 +544,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
545 if (init_new_context(tsk, mm)) 544 if (init_new_context(tsk, mm))
546 goto fail_nocontext; 545 goto fail_nocontext;
547 546
547 dup_mm_exe_file(oldmm, mm);
548
548 err = dup_mmap(mm, oldmm); 549 err = dup_mmap(mm, oldmm);
549 if (err) 550 if (err)
550 goto free_pt; 551 goto free_pt;
@@ -982,6 +983,13 @@ static void rt_mutex_init_task(struct task_struct *p)
982#endif 983#endif
983} 984}
984 985
986#ifdef CONFIG_MM_OWNER
987void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
988{
989 mm->owner = p;
990}
991#endif /* CONFIG_MM_OWNER */
992
985/* 993/*
986 * This creates a new process as a copy of the old one, 994 * This creates a new process as a copy of the old one,
987 * but does not actually start it yet. 995 * but does not actually start it yet.
@@ -1116,7 +1124,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1116 p->audit_context = NULL; 1124 p->audit_context = NULL;
1117 cgroup_fork(p); 1125 cgroup_fork(p);
1118#ifdef CONFIG_NUMA 1126#ifdef CONFIG_NUMA
1119 p->mempolicy = mpol_copy(p->mempolicy); 1127 p->mempolicy = mpol_dup(p->mempolicy);
1120 if (IS_ERR(p->mempolicy)) { 1128 if (IS_ERR(p->mempolicy)) {
1121 retval = PTR_ERR(p->mempolicy); 1129 retval = PTR_ERR(p->mempolicy);
1122 p->mempolicy = NULL; 1130 p->mempolicy = NULL;
@@ -1374,7 +1382,7 @@ bad_fork_cleanup_security:
1374 security_task_free(p); 1382 security_task_free(p);
1375bad_fork_cleanup_policy: 1383bad_fork_cleanup_policy:
1376#ifdef CONFIG_NUMA 1384#ifdef CONFIG_NUMA
1377 mpol_free(p->mempolicy); 1385 mpol_put(p->mempolicy);
1378bad_fork_cleanup_cgroup: 1386bad_fork_cleanup_cgroup:
1379#endif 1387#endif
1380 cgroup_exit(p, cgroup_callbacks_done); 1388 cgroup_exit(p, cgroup_callbacks_done);
@@ -1664,18 +1672,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1664} 1672}
1665 1673
1666/* 1674/*
1667 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1668 * supported yet
1669 */
1670static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1671{
1672 if (unshare_flags & CLONE_SYSVSEM)
1673 return -EINVAL;
1674
1675 return 0;
1676}
1677
1678/*
1679 * unshare allows a process to 'unshare' part of the process 1675 * unshare allows a process to 'unshare' part of the process
1680 * context which was originally shared using clone. copy_* 1676 * context which was originally shared using clone. copy_*
1681 * functions used by do_fork() cannot be used here directly 1677 * functions used by do_fork() cannot be used here directly
@@ -1690,8 +1686,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1690 struct sighand_struct *new_sigh = NULL; 1686 struct sighand_struct *new_sigh = NULL;
1691 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1687 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1692 struct files_struct *fd, *new_fd = NULL; 1688 struct files_struct *fd, *new_fd = NULL;
1693 struct sem_undo_list *new_ulist = NULL;
1694 struct nsproxy *new_nsproxy = NULL; 1689 struct nsproxy *new_nsproxy = NULL;
1690 int do_sysvsem = 0;
1695 1691
1696 check_unshare_flags(&unshare_flags); 1692 check_unshare_flags(&unshare_flags);
1697 1693
@@ -1703,6 +1699,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1703 CLONE_NEWNET)) 1699 CLONE_NEWNET))
1704 goto bad_unshare_out; 1700 goto bad_unshare_out;
1705 1701
1702 /*
1703 * CLONE_NEWIPC must also detach from the undolist: after switching
1704 * to a new ipc namespace, the semaphore arrays from the old
1705 * namespace are unreachable.
1706 */
1707 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1708 do_sysvsem = 1;
1706 if ((err = unshare_thread(unshare_flags))) 1709 if ((err = unshare_thread(unshare_flags)))
1707 goto bad_unshare_out; 1710 goto bad_unshare_out;
1708 if ((err = unshare_fs(unshare_flags, &new_fs))) 1711 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1713,13 +1716,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1713 goto bad_unshare_cleanup_sigh; 1716 goto bad_unshare_cleanup_sigh;
1714 if ((err = unshare_fd(unshare_flags, &new_fd))) 1717 if ((err = unshare_fd(unshare_flags, &new_fd)))
1715 goto bad_unshare_cleanup_vm; 1718 goto bad_unshare_cleanup_vm;
1716 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1717 goto bad_unshare_cleanup_fd;
1718 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1719 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1719 new_fs))) 1720 new_fs)))
1720 goto bad_unshare_cleanup_semundo; 1721 goto bad_unshare_cleanup_fd;
1721 1722
1722 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1723 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
1724 if (do_sysvsem) {
1725 /*
1726 * CLONE_SYSVSEM is equivalent to sys_exit().
1727 */
1728 exit_sem(current);
1729 }
1723 1730
1724 if (new_nsproxy) { 1731 if (new_nsproxy) {
1725 switch_task_namespaces(current, new_nsproxy); 1732 switch_task_namespaces(current, new_nsproxy);
@@ -1755,7 +1762,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1755 if (new_nsproxy) 1762 if (new_nsproxy)
1756 put_nsproxy(new_nsproxy); 1763 put_nsproxy(new_nsproxy);
1757 1764
1758bad_unshare_cleanup_semundo:
1759bad_unshare_cleanup_fd: 1765bad_unshare_cleanup_fd:
1760 if (new_fd) 1766 if (new_fd)
1761 put_files_struct(new_fd); 1767 put_files_struct(new_fd);
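
The sys_unshare() changes are user-visible: CLONE_SYSVSEM, previously rejected with -EINVAL by the now-deleted unshare_semundo(), is accepted and detaches the caller's System V semaphore undo list via exit_sem(), and CLONE_NEWIPC implies the same detach; the kernel/nsproxy.c hunk below additionally rejects combining the two flags at clone() time. A hedged userspace sketch, assuming a libc that exposes both flags:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* Formerly -EINVAL; now drops this task's semundo list. */
            if (unshare(CLONE_SYSVSEM) == -1)
                    perror("unshare(CLONE_SYSVSEM)");

            /* New IPC namespace (needs CAP_SYS_ADMIN); the old namespace's
             * semaphore arrays become unreachable, so the undo list is
             * dropped here as well. */
            if (unshare(CLONE_NEWIPC) == -1)
                    perror("unshare(CLONE_NEWIPC)");

            return 0;
    }
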
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f78777abe769..dea4c9124ac8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
590 list_add_tail(&timer->cb_entry, 590 list_add_tail(&timer->cb_entry,
591 &base->cpu_base->cb_pending); 591 &base->cpu_base->cb_pending);
592 timer->state = HRTIMER_STATE_PENDING; 592 timer->state = HRTIMER_STATE_PENDING;
593 raise_softirq(HRTIMER_SOFTIRQ);
594 return 1; 593 return 1;
595 default: 594 default:
596 BUG(); 595 BUG();
@@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void)
633 return 1; 632 return 1;
634} 633}
635 634
635static inline void hrtimer_raise_softirq(void)
636{
637 raise_softirq(HRTIMER_SOFTIRQ);
638}
639
636#else 640#else
637 641
638static inline int hrtimer_hres_active(void) { return 0; } 642static inline int hrtimer_hres_active(void) { return 0; }
@@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
651{ 655{
652 return 0; 656 return 0;
653} 657}
658static inline void hrtimer_raise_softirq(void) { }
654 659
655#endif /* CONFIG_HIGH_RES_TIMERS */ 660#endif /* CONFIG_HIGH_RES_TIMERS */
656 661
@@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850{ 855{
851 struct hrtimer_clock_base *base, *new_base; 856 struct hrtimer_clock_base *base, *new_base;
852 unsigned long flags; 857 unsigned long flags;
853 int ret; 858 int ret, raise;
854 859
855 base = lock_hrtimer_base(timer, &flags); 860 base = lock_hrtimer_base(timer, &flags);
856 861
@@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
884 enqueue_hrtimer(timer, new_base, 889 enqueue_hrtimer(timer, new_base,
885 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 890 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
886 891
892 /*
893 * The timer may be expired and moved to the cb_pending
894 * list. We cannot raise the softirq with the base lock held due
895 * to a possible deadlock with the runqueue lock.
896 */
897 raise = timer->state == HRTIMER_STATE_PENDING;
898
887 unlock_hrtimer_base(timer, &flags); 899 unlock_hrtimer_base(timer, &flags);
888 900
901 if (raise)
902 hrtimer_raise_softirq();
903
889 return ret; 904 return ret;
890} 905}
891EXPORT_SYMBOL_GPL(hrtimer_start); 906EXPORT_SYMBOL_GPL(hrtimer_start);
@@ -1080,8 +1095,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1080 * If the timer was rearmed on another CPU, reprogram 1095 * If the timer was rearmed on another CPU, reprogram
1081 * the event device. 1096 * the event device.
1082 */ 1097 */
1083 if (timer->base->first == &timer->node) 1098 struct hrtimer_clock_base *base = timer->base;
1084 hrtimer_reprogram(timer, timer->base); 1099
1100 if (base->first == &timer->node &&
1101 hrtimer_reprogram(timer, base)) {
1102 /*
1103 * Timer is expired. Thus move it from tree to
1104 * pending list again.
1105 */
1106 __remove_hrtimer(timer, base,
1107 HRTIMER_STATE_PENDING, 0);
1108 list_add_tail(&timer->cb_entry,
1109 &base->cpu_base->cb_pending);
1110 }
1085 } 1111 }
1086 } 1112 }
1087 spin_unlock_irq(&cpu_base->lock); 1113 spin_unlock_irq(&cpu_base->lock);
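
The hrtimer_start() change applies a rule this diff uses twice (the kernel/kthread.c hunk below moves wake_up_process() out of kthread_create_lock the same way): record the decision while holding the lock, act on it after dropping the lock, because raise_softirq() and wake_up_process() can end up taking the runqueue lock and so must not run under a lock that other code may hold while acquiring ours. Schematically:

    spin_lock(&lock);
    /* ... enqueue timer, update state ... */
    raise = (timer->state == HRTIMER_STATE_PENDING); /* decide under lock */
    spin_unlock(&lock);

    if (raise)
            raise_softirq(HRTIMER_SOFTIRQ); /* act with no lock held */
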
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/interrupt.h> 2#include <linux/interrupt.h>
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/gfp.h>
4 5
5/* 6/*
6 * Device resource management aware IRQ request/free implementation. 7 * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46e4ad1723f0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/slab.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
472 472
473static int __init kallsyms_init(void) 473static int __init kallsyms_init(void)
474{ 474{
475 struct proc_dir_entry *entry; 475 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
476
477 entry = create_proc_entry("kallsyms", 0444, NULL);
478 if (entry)
479 entry->proc_fops = &kallsyms_operations;
480 return 0; 476 return 0;
481} 477}
482__initcall(kallsyms_init); 478__initcall(kallsyms_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6782dce93d01..cb85c79989b4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1405,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1405 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1405 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1406 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1406 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1407 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1407 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408 VMCOREINFO_NUMBER(PG_lru);
1409 VMCOREINFO_NUMBER(PG_private);
1410 VMCOREINFO_NUMBER(PG_swapcache);
1408 1411
1409 arch_crash_save_vmcoreinfo(); 1412 arch_crash_save_vmcoreinfo();
1410 1413
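
Exporting PG_lru, PG_private and PG_swapcache through the vmcoreinfo note lets userspace dump filters such as makedumpfile recognize page-cache and swap-cache pages in a crash image without hard-coding the running kernel's page-flag values, so those pages can be excluded and the dump kept small.
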
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fcfb580c3afc..1e0250cb9486 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74 74
75/*
76 * Normally, functions that we'd want to prohibit kprobes in are marked
77 * __kprobes. But there are cases where such functions already belong to
78 * a different section (__sched for preempt_schedule)
79 *
80 * For such cases, we now have a blacklist
81 */
82struct kprobe_blackpoint kprobe_blacklist[] = {
83 {"preempt_schedule",},
84 {NULL} /* Terminator */
85};
86
75#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 87#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
76/* 88/*
77 * kprobe->ainsn.insn points to the copy of the instruction to be 89 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp)
417 } 429 }
418} 430}
419 431
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{
434 unsigned long flags;
435 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next;
437 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags);
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
440 ri->rp = NULL;
441 hlist_del(&ri->uflist);
442 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp);
445}
446
420/* 447/*
421 * Keep all fields in the kprobe consistent 448 * Keep all fields in the kprobe consistent
422 */ 449 */
@@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
492 519
493static int __kprobes in_kprobes_functions(unsigned long addr) 520static int __kprobes in_kprobes_functions(unsigned long addr)
494{ 521{
522 struct kprobe_blackpoint *kb;
523
495 if (addr >= (unsigned long)__kprobes_text_start && 524 if (addr >= (unsigned long)__kprobes_text_start &&
496 addr < (unsigned long)__kprobes_text_end) 525 addr < (unsigned long)__kprobes_text_end)
497 return -EINVAL; 526 return -EINVAL;
527 /*
528 * If there exists a kprobe_blacklist, verify and
529 * fail any probe registration in the prohibited area
530 */
531 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
532 if (kb->start_addr) {
533 if (addr >= kb->start_addr &&
534 addr < (kb->start_addr + kb->range))
535 return -EINVAL;
536 }
537 }
498 return 0; 538 return 0;
499} 539}
500 540
@@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
555 } 595 }
556 596
557 p->nmissed = 0; 597 p->nmissed = 0;
598 INIT_LIST_HEAD(&p->list);
558 mutex_lock(&kprobe_mutex); 599 mutex_lock(&kprobe_mutex);
559 old_p = get_kprobe(p->addr); 600 old_p = get_kprobe(p->addr);
560 if (old_p) { 601 if (old_p) {
@@ -581,35 +622,28 @@ out:
581 return ret; 622 return ret;
582} 623}
583 624
584int __kprobes register_kprobe(struct kprobe *p) 625/*
585{ 626 * Unregister a kprobe without a scheduler synchronization.
586 return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); 627 */
587} 628static int __kprobes __unregister_kprobe_top(struct kprobe *p)
588
589void __kprobes unregister_kprobe(struct kprobe *p)
590{ 629{
591 struct module *mod;
592 struct kprobe *old_p, *list_p; 630 struct kprobe *old_p, *list_p;
593 int cleanup_p;
594 631
595 mutex_lock(&kprobe_mutex);
596 old_p = get_kprobe(p->addr); 632 old_p = get_kprobe(p->addr);
597 if (unlikely(!old_p)) { 633 if (unlikely(!old_p))
598 mutex_unlock(&kprobe_mutex); 634 return -EINVAL;
599 return; 635
600 }
601 if (p != old_p) { 636 if (p != old_p) {
602 list_for_each_entry_rcu(list_p, &old_p->list, list) 637 list_for_each_entry_rcu(list_p, &old_p->list, list)
603 if (list_p == p) 638 if (list_p == p)
604 /* kprobe p is a valid probe */ 639 /* kprobe p is a valid probe */
605 goto valid_p; 640 goto valid_p;
606 mutex_unlock(&kprobe_mutex); 641 return -EINVAL;
607 return;
608 } 642 }
609valid_p: 643valid_p:
610 if (old_p == p || 644 if (old_p == p ||
611 (old_p->pre_handler == aggr_pre_handler && 645 (old_p->pre_handler == aggr_pre_handler &&
612 p->list.next == &old_p->list && p->list.prev == &old_p->list)) { 646 list_is_singular(&old_p->list))) {
613 /* 647 /*
614 * Only probe on the hash list. Disarm only if kprobes are 648 * Only probe on the hash list. Disarm only if kprobes are
615 * enabled - otherwise, the breakpoint would already have 649 * enabled - otherwise, the breakpoint would already have
@@ -618,43 +652,97 @@ valid_p:
618 if (kprobe_enabled) 652 if (kprobe_enabled)
619 arch_disarm_kprobe(p); 653 arch_disarm_kprobe(p);
620 hlist_del_rcu(&old_p->hlist); 654 hlist_del_rcu(&old_p->hlist);
621 cleanup_p = 1;
622 } else { 655 } else {
656 if (p->break_handler)
657 old_p->break_handler = NULL;
658 if (p->post_handler) {
659 list_for_each_entry_rcu(list_p, &old_p->list, list) {
660 if ((list_p != p) && (list_p->post_handler))
661 goto noclean;
662 }
663 old_p->post_handler = NULL;
664 }
665noclean:
623 list_del_rcu(&p->list); 666 list_del_rcu(&p->list);
624 cleanup_p = 0;
625 } 667 }
668 return 0;
669}
626 670
627 mutex_unlock(&kprobe_mutex); 671static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
672{
673 struct module *mod;
674 struct kprobe *old_p;
628 675
629 synchronize_sched();
630 if (p->mod_refcounted) { 676 if (p->mod_refcounted) {
631 mod = module_text_address((unsigned long)p->addr); 677 mod = module_text_address((unsigned long)p->addr);
632 if (mod) 678 if (mod)
633 module_put(mod); 679 module_put(mod);
634 } 680 }
635 681
636 if (cleanup_p) { 682 if (list_empty(&p->list) || list_is_singular(&p->list)) {
637 if (p != old_p) { 683 if (!list_empty(&p->list)) {
638 list_del_rcu(&p->list); 684 /* "p" is the last child of an aggr_kprobe */
685 old_p = list_entry(p->list.next, struct kprobe, list);
686 list_del(&p->list);
639 kfree(old_p); 687 kfree(old_p);
640 } 688 }
641 arch_remove_kprobe(p); 689 arch_remove_kprobe(p);
642 } else { 690 }
643 mutex_lock(&kprobe_mutex); 691}
644 if (p->break_handler) 692
645 old_p->break_handler = NULL; 693static int __register_kprobes(struct kprobe **kps, int num,
646 if (p->post_handler){ 694 unsigned long called_from)
647 list_for_each_entry_rcu(list_p, &old_p->list, list){ 695{
648 if (list_p->post_handler){ 696 int i, ret = 0;
649 cleanup_p = 2; 697
650 break; 698 if (num <= 0)
651 } 699 return -EINVAL;
652 } 700 for (i = 0; i < num; i++) {
653 if (cleanup_p == 0) 701 ret = __register_kprobe(kps[i], called_from);
654 old_p->post_handler = NULL; 702 if (ret < 0 && i > 0) {
703 unregister_kprobes(kps, i);
704 break;
655 } 705 }
656 mutex_unlock(&kprobe_mutex);
657 } 706 }
707 return ret;
708}
709
710/*
711 * Registration and unregistration functions for kprobe.
712 */
713int __kprobes register_kprobe(struct kprobe *p)
714{
715 return __register_kprobes(&p, 1,
716 (unsigned long)__builtin_return_address(0));
717}
718
719void __kprobes unregister_kprobe(struct kprobe *p)
720{
721 unregister_kprobes(&p, 1);
722}
723
724int __kprobes register_kprobes(struct kprobe **kps, int num)
725{
726 return __register_kprobes(kps, num,
727 (unsigned long)__builtin_return_address(0));
728}
729
730void __kprobes unregister_kprobes(struct kprobe **kps, int num)
731{
732 int i;
733
734 if (num <= 0)
735 return;
736 mutex_lock(&kprobe_mutex);
737 for (i = 0; i < num; i++)
738 if (__unregister_kprobe_top(kps[i]) < 0)
739 kps[i]->addr = NULL;
740 mutex_unlock(&kprobe_mutex);
741
742 synchronize_sched();
743 for (i = 0; i < num; i++)
744 if (kps[i]->addr)
745 __unregister_kprobe_bottom(kps[i]);
658} 746}
659 747
660static struct notifier_block kprobe_exceptions_nb = { 748static struct notifier_block kprobe_exceptions_nb = {
@@ -667,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry)
667 return (unsigned long)entry; 755 return (unsigned long)entry;
668} 756}
669 757
670int __kprobes register_jprobe(struct jprobe *jp) 758static int __register_jprobes(struct jprobe **jps, int num,
759 unsigned long called_from)
671{ 760{
672 unsigned long addr = arch_deref_entry_point(jp->entry); 761 struct jprobe *jp;
762 int ret = 0, i;
673 763
674 if (!kernel_text_address(addr)) 764 if (num <= 0)
675 return -EINVAL; 765 return -EINVAL;
766 for (i = 0; i < num; i++) {
767 unsigned long addr;
768 jp = jps[i];
769 addr = arch_deref_entry_point(jp->entry);
770
771 if (!kernel_text_address(addr))
772 ret = -EINVAL;
773 else {
774 /* Todo: Verify probepoint is a function entry point */
775 jp->kp.pre_handler = setjmp_pre_handler;
776 jp->kp.break_handler = longjmp_break_handler;
777 ret = __register_kprobe(&jp->kp, called_from);
778 }
779 if (ret < 0 && i > 0) {
780 unregister_jprobes(jps, i);
781 break;
782 }
783 }
784 return ret;
785}
676 786
677 /* Todo: Verify probepoint is a function entry point */ 787int __kprobes register_jprobe(struct jprobe *jp)
678 jp->kp.pre_handler = setjmp_pre_handler; 788{
679 jp->kp.break_handler = longjmp_break_handler; 789 return __register_jprobes(&jp, 1,
680
681 return __register_kprobe(&jp->kp,
682 (unsigned long)__builtin_return_address(0)); 790 (unsigned long)__builtin_return_address(0));
683} 791}
684 792
685void __kprobes unregister_jprobe(struct jprobe *jp) 793void __kprobes unregister_jprobe(struct jprobe *jp)
686{ 794{
687 unregister_kprobe(&jp->kp); 795 unregister_jprobes(&jp, 1);
796}
797
798int __kprobes register_jprobes(struct jprobe **jps, int num)
799{
800 return __register_jprobes(jps, num,
801 (unsigned long)__builtin_return_address(0));
802}
803
804void __kprobes unregister_jprobes(struct jprobe **jps, int num)
805{
806 int i;
807
808 if (num <= 0)
809 return;
810 mutex_lock(&kprobe_mutex);
811 for (i = 0; i < num; i++)
812 if (__unregister_kprobe_top(&jps[i]->kp) < 0)
813 jps[i]->kp.addr = NULL;
814 mutex_unlock(&kprobe_mutex);
815
816 synchronize_sched();
817 for (i = 0; i < num; i++) {
818 if (jps[i]->kp.addr)
819 __unregister_kprobe_bottom(&jps[i]->kp);
820 }
688} 821}
689 822
690#ifdef CONFIG_KRETPROBES 823#ifdef CONFIG_KRETPROBES
@@ -725,7 +858,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
725 return 0; 858 return 0;
726} 859}
727 860
728int __kprobes register_kretprobe(struct kretprobe *rp) 861static int __kprobes __register_kretprobe(struct kretprobe *rp,
862 unsigned long called_from)
729{ 863{
730 int ret = 0; 864 int ret = 0;
731 struct kretprobe_instance *inst; 865 struct kretprobe_instance *inst;
@@ -771,46 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
771 905
772 rp->nmissed = 0; 906 rp->nmissed = 0;
773 /* Establish function entry probe point */ 907 /* Establish function entry probe point */
774 if ((ret = __register_kprobe(&rp->kp, 908 ret = __register_kprobe(&rp->kp, called_from);
775 (unsigned long)__builtin_return_address(0))) != 0) 909 if (ret != 0)
776 free_rp_inst(rp); 910 free_rp_inst(rp);
777 return ret; 911 return ret;
778} 912}
779 913
914static int __register_kretprobes(struct kretprobe **rps, int num,
915 unsigned long called_from)
916{
917 int ret = 0, i;
918
919 if (num <= 0)
920 return -EINVAL;
921 for (i = 0; i < num; i++) {
922 ret = __register_kretprobe(rps[i], called_from);
923 if (ret < 0 && i > 0) {
924 unregister_kretprobes(rps, i);
925 break;
926 }
927 }
928 return ret;
929}
930
931int __kprobes register_kretprobe(struct kretprobe *rp)
932{
933 return __register_kretprobes(&rp, 1,
934 (unsigned long)__builtin_return_address(0));
935}
936
937void __kprobes unregister_kretprobe(struct kretprobe *rp)
938{
939 unregister_kretprobes(&rp, 1);
940}
941
942int __kprobes register_kretprobes(struct kretprobe **rps, int num)
943{
944 return __register_kretprobes(rps, num,
945 (unsigned long)__builtin_return_address(0));
946}
947
948void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
949{
950 int i;
951
952 if (num <= 0)
953 return;
954 mutex_lock(&kprobe_mutex);
955 for (i = 0; i < num; i++)
956 if (__unregister_kprobe_top(&rps[i]->kp) < 0)
957 rps[i]->kp.addr = NULL;
958 mutex_unlock(&kprobe_mutex);
959
960 synchronize_sched();
961 for (i = 0; i < num; i++) {
962 if (rps[i]->kp.addr) {
963 __unregister_kprobe_bottom(&rps[i]->kp);
964 cleanup_rp_inst(rps[i]);
965 }
966 }
967}
968
780#else /* CONFIG_KRETPROBES */ 969#else /* CONFIG_KRETPROBES */
781int __kprobes register_kretprobe(struct kretprobe *rp) 970int __kprobes register_kretprobe(struct kretprobe *rp)
782{ 971{
783 return -ENOSYS; 972 return -ENOSYS;
784} 973}
785 974
786static int __kprobes pre_handler_kretprobe(struct kprobe *p, 975int __kprobes register_kretprobes(struct kretprobe **rps, int num)
787 struct pt_regs *regs)
788{ 976{
789 return 0; 977 return -ENOSYS;
790} 978}
791#endif /* CONFIG_KRETPROBES */
792
793void __kprobes unregister_kretprobe(struct kretprobe *rp) 979void __kprobes unregister_kretprobe(struct kretprobe *rp)
794{ 980{
795 unsigned long flags; 981}
796 struct kretprobe_instance *ri;
797 struct hlist_node *pos, *next;
798 982
799 unregister_kprobe(&rp->kp); 983void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
984{
985}
800 986
801 /* No race here */ 987static int __kprobes pre_handler_kretprobe(struct kprobe *p,
802 spin_lock_irqsave(&kretprobe_lock, flags); 988 struct pt_regs *regs)
803 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 989{
804 ri->rp = NULL; 990 return 0;
805 hlist_del(&ri->uflist);
806 }
807 spin_unlock_irqrestore(&kretprobe_lock, flags);
808 free_rp_inst(rp);
809} 991}
810 992
993#endif /* CONFIG_KRETPROBES */
994
811static int __init init_kprobes(void) 995static int __init init_kprobes(void)
812{ 996{
813 int i, err = 0; 997 int i, err = 0;
998 unsigned long offset = 0, size = 0;
999 char *modname, namebuf[128];
1000 const char *symbol_name;
1001 void *addr;
1002 struct kprobe_blackpoint *kb;
814 1003
815 /* FIXME allocate the probe table, currently defined statically */ 1004 /* FIXME allocate the probe table, currently defined statically */
816 /* initialize all list heads */ 1005 /* initialize all list heads */
@@ -819,6 +1008,28 @@ static int __init init_kprobes(void)
819 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1008 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
820 } 1009 }
821 1010
1011 /*
1012 * Lookup and populate the kprobe_blacklist.
1013 *
1014 * Unlike the kretprobe blacklist, we'll need to determine
1015 * the range of addresses that belong to these functions,
1016 * since a kprobe need not be at the beginning
1017 * of a function.
1018 */
1019 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
1020 kprobe_lookup_name(kb->name, addr);
1021 if (!addr)
1022 continue;
1023
1024 kb->start_addr = (unsigned long)addr;
1025 symbol_name = kallsyms_lookup(kb->start_addr,
1026 &size, &offset, &modname, namebuf);
1027 if (!symbol_name)
1028 kb->range = 0;
1029 else
1030 kb->range = size;
1031 }
1032
822 if (kretprobe_blacklist_size) { 1033 if (kretprobe_blacklist_size) {
823 /* lookup the function address from its name */ 1034 /* lookup the function address from its name */
824 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1035 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -1066,8 +1277,12 @@ module_init(init_kprobes);
1066 1277
1067EXPORT_SYMBOL_GPL(register_kprobe); 1278EXPORT_SYMBOL_GPL(register_kprobe);
1068EXPORT_SYMBOL_GPL(unregister_kprobe); 1279EXPORT_SYMBOL_GPL(unregister_kprobe);
1280EXPORT_SYMBOL_GPL(register_kprobes);
1281EXPORT_SYMBOL_GPL(unregister_kprobes);
1069EXPORT_SYMBOL_GPL(register_jprobe); 1282EXPORT_SYMBOL_GPL(register_jprobe);
1070EXPORT_SYMBOL_GPL(unregister_jprobe); 1283EXPORT_SYMBOL_GPL(unregister_jprobe);
1284EXPORT_SYMBOL_GPL(register_jprobes);
1285EXPORT_SYMBOL_GPL(unregister_jprobes);
1071#ifdef CONFIG_KPROBES 1286#ifdef CONFIG_KPROBES
1072EXPORT_SYMBOL_GPL(jprobe_return); 1287EXPORT_SYMBOL_GPL(jprobe_return);
1073#endif 1288#endif
@@ -1075,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return);
1075#ifdef CONFIG_KPROBES 1290#ifdef CONFIG_KPROBES
1076EXPORT_SYMBOL_GPL(register_kretprobe); 1291EXPORT_SYMBOL_GPL(register_kretprobe);
1077EXPORT_SYMBOL_GPL(unregister_kretprobe); 1292EXPORT_SYMBOL_GPL(unregister_kretprobe);
1293EXPORT_SYMBOL_GPL(register_kretprobes);
1294EXPORT_SYMBOL_GPL(unregister_kretprobes);
1078#endif 1295#endif
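
The new batch API above is all-or-nothing on registration (a failure at probe i unregisters probes 0..i-1) and, more importantly, amortizes teardown: unregister_kprobes() disarms every probe under one kprobe_mutex hold and then issues a single synchronize_sched(), so N probes wait out one RCU-sched grace period instead of N. A hedged module sketch; the probed symbols and the no-op handler are illustrative:

    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int pre(struct kprobe *p, struct pt_regs *regs)
    {
            return 0;                       /* no-op pre-handler */
    }

    static struct kprobe kp1 = { .symbol_name = "do_fork", .pre_handler = pre };
    static struct kprobe kp2 = { .symbol_name = "do_exit", .pre_handler = pre };
    static struct kprobe *kps[] = { &kp1, &kp2 };

    static int __init batch_init(void)
    {
            /* All-or-nothing: a failure rolls back earlier registrations. */
            return register_kprobes(kps, ARRAY_SIZE(kps));
    }

    static void __exit batch_exit(void)
    {
            /* One synchronize_sched() covers both probes. */
            unregister_kprobes(kps, ARRAY_SIZE(kps));
    }

    module_init(batch_init);
    module_exit(batch_exit);
    MODULE_LICENSE("GPL");
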
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 92cf6930ab51..ac72eea48339 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
144 144
145 spin_lock(&kthread_create_lock); 145 spin_lock(&kthread_create_lock);
146 list_add_tail(&create.list, &kthread_create_list); 146 list_add_tail(&create.list, &kthread_create_list);
147 wake_up_process(kthreadd_task);
148 spin_unlock(&kthread_create_lock); 147 spin_unlock(&kthread_create_lock);
149 148
149 wake_up_process(kthreadd_task);
150 wait_for_completion(&create.done); 150 wait_for_completion(&create.done);
151 151
152 if (!IS_ERR(create.result)) { 152 if (!IS_ERR(create.result)) {
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7c74dab0d21b..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -233,14 +233,7 @@ static struct file_operations lstats_fops = {
233 233
234static int __init init_lstats_procfs(void) 234static int __init init_lstats_procfs(void)
235{ 235{
236 struct proc_dir_entry *pe; 236 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237
238 pe = create_proc_entry("latency_stats", 0644, NULL);
239 if (!pe)
240 return -ENOMEM;
241
242 pe->proc_fops = &lstats_fops;
243
244 return 0; 237 return 0;
245} 238}
246__initcall(init_lstats_procfs); 239__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
660 660
661static int __init lockdep_proc_init(void) 661static int __init lockdep_proc_init(void)
662{ 662{
663 struct proc_dir_entry *entry; 663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
664 664 proc_create("lockdep_stats", S_IRUSR, NULL,
665 entry = create_proc_entry("lockdep", S_IRUSR, NULL); 665 &proc_lockdep_stats_operations);
666 if (entry)
667 entry->proc_fops = &proc_lockdep_operations;
668
669 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
670 if (entry)
671 entry->proc_fops = &proc_lockdep_stats_operations;
672 666
673#ifdef CONFIG_LOCK_STAT 667#ifdef CONFIG_LOCK_STAT
674 entry = create_proc_entry("lock_stat", S_IRUSR, NULL); 668 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
675 if (entry)
676 entry->proc_fops = &proc_lock_stat_operations;
677#endif 669#endif
678 670
679 return 0; 671 return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
index 005b95954593..139260e5460c 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/marker.h> 24#include <linux/marker.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27extern struct marker __start___markers[]; 28extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 29extern struct marker __stop___markers[];
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
31 return 0; 31 return 0;
32} 32}
33 33
34static int notifier_chain_cond_register(struct notifier_block **nl,
35 struct notifier_block *n)
36{
37 while ((*nl) != NULL) {
38 if ((*nl) == n)
39 return 0;
40 if (n->priority > (*nl)->priority)
41 break;
42 nl = &((*nl)->next);
43 }
44 n->next = *nl;
45 rcu_assign_pointer(*nl, n);
46 return 0;
47}
48
34static int notifier_chain_unregister(struct notifier_block **nl, 49static int notifier_chain_unregister(struct notifier_block **nl,
35 struct notifier_block *n) 50 struct notifier_block *n)
36{ 51{
@@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
205EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); 220EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
206 221
207/** 222/**
223 * blocking_notifier_chain_cond_register - Conditionally add notifier to a blocking notifier chain
224 * @nh: Pointer to head of the blocking notifier chain
225 * @n: New entry in notifier chain
226 *
227 * Adds a notifier to a blocking notifier chain, only if not already
228 * present in the chain.
229 * Must be called in process context.
230 *
231 * Currently always returns zero.
232 */
233int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
234 struct notifier_block *n)
235{
236 int ret;
237
238 down_write(&nh->rwsem);
239 ret = notifier_chain_cond_register(&nh->head, n);
240 up_write(&nh->rwsem);
241 return ret;
242}
243EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
244
245/**
208 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain 246 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
209 * @nh: Pointer to head of the blocking notifier chain 247 * @nh: Pointer to head of the blocking notifier chain
210 * @n: Entry to remove from notifier chain 248 * @n: Entry to remove from notifier chain
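
notifier_chain_cond_register() walks the chain and returns early if the block is already linked, so registration becomes idempotent: a caller whose init path may run more than once can no longer corrupt the chain by double-linking the same notifier_block. A hedged sketch; the chain and callback names are illustrative:

    #include <linux/notifier.h>

    static BLOCKING_NOTIFIER_HEAD(example_chain);

    static int example_event(struct notifier_block *nb, unsigned long action,
                             void *data)
    {
            return NOTIFY_OK;
    }

    static struct notifier_block example_nb = {
            .notifier_call = example_event,
    };

    void example_register(void)     /* safe to call repeatedly */
    {
            blocking_notifier_chain_cond_register(&example_chain, &example_nb);
    }
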
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,8 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/slab.h>
11#include <linux/nsproxy.h>
10 12
11struct ns_cgroup { 13struct ns_cgroup {
12 struct cgroup_subsys_state css; 14 struct cgroup_subsys_state css;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
139 goto out; 139 goto out;
140 } 140 }
141 141
142 /*
143 * CLONE_NEWIPC must detach from the undolist: after switching
144 * to a new ipc namespace, the semaphore arrays from the old
145 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
146 * means share undolist with parent, so we must forbid using
147 * it along with CLONE_NEWIPC.
148 */
149 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
150 err = -EINVAL;
151 goto out;
152 }
153
142 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
143 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
144 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
153 * 'M' - System experienced a machine check exception. 153 * 'M' - System experienced a machine check exception.
154 * 'B' - System has hit bad_page. 154 * 'B' - System has hit bad_page.
155 * 'U' - Userspace-defined naughtiness. 155 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning.
156 * 158 *
157 * The string is overwritten by the next call to print_taint(). 159 * The string is overwritten by the next call to print_taint().
158 */ 160 */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
161{ 163{
162 static char buf[20]; 164 static char buf[20];
163 if (tainted) { 165 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", 166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 172 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 173 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' ', 174 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); 175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
176 tainted & TAINT_WARN ? 'W' : ' ');
174 } 177 }
175 else 178 else
176 snprintf(buf, sizeof(buf), "Not tainted"); 179 snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
312 print_modules(); 315 print_modules();
313 dump_stack(); 316 dump_stack();
314 print_oops_end_marker(); 317 print_oops_end_marker();
318 add_taint(TAINT_WARN);
315} 319}
316EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
317#endif 321#endif
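
Two details worth checking in the print_tainted() change: the static buf[20] still fits, since "Tainted: " is 9 characters, the flag field grows from 9 to 10, and the NUL terminator makes exactly 20; and because warn_on_slowpath() now calls add_taint(TAINT_WARN), the first WARN_ON() permanently marks the kernel. On a box that had loaded a proprietary module and later hit one warning, the string would read (illustrative):

    Tainted: P        W
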
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d792b66d854..5ca37fa50beb 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level)
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0; 95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 } 97 }
98 98
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 89bcf4973ee5..b8628be2a465 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -7,17 +7,39 @@
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/console.h>
10#include <linux/module.h>
10#include "power.h" 11#include "power.h"
11 12
12#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) 13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyway, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
16 33
17int pm_prepare_console(void) 34int pm_prepare_console(void)
18{ 35{
19 acquire_console_sem(); 36 acquire_console_sem();
20 37
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
21 orig_fgconsole = fg_console; 43 orig_fgconsole = fg_console;
22 44
23 if (vc_allocate(SUSPEND_CONSOLE)) { 45 if (vc_allocate(SUSPEND_CONSOLE)) {
@@ -50,9 +72,12 @@ int pm_prepare_console(void)
50void pm_restore_console(void) 72void pm_restore_console(void)
51{ 73{
52 acquire_console_sem(); 74 acquire_console_sem();
75 if (disable_vt_switch) {
76 release_console_sem();
77 return;
78 }
53 set_console(orig_fgconsole); 79 set_console(orig_fgconsole);
54 release_console_sem(); 80 release_console_sem();
55 kmsg_redirect = orig_kmsg; 81 kmsg_redirect = orig_kmsg;
56 return;
57} 82}
58#endif 83#endif
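
pm_set_vt_switch() is exported precisely so a video driver that can restore its own register state on resume can opt out of the allocate-and-switch console dance. A hedged sketch of a caller; the driver name is hypothetical and the prototype's header is assumed:

    #include <linux/pm.h>   /* assumed location of the prototype */

    static int examplefb_probe(void)
    {
            /* This driver reprograms the video hardware on resume, so
             * the suspend-time VT switch would be pure overhead. */
            pm_set_vt_switch(0);
            return 0;
    }
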
diff --git a/kernel/printk.c b/kernel/printk.c
index bdd4ea8c3f2b..d3f9c0f788bf 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1287,31 +1287,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1287 */ 1287 */
1288int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1288int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1289{ 1289{
1290 static DEFINE_SPINLOCK(ratelimit_lock); 1290 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1291 static unsigned toks = 10 * 5 * HZ;
1292 static unsigned long last_msg;
1293 static int missed;
1294 unsigned long flags;
1295 unsigned long now = jiffies;
1296
1297 spin_lock_irqsave(&ratelimit_lock, flags);
1298 toks += now - last_msg;
1299 last_msg = now;
1300 if (toks > (ratelimit_burst * ratelimit_jiffies))
1301 toks = ratelimit_burst * ratelimit_jiffies;
1302 if (toks >= ratelimit_jiffies) {
1303 int lost = missed;
1304
1305 missed = 0;
1306 toks -= ratelimit_jiffies;
1307 spin_unlock_irqrestore(&ratelimit_lock, flags);
1308 if (lost)
1309 printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
1310 return 1;
1311 }
1312 missed++;
1313 spin_unlock_irqrestore(&ratelimit_lock, flags);
1314 return 0;
1315} 1291}
1316EXPORT_SYMBOL(__printk_ratelimit); 1292EXPORT_SYMBOL(__printk_ratelimit);
1317 1293
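
The removed body, now delegated to a shared __ratelimit() helper, is a token bucket: tokens accrue one per elapsed jiffy, capped at ratelimit_burst * ratelimit_jiffies, and each emitted message spends ratelimit_jiffies tokens. Worked example with printk's defaults (a 5*HZ interval, burst of 10) at HZ=1000: an idle bucket holds 50000 tokens, so 10 messages can go out back to back; once drained, refilling at one token per jiffy admits the next message only after 5000 jiffies, i.e. one message every 5 seconds, with the count of suppressed messages printed when output resumes.
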
diff --git a/kernel/profile.c b/kernel/profile.c
index 606d7387265c..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -587,10 +587,10 @@ static int __init create_proc_profile(void)
587 return 0; 587 return 0;
588 if (create_hash_tables()) 588 if (create_hash_tables())
589 return -1; 589 return -1;
590 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); 590 entry = proc_create("profile", S_IWUSR | S_IRUGO,
591 NULL, &proc_profile_operations);
591 if (!entry) 592 if (!entry)
592 return 0; 593 return 0;
593 entry->proc_fops = &proc_profile_operations;
594 entry->size = (1+prof_len) * sizeof(atomic_t); 594 entry->size = (1+prof_len) * sizeof(atomic_t);
595 hotcpu_notifier(profile_cpu_callback, 0); 595 hotcpu_notifier(profile_cpu_callback, 0);
596 return 0; 596 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 67e392ed5496..dac4b4e57293 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
612 return (copied == sizeof(data)) ? 0 : -EIO; 612 return (copied == sizeof(data)) ? 0 : -EIO;
613} 613}
614 614
615#ifdef CONFIG_COMPAT 615#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
616#include <linux/compat.h> 616#include <linux/compat.h>
617 617
618int compat_ptrace_request(struct task_struct *child, compat_long_t request, 618int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -667,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
667 return ret; 667 return ret;
668} 668}
669 669
670#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
671asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 670asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
672 compat_long_t addr, compat_long_t data) 671 compat_long_t addr, compat_long_t data)
673{ 672{
@@ -710,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
710 unlock_kernel(); 709 unlock_kernel();
711 return ret; 710 return ret;
712} 711}
713#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ 712#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
714
715#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 47894f919d4e..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
45#include <linux/byteorder/swabb.h> 45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h>
48 49
49MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index d6204a485818..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
67 67
68/*
69 * allocate an array of pointers to struct page
70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{
73 struct page **array;
74 size_t pa_size = n_pages * sizeof(struct page *);
75
76 if (pa_size > PAGE_SIZE) {
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84}
85
86/*
87 * free an array of pointers to struct page
88 */
89static void relay_free_page_array(struct page **array)
90{
91 if (is_vmalloc_addr(array))
92 vfree(array);
93 else
94 kfree(array);
95}
96
68/** 97/**
69 * relay_mmap_buf: - mmap channel buffer to process address space 98 * relay_mmap_buf: - mmap channel buffer to process address space
70 * @buf: relay channel buffer 99 * @buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
109 *size = PAGE_ALIGN(*size); 138 *size = PAGE_ALIGN(*size);
110 n_pages = *size >> PAGE_SHIFT; 139 n_pages = *size >> PAGE_SHIFT;
111 140
112 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 141 buf->page_array = relay_alloc_page_array(n_pages);
113 if (!buf->page_array) 142 if (!buf->page_array)
114 return NULL; 143 return NULL;
115 144
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
130depopulate: 159depopulate:
131 for (j = 0; j < i; j++) 160 for (j = 0; j < i; j++)
132 __free_page(buf->page_array[j]); 161 __free_page(buf->page_array[j]);
133 kfree(buf->page_array); 162 relay_free_page_array(buf->page_array);
134 return NULL; 163 return NULL;
135} 164}
136 165
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
189 vunmap(buf->start); 218 vunmap(buf->start);
190 for (i = 0; i < buf->page_count; i++) 219 for (i = 0; i < buf->page_count; i++)
191 __free_page(buf->page_array[i]); 220 __free_page(buf->page_array[i]);
192 kfree(buf->page_array); 221 relay_free_page_array(buf->page_array);
193 } 222 }
194 chan->buf[buf->cpu] = NULL; 223 chan->buf[buf->cpu] = NULL;
195 kfree(buf->padding); 224 kfree(buf->padding);
@@ -1162,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1162 ret = 0; 1191 ret = 0;
1163 spliced = 0; 1192 spliced = 0;
1164 1193
1165 while (len) { 1194 while (len && !spliced) {
1166 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1167 if (ret < 0) 1196 if (ret < 0)
1168 break; 1197 break;
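
The motivation for the new relay helpers: the page-pointer array grows with the buffer, and kcalloc() needs physically contiguous memory. On a 64-bit machine a 16 MiB buffer is 4096 pages, so the array is 4096 * 8 = 32 KiB, an order-3 allocation that can fail on a fragmented system; once the array exceeds PAGE_SIZE the code now falls back to vmalloc(), and relay_free_page_array() picks the matching free with is_vmalloc_addr().
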
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
 
@@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 	}
 
 	counter->usage += val;
+	if (counter->usage > counter->max_usage)
+		counter->max_usage = counter->usage;
 	return 0;
 }
 
@@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
 	switch (member) {
 	case RES_USAGE:
 		return &counter->usage;
+	case RES_MAX_USAGE:
+		return &counter->max_usage;
 	case RES_LIMIT:
 		return &counter->limit;
 	case RES_FAILCNT:
@@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
 			pos, buf, s - buf);
 }
 
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+	return *res_counter_member(counter, member);
+}
+
 ssize_t res_counter_write(struct res_counter *counter, int member,
 		const char __user *userbuf, size_t nbytes, loff_t *pos,
 		int (*write_strategy)(char *st_buf, unsigned long long *val))
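
res_counter now records a high watermark: every successful charge updates max_usage under the counter lock the caller already holds, and the new res_counter_read_u64() returns any member as a raw value. A minimal caller sketch; charge_and_report() is hypothetical, but the locking mirrors what res_counter_charge_locked() requires:

/* Hypothetical caller: charge a counter and report the peak seen so far. */
static int charge_and_report(struct res_counter *cnt, unsigned long val)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&cnt->lock, flags);
	ret = res_counter_charge_locked(cnt, val);	/* bumps max_usage */
	spin_unlock_irqrestore(&cnt->lock, flags);

	if (!ret)
		printk(KERN_DEBUG "peak usage: %llu\n",
		       res_counter_read_u64(cnt, RES_MAX_USAGE));
	return ret;
}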
diff --git a/kernel/resource.c b/kernel/resource.c
index cee12cc47cab..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
 
 static int __init ioresources_init(void)
 {
-	struct proc_dir_entry *entry;
-
-	entry = create_proc_entry("ioports", 0, NULL);
-	if (entry)
-		entry->proc_fops = &proc_ioports_operations;
-	entry = create_proc_entry("iomem", 0, NULL);
-	if (entry)
-		entry->proc_fops = &proc_iomem_operations;
+	proc_create("ioports", 0, NULL, &proc_ioports_operations);
+	proc_create("iomem", 0, NULL, &proc_iomem_operations);
 	return 0;
 }
 __initcall(ioresources_init);
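
This is one of several create_proc_entry() conversions in this series. The old two-step registration publishes the /proc entry before ->proc_fops is assigned, so a concurrent open() can hit a half-initialized entry; proc_create() registers the entry with its file_operations already in place. The pattern, sketched for a hypothetical file (example_fops and the name are illustrative):

static const struct file_operations example_fops;	/* defined elsewhere */

static int __init example_init(void)
{
	/*
	 * Racy two-step, as removed above:
	 *	e = create_proc_entry("example", 0, NULL);
	 *	if (e)
	 *		e->proc_fops = &example_fops;
	 * proc_create() is atomic: the entry is never visible without fops.
	 */
	if (!proc_create("example", 0, NULL, &example_fops))
		return -ENOMEM;
	return 0;
}

The same conversion repeats below for sched_debug, timer_list, and timer_stats.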
diff --git a/kernel/sched.c b/kernel/sched.c
index 740fb409e5bb..e2f7f5acc807 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9057,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
 {
 	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
 }
 
-static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
 
@@ -9073,48 +9073,14 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				const char __user *userbuf,
-				size_t nbytes, loff_t *unused_ppos)
+				s64 val)
 {
-	char buffer[64];
-	int retval = 0;
-	s64 val;
-	char *end;
-
-	if (!nbytes)
-		return -EINVAL;
-	if (nbytes >= sizeof(buffer))
-		return -E2BIG;
-	if (copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
-
-	buffer[nbytes] = 0;	/* nul-terminate */
-
-	/* strip newline if necessary */
-	if (nbytes && (buffer[nbytes-1] == '\n'))
-		buffer[nbytes-1] = 0;
-	val = simple_strtoll(buffer, &end, 0);
-	if (*end)
-		return -EINVAL;
-
-	/* Pass to subsystem */
-	retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
-	if (!retval)
-		retval = nbytes;
-	return retval;
+	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
 }
 
-static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				char __user *buf, size_t nbytes,
-				loff_t *ppos)
+static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
 {
-	char tmp[64];
-	long val = sched_group_rt_runtime(cgroup_tg(cgrp));
-	int len = sprintf(tmp, "%ld\n", val);
-
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+	return sched_group_rt_runtime(cgroup_tg(cgrp));
 }
 
 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -9133,20 +9099,20 @@ static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
-		.read_uint = cpu_shares_read_uint,
-		.write_uint = cpu_shares_write_uint,
+		.read_u64 = cpu_shares_read_u64,
+		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
-		.read = cpu_rt_runtime_read,
-		.write = cpu_rt_runtime_write,
+		.read_s64 = cpu_rt_runtime_read,
+		.write_s64 = cpu_rt_runtime_write,
 	},
 	{
 		.name = "rt_period_us",
-		.read_uint = cpu_rt_period_read_uint,
-		.write_uint = cpu_rt_period_write_uint,
+		.read_u64 = cpu_rt_period_read_uint,
+		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
 };
@@ -9277,8 +9243,8 @@ out:
 static struct cftype files[] = {
 	{
 		.name = "usage",
-		.read_uint = cpuusage_read,
-		.write_uint = cpuusage_write,
+		.read_u64 = cpuusage_read,
+		.write_u64 = cpuusage_write,
 	},
 };
 
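
With the typed cftype callbacks adopted above, the cgroup core performs the user-buffer copy, NUL termination, and integer parsing that each raw ->read/->write handler previously open-coded (compare the deleted cpu_rt_runtime_write() body). A subsystem file reduces to two accessors; a hypothetical example in the same style, where example_css() and its ->limit field are assumptions and only the cftype fields come from the diff:

static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	return example_css(cgrp)->limit;	/* core formats the value */
}

static int example_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	example_css(cgrp)->limit = val;		/* core already parsed it */
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "limit",
		.read_u64 = example_read_u64,
		.write_u64 = example_write_u64,
	},
};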
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index f3f4af4b8b0f..8a9498e7c831 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -277,12 +277,9 @@ static int __init init_sched_debug_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = create_proc_entry("sched_debug", 0644, NULL);
+	pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
 	if (!pe)
 		return -ENOMEM;
-
-	pe->proc_fops = &sched_debug_fops;
-
 	return 0;
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 6a0cc71ee88d..e423d0d9e6ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1545,6 +1545,19 @@ out:
 	 *
 	 */
 
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
+				     cputime_t *utimep, cputime_t *stimep)
+{
+	*utimep = cputime_add(*utimep, t->utime);
+	*stimep = cputime_add(*stimep, t->stime);
+	r->ru_nvcsw += t->nvcsw;
+	r->ru_nivcsw += t->nivcsw;
+	r->ru_minflt += t->min_flt;
+	r->ru_majflt += t->maj_flt;
+	r->ru_inblock += task_io_get_inblock(t);
+	r->ru_oublock += task_io_get_oublock(t);
+}
+
 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
@@ -1554,6 +1567,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
 
+	if (who == RUSAGE_THREAD) {
+		accumulate_thread_rusage(p, r, &utime, &stime);
+		goto out;
+	}
+
 	rcu_read_lock();
 	if (!lock_task_sighand(p, &flags)) {
 		rcu_read_unlock();
@@ -1586,14 +1604,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_oublock += p->signal->oublock;
 			t = p;
 			do {
-				utime = cputime_add(utime, t->utime);
-				stime = cputime_add(stime, t->stime);
-				r->ru_nvcsw += t->nvcsw;
-				r->ru_nivcsw += t->nivcsw;
-				r->ru_minflt += t->min_flt;
-				r->ru_majflt += t->maj_flt;
-				r->ru_inblock += task_io_get_inblock(t);
-				r->ru_oublock += task_io_get_oublock(t);
+				accumulate_thread_rusage(t, r, &utime, &stime);
 				t = next_thread(t);
 			} while (t != p);
 			break;
@@ -1605,6 +1616,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	unlock_task_sighand(p, &flags);
 	rcu_read_unlock();
 
+out:
 	cputime_to_timeval(utime, &r->ru_utime);
 	cputime_to_timeval(stime, &r->ru_stime);
 }
@@ -1618,7 +1630,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 
 asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
 {
-	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
+	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+	    who != RUSAGE_THREAD)
 		return -EINVAL;
 	return getrusage(current, who, ru);
 }
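
RUSAGE_THREAD makes getrusage() report statistics for the calling thread alone instead of summing the whole thread group. A userspace check might look like this; the fallback #define matches the value the kernel headers of this era assign, and on kernels without the feature the call fails with EINVAL, exactly as the hunk above enforces:

#include <stdio.h>
#include <sys/resource.h>

#ifndef RUSAGE_THREAD
#define RUSAGE_THREAD	1	/* calling thread only */
#endif

int main(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_THREAD, &ru) == 0)
		printf("thread minor faults: %ld\n", ru.ru_minflt);
	else
		perror("getrusage");	/* EINVAL: kernel lacks RUSAGE_THREAD */
	return 0;
}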
@@ -1632,10 +1645,9 @@ asmlinkage long sys_umask(int mask)
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		unsigned long arg4, unsigned long arg5)
 {
-	long error;
+	long uninitialized_var(error);
 
-	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
-	if (error)
+	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
 		return error;
 
 	switch (option) {
@@ -1688,17 +1700,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = -EINVAL;
 		break;
 
-	case PR_GET_KEEPCAPS:
-		if (current->keep_capabilities)
-			error = 1;
-		break;
-	case PR_SET_KEEPCAPS:
-		if (arg2 != 0 && arg2 != 1) {
-			error = -EINVAL;
-			break;
-		}
-		current->keep_capabilities = arg2;
-		break;
 	case PR_SET_NAME: {
 		struct task_struct *me = current;
 		unsigned char ncomm[sizeof(me->comm)];
@@ -1732,17 +1733,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	case PR_SET_SECCOMP:
 		error = prctl_set_seccomp(arg2);
 		break;
-
-	case PR_CAPBSET_READ:
-		if (!cap_valid(arg2))
-			return -EINVAL;
-		return !!cap_raised(current->cap_bset, arg2);
-	case PR_CAPBSET_DROP:
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
-		return cap_prctl_drop(arg2);
-#else
-		return -EINVAL;
-#endif
 	case PR_GET_TSC:
 		error = GET_TSC_CTL(arg2);
 		break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fd3364827ccf..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
 #include <linux/writeback.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
+#include <linux/key.h>
 #include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/dcache.h>
@@ -144,12 +145,6 @@ extern int no_unaligned_warning;
 extern int max_lock_depth;
 #endif
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *, int, void __user *, size_t __user *,
-		void __user *, size_t, struct ctl_table *);
-#endif
-
-
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -809,6 +804,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
+#ifdef CONFIG_KEYS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "keys",
+		.mode		= 0555,
+		.child		= key_sysctls,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
@@ -1430,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root)
 }
 
 #ifdef CONFIG_SYSCTL_SYSCALL
+/* Perform the actual read/write of a sysctl table entry. */
+static int do_sysctl_strategy(struct ctl_table_root *root,
+			struct ctl_table *table,
+			int __user *name, int nlen,
+			void __user *oldval, size_t __user *oldlenp,
+			void __user *newval, size_t newlen)
+{
+	int op = 0, rc;
+
+	if (oldval)
+		op |= 004;
+	if (newval)
+		op |= 002;
+	if (sysctl_perm(root, table, op))
+		return -EPERM;
+
+	if (table->strategy) {
+		rc = table->strategy(table, name, nlen, oldval, oldlenp,
+				     newval, newlen);
+		if (rc < 0)
+			return rc;
+		if (rc > 0)
+			return 0;
+	}
+
+	/* If there is no strategy routine, or if the strategy returns
+	 * zero, proceed with automatic r/w */
+	if (table->data && table->maxlen) {
+		rc = sysctl_data(table, name, nlen, oldval, oldlenp,
+				 newval, newlen);
+		if (rc < 0)
+			return rc;
+	}
+	return 0;
+}
+
+static int parse_table(int __user *name, int nlen,
+		       void __user *oldval, size_t __user *oldlenp,
+		       void __user *newval, size_t newlen,
+		       struct ctl_table_root *root,
+		       struct ctl_table *table)
+{
+	int n;
+repeat:
+	if (!nlen)
+		return -ENOTDIR;
+	if (get_user(n, name))
+		return -EFAULT;
+	for ( ; table->ctl_name || table->procname; table++) {
+		if (!table->ctl_name)
+			continue;
+		if (n == table->ctl_name) {
+			int error;
+			if (table->child) {
+				if (sysctl_perm(root, table, 001))
+					return -EPERM;
+				name++;
+				nlen--;
+				table = table->child;
+				goto repeat;
+			}
+			error = do_sysctl_strategy(root, table, name, nlen,
+						   oldval, oldlenp,
+						   newval, newlen);
+			return error;
+		}
+	}
+	return -ENOTDIR;
+}
+
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
 	       void __user *newval, size_t newlen)
 {
@@ -1447,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	for (head = sysctl_head_next(NULL); head;
 			head = sysctl_head_next(head)) {
 		error = parse_table(name, nlen, oldval, oldlenp,
-				newval, newlen, head->ctl_table);
+				newval, newlen,
+				head->root, head->ctl_table);
 		if (error != -ENOTDIR) {
 			sysctl_head_finish(head);
 			break;
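
parse_table() consumes one integer of the binary sysctl name vector per iteration, descending through ->child tables until it hands the leaf to do_sysctl_strategy(); the new root argument lets both permission checks consult the owning ctl_table_root. For reference, this is the (even then deprecated) sysctl(2) interface it serves; an illustrative userspace call reading kernel.ostype by number:

#include <stdio.h>
#include <sys/sysctl.h>

int main(void)
{
	int name[] = { CTL_KERN, KERN_OSTYPE };	/* walked by parse_table() */
	char buf[64];
	size_t len = sizeof(buf);

	if (sysctl(name, 2, buf, &len, NULL, 0) == 0)
		printf("%.*s\n", (int)len, buf);
	return 0;
}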
@@ -1493,84 +1567,22 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }
 
-int sysctl_perm(struct ctl_table *table, int op)
+int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 {
 	int error;
+	int mode;
+
 	error = security_sysctl(table, op);
 	if (error)
 		return error;
-	return test_perm(table->mode, op);
-}
-
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *name, int nlen,
-		       void __user *oldval, size_t __user *oldlenp,
-		       void __user *newval, size_t newlen,
-		       struct ctl_table *table)
-{
-	int n;
-repeat:
-	if (!nlen)
-		return -ENOTDIR;
-	if (get_user(n, name))
-		return -EFAULT;
-	for ( ; table->ctl_name || table->procname; table++) {
-		if (!table->ctl_name)
-			continue;
-		if (n == table->ctl_name) {
-			int error;
-			if (table->child) {
-				if (sysctl_perm(table, 001))
-					return -EPERM;
-				name++;
-				nlen--;
-				table = table->child;
-				goto repeat;
-			}
-			error = do_sysctl_strategy(table, name, nlen,
-						   oldval, oldlenp,
-						   newval, newlen);
-			return error;
-		}
-	}
-	return -ENOTDIR;
-}
 
-/* Perform the actual read/write of a sysctl table entry. */
-int do_sysctl_strategy (struct ctl_table *table,
-			int __user *name, int nlen,
-			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen)
-{
-	int op = 0, rc;
-
-	if (oldval)
-		op |= 004;
-	if (newval)
-		op |= 002;
-	if (sysctl_perm(table, op))
-		return -EPERM;
+	if (root->permissions)
+		mode = root->permissions(root, current->nsproxy, table);
+	else
+		mode = table->mode;
 
-	if (table->strategy) {
-		rc = table->strategy(table, name, nlen, oldval, oldlenp,
-				     newval, newlen);
-		if (rc < 0)
-			return rc;
-		if (rc > 0)
-			return 0;
-	}
-
-	/* If there is no strategy routine, or if the strategy returns
-	 * zero, proceed with automatic r/w */
-	if (table->data && table->maxlen) {
-		rc = sysctl_data(table, name, nlen, oldval, oldlenp,
-				 newval, newlen);
-		if (rc < 0)
-			return rc;
-	}
-	return 0;
+	return test_perm(mode, op);
 }
-#endif /* CONFIG_SYSCTL_SYSCALL */
 
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
@@ -1583,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 
 static __init int sysctl_init(void)
 {
-	int err;
 	sysctl_set_parent(NULL, root_table);
-	err = sysctl_check_table(current->nsproxy, root_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+	{
+		int err;
+		err = sysctl_check_table(current->nsproxy, root_table);
+	}
+#endif
 	return 0;
 }
 
@@ -1712,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths(
 	header->unregistering = NULL;
 	header->root = root;
 	sysctl_set_parent(NULL, header->ctl_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
 	if (sysctl_check_table(namespaces, header->ctl_table)) {
 		kfree(header);
 		return NULL;
 	}
+#endif
 	spin_lock(&sysctl_lock);
 	header_list = lookup_header_list(root, namespaces);
 	list_add_tail(&header->ctl_entry, header_list);
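
The sysctl_perm() rewrite above adds an optional per-root hook: when a ctl_table_root supplies ->permissions(), the effective mode bits come from that callback instead of table->mode, so a root can tailor access per observer namespace. A hypothetical root using the hook; the field names follow the diff, everything else is illustrative:

static int example_permissions(struct ctl_table_root *root,
			       struct nsproxy *namespaces,
			       struct ctl_table *table)
{
	/* Full declared mode in the initial namespace, read-only elsewhere. */
	if (namespaces == init_task.nsproxy)
		return table->mode;
	return table->mode & ~0222;
}

static struct ctl_table_root example_root = {
	.permissions = example_permissions,
};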
diff --git a/kernel/time.c b/kernel/time.c
index 35d373a98782..86729042e4cd 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = create_proc_entry("timer_list", 0644, NULL);
+	pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
 	if (!pe)
 		return -ENOMEM;
-
-	pe->proc_fops = &timer_list_fops;
-
 	return 0;
 }
 __initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = create_proc_entry("timer_stats", 0644, NULL);
+	pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
 	if (!pe)
 		return -ENOMEM;
-
-	pe->proc_fops = &tstats_fops;
-
 	return 0;
 }
 __initcall(init_tstats_procfs);
diff --git a/kernel/user.c b/kernel/user.c
index debce602bfdd..aefbbfa3159f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
 	.files		= ATOMIC_INIT(0),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm	= 0,
-#ifdef CONFIG_KEYS
-	.uid_keyring	= &root_user_keyring,
-	.session_keyring = &root_session_keyring,
-#endif
 #ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
@@ -420,12 +416,12 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 		new->mq_bytes = 0;
 #endif
 		new->locked_shm = 0;
-
-		if (alloc_uid_keyring(new, current) < 0)
-			goto out_free_user;
+#ifdef CONFIG_KEYS
+		new->uid_keyring = new->session_keyring = NULL;
+#endif
 
 		if (sched_create_user(new) < 0)
-			goto out_put_keys;
+			goto out_free_user;
 
 		if (uids_user_create(new))
 			goto out_destoy_sched;
@@ -459,9 +455,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 
 out_destoy_sched:
 	sched_destroy_user(new);
-out_put_keys:
-	key_put(new->uid_keyring);
-	key_put(new->session_keyring);
out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/version.h>
 #include <linux/nsproxy.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 
 /*
@@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref)
 	release_uids(ns);
 	kfree(ns);
 }
+EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/version.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 /*
  * Clone a new ns copying an original utsname, setting refcount to 1
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00ff4d08e370..7db251a959c5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
  *
- * We queue the work to the CPU it was submitted, but there is no
- * guarantee that it will be processed by that CPU.
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
+ * it can be processed by another CPU.
  */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
@@ -772,7 +772,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
 	 * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -808,19 +808,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 void destroy_workqueue(struct workqueue_struct *wq)
 {
 	const cpumask_t *cpu_map = wq_cpu_map(wq);
-	struct cpu_workqueue_struct *cwq;
 	int cpu;
 
 	get_online_cpus();
 	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
-	put_online_cpus();
 
-	for_each_cpu_mask(cpu, *cpu_map) {
-		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-		cleanup_workqueue_thread(cwq, cpu);
-	}
+	for_each_cpu_mask(cpu, *cpu_map)
+		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
+	put_online_cpus();
 
 	free_percpu(wq->cpu_wq);
 	kfree(wq);
@@ -838,7 +835,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
-
 	case CPU_UP_PREPARE:
 		cpu_set(cpu, cpu_populated_map);
 	}
@@ -861,11 +857,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		case CPU_UP_CANCELED:
 			start_workqueue_thread(cwq, -1);
 		case CPU_DEAD:
-			cleanup_workqueue_thread(cwq, cpu);
+			cleanup_workqueue_thread(cwq);
 			break;
 		}
 	}
 
+	switch (action) {
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		cpu_clear(cpu, cpu_populated_map);
+	}
+
 	return NOTIFY_OK;
 }
 
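
Besides dropping the now-unused cpu argument from cleanup_workqueue_thread(), these hunks tighten two hotplug interactions: destroy_workqueue() now holds get_online_cpus() across the per-CPU thread cleanup, and cpu_populated_map loses its bit again when a CPU fails to come up or dies, so later workqueue creation does not start threads for vanished CPUs. The general notifier shape, with hypothetical names and the cpumask API of this era:

static cpumask_t example_populated_map;

static int __devinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		cpu_set(cpu, example_populated_map);	/* gaining per-CPU state */
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		cpu_clear(cpu, example_populated_map);	/* state torn down */
		break;
	}
	return NOTIFY_OK;
}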