author     Linus Torvalds <torvalds@linux-foundation.org>  2014-01-21 20:51:34 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-01-21 20:51:34 -0500
commit     f075e0f6993f41c72dbb1d3e7a2d7740f14e89e2 (patch)
tree       a25b464a67fffc6f43940e0e85e2735a48bb1ad7
parent     5cb7398caf69e3943df78435a19a8a77fe8b9463 (diff)
parent     dd4b0a4676907481256d16d5de0851b315a6f22c (diff)
Merge branch 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "The bulk of changes are cleanups and preparations for the upcoming
  kernfs conversion.

   - cgroup_event mechanism which is and will be used only by memcg is
     moved to memcg.

   - pidlist handling is updated so that it can be served by seq_file.

     Also, the list is not sorted if sane_behavior.  cgroup
     documentation explicitly states that the file is not sorted but it
     has been for quite some time.

   - All cgroup file handling now happens on top of seq_file.  This is
     to prepare for kernfs conversion.  In addition, all operations are
     restructured so that they map 1-1 to kernfs operations.

   - Other cleanups and low-pri fixes"

* 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (40 commits)
  cgroup: trivial style updates
  cgroup: remove stray references to css_id
  doc: cgroups: Fix typo in doc/cgroups
  cgroup: fix fail path in cgroup_load_subsys()
  cgroup: fix missing unlock on error in cgroup_load_subsys()
  cgroup: remove for_each_root_subsys()
  cgroup: implement for_each_css()
  cgroup: factor out cgroup_subsys_state creation into create_css()
  cgroup: combine css handling loops in cgroup_create()
  cgroup: reorder operations in cgroup_create()
  cgroup: make for_each_subsys() useable under cgroup_root_mutex
  cgroup: css iterations and css_from_dir() are safe under cgroup_mutex
  cgroup: unify pidlist and other file handling
  cgroup: replace cftype->read_seq_string() with cftype->seq_show()
  cgroup: attach cgroup_open_file to all cgroup files
  cgroup: generalize cgroup_pidlist_open_file
  cgroup: unify read path so that seq_file is always used
  cgroup: unify cgroup_write_X64() and cgroup_write_string()
  cgroup: remove cftype->read(), ->read_map() and ->write()
  hugetlb_cgroup: convert away from cftype->read()
  ...
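To make the seq_file conversion described above concrete, here is a minimal sketch of a converted read path: the callback becomes an ordinary seq_file show method and recovers its css and cftype through the seq_css()/seq_cft() helpers added to include/linux/cgroup.h later in this diff. The foo_* controller, its struct, and css_to_foo() are hypothetical names used only for illustration, not part of the series.

#include <linux/kernel.h>
#include <linux/cgroup.h>
#include <linux/seq_file.h>

/* Hypothetical controller state, purely for illustration. */
struct foo_cgroup {
	struct cgroup_subsys_state css;
	u64 limit;
};

static inline struct foo_cgroup *css_to_foo(struct cgroup_subsys_state *css)
{
	return container_of(css, struct foo_cgroup, css);
}

/*
 * Before this series the same file would have been wired up as
 *
 *	static int foo_limit_show(struct cgroup_subsys_state *css,
 *				  struct cftype *cft, struct seq_file *sf);
 *	...
 *	.read_seq_string = foo_limit_show,
 *
 * with the css and cftype passed in explicitly.
 */
static int foo_limit_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%llu\n",
		   (unsigned long long)css_to_foo(seq_css(sf))->limit);
	return 0;
}

static struct cftype foo_files[] = {
	{
		.name = "limit",
		.seq_show = foo_limit_show,	/* was .read_seq_string */
	},
	{ }	/* terminate */
};

This is exactly the pattern the blk-throttle and cfq-iosched hunks below apply mechanically to every .read_seq_string user.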
 Documentation/cgroups/cgroups.txt          |   20
 Documentation/cgroups/memory.txt           |    4
 Documentation/cgroups/resource_counter.txt |    4
 block/blk-throttle.c                       |   35
 block/cfq-iosched.c                        |  131
 drivers/md/bcache/request.c                |    1
 include/linux/cgroup.h                     |  112
 include/linux/vmpressure.h                 |    8
 init/Kconfig                               |    3
 kernel/cgroup.c                            | 1202
 kernel/cgroup_freezer.c                    |    7
 kernel/cpuset.c                            |   71
 kernel/sched/core.c                        |   13
 kernel/sched/cpuacct.c                     |   18
 mm/hugetlb_cgroup.c                        |   22
 mm/memcontrol.c                            |  426
 mm/page_cgroup.c                           |    2
 mm/vmpressure.c                            |   26
 net/core/netprio_cgroup.c                  |    8
 security/device_cgroup.c                   |    7
 20 files changed, 1022 insertions(+), 1098 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 638bf17ff869..821de56d1580 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -24,7 +24,6 @@ CONTENTS:
24 2.1 Basic Usage 24 2.1 Basic Usage
25 2.2 Attaching processes 25 2.2 Attaching processes
26 2.3 Mounting hierarchies by name 26 2.3 Mounting hierarchies by name
27 2.4 Notification API
283. Kernel API 273. Kernel API
29 3.1 Overview 28 3.1 Overview
30 3.2 Synchronization 29 3.2 Synchronization
@@ -472,25 +471,6 @@ you give a subsystem a name.
472The name of the subsystem appears as part of the hierarchy description 471The name of the subsystem appears as part of the hierarchy description
473in /proc/mounts and /proc/<pid>/cgroups. 472in /proc/mounts and /proc/<pid>/cgroups.
474 473
4752.4 Notification API
476--------------------
477
478There is mechanism which allows to get notifications about changing
479status of a cgroup.
480
481To register a new notification handler you need to:
482 - create a file descriptor for event notification using eventfd(2);
483 - open a control file to be monitored (e.g. memory.usage_in_bytes);
484 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
485 Interpretation of args is defined by control file implementation;
486
487eventfd will be woken up by control file implementation or when the
488cgroup is removed.
489
490To unregister a notification handler just close eventfd.
491
492NOTE: Support of notifications should be implemented for the control
493file. See documentation for the subsystem.
494 474
4953. Kernel API 4753. Kernel API
496============= 476=============
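The section removed above documents the eventfd-based notification protocol, which after this series survives only inside memcg. As a rough userspace illustration of the three steps it lists (the mount point, group name, and threshold below are placeholders, and memory.usage_in_bytes interprets the args field as a threshold in bytes):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	/* Placeholder cgroup path; adjust to the actual mount point/group. */
	const char *grp = "/sys/fs/cgroup/memory/mygroup";
	char path[256], cmd[128];
	uint64_t fired;
	int efd, cfd, ecfd;

	efd = eventfd(0, 0);				/* 1. eventfd(2) */

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	cfd = open(path, O_RDONLY);			/* 2. control file */

	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	ecfd = open(path, O_WRONLY);

	if (efd < 0 || cfd < 0 || ecfd < 0) {
		perror("setup");
		return 1;
	}

	/* 3. write "<event_fd> <control_fd> <args>"; here args is a
	 *    100 MB usage threshold. */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 100ULL << 20);
	if (write(ecfd, cmd, strlen(cmd)) < 0)
		perror("register event");

	/* The kernel signals the eventfd when the threshold is crossed
	 * (or the cgroup is removed); closing efd unregisters. */
	if (read(efd, &fired, sizeof(fired)) == sizeof(fired))
		printf("notification received\n");

	close(ecfd);
	close(cfd);
	close(efd);
	return 0;
}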
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e2bc132608fd..2622115276aa 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
577per-node page counts including "hierarchical_<counter>" which sums up all 577per-node page counts including "hierarchical_<counter>" which sums up all
578hierarchical children's values in addition to the memcg's own value. 578hierarchical children's values in addition to the memcg's own value.
579 579
580The ouput format of memory.numa_stat is: 580The output format of memory.numa_stat is:
581 581
582total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... 582total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
583file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... 583file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
@@ -670,7 +670,7 @@ page tables.
670 670
6718.1 Interface 6718.1 Interface
672 672
673This feature is disabled by default. It can be enabledi (and disabled again) by 673This feature is disabled by default. It can be enabled (and disabled again) by
674writing to memory.move_charge_at_immigrate of the destination cgroup. 674writing to memory.move_charge_at_immigrate of the destination cgroup.
675 675
676If you want to enable it: 676If you want to enable it:
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index c4d99ed0b418..52e1da16a309 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -97,8 +97,8 @@ to work with it.
97 (struct res_counter *rc, struct res_counter *top, 97 (struct res_counter *rc, struct res_counter *top,
98 unsinged long val) 98 unsinged long val)
99 99
100 Almost same as res_cunter_uncharge() but propagation of uncharge 100 Almost same as res_counter_uncharge() but propagation of uncharge
101 stops when rc == top. This is useful when kill a res_coutner in 101 stops when rc == top. This is useful when kill a res_counter in
102 child cgroup. 102 child cgroup.
103 103
104 2.1 Other accounting routines 104 2.1 Other accounting routines
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 06534049afba..a760857e6b62 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
1303 return __blkg_prfill_rwstat(sf, pd, &rwstat); 1303 return __blkg_prfill_rwstat(sf, pd, &rwstat);
1304} 1304}
1305 1305
1306static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, 1306static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
1307 struct cftype *cft, struct seq_file *sf)
1308{ 1307{
1309 struct blkcg *blkcg = css_to_blkcg(css); 1308 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
1310 1309 &blkcg_policy_throtl, seq_cft(sf)->private, true);
1311 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
1312 cft->private, true);
1313 return 0; 1310 return 0;
1314} 1311}
1315 1312
@@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1335 return __blkg_prfill_u64(sf, pd, v); 1332 return __blkg_prfill_u64(sf, pd, v);
1336} 1333}
1337 1334
1338static int tg_print_conf_u64(struct cgroup_subsys_state *css, 1335static int tg_print_conf_u64(struct seq_file *sf, void *v)
1339 struct cftype *cft, struct seq_file *sf)
1340{ 1336{
1341 blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, 1337 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
1342 &blkcg_policy_throtl, cft->private, false); 1338 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1343 return 0; 1339 return 0;
1344} 1340}
1345 1341
1346static int tg_print_conf_uint(struct cgroup_subsys_state *css, 1342static int tg_print_conf_uint(struct seq_file *sf, void *v)
1347 struct cftype *cft, struct seq_file *sf)
1348{ 1343{
1349 blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, 1344 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
1350 &blkcg_policy_throtl, cft->private, false); 1345 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1351 return 0; 1346 return 0;
1352} 1347}
1353 1348
@@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = {
1428 { 1423 {
1429 .name = "throttle.read_bps_device", 1424 .name = "throttle.read_bps_device",
1430 .private = offsetof(struct throtl_grp, bps[READ]), 1425 .private = offsetof(struct throtl_grp, bps[READ]),
1431 .read_seq_string = tg_print_conf_u64, 1426 .seq_show = tg_print_conf_u64,
1432 .write_string = tg_set_conf_u64, 1427 .write_string = tg_set_conf_u64,
1433 .max_write_len = 256, 1428 .max_write_len = 256,
1434 }, 1429 },
1435 { 1430 {
1436 .name = "throttle.write_bps_device", 1431 .name = "throttle.write_bps_device",
1437 .private = offsetof(struct throtl_grp, bps[WRITE]), 1432 .private = offsetof(struct throtl_grp, bps[WRITE]),
1438 .read_seq_string = tg_print_conf_u64, 1433 .seq_show = tg_print_conf_u64,
1439 .write_string = tg_set_conf_u64, 1434 .write_string = tg_set_conf_u64,
1440 .max_write_len = 256, 1435 .max_write_len = 256,
1441 }, 1436 },
1442 { 1437 {
1443 .name = "throttle.read_iops_device", 1438 .name = "throttle.read_iops_device",
1444 .private = offsetof(struct throtl_grp, iops[READ]), 1439 .private = offsetof(struct throtl_grp, iops[READ]),
1445 .read_seq_string = tg_print_conf_uint, 1440 .seq_show = tg_print_conf_uint,
1446 .write_string = tg_set_conf_uint, 1441 .write_string = tg_set_conf_uint,
1447 .max_write_len = 256, 1442 .max_write_len = 256,
1448 }, 1443 },
1449 { 1444 {
1450 .name = "throttle.write_iops_device", 1445 .name = "throttle.write_iops_device",
1451 .private = offsetof(struct throtl_grp, iops[WRITE]), 1446 .private = offsetof(struct throtl_grp, iops[WRITE]),
1452 .read_seq_string = tg_print_conf_uint, 1447 .seq_show = tg_print_conf_uint,
1453 .write_string = tg_set_conf_uint, 1448 .write_string = tg_set_conf_uint,
1454 .max_write_len = 256, 1449 .max_write_len = 256,
1455 }, 1450 },
1456 { 1451 {
1457 .name = "throttle.io_service_bytes", 1452 .name = "throttle.io_service_bytes",
1458 .private = offsetof(struct tg_stats_cpu, service_bytes), 1453 .private = offsetof(struct tg_stats_cpu, service_bytes),
1459 .read_seq_string = tg_print_cpu_rwstat, 1454 .seq_show = tg_print_cpu_rwstat,
1460 }, 1455 },
1461 { 1456 {
1462 .name = "throttle.io_serviced", 1457 .name = "throttle.io_serviced",
1463 .private = offsetof(struct tg_stats_cpu, serviced), 1458 .private = offsetof(struct tg_stats_cpu, serviced),
1464 .read_seq_string = tg_print_cpu_rwstat, 1459 .seq_show = tg_print_cpu_rwstat,
1465 }, 1460 },
1466 { } /* terminate */ 1461 { } /* terminate */
1467}; 1462};
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4d5cec1ad80d..744833b630c6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1632 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); 1632 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1633} 1633}
1634 1634
1635static int cfqg_print_weight_device(struct cgroup_subsys_state *css, 1635static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1636 struct cftype *cft, struct seq_file *sf)
1637{ 1636{
1638 blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, 1637 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1639 &blkcg_policy_cfq, 0, false); 1638 cfqg_prfill_weight_device, &blkcg_policy_cfq,
1639 0, false);
1640 return 0; 1640 return 0;
1641} 1641}
1642 1642
@@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1650 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); 1650 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1651} 1651}
1652 1652
1653static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, 1653static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1654 struct cftype *cft,
1655 struct seq_file *sf)
1656{ 1654{
1657 blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, 1655 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1658 &blkcg_policy_cfq, 0, false); 1656 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1657 0, false);
1659 return 0; 1658 return 0;
1660} 1659}
1661 1660
1662static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1661static int cfq_print_weight(struct seq_file *sf, void *v)
1663 struct seq_file *sf)
1664{ 1662{
1665 seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); 1663 seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
1666 return 0; 1664 return 0;
1667} 1665}
1668 1666
1669static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, 1667static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1670 struct cftype *cft, struct seq_file *sf)
1671{ 1668{
1672 seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); 1669 seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
1673 return 0; 1670 return 0;
1674} 1671}
1675 1672
@@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1762 return __cfq_set_weight(css, cft, val, true); 1759 return __cfq_set_weight(css, cft, val, true);
1763} 1760}
1764 1761
1765static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, 1762static int cfqg_print_stat(struct seq_file *sf, void *v)
1766 struct seq_file *sf)
1767{ 1763{
1768 struct blkcg *blkcg = css_to_blkcg(css); 1764 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1769 1765 &blkcg_policy_cfq, seq_cft(sf)->private, false);
1770 blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
1771 cft->private, false);
1772 return 0; 1766 return 0;
1773} 1767}
1774 1768
1775static int cfqg_print_rwstat(struct cgroup_subsys_state *css, 1769static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1776 struct cftype *cft, struct seq_file *sf)
1777{ 1770{
1778 struct blkcg *blkcg = css_to_blkcg(css); 1771 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1779 1772 &blkcg_policy_cfq, seq_cft(sf)->private, true);
1780 blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1781 cft->private, true);
1782 return 0; 1773 return 0;
1783} 1774}
1784 1775
@@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1798 return __blkg_prfill_rwstat(sf, pd, &sum); 1789 return __blkg_prfill_rwstat(sf, pd, &sum);
1799} 1790}
1800 1791
1801static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, 1792static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1802 struct cftype *cft, struct seq_file *sf)
1803{ 1793{
1804 struct blkcg *blkcg = css_to_blkcg(css); 1794 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1805 1795 cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1806 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, 1796 seq_cft(sf)->private, false);
1807 &blkcg_policy_cfq, cft->private, false);
1808 return 0; 1797 return 0;
1809} 1798}
1810 1799
1811static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, 1800static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1812 struct cftype *cft, struct seq_file *sf)
1813{ 1801{
1814 struct blkcg *blkcg = css_to_blkcg(css); 1802 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1815 1803 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1816 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, 1804 seq_cft(sf)->private, true);
1817 &blkcg_policy_cfq, cft->private, true);
1818 return 0; 1805 return 0;
1819} 1806}
1820 1807
@@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1835} 1822}
1836 1823
1837/* print avg_queue_size */ 1824/* print avg_queue_size */
1838static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, 1825static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1839 struct cftype *cft, struct seq_file *sf)
1840{ 1826{
1841 struct blkcg *blkcg = css_to_blkcg(css); 1827 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1842 1828 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1843 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, 1829 0, false);
1844 &blkcg_policy_cfq, 0, false);
1845 return 0; 1830 return 0;
1846} 1831}
1847#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1832#endif /* CONFIG_DEBUG_BLK_CGROUP */
@@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = {
1851 { 1836 {
1852 .name = "weight_device", 1837 .name = "weight_device",
1853 .flags = CFTYPE_ONLY_ON_ROOT, 1838 .flags = CFTYPE_ONLY_ON_ROOT,
1854 .read_seq_string = cfqg_print_leaf_weight_device, 1839 .seq_show = cfqg_print_leaf_weight_device,
1855 .write_string = cfqg_set_leaf_weight_device, 1840 .write_string = cfqg_set_leaf_weight_device,
1856 .max_write_len = 256, 1841 .max_write_len = 256,
1857 }, 1842 },
1858 { 1843 {
1859 .name = "weight", 1844 .name = "weight",
1860 .flags = CFTYPE_ONLY_ON_ROOT, 1845 .flags = CFTYPE_ONLY_ON_ROOT,
1861 .read_seq_string = cfq_print_leaf_weight, 1846 .seq_show = cfq_print_leaf_weight,
1862 .write_u64 = cfq_set_leaf_weight, 1847 .write_u64 = cfq_set_leaf_weight,
1863 }, 1848 },
1864 1849
@@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = {
1866 { 1851 {
1867 .name = "weight_device", 1852 .name = "weight_device",
1868 .flags = CFTYPE_NOT_ON_ROOT, 1853 .flags = CFTYPE_NOT_ON_ROOT,
1869 .read_seq_string = cfqg_print_weight_device, 1854 .seq_show = cfqg_print_weight_device,
1870 .write_string = cfqg_set_weight_device, 1855 .write_string = cfqg_set_weight_device,
1871 .max_write_len = 256, 1856 .max_write_len = 256,
1872 }, 1857 },
1873 { 1858 {
1874 .name = "weight", 1859 .name = "weight",
1875 .flags = CFTYPE_NOT_ON_ROOT, 1860 .flags = CFTYPE_NOT_ON_ROOT,
1876 .read_seq_string = cfq_print_weight, 1861 .seq_show = cfq_print_weight,
1877 .write_u64 = cfq_set_weight, 1862 .write_u64 = cfq_set_weight,
1878 }, 1863 },
1879 1864
1880 { 1865 {
1881 .name = "leaf_weight_device", 1866 .name = "leaf_weight_device",
1882 .read_seq_string = cfqg_print_leaf_weight_device, 1867 .seq_show = cfqg_print_leaf_weight_device,
1883 .write_string = cfqg_set_leaf_weight_device, 1868 .write_string = cfqg_set_leaf_weight_device,
1884 .max_write_len = 256, 1869 .max_write_len = 256,
1885 }, 1870 },
1886 { 1871 {
1887 .name = "leaf_weight", 1872 .name = "leaf_weight",
1888 .read_seq_string = cfq_print_leaf_weight, 1873 .seq_show = cfq_print_leaf_weight,
1889 .write_u64 = cfq_set_leaf_weight, 1874 .write_u64 = cfq_set_leaf_weight,
1890 }, 1875 },
1891 1876
@@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = {
1893 { 1878 {
1894 .name = "time", 1879 .name = "time",
1895 .private = offsetof(struct cfq_group, stats.time), 1880 .private = offsetof(struct cfq_group, stats.time),
1896 .read_seq_string = cfqg_print_stat, 1881 .seq_show = cfqg_print_stat,
1897 }, 1882 },
1898 { 1883 {
1899 .name = "sectors", 1884 .name = "sectors",
1900 .private = offsetof(struct cfq_group, stats.sectors), 1885 .private = offsetof(struct cfq_group, stats.sectors),
1901 .read_seq_string = cfqg_print_stat, 1886 .seq_show = cfqg_print_stat,
1902 }, 1887 },
1903 { 1888 {
1904 .name = "io_service_bytes", 1889 .name = "io_service_bytes",
1905 .private = offsetof(struct cfq_group, stats.service_bytes), 1890 .private = offsetof(struct cfq_group, stats.service_bytes),
1906 .read_seq_string = cfqg_print_rwstat, 1891 .seq_show = cfqg_print_rwstat,
1907 }, 1892 },
1908 { 1893 {
1909 .name = "io_serviced", 1894 .name = "io_serviced",
1910 .private = offsetof(struct cfq_group, stats.serviced), 1895 .private = offsetof(struct cfq_group, stats.serviced),
1911 .read_seq_string = cfqg_print_rwstat, 1896 .seq_show = cfqg_print_rwstat,
1912 }, 1897 },
1913 { 1898 {
1914 .name = "io_service_time", 1899 .name = "io_service_time",
1915 .private = offsetof(struct cfq_group, stats.service_time), 1900 .private = offsetof(struct cfq_group, stats.service_time),
1916 .read_seq_string = cfqg_print_rwstat, 1901 .seq_show = cfqg_print_rwstat,
1917 }, 1902 },
1918 { 1903 {
1919 .name = "io_wait_time", 1904 .name = "io_wait_time",
1920 .private = offsetof(struct cfq_group, stats.wait_time), 1905 .private = offsetof(struct cfq_group, stats.wait_time),
1921 .read_seq_string = cfqg_print_rwstat, 1906 .seq_show = cfqg_print_rwstat,
1922 }, 1907 },
1923 { 1908 {
1924 .name = "io_merged", 1909 .name = "io_merged",
1925 .private = offsetof(struct cfq_group, stats.merged), 1910 .private = offsetof(struct cfq_group, stats.merged),
1926 .read_seq_string = cfqg_print_rwstat, 1911 .seq_show = cfqg_print_rwstat,
1927 }, 1912 },
1928 { 1913 {
1929 .name = "io_queued", 1914 .name = "io_queued",
1930 .private = offsetof(struct cfq_group, stats.queued), 1915 .private = offsetof(struct cfq_group, stats.queued),
1931 .read_seq_string = cfqg_print_rwstat, 1916 .seq_show = cfqg_print_rwstat,
1932 }, 1917 },
1933 1918
1934 /* the same statictics which cover the cfqg and its descendants */ 1919 /* the same statictics which cover the cfqg and its descendants */
1935 { 1920 {
1936 .name = "time_recursive", 1921 .name = "time_recursive",
1937 .private = offsetof(struct cfq_group, stats.time), 1922 .private = offsetof(struct cfq_group, stats.time),
1938 .read_seq_string = cfqg_print_stat_recursive, 1923 .seq_show = cfqg_print_stat_recursive,
1939 }, 1924 },
1940 { 1925 {
1941 .name = "sectors_recursive", 1926 .name = "sectors_recursive",
1942 .private = offsetof(struct cfq_group, stats.sectors), 1927 .private = offsetof(struct cfq_group, stats.sectors),
1943 .read_seq_string = cfqg_print_stat_recursive, 1928 .seq_show = cfqg_print_stat_recursive,
1944 }, 1929 },
1945 { 1930 {
1946 .name = "io_service_bytes_recursive", 1931 .name = "io_service_bytes_recursive",
1947 .private = offsetof(struct cfq_group, stats.service_bytes), 1932 .private = offsetof(struct cfq_group, stats.service_bytes),
1948 .read_seq_string = cfqg_print_rwstat_recursive, 1933 .seq_show = cfqg_print_rwstat_recursive,
1949 }, 1934 },
1950 { 1935 {
1951 .name = "io_serviced_recursive", 1936 .name = "io_serviced_recursive",
1952 .private = offsetof(struct cfq_group, stats.serviced), 1937 .private = offsetof(struct cfq_group, stats.serviced),
1953 .read_seq_string = cfqg_print_rwstat_recursive, 1938 .seq_show = cfqg_print_rwstat_recursive,
1954 }, 1939 },
1955 { 1940 {
1956 .name = "io_service_time_recursive", 1941 .name = "io_service_time_recursive",
1957 .private = offsetof(struct cfq_group, stats.service_time), 1942 .private = offsetof(struct cfq_group, stats.service_time),
1958 .read_seq_string = cfqg_print_rwstat_recursive, 1943 .seq_show = cfqg_print_rwstat_recursive,
1959 }, 1944 },
1960 { 1945 {
1961 .name = "io_wait_time_recursive", 1946 .name = "io_wait_time_recursive",
1962 .private = offsetof(struct cfq_group, stats.wait_time), 1947 .private = offsetof(struct cfq_group, stats.wait_time),
1963 .read_seq_string = cfqg_print_rwstat_recursive, 1948 .seq_show = cfqg_print_rwstat_recursive,
1964 }, 1949 },
1965 { 1950 {
1966 .name = "io_merged_recursive", 1951 .name = "io_merged_recursive",
1967 .private = offsetof(struct cfq_group, stats.merged), 1952 .private = offsetof(struct cfq_group, stats.merged),
1968 .read_seq_string = cfqg_print_rwstat_recursive, 1953 .seq_show = cfqg_print_rwstat_recursive,
1969 }, 1954 },
1970 { 1955 {
1971 .name = "io_queued_recursive", 1956 .name = "io_queued_recursive",
1972 .private = offsetof(struct cfq_group, stats.queued), 1957 .private = offsetof(struct cfq_group, stats.queued),
1973 .read_seq_string = cfqg_print_rwstat_recursive, 1958 .seq_show = cfqg_print_rwstat_recursive,
1974 }, 1959 },
1975#ifdef CONFIG_DEBUG_BLK_CGROUP 1960#ifdef CONFIG_DEBUG_BLK_CGROUP
1976 { 1961 {
1977 .name = "avg_queue_size", 1962 .name = "avg_queue_size",
1978 .read_seq_string = cfqg_print_avg_queue_size, 1963 .seq_show = cfqg_print_avg_queue_size,
1979 }, 1964 },
1980 { 1965 {
1981 .name = "group_wait_time", 1966 .name = "group_wait_time",
1982 .private = offsetof(struct cfq_group, stats.group_wait_time), 1967 .private = offsetof(struct cfq_group, stats.group_wait_time),
1983 .read_seq_string = cfqg_print_stat, 1968 .seq_show = cfqg_print_stat,
1984 }, 1969 },
1985 { 1970 {
1986 .name = "idle_time", 1971 .name = "idle_time",
1987 .private = offsetof(struct cfq_group, stats.idle_time), 1972 .private = offsetof(struct cfq_group, stats.idle_time),
1988 .read_seq_string = cfqg_print_stat, 1973 .seq_show = cfqg_print_stat,
1989 }, 1974 },
1990 { 1975 {
1991 .name = "empty_time", 1976 .name = "empty_time",
1992 .private = offsetof(struct cfq_group, stats.empty_time), 1977 .private = offsetof(struct cfq_group, stats.empty_time),
1993 .read_seq_string = cfqg_print_stat, 1978 .seq_show = cfqg_print_stat,
1994 }, 1979 },
1995 { 1980 {
1996 .name = "dequeue", 1981 .name = "dequeue",
1997 .private = offsetof(struct cfq_group, stats.dequeue), 1982 .private = offsetof(struct cfq_group, stats.dequeue),
1998 .read_seq_string = cfqg_print_stat, 1983 .seq_show = cfqg_print_stat,
1999 }, 1984 },
2000 { 1985 {
2001 .name = "unaccounted_time", 1986 .name = "unaccounted_time",
2002 .private = offsetof(struct cfq_group, stats.unaccounted_time), 1987 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2003 .read_seq_string = cfqg_print_stat, 1988 .seq_show = cfqg_print_stat,
2004 }, 1989 },
2005#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1990#endif /* CONFIG_DEBUG_BLK_CGROUP */
2006 { } /* terminate */ 1991 { } /* terminate */
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index fbcc851ed5a5..61bcfc21d2a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
163static void bcachecg_destroy(struct cgroup *cgroup) 163static void bcachecg_destroy(struct cgroup *cgroup)
164{ 164{
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 free_css_id(&bcache_subsys, &cg->css);
167 kfree(cg); 166 kfree(cg);
168} 167}
169 168
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 39c1d9469677..5c097596104b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -21,6 +21,7 @@
21#include <linux/xattr.h> 21#include <linux/xattr.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/percpu-refcount.h> 23#include <linux/percpu-refcount.h>
24#include <linux/seq_file.h>
24 25
25#ifdef CONFIG_CGROUPS 26#ifdef CONFIG_CGROUPS
26 27
@@ -28,8 +29,6 @@ struct cgroupfs_root;
28struct cgroup_subsys; 29struct cgroup_subsys;
29struct inode; 30struct inode;
30struct cgroup; 31struct cgroup;
31struct css_id;
32struct eventfd_ctx;
33 32
34extern int cgroup_init_early(void); 33extern int cgroup_init_early(void);
35extern int cgroup_init(void); 34extern int cgroup_init(void);
@@ -79,8 +78,6 @@ struct cgroup_subsys_state {
79 struct cgroup_subsys_state *parent; 78 struct cgroup_subsys_state *parent;
80 79
81 unsigned long flags; 80 unsigned long flags;
82 /* ID for this css, if possible */
83 struct css_id __rcu *id;
84 81
85 /* percpu_ref killing and RCU release */ 82 /* percpu_ref killing and RCU release */
86 struct rcu_head rcu_head; 83 struct rcu_head rcu_head;
@@ -239,10 +236,6 @@ struct cgroup {
239 struct rcu_head rcu_head; 236 struct rcu_head rcu_head;
240 struct work_struct destroy_work; 237 struct work_struct destroy_work;
241 238
242 /* List of events which userspace want to receive */
243 struct list_head event_list;
244 spinlock_t event_list_lock;
245
246 /* directory xattrs */ 239 /* directory xattrs */
247 struct simple_xattrs xattrs; 240 struct simple_xattrs xattrs;
248}; 241};
@@ -280,6 +273,9 @@ enum {
280 * - "tasks" is removed. Everything should be at process 273 * - "tasks" is removed. Everything should be at process
281 * granularity. Use "cgroup.procs" instead. 274 * granularity. Use "cgroup.procs" instead.
282 * 275 *
276 * - "cgroup.procs" is not sorted. pids will be unique unless they
277 * got recycled inbetween reads.
278 *
283 * - "release_agent" and "notify_on_release" are removed. 279 * - "release_agent" and "notify_on_release" are removed.
284 * Replacement notification mechanism will be implemented. 280 * Replacement notification mechanism will be implemented.
285 * 281 *
@@ -320,9 +316,6 @@ struct cgroupfs_root {
320 /* Unique id for this hierarchy. */ 316 /* Unique id for this hierarchy. */
321 int hierarchy_id; 317 int hierarchy_id;
322 318
323 /* A list running through the attached subsystems */
324 struct list_head subsys_list;
325
326 /* The root cgroup for this hierarchy */ 319 /* The root cgroup for this hierarchy */
327 struct cgroup top_cgroup; 320 struct cgroup top_cgroup;
328 321
@@ -389,16 +382,6 @@ struct css_set {
389}; 382};
390 383
391/* 384/*
392 * cgroup_map_cb is an abstract callback API for reporting map-valued
393 * control files
394 */
395
396struct cgroup_map_cb {
397 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
398 void *state;
399};
400
401/*
402 * struct cftype: handler definitions for cgroup control files 385 * struct cftype: handler definitions for cgroup control files
403 * 386 *
404 * When reading/writing to a file: 387 * When reading/writing to a file:
@@ -445,10 +428,6 @@ struct cftype {
445 */ 428 */
446 struct cgroup_subsys *ss; 429 struct cgroup_subsys *ss;
447 430
448 int (*open)(struct inode *inode, struct file *file);
449 ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
450 struct file *file,
451 char __user *buf, size_t nbytes, loff_t *ppos);
452 /* 431 /*
453 * read_u64() is a shortcut for the common case of returning a 432 * read_u64() is a shortcut for the common case of returning a
454 * single integer. Use it in place of read() 433 * single integer. Use it in place of read()
@@ -458,24 +437,14 @@ struct cftype {
458 * read_s64() is a signed version of read_u64() 437 * read_s64() is a signed version of read_u64()
459 */ 438 */
460 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 439 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
461 /*
462 * read_map() is used for defining a map of key/value
463 * pairs. It should call cb->fill(cb, key, value) for each
464 * entry. The key/value pairs (and their ordering) should not
465 * change between reboots.
466 */
467 int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
468 struct cgroup_map_cb *cb);
469 /*
470 * read_seq_string() is used for outputting a simple sequence
471 * using seqfile.
472 */
473 int (*read_seq_string)(struct cgroup_subsys_state *css,
474 struct cftype *cft, struct seq_file *m);
475 440
476 ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, 441 /* generic seq_file read interface */
477 struct file *file, 442 int (*seq_show)(struct seq_file *sf, void *v);
478 const char __user *buf, size_t nbytes, loff_t *ppos); 443
444 /* optional ops, implement all or none */
445 void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
446 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
447 void (*seq_stop)(struct seq_file *sf, void *v);
479 448
480 /* 449 /*
481 * write_u64() is a shortcut for the common case of accepting 450 * write_u64() is a shortcut for the common case of accepting
@@ -504,27 +473,6 @@ struct cftype {
504 * kick type for multiplexing. 473 * kick type for multiplexing.
505 */ 474 */
506 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 475 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
507
508 int (*release)(struct inode *inode, struct file *file);
509
510 /*
511 * register_event() callback will be used to add new userspace
512 * waiter for changes related to the cftype. Implement it if
513 * you want to provide this functionality. Use eventfd_signal()
514 * on eventfd to send notification to userspace.
515 */
516 int (*register_event)(struct cgroup_subsys_state *css,
517 struct cftype *cft, struct eventfd_ctx *eventfd,
518 const char *args);
519 /*
520 * unregister_event() callback will be called when userspace
521 * closes the eventfd or on cgroup removing.
522 * This callback must be implemented, if you want provide
523 * notification functionality.
524 */
525 void (*unregister_event)(struct cgroup_subsys_state *css,
526 struct cftype *cft,
527 struct eventfd_ctx *eventfd);
528}; 476};
529 477
530/* 478/*
@@ -538,6 +486,26 @@ struct cftype_set {
538}; 486};
539 487
540/* 488/*
489 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
490 * access directly.
491 */
492struct cfent {
493 struct list_head node;
494 struct dentry *dentry;
495 struct cftype *type;
496 struct cgroup_subsys_state *css;
497
498 /* file xattrs */
499 struct simple_xattrs xattrs;
500};
501
502/* seq_file->private points to the following, only ->priv is public */
503struct cgroup_open_file {
504 struct cfent *cfe;
505 void *priv;
506};
507
508/*
541 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This 509 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
542 * function can be called as long as @cgrp is accessible. 510 * function can be called as long as @cgrp is accessible.
543 */ 511 */
@@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
552 return rcu_dereference(cgrp->name)->name; 520 return rcu_dereference(cgrp->name)->name;
553} 521}
554 522
523static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
524{
525 struct cgroup_open_file *of = seq->private;
526 return of->cfe->css;
527}
528
529static inline struct cftype *seq_cft(struct seq_file *seq)
530{
531 struct cgroup_open_file *of = seq->private;
532 return of->cfe->type;
533}
534
555int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 535int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
556int cgroup_rm_cftypes(struct cftype *cfts); 536int cgroup_rm_cftypes(struct cftype *cfts);
557 537
@@ -631,12 +611,8 @@ struct cgroup_subsys {
631#define MAX_CGROUP_TYPE_NAMELEN 32 611#define MAX_CGROUP_TYPE_NAMELEN 32
632 const char *name; 612 const char *name;
633 613
634 /* 614 /* link to parent, protected by cgroup_lock() */
635 * Link to parent, and list entry in parent's children.
636 * Protected by cgroup_lock()
637 */
638 struct cgroupfs_root *root; 615 struct cgroupfs_root *root;
639 struct list_head sibling;
640 616
641 /* list of cftype_sets */ 617 /* list of cftype_sets */
642 struct list_head cftsets; 618 struct list_head cftsets;
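The cftype changes above replace the old read()/read_map()/read_seq_string() callbacks with a single seq_show() plus an optional seq_start/seq_next/seq_stop trio ("implement all or none"). As a second self-contained sketch of how a multi-record control file might plug the optional ops, with the foo_* names again purely hypothetical:

/* Hypothetical per-css data, only to illustrate the optional seq ops. */
struct foo_cgroup {
	struct cgroup_subsys_state css;
	int nr_vals;
	u64 vals[16];
};

static inline struct foo_cgroup *css_to_foo(struct cgroup_subsys_state *css)
{
	return container_of(css, struct foo_cgroup, css);
}

static void *foo_vals_start(struct seq_file *sf, loff_t *ppos)
{
	struct foo_cgroup *foo = css_to_foo(seq_css(sf));

	return *ppos < foo->nr_vals ? &foo->vals[*ppos] : NULL;
}

static void *foo_vals_next(struct seq_file *sf, void *v, loff_t *ppos)
{
	struct foo_cgroup *foo = css_to_foo(seq_css(sf));

	++*ppos;
	return *ppos < foo->nr_vals ? &foo->vals[*ppos] : NULL;
}

static void foo_vals_stop(struct seq_file *sf, void *v)
{
	/* nothing to release in this sketch */
}

static int foo_vals_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%llu\n", (unsigned long long)*(u64 *)v);
	return 0;
}

static struct cftype foo_multi_files[] = {
	{
		.name		= "vals",
		.seq_start	= foo_vals_start,
		.seq_next	= foo_vals_next,
		.seq_stop	= foo_vals_stop,
		.seq_show	= foo_vals_show,
	},
	{ }	/* terminate */
};

Files that leave the optional ops NULL fall back to the single_open()-style behaviour implemented in cgroup_seqfile_start/next/stop further down in kernel/cgroup.c.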
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3f3788d49362..3e4535876d37 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -7,6 +7,7 @@
7#include <linux/gfp.h> 7#include <linux/gfp.h>
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/cgroup.h> 9#include <linux/cgroup.h>
10#include <linux/eventfd.h>
10 11
11struct vmpressure { 12struct vmpressure {
12 unsigned long scanned; 13 unsigned long scanned;
@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
33extern void vmpressure_cleanup(struct vmpressure *vmpr); 34extern void vmpressure_cleanup(struct vmpressure *vmpr);
34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 35extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 36extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 37extern int vmpressure_register_event(struct mem_cgroup *memcg,
37extern int vmpressure_register_event(struct cgroup_subsys_state *css,
38 struct cftype *cft,
39 struct eventfd_ctx *eventfd, 38 struct eventfd_ctx *eventfd,
40 const char *args); 39 const char *args);
41extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, 40extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
42 struct cftype *cft,
43 struct eventfd_ctx *eventfd); 41 struct eventfd_ctx *eventfd);
44#else 42#else
45static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
diff --git a/init/Kconfig b/init/Kconfig
index 5236dc562a36..8d402e33b7fc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -854,7 +854,6 @@ config NUMA_BALANCING
854 854
855menuconfig CGROUPS 855menuconfig CGROUPS
856 boolean "Control Group support" 856 boolean "Control Group support"
857 depends on EVENTFD
858 help 857 help
859 This option adds support for grouping sets of processes together, for 858 This option adds support for grouping sets of processes together, for
860 use with process control subsystems such as Cpusets, CFS, memory 859 use with process control subsystems such as Cpusets, CFS, memory
@@ -921,6 +920,7 @@ config MEMCG
921 bool "Memory Resource Controller for Control Groups" 920 bool "Memory Resource Controller for Control Groups"
922 depends on RESOURCE_COUNTERS 921 depends on RESOURCE_COUNTERS
923 select MM_OWNER 922 select MM_OWNER
923 select EVENTFD
924 help 924 help
925 Provides a memory resource controller that manages both anonymous 925 Provides a memory resource controller that manages both anonymous
926 memory and page cache. (See Documentation/cgroups/memory.txt) 926 memory and page cache. (See Documentation/cgroups/memory.txt)
@@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS
1160 1160
1161config SCHED_AUTOGROUP 1161config SCHED_AUTOGROUP
1162 bool "Automatic process group scheduling" 1162 bool "Automatic process group scheduling"
1163 select EVENTFD
1164 select CGROUPS 1163 select CGROUPS
1165 select CGROUP_SCHED 1164 select CGROUP_SCHED
1166 select FAIR_GROUP_SCHED 1165 select FAIR_GROUP_SCHED
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe9217..e2f46ba37f72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h> 44#include <linux/slab.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 58#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 59#include <linux/kthread.h>
63#include <linux/file.h>
64 60
65#include <linux/atomic.h> 61#include <linux/atomic.h>
66 62
67/* 63/*
64 * pidlists linger the following amount before being destroyed. The goal
65 * is avoiding frequent destruction in the middle of consecutive read calls
66 * Expiring in the middle is a performance problem not a correctness one.
67 * 1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70
71/*
68 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * cgroup_mutex is the master lock. Any modification to cgroup or its
69 * hierarchy must be performed while holding it. 73 * hierarchy must be performed while holding it.
70 * 74 *
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
89 93
90static DEFINE_MUTEX(cgroup_root_mutex); 94static DEFINE_MUTEX(cgroup_root_mutex);
91 95
96#define cgroup_assert_mutex_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \
98 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108
92/* 109/*
93 * cgroup destruction makes heavy use of work items and there can be a lot 110 * cgroup destruction makes heavy use of work items and there can be a lot
94 * of concurrent destructions. Use a separate workqueue so that cgroup 111 * of concurrent destructions. Use a separate workqueue so that cgroup
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
98static struct workqueue_struct *cgroup_destroy_wq; 115static struct workqueue_struct *cgroup_destroy_wq;
99 116
100/* 117/*
118 * pidlist destructions need to be flushed on cgroup destruction. Use a
119 * separate workqueue as flush domain.
120 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122
123/*
101 * Generate an array of cgroup subsystem pointers. At boot time, this is 124 * Generate an array of cgroup subsystem pointers. At boot time, this is
102 * populated with the built in subsystems, and modular subsystems are 125 * populated with the built in subsystems, and modular subsystems are
103 * registered after that. The mutable section of this array is protected by 126 * registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
119/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
120static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
121 144
122/*
123 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
124 */
125struct cfent {
126 struct list_head node;
127 struct dentry *dentry;
128 struct cftype *type;
129 struct cgroup_subsys_state *css;
130
131 /* file xattrs */
132 struct simple_xattrs xattrs;
133};
134
135/*
136 * cgroup_event represents events which userspace want to receive.
137 */
138struct cgroup_event {
139 /*
140 * css which the event belongs to.
141 */
142 struct cgroup_subsys_state *css;
143 /*
144 * Control file which the event associated.
145 */
146 struct cftype *cft;
147 /*
148 * eventfd to signal userspace about the event.
149 */
150 struct eventfd_ctx *eventfd;
151 /*
152 * Each of these stored in a list by the cgroup.
153 */
154 struct list_head list;
155 /*
156 * All fields below needed to unregister event when
157 * userspace closes eventfd.
158 */
159 poll_table pt;
160 wait_queue_head_t *wqh;
161 wait_queue_t wait;
162 struct work_struct remove;
163};
164
165/* The list of hierarchy roots */ 145/* The list of hierarchy roots */
166 146
167static LIST_HEAD(cgroup_roots); 147static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
201 bool is_add); 181 bool is_add);
202static int cgroup_file_release(struct inode *inode, struct file *file); 182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
203 184
204/** 185/**
205 * cgroup_css - obtain a cgroup's css for the specified subsystem 186 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
262} 243}
263 244
264/** 245/**
246 * for_each_css - iterate all css's of a cgroup
247 * @css: the iteration cursor
248 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 * @cgrp: the target cgroup to iterate css's of
250 *
251 * Should be called under cgroup_mutex.
252 */
253#define for_each_css(css, ssid, cgrp) \
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \
257 lockdep_is_held(&cgroup_mutex)))) { } \
258 else
259
260/**
265 * for_each_subsys - iterate all loaded cgroup subsystems 261 * for_each_subsys - iterate all loaded cgroup subsystems
266 * @ss: the iteration cursor 262 * @ss: the iteration cursor
267 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
268 * 264 *
269 * Should be called under cgroup_mutex. 265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
270 */ 267 */
271#define for_each_subsys(ss, i) \ 268#define for_each_subsys(ss, ssid) \
272 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
273 if (({ lockdep_assert_held(&cgroup_mutex); \ 270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
274 !((ss) = cgroup_subsys[i]); })) { } \ 271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
275 else 272 else
276 273
277/** 274/**
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
286 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
287 (((ss) = cgroup_subsys[i]) || true); (i)++) 284 (((ss) = cgroup_subsys[i]) || true); (i)++)
288 285
289/* iterate each subsystem attached to a hierarchy */
290#define for_each_root_subsys(root, ss) \
291 list_for_each_entry((ss), &(root)->subsys_list, sibling)
292
293/* iterate across the active hierarchies */ 286/* iterate across the active hierarchies */
294#define for_each_active_root(root) \ 287#define for_each_active_root(root) \
295 list_for_each_entry((root), &cgroup_roots, root_list) 288 list_for_each_entry((root), &cgroup_roots, root_list)
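A small hypothetical helper (it would live in kernel/cgroup.c, where cgroup_mutex is visible) showing how the for_each_css() iterator introduced in the hunk above reads in practice; it skips NULL subsystem slots automatically:

/* Hypothetical debugging helper, for illustration only. */
static int cgrp_count_csses(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	int ssid, n = 0;

	lockdep_assert_held(&cgroup_mutex);

	for_each_css(css, ssid, cgrp)
		n++;

	return n;
}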
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
863 */ 856 */
864 deactivate_super(cgrp->root->sb); 857 deactivate_super(cgrp->root->sb);
865 858
866 /* 859 cgroup_pidlist_destroy_all(cgrp);
867 * if we're getting rid of the cgroup, refcount should ensure
868 * that there are no pidlists left.
869 */
870 BUG_ON(!list_empty(&cgrp->pidlists));
871 860
872 simple_xattrs_free(&cgrp->xattrs); 861 simple_xattrs_free(&cgrp->xattrs);
873 862
@@ -1050,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1050 cgroup_css(cgroup_dummy_top, ss)); 1039 cgroup_css(cgroup_dummy_top, ss));
1051 cgroup_css(cgrp, ss)->cgroup = cgrp; 1040 cgroup_css(cgrp, ss)->cgroup = cgrp;
1052 1041
1053 list_move(&ss->sibling, &root->subsys_list);
1054 ss->root = root; 1042 ss->root = root;
1055 if (ss->bind) 1043 if (ss->bind)
1056 ss->bind(cgroup_css(cgrp, ss)); 1044 ss->bind(cgroup_css(cgrp, ss));
@@ -1069,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1069 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1057 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1070 1058
1071 cgroup_subsys[i]->root = &cgroup_dummy_root; 1059 cgroup_subsys[i]->root = &cgroup_dummy_root;
1072 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1073 1060
1074 /* subsystem is now free - drop reference on module */ 1061 /* subsystem is now free - drop reference on module */
1075 module_put(ss->module); 1062 module_put(ss->module);
@@ -1096,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1096{ 1083{
1097 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1084 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1098 struct cgroup_subsys *ss; 1085 struct cgroup_subsys *ss;
1086 int ssid;
1099 1087
1100 mutex_lock(&cgroup_root_mutex); 1088 mutex_lock(&cgroup_root_mutex);
1101 for_each_root_subsys(root, ss) 1089 for_each_subsys(ss, ssid)
1102 seq_printf(seq, ",%s", ss->name); 1090 if (root->subsys_mask & (1 << ssid))
1091 seq_printf(seq, ",%s", ss->name);
1103 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1092 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1104 seq_puts(seq, ",sane_behavior"); 1093 seq_puts(seq, ",sane_behavior");
1105 if (root->flags & CGRP_ROOT_NOPREFIX) 1094 if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1362,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1362 INIT_LIST_HEAD(&cgrp->pidlists); 1351 INIT_LIST_HEAD(&cgrp->pidlists);
1363 mutex_init(&cgrp->pidlist_mutex); 1352 mutex_init(&cgrp->pidlist_mutex);
1364 cgrp->dummy_css.cgroup = cgrp; 1353 cgrp->dummy_css.cgroup = cgrp;
1365 INIT_LIST_HEAD(&cgrp->event_list);
1366 spin_lock_init(&cgrp->event_list_lock);
1367 simple_xattrs_init(&cgrp->xattrs); 1354 simple_xattrs_init(&cgrp->xattrs);
1368} 1355}
1369 1356
@@ -1371,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1371{ 1358{
1372 struct cgroup *cgrp = &root->top_cgroup; 1359 struct cgroup *cgrp = &root->top_cgroup;
1373 1360
1374 INIT_LIST_HEAD(&root->subsys_list);
1375 INIT_LIST_HEAD(&root->root_list); 1361 INIT_LIST_HEAD(&root->root_list);
1376 root->number_of_cgroups = 1; 1362 root->number_of_cgroups = 1;
1377 cgrp->root = root; 1363 cgrp->root = root;
@@ -1693,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1693 return ERR_PTR(ret); 1679 return ERR_PTR(ret);
1694} 1680}
1695 1681
1696static void cgroup_kill_sb(struct super_block *sb) { 1682static void cgroup_kill_sb(struct super_block *sb)
1683{
1697 struct cgroupfs_root *root = sb->s_fs_info; 1684 struct cgroupfs_root *root = sb->s_fs_info;
1698 struct cgroup *cgrp = &root->top_cgroup; 1685 struct cgroup *cgrp = &root->top_cgroup;
1699 struct cgrp_cset_link *link, *tmp_link; 1686 struct cgrp_cset_link *link, *tmp_link;
@@ -1976,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1976 bool threadgroup) 1963 bool threadgroup)
1977{ 1964{
1978 int retval, i, group_size; 1965 int retval, i, group_size;
1979 struct cgroup_subsys *ss, *failed_ss = NULL;
1980 struct cgroupfs_root *root = cgrp->root; 1966 struct cgroupfs_root *root = cgrp->root;
1967 struct cgroup_subsys_state *css, *failed_css = NULL;
1981 /* threadgroup list cursor and array */ 1968 /* threadgroup list cursor and array */
1982 struct task_struct *leader = tsk; 1969 struct task_struct *leader = tsk;
1983 struct task_and_cgroup *tc; 1970 struct task_and_cgroup *tc;
@@ -2050,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 /* 2037 /*
2051 * step 1: check that we can legitimately attach to the cgroup. 2038 * step 1: check that we can legitimately attach to the cgroup.
2052 */ 2039 */
2053 for_each_root_subsys(root, ss) { 2040 for_each_css(css, i, cgrp) {
2054 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2041 if (css->ss->can_attach) {
2055 2042 retval = css->ss->can_attach(css, &tset);
2056 if (ss->can_attach) {
2057 retval = ss->can_attach(css, &tset);
2058 if (retval) { 2043 if (retval) {
2059 failed_ss = ss; 2044 failed_css = css;
2060 goto out_cancel_attach; 2045 goto out_cancel_attach;
2061 } 2046 }
2062 } 2047 }
@@ -2092,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2092 /* 2077 /*
2093 * step 4: do subsystem attach callbacks. 2078 * step 4: do subsystem attach callbacks.
2094 */ 2079 */
2095 for_each_root_subsys(root, ss) { 2080 for_each_css(css, i, cgrp)
2096 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2081 if (css->ss->attach)
2097 2082 css->ss->attach(css, &tset);
2098 if (ss->attach)
2099 ss->attach(css, &tset);
2100 }
2101 2083
2102 /* 2084 /*
2103 * step 5: success! and cleanup 2085 * step 5: success! and cleanup
@@ -2114,13 +2096,11 @@ out_put_css_set_refs:
2114 } 2096 }
2115out_cancel_attach: 2097out_cancel_attach:
2116 if (retval) { 2098 if (retval) {
2117 for_each_root_subsys(root, ss) { 2099 for_each_css(css, i, cgrp) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2100 if (css == failed_css)
2119
2120 if (ss == failed_ss)
2121 break; 2101 break;
2122 if (ss->cancel_attach) 2102 if (css->ss->cancel_attach)
2123 ss->cancel_attach(css, &tset); 2103 css->ss->cancel_attach(css, &tset);
2124 } 2104 }
2125 } 2105 }
2126out_free_group_list: 2106out_free_group_list:
@@ -2148,7 +2128,7 @@ retry_find_task:
2148 tsk = find_task_by_vpid(pid); 2128 tsk = find_task_by_vpid(pid);
2149 if (!tsk) { 2129 if (!tsk) {
2150 rcu_read_unlock(); 2130 rcu_read_unlock();
2151 ret= -ESRCH; 2131 ret = -ESRCH;
2152 goto out_unlock_cgroup; 2132 goto out_unlock_cgroup;
2153 } 2133 }
2154 /* 2134 /*
@@ -2260,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2260 return 0; 2240 return 0;
2261} 2241}
2262 2242
2263static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2243static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2264 struct cftype *cft, struct seq_file *seq)
2265{ 2244{
2266 struct cgroup *cgrp = css->cgroup; 2245 struct cgroup *cgrp = seq_css(seq)->cgroup;
2267 2246
2268 if (!cgroup_lock_live_group(cgrp)) 2247 if (!cgroup_lock_live_group(cgrp))
2269 return -ENODEV; 2248 return -ENODEV;
@@ -2273,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2273 return 0; 2252 return 0;
2274} 2253}
2275 2254
2276static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2255static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2277 struct cftype *cft, struct seq_file *seq)
2278{ 2256{
2279 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2257 struct cgroup *cgrp = seq_css(seq)->cgroup;
2258
2259 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2280 return 0; 2260 return 0;
2281} 2261}
2282 2262
2283/* A buffer size big enough for numbers or short strings */ 2263/* A buffer size big enough for numbers or short strings */
2284#define CGROUP_LOCAL_BUFFER_SIZE 64 2264#define CGROUP_LOCAL_BUFFER_SIZE 64
2285 2265
2286static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2266static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2287 struct cftype *cft, struct file *file, 2267 size_t nbytes, loff_t *ppos)
2288 const char __user *userbuf, size_t nbytes,
2289 loff_t *unused_ppos)
2290{ 2268{
2291 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2269 struct cfent *cfe = __d_cfe(file->f_dentry);
2292 int retval = 0; 2270 struct cftype *cft = __d_cft(file->f_dentry);
2293 char *end; 2271 struct cgroup_subsys_state *css = cfe->css;
2272 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2273 char *buf;
2274 int ret;
2294 2275
2295 if (!nbytes) 2276 if (nbytes >= max_bytes)
2296 return -EINVAL;
2297 if (nbytes >= sizeof(buffer))
2298 return -E2BIG; 2277 return -E2BIG;
2299 if (copy_from_user(buffer, userbuf, nbytes))
2300 return -EFAULT;
2301 2278
2302 buffer[nbytes] = 0; /* nul-terminate */ 2279 buf = kmalloc(nbytes + 1, GFP_KERNEL);
2303 if (cft->write_u64) { 2280 if (!buf)
2304 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2281 return -ENOMEM;
2305 if (*end) 2282
2306 return -EINVAL; 2283 if (copy_from_user(buf, userbuf, nbytes)) {
2307 retval = cft->write_u64(css, cft, val); 2284 ret = -EFAULT;
2285 goto out_free;
2286 }
2287
2288 buf[nbytes] = '\0';
2289
2290 if (cft->write_string) {
2291 ret = cft->write_string(css, cft, strstrip(buf));
2292 } else if (cft->write_u64) {
2293 unsigned long long v;
2294 ret = kstrtoull(buf, 0, &v);
2295 if (!ret)
2296 ret = cft->write_u64(css, cft, v);
2297 } else if (cft->write_s64) {
2298 long long v;
2299 ret = kstrtoll(buf, 0, &v);
2300 if (!ret)
2301 ret = cft->write_s64(css, cft, v);
2302 } else if (cft->trigger) {
2303 ret = cft->trigger(css, (unsigned int)cft->private);
2308 } else { 2304 } else {
2309 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2305 ret = -EINVAL;
2310 if (*end)
2311 return -EINVAL;
2312 retval = cft->write_s64(css, cft, val);
2313 } 2306 }
2314 if (!retval) 2307out_free:
2315 retval = nbytes; 2308 kfree(buf);
2316 return retval; 2309 return ret ?: nbytes;
2317} 2310}
2318 2311
2319static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2312/*
2320 struct cftype *cft, struct file *file, 2313 * seqfile ops/methods for returning structured data. Currently just
2321 const char __user *userbuf, size_t nbytes, 2314 * supports string->u64 maps, but can be extended in future.
2322 loff_t *unused_ppos) 2315 */
2316
2317static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2323{ 2318{
2324 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2319 struct cftype *cft = seq_cft(seq);
2325 int retval = 0;
2326 size_t max_bytes = cft->max_write_len;
2327 char *buffer = local_buffer;
2328 2320
2329 if (!max_bytes) 2321 if (cft->seq_start) {
2330 max_bytes = sizeof(local_buffer) - 1; 2322 return cft->seq_start(seq, ppos);
2331 if (nbytes >= max_bytes) 2323 } else {
2332 return -E2BIG; 2324 /*
2333 /* Allocate a dynamic buffer if we need one */ 2325 * The same behavior and code as single_open(). Returns
2334 if (nbytes >= sizeof(local_buffer)) { 2326 * !NULL if pos is at the beginning; otherwise, NULL.
2335 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2327 */
2336 if (buffer == NULL) 2328 return NULL + !*ppos;
2337 return -ENOMEM;
2338 }
2339 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2340 retval = -EFAULT;
2341 goto out;
2342 } 2329 }
2343
2344 buffer[nbytes] = 0; /* nul-terminate */
2345 retval = cft->write_string(css, cft, strstrip(buffer));
2346 if (!retval)
2347 retval = nbytes;
2348out:
2349 if (buffer != local_buffer)
2350 kfree(buffer);
2351 return retval;
2352} 2330}
2353 2331
2354static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2332static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2355 size_t nbytes, loff_t *ppos)
2356{ 2333{
2357 struct cfent *cfe = __d_cfe(file->f_dentry); 2334 struct cftype *cft = seq_cft(seq);
2358 struct cftype *cft = __d_cft(file->f_dentry);
2359 struct cgroup_subsys_state *css = cfe->css;
2360 2335
2361 if (cft->write) 2336 if (cft->seq_next) {
2362 return cft->write(css, cft, file, buf, nbytes, ppos); 2337 return cft->seq_next(seq, v, ppos);
2363 if (cft->write_u64 || cft->write_s64) 2338 } else {
2364 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2339 /*
2365 if (cft->write_string) 2340 * The same behavior and code as single_open(), always
2366 return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2341 * terminate after the initial read.
2367 if (cft->trigger) { 2342 */
2368 int ret = cft->trigger(css, (unsigned int)cft->private); 2343 ++*ppos;
2369 return ret ? ret : nbytes; 2344 return NULL;
2370 } 2345 }
2371 return -EINVAL;
2372} 2346}
2373 2347
2374static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2348static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2375 struct cftype *cft, struct file *file,
2376 char __user *buf, size_t nbytes, loff_t *ppos)
2377{ 2349{
2378 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2350 struct cftype *cft = seq_cft(seq);
2379 u64 val = cft->read_u64(css, cft);
2380 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2381 2351
2382 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2352 if (cft->seq_stop)
2353 cft->seq_stop(seq, v);
2383} 2354}
2384 2355
2385static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2356static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2386 struct cftype *cft, struct file *file,
2387 char __user *buf, size_t nbytes, loff_t *ppos)
2388{ 2357{
2389 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2358 struct cftype *cft = seq_cft(m);
2390 s64 val = cft->read_s64(css, cft); 2359 struct cgroup_subsys_state *css = seq_css(m);
2391 int len = sprintf(tmp, "%lld\n", (long long) val);
2392 2360
2393 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2361 if (cft->seq_show)
2394} 2362 return cft->seq_show(m, arg);
2395 2363
2396static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2397 size_t nbytes, loff_t *ppos)
2398{
2399 struct cfent *cfe = __d_cfe(file->f_dentry);
2400 struct cftype *cft = __d_cft(file->f_dentry);
2401 struct cgroup_subsys_state *css = cfe->css;
2402
2403 if (cft->read)
2404 return cft->read(css, cft, file, buf, nbytes, ppos);
2405 if (cft->read_u64) 2364 if (cft->read_u64)
2406 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2365 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2407 if (cft->read_s64) 2366 else if (cft->read_s64)
2408 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2367 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2409 return -EINVAL; 2368 else
2410} 2369 return -EINVAL;
2411 2370 return 0;
2412/*
2413 * seqfile ops/methods for returning structured data. Currently just
2414 * supports string->u64 maps, but can be extended in future.
2415 */
2416
2417static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2418{
2419 struct seq_file *sf = cb->state;
2420 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2421}
2422
2423static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2424{
2425 struct cfent *cfe = m->private;
2426 struct cftype *cft = cfe->type;
2427 struct cgroup_subsys_state *css = cfe->css;
2428
2429 if (cft->read_map) {
2430 struct cgroup_map_cb cb = {
2431 .fill = cgroup_map_add,
2432 .state = m,
2433 };
2434 return cft->read_map(css, cft, &cb);
2435 }
2436 return cft->read_seq_string(css, cft, m);
2437} 2371}
2438 2372
2439static const struct file_operations cgroup_seqfile_operations = { 2373static struct seq_operations cgroup_seq_operations = {
2440 .read = seq_read, 2374 .start = cgroup_seqfile_start,
2441 .write = cgroup_file_write, 2375 .next = cgroup_seqfile_next,
2442 .llseek = seq_lseek, 2376 .stop = cgroup_seqfile_stop,
2443 .release = cgroup_file_release, 2377 .show = cgroup_seqfile_show,
2444}; 2378};
2445 2379
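/*
 * Illustrative aside, not from this patch: for files that define no
 * seq_start/seq_next of their own, the operations above collapse to
 * single_open() semantics.  The "NULL + !*ppos" trick written out
 * long-hand:
 */
static void *single_record_start(struct seq_file *seq, loff_t *ppos)
{
	/* any non-NULL cookie at pos 0, NULL (EOF) afterwards; ->show()
	 * for such files never dereferences the iterator */
	return *ppos == 0 ? (void *)1 : NULL;
}

static void *single_record_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	++*ppos;	/* exactly one record: stop right after showing it */
	return NULL;
}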
2446static int cgroup_file_open(struct inode *inode, struct file *file) 2380static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2449,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2449 struct cftype *cft = __d_cft(file->f_dentry); 2383 struct cftype *cft = __d_cft(file->f_dentry);
2450 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2384 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2451 struct cgroup_subsys_state *css; 2385 struct cgroup_subsys_state *css;
2386 struct cgroup_open_file *of;
2452 int err; 2387 int err;
2453 2388
2454 err = generic_file_open(inode, file); 2389 err = generic_file_open(inode, file);
@@ -2478,32 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2478 WARN_ON_ONCE(cfe->css && cfe->css != css); 2413 WARN_ON_ONCE(cfe->css && cfe->css != css);
2479 cfe->css = css; 2414 cfe->css = css;
2480 2415
2481 if (cft->read_map || cft->read_seq_string) { 2416 of = __seq_open_private(file, &cgroup_seq_operations,
2482 file->f_op = &cgroup_seqfile_operations; 2417 sizeof(struct cgroup_open_file));
2483 err = single_open(file, cgroup_seqfile_show, cfe); 2418 if (of) {
2484 } else if (cft->open) { 2419 of->cfe = cfe;
2485 err = cft->open(inode, file); 2420 return 0;
2486 } 2421 }
2487 2422
2488 if (css->ss && err) 2423 if (css->ss)
2489 css_put(css); 2424 css_put(css);
2490 return err; 2425 return -ENOMEM;
2491} 2426}
2492 2427
2493static int cgroup_file_release(struct inode *inode, struct file *file) 2428static int cgroup_file_release(struct inode *inode, struct file *file)
2494{ 2429{
2495 struct cfent *cfe = __d_cfe(file->f_dentry); 2430 struct cfent *cfe = __d_cfe(file->f_dentry);
2496 struct cftype *cft = __d_cft(file->f_dentry);
2497 struct cgroup_subsys_state *css = cfe->css; 2431 struct cgroup_subsys_state *css = cfe->css;
2498 int ret = 0;
2499 2432
2500 if (cft->release)
2501 ret = cft->release(inode, file);
2502 if (css->ss) 2433 if (css->ss)
2503 css_put(css); 2434 css_put(css);
2504 if (file->f_op == &cgroup_seqfile_operations) 2435 return seq_release_private(inode, file);
2505 single_release(inode, file);
2506 return ret;
2507} 2436}
2508 2437
2509/* 2438/*
@@ -2614,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2614} 2543}
2615 2544
2616static const struct file_operations cgroup_file_operations = { 2545static const struct file_operations cgroup_file_operations = {
2617 .read = cgroup_file_read, 2546 .read = seq_read,
2618 .write = cgroup_file_write, 2547 .write = cgroup_file_write,
2619 .llseek = generic_file_llseek, 2548 .llseek = generic_file_llseek,
2620 .open = cgroup_file_open, 2549 .open = cgroup_file_open,
@@ -2639,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2639 .removexattr = cgroup_removexattr, 2568 .removexattr = cgroup_removexattr,
2640}; 2569};
2641 2570
2642/*
2643 * Check if a file is a control file
2644 */
2645static inline struct cftype *__file_cft(struct file *file)
2646{
2647 if (file_inode(file)->i_fop != &cgroup_file_operations)
2648 return ERR_PTR(-EINVAL);
2649 return __d_cft(file->f_dentry);
2650}
2651
2652static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2571static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2653 struct super_block *sb) 2572 struct super_block *sb)
2654{ 2573{
@@ -2706,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2706 if (cft->mode) 2625 if (cft->mode)
2707 return cft->mode; 2626 return cft->mode;
2708 2627
2709 if (cft->read || cft->read_u64 || cft->read_s64 || 2628 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2710 cft->read_map || cft->read_seq_string)
2711 mode |= S_IRUGO; 2629 mode |= S_IRUGO;
2712 2630
2713 if (cft->write || cft->write_u64 || cft->write_s64 || 2631 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2714 cft->write_string || cft->trigger) 2632 cft->trigger)
2715 mode |= S_IWUSR; 2633 mode |= S_IWUSR;
2716 2634
2717 return mode; 2635 return mode;
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
3007 * @parent_css: css whose children to walk 2925 * @parent_css: css whose children to walk
3008 * 2926 *
3009 * This function returns the next child of @parent_css and should be called 2927 * This function returns the next child of @parent_css and should be called
3010 * under RCU read lock. The only requirement is that @parent_css and 2928 * under either cgroup_mutex or RCU read lock. The only requirement is
3011 * @pos_css are accessible. The next sibling is guaranteed to be returned 2929 * that @parent_css and @pos_css are accessible. The next sibling is
3012 * regardless of their states. 2930 * guaranteed to be returned regardless of their states.
3013 */ 2931 */
3014struct cgroup_subsys_state * 2932struct cgroup_subsys_state *
3015css_next_child(struct cgroup_subsys_state *pos_css, 2933css_next_child(struct cgroup_subsys_state *pos_css,
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
3019 struct cgroup *cgrp = parent_css->cgroup; 2937 struct cgroup *cgrp = parent_css->cgroup;
3020 struct cgroup *next; 2938 struct cgroup *next;
3021 2939
3022 WARN_ON_ONCE(!rcu_read_lock_held()); 2940 cgroup_assert_mutex_or_rcu_locked();
3023 2941
3024 /* 2942 /*
3025 * @pos could already have been removed. Once a cgroup is removed, 2943 * @pos could already have been removed. Once a cgroup is removed,
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
3066 * to visit for pre-order traversal of @root's descendants. @root is 2984 * to visit for pre-order traversal of @root's descendants. @root is
3067 * included in the iteration and the first node to be visited. 2985 * included in the iteration and the first node to be visited.
3068 * 2986 *
3069 * While this function requires RCU read locking, it doesn't require the 2987 * While this function requires cgroup_mutex or RCU read locking, it
3070 * whole traversal to be contained in a single RCU critical section. This 2988 * doesn't require the whole traversal to be contained in a single critical
3071 * function will return the correct next descendant as long as both @pos 2989 * section. This function will return the correct next descendant as long
3072 * and @root are accessible and @pos is a descendant of @root. 2990 * as both @pos and @root are accessible and @pos is a descendant of @root.
3073 */ 2991 */
3074struct cgroup_subsys_state * 2992struct cgroup_subsys_state *
3075css_next_descendant_pre(struct cgroup_subsys_state *pos, 2993css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3077{ 2995{
3078 struct cgroup_subsys_state *next; 2996 struct cgroup_subsys_state *next;
3079 2997
3080 WARN_ON_ONCE(!rcu_read_lock_held()); 2998 cgroup_assert_mutex_or_rcu_locked();
3081 2999
3082 /* if first iteration, visit @root */ 3000 /* if first iteration, visit @root */
3083 if (!pos) 3001 if (!pos)
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3108 * is returned. This can be used during pre-order traversal to skip 3026 * is returned. This can be used during pre-order traversal to skip
3109 * subtree of @pos. 3027 * subtree of @pos.
3110 * 3028 *
3111 * While this function requires RCU read locking, it doesn't require the 3029 * While this function requires cgroup_mutex or RCU read locking, it
3112 * whole traversal to be contained in a single RCU critical section. This 3030 * doesn't require the whole traversal to be contained in a single critical
3113 * function will return the correct rightmost descendant as long as @pos is 3031 * section. This function will return the correct rightmost descendant as
3114 * accessible. 3032 * long as @pos is accessible.
3115 */ 3033 */
3116struct cgroup_subsys_state * 3034struct cgroup_subsys_state *
3117css_rightmost_descendant(struct cgroup_subsys_state *pos) 3035css_rightmost_descendant(struct cgroup_subsys_state *pos)
3118{ 3036{
3119 struct cgroup_subsys_state *last, *tmp; 3037 struct cgroup_subsys_state *last, *tmp;
3120 3038
3121 WARN_ON_ONCE(!rcu_read_lock_held()); 3039 cgroup_assert_mutex_or_rcu_locked();
3122 3040
3123 do { 3041 do {
3124 last = pos; 3042 last = pos;
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
3154 * to visit for post-order traversal of @root's descendants. @root is 3072 * to visit for post-order traversal of @root's descendants. @root is
3155 * included in the iteration and the last node to be visited. 3073 * included in the iteration and the last node to be visited.
3156 * 3074 *
3157 * While this function requires RCU read locking, it doesn't require the 3075 * While this function requires cgroup_mutex or RCU read locking, it
3158 * whole traversal to be contained in a single RCU critical section. This 3076 * doesn't require the whole traversal to be contained in a single critical
3159 * function will return the correct next descendant as long as both @pos 3077 * section. This function will return the correct next descendant as long
3160 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3078 * as both @pos and @cgroup are accessible and @pos is a descendant of
3079 * @cgroup.
3161 */ 3080 */
3162struct cgroup_subsys_state * 3081struct cgroup_subsys_state *
3163css_next_descendant_post(struct cgroup_subsys_state *pos, 3082css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3165{ 3084{
3166 struct cgroup_subsys_state *next; 3085 struct cgroup_subsys_state *next;
3167 3086
3168 WARN_ON_ONCE(!rcu_read_lock_held()); 3087 cgroup_assert_mutex_or_rcu_locked();
3169 3088
3170 /* if first iteration, visit leftmost descendant which may be @root */ 3089 /* if first iteration, visit leftmost descendant which may be @root */
3171 if (!pos) 3090 if (!pos)
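/*
 * Illustrative caller (hypothetical, not from this patch): after the
 * relaxation documented above, either the RCU read lock or cgroup_mutex
 * is enough to drive the css iterators.  inspect_css() is made up.
 */
static void example_walk(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_child(pos, parent_css)
		inspect_css(pos);
	rcu_read_unlock();

	/* equally valid from a path that already holds cgroup_mutex,
	 * with no RCU read-side section at all */
}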
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
3504 pid_t *list; 3423 pid_t *list;
3505 /* how many elements the above list has */ 3424 /* how many elements the above list has */
3506 int length; 3425 int length;
3507 /* how many files are using the current array */
3508 int use_count;
3509 /* each of these stored in a list by its cgroup */ 3426 /* each of these stored in a list by its cgroup */
3510 struct list_head links; 3427 struct list_head links;
3511 /* pointer to the cgroup we belong to, for list removal purposes */ 3428 /* pointer to the cgroup we belong to, for list removal purposes */
3512 struct cgroup *owner; 3429 struct cgroup *owner;
3513 /* protects the other fields */ 3430 /* for delayed destruction */
3514 struct rw_semaphore rwsem; 3431 struct delayed_work destroy_dwork;
3515}; 3432};
3516 3433
3517/* 3434/*
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
3527 else 3444 else
3528 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3445 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3529} 3446}
3447
3530static void pidlist_free(void *p) 3448static void pidlist_free(void *p)
3531{ 3449{
3532 if (is_vmalloc_addr(p)) 3450 if (is_vmalloc_addr(p))
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p)
3536} 3454}
3537 3455
3538/* 3456/*
 3457 * Used to destroy all pidlists lingering around, waiting for the destroy timer. None
3458 * should be left afterwards.
3459 */
3460static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3461{
3462 struct cgroup_pidlist *l, *tmp_l;
3463
3464 mutex_lock(&cgrp->pidlist_mutex);
3465 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3466 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3467 mutex_unlock(&cgrp->pidlist_mutex);
3468
3469 flush_workqueue(cgroup_pidlist_destroy_wq);
3470 BUG_ON(!list_empty(&cgrp->pidlists));
3471}
3472
3473static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3474{
3475 struct delayed_work *dwork = to_delayed_work(work);
3476 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3477 destroy_dwork);
3478 struct cgroup_pidlist *tofree = NULL;
3479
3480 mutex_lock(&l->owner->pidlist_mutex);
3481
3482 /*
3483 * Destroy iff we didn't get queued again. The state won't change
3484 * as destroy_dwork can only be queued while locked.
3485 */
3486 if (!delayed_work_pending(dwork)) {
3487 list_del(&l->links);
3488 pidlist_free(l->list);
3489 put_pid_ns(l->key.ns);
3490 tofree = l;
3491 }
3492
3493 mutex_unlock(&l->owner->pidlist_mutex);
3494 kfree(tofree);
3495}
3496
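/*
 * Illustrative timeline (not from this patch) of the refcount-free
 * lifecycle that destroy_dwork enables:
 *
 *   reader A  start() -> no cached list; pidlist_array_load() builds one
 *             stop()  -> mod_delayed_work(wq, &l->destroy_dwork, DELAY)
 *   reader B  start() -> cgroup_pidlist_find() returns the cached list
 *             stop()  -> mod_delayed_work() pushes destruction back again
 *   (DELAY elapses with no further readers)
 *   work fn   delayed_work_pending() is false under pidlist_mutex, so the
 *             list is unlinked, freed and its pid namespace ref dropped
 */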
3497/*
3539 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3498 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3540 * Returns the number of unique elements. 3499 * Returns the number of unique elements.
3541 */ 3500 */
@@ -3565,52 +3524,92 @@ after:
3565 return dest; 3524 return dest;
3566} 3525}
3567 3526
3527/*
 3528 * The two pid files - tasks and cgroup.procs - guaranteed that the result
3529 * is sorted, which forced this whole pidlist fiasco. As pid order is
3530 * different per namespace, each namespace needs differently sorted list,
3531 * making it impossible to use, for example, single rbtree of member tasks
3532 * sorted by task pointer. As pidlists can be fairly large, allocating one
3533 * per open file is dangerous, so cgroup had to implement shared pool of
3534 * pidlists keyed by cgroup and namespace.
3535 *
3536 * All this extra complexity was caused by the original implementation
3537 * committing to an entirely unnecessary property. In the long term, we
3538 * want to do away with it. Explicitly scramble sort order if
3539 * sane_behavior so that no such expectation exists in the new interface.
3540 *
3541 * Scrambling is done by swapping every two consecutive bits, which is
3542 * non-identity one-to-one mapping which disturbs sort order sufficiently.
3543 */
3544static pid_t pid_fry(pid_t pid)
3545{
3546 unsigned a = pid & 0x55555555;
3547 unsigned b = pid & 0xAAAAAAAA;
3548
3549 return (a << 1) | (b >> 1);
3550}
3551
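/*
 * Worked example (not in the patch): pid_fry() swaps every even/odd bit
 * pair, e.g.
 *
 *   pid  5 = 0b0101  ->  0b1010 = 10
 *   pid  6 = 0b0110  ->  0b1001 =  9
 *   pid 10 = 0b1010  ->  0b0101 =  5
 *
 * Applying it twice restores the input, so the mapping stays one-to-one
 * while thoroughly breaking numeric sort order.
 */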
3552static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3553{
3554 if (cgroup_sane_behavior(cgrp))
3555 return pid_fry(pid);
3556 else
3557 return pid;
3558}
3559
3568static int cmppid(const void *a, const void *b) 3560static int cmppid(const void *a, const void *b)
3569{ 3561{
3570 return *(pid_t *)a - *(pid_t *)b; 3562 return *(pid_t *)a - *(pid_t *)b;
3571} 3563}
3572 3564
3565static int fried_cmppid(const void *a, const void *b)
3566{
3567 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3568}
3569
3570static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3571 enum cgroup_filetype type)
3572{
3573 struct cgroup_pidlist *l;
3574 /* don't need task_nsproxy() if we're looking at ourself */
3575 struct pid_namespace *ns = task_active_pid_ns(current);
3576
3577 lockdep_assert_held(&cgrp->pidlist_mutex);
3578
3579 list_for_each_entry(l, &cgrp->pidlists, links)
3580 if (l->key.type == type && l->key.ns == ns)
3581 return l;
3582 return NULL;
3583}
3584
3573/* 3585/*
3574 * find the appropriate pidlist for our purpose (given procs vs tasks) 3586 * find the appropriate pidlist for our purpose (given procs vs tasks)
3575 * returns with the lock on that pidlist already held, and takes care 3587 * returns with the lock on that pidlist already held, and takes care
3576 * of the use count, or returns NULL with no locks held if we're out of 3588 * of the use count, or returns NULL with no locks held if we're out of
3577 * memory. 3589 * memory.
3578 */ 3590 */
3579static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3591static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3580 enum cgroup_filetype type) 3592 enum cgroup_filetype type)
3581{ 3593{
3582 struct cgroup_pidlist *l; 3594 struct cgroup_pidlist *l;
3583 /* don't need task_nsproxy() if we're looking at ourself */
3584 struct pid_namespace *ns = task_active_pid_ns(current);
3585 3595
3586 /* 3596 lockdep_assert_held(&cgrp->pidlist_mutex);
3587 * We can't drop the pidlist_mutex before taking the l->rwsem in case 3597
3588 * the last ref-holder is trying to remove l from the list at the same 3598 l = cgroup_pidlist_find(cgrp, type);
3589 * time. Holding the pidlist_mutex precludes somebody taking whichever 3599 if (l)
3590 * list we find out from under us - compare release_pid_array(). 3600 return l;
3591 */ 3601
3592 mutex_lock(&cgrp->pidlist_mutex);
3593 list_for_each_entry(l, &cgrp->pidlists, links) {
3594 if (l->key.type == type && l->key.ns == ns) {
3595 /* make sure l doesn't vanish out from under us */
3596 down_write(&l->rwsem);
3597 mutex_unlock(&cgrp->pidlist_mutex);
3598 return l;
3599 }
3600 }
3601 /* entry not found; create a new one */ 3602 /* entry not found; create a new one */
3602 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3603 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3603 if (!l) { 3604 if (!l)
3604 mutex_unlock(&cgrp->pidlist_mutex);
3605 return l; 3605 return l;
3606 } 3606
3607 init_rwsem(&l->rwsem); 3607 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3608 down_write(&l->rwsem);
3609 l->key.type = type; 3608 l->key.type = type;
3610 l->key.ns = get_pid_ns(ns); 3609 /* don't need task_nsproxy() if we're looking at ourself */
3610 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3611 l->owner = cgrp; 3611 l->owner = cgrp;
3612 list_add(&l->links, &cgrp->pidlists); 3612 list_add(&l->links, &cgrp->pidlists);
3613 mutex_unlock(&cgrp->pidlist_mutex);
3614 return l; 3613 return l;
3615} 3614}
3616 3615
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3627 struct task_struct *tsk; 3626 struct task_struct *tsk;
3628 struct cgroup_pidlist *l; 3627 struct cgroup_pidlist *l;
3629 3628
3629 lockdep_assert_held(&cgrp->pidlist_mutex);
3630
3630 /* 3631 /*
3631 * If cgroup gets more users after we read count, we won't have 3632 * If cgroup gets more users after we read count, we won't have
3632 * enough space - tough. This race is indistinguishable to the 3633 * enough space - tough. This race is indistinguishable to the
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3653 css_task_iter_end(&it); 3654 css_task_iter_end(&it);
3654 length = n; 3655 length = n;
3655 /* now sort & (if procs) strip out duplicates */ 3656 /* now sort & (if procs) strip out duplicates */
3656 sort(array, length, sizeof(pid_t), cmppid, NULL); 3657 if (cgroup_sane_behavior(cgrp))
3658 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3659 else
3660 sort(array, length, sizeof(pid_t), cmppid, NULL);
3657 if (type == CGROUP_FILE_PROCS) 3661 if (type == CGROUP_FILE_PROCS)
3658 length = pidlist_uniq(array, length); 3662 length = pidlist_uniq(array, length);
3659 l = cgroup_pidlist_find(cgrp, type); 3663
3664 l = cgroup_pidlist_find_create(cgrp, type);
3660 if (!l) { 3665 if (!l) {
3666 mutex_unlock(&cgrp->pidlist_mutex);
3661 pidlist_free(array); 3667 pidlist_free(array);
3662 return -ENOMEM; 3668 return -ENOMEM;
3663 } 3669 }
3664 /* store array, freeing old if necessary - lock already held */ 3670
3671 /* store array, freeing old if necessary */
3665 pidlist_free(l->list); 3672 pidlist_free(l->list);
3666 l->list = array; 3673 l->list = array;
3667 l->length = length; 3674 l->length = length;
3668 l->use_count++;
3669 up_write(&l->rwsem);
3670 *lp = l; 3675 *lp = l;
3671 return 0; 3676 return 0;
3672} 3677}
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3740 * after a seek to the start). Use a binary-search to find the 3745 * after a seek to the start). Use a binary-search to find the
3741 * next pid to display, if any 3746 * next pid to display, if any
3742 */ 3747 */
3743 struct cgroup_pidlist *l = s->private; 3748 struct cgroup_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private;
3744 int index = 0, pid = *pos; 3752 int index = 0, pid = *pos;
3745 int *iter; 3753 int *iter, ret;
3754
3755 mutex_lock(&cgrp->pidlist_mutex);
3756
3757 /*
3758 * !NULL @of->priv indicates that this isn't the first start()
3759 * after open. If the matching pidlist is around, we can use that.
3760 * Look for it. Note that @of->priv can't be used directly. It
3761 * could already have been destroyed.
3762 */
3763 if (of->priv)
3764 of->priv = cgroup_pidlist_find(cgrp, type);
3765
3766 /*
3767 * Either this is the first start() after open or the matching
 3768 * pidlist has been destroyed in between. Create a new one.
3769 */
3770 if (!of->priv) {
3771 ret = pidlist_array_load(cgrp, type,
3772 (struct cgroup_pidlist **)&of->priv);
3773 if (ret)
3774 return ERR_PTR(ret);
3775 }
3776 l = of->priv;
3746 3777
3747 down_read(&l->rwsem);
3748 if (pid) { 3778 if (pid) {
3749 int end = l->length; 3779 int end = l->length;
3750 3780
3751 while (index < end) { 3781 while (index < end) {
3752 int mid = (index + end) / 2; 3782 int mid = (index + end) / 2;
3753 if (l->list[mid] == pid) { 3783 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3754 index = mid; 3784 index = mid;
3755 break; 3785 break;
3756 } else if (l->list[mid] <= pid) 3786 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3757 index = mid + 1; 3787 index = mid + 1;
3758 else 3788 else
3759 end = mid; 3789 end = mid;
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3764 return NULL; 3794 return NULL;
3765 /* Update the abstract position to be the actual pid that we found */ 3795 /* Update the abstract position to be the actual pid that we found */
3766 iter = l->list + index; 3796 iter = l->list + index;
3767 *pos = *iter; 3797 *pos = cgroup_pid_fry(cgrp, *iter);
3768 return iter; 3798 return iter;
3769} 3799}
3770 3800
3771static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3801static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3772{ 3802{
3773 struct cgroup_pidlist *l = s->private; 3803 struct cgroup_open_file *of = s->private;
3774 up_read(&l->rwsem); 3804 struct cgroup_pidlist *l = of->priv;
3805
3806 if (l)
3807 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3808 CGROUP_PIDLIST_DESTROY_DELAY);
3809 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3775} 3810}
3776 3811
3777static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3778{ 3813{
3779 struct cgroup_pidlist *l = s->private; 3814 struct cgroup_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv;
3780 pid_t *p = v; 3816 pid_t *p = v;
3781 pid_t *end = l->list + l->length; 3817 pid_t *end = l->list + l->length;
3782 /* 3818 /*
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3787 if (p >= end) { 3823 if (p >= end) {
3788 return NULL; 3824 return NULL;
3789 } else { 3825 } else {
3790 *pos = *p; 3826 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3791 return p; 3827 return p;
3792 } 3828 }
3793} 3829}
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
3808 .show = cgroup_pidlist_show, 3844 .show = cgroup_pidlist_show,
3809}; 3845};
3810 3846
3811static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3812{
3813 /*
3814 * the case where we're the last user of this particular pidlist will
3815 * have us remove it from the cgroup's list, which entails taking the
3816 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3817 * pidlist_mutex, we have to take pidlist_mutex first.
3818 */
3819 mutex_lock(&l->owner->pidlist_mutex);
3820 down_write(&l->rwsem);
3821 BUG_ON(!l->use_count);
3822 if (!--l->use_count) {
3823 /* we're the last user if refcount is 0; remove and free */
3824 list_del(&l->links);
3825 mutex_unlock(&l->owner->pidlist_mutex);
3826 pidlist_free(l->list);
3827 put_pid_ns(l->key.ns);
3828 up_write(&l->rwsem);
3829 kfree(l);
3830 return;
3831 }
3832 mutex_unlock(&l->owner->pidlist_mutex);
3833 up_write(&l->rwsem);
3834}
3835
3836static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3837{
3838 struct cgroup_pidlist *l;
3839 if (!(file->f_mode & FMODE_READ))
3840 return 0;
3841 /*
3842 * the seq_file will only be initialized if the file was opened for
3843 * reading; hence we check if it's not null only in that case.
3844 */
3845 l = ((struct seq_file *)file->private_data)->private;
3846 cgroup_release_pid_array(l);
3847 return seq_release(inode, file);
3848}
3849
3850static const struct file_operations cgroup_pidlist_operations = {
3851 .read = seq_read,
3852 .llseek = seq_lseek,
3853 .write = cgroup_file_write,
3854 .release = cgroup_pidlist_release,
3855};
3856
3857/*
3858 * The following functions handle opens on a file that displays a pidlist
3859 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3860 * in the cgroup.
3861 */
3862/* helper function for the two below it */
3863static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3864{
3865 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3866 struct cgroup_pidlist *l;
3867 int retval;
3868
3869 /* Nothing to do for write-only files */
3870 if (!(file->f_mode & FMODE_READ))
3871 return 0;
3872
3873 /* have the array populated */
3874 retval = pidlist_array_load(cgrp, type, &l);
3875 if (retval)
3876 return retval;
3877 /* configure file information */
3878 file->f_op = &cgroup_pidlist_operations;
3879
3880 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3881 if (retval) {
3882 cgroup_release_pid_array(l);
3883 return retval;
3884 }
3885 ((struct seq_file *)file->private_data)->private = l;
3886 return 0;
3887}
3888static int cgroup_tasks_open(struct inode *unused, struct file *file)
3889{
3890 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3891}
3892static int cgroup_procs_open(struct inode *unused, struct file *file)
3893{
3894 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3895}
3896
3897static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3847static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3898 struct cftype *cft) 3848 struct cftype *cft)
3899{ 3849{
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3928 deactivate_super(sb); 3878 deactivate_super(sb);
3929} 3879}
3930 3880
3931/*
3932 * Unregister event and free resources.
3933 *
3934 * Gets called from workqueue.
3935 */
3936static void cgroup_event_remove(struct work_struct *work)
3937{
3938 struct cgroup_event *event = container_of(work, struct cgroup_event,
3939 remove);
3940 struct cgroup_subsys_state *css = event->css;
3941
3942 remove_wait_queue(event->wqh, &event->wait);
3943
3944 event->cft->unregister_event(css, event->cft, event->eventfd);
3945
3946 /* Notify userspace the event is going away. */
3947 eventfd_signal(event->eventfd, 1);
3948
3949 eventfd_ctx_put(event->eventfd);
3950 kfree(event);
3951 css_put(css);
3952}
3953
3954/*
3955 * Gets called on POLLHUP on eventfd when user closes it.
3956 *
3957 * Called with wqh->lock held and interrupts disabled.
3958 */
3959static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3960 int sync, void *key)
3961{
3962 struct cgroup_event *event = container_of(wait,
3963 struct cgroup_event, wait);
3964 struct cgroup *cgrp = event->css->cgroup;
3965 unsigned long flags = (unsigned long)key;
3966
3967 if (flags & POLLHUP) {
3968 /*
3969 * If the event has been detached at cgroup removal, we
3970 * can simply return knowing the other side will cleanup
3971 * for us.
3972 *
3973 * We can't race against event freeing since the other
3974 * side will require wqh->lock via remove_wait_queue(),
3975 * which we hold.
3976 */
3977 spin_lock(&cgrp->event_list_lock);
3978 if (!list_empty(&event->list)) {
3979 list_del_init(&event->list);
3980 /*
3981 * We are in atomic context, but cgroup_event_remove()
3982 * may sleep, so we have to call it in workqueue.
3983 */
3984 schedule_work(&event->remove);
3985 }
3986 spin_unlock(&cgrp->event_list_lock);
3987 }
3988
3989 return 0;
3990}
3991
3992static void cgroup_event_ptable_queue_proc(struct file *file,
3993 wait_queue_head_t *wqh, poll_table *pt)
3994{
3995 struct cgroup_event *event = container_of(pt,
3996 struct cgroup_event, pt);
3997
3998 event->wqh = wqh;
3999 add_wait_queue(wqh, &event->wait);
4000}
4001
4002/*
4003 * Parse input and register new cgroup event handler.
4004 *
4005 * Input must be in format '<event_fd> <control_fd> <args>'.
4006 * Interpretation of args is defined by control file implementation.
4007 */
4008static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4009 struct cftype *cft, const char *buffer)
4010{
4011 struct cgroup *cgrp = dummy_css->cgroup;
4012 struct cgroup_event *event;
4013 struct cgroup_subsys_state *cfile_css;
4014 unsigned int efd, cfd;
4015 struct fd efile;
4016 struct fd cfile;
4017 char *endp;
4018 int ret;
4019
4020 efd = simple_strtoul(buffer, &endp, 10);
4021 if (*endp != ' ')
4022 return -EINVAL;
4023 buffer = endp + 1;
4024
4025 cfd = simple_strtoul(buffer, &endp, 10);
4026 if ((*endp != ' ') && (*endp != '\0'))
4027 return -EINVAL;
4028 buffer = endp + 1;
4029
4030 event = kzalloc(sizeof(*event), GFP_KERNEL);
4031 if (!event)
4032 return -ENOMEM;
4033
4034 INIT_LIST_HEAD(&event->list);
4035 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4036 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4037 INIT_WORK(&event->remove, cgroup_event_remove);
4038
4039 efile = fdget(efd);
4040 if (!efile.file) {
4041 ret = -EBADF;
4042 goto out_kfree;
4043 }
4044
4045 event->eventfd = eventfd_ctx_fileget(efile.file);
4046 if (IS_ERR(event->eventfd)) {
4047 ret = PTR_ERR(event->eventfd);
4048 goto out_put_efile;
4049 }
4050
4051 cfile = fdget(cfd);
4052 if (!cfile.file) {
4053 ret = -EBADF;
4054 goto out_put_eventfd;
4055 }
4056
4057 /* the process need read permission on control file */
4058 /* AV: shouldn't we check that it's been opened for read instead? */
4059 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4060 if (ret < 0)
4061 goto out_put_cfile;
4062
4063 event->cft = __file_cft(cfile.file);
4064 if (IS_ERR(event->cft)) {
4065 ret = PTR_ERR(event->cft);
4066 goto out_put_cfile;
4067 }
4068
4069 if (!event->cft->ss) {
4070 ret = -EBADF;
4071 goto out_put_cfile;
4072 }
4073
4074 /*
4075 * Determine the css of @cfile, verify it belongs to the same
4076 * cgroup as cgroup.event_control, and associate @event with it.
4077 * Remaining events are automatically removed on cgroup destruction
4078 * but the removal is asynchronous, so take an extra ref.
4079 */
4080 rcu_read_lock();
4081
4082 ret = -EINVAL;
4083 event->css = cgroup_css(cgrp, event->cft->ss);
4084 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4085 if (event->css && event->css == cfile_css && css_tryget(event->css))
4086 ret = 0;
4087
4088 rcu_read_unlock();
4089 if (ret)
4090 goto out_put_cfile;
4091
4092 if (!event->cft->register_event || !event->cft->unregister_event) {
4093 ret = -EINVAL;
4094 goto out_put_css;
4095 }
4096
4097 ret = event->cft->register_event(event->css, event->cft,
4098 event->eventfd, buffer);
4099 if (ret)
4100 goto out_put_css;
4101
4102 efile.file->f_op->poll(efile.file, &event->pt);
4103
4104 spin_lock(&cgrp->event_list_lock);
4105 list_add(&event->list, &cgrp->event_list);
4106 spin_unlock(&cgrp->event_list_lock);
4107
4108 fdput(cfile);
4109 fdput(efile);
4110
4111 return 0;
4112
4113out_put_css:
4114 css_put(event->css);
4115out_put_cfile:
4116 fdput(cfile);
4117out_put_eventfd:
4118 eventfd_ctx_put(event->eventfd);
4119out_put_efile:
4120 fdput(efile);
4121out_kfree:
4122 kfree(event);
4123
4124 return ret;
4125}
4126
4127static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4128 struct cftype *cft) 3882 struct cftype *cft)
4129{ 3883{
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4143static struct cftype cgroup_base_files[] = { 3897static struct cftype cgroup_base_files[] = {
4144 { 3898 {
4145 .name = "cgroup.procs", 3899 .name = "cgroup.procs",
4146 .open = cgroup_procs_open, 3900 .seq_start = cgroup_pidlist_start,
3901 .seq_next = cgroup_pidlist_next,
3902 .seq_stop = cgroup_pidlist_stop,
3903 .seq_show = cgroup_pidlist_show,
3904 .private = CGROUP_FILE_PROCS,
4147 .write_u64 = cgroup_procs_write, 3905 .write_u64 = cgroup_procs_write,
4148 .release = cgroup_pidlist_release,
4149 .mode = S_IRUGO | S_IWUSR, 3906 .mode = S_IRUGO | S_IWUSR,
4150 }, 3907 },
4151 { 3908 {
4152 .name = "cgroup.event_control",
4153 .write_string = cgroup_write_event_control,
4154 .mode = S_IWUGO,
4155 },
4156 {
4157 .name = "cgroup.clone_children", 3909 .name = "cgroup.clone_children",
4158 .flags = CFTYPE_INSANE, 3910 .flags = CFTYPE_INSANE,
4159 .read_u64 = cgroup_clone_children_read, 3911 .read_u64 = cgroup_clone_children_read,
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
4162 { 3914 {
4163 .name = "cgroup.sane_behavior", 3915 .name = "cgroup.sane_behavior",
4164 .flags = CFTYPE_ONLY_ON_ROOT, 3916 .flags = CFTYPE_ONLY_ON_ROOT,
4165 .read_seq_string = cgroup_sane_behavior_show, 3917 .seq_show = cgroup_sane_behavior_show,
4166 }, 3918 },
4167 3919
4168 /* 3920 /*
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
4173 { 3925 {
4174 .name = "tasks", 3926 .name = "tasks",
4175 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3927 .flags = CFTYPE_INSANE, /* use "procs" instead */
4176 .open = cgroup_tasks_open, 3928 .seq_start = cgroup_pidlist_start,
3929 .seq_next = cgroup_pidlist_next,
3930 .seq_stop = cgroup_pidlist_stop,
3931 .seq_show = cgroup_pidlist_show,
3932 .private = CGROUP_FILE_TASKS,
4177 .write_u64 = cgroup_tasks_write, 3933 .write_u64 = cgroup_tasks_write,
4178 .release = cgroup_pidlist_release,
4179 .mode = S_IRUGO | S_IWUSR, 3934 .mode = S_IRUGO | S_IWUSR,
4180 }, 3935 },
4181 { 3936 {
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
4187 { 3942 {
4188 .name = "release_agent", 3943 .name = "release_agent",
4189 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4190 .read_seq_string = cgroup_release_agent_show, 3945 .seq_show = cgroup_release_agent_show,
4191 .write_string = cgroup_release_agent_write, 3946 .write_string = cgroup_release_agent_write,
4192 .max_write_len = PATH_MAX, 3947 .max_write_len = PATH_MAX,
4193 }, 3948 },
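/*
 * Illustrative conversion (hypothetical controller, not from this patch):
 * a structured read handler is now just a seq_show callback, with the css
 * recovered from the seq_file itself.  css_to_example() and ->nr_events
 * are made up.
 */
static int example_stat_show(struct seq_file *seq, void *v)
{
	struct cgroup_subsys_state *css = seq_css(seq);

	seq_printf(seq, "events %llu\n",
		   (unsigned long long)css_to_example(css)->nr_events);
	return 0;
}
/* wired up exactly like "release_agent" above: .seq_show = example_stat_show */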
@@ -4333,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css)
4333 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4334} 4089}
4335 4090
4091/**
4092 * create_css - create a cgroup_subsys_state
4093 * @cgrp: the cgroup new css will be associated with
4094 * @ss: the subsys of new css
4095 *
4096 * Create a new css associated with @cgrp - @ss pair. On success, the new
4097 * css is online and installed in @cgrp with all interface files created.
4098 * Returns 0 on success, -errno on failure.
4099 */
4100static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4101{
4102 struct cgroup *parent = cgrp->parent;
4103 struct cgroup_subsys_state *css;
4104 int err;
4105
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex);
4108
4109 css = ss->css_alloc(cgroup_css(parent, ss));
4110 if (IS_ERR(css))
4111 return PTR_ERR(css);
4112
4113 err = percpu_ref_init(&css->refcnt, css_release);
4114 if (err)
4115 goto err_free;
4116
4117 init_css(css, ss, cgrp);
4118
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
4120 if (err)
4121 goto err_free;
4122
4123 err = online_css(css);
4124 if (err)
4125 goto err_free;
4126
4127 dget(cgrp->dentry);
4128 css_get(css->parent);
4129
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4133 current->comm, current->pid, ss->name);
4134 if (!strcmp(ss->name, "memory"))
4135 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4136 ss->warned_broken_hierarchy = true;
4137 }
4138
4139 return 0;
4140
4141err_free:
4142 percpu_ref_cancel_init(&css->refcnt);
4143 ss->css_free(css);
4144 return err;
4145}
4146
4336/* 4147/*
4337 * cgroup_create - create a cgroup 4148 * cgroup_create - create a cgroup
4338 * @parent: cgroup that will be parent of the new cgroup 4149 * @parent: cgroup that will be parent of the new cgroup
@@ -4344,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css)
4344static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4155static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4345 umode_t mode) 4156 umode_t mode)
4346{ 4157{
4347 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4348 struct cgroup *cgrp; 4158 struct cgroup *cgrp;
4349 struct cgroup_name *name; 4159 struct cgroup_name *name;
4350 struct cgroupfs_root *root = parent->root; 4160 struct cgroupfs_root *root = parent->root;
4351 int err = 0; 4161 int ssid, err = 0;
4352 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
4353 struct super_block *sb = root->sb; 4163 struct super_block *sb = root->sb;
4354 4164
@@ -4404,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4404 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4214 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4405 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4215 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4406 4216
4407 for_each_root_subsys(root, ss) {
4408 struct cgroup_subsys_state *css;
4409
4410 css = ss->css_alloc(cgroup_css(parent, ss));
4411 if (IS_ERR(css)) {
4412 err = PTR_ERR(css);
4413 goto err_free_all;
4414 }
4415 css_ar[ss->subsys_id] = css;
4416
4417 err = percpu_ref_init(&css->refcnt, css_release);
4418 if (err)
4419 goto err_free_all;
4420
4421 init_css(css, ss, cgrp);
4422 }
4423
4424 /* 4217 /*
4425 * Create directory. cgroup_create_file() returns with the new 4218 * Create directory. cgroup_create_file() returns with the new
4426 * directory locked on success so that it can be populated without 4219 * directory locked on success so that it can be populated without
@@ -4428,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4428 */ 4221 */
4429 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4222 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4430 if (err < 0) 4223 if (err < 0)
4431 goto err_free_all; 4224 goto err_unlock;
4432 lockdep_assert_held(&dentry->d_inode->i_mutex); 4225 lockdep_assert_held(&dentry->d_inode->i_mutex);
4433 4226
4434 cgrp->serial_nr = cgroup_serial_nr_next++; 4227 cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4440,55 +4233,31 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4440 /* hold a ref to the parent's dentry */ 4233 /* hold a ref to the parent's dentry */
4441 dget(parent->dentry); 4234 dget(parent->dentry);
4442 4235
4443 /* creation succeeded, notify subsystems */ 4236 /*
4444 for_each_root_subsys(root, ss) { 4237 * @cgrp is now fully operational. If something fails after this
4445 struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4238 * point, it'll be released via the normal destruction path.
4446 4239 */
4447 err = online_css(css);
4448 if (err)
4449 goto err_destroy;
4450
4451 /* each css holds a ref to the cgroup's dentry and parent css */
4452 dget(dentry);
4453 css_get(css->parent);
4454
4455 /* mark it consumed for error path */
4456 css_ar[ss->subsys_id] = NULL;
4457
4458 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4459 parent->parent) {
4460 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4461 current->comm, current->pid, ss->name);
4462 if (!strcmp(ss->name, "memory"))
4463 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4464 ss->warned_broken_hierarchy = true;
4465 }
4466 }
4467
4468 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4240 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4469 4241
4470 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4242 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4471 if (err) 4243 if (err)
4472 goto err_destroy; 4244 goto err_destroy;
4473 4245
4474 err = cgroup_populate_dir(cgrp, root->subsys_mask); 4246 /* let's create and online css's */
4475 if (err) 4247 for_each_subsys(ss, ssid) {
4476 goto err_destroy; 4248 if (root->subsys_mask & (1 << ssid)) {
4249 err = create_css(cgrp, ss);
4250 if (err)
4251 goto err_destroy;
4252 }
4253 }
4477 4254
4478 mutex_unlock(&cgroup_mutex); 4255 mutex_unlock(&cgroup_mutex);
4479 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4256 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4480 4257
4481 return 0; 4258 return 0;
4482 4259
4483err_free_all: 4260err_unlock:
4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4487 if (css) {
4488 percpu_ref_cancel_init(&css->refcnt);
4489 ss->css_free(css);
4490 }
4491 }
4492 mutex_unlock(&cgroup_mutex); 4261 mutex_unlock(&cgroup_mutex);
4493 /* Release the reference count that we took on the superblock */ 4262 /* Release the reference count that we took on the superblock */
4494 deactivate_super(sb); 4263 deactivate_super(sb);
@@ -4501,14 +4270,6 @@ err_free_cgrp:
4501 return err; 4270 return err;
4502 4271
4503err_destroy: 4272err_destroy:
4504 for_each_root_subsys(root, ss) {
4505 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4506
4507 if (css) {
4508 percpu_ref_cancel_init(&css->refcnt);
4509 ss->css_free(css);
4510 }
4511 }
4512 cgroup_destroy_locked(cgrp); 4273 cgroup_destroy_locked(cgrp);
4513 mutex_unlock(&cgroup_mutex); 4274 mutex_unlock(&cgroup_mutex);
4514 mutex_unlock(&dentry->d_inode->i_mutex); 4275 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4631,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4631 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4392 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4632{ 4393{
4633 struct dentry *d = cgrp->dentry; 4394 struct dentry *d = cgrp->dentry;
4634 struct cgroup_event *event, *tmp; 4395 struct cgroup_subsys_state *css;
4635 struct cgroup_subsys *ss;
4636 struct cgroup *child; 4396 struct cgroup *child;
4637 bool empty; 4397 bool empty;
4398 int ssid;
4638 4399
4639 lockdep_assert_held(&d->d_inode->i_mutex); 4400 lockdep_assert_held(&d->d_inode->i_mutex);
4640 lockdep_assert_held(&cgroup_mutex); 4401 lockdep_assert_held(&cgroup_mutex);
@@ -4670,12 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4670 * will be invoked to perform the rest of destruction once the 4431 * will be invoked to perform the rest of destruction once the
4671 * percpu refs of all css's are confirmed to be killed. 4432 * percpu refs of all css's are confirmed to be killed.
4672 */ 4433 */
4673 for_each_root_subsys(cgrp->root, ss) { 4434 for_each_css(css, ssid, cgrp)
4674 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 4435 kill_css(css);
4675
4676 if (css)
4677 kill_css(css);
4678 }
4679 4436
4680 /* 4437 /*
4681 * Mark @cgrp dead. This prevents further task migration and child 4438 * Mark @cgrp dead. This prevents further task migration and child
@@ -4710,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4710 dget(d); 4467 dget(d);
4711 cgroup_d_remove_dir(d); 4468 cgroup_d_remove_dir(d);
4712 4469
4713 /*
4714 * Unregister events and notify userspace.
4715 * Notify userspace about cgroup removing only after rmdir of cgroup
4716 * directory to avoid race between userspace and kernelspace.
4717 */
4718 spin_lock(&cgrp->event_list_lock);
4719 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4720 list_del_init(&event->list);
4721 schedule_work(&event->remove);
4722 }
4723 spin_unlock(&cgrp->event_list_lock);
4724
4725 return 0; 4470 return 0;
4726}; 4471};
4727 4472
@@ -4792,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4792 cgroup_init_cftsets(ss); 4537 cgroup_init_cftsets(ss);
4793 4538
4794 /* Create the top cgroup state for this subsystem */ 4539 /* Create the top cgroup state for this subsystem */
4795 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4796 ss->root = &cgroup_dummy_root; 4540 ss->root = &cgroup_dummy_root;
4797 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4541 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4798 /* We don't handle early failures gracefully */ 4542 /* We don't handle early failures gracefully */
@@ -4866,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4866 cgroup_init_cftsets(ss); 4610 cgroup_init_cftsets(ss);
4867 4611
4868 mutex_lock(&cgroup_mutex); 4612 mutex_lock(&cgroup_mutex);
4613 mutex_lock(&cgroup_root_mutex);
4869 cgroup_subsys[ss->subsys_id] = ss; 4614 cgroup_subsys[ss->subsys_id] = ss;
4870 4615
4871 /* 4616 /*
@@ -4877,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4877 if (IS_ERR(css)) { 4622 if (IS_ERR(css)) {
4878 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4623 /* failure case - need to deassign the cgroup_subsys[] slot. */
4879 cgroup_subsys[ss->subsys_id] = NULL; 4624 cgroup_subsys[ss->subsys_id] = NULL;
4625 mutex_unlock(&cgroup_root_mutex);
4880 mutex_unlock(&cgroup_mutex); 4626 mutex_unlock(&cgroup_mutex);
4881 return PTR_ERR(css); 4627 return PTR_ERR(css);
4882 } 4628 }
4883 4629
4884 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4885 ss->root = &cgroup_dummy_root; 4630 ss->root = &cgroup_dummy_root;
4886 4631
4887 /* our new subsystem will be attached to the dummy hierarchy. */ 4632 /* our new subsystem will be attached to the dummy hierarchy. */
@@ -4911,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4911 write_unlock(&css_set_lock); 4656 write_unlock(&css_set_lock);
4912 4657
4913 ret = online_css(css); 4658 ret = online_css(css);
4914 if (ret) 4659 if (ret) {
4660 ss->css_free(css);
4915 goto err_unload; 4661 goto err_unload;
4662 }
4916 4663
4917 /* success! */ 4664 /* success! */
4665 mutex_unlock(&cgroup_root_mutex);
4918 mutex_unlock(&cgroup_mutex); 4666 mutex_unlock(&cgroup_mutex);
4919 return 0; 4667 return 0;
4920 4668
4921err_unload: 4669err_unload:
4670 mutex_unlock(&cgroup_root_mutex);
4922 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4923 /* @ss can't be mounted here as try_module_get() would fail */ 4672 /* @ss can't be mounted here as try_module_get() would fail */
4924 cgroup_unload_subsys(ss); 4673 cgroup_unload_subsys(ss);
@@ -4937,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4937void cgroup_unload_subsys(struct cgroup_subsys *ss) 4686void cgroup_unload_subsys(struct cgroup_subsys *ss)
4938{ 4687{
4939 struct cgrp_cset_link *link; 4688 struct cgrp_cset_link *link;
4689 struct cgroup_subsys_state *css;
4940 4690
4941 BUG_ON(ss->module == NULL); 4691 BUG_ON(ss->module == NULL);
4942 4692
@@ -4948,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4948 BUG_ON(ss->root != &cgroup_dummy_root); 4698 BUG_ON(ss->root != &cgroup_dummy_root);
4949 4699
4950 mutex_lock(&cgroup_mutex); 4700 mutex_lock(&cgroup_mutex);
4701 mutex_lock(&cgroup_root_mutex);
4951 4702
4952 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4703 css = cgroup_css(cgroup_dummy_top, ss);
4704 if (css)
4705 offline_css(css);
4953 4706
4954 /* deassign the subsys_id */ 4707 /* deassign the subsys_id */
4955 cgroup_subsys[ss->subsys_id] = NULL; 4708 cgroup_subsys[ss->subsys_id] = NULL;
4956 4709
4957 /* remove subsystem from the dummy root's list of subsystems */
4958 list_del_init(&ss->sibling);
4959
4960 /* 4710 /*
4961 * disentangle the css from all css_sets attached to the dummy 4711 * disentangle the css from all css_sets attached to the dummy
4962 * top. as in loading, we need to pay our respects to the hashtable 4712 * top. as in loading, we need to pay our respects to the hashtable
@@ -4979,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4979 * need to free before marking as null because ss->css_free needs 4729 * need to free before marking as null because ss->css_free needs
4980 * the cgrp->subsys pointer to find their state. 4730 * the cgrp->subsys pointer to find their state.
4981 */ 4731 */
4982 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4732 if (css)
4733 ss->css_free(css);
4983 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4734 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4984 4735
4736 mutex_unlock(&cgroup_root_mutex);
4985 mutex_unlock(&cgroup_mutex); 4737 mutex_unlock(&cgroup_mutex);
4986} 4738}
4987EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4739EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
@@ -5100,6 +4852,15 @@ static int __init cgroup_wq_init(void)
5100 */ 4852 */
5101 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4853 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5102 BUG_ON(!cgroup_destroy_wq); 4854 BUG_ON(!cgroup_destroy_wq);
4855
4856 /*
4857 * Used to destroy pidlists and separate to serve as flush domain.
4858 * Cap @max_active to 1 too.
4859 */
4860 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4861 0, 1);
4862 BUG_ON(!cgroup_pidlist_destroy_wq);
4863
5103 return 0; 4864 return 0;
5104} 4865}
5105core_initcall(cgroup_wq_init); 4866core_initcall(cgroup_wq_init);
@@ -5143,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5143 for_each_active_root(root) { 4904 for_each_active_root(root) {
5144 struct cgroup_subsys *ss; 4905 struct cgroup_subsys *ss;
5145 struct cgroup *cgrp; 4906 struct cgroup *cgrp;
5146 int count = 0; 4907 int ssid, count = 0;
5147 4908
5148 seq_printf(m, "%d:", root->hierarchy_id); 4909 seq_printf(m, "%d:", root->hierarchy_id);
5149 for_each_root_subsys(root, ss) 4910 for_each_subsys(ss, ssid)
5150 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4911 if (root->subsys_mask & (1 << ssid))
4912 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5151 if (strlen(root->name)) 4913 if (strlen(root->name))
5152 seq_printf(m, "%sname=%s", count ? "," : "", 4914 seq_printf(m, "%sname=%s", count ? "," : "",
5153 root->name); 4915 root->name);
@@ -5488,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable);
5488 * @dentry: directory dentry of interest 5250 * @dentry: directory dentry of interest
5489 * @ss: subsystem of interest 5251 * @ss: subsystem of interest
5490 * 5252 *
5491 * Must be called under RCU read lock. The caller is responsible for 5253 * Must be called under cgroup_mutex or RCU read lock. The caller is
5492 * pinning the returned css if it needs to be accessed outside the RCU 5254 * responsible for pinning the returned css if it needs to be accessed
5493 * critical section. 5255 * outside the critical section.
5494 */ 5256 */
5495struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 5257struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5496 struct cgroup_subsys *ss) 5258 struct cgroup_subsys *ss)
5497{ 5259{
5498 struct cgroup *cgrp; 5260 struct cgroup *cgrp;
5499 5261
5500 WARN_ON_ONCE(!rcu_read_lock_held()); 5262 cgroup_assert_mutex_or_rcu_locked();
5501 5263
5502 /* is @dentry a cgroup dir? */ 5264 /* is @dentry a cgroup dir? */
5503 if (!dentry->d_inode || 5265 if (!dentry->d_inode ||
@@ -5520,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5520{ 5282{
5521 struct cgroup *cgrp; 5283 struct cgroup *cgrp;
5522 5284
5523 rcu_lockdep_assert(rcu_read_lock_held() || 5285 cgroup_assert_mutex_or_rcu_locked();
5524 lockdep_is_held(&cgroup_mutex),
5525 "css_from_id() needs proper protection");
5526 5286
5527 cgrp = idr_find(&ss->root->cgroup_idr, id); 5287 cgrp = idr_find(&ss->root->cgroup_idr, id);
5528 if (cgrp) 5288 if (cgrp)
@@ -5570,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5570 return count; 5330 return count;
5571} 5331}
5572 5332
5573static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, 5333static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5574 struct cftype *cft,
5575 struct seq_file *seq)
5576{ 5334{
5577 struct cgrp_cset_link *link; 5335 struct cgrp_cset_link *link;
5578 struct css_set *cset; 5336 struct css_set *cset;
@@ -5597,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5597} 5355}
5598 5356
5599#define MAX_TASKS_SHOWN_PER_CSS 25 5357#define MAX_TASKS_SHOWN_PER_CSS 25
5600static int cgroup_css_links_read(struct cgroup_subsys_state *css, 5358static int cgroup_css_links_read(struct seq_file *seq, void *v)
5601 struct cftype *cft, struct seq_file *seq)
5602{ 5359{
5360 struct cgroup_subsys_state *css = seq_css(seq);
5603 struct cgrp_cset_link *link; 5361 struct cgrp_cset_link *link;
5604 5362
5605 read_lock(&css_set_lock); 5363 read_lock(&css_set_lock);
@@ -5645,12 +5403,12 @@ static struct cftype debug_files[] = {
5645 5403
5646 { 5404 {
5647 .name = "current_css_set_cg_links", 5405 .name = "current_css_set_cg_links",
5648 .read_seq_string = current_css_set_cg_links_read, 5406 .seq_show = current_css_set_cg_links_read,
5649 }, 5407 },
5650 5408
5651 { 5409 {
5652 .name = "cgroup_css_links", 5410 .name = "cgroup_css_links",
5653 .read_seq_string = cgroup_css_links_read, 5411 .seq_show = cgroup_css_links_read,
5654 }, 5412 },
5655 5413
5656 { 5414 {
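
The per-controller conversions in the files below all follow the same pattern: the old ->read_seq_string(css, cft, seq) and ->read_map(css, cft, cb) callbacks are replaced by a single ->seq_show(seq, v) hook, and the handler recovers its css (and, where needed, its cftype) through the new seq_css()/seq_cft() accessors. A minimal sketch of the new-style handler; "demo", demo_state and demo_css_to_state() are hypothetical, only the cftype field and the accessors come from this series.

	#include <linux/cgroup.h>
	#include <linux/seq_file.h>

	struct demo_state {
		struct cgroup_subsys_state css;
		u64 value;
	};

	static inline struct demo_state *demo_css_to_state(struct cgroup_subsys_state *css)
	{
		return container_of(css, struct demo_state, css);
	}

	static int demo_value_show(struct seq_file *sf, void *v)
	{
		/* the handler looks up its css from the seq_file itself */
		struct demo_state *ds = demo_css_to_state(seq_css(sf));

		seq_printf(sf, "value %llu\n", (unsigned long long)ds->value);
		return 0;
	}

	static struct cftype demo_files[] = {
		{
			.name = "value",
			.seq_show = demo_value_show,	/* replaces .read_seq_string / .read_map */
		},
		{ }	/* terminate */
	};
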
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f0ff64d0ebaa..6c3154e477f6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -301,10 +301,9 @@ out_unlock:
301 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
302} 302}
303 303
304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, 304static int freezer_read(struct seq_file *m, void *v)
305 struct seq_file *m)
306{ 305{
307 struct cgroup_subsys_state *pos; 306 struct cgroup_subsys_state *css = seq_css(m), *pos;
308 307
309 rcu_read_lock(); 308 rcu_read_lock();
310 309
@@ -458,7 +457,7 @@ static struct cftype files[] = {
458 { 457 {
459 .name = "state", 458 .name = "state",
460 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
461 .read_seq_string = freezer_read, 460 .seq_show = freezer_read,
462 .write_string = freezer_write, 461 .write_string = freezer_write,
463 }, 462 },
464 { 463 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4772034b4b17..4410ac6a55f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1731,66 +1731,41 @@ out_unlock:
1731 * used, list of ranges of sequential numbers, is variable length, 1731 * used, list of ranges of sequential numbers, is variable length,
1732 * and since these maps can change value dynamically, one could read 1732 * and since these maps can change value dynamically, one could read
1733 * gibberish by doing partial reads while a list was changing. 1733 * gibberish by doing partial reads while a list was changing.
1734 * A single large read to a buffer that crosses a page boundary is
1735 * ok, because the result being copied to user land is not recomputed
1736 * across a page fault.
1737 */ 1734 */
1738 1735static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1739static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1740{ 1736{
1741 size_t count; 1737 struct cpuset *cs = css_cs(seq_css(sf));
1742 1738 cpuset_filetype_t type = seq_cft(sf)->private;
1743 mutex_lock(&callback_mutex); 1739 ssize_t count;
1744 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1740 char *buf, *s;
1745 mutex_unlock(&callback_mutex); 1741 int ret = 0;
1746 1742
1747 return count; 1743 count = seq_get_buf(sf, &buf);
1748} 1744 s = buf;
1749
1750static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1751{
1752 size_t count;
1753 1745
1754 mutex_lock(&callback_mutex); 1746 mutex_lock(&callback_mutex);
1755 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1756 mutex_unlock(&callback_mutex);
1757
1758 return count;
1759}
1760
1761static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1762 struct cftype *cft, struct file *file,
1763 char __user *buf, size_t nbytes,
1764 loff_t *ppos)
1765{
1766 struct cpuset *cs = css_cs(css);
1767 cpuset_filetype_t type = cft->private;
1768 char *page;
1769 ssize_t retval = 0;
1770 char *s;
1771
1772 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1773 return -ENOMEM;
1774
1775 s = page;
1776 1747
1777 switch (type) { 1748 switch (type) {
1778 case FILE_CPULIST: 1749 case FILE_CPULIST:
1779 s += cpuset_sprintf_cpulist(s, cs); 1750 s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1780 break; 1751 break;
1781 case FILE_MEMLIST: 1752 case FILE_MEMLIST:
1782 s += cpuset_sprintf_memlist(s, cs); 1753 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1783 break; 1754 break;
1784 default: 1755 default:
1785 retval = -EINVAL; 1756 ret = -EINVAL;
1786 goto out; 1757 goto out_unlock;
1787 } 1758 }
1788 *s++ = '\n';
1789 1759
1790 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1760 if (s < buf + count - 1) {
1791out: 1761 *s++ = '\n';
1792 free_page((unsigned long)page); 1762 seq_commit(sf, s - buf);
1793 return retval; 1763 } else {
1764 seq_commit(sf, -1);
1765 }
1766out_unlock:
1767 mutex_unlock(&callback_mutex);
1768 return ret;
1794} 1769}
1795 1770
1796static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 1771static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -1847,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1847static struct cftype files[] = { 1822static struct cftype files[] = {
1848 { 1823 {
1849 .name = "cpus", 1824 .name = "cpus",
1850 .read = cpuset_common_file_read, 1825 .seq_show = cpuset_common_seq_show,
1851 .write_string = cpuset_write_resmask, 1826 .write_string = cpuset_write_resmask,
1852 .max_write_len = (100U + 6 * NR_CPUS), 1827 .max_write_len = (100U + 6 * NR_CPUS),
1853 .private = FILE_CPULIST, 1828 .private = FILE_CPULIST,
@@ -1855,7 +1830,7 @@ static struct cftype files[] = {
1855 1830
1856 { 1831 {
1857 .name = "mems", 1832 .name = "mems",
1858 .read = cpuset_common_file_read, 1833 .seq_show = cpuset_common_seq_show,
1859 .write_string = cpuset_write_resmask, 1834 .write_string = cpuset_write_resmask,
1860 .max_write_len = (100U + 6 * MAX_NUMNODES), 1835 .max_write_len = (100U + 6 * MAX_NUMNODES),
1861 .private = FILE_MEMLIST, 1836 .private = FILE_MEMLIST,
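
cpuset is the one conversion here that formats straight into the seq_file buffer rather than going through seq_printf(): seq_get_buf() returns the unused portion of the buffer, the handler writes into it, and seq_commit() either records how many bytes were produced or, given -1, tells seq_file the buffer overflowed so the read is retried with a larger one. A condensed sketch of that idiom, with callback_mutex dropped and cpu_online_mask standing in for cs->cpus_allowed:

	#include <linux/cpumask.h>
	#include <linux/seq_file.h>

	static int demo_cpulist_show(struct seq_file *sf, void *v)
	{
		char *buf, *s;
		size_t count;

		count = seq_get_buf(sf, &buf);	/* space left in the seq_file buffer */
		s = buf;

		s += cpulist_scnprintf(s, count, cpu_online_mask);

		if (s < buf + count - 1) {
			*s++ = '\n';
			seq_commit(sf, s - buf);	/* this many bytes were produced */
		} else {
			seq_commit(sf, -1);	/* overflow: seq_file retries with a larger buffer */
		}
		return 0;
	}
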
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 36c951b7eef8..3897e09e86a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7852,15 +7852,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7852 return ret; 7852 return ret;
7853} 7853}
7854 7854
7855static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7855static int cpu_stats_show(struct seq_file *sf, void *v)
7856 struct cgroup_map_cb *cb)
7857{ 7856{
7858 struct task_group *tg = css_tg(css); 7857 struct task_group *tg = css_tg(seq_css(sf));
7859 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7858 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7860 7859
7861 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7860 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7862 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7861 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7863 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7862 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7864 7863
7865 return 0; 7864 return 0;
7866} 7865}
@@ -7914,7 +7913,7 @@ static struct cftype cpu_files[] = {
7914 }, 7913 },
7915 { 7914 {
7916 .name = "stat", 7915 .name = "stat",
7917 .read_map = cpu_stats_show, 7916 .seq_show = cpu_stats_show,
7918 }, 7917 },
7919#endif 7918#endif
7920#ifdef CONFIG_RT_GROUP_SCHED 7919#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *v)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bda8e44f6fde..d747a84e09b0 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
242 return; 242 return;
243} 243}
244 244
245static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, 245static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
246 struct cftype *cft, struct file *file, 246 struct cftype *cft)
247 char __user *buf, size_t nbytes,
248 loff_t *ppos)
249{ 247{
250 u64 val; 248 int idx, name;
251 char str[64];
252 int idx, name, len;
253 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 249 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
254 250
255 idx = MEMFILE_IDX(cft->private); 251 idx = MEMFILE_IDX(cft->private);
256 name = MEMFILE_ATTR(cft->private); 252 name = MEMFILE_ATTR(cft->private);
257 253
258 val = res_counter_read_u64(&h_cg->hugepage[idx], name); 254 return res_counter_read_u64(&h_cg->hugepage[idx], name);
259 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
260 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
261} 255}
262 256
263static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 257static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
337 cft = &h->cgroup_files[0]; 331 cft = &h->cgroup_files[0];
338 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 332 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
339 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 333 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
340 cft->read = hugetlb_cgroup_read; 334 cft->read_u64 = hugetlb_cgroup_read_u64;
341 cft->write_string = hugetlb_cgroup_write; 335 cft->write_string = hugetlb_cgroup_write;
342 336
343 /* Add the usage file */ 337 /* Add the usage file */
344 cft = &h->cgroup_files[1]; 338 cft = &h->cgroup_files[1];
345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); 339 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
346 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 340 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
347 cft->read = hugetlb_cgroup_read; 341 cft->read_u64 = hugetlb_cgroup_read_u64;
348 342
349 /* Add the MAX usage file */ 343 /* Add the MAX usage file */
350 cft = &h->cgroup_files[2]; 344 cft = &h->cgroup_files[2];
351 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
352 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 346 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
353 cft->trigger = hugetlb_cgroup_reset; 347 cft->trigger = hugetlb_cgroup_reset;
354 cft->read = hugetlb_cgroup_read; 348 cft->read_u64 = hugetlb_cgroup_read_u64;
355 349
356 /* Add the failcntfile */ 350 /* Add the failcntfile */
357 cft = &h->cgroup_files[3]; 351 cft = &h->cgroup_files[3];
358 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); 352 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
359 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 353 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
360 cft->trigger = hugetlb_cgroup_reset; 354 cft->trigger = hugetlb_cgroup_reset;
361 cft->read = hugetlb_cgroup_read; 355 cft->read_u64 = hugetlb_cgroup_read_u64;
362 356
363 /* NULL terminate the last cft */ 357 /* NULL terminate the last cft */
364 cft = &h->cgroup_files[4]; 358 cft = &h->cgroup_files[4];
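
hugetlb_cgroup (and the memcg res_counter files in the next diff) drop hand-rolled ->read() implementations that formatted the value with scnprintf() and copied it out via simple_read_from_buffer(); with ->read_u64() the handler just returns the number and cgroup core does the printing. A sketch of the resulting shape, reusing the hypothetical demo_css_to_state() helper and ->value field from the earlier sketch:

	static u64 demo_usage_read_u64(struct cgroup_subsys_state *css,
				       struct cftype *cft)
	{
		/* cft->private can still select which counter this file exposes */
		return demo_css_to_state(css)->value;
	}

	static struct cftype demo_usage_files[] = {
		{
			.name = "usage_in_bytes",
			.read_u64 = demo_usage_read_u64,	/* was .read = <custom file read> */
		},
		{ }	/* terminate */
	};
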
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f1a356153c0..7caff36180cd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
@@ -55,6 +56,7 @@
55#include <linux/cpu.h> 56#include <linux/cpu.h>
56#include <linux/oom.h> 57#include <linux/oom.h>
57#include <linux/lockdep.h> 58#include <linux/lockdep.h>
59#include <linux/file.h>
58#include "internal.h" 60#include "internal.h"
59#include <net/sock.h> 61#include <net/sock.h>
60#include <net/ip.h> 62#include <net/ip.h>
@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 229 struct eventfd_ctx *eventfd;
228}; 230};
229 231
232/*
 233 * cgroup_event represents events which userspace wants to receive.
234 */
235struct mem_cgroup_event {
236 /*
237 * memcg which the event belongs to.
238 */
239 struct mem_cgroup *memcg;
240 /*
241 * eventfd to signal userspace about the event.
242 */
243 struct eventfd_ctx *eventfd;
244 /*
245 * Each of these stored in a list by the cgroup.
246 */
247 struct list_head list;
248 /*
249 * register_event() callback will be used to add new userspace
250 * waiter for changes related to this event. Use eventfd_signal()
251 * on eventfd to send notification to userspace.
252 */
253 int (*register_event)(struct mem_cgroup *memcg,
254 struct eventfd_ctx *eventfd, const char *args);
255 /*
256 * unregister_event() callback will be called when userspace closes
 257 * the eventfd or on cgroup removal. This callback must be set
 258 * if you want to provide notification functionality.
259 */
260 void (*unregister_event)(struct mem_cgroup *memcg,
261 struct eventfd_ctx *eventfd);
262 /*
263 * All fields below needed to unregister event when
264 * userspace closes eventfd.
265 */
266 poll_table pt;
267 wait_queue_head_t *wqh;
268 wait_queue_t wait;
269 struct work_struct remove;
270};
271
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 272static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 273static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 274
@@ -331,6 +373,10 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 373 atomic_t numainfo_updating;
332#endif 374#endif
333 375
 376 /* List of events which userspace wants to receive */
377 struct list_head event_list;
378 spinlock_t event_list_lock;
379
334 struct mem_cgroup_per_node *nodeinfo[0]; 380 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 381 /* WARNING: nodeinfo must be the last member here */
336}; 382};
@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 536 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 537}
492 538
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 539static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 540{
500 return (memcg == root_mem_cgroup); 541 return (memcg == root_mem_cgroup);
@@ -2976,10 +3017,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2976} 3017}
2977 3018
2978#ifdef CONFIG_SLABINFO 3019#ifdef CONFIG_SLABINFO
2979static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3020static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2980 struct cftype *cft, struct seq_file *m)
2981{ 3021{
2982 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3022 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2983 struct memcg_cache_params *params; 3023 struct memcg_cache_params *params;
2984 3024
2985 if (!memcg_can_account_kmem(memcg)) 3025 if (!memcg_can_account_kmem(memcg))
@@ -5112,14 +5152,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5112 return val << PAGE_SHIFT; 5152 return val << PAGE_SHIFT;
5113} 5153}
5114 5154
5115static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5155static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5116 struct cftype *cft, struct file *file, 5156 struct cftype *cft)
5117 char __user *buf, size_t nbytes, loff_t *ppos)
5118{ 5157{
5119 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5158 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5120 char str[64];
5121 u64 val; 5159 u64 val;
5122 int name, len; 5160 int name;
5123 enum res_type type; 5161 enum res_type type;
5124 5162
5125 type = MEMFILE_TYPE(cft->private); 5163 type = MEMFILE_TYPE(cft->private);
@@ -5145,8 +5183,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5145 BUG(); 5183 BUG();
5146 } 5184 }
5147 5185
5148 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5186 return val;
5149 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5150} 5187}
5151 5188
5152static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) 5189static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
@@ -5383,8 +5420,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5383#endif 5420#endif
5384 5421
5385#ifdef CONFIG_NUMA 5422#ifdef CONFIG_NUMA
5386static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5423static int memcg_numa_stat_show(struct seq_file *m, void *v)
5387 struct cftype *cft, struct seq_file *m)
5388{ 5424{
5389 struct numa_stat { 5425 struct numa_stat {
5390 const char *name; 5426 const char *name;
@@ -5400,7 +5436,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5400 const struct numa_stat *stat; 5436 const struct numa_stat *stat;
5401 int nid; 5437 int nid;
5402 unsigned long nr; 5438 unsigned long nr;
5403 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5439 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5404 5440
5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5441 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5442 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5439,10 +5475,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5475 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5440} 5476}
5441 5477
5442static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5478static int memcg_stat_show(struct seq_file *m, void *v)
5443 struct seq_file *m)
5444{ 5479{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5480 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5446 struct mem_cgroup *mi; 5481 struct mem_cgroup *mi;
5447 unsigned int i; 5482 unsigned int i;
5448 5483
@@ -5651,13 +5686,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5651 mem_cgroup_oom_notify_cb(iter); 5686 mem_cgroup_oom_notify_cb(iter);
5652} 5687}
5653 5688
5654static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5689static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5655 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5690 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5656{ 5691{
5657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5658 struct mem_cgroup_thresholds *thresholds; 5692 struct mem_cgroup_thresholds *thresholds;
5659 struct mem_cgroup_threshold_ary *new; 5693 struct mem_cgroup_threshold_ary *new;
5660 enum res_type type = MEMFILE_TYPE(cft->private);
5661 u64 threshold, usage; 5694 u64 threshold, usage;
5662 int i, size, ret; 5695 int i, size, ret;
5663 5696
@@ -5734,13 +5767,23 @@ unlock:
5734 return ret; 5767 return ret;
5735} 5768}
5736 5769
5737static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5770static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5738 struct cftype *cft, struct eventfd_ctx *eventfd) 5771 struct eventfd_ctx *eventfd, const char *args)
5772{
5773 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5774}
5775
5776static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5777 struct eventfd_ctx *eventfd, const char *args)
5778{
5779 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5780}
5781
5782static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5783 struct eventfd_ctx *eventfd, enum res_type type)
5739{ 5784{
5740 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5741 struct mem_cgroup_thresholds *thresholds; 5785 struct mem_cgroup_thresholds *thresholds;
5742 struct mem_cgroup_threshold_ary *new; 5786 struct mem_cgroup_threshold_ary *new;
5743 enum res_type type = MEMFILE_TYPE(cft->private);
5744 u64 usage; 5787 u64 usage;
5745 int i, j, size; 5788 int i, j, size;
5746 5789
@@ -5813,14 +5856,23 @@ unlock:
5813 mutex_unlock(&memcg->thresholds_lock); 5856 mutex_unlock(&memcg->thresholds_lock);
5814} 5857}
5815 5858
5816static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5859static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5817 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5860 struct eventfd_ctx *eventfd)
5861{
5862 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5863}
5864
5865static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5866 struct eventfd_ctx *eventfd)
5867{
5868 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5869}
5870
5871static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5872 struct eventfd_ctx *eventfd, const char *args)
5818{ 5873{
5819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5820 struct mem_cgroup_eventfd_list *event; 5874 struct mem_cgroup_eventfd_list *event;
5821 enum res_type type = MEMFILE_TYPE(cft->private);
5822 5875
5823 BUG_ON(type != _OOM_TYPE);
5824 event = kmalloc(sizeof(*event), GFP_KERNEL); 5876 event = kmalloc(sizeof(*event), GFP_KERNEL);
5825 if (!event) 5877 if (!event)
5826 return -ENOMEM; 5878 return -ENOMEM;
@@ -5838,14 +5890,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5838 return 0; 5890 return 0;
5839} 5891}
5840 5892
5841static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5893static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5842 struct cftype *cft, struct eventfd_ctx *eventfd) 5894 struct eventfd_ctx *eventfd)
5843{ 5895{
5844 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5845 struct mem_cgroup_eventfd_list *ev, *tmp; 5896 struct mem_cgroup_eventfd_list *ev, *tmp;
5846 enum res_type type = MEMFILE_TYPE(cft->private);
5847
5848 BUG_ON(type != _OOM_TYPE);
5849 5897
5850 spin_lock(&memcg_oom_lock); 5898 spin_lock(&memcg_oom_lock);
5851 5899
@@ -5859,17 +5907,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859 spin_unlock(&memcg_oom_lock); 5907 spin_unlock(&memcg_oom_lock);
5860} 5908}
5861 5909
5862static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5910static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5863 struct cftype *cft, struct cgroup_map_cb *cb)
5864{ 5911{
5865 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5912 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5866
5867 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5868 5913
5869 if (atomic_read(&memcg->under_oom)) 5914 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5870 cb->fill(cb, "under_oom", 1); 5915 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5871 else
5872 cb->fill(cb, "under_oom", 0);
5873 return 0; 5916 return 0;
5874} 5917}
5875 5918
@@ -5962,41 +6005,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5962} 6005}
5963#endif 6006#endif
5964 6007
6008/*
6009 * DO NOT USE IN NEW FILES.
6010 *
6011 * "cgroup.event_control" implementation.
6012 *
6013 * This is way over-engineered. It tries to support fully configurable
6014 * events for each user. Such level of flexibility is completely
6015 * unnecessary especially in the light of the planned unified hierarchy.
6016 *
6017 * Please deprecate this and replace with something simpler if at all
6018 * possible.
6019 */
6020
6021/*
6022 * Unregister event and free resources.
6023 *
6024 * Gets called from workqueue.
6025 */
6026static void memcg_event_remove(struct work_struct *work)
6027{
6028 struct mem_cgroup_event *event =
6029 container_of(work, struct mem_cgroup_event, remove);
6030 struct mem_cgroup *memcg = event->memcg;
6031
6032 remove_wait_queue(event->wqh, &event->wait);
6033
6034 event->unregister_event(memcg, event->eventfd);
6035
6036 /* Notify userspace the event is going away. */
6037 eventfd_signal(event->eventfd, 1);
6038
6039 eventfd_ctx_put(event->eventfd);
6040 kfree(event);
6041 css_put(&memcg->css);
6042}
6043
6044/*
6045 * Gets called on POLLHUP on eventfd when user closes it.
6046 *
6047 * Called with wqh->lock held and interrupts disabled.
6048 */
6049static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6050 int sync, void *key)
6051{
6052 struct mem_cgroup_event *event =
6053 container_of(wait, struct mem_cgroup_event, wait);
6054 struct mem_cgroup *memcg = event->memcg;
6055 unsigned long flags = (unsigned long)key;
6056
6057 if (flags & POLLHUP) {
6058 /*
6059 * If the event has been detached at cgroup removal, we
6060 * can simply return knowing the other side will cleanup
6061 * for us.
6062 *
6063 * We can't race against event freeing since the other
6064 * side will require wqh->lock via remove_wait_queue(),
6065 * which we hold.
6066 */
6067 spin_lock(&memcg->event_list_lock);
6068 if (!list_empty(&event->list)) {
6069 list_del_init(&event->list);
6070 /*
 6071 * We are in atomic context, but memcg_event_remove()
 6072 * may sleep, so we have to call it from a workqueue.
6073 */
6074 schedule_work(&event->remove);
6075 }
6076 spin_unlock(&memcg->event_list_lock);
6077 }
6078
6079 return 0;
6080}
6081
6082static void memcg_event_ptable_queue_proc(struct file *file,
6083 wait_queue_head_t *wqh, poll_table *pt)
6084{
6085 struct mem_cgroup_event *event =
6086 container_of(pt, struct mem_cgroup_event, pt);
6087
6088 event->wqh = wqh;
6089 add_wait_queue(wqh, &event->wait);
6090}
6091
6092/*
6093 * DO NOT USE IN NEW FILES.
6094 *
6095 * Parse input and register new cgroup event handler.
6096 *
6097 * Input must be in format '<event_fd> <control_fd> <args>'.
6098 * Interpretation of args is defined by control file implementation.
6099 */
6100static int memcg_write_event_control(struct cgroup_subsys_state *css,
6101 struct cftype *cft, const char *buffer)
6102{
6103 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6104 struct mem_cgroup_event *event;
6105 struct cgroup_subsys_state *cfile_css;
6106 unsigned int efd, cfd;
6107 struct fd efile;
6108 struct fd cfile;
6109 const char *name;
6110 char *endp;
6111 int ret;
6112
6113 efd = simple_strtoul(buffer, &endp, 10);
6114 if (*endp != ' ')
6115 return -EINVAL;
6116 buffer = endp + 1;
6117
6118 cfd = simple_strtoul(buffer, &endp, 10);
6119 if ((*endp != ' ') && (*endp != '\0'))
6120 return -EINVAL;
6121 buffer = endp + 1;
6122
6123 event = kzalloc(sizeof(*event), GFP_KERNEL);
6124 if (!event)
6125 return -ENOMEM;
6126
6127 event->memcg = memcg;
6128 INIT_LIST_HEAD(&event->list);
6129 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6130 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6131 INIT_WORK(&event->remove, memcg_event_remove);
6132
6133 efile = fdget(efd);
6134 if (!efile.file) {
6135 ret = -EBADF;
6136 goto out_kfree;
6137 }
6138
6139 event->eventfd = eventfd_ctx_fileget(efile.file);
6140 if (IS_ERR(event->eventfd)) {
6141 ret = PTR_ERR(event->eventfd);
6142 goto out_put_efile;
6143 }
6144
6145 cfile = fdget(cfd);
6146 if (!cfile.file) {
6147 ret = -EBADF;
6148 goto out_put_eventfd;
6149 }
6150
 6151 /* the process needs read permission on the control file */
6152 /* AV: shouldn't we check that it's been opened for read instead? */
6153 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6154 if (ret < 0)
6155 goto out_put_cfile;
6156
6157 /*
6158 * Determine the event callbacks and set them in @event. This used
6159 * to be done via struct cftype but cgroup core no longer knows
6160 * about these events. The following is crude but the whole thing
6161 * is for compatibility anyway.
6162 *
6163 * DO NOT ADD NEW FILES.
6164 */
6165 name = cfile.file->f_dentry->d_name.name;
6166
6167 if (!strcmp(name, "memory.usage_in_bytes")) {
6168 event->register_event = mem_cgroup_usage_register_event;
6169 event->unregister_event = mem_cgroup_usage_unregister_event;
6170 } else if (!strcmp(name, "memory.oom_control")) {
6171 event->register_event = mem_cgroup_oom_register_event;
6172 event->unregister_event = mem_cgroup_oom_unregister_event;
6173 } else if (!strcmp(name, "memory.pressure_level")) {
6174 event->register_event = vmpressure_register_event;
6175 event->unregister_event = vmpressure_unregister_event;
6176 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6177 event->register_event = memsw_cgroup_usage_register_event;
6178 event->unregister_event = memsw_cgroup_usage_unregister_event;
6179 } else {
6180 ret = -EINVAL;
6181 goto out_put_cfile;
6182 }
6183
6184 /*
 6185 * Verify that @cfile belongs to @css. Also, remaining events are
6186 * automatically removed on cgroup destruction but the removal is
6187 * asynchronous, so take an extra ref on @css.
6188 */
6189 rcu_read_lock();
6190
6191 ret = -EINVAL;
6192 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6193 &mem_cgroup_subsys);
6194 if (cfile_css == css && css_tryget(css))
6195 ret = 0;
6196
6197 rcu_read_unlock();
6198 if (ret)
6199 goto out_put_cfile;
6200
6201 ret = event->register_event(memcg, event->eventfd, buffer);
6202 if (ret)
6203 goto out_put_css;
6204
6205 efile.file->f_op->poll(efile.file, &event->pt);
6206
6207 spin_lock(&memcg->event_list_lock);
6208 list_add(&event->list, &memcg->event_list);
6209 spin_unlock(&memcg->event_list_lock);
6210
6211 fdput(cfile);
6212 fdput(efile);
6213
6214 return 0;
6215
6216out_put_css:
6217 css_put(css);
6218out_put_cfile:
6219 fdput(cfile);
6220out_put_eventfd:
6221 eventfd_ctx_put(event->eventfd);
6222out_put_efile:
6223 fdput(efile);
6224out_kfree:
6225 kfree(event);
6226
6227 return ret;
6228}
6229
5965static struct cftype mem_cgroup_files[] = { 6230static struct cftype mem_cgroup_files[] = {
5966 { 6231 {
5967 .name = "usage_in_bytes", 6232 .name = "usage_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6233 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5969 .read = mem_cgroup_read, 6234 .read_u64 = mem_cgroup_read_u64,
5970 .register_event = mem_cgroup_usage_register_event,
5971 .unregister_event = mem_cgroup_usage_unregister_event,
5972 }, 6235 },
5973 { 6236 {
5974 .name = "max_usage_in_bytes", 6237 .name = "max_usage_in_bytes",
5975 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6238 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5976 .trigger = mem_cgroup_reset, 6239 .trigger = mem_cgroup_reset,
5977 .read = mem_cgroup_read, 6240 .read_u64 = mem_cgroup_read_u64,
5978 }, 6241 },
5979 { 6242 {
5980 .name = "limit_in_bytes", 6243 .name = "limit_in_bytes",
5981 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6244 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5982 .write_string = mem_cgroup_write, 6245 .write_string = mem_cgroup_write,
5983 .read = mem_cgroup_read, 6246 .read_u64 = mem_cgroup_read_u64,
5984 }, 6247 },
5985 { 6248 {
5986 .name = "soft_limit_in_bytes", 6249 .name = "soft_limit_in_bytes",
5987 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6250 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5988 .write_string = mem_cgroup_write, 6251 .write_string = mem_cgroup_write,
5989 .read = mem_cgroup_read, 6252 .read_u64 = mem_cgroup_read_u64,
5990 }, 6253 },
5991 { 6254 {
5992 .name = "failcnt", 6255 .name = "failcnt",
5993 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6256 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5994 .trigger = mem_cgroup_reset, 6257 .trigger = mem_cgroup_reset,
5995 .read = mem_cgroup_read, 6258 .read_u64 = mem_cgroup_read_u64,
5996 }, 6259 },
5997 { 6260 {
5998 .name = "stat", 6261 .name = "stat",
5999 .read_seq_string = memcg_stat_show, 6262 .seq_show = memcg_stat_show,
6000 }, 6263 },
6001 { 6264 {
6002 .name = "force_empty", 6265 .name = "force_empty",
@@ -6009,6 +6272,12 @@ static struct cftype mem_cgroup_files[] = {
6009 .read_u64 = mem_cgroup_hierarchy_read, 6272 .read_u64 = mem_cgroup_hierarchy_read,
6010 }, 6273 },
6011 { 6274 {
6275 .name = "cgroup.event_control", /* XXX: for compat */
6276 .write_string = memcg_write_event_control,
6277 .flags = CFTYPE_NO_PREFIX,
6278 .mode = S_IWUGO,
6279 },
6280 {
6012 .name = "swappiness", 6281 .name = "swappiness",
6013 .read_u64 = mem_cgroup_swappiness_read, 6282 .read_u64 = mem_cgroup_swappiness_read,
6014 .write_u64 = mem_cgroup_swappiness_write, 6283 .write_u64 = mem_cgroup_swappiness_write,
@@ -6020,21 +6289,17 @@ static struct cftype mem_cgroup_files[] = {
6020 }, 6289 },
6021 { 6290 {
6022 .name = "oom_control", 6291 .name = "oom_control",
6023 .read_map = mem_cgroup_oom_control_read, 6292 .seq_show = mem_cgroup_oom_control_read,
6024 .write_u64 = mem_cgroup_oom_control_write, 6293 .write_u64 = mem_cgroup_oom_control_write,
6025 .register_event = mem_cgroup_oom_register_event,
6026 .unregister_event = mem_cgroup_oom_unregister_event,
6027 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6294 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6028 }, 6295 },
6029 { 6296 {
6030 .name = "pressure_level", 6297 .name = "pressure_level",
6031 .register_event = vmpressure_register_event,
6032 .unregister_event = vmpressure_unregister_event,
6033 }, 6298 },
6034#ifdef CONFIG_NUMA 6299#ifdef CONFIG_NUMA
6035 { 6300 {
6036 .name = "numa_stat", 6301 .name = "numa_stat",
6037 .read_seq_string = memcg_numa_stat_show, 6302 .seq_show = memcg_numa_stat_show,
6038 }, 6303 },
6039#endif 6304#endif
6040#ifdef CONFIG_MEMCG_KMEM 6305#ifdef CONFIG_MEMCG_KMEM
@@ -6042,29 +6307,29 @@ static struct cftype mem_cgroup_files[] = {
6042 .name = "kmem.limit_in_bytes", 6307 .name = "kmem.limit_in_bytes",
6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6308 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6044 .write_string = mem_cgroup_write, 6309 .write_string = mem_cgroup_write,
6045 .read = mem_cgroup_read, 6310 .read_u64 = mem_cgroup_read_u64,
6046 }, 6311 },
6047 { 6312 {
6048 .name = "kmem.usage_in_bytes", 6313 .name = "kmem.usage_in_bytes",
6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6314 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6050 .read = mem_cgroup_read, 6315 .read_u64 = mem_cgroup_read_u64,
6051 }, 6316 },
6052 { 6317 {
6053 .name = "kmem.failcnt", 6318 .name = "kmem.failcnt",
6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6319 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6055 .trigger = mem_cgroup_reset, 6320 .trigger = mem_cgroup_reset,
6056 .read = mem_cgroup_read, 6321 .read_u64 = mem_cgroup_read_u64,
6057 }, 6322 },
6058 { 6323 {
6059 .name = "kmem.max_usage_in_bytes", 6324 .name = "kmem.max_usage_in_bytes",
6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6325 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6061 .trigger = mem_cgroup_reset, 6326 .trigger = mem_cgroup_reset,
6062 .read = mem_cgroup_read, 6327 .read_u64 = mem_cgroup_read_u64,
6063 }, 6328 },
6064#ifdef CONFIG_SLABINFO 6329#ifdef CONFIG_SLABINFO
6065 { 6330 {
6066 .name = "kmem.slabinfo", 6331 .name = "kmem.slabinfo",
6067 .read_seq_string = mem_cgroup_slabinfo_read, 6332 .seq_show = mem_cgroup_slabinfo_read,
6068 }, 6333 },
6069#endif 6334#endif
6070#endif 6335#endif
@@ -6076,27 +6341,25 @@ static struct cftype memsw_cgroup_files[] = {
6076 { 6341 {
6077 .name = "memsw.usage_in_bytes", 6342 .name = "memsw.usage_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6343 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6079 .read = mem_cgroup_read, 6344 .read_u64 = mem_cgroup_read_u64,
6080 .register_event = mem_cgroup_usage_register_event,
6081 .unregister_event = mem_cgroup_usage_unregister_event,
6082 }, 6345 },
6083 { 6346 {
6084 .name = "memsw.max_usage_in_bytes", 6347 .name = "memsw.max_usage_in_bytes",
6085 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6348 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6086 .trigger = mem_cgroup_reset, 6349 .trigger = mem_cgroup_reset,
6087 .read = mem_cgroup_read, 6350 .read_u64 = mem_cgroup_read_u64,
6088 }, 6351 },
6089 { 6352 {
6090 .name = "memsw.limit_in_bytes", 6353 .name = "memsw.limit_in_bytes",
6091 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6354 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6092 .write_string = mem_cgroup_write, 6355 .write_string = mem_cgroup_write,
6093 .read = mem_cgroup_read, 6356 .read_u64 = mem_cgroup_read_u64,
6094 }, 6357 },
6095 { 6358 {
6096 .name = "memsw.failcnt", 6359 .name = "memsw.failcnt",
6097 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6360 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6098 .trigger = mem_cgroup_reset, 6361 .trigger = mem_cgroup_reset,
6099 .read = mem_cgroup_read, 6362 .read_u64 = mem_cgroup_read_u64,
6100 }, 6363 },
6101 { }, /* terminate */ 6364 { }, /* terminate */
6102}; 6365};
@@ -6268,6 +6531,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6268 mutex_init(&memcg->thresholds_lock); 6531 mutex_init(&memcg->thresholds_lock);
6269 spin_lock_init(&memcg->move_lock); 6532 spin_lock_init(&memcg->move_lock);
6270 vmpressure_init(&memcg->vmpressure); 6533 vmpressure_init(&memcg->vmpressure);
6534 INIT_LIST_HEAD(&memcg->event_list);
6535 spin_lock_init(&memcg->event_list_lock);
6271 6536
6272 return &memcg->css; 6537 return &memcg->css;
6273 6538
@@ -6343,6 +6608,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6343static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6608static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6344{ 6609{
6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6610 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6611 struct mem_cgroup_event *event, *tmp;
6612
6613 /*
6614 * Unregister events and notify userspace.
6615 * Notify userspace about cgroup removing only after rmdir of cgroup
6616 * directory to avoid race between userspace and kernelspace.
6617 */
6618 spin_lock(&memcg->event_list_lock);
6619 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6620 list_del_init(&event->list);
6621 schedule_work(&event->remove);
6622 }
6623 spin_unlock(&memcg->event_list_lock);
6346 6624
6347 kmem_cgroup_css_offline(memcg); 6625 kmem_cgroup_css_offline(memcg);
6348 6626
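
Although the event machinery now lives entirely in memcontrol.c, the userspace contract described in memcg_write_event_control() is unchanged: write "<event_fd> <control_fd> <args>" to cgroup.event_control and then wait on the eventfd. A minimal userspace sketch against memory.pressure_level; the cgroup path and the "low" threshold are illustrative and error handling is omitted:

	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		/* illustrative memory cgroup mount point and group */
		const char *grp = "/sys/fs/cgroup/memory/mygrp";
		char path[256], line[64];
		uint64_t cnt;
		int efd, cfd, ecfd, len;

		efd = eventfd(0, 0);
		snprintf(path, sizeof(path), "%s/memory.pressure_level", grp);
		cfd = open(path, O_RDONLY);
		snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
		ecfd = open(path, O_WRONLY);

		/* "<event_fd> <control_fd> <args>"; args here is the pressure level */
		len = snprintf(line, sizeof(line), "%d %d low", efd, cfd);
		write(ecfd, line, len);

		/* blocks until the kernel signals the eventfd for a "low" pressure event */
		read(efd, &cnt, sizeof(cnt));
		printf("pressure notifications: %llu\n", (unsigned long long)cnt);
		return 0;
	}
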
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a872a..3bd0b8e6ab12 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -451,7 +451,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
451 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry 451 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
452 * @ent: swap entry to be looked up. 452 * @ent: swap entry to be looked up.
453 * 453 *
454 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) 454 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
455 */ 455 */
456unsigned short lookup_swap_cgroup_id(swp_entry_t ent) 456unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
457{ 457{
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index e0f62837c3f4..196970a4541f 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
278 278
279/** 279/**
280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
281 * @css: css that is interested in vmpressure notifications 281 * @memcg: memcg that is interested in vmpressure notifications
282 * @cft: cgroup control files handle
283 * @eventfd: eventfd context to link notifications with 282 * @eventfd: eventfd context to link notifications with
284 * @args: event arguments (used to set up a pressure level threshold) 283 * @args: event arguments (used to set up a pressure level threshold)
285 * 284 *
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 288 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
290 * "critical"). 289 * "critical").
291 * 290 *
292 * This function should not be used directly, just pass it to (struct 291 * To be used as memcg event method.
293 * cftype).register_event, and then cgroup core will handle everything by
294 * itself.
295 */ 292 */
296int vmpressure_register_event(struct cgroup_subsys_state *css, 293int vmpressure_register_event(struct mem_cgroup *memcg,
297 struct cftype *cft, struct eventfd_ctx *eventfd, 294 struct eventfd_ctx *eventfd, const char *args)
298 const char *args)
299{ 295{
300 struct vmpressure *vmpr = css_to_vmpressure(css); 296 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
301 struct vmpressure_event *ev; 297 struct vmpressure_event *ev;
302 int level; 298 int level;
303 299
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
325 321
326/** 322/**
327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 323 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
328 * @css: css handle 324 * @memcg: memcg handle
329 * @cft: cgroup control files handle
330 * @eventfd: eventfd context that was used to link vmpressure with the @cg 325 * @eventfd: eventfd context that was used to link vmpressure with the @cg
331 * 326 *
332 * This function does internal manipulations to detach the @eventfd from 327 * This function does internal manipulations to detach the @eventfd from
333 * the vmpressure notifications, and then frees internal resources 328 * the vmpressure notifications, and then frees internal resources
334 * associated with the @eventfd (but the @eventfd itself is not freed). 329 * associated with the @eventfd (but the @eventfd itself is not freed).
335 * 330 *
336 * This function should not be used directly, just pass it to (struct 331 * To be used as memcg event method.
337 * cftype).unregister_event, and then cgroup core will handle everything
338 * by itself.
339 */ 332 */
340void vmpressure_unregister_event(struct cgroup_subsys_state *css, 333void vmpressure_unregister_event(struct mem_cgroup *memcg,
341 struct cftype *cft,
342 struct eventfd_ctx *eventfd) 334 struct eventfd_ctx *eventfd)
343{ 335{
344 struct vmpressure *vmpr = css_to_vmpressure(css); 336 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
345 struct vmpressure_event *ev; 337 struct vmpressure_event *ev;
346 338
347 mutex_lock(&vmpr->events_lock); 339 mutex_lock(&vmpr->events_lock);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9b7cf6c85f82..56cbb69ba024 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -173,14 +173,14 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
173 return css->cgroup->id; 173 return css->cgroup->id;
174} 174}
175 175
176static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 176static int read_priomap(struct seq_file *sf, void *v)
177 struct cgroup_map_cb *cb)
178{ 177{
179 struct net_device *dev; 178 struct net_device *dev;
180 179
181 rcu_read_lock(); 180 rcu_read_lock();
182 for_each_netdev_rcu(&init_net, dev) 181 for_each_netdev_rcu(&init_net, dev)
183 cb->fill(cb, dev->name, netprio_prio(css, dev)); 182 seq_printf(sf, "%s %u\n", dev->name,
183 netprio_prio(seq_css(sf), dev));
184 rcu_read_unlock(); 184 rcu_read_unlock();
185 return 0; 185 return 0;
186} 186}
@@ -238,7 +238,7 @@ static struct cftype ss_files[] = {
238 }, 238 },
239 { 239 {
240 .name = "ifpriomap", 240 .name = "ifpriomap",
241 .read_map = read_priomap, 241 .seq_show = read_priomap,
242 .write_string = write_priomap, 242 .write_string = write_priomap,
243 }, 243 },
244 { } /* terminate */ 244 { } /* terminate */
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 7c2a0a71049e..d3b6d2cd3a06 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -274,10 +274,9 @@ static void set_majmin(char *str, unsigned m)
274 sprintf(str, "%u", m); 274 sprintf(str, "%u", m);
275} 275}
276 276
277static int devcgroup_seq_read(struct cgroup_subsys_state *css, 277static int devcgroup_seq_show(struct seq_file *m, void *v)
278 struct cftype *cft, struct seq_file *m)
279{ 278{
280 struct dev_cgroup *devcgroup = css_to_devcgroup(css); 279 struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m));
281 struct dev_exception_item *ex; 280 struct dev_exception_item *ex;
282 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 281 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
283 282
@@ -679,7 +678,7 @@ static struct cftype dev_cgroup_files[] = {
679 }, 678 },
680 { 679 {
681 .name = "list", 680 .name = "list",
682 .read_seq_string = devcgroup_seq_read, 681 .seq_show = devcgroup_seq_show,
683 .private = DEVCG_LIST, 682 .private = DEVCG_LIST,
684 }, 683 },
685 { } /* terminate */ 684 { } /* terminate */