-rw-r--r--  Documentation/cgroups/cgroups.txt | 20
-rw-r--r--  Documentation/cgroups/memory.txt | 4
-rw-r--r--  Documentation/cgroups/resource_counter.txt | 4
-rw-r--r--  Documentation/devicetree/bindings/ata/marvell.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/ata/sata_rcar.txt | 18
-rw-r--r--  arch/arm/boot/dts/armada-370-xp.dtsi | 2
-rw-r--r--  arch/arm/plat-samsung/include/plat/regs-ata.h | 56
-rw-r--r--  block/blk-throttle.c | 35
-rw-r--r--  block/cfq-iosched.c | 131
-rw-r--r--  drivers/ata/ahci.c | 62
-rw-r--r--  drivers/ata/ahci_imx.c | 242
-rw-r--r--  drivers/ata/ata_generic.c | 7
-rw-r--r--  drivers/ata/libahci.c | 4
-rw-r--r--  drivers/ata/libata-core.c | 27
-rw-r--r--  drivers/ata/libata-eh.c | 5
-rw-r--r--  drivers/ata/libata-scsi.c | 18
-rw-r--r--  drivers/ata/pata_samsung_cf.c | 43
-rw-r--r--  drivers/ata/sata_highbank.c | 1
-rw-r--r--  drivers/ata/sata_mv.c | 52
-rw-r--r--  drivers/ata/sata_rcar.c | 118
-rw-r--r--  drivers/md/bcache/request.c | 1
-rw-r--r--  fs/dlm/lowcomms.c | 8
-rw-r--r--  fs/gfs2/aops.c | 23
-rw-r--r--  fs/gfs2/dir.c | 90
-rw-r--r--  fs/gfs2/dir.h | 19
-rw-r--r--  fs/gfs2/glock.c | 29
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 26
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 118
-rw-r--r--  fs/gfs2/lops.c | 5
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 58
-rw-r--r--  fs/gfs2/quota.c | 342
-rw-r--r--  fs/gfs2/quota.h | 1
-rw-r--r--  fs/gfs2/rgrp.c | 113
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 43
-rw-r--r--  include/linux/cgroup.h | 112
-rw-r--r--  include/linux/libata.h | 2
-rw-r--r--  include/linux/vmpressure.h | 8
-rw-r--r--  include/uapi/linux/gfs2_ondisk.h | 11
-rw-r--r--  init/Kconfig | 3
-rw-r--r--  kernel/cgroup.c | 1202
-rw-r--r--  kernel/cgroup_freezer.c | 7
-rw-r--r--  kernel/cpuset.c | 71
-rw-r--r--  kernel/sched/core.c | 13
-rw-r--r--  kernel/sched/cpuacct.c | 18
-rw-r--r--  kernel/workqueue.c | 2
-rw-r--r--  lib/percpu-refcount.c | 3
-rw-r--r--  mm/hugetlb_cgroup.c | 22
-rw-r--r--  mm/memcontrol.c | 426
-rw-r--r--  mm/page_cgroup.c | 2
-rw-r--r--  mm/percpu.c | 4
-rw-r--r--  mm/vmpressure.c | 26
-rw-r--r--  net/core/netprio_cgroup.c | 8
-rw-r--r--  security/device_cgroup.c | 7
58 files changed, 2045 insertions(+), 1660 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 638bf17ff869..821de56d1580 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -24,7 +24,6 @@ CONTENTS:
   2.1 Basic Usage
   2.2 Attaching processes
   2.3 Mounting hierarchies by name
-  2.4 Notification API
 3. Kernel API
   3.1 Overview
   3.2 Synchronization
@@ -472,25 +471,6 @@ you give a subsystem a name.
 The name of the subsystem appears as part of the hierarchy description
 in /proc/mounts and /proc/<pid>/cgroups.
 
-2.4 Notification API
---------------------
-
-There is mechanism which allows to get notifications about changing
-status of a cgroup.
-
-To register a new notification handler you need to:
- - create a file descriptor for event notification using eventfd(2);
- - open a control file to be monitored (e.g. memory.usage_in_bytes);
- - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
-   Interpretation of args is defined by control file implementation;
-
-eventfd will be woken up by control file implementation or when the
-cgroup is removed.
-
-To unregister a notification handler just close eventfd.
-
-NOTE: Support of notifications should be implemented for the control
-file. See documentation for the subsystem.
 
 3. Kernel API
 =============
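
For reference, the notification API that the hunk above removes from the
document paired an eventfd with an open control file by writing to
cgroup.event_control. A minimal userspace sketch of those three steps
against the cgroup v1 memory controller (the /sys/fs/cgroup/memory/grp
path and the 8 MiB threshold are assumptions for illustration; error
handling is omitted):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    int main(void)
    {
        char buf[64];
        uint64_t hits;
        /* 1. create a file descriptor for event notification */
        int efd = eventfd(0, 0);
        /* 2. open the control file to be monitored */
        int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
                       O_RDONLY);
        int ecfd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
                        O_WRONLY);
        /* 3. write "<event_fd> <control_fd> <args>"; for this control
         * file the argument is a usage threshold in bytes (8 MiB here) */
        int n = snprintf(buf, sizeof(buf), "%d %d 8388608", efd, cfd);
        write(ecfd, buf, n);

        /* the kernel signals the eventfd when the threshold is crossed
         * or the cgroup is removed; closing efd unregisters */
        read(efd, &hits, sizeof(hits));
        printf("notified %llu time(s)\n", (unsigned long long)hits);
        return 0;
    }
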
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e2bc132608fd..2622115276aa 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
 per-node page counts including "hierarchical_<counter>" which sums up all
 hierarchical children's values in addition to the memcg's own value.
 
-The ouput format of memory.numa_stat is:
+The output format of memory.numa_stat is:
 
 total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
 file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
@@ -670,7 +670,7 @@ page tables.
 
 8.1 Interface
 
-This feature is disabled by default. It can be enabledi (and disabled again) by
+This feature is disabled by default. It can be enabled (and disabled again) by
 writing to memory.move_charge_at_immigrate of the destination cgroup.
 
 If you want to enable it:
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index c4d99ed0b418..52e1da16a309 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -97,8 +97,8 @@ to work with it.
 	(struct res_counter *rc, struct res_counter *top,
 	 unsinged long val)
 
-	Almost same as res_cunter_uncharge() but propagation of uncharge
-	stops when rc == top. This is useful when kill a res_coutner in
+	Almost same as res_counter_uncharge() but propagation of uncharge
+	stops when rc == top. This is useful when kill a res_counter in
 	child cgroup.
 
  2.1 Other accounting routines
diff --git a/Documentation/devicetree/bindings/ata/marvell.txt b/Documentation/devicetree/bindings/ata/marvell.txt
index b5cdd20cde9c..1c8351604d38 100644
--- a/Documentation/devicetree/bindings/ata/marvell.txt
+++ b/Documentation/devicetree/bindings/ata/marvell.txt
@@ -1,7 +1,7 @@
 * Marvell Orion SATA
 
 Required Properties:
-- compatibility : "marvell,orion-sata"
+- compatibility : "marvell,orion-sata" or "marvell,armada-370-sata"
 - reg : Address range of controller
 - interrupts : Interrupt controller is using
 - nr-ports : Number of SATA ports in use.
diff --git a/Documentation/devicetree/bindings/ata/sata_rcar.txt b/Documentation/devicetree/bindings/ata/sata_rcar.txt
new file mode 100644
index 000000000000..1e6111333fa8
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/sata_rcar.txt
@@ -0,0 +1,18 @@
+* Renesas R-Car SATA
+
+Required properties:
+- compatible		: should contain one of the following:
+			  - "renesas,sata-r8a7779" for R-Car H1
+			  - "renesas,sata-r8a7790" for R-Car H2
+			  - "renesas,sata-r8a7791" for R-Car M2
+- reg			: address and length of the SATA registers;
+- interrupts		: must consist of one interrupt specifier.
+
+Example:
+
+sata: sata@fc600000 {
+	compatible = "renesas,sata-r8a7779";
+	reg = <0xfc600000 0x2000>;
+	interrupt-parent = <&gic>;
+	interrupts = <0 100 IRQ_TYPE_LEVEL_HIGH>;
+};
diff --git a/arch/arm/boot/dts/armada-370-xp.dtsi b/arch/arm/boot/dts/armada-370-xp.dtsi
index 7f10f627ae5b..80ffacd128f8 100644
--- a/arch/arm/boot/dts/armada-370-xp.dtsi
+++ b/arch/arm/boot/dts/armada-370-xp.dtsi
@@ -152,7 +152,7 @@
 			};
 
 			sata@a0000 {
-				compatible = "marvell,orion-sata";
+				compatible = "marvell,armada-370-sata";
 				reg = <0xa0000 0x5000>;
 				interrupts = <55>;
 				clocks = <&gateclk 15>, <&gateclk 30>;
diff --git a/arch/arm/plat-samsung/include/plat/regs-ata.h b/arch/arm/plat-samsung/include/plat/regs-ata.h
deleted file mode 100644
index f5df92fdae26..000000000000
--- a/arch/arm/plat-samsung/include/plat/regs-ata.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* linux/arch/arm/plat-samsung/include/plat/regs-ata.h
- *
- * Copyright (c) 2010 Samsung Electronics Co., Ltd.
- *		http://www.samsung.com
- *
- * Samsung CF-ATA register definitions
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef __ASM_PLAT_REGS_ATA_H
-#define __ASM_PLAT_REGS_ATA_H __FILE__
-
-#define S3C_CFATA_REG(x)	(x)
-
-#define S3C_CFATA_MUX		S3C_CFATA_REG(0x0)
-
-#define S3C_ATA_CTRL		S3C_CFATA_REG(0x0)
-#define S3C_ATA_STATUS		S3C_CFATA_REG(0x4)
-#define S3C_ATA_CMD		S3C_CFATA_REG(0x8)
-#define S3C_ATA_SWRST		S3C_CFATA_REG(0xc)
-#define S3C_ATA_IRQ		S3C_CFATA_REG(0x10)
-#define S3C_ATA_IRQ_MSK		S3C_CFATA_REG(0x14)
-#define S3C_ATA_CFG		S3C_CFATA_REG(0x18)
-
-#define S3C_ATA_MDMA_TIME	S3C_CFATA_REG(0x28)
-#define S3C_ATA_PIO_TIME	S3C_CFATA_REG(0x2c)
-#define S3C_ATA_UDMA_TIME	S3C_CFATA_REG(0x30)
-#define S3C_ATA_XFR_NUM		S3C_CFATA_REG(0x34)
-#define S3C_ATA_XFR_CNT		S3C_CFATA_REG(0x38)
-#define S3C_ATA_TBUF_START	S3C_CFATA_REG(0x3c)
-#define S3C_ATA_TBUF_SIZE	S3C_CFATA_REG(0x40)
-#define S3C_ATA_SBUF_START	S3C_CFATA_REG(0x44)
-#define S3C_ATA_SBUF_SIZE	S3C_CFATA_REG(0x48)
-#define S3C_ATA_CADR_TBUF	S3C_CFATA_REG(0x4c)
-#define S3C_ATA_CADR_SBUF	S3C_CFATA_REG(0x50)
-#define S3C_ATA_PIO_DTR		S3C_CFATA_REG(0x54)
-#define S3C_ATA_PIO_FED		S3C_CFATA_REG(0x58)
-#define S3C_ATA_PIO_SCR		S3C_CFATA_REG(0x5c)
-#define S3C_ATA_PIO_LLR		S3C_CFATA_REG(0x60)
-#define S3C_ATA_PIO_LMR		S3C_CFATA_REG(0x64)
-#define S3C_ATA_PIO_LHR		S3C_CFATA_REG(0x68)
-#define S3C_ATA_PIO_DVR		S3C_CFATA_REG(0x6c)
-#define S3C_ATA_PIO_CSD		S3C_CFATA_REG(0x70)
-#define S3C_ATA_PIO_DAD		S3C_CFATA_REG(0x74)
-#define S3C_ATA_PIO_READY	S3C_CFATA_REG(0x78)
-#define S3C_ATA_PIO_RDATA	S3C_CFATA_REG(0x7c)
-
-#define S3C_CFATA_MUX_TRUEIDE	0x01
-
-#define S3C_ATA_CFG_SWAP	0x40
-#define S3C_ATA_CFG_IORDYEN	0x02
-
-#endif /* __ASM_PLAT_REGS_ATA_H */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 06534049afba..a760857e6b62 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
-static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
-			       struct cftype *cft, struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-
-	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
-			  cft->private, true);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
 	return 0;
 }
 
@@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
 	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static int tg_print_conf_u64(struct cgroup_subsys_state *css,
-			     struct cftype *cft, struct seq_file *sf)
+static int tg_print_conf_u64(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
-			  &blkcg_policy_throtl, cft->private, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
 	return 0;
 }
 
-static int tg_print_conf_uint(struct cgroup_subsys_state *css,
-			      struct cftype *cft, struct seq_file *sf)
+static int tg_print_conf_uint(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
-			  &blkcg_policy_throtl, cft->private, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
 	return 0;
 }
 
@@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = {
 	{
 		.name = "throttle.read_bps_device",
 		.private = offsetof(struct throtl_grp, bps[READ]),
-		.read_seq_string = tg_print_conf_u64,
+		.seq_show = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_bps_device",
 		.private = offsetof(struct throtl_grp, bps[WRITE]),
-		.read_seq_string = tg_print_conf_u64,
+		.seq_show = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.read_iops_device",
 		.private = offsetof(struct throtl_grp, iops[READ]),
-		.read_seq_string = tg_print_conf_uint,
+		.seq_show = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_iops_device",
 		.private = offsetof(struct throtl_grp, iops[WRITE]),
-		.read_seq_string = tg_print_conf_uint,
+		.seq_show = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.io_service_bytes",
 		.private = offsetof(struct tg_stats_cpu, service_bytes),
-		.read_seq_string = tg_print_cpu_rwstat,
+		.seq_show = tg_print_cpu_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
 		.private = offsetof(struct tg_stats_cpu, serviced),
-		.read_seq_string = tg_print_cpu_rwstat,
+		.seq_show = tg_print_cpu_rwstat,
 	},
 	{ }	/* terminate */
 };
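
The conversion in this file (and in cfq-iosched.c below) follows one
mechanical pattern: a cftype read_seq_string handler that took the css
and cftype as arguments becomes a plain seq_file show method, and the
css and cftype are recovered from the seq_file via the seq_css() and
seq_cft() accessors. A sketch of the resulting shape, with the foo_*
names as hypothetical stand-ins:

    #include <linux/cgroup.h>
    #include <linux/seq_file.h>

    /* foo_* is a hypothetical controller, not part of this patch */
    static u64 foo_read_stat(struct cgroup_subsys_state *css,
                             unsigned long which)
    {
            return 0;       /* placeholder for real per-css state */
    }

    /* new-style handler: only the seq_file arrives; css and cftype
     * are looked up from it instead of being passed in */
    static int foo_seq_show(struct seq_file *sf, void *v)
    {
            struct cgroup_subsys_state *css = seq_css(sf);
            struct cftype *cft = seq_cft(sf);

            seq_printf(sf, "%llu\n",
                       (unsigned long long)foo_read_stat(css, cft->private));
            return 0;
    }

    static struct cftype foo_files[] = {
            {
                    .name = "stat",
                    .seq_show = foo_seq_show,   /* was .read_seq_string */
            },
            { }     /* terminate */
    };
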
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4d5cec1ad80d..744833b630c6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
 	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
 }
 
-static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
-				    struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_weight_device(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
-			  &blkcg_policy_cfq, 0, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  cfqg_prfill_weight_device, &blkcg_policy_cfq,
+			  0, false);
 	return 0;
 }
 
@@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
 	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
 }
 
-static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
-					 struct cftype *cft,
-					 struct seq_file *sf)
+static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
-			  &blkcg_policy_cfq, 0, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
+			  0, false);
 	return 0;
 }
 
-static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-			    struct seq_file *sf)
+static int cfq_print_weight(struct seq_file *sf, void *v)
 {
-	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
+	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
 	return 0;
 }
 
-static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
-				 struct cftype *cft, struct seq_file *sf)
+static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 {
-	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
+	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
 	return 0;
 }
 
@@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
 	return __cfq_set_weight(css, cft, val, true);
 }
 
-static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
-			   struct seq_file *sf)
+static int cfqg_print_stat(struct seq_file *sf, void *v)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
-			  cft->private, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_cfq, seq_cft(sf)->private, false);
 	return 0;
 }
 
-static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
-			     struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_rwstat(struct seq_file *sf, void *v)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-
-	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
-			  cft->private, true);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_cfq, seq_cft(sf)->private, true);
 	return 0;
 }
 
@@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
-static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
-				     struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-
-	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
-			  &blkcg_policy_cfq, cft->private, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
+			  seq_cft(sf)->private, false);
 	return 0;
 }
 
-static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
-				       struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-
-	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
-			  &blkcg_policy_cfq, cft->private, true);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
+			  seq_cft(sf)->private, true);
 	return 0;
 }
 
@@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 }
 
 /* print avg_queue_size */
-static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
-				     struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
-
-	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
-			  &blkcg_policy_cfq, 0, false);
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
+			  0, false);
 	return 0;
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
@@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "weight_device",
 		.flags = CFTYPE_ONLY_ON_ROOT,
-		.read_seq_string = cfqg_print_leaf_weight_device,
+		.seq_show = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
 		.flags = CFTYPE_ONLY_ON_ROOT,
-		.read_seq_string = cfq_print_leaf_weight,
+		.seq_show = cfq_print_leaf_weight,
 		.write_u64 = cfq_set_leaf_weight,
 	},
 
@@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "weight_device",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.read_seq_string = cfqg_print_weight_device,
+		.seq_show = cfqg_print_weight_device,
 		.write_string = cfqg_set_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.read_seq_string = cfq_print_weight,
+		.seq_show = cfq_print_weight,
 		.write_u64 = cfq_set_weight,
 	},
 
 	{
 		.name = "leaf_weight_device",
-		.read_seq_string = cfqg_print_leaf_weight_device,
+		.seq_show = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "leaf_weight",
-		.read_seq_string = cfq_print_leaf_weight,
+		.seq_show = cfq_print_leaf_weight,
 		.write_u64 = cfq_set_leaf_weight,
 	},
 
@@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = {
 	{
 		.name = "time",
 		.private = offsetof(struct cfq_group, stats.time),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 	{
 		.name = "sectors",
 		.private = offsetof(struct cfq_group, stats.sectors),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 	{
 		.name = "io_service_bytes",
 		.private = offsetof(struct cfq_group, stats.service_bytes),
-		.read_seq_string = cfqg_print_rwstat,
+		.seq_show = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_serviced",
 		.private = offsetof(struct cfq_group, stats.serviced),
-		.read_seq_string = cfqg_print_rwstat,
+		.seq_show = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_service_time",
 		.private = offsetof(struct cfq_group, stats.service_time),
-		.read_seq_string = cfqg_print_rwstat,
+		.seq_show = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_wait_time",
 		.private = offsetof(struct cfq_group, stats.wait_time),
-		.read_seq_string = cfqg_print_rwstat,
+		.seq_show = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_merged",
 		.private = offsetof(struct cfq_group, stats.merged),
-		.read_seq_string = cfqg_print_rwstat,
+		.seq_show = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_queued",
 		.private = offsetof(struct cfq_group, stats.queued),
-		.read_seq_string = cfqg_print_rwstat,
+		.seq_show = cfqg_print_rwstat,
 	},
 
 	/* the same statictics which cover the cfqg and its descendants */
 	{
 		.name = "time_recursive",
 		.private = offsetof(struct cfq_group, stats.time),
-		.read_seq_string = cfqg_print_stat_recursive,
+		.seq_show = cfqg_print_stat_recursive,
 	},
 	{
 		.name = "sectors_recursive",
 		.private = offsetof(struct cfq_group, stats.sectors),
-		.read_seq_string = cfqg_print_stat_recursive,
+		.seq_show = cfqg_print_stat_recursive,
 	},
 	{
 		.name = "io_service_bytes_recursive",
 		.private = offsetof(struct cfq_group, stats.service_bytes),
-		.read_seq_string = cfqg_print_rwstat_recursive,
+		.seq_show = cfqg_print_rwstat_recursive,
 	},
 	{
 		.name = "io_serviced_recursive",
 		.private = offsetof(struct cfq_group, stats.serviced),
-		.read_seq_string = cfqg_print_rwstat_recursive,
+		.seq_show = cfqg_print_rwstat_recursive,
 	},
 	{
 		.name = "io_service_time_recursive",
 		.private = offsetof(struct cfq_group, stats.service_time),
-		.read_seq_string = cfqg_print_rwstat_recursive,
+		.seq_show = cfqg_print_rwstat_recursive,
 	},
 	{
 		.name = "io_wait_time_recursive",
 		.private = offsetof(struct cfq_group, stats.wait_time),
-		.read_seq_string = cfqg_print_rwstat_recursive,
+		.seq_show = cfqg_print_rwstat_recursive,
 	},
 	{
 		.name = "io_merged_recursive",
 		.private = offsetof(struct cfq_group, stats.merged),
-		.read_seq_string = cfqg_print_rwstat_recursive,
+		.seq_show = cfqg_print_rwstat_recursive,
 	},
 	{
 		.name = "io_queued_recursive",
 		.private = offsetof(struct cfq_group, stats.queued),
-		.read_seq_string = cfqg_print_rwstat_recursive,
+		.seq_show = cfqg_print_rwstat_recursive,
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
 		.name = "avg_queue_size",
-		.read_seq_string = cfqg_print_avg_queue_size,
+		.seq_show = cfqg_print_avg_queue_size,
 	},
 	{
 		.name = "group_wait_time",
 		.private = offsetof(struct cfq_group, stats.group_wait_time),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 	{
 		.name = "idle_time",
 		.private = offsetof(struct cfq_group, stats.idle_time),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 	{
 		.name = "empty_time",
 		.private = offsetof(struct cfq_group, stats.empty_time),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 	{
 		.name = "dequeue",
 		.private = offsetof(struct cfq_group, stats.dequeue),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 	{
 		.name = "unaccounted_time",
 		.private = offsetof(struct cfq_group, stats.unaccounted_time),
-		.read_seq_string = cfqg_print_stat,
+		.seq_show = cfqg_print_stat,
 	},
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 	{ }	/* terminate */
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index e3a92a6da39a..74911c2cb1dd 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -83,6 +83,8 @@ enum board_ids {
 static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent);
 static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class,
 				 unsigned long deadline);
+static void ahci_mcp89_apple_enable(struct pci_dev *pdev);
+static bool is_mcp89_apple(struct pci_dev *pdev);
 static int ahci_p5wdh_hardreset(struct ata_link *link, unsigned int *class,
 				unsigned long deadline);
 #ifdef CONFIG_PM
@@ -664,6 +666,10 @@ static int ahci_pci_device_resume(struct pci_dev *pdev)
 	if (rc)
 		return rc;
 
+	/* Apple BIOS helpfully mangles the registers on resume */
+	if (is_mcp89_apple(pdev))
+		ahci_mcp89_apple_enable(pdev);
+
 	if (pdev->dev.power.power_state.event == PM_EVENT_SUSPEND) {
 		rc = ahci_pci_reset_controller(host);
 		if (rc)
@@ -780,6 +786,48 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
 	}
 }
 
+/*
+ * Macbook7,1 firmware forcibly disables MCP89 AHCI and changes PCI ID when
+ * booting in BIOS compatibility mode. We restore the registers but not ID.
+ */
+static void ahci_mcp89_apple_enable(struct pci_dev *pdev)
+{
+	u32 val;
+
+	printk(KERN_INFO "ahci: enabling MCP89 AHCI mode\n");
+
+	pci_read_config_dword(pdev, 0xf8, &val);
+	val |= 1 << 0x1b;
+	/* the following changes the device ID, but appears not to affect function */
+	/* val = (val & ~0xf0000000) | 0x80000000; */
+	pci_write_config_dword(pdev, 0xf8, val);
+
+	pci_read_config_dword(pdev, 0x54c, &val);
+	val |= 1 << 0xc;
+	pci_write_config_dword(pdev, 0x54c, val);
+
+	pci_read_config_dword(pdev, 0x4a4, &val);
+	val &= 0xff;
+	val |= 0x01060100;
+	pci_write_config_dword(pdev, 0x4a4, val);
+
+	pci_read_config_dword(pdev, 0x54c, &val);
+	val &= ~(1 << 0xc);
+	pci_write_config_dword(pdev, 0x54c, val);
+
+	pci_read_config_dword(pdev, 0xf8, &val);
+	val &= ~(1 << 0x1b);
+	pci_write_config_dword(pdev, 0xf8, val);
+}
+
+static bool is_mcp89_apple(struct pci_dev *pdev)
+{
+	return pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+	       pdev->device == PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA &&
+	       pdev->subsystem_vendor == PCI_VENDOR_ID_APPLE &&
+	       pdev->subsystem_device == 0xcb89;
+}
+
 /* only some SB600 ahci controllers can do 64bit DMA */
 static bool ahci_sb600_enable_64bit(struct pci_dev *pdev)
 {
@@ -1100,7 +1148,7 @@ static inline void ahci_gtf_filter_workaround(struct ata_host *host)
 {}
 #endif
 
-int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv)
+static int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv)
 {
 	int rc;
 	unsigned int maxvec;
@@ -1212,15 +1260,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (pdev->vendor == PCI_VENDOR_ID_MARVELL && !marvell_enable)
 		return -ENODEV;
 
-	/*
-	 * For some reason, MCP89 on MacBook 7,1 doesn't work with
-	 * ahci, use ata_generic instead.
-	 */
-	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
-	    pdev->device == PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA &&
-	    pdev->subsystem_vendor == PCI_VENDOR_ID_APPLE &&
-	    pdev->subsystem_device == 0xcb89)
-		return -ENODEV;
+	/* Apple BIOS on MCP89 prevents us using AHCI */
+	if (is_mcp89_apple(pdev))
+		ahci_mcp89_apple_enable(pdev);
 
 	/* Promise's PDC42819 is a SAS/SATA controller that has an AHCI mode.
 	 * At the moment, we can only use the AHCI mode. Let the users know
diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c
index 3e23e9941dad..dd4d6f74d7bd 100644
--- a/drivers/ata/ahci_imx.c
+++ b/drivers/ata/ahci_imx.c
@@ -34,10 +34,21 @@ enum {
 	HOST_TIMER1MS = 0xe0, /* Timer 1-ms */
 };
 
+enum ahci_imx_type {
+	AHCI_IMX53,
+	AHCI_IMX6Q,
+};
+
 struct imx_ahci_priv {
 	struct platform_device *ahci_pdev;
+	enum ahci_imx_type type;
+
+	/* i.MX53 clock */
+	struct clk *sata_gate_clk;
+	/* Common clock */
 	struct clk *sata_ref_clk;
 	struct clk *ahb_clk;
+
 	struct regmap *gpr;
 	bool no_device;
 	bool first_time;
@@ -47,6 +58,59 @@ static int ahci_imx_hotplug;
 module_param_named(hotplug, ahci_imx_hotplug, int, 0644);
 MODULE_PARM_DESC(hotplug, "AHCI IMX hot-plug support (0=Don't support, 1=support)");
 
+static int imx_sata_clock_enable(struct device *dev)
+{
+	struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent);
+	int ret;
+
+	if (imxpriv->type == AHCI_IMX53) {
+		ret = clk_prepare_enable(imxpriv->sata_gate_clk);
+		if (ret < 0) {
+			dev_err(dev, "prepare-enable sata_gate clock err:%d\n",
+				ret);
+			return ret;
+		}
+	}
+
+	ret = clk_prepare_enable(imxpriv->sata_ref_clk);
+	if (ret < 0) {
+		dev_err(dev, "prepare-enable sata_ref clock err:%d\n",
+			ret);
+		goto clk_err;
+	}
+
+	if (imxpriv->type == AHCI_IMX6Q) {
+		regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13,
+				   IMX6Q_GPR13_SATA_MPLL_CLK_EN,
+				   IMX6Q_GPR13_SATA_MPLL_CLK_EN);
+	}
+
+	usleep_range(1000, 2000);
+
+	return 0;
+
+clk_err:
+	if (imxpriv->type == AHCI_IMX53)
+		clk_disable_unprepare(imxpriv->sata_gate_clk);
+	return ret;
+}
+
+static void imx_sata_clock_disable(struct device *dev)
+{
+	struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent);
+
+	if (imxpriv->type == AHCI_IMX6Q) {
+		regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13,
+				   IMX6Q_GPR13_SATA_MPLL_CLK_EN,
+				   !IMX6Q_GPR13_SATA_MPLL_CLK_EN);
+	}
+
+	clk_disable_unprepare(imxpriv->sata_ref_clk);
+
+	if (imxpriv->type == AHCI_IMX53)
+		clk_disable_unprepare(imxpriv->sata_gate_clk);
+}
+
 static void ahci_imx_error_handler(struct ata_port *ap)
 {
 	u32 reg_val;
@@ -72,16 +136,29 @@ static void ahci_imx_error_handler(struct ata_port *ap)
 	 */
 	reg_val = readl(mmio + PORT_PHY_CTL);
 	writel(reg_val | PORT_PHY_CTL_PDDQ_LOC, mmio + PORT_PHY_CTL);
-	regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13,
-			   IMX6Q_GPR13_SATA_MPLL_CLK_EN,
-			   !IMX6Q_GPR13_SATA_MPLL_CLK_EN);
-	clk_disable_unprepare(imxpriv->sata_ref_clk);
+	imx_sata_clock_disable(ap->dev);
 	imxpriv->no_device = true;
 }
 
+static int ahci_imx_softreset(struct ata_link *link, unsigned int *class,
+			      unsigned long deadline)
+{
+	struct ata_port *ap = link->ap;
+	struct imx_ahci_priv *imxpriv = dev_get_drvdata(ap->dev->parent);
+	int ret = -EIO;
+
+	if (imxpriv->type == AHCI_IMX53)
+		ret = ahci_pmp_retry_srst_ops.softreset(link, class, deadline);
+	else if (imxpriv->type == AHCI_IMX6Q)
+		ret = ahci_ops.softreset(link, class, deadline);
+
+	return ret;
+}
+
 static struct ata_port_operations ahci_imx_ops = {
 	.inherits = &ahci_platform_ops,
 	.error_handler = ahci_imx_error_handler,
+	.softreset = ahci_imx_softreset,
 };
 
 static const struct ata_port_info ahci_imx_port_info = {
@@ -91,52 +168,15 @@ static const struct ata_port_info ahci_imx_port_info = {
 	.port_ops = &ahci_imx_ops,
 };
 
-static int imx6q_sata_init(struct device *dev, void __iomem *mmio)
+static int imx_sata_init(struct device *dev, void __iomem *mmio)
 {
 	int ret = 0;
 	unsigned int reg_val;
 	struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent);
 
-	imxpriv->gpr =
-		syscon_regmap_lookup_by_compatible("fsl,imx6q-iomuxc-gpr");
-	if (IS_ERR(imxpriv->gpr)) {
-		dev_err(dev, "failed to find fsl,imx6q-iomux-gpr regmap\n");
-		return PTR_ERR(imxpriv->gpr);
-	}
-
-	ret = clk_prepare_enable(imxpriv->sata_ref_clk);
-	if (ret < 0) {
-		dev_err(dev, "prepare-enable sata_ref clock err:%d\n", ret);
+	ret = imx_sata_clock_enable(dev);
+	if (ret < 0)
 		return ret;
-	}
-
-	/*
-	 * set PHY Paremeters, two steps to configure the GPR13,
-	 * one write for rest of parameters, mask of first write
-	 * is 0x07ffffff, and the other one write for setting
-	 * the mpll_clk_en.
-	 */
-	regmap_update_bits(imxpriv->gpr, 0x34, IMX6Q_GPR13_SATA_RX_EQ_VAL_MASK
-			| IMX6Q_GPR13_SATA_RX_LOS_LVL_MASK
-			| IMX6Q_GPR13_SATA_RX_DPLL_MODE_MASK
-			| IMX6Q_GPR13_SATA_SPD_MODE_MASK
-			| IMX6Q_GPR13_SATA_MPLL_SS_EN
-			| IMX6Q_GPR13_SATA_TX_ATTEN_MASK
-			| IMX6Q_GPR13_SATA_TX_BOOST_MASK
-			| IMX6Q_GPR13_SATA_TX_LVL_MASK
-			| IMX6Q_GPR13_SATA_MPLL_CLK_EN
-			| IMX6Q_GPR13_SATA_TX_EDGE_RATE
-			, IMX6Q_GPR13_SATA_RX_EQ_VAL_3_0_DB
-			| IMX6Q_GPR13_SATA_RX_LOS_LVL_SATA2M
-			| IMX6Q_GPR13_SATA_RX_DPLL_MODE_2P_4F
-			| IMX6Q_GPR13_SATA_SPD_MODE_3P0G
-			| IMX6Q_GPR13_SATA_MPLL_SS_EN
-			| IMX6Q_GPR13_SATA_TX_ATTEN_9_16
-			| IMX6Q_GPR13_SATA_TX_BOOST_3_33_DB
-			| IMX6Q_GPR13_SATA_TX_LVL_1_025_V);
-	regmap_update_bits(imxpriv->gpr, 0x34, IMX6Q_GPR13_SATA_MPLL_CLK_EN,
-			   IMX6Q_GPR13_SATA_MPLL_CLK_EN);
-	usleep_range(100, 200);
 
 	/*
 	 * Configure the HWINIT bits of the HOST_CAP and HOST_PORTS_IMPL,
@@ -162,13 +202,9 @@ static int imx6q_sata_init(struct device *dev, void __iomem *mmio)
 	return 0;
 }
 
-static void imx6q_sata_exit(struct device *dev)
+static void imx_sata_exit(struct device *dev)
 {
-	struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent);
-
-	regmap_update_bits(imxpriv->gpr, 0x34, IMX6Q_GPR13_SATA_MPLL_CLK_EN,
-			   !IMX6Q_GPR13_SATA_MPLL_CLK_EN);
-	clk_disable_unprepare(imxpriv->sata_ref_clk);
+	imx_sata_clock_disable(dev);
 }
 
 static int imx_ahci_suspend(struct device *dev)
@@ -179,12 +215,8 @@ static int imx_ahci_suspend(struct device *dev)
 	 * If no_device is set, The CLKs had been gated off in the
 	 * initialization so don't do it again here.
 	 */
-	if (!imxpriv->no_device) {
-		regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13,
-				   IMX6Q_GPR13_SATA_MPLL_CLK_EN,
-				   !IMX6Q_GPR13_SATA_MPLL_CLK_EN);
-		clk_disable_unprepare(imxpriv->sata_ref_clk);
-	}
+	if (!imxpriv->no_device)
+		imx_sata_clock_disable(dev);
 
 	return 0;
 }
@@ -192,34 +224,26 @@ static int imx_ahci_suspend(struct device *dev)
 static int imx_ahci_resume(struct device *dev)
 {
 	struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent);
-	int ret;
-
-	if (!imxpriv->no_device) {
-		ret = clk_prepare_enable(imxpriv->sata_ref_clk);
-		if (ret < 0) {
-			dev_err(dev, "pre-enable sata_ref clock err:%d\n", ret);
-			return ret;
-		}
+	int ret = 0;
 
-		regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13,
-				   IMX6Q_GPR13_SATA_MPLL_CLK_EN,
-				   IMX6Q_GPR13_SATA_MPLL_CLK_EN);
-		usleep_range(1000, 2000);
-	}
+	if (!imxpriv->no_device)
+		ret = imx_sata_clock_enable(dev);
 
-	return 0;
+	return ret;
 }
 
-static struct ahci_platform_data imx6q_sata_pdata = {
-	.init = imx6q_sata_init,
-	.exit = imx6q_sata_exit,
+static struct ahci_platform_data imx_sata_pdata = {
+	.init = imx_sata_init,
+	.exit = imx_sata_exit,
 	.ata_port_info = &ahci_imx_port_info,
 	.suspend = imx_ahci_suspend,
 	.resume = imx_ahci_resume,
+
 };
 
 static const struct of_device_id imx_ahci_of_match[] = {
-	{ .compatible = "fsl,imx6q-ahci", .data = &imx6q_sata_pdata},
+	{ .compatible = "fsl,imx53-ahci", .data = (void *)AHCI_IMX53 },
+	{ .compatible = "fsl,imx6q-ahci", .data = (void *)AHCI_IMX6Q },
 	{},
 };
 MODULE_DEVICE_TABLE(of, imx_ahci_of_match);
@@ -229,12 +253,20 @@ static int imx_ahci_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct resource *mem, *irq, res[2];
 	const struct of_device_id *of_id;
+	enum ahci_imx_type type;
 	const struct ahci_platform_data *pdata = NULL;
 	struct imx_ahci_priv *imxpriv;
 	struct device *ahci_dev;
 	struct platform_device *ahci_pdev;
 	int ret;
 
+	of_id = of_match_device(imx_ahci_of_match, dev);
+	if (!of_id)
+		return -EINVAL;
+
+	type = (enum ahci_imx_type)of_id->data;
+	pdata = &imx_sata_pdata;
+
 	imxpriv = devm_kzalloc(dev, sizeof(*imxpriv), GFP_KERNEL);
 	if (!imxpriv) {
 		dev_err(dev, "can't alloc ahci_host_priv\n");
@@ -250,6 +282,8 @@ static int imx_ahci_probe(struct platform_device *pdev)
 
 	imxpriv->no_device = false;
 	imxpriv->first_time = true;
+	imxpriv->type = type;
+
 	imxpriv->ahb_clk = devm_clk_get(dev, "ahb");
 	if (IS_ERR(imxpriv->ahb_clk)) {
 		dev_err(dev, "can't get ahb clock.\n");
@@ -257,6 +291,15 @@ static int imx_ahci_probe(struct platform_device *pdev)
 		goto err_out;
 	}
 
+	if (type == AHCI_IMX53) {
+		imxpriv->sata_gate_clk = devm_clk_get(dev, "sata_gate");
+		if (IS_ERR(imxpriv->sata_gate_clk)) {
+			dev_err(dev, "can't get sata_gate clock.\n");
+			ret = PTR_ERR(imxpriv->sata_gate_clk);
+			goto err_out;
+		}
+	}
+
 	imxpriv->sata_ref_clk = devm_clk_get(dev, "sata_ref");
 	if (IS_ERR(imxpriv->sata_ref_clk)) {
 		dev_err(dev, "can't get sata_ref clock.\n");
@@ -267,14 +310,6 @@ static int imx_ahci_probe(struct platform_device *pdev)
 	imxpriv->ahci_pdev = ahci_pdev;
 	platform_set_drvdata(pdev, imxpriv);
 
-	of_id = of_match_device(imx_ahci_of_match, dev);
-	if (of_id) {
-		pdata = of_id->data;
-	} else {
-		ret = -EINVAL;
-		goto err_out;
-	}
-
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
 	if (!mem || !irq) {
@@ -290,6 +325,43 @@ static int imx_ahci_probe(struct platform_device *pdev)
 	ahci_dev->dma_mask = &ahci_dev->coherent_dma_mask;
 	ahci_dev->of_node = dev->of_node;
 
+	if (type == AHCI_IMX6Q) {
+		imxpriv->gpr = syscon_regmap_lookup_by_compatible(
+							"fsl,imx6q-iomuxc-gpr");
+		if (IS_ERR(imxpriv->gpr)) {
+			dev_err(dev,
+				"failed to find fsl,imx6q-iomux-gpr regmap\n");
+			ret = PTR_ERR(imxpriv->gpr);
+			goto err_out;
+		}
+
+		/*
+		 * Set PHY Paremeters, two steps to configure the GPR13,
+		 * one write for rest of parameters, mask of first write
+		 * is 0x07fffffe, and the other one write for setting
+		 * the mpll_clk_en happens in imx_sata_clock_enable().
+		 */
+		regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13,
+				   IMX6Q_GPR13_SATA_RX_EQ_VAL_MASK |
+				   IMX6Q_GPR13_SATA_RX_LOS_LVL_MASK |
+				   IMX6Q_GPR13_SATA_RX_DPLL_MODE_MASK |
+				   IMX6Q_GPR13_SATA_SPD_MODE_MASK |
+				   IMX6Q_GPR13_SATA_MPLL_SS_EN |
+				   IMX6Q_GPR13_SATA_TX_ATTEN_MASK |
+				   IMX6Q_GPR13_SATA_TX_BOOST_MASK |
+				   IMX6Q_GPR13_SATA_TX_LVL_MASK |
+				   IMX6Q_GPR13_SATA_MPLL_CLK_EN |
+				   IMX6Q_GPR13_SATA_TX_EDGE_RATE,
+				   IMX6Q_GPR13_SATA_RX_EQ_VAL_3_0_DB |
+				   IMX6Q_GPR13_SATA_RX_LOS_LVL_SATA2M |
+				   IMX6Q_GPR13_SATA_RX_DPLL_MODE_2P_4F |
+				   IMX6Q_GPR13_SATA_SPD_MODE_3P0G |
+				   IMX6Q_GPR13_SATA_MPLL_SS_EN |
+				   IMX6Q_GPR13_SATA_TX_ATTEN_9_16 |
+				   IMX6Q_GPR13_SATA_TX_BOOST_3_33_DB |
+				   IMX6Q_GPR13_SATA_TX_LVL_1_025_V);
+	}
+
 	ret = platform_device_add_resources(ahci_pdev, res, 2);
 	if (ret)
 		goto err_out;
diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c
index f8f38a08abc5..7d196656adb5 100644
--- a/drivers/ata/ata_generic.c
+++ b/drivers/ata/ata_generic.c
@@ -221,13 +221,6 @@ static struct pci_device_id ata_generic[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_OPTI, PCI_DEVICE_ID_OPTI_82C558), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_CENATEK,PCI_DEVICE_ID_CENATEK_IDE),
 	  .driver_data = ATA_GEN_FORCE_DMA },
-	/*
-	 * For some reason, MCP89 on MacBook 7,1 doesn't work with
-	 * ahci, use ata_generic instead.
-	 */
-	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA,
-	  PCI_VENDOR_ID_APPLE, 0xcb89,
-	  .driver_data = ATA_GEN_FORCE_DMA },
 #if !defined(CONFIG_PATA_TOSHIBA) && !defined(CONFIG_PATA_TOSHIBA_MODULE)
 	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2), },
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index c482f8cadd7a..36605abe5a67 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -1764,7 +1764,7 @@ static void ahci_handle_port_interrupt(struct ata_port *ap,
 	}
 }
 
-void ahci_port_intr(struct ata_port *ap)
+static void ahci_port_intr(struct ata_port *ap)
 {
 	void __iomem *port_mmio = ahci_port_base(ap);
 	u32 status;
@@ -1797,7 +1797,7 @@ irqreturn_t ahci_thread_fn(int irq, void *dev_instance)
 }
 EXPORT_SYMBOL_GPL(ahci_thread_fn);
 
-void ahci_hw_port_interrupt(struct ata_port *ap)
+static void ahci_hw_port_interrupt(struct ata_port *ap)
 {
 	void __iomem *port_mmio = ahci_port_base(ap);
 	struct ahci_port_priv *pp = ap->private_data;
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 1393a5890ed5..1a3dbd1b196e 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2222,6 +2222,16 @@ int ata_dev_configure(struct ata_device *dev)
 	if (rc)
 		return rc;
 
+	/* some WD SATA-1 drives have issues with LPM, turn on NOLPM for them */
+	if ((dev->horkage & ATA_HORKAGE_WD_BROKEN_LPM) &&
+	    (id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2)
+		dev->horkage |= ATA_HORKAGE_NOLPM;
+
+	if (dev->horkage & ATA_HORKAGE_NOLPM) {
+		ata_dev_warn(dev, "LPM support broken, forcing max_power\n");
+		dev->link->ap->target_lpm_policy = ATA_LPM_MAX_POWER;
+	}
+
 	/* let ACPI work its magic */
 	rc = ata_acpi_on_devcfg(dev);
 	if (rc)
@@ -4216,6 +4226,23 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "Micron_M500*",		NULL,	ATA_HORKAGE_NO_NCQ_TRIM, },
 	{ "Crucial_CT???M500SSD1",	NULL,	ATA_HORKAGE_NO_NCQ_TRIM, },
 
+	/*
+	 * Some WD SATA-I drives spin up and down erratically when the link
+	 * is put into the slumber mode. We don't have full list of the
+	 * affected devices. Disable LPM if the device matches one of the
+	 * known prefixes and is SATA-1. As a side effect LPM partial is
+	 * lost too.
+	 *
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=57211
+	 */
+	{ "WDC WD800JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+	{ "WDC WD1200JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+	{ "WDC WD1600JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+	{ "WDC WD2000JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+	{ "WDC WD2500JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+	{ "WDC WD3000JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+	{ "WDC WD3200JD-*",		NULL,	ATA_HORKAGE_WD_BROKEN_LPM },
+
 	/* End Marker */
 	{ }
 };
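
The SATA-1 test added to ata_dev_configure() above is terse; restated
as a sketch (sata_gen1_only() is a hypothetical helper, not part of the
patch, and the bit semantics are from the IDENTIFY DEVICE definition):

    #include <linux/ata.h>
    #include <linux/types.h>

    static bool sata_gen1_only(const u16 *id)
    {
            /* Word 76 (ATA_ID_SATA_CAPABILITY) advertises supported link
             * speeds: bit 1 = Gen1 (1.5 Gbps), bit 2 = Gen2 (3.0 Gbps),
             * bit 3 = Gen3 (6.0 Gbps). A masked value of exactly 0x2
             * means the drive claims Gen1 and nothing faster. */
            return (id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2;
    }
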
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 92d7797223be..6d8757008318 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2402,7 +2402,7 @@ static void ata_eh_link_report(struct ata_link *link)
 	struct ata_port *ap = link->ap;
 	struct ata_eh_context *ehc = &link->eh_context;
 	const char *frozen, *desc;
-	char tries_buf[6];
+	char tries_buf[6] = "";
 	int tag, nr_failed = 0;
 
 	if (ehc->i.flags & ATA_EHI_QUIET)
@@ -2433,9 +2433,8 @@ static void ata_eh_link_report(struct ata_link *link)
 	if (ap->pflags & ATA_PFLAG_FROZEN)
 		frozen = " frozen";
 
-	memset(tries_buf, 0, sizeof(tries_buf));
 	if (ap->eh_tries < ATA_EH_MAX_TRIES)
-		snprintf(tries_buf, sizeof(tries_buf) - 1, " t%d",
+		snprintf(tries_buf, sizeof(tries_buf), " t%d",
 			 ap->eh_tries);
 
 	if (ehc->i.dev) {
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 377eb889f555..ef8567de6a75 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -111,12 +111,14 @@ static const char *ata_lpm_policy_names[] = {
 	[ATA_LPM_MIN_POWER]	= "min_power",
 };
 
-static ssize_t ata_scsi_lpm_store(struct device *dev,
+static ssize_t ata_scsi_lpm_store(struct device *device,
 				  struct device_attribute *attr,
 				  const char *buf, size_t count)
 {
-	struct Scsi_Host *shost = class_to_shost(dev);
+	struct Scsi_Host *shost = class_to_shost(device);
 	struct ata_port *ap = ata_shost_to_port(shost);
+	struct ata_link *link;
+	struct ata_device *dev;
 	enum ata_lpm_policy policy;
 	unsigned long flags;
 
@@ -132,10 +134,20 @@ static ssize_t ata_scsi_lpm_store(struct device *dev,
 		return -EINVAL;
 
 	spin_lock_irqsave(ap->lock, flags);
+
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, &ap->link, ENABLED) {
+			if (dev->horkage & ATA_HORKAGE_NOLPM) {
+				count = -EOPNOTSUPP;
+				goto out_unlock;
+			}
+		}
+	}
+
 	ap->target_lpm_policy = policy;
 	ata_port_schedule_eh(ap);
+out_unlock:
 	spin_unlock_irqrestore(ap->lock, flags);
-
 	return count;
 }
 
diff --git a/drivers/ata/pata_samsung_cf.c b/drivers/ata/pata_samsung_cf.c
index 898e544a7ae8..a79566d05666 100644
--- a/drivers/ata/pata_samsung_cf.c
+++ b/drivers/ata/pata_samsung_cf.c
@@ -24,11 +24,34 @@
 #include <linux/slab.h>
 
 #include <linux/platform_data/ata-samsung_cf.h>
-#include <plat/regs-ata.h>
 
 #define DRV_NAME "pata_samsung_cf"
 #define DRV_VERSION "0.1"
 
+#define S3C_CFATA_REG(x)	(x)
+#define S3C_CFATA_MUX		S3C_CFATA_REG(0x0)
+#define S3C_ATA_CTRL		S3C_CFATA_REG(0x0)
+#define S3C_ATA_CMD		S3C_CFATA_REG(0x8)
+#define S3C_ATA_IRQ		S3C_CFATA_REG(0x10)
+#define S3C_ATA_IRQ_MSK		S3C_CFATA_REG(0x14)
+#define S3C_ATA_CFG		S3C_CFATA_REG(0x18)
+
+#define S3C_ATA_PIO_TIME	S3C_CFATA_REG(0x2c)
+#define S3C_ATA_PIO_DTR		S3C_CFATA_REG(0x54)
+#define S3C_ATA_PIO_FED		S3C_CFATA_REG(0x58)
+#define S3C_ATA_PIO_SCR		S3C_CFATA_REG(0x5c)
+#define S3C_ATA_PIO_LLR		S3C_CFATA_REG(0x60)
+#define S3C_ATA_PIO_LMR		S3C_CFATA_REG(0x64)
+#define S3C_ATA_PIO_LHR		S3C_CFATA_REG(0x68)
+#define S3C_ATA_PIO_DVR		S3C_CFATA_REG(0x6c)
+#define S3C_ATA_PIO_CSD		S3C_CFATA_REG(0x70)
+#define S3C_ATA_PIO_DAD		S3C_CFATA_REG(0x74)
+#define S3C_ATA_PIO_RDATA	S3C_CFATA_REG(0x7c)
+
+#define S3C_CFATA_MUX_TRUEIDE	0x01
+#define S3C_ATA_CFG_SWAP	0x40
+#define S3C_ATA_CFG_IORDYEN	0x02
+
 enum s3c_cpu_type {
 	TYPE_S3C64XX,
 	TYPE_S5PC100,
@@ -495,22 +518,10 @@ static int __init pata_s3c_probe(struct platform_device *pdev)
495 info->irq = platform_get_irq(pdev, 0); 518 info->irq = platform_get_irq(pdev, 0);
496 519
497 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 520 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
498 if (res == NULL) {
499 dev_err(dev, "failed to get mem resource\n");
500 return -EINVAL;
501 }
502
503 if (!devm_request_mem_region(dev, res->start,
504 resource_size(res), DRV_NAME)) {
505 dev_err(dev, "error requesting register region\n");
506 return -EBUSY;
507 }
508 521
509 info->ide_addr = devm_ioremap(dev, res->start, resource_size(res)); 522 info->ide_addr = devm_ioremap_resource(dev, res);
510 if (!info->ide_addr) { 523 if (IS_ERR(info->ide_addr))
511 dev_err(dev, "failed to map IO base address\n"); 524 return PTR_ERR(info->ide_addr);
512 return -ENOMEM;
513 }
514 525
515 info->clk = devm_clk_get(&pdev->dev, "cfcon"); 526 info->clk = devm_clk_get(&pdev->dev, "cfcon");
516 if (IS_ERR(info->clk)) { 527 if (IS_ERR(info->clk)) {
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index ea3b3dc10f33..870b11eadc6d 100644
--- a/drivers/ata/sata_highbank.c
+++ b/drivers/ata/sata_highbank.c
@@ -29,7 +29,6 @@
29#include <linux/of_address.h> 29#include <linux/of_address.h>
30#include <linux/platform_device.h> 30#include <linux/platform_device.h>
31#include <linux/libata.h> 31#include <linux/libata.h>
32#include <linux/ahci_platform.h>
33#include <linux/interrupt.h> 32#include <linux/interrupt.h>
34#include <linux/delay.h> 33#include <linux/delay.h>
35#include <linux/export.h> 34#include <linux/export.h>
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 56be31819897..20a7517bd339 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -60,6 +60,7 @@
60#include <linux/dma-mapping.h> 60#include <linux/dma-mapping.h>
61#include <linux/device.h> 61#include <linux/device.h>
62#include <linux/clk.h> 62#include <linux/clk.h>
63#include <linux/phy/phy.h>
63#include <linux/platform_device.h> 64#include <linux/platform_device.h>
64#include <linux/ata_platform.h> 65#include <linux/ata_platform.h>
65#include <linux/mbus.h> 66#include <linux/mbus.h>
@@ -304,6 +305,7 @@ enum {
304 MV5_LTMODE = 0x30, 305 MV5_LTMODE = 0x30,
305 MV5_PHY_CTL = 0x0C, 306 MV5_PHY_CTL = 0x0C,
306 SATA_IFCFG = 0x050, 307 SATA_IFCFG = 0x050,
308 LP_PHY_CTL = 0x058,
307 309
308 MV_M2_PREAMP_MASK = 0x7e0, 310 MV_M2_PREAMP_MASK = 0x7e0,
309 311
@@ -431,6 +433,7 @@ enum {
431 MV_HP_CUT_THROUGH = (1 << 10), /* can use EDMA cut-through */ 433 MV_HP_CUT_THROUGH = (1 << 10), /* can use EDMA cut-through */
432 MV_HP_FLAG_SOC = (1 << 11), /* SystemOnChip, no PCI */ 434 MV_HP_FLAG_SOC = (1 << 11), /* SystemOnChip, no PCI */
433 MV_HP_QUIRK_LED_BLINK_EN = (1 << 12), /* is led blinking enabled? */ 435 MV_HP_QUIRK_LED_BLINK_EN = (1 << 12), /* is led blinking enabled? */
436 MV_HP_FIX_LP_PHY_CTL = (1 << 13), /* fix speed in LP_PHY_CTL ? */
434 437
435 /* Port private flags (pp_flags) */ 438 /* Port private flags (pp_flags) */
436 MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? */ 439 MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? */
@@ -563,6 +566,12 @@ struct mv_host_priv {
563 struct clk *clk; 566 struct clk *clk;
564 struct clk **port_clks; 567 struct clk **port_clks;
565 /* 568 /*
569 * Some devices have a SATA PHY which can be enabled/disabled
570 * in order to save power. These are optional: if the platform
571 * device does not have any PHYs, they won't be used.
572 */
573 struct phy **port_phys;
574 /*
566 * These consistent DMA memory pools give us guaranteed 575 * These consistent DMA memory pools give us guaranteed
567 * alignment for hardware-accessed data structures, 576 * alignment for hardware-accessed data structures,
568 * and less memory waste in accomplishing the alignment. 577 * and less memory waste in accomplishing the alignment.
@@ -1358,6 +1367,7 @@ static int mv_scr_write(struct ata_link *link, unsigned int sc_reg_in, u32 val)
1358 1367
1359 if (ofs != 0xffffffffU) { 1368 if (ofs != 0xffffffffU) {
1360 void __iomem *addr = mv_ap_base(link->ap) + ofs; 1369 void __iomem *addr = mv_ap_base(link->ap) + ofs;
1370 struct mv_host_priv *hpriv = link->ap->host->private_data;
1361 if (sc_reg_in == SCR_CONTROL) { 1371 if (sc_reg_in == SCR_CONTROL) {
1362 /* 1372 /*
1363 * Workaround for 88SX60x1 FEr SATA#26: 1373 * Workaround for 88SX60x1 FEr SATA#26:
@@ -1374,6 +1384,18 @@ static int mv_scr_write(struct ata_link *link, unsigned int sc_reg_in, u32 val)
1374 */ 1384 */
1375 if ((val & 0xf) == 1 || (readl(addr) & 0xf) == 1) 1385 if ((val & 0xf) == 1 || (readl(addr) & 0xf) == 1)
1376 val |= 0xf000; 1386 val |= 0xf000;
1387
1388 if (hpriv->hp_flags & MV_HP_FIX_LP_PHY_CTL) {
1389 void __iomem *lp_phy_addr =
1390 mv_ap_base(link->ap) + LP_PHY_CTL;
1391 /*
1392 * Set PHY speed according to SControl speed.
1393 */
1394 if ((val & 0xf0) == 0x10)
1395 writelfl(0x7, lp_phy_addr);
1396 else
1397 writelfl(0x227, lp_phy_addr);
1398 }
1377 } 1399 }
1378 writelfl(val, addr); 1400 writelfl(val, addr);
1379 return 0; 1401 return 0;
@@ -4076,6 +4098,11 @@ static int mv_platform_probe(struct platform_device *pdev)
4076 GFP_KERNEL); 4098 GFP_KERNEL);
4077 if (!hpriv->port_clks) 4099 if (!hpriv->port_clks)
4078 return -ENOMEM; 4100 return -ENOMEM;
4101 hpriv->port_phys = devm_kzalloc(&pdev->dev,
4102 sizeof(struct phy *) * n_ports,
4103 GFP_KERNEL);
4104 if (!hpriv->port_phys)
4105 return -ENOMEM;
4079 host->private_data = hpriv; 4106 host->private_data = hpriv;
4080 hpriv->n_ports = n_ports; 4107 hpriv->n_ports = n_ports;
4081 hpriv->board_idx = chip_soc; 4108 hpriv->board_idx = chip_soc;
@@ -4097,6 +4124,17 @@ static int mv_platform_probe(struct platform_device *pdev)
4097 hpriv->port_clks[port] = clk_get(&pdev->dev, port_number); 4124 hpriv->port_clks[port] = clk_get(&pdev->dev, port_number);
4098 if (!IS_ERR(hpriv->port_clks[port])) 4125 if (!IS_ERR(hpriv->port_clks[port]))
4099 clk_prepare_enable(hpriv->port_clks[port]); 4126 clk_prepare_enable(hpriv->port_clks[port]);
4127
4128 sprintf(port_number, "port%d", port);
4129 hpriv->port_phys[port] = devm_phy_get(&pdev->dev, port_number);
4130 if (IS_ERR(hpriv->port_phys[port])) {
4131 rc = PTR_ERR(hpriv->port_phys[port]);
4132 hpriv->port_phys[port] = NULL;
4133 if ((rc != -EPROBE_DEFER) && (rc != -ENODEV))
4134 dev_warn(&pdev->dev, "error getting phy");
4135 goto err;
4136 } else
4137 phy_power_on(hpriv->port_phys[port]);
4100 } 4138 }
4101 4139
4102 /* 4140 /*
@@ -4110,6 +4148,15 @@ static int mv_platform_probe(struct platform_device *pdev)
4110 if (rc) 4148 if (rc)
4111 goto err; 4149 goto err;
4112 4150
4151 /*
4152 * To allow disk hotplug on Armada 370/XP SoCs, the PHY speed must be
4153 * updated in the LP_PHY_CTL register.
4154 */
4155 if (pdev->dev.of_node &&
4156 of_device_is_compatible(pdev->dev.of_node,
4157 "marvell,armada-370-sata"))
4158 hpriv->hp_flags |= MV_HP_FIX_LP_PHY_CTL;
4159
4113 /* initialize adapter */ 4160 /* initialize adapter */
4114 rc = mv_init_host(host); 4161 rc = mv_init_host(host);
4115 if (rc) 4162 if (rc)
@@ -4132,6 +4179,8 @@ err:
4132 clk_disable_unprepare(hpriv->port_clks[port]); 4179 clk_disable_unprepare(hpriv->port_clks[port]);
4133 clk_put(hpriv->port_clks[port]); 4180 clk_put(hpriv->port_clks[port]);
4134 } 4181 }
4182 if (hpriv->port_phys[port])
4183 phy_power_off(hpriv->port_phys[port]);
4135 } 4184 }
4136 4185
4137 return rc; 4186 return rc;
@@ -4161,6 +4210,8 @@ static int mv_platform_remove(struct platform_device *pdev)
4161 clk_disable_unprepare(hpriv->port_clks[port]); 4210 clk_disable_unprepare(hpriv->port_clks[port]);
4162 clk_put(hpriv->port_clks[port]); 4211 clk_put(hpriv->port_clks[port]);
4163 } 4212 }
4213 if (hpriv->port_phys[port])
4214 phy_power_off(hpriv->port_phys[port]);
4164 } 4215 }
4165 return 0; 4216 return 0;
4166} 4217}
@@ -4209,6 +4260,7 @@ static int mv_platform_resume(struct platform_device *pdev)
4209 4260
4210#ifdef CONFIG_OF 4261#ifdef CONFIG_OF
4211static struct of_device_id mv_sata_dt_ids[] = { 4262static struct of_device_id mv_sata_dt_ids[] = {
4263 { .compatible = "marvell,armada-370-sata", },
4212 { .compatible = "marvell,orion-sata", }, 4264 { .compatible = "marvell,orion-sata", },
4213 {}, 4265 {},
4214}; 4266};
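
The sata_mv changes are a straightforward consumer of the generic PHY framework: look each PHY up by name, stay quiet when the PHY is simply absent (-ENODEV) or its provider is not ready yet (-EPROBE_DEFER), and balance phy_power_on() with phy_power_off() on the error and remove paths. A hedged sketch of the optional-PHY shape (not a copy of the hunk above; here -ENODEV lets probing continue, and the "port0" name mirrors the string built in the probe loop):

	#include <linux/phy/phy.h>
	#include <linux/err.h>

	static int demo_attach_phy(struct device *dev, struct phy **out)
	{
		struct phy *phy = devm_phy_get(dev, "port0");

		if (IS_ERR(phy)) {
			int rc = PTR_ERR(phy);

			*out = NULL;
			if (rc == -ENODEV)
				return 0;	/* no PHY: carry on without */
			return rc;		/* includes -EPROBE_DEFER */
		}
		*out = phy;
		return phy_power_on(phy);	/* pair with phy_power_off() */
	}

The framework also offers devm_phy_optional_get() for exactly this "missing PHY is fine" case.
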
diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c
index 1dae9a9009f7..2b25bd83fc9d 100644
--- a/drivers/ata/sata_rcar.c
+++ b/drivers/ata/sata_rcar.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/ata.h> 16#include <linux/ata.h>
17#include <linux/libata.h> 17#include <linux/libata.h>
18#include <linux/of_device.h>
18#include <linux/platform_device.h> 19#include <linux/platform_device.h>
19#include <linux/clk.h> 20#include <linux/clk.h>
20#include <linux/err.h> 21#include <linux/err.h>
@@ -123,12 +124,37 @@
123 124
124#define SATA_RCAR_DMA_BOUNDARY 0x1FFFFFFEUL 125#define SATA_RCAR_DMA_BOUNDARY 0x1FFFFFFEUL
125 126
127/* Gen2 Physical Layer Control Registers */
128#define RCAR_GEN2_PHY_CTL1_REG 0x1704
129#define RCAR_GEN2_PHY_CTL1 0x34180002
130#define RCAR_GEN2_PHY_CTL1_SS 0xC180 /* Spread Spectrum */
131
132#define RCAR_GEN2_PHY_CTL2_REG 0x170C
133#define RCAR_GEN2_PHY_CTL2 0x00002303
134
135#define RCAR_GEN2_PHY_CTL3_REG 0x171C
136#define RCAR_GEN2_PHY_CTL3 0x000B0194
137
138#define RCAR_GEN2_PHY_CTL4_REG 0x1724
139#define RCAR_GEN2_PHY_CTL4 0x00030994
140
141#define RCAR_GEN2_PHY_CTL5_REG 0x1740
142#define RCAR_GEN2_PHY_CTL5 0x03004001
143#define RCAR_GEN2_PHY_CTL5_DC BIT(1) /* DC connection */
144#define RCAR_GEN2_PHY_CTL5_TR BIT(2) /* Termination Resistor */
145
146enum sata_rcar_type {
147 RCAR_GEN1_SATA,
148 RCAR_GEN2_SATA,
149};
150
126struct sata_rcar_priv { 151struct sata_rcar_priv {
127 void __iomem *base; 152 void __iomem *base;
128 struct clk *clk; 153 struct clk *clk;
154 enum sata_rcar_type type;
129}; 155};
130 156
131static void sata_rcar_phy_initialize(struct sata_rcar_priv *priv) 157static void sata_rcar_gen1_phy_preinit(struct sata_rcar_priv *priv)
132{ 158{
133 void __iomem *base = priv->base; 159 void __iomem *base = priv->base;
134 160
@@ -141,8 +167,8 @@ static void sata_rcar_phy_initialize(struct sata_rcar_priv *priv)
141 iowrite32(0, base + SATAPHYRESET_REG); 167 iowrite32(0, base + SATAPHYRESET_REG);
142} 168}
143 169
144static void sata_rcar_phy_write(struct sata_rcar_priv *priv, u16 reg, u32 val, 170static void sata_rcar_gen1_phy_write(struct sata_rcar_priv *priv, u16 reg,
145 int group) 171 u32 val, int group)
146{ 172{
147 void __iomem *base = priv->base; 173 void __iomem *base = priv->base;
148 int timeout; 174 int timeout;
@@ -170,6 +196,29 @@ static void sata_rcar_phy_write(struct sata_rcar_priv *priv, u16 reg, u32 val,
170 iowrite32(0, base + SATAPHYADDR_REG); 196 iowrite32(0, base + SATAPHYADDR_REG);
171} 197}
172 198
199static void sata_rcar_gen1_phy_init(struct sata_rcar_priv *priv)
200{
201 sata_rcar_gen1_phy_preinit(priv);
202 sata_rcar_gen1_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 0);
203 sata_rcar_gen1_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 1);
204 sata_rcar_gen1_phy_write(priv, SATAPCTLR3_REG, 0x0000A061, 0);
205 sata_rcar_gen1_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 0);
206 sata_rcar_gen1_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 1);
207 sata_rcar_gen1_phy_write(priv, SATAPCTLR4_REG, 0x28E80000, 0);
208}
209
210static void sata_rcar_gen2_phy_init(struct sata_rcar_priv *priv)
211{
212 void __iomem *base = priv->base;
213
214 iowrite32(RCAR_GEN2_PHY_CTL1, base + RCAR_GEN2_PHY_CTL1_REG);
215 iowrite32(RCAR_GEN2_PHY_CTL2, base + RCAR_GEN2_PHY_CTL2_REG);
216 iowrite32(RCAR_GEN2_PHY_CTL3, base + RCAR_GEN2_PHY_CTL3_REG);
217 iowrite32(RCAR_GEN2_PHY_CTL4, base + RCAR_GEN2_PHY_CTL4_REG);
218 iowrite32(RCAR_GEN2_PHY_CTL5 | RCAR_GEN2_PHY_CTL5_DC |
219 RCAR_GEN2_PHY_CTL5_TR, base + RCAR_GEN2_PHY_CTL5_REG);
220}
221
173static void sata_rcar_freeze(struct ata_port *ap) 222static void sata_rcar_freeze(struct ata_port *ap)
174{ 223{
175 struct sata_rcar_priv *priv = ap->host->private_data; 224 struct sata_rcar_priv *priv = ap->host->private_data;
@@ -738,13 +787,17 @@ static void sata_rcar_init_controller(struct ata_host *host)
738 u32 val; 787 u32 val;
739 788
740 /* reset and setup phy */ 789 /* reset and setup phy */
741 sata_rcar_phy_initialize(priv); 790 switch (priv->type) {
742 sata_rcar_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 0); 791 case RCAR_GEN1_SATA:
743 sata_rcar_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 1); 792 sata_rcar_gen1_phy_init(priv);
744 sata_rcar_phy_write(priv, SATAPCTLR3_REG, 0x0000A061, 0); 793 break;
745 sata_rcar_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 0); 794 case RCAR_GEN2_SATA:
746 sata_rcar_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 1); 795 sata_rcar_gen2_phy_init(priv);
747 sata_rcar_phy_write(priv, SATAPCTLR4_REG, 0x28E80000, 0); 796 break;
797 default:
798 dev_warn(host->dev, "SATA phy is not initialized\n");
799 break;
800 }
748 801
749 /* SATA-IP reset state */ 802 /* SATA-IP reset state */
750 val = ioread32(base + ATAPI_CONTROL1_REG); 803 val = ioread32(base + ATAPI_CONTROL1_REG);
@@ -770,8 +823,40 @@ static void sata_rcar_init_controller(struct ata_host *host)
770 iowrite32(ATAPI_INT_ENABLE_SATAINT, base + ATAPI_INT_ENABLE_REG); 823 iowrite32(ATAPI_INT_ENABLE_SATAINT, base + ATAPI_INT_ENABLE_REG);
771} 824}
772 825
826static struct of_device_id sata_rcar_match[] = {
827 {
828 /* Deprecated by "renesas,sata-r8a7779" */
829 .compatible = "renesas,rcar-sata",
830 .data = (void *)RCAR_GEN1_SATA,
831 },
832 {
833 .compatible = "renesas,sata-r8a7779",
834 .data = (void *)RCAR_GEN1_SATA,
835 },
836 {
837 .compatible = "renesas,sata-r8a7790",
838 .data = (void *)RCAR_GEN2_SATA
839 },
840 {
841 .compatible = "renesas,sata-r8a7791",
842 .data = (void *)RCAR_GEN2_SATA
843 },
844 { },
845};
846MODULE_DEVICE_TABLE(of, sata_rcar_match);
847
848static const struct platform_device_id sata_rcar_id_table[] = {
849 { "sata_rcar", RCAR_GEN1_SATA }, /* Deprecated by "sata-r8a7779" */
850 { "sata-r8a7779", RCAR_GEN1_SATA },
851 { "sata-r8a7790", RCAR_GEN2_SATA },
852 { "sata-r8a7791", RCAR_GEN2_SATA },
853 { },
854};
855MODULE_DEVICE_TABLE(platform, sata_rcar_id_table);
856
773static int sata_rcar_probe(struct platform_device *pdev) 857static int sata_rcar_probe(struct platform_device *pdev)
774{ 858{
859 const struct of_device_id *of_id;
775 struct ata_host *host; 860 struct ata_host *host;
776 struct sata_rcar_priv *priv; 861 struct sata_rcar_priv *priv;
777 struct resource *mem; 862 struct resource *mem;
@@ -787,6 +872,12 @@ static int sata_rcar_probe(struct platform_device *pdev)
787 if (!priv) 872 if (!priv)
788 return -ENOMEM; 873 return -ENOMEM;
789 874
875 of_id = of_match_device(sata_rcar_match, &pdev->dev);
876 if (of_id)
877 priv->type = (enum sata_rcar_type)of_id->data;
878 else
879 priv->type = platform_get_device_id(pdev)->driver_data;
880
790 priv->clk = devm_clk_get(&pdev->dev, NULL); 881 priv->clk = devm_clk_get(&pdev->dev, NULL);
791 if (IS_ERR(priv->clk)) { 882 if (IS_ERR(priv->clk)) {
792 dev_err(&pdev->dev, "failed to get access to sata clock\n"); 883 dev_err(&pdev->dev, "failed to get access to sata clock\n");
@@ -892,15 +983,10 @@ static const struct dev_pm_ops sata_rcar_pm_ops = {
892}; 983};
893#endif 984#endif
894 985
895static struct of_device_id sata_rcar_match[] = {
896 { .compatible = "renesas,rcar-sata", },
897 {},
898};
899MODULE_DEVICE_TABLE(of, sata_rcar_match);
900
901static struct platform_driver sata_rcar_driver = { 986static struct platform_driver sata_rcar_driver = {
902 .probe = sata_rcar_probe, 987 .probe = sata_rcar_probe,
903 .remove = sata_rcar_remove, 988 .remove = sata_rcar_remove,
989 .id_table = sata_rcar_id_table,
904 .driver = { 990 .driver = {
905 .name = DRV_NAME, 991 .name = DRV_NAME,
906 .owner = THIS_MODULE, 992 .owner = THIS_MODULE,
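
sata_rcar now resolves its device variant from either probe path: of_match_device() when the device comes from the device tree, platform_get_device_id() when it was bound by platform device name. A minimal sketch of the same dual dispatch, with hypothetical names:

	#include <linux/of_device.h>
	#include <linux/platform_device.h>

	enum demo_type { DEMO_GEN1, DEMO_GEN2 };

	static const struct of_device_id demo_of_match[] = {
		{ .compatible = "vendor,demo-gen1", .data = (void *)DEMO_GEN1 },
		{ .compatible = "vendor,demo-gen2", .data = (void *)DEMO_GEN2 },
		{ },
	};

	static int demo_probe(struct platform_device *pdev)
	{
		const struct of_device_id *of_id =
			of_match_device(demo_of_match, &pdev->dev);
		enum demo_type type;

		if (of_id)	/* DT instantiation */
			type = (enum demo_type)(uintptr_t)of_id->data;
		else		/* legacy name-based binding */
			type = platform_get_device_id(pdev)->driver_data;

		dev_info(&pdev->dev, "variant %d\n", type);
		return 0;
	}
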
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index fbcc851ed5a5..61bcfc21d2a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
163static void bcachecg_destroy(struct cgroup *cgroup) 163static void bcachecg_destroy(struct cgroup *cgroup)
164{ 164{
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 free_css_id(&bcache_subsys, &cg->css);
167 kfree(cg); 166 kfree(cg);
168} 167}
169 168
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa6..a5e34dd6a32c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con,
649 struct msghdr *msg, char *buf) 649 struct msghdr *msg, char *buf)
650{ 650{
651 union sctp_notification *sn = (union sctp_notification *)buf; 651 union sctp_notification *sn = (union sctp_notification *)buf;
652 struct linger linger;
652 653
653 switch (sn->sn_header.sn_type) { 654 switch (sn->sn_header.sn_type) {
654 case SCTP_SEND_FAILED: 655 case SCTP_SEND_FAILED:
@@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con,
727 } 728 }
728 add_sock(new_con->sock, new_con); 729 add_sock(new_con->sock, new_con);
729 730
731 linger.l_onoff = 1;
732 linger.l_linger = 0;
733 ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
734 (char *)&linger, sizeof(linger));
735 if (ret < 0)
736 log_print("set socket option SO_LINGER failed");
737
730 log_print("connecting to %d sctp association %d", 738 log_print("connecting to %d sctp association %d",
731 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 739 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
732 740
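
With l_onoff = 1 and l_linger = 0, SO_LINGER turns the eventual close into an abortive one: the connection is torn down immediately rather than drained through a graceful shutdown, which suits a lowcomms association that is about to be replaced. The userspace equivalent of the kernel_setsockopt() call above, as a sketch:

	#include <sys/socket.h>

	/* Abortive close: with a zero linger time, close() resets the
	 * connection instead of lingering in a graceful shutdown. */
	static int set_abortive_close(int fd)
	{
		struct linger ling = { .l_onoff = 1, .l_linger = 0 };

		return setsockopt(fd, SOL_SOCKET, SO_LINGER,
				  &ling, sizeof(ling));
	}
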
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 73f3e4ee4037..49436fa7cd4f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1032,8 +1032,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); 1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
1033 rv = filemap_write_and_wait_range(mapping, lstart, end); 1033 rv = filemap_write_and_wait_range(mapping, lstart, end);
1034 if (rv) 1034 if (rv)
1035 return rv; 1035 goto out;
1036 truncate_inode_pages_range(mapping, lstart, end); 1036 if (rw == WRITE)
1037 truncate_inode_pages_range(mapping, lstart, end);
1037 } 1038 }
1038 1039
1039 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1040 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
@@ -1080,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1080 bh = bh->b_this_page; 1081 bh = bh->b_this_page;
1081 } while(bh != head); 1082 } while(bh != head);
1082 spin_unlock(&sdp->sd_ail_lock); 1083 spin_unlock(&sdp->sd_ail_lock);
1083 gfs2_log_unlock(sdp);
1084 1084
1085 head = bh = page_buffers(page); 1085 head = bh = page_buffers(page);
1086 do { 1086 do {
1087 gfs2_log_lock(sdp);
1088 bd = bh->b_private; 1087 bd = bh->b_private;
1089 if (bd) { 1088 if (bd) {
1090 gfs2_assert_warn(sdp, bd->bd_bh == bh); 1089 gfs2_assert_warn(sdp, bd->bd_bh == bh);
1091 if (!list_empty(&bd->bd_list)) { 1090 if (!list_empty(&bd->bd_list))
1092 if (!buffer_pinned(bh)) 1091 list_del_init(&bd->bd_list);
1093 list_del_init(&bd->bd_list); 1092 bd->bd_bh = NULL;
1094 else
1095 bd = NULL;
1096 }
1097 if (bd)
1098 bd->bd_bh = NULL;
1099 bh->b_private = NULL; 1093 bh->b_private = NULL;
1100 }
1101 gfs2_log_unlock(sdp);
1102 if (bd)
1103 kmem_cache_free(gfs2_bufdata_cachep, bd); 1094 kmem_cache_free(gfs2_bufdata_cachep, bd);
1095 }
1104 1096
1105 bh = bh->b_this_page; 1097 bh = bh->b_this_page;
1106 } while (bh != head); 1098 } while (bh != head);
1099 gfs2_log_unlock(sdp);
1107 1100
1108 return try_to_free_buffers(page); 1101 return try_to_free_buffers(page);
1109 1102
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d324..fa32655449c8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
834 struct gfs2_leaf *leaf; 834 struct gfs2_leaf *leaf;
835 struct gfs2_dirent *dent; 835 struct gfs2_dirent *dent;
836 struct qstr name = { .name = "" }; 836 struct qstr name = { .name = "" };
837 struct timespec tv = CURRENT_TIME;
837 838
838 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); 839 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
839 if (error) 840 if (error)
@@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
850 leaf->lf_entries = 0; 851 leaf->lf_entries = 0;
851 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); 852 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
852 leaf->lf_next = 0; 853 leaf->lf_next = 0;
853 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); 854 leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
855 leaf->lf_dist = cpu_to_be32(1);
856 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
857 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
858 memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
854 dent = (struct gfs2_dirent *)(leaf+1); 859 dent = (struct gfs2_dirent *)(leaf+1);
855 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); 860 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
856 *pbh = bh; 861 *pbh = bh;
@@ -1612,11 +1617,31 @@ out:
1612 return ret; 1617 return ret;
1613} 1618}
1614 1619
1620/**
1621 * dir_new_leaf - Add a new leaf onto hash chain
1622 * @inode: The directory
1623 * @name: The name we are adding
1624 *
1625 * This adds a new dir leaf onto an existing leaf when there is not
1626 * enough space to add a new dir entry. This is a last resort after
1627 * we've expanded the hash table to max size and also split existing
1628 * leaf blocks, so it will only occur for very large directories.
1629 *
1630 * The dist parameter is set to 1 for leaf blocks directly attached
1631 * to the hash table, 2 for one layer of indirection, 3 for two layers
1632 * etc. We are thus able to tell the difference between an old leaf
1633 * with dist set to zero (i.e. "don't know") and a new one where we
1634 * set this information for debug/fsck purposes.
1635 *
1636 * Returns: 0 on success, or -ve on error
1637 */
1638
1615static int dir_new_leaf(struct inode *inode, const struct qstr *name) 1639static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1616{ 1640{
1617 struct buffer_head *bh, *obh; 1641 struct buffer_head *bh, *obh;
1618 struct gfs2_inode *ip = GFS2_I(inode); 1642 struct gfs2_inode *ip = GFS2_I(inode);
1619 struct gfs2_leaf *leaf, *oleaf; 1643 struct gfs2_leaf *leaf, *oleaf;
1644 u32 dist = 1;
1620 int error; 1645 int error;
1621 u32 index; 1646 u32 index;
1622 u64 bn; 1647 u64 bn;
@@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1626 if (error) 1651 if (error)
1627 return error; 1652 return error;
1628 do { 1653 do {
1654 dist++;
1629 oleaf = (struct gfs2_leaf *)obh->b_data; 1655 oleaf = (struct gfs2_leaf *)obh->b_data;
1630 bn = be64_to_cpu(oleaf->lf_next); 1656 bn = be64_to_cpu(oleaf->lf_next);
1631 if (!bn) 1657 if (!bn)
@@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1643 brelse(obh); 1669 brelse(obh);
1644 return -ENOSPC; 1670 return -ENOSPC;
1645 } 1671 }
1672 leaf->lf_dist = cpu_to_be32(dist);
1646 oleaf->lf_next = cpu_to_be64(bh->b_blocknr); 1673 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1647 brelse(bh); 1674 brelse(bh);
1648 brelse(obh); 1675 brelse(obh);
@@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1659 1686
1660/** 1687/**
1661 * gfs2_dir_add - Add new filename into directory 1688 * gfs2_dir_add - Add new filename into directory
1662 * @dip: The GFS2 inode 1689 * @inode: The directory inode
1663 * @filename: The new name 1690 * @name: The new name
1664 * @inode: The inode number of the entry 1691 * @nip: The GFS2 inode to be linked in to the directory
1665 * @type: The type of the entry 1692 * @da: The directory addition info
1693 *
1694 * If the call to gfs2_diradd_alloc_required resulted in there being
1695 * no need to allocate any new directory blocks, then @da will contain
1696 * a pointer to the directory entry and the bh in which it resides. We
1697 * can use that without having to repeat the search. If there was no
1698 * free space, then we must now create more space.
1666 * 1699 *
1667 * Returns: 0 on success, error code on failure 1700 * Returns: 0 on success, error code on failure
1668 */ 1701 */
1669 1702
1670int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1703int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1671 const struct gfs2_inode *nip) 1704 const struct gfs2_inode *nip, struct gfs2_diradd *da)
1672{ 1705{
1673 struct gfs2_inode *ip = GFS2_I(inode); 1706 struct gfs2_inode *ip = GFS2_I(inode);
1674 struct buffer_head *bh; 1707 struct buffer_head *bh = da->bh;
1675 struct gfs2_dirent *dent; 1708 struct gfs2_dirent *dent = da->dent;
1709 struct timespec tv;
1676 struct gfs2_leaf *leaf; 1710 struct gfs2_leaf *leaf;
1677 int error; 1711 int error;
1678 1712
1679 while(1) { 1713 while(1) {
1680 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 1714 if (da->bh == NULL) {
1681 &bh); 1715 dent = gfs2_dirent_search(inode, name,
1716 gfs2_dirent_find_space, &bh);
1717 }
1682 if (dent) { 1718 if (dent) {
1683 if (IS_ERR(dent)) 1719 if (IS_ERR(dent))
1684 return PTR_ERR(dent); 1720 return PTR_ERR(dent);
1685 dent = gfs2_init_dirent(inode, dent, name, bh); 1721 dent = gfs2_init_dirent(inode, dent, name, bh);
1686 gfs2_inum_out(nip, dent); 1722 gfs2_inum_out(nip, dent);
1687 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1724 tv = CURRENT_TIME;
1688 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1689 leaf = (struct gfs2_leaf *)bh->b_data; 1726 leaf = (struct gfs2_leaf *)bh->b_data;
1690 be16_add_cpu(&leaf->lf_entries, 1); 1727 be16_add_cpu(&leaf->lf_entries, 1);
1728 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1729 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1691 } 1730 }
1731 da->dent = NULL;
1732 da->bh = NULL;
1692 brelse(bh); 1733 brelse(bh);
1693 ip->i_entries++; 1734 ip->i_entries++;
1694 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1735 ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
1695 if (S_ISDIR(nip->i_inode.i_mode)) 1736 if (S_ISDIR(nip->i_inode.i_mode))
1696 inc_nlink(&ip->i_inode); 1737 inc_nlink(&ip->i_inode);
1697 mark_inode_dirty(inode); 1738 mark_inode_dirty(inode);
@@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1742 const struct qstr *name = &dentry->d_name; 1783 const struct qstr *name = &dentry->d_name;
1743 struct gfs2_dirent *dent, *prev = NULL; 1784 struct gfs2_dirent *dent, *prev = NULL;
1744 struct buffer_head *bh; 1785 struct buffer_head *bh;
1786 struct timespec tv = CURRENT_TIME;
1745 1787
1746 /* Returns _either_ the entry (if its first in block) or the 1788 /* Returns _either_ the entry (if its first in block) or the
1747 previous entry otherwise */ 1789 previous entry otherwise */
@@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1767 if (!entries) 1809 if (!entries)
1768 gfs2_consist_inode(dip); 1810 gfs2_consist_inode(dip);
1769 leaf->lf_entries = cpu_to_be16(--entries); 1811 leaf->lf_entries = cpu_to_be16(--entries);
1812 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1813 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1770 } 1814 }
1771 brelse(bh); 1815 brelse(bh);
1772 1816
1773 if (!dip->i_entries) 1817 if (!dip->i_entries)
1774 gfs2_consist_inode(dip); 1818 gfs2_consist_inode(dip);
1775 dip->i_entries--; 1819 dip->i_entries--;
1776 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1820 dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
1777 if (S_ISDIR(dentry->d_inode->i_mode)) 1821 if (S_ISDIR(dentry->d_inode->i_mode))
1778 drop_nlink(&dip->i_inode); 1822 drop_nlink(&dip->i_inode);
1779 mark_inode_dirty(&dip->i_inode); 1823 mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2061,36 @@ out:
2017 * gfs2_diradd_alloc_required - find if adding entry will require an allocation 2061 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
2018 * @ip: the file being written to 2062 * @ip: the file being written to
2019 * @filename: the filename that's going to be added 2063 * @filename: the filename that's going to be added
2064 * @da: The structure to return dir alloc info
2020 * 2065 *
2021 * Returns: 1 if alloc required, 0 if not, -ve on error 2066 * Returns: 0 if ok, -ve on error
2022 */ 2067 */
2023 2068
2024int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) 2069int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2070 struct gfs2_diradd *da)
2025{ 2071{
2072 struct gfs2_inode *ip = GFS2_I(inode);
2073 struct gfs2_sbd *sdp = GFS2_SB(inode);
2074 const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
2026 struct gfs2_dirent *dent; 2075 struct gfs2_dirent *dent;
2027 struct buffer_head *bh; 2076 struct buffer_head *bh;
2028 2077
2078 da->nr_blocks = 0;
2079 da->bh = NULL;
2080 da->dent = NULL;
2081
2029 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); 2082 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
2030 if (!dent) { 2083 if (!dent) {
2031 return 1; 2084 da->nr_blocks = sdp->sd_max_dirres;
2085 if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
2086 (GFS2_DIRENT_SIZE(name->len) < extra))
2087 da->nr_blocks = 1;
2088 return 0;
2032 } 2089 }
2033 if (IS_ERR(dent)) 2090 if (IS_ERR(dent))
2034 return PTR_ERR(dent); 2091 return PTR_ERR(dent);
2035 brelse(bh); 2092 da->bh = bh;
2093 da->dent = dent;
2036 return 0; 2094 return 0;
2037} 2095}
2038 2096
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873f..126c65dda028 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
16struct inode; 16struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19struct buffer_head;
20struct gfs2_dirent;
21
22struct gfs2_diradd {
23 unsigned nr_blocks;
24 struct gfs2_dirent *dent;
25 struct buffer_head *bh;
26};
19 27
20extern struct inode *gfs2_dir_search(struct inode *dir, 28extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename, 29 const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 31extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
24 const struct gfs2_inode *ip); 32 const struct gfs2_inode *ip);
25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 33extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
26 const struct gfs2_inode *ip); 34 const struct gfs2_inode *ip, struct gfs2_diradd *da);
35static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
36{
37 if (da->bh)
38 brelse(da->bh);
39 da->bh = NULL;
40}
27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 41extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, 42extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
29 struct file_ra_state *f_ra); 43 struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
33extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 47extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
34 48
35extern int gfs2_diradd_alloc_required(struct inode *dir, 49extern int gfs2_diradd_alloc_required(struct inode *dir,
36 const struct qstr *filename); 50 const struct qstr *filename,
51 struct gfs2_diradd *da);
37extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 52extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
38 struct buffer_head **bhp); 53 struct buffer_head **bhp);
39extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); 54extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
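
The new struct gfs2_diradd turns gfs2_diradd_alloc_required() from a yes/no question into a memo: when the probe finds a free dirent it stashes the entry and its buffer so gfs2_dir_add() can skip a second directory search, and gfs2_dir_no_add() drops the stash on error paths. A self-contained toy of the same probe-once pattern, with hypothetical names:

	#include <stdio.h>
	#include <string.h>

	struct hint { int slot; };	/* -1: no free slot cached */

	static char table[4][8];

	/* Like gfs2_diradd_alloc_required(): answers "must we allocate?"
	 * and, as a side effect, remembers where the free slot is. */
	static int probe(struct hint *h)
	{
		int i;

		h->slot = -1;
		for (i = 0; i < 4; i++) {
			if (!table[i][0]) {
				h->slot = i;
				return 0;	/* space found */
			}
		}
		return 1;			/* caller must grow the table */
	}

	/* Like gfs2_dir_add(): consumes the cached slot instead of
	 * scanning the table a second time. */
	static void insert(const char *name, struct hint *h)
	{
		if (h->slot >= 0) {
			strcpy(table[h->slot], name);
			h->slot = -1;		/* hint consumed */
		}
	}

	int main(void)
	{
		struct hint h;

		if (!probe(&h))
			insert("file1", &h);	/* single scan overall */
		printf("%s\n", table[0]);
		return 0;
	}
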
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6f7a47c05259..ca0be6c69a26 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1552 glock_hash_walk(thaw_glock, sdp); 1552 glock_hash_walk(thaw_glock, sdp);
1553} 1553}
1554 1554
1555static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) 1555static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1556{ 1556{
1557 int ret;
1558 spin_lock(&gl->gl_spin); 1557 spin_lock(&gl->gl_spin);
1559 ret = gfs2_dump_glock(seq, gl); 1558 gfs2_dump_glock(seq, gl);
1560 spin_unlock(&gl->gl_spin); 1559 spin_unlock(&gl->gl_spin);
1561 return ret;
1562} 1560}
1563 1561
1564static void dump_glock_func(struct gfs2_glock *gl) 1562static void dump_glock_func(struct gfs2_glock *gl)
@@ -1647,10 +1645,9 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1647 * @seq: the seq_file struct 1645 * @seq: the seq_file struct
1648 * @gh: the glock holder 1646 * @gh: the glock holder
1649 * 1647 *
1650 * Returns: 0 on success, -ENOBUFS when we run out of space
1651 */ 1648 */
1652 1649
1653static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1650static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1654{ 1651{
1655 struct task_struct *gh_owner = NULL; 1652 struct task_struct *gh_owner = NULL;
1656 char flags_buf[32]; 1653 char flags_buf[32];
@@ -1666,7 +1663,6 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1666 gh_owner ? gh_owner->comm : "(ended)", 1663 gh_owner ? gh_owner->comm : "(ended)",
1667 (void *)gh->gh_ip); 1664 (void *)gh->gh_ip);
1668 rcu_read_unlock(); 1665 rcu_read_unlock();
1669 return 0;
1670} 1666}
1671 1667
1672static const char *gflags2str(char *buf, const struct gfs2_glock *gl) 1668static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1721,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1721 * example. The fields are n = number (id of the object), f = flags, 1717 * example. The fields are n = number (id of the object), f = flags,
1722 * t = type, s = state, r = refcount, e = error, p = pid. 1718 * t = type, s = state, r = refcount, e = error, p = pid.
1723 * 1719 *
1724 * Returns: 0 on success, -ENOBUFS when we run out of space
1725 */ 1720 */
1726 1721
1727int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1722void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1728{ 1723{
1729 const struct gfs2_glock_operations *glops = gl->gl_ops; 1724 const struct gfs2_glock_operations *glops = gl->gl_ops;
1730 unsigned long long dtime; 1725 unsigned long long dtime;
1731 const struct gfs2_holder *gh; 1726 const struct gfs2_holder *gh;
1732 char gflags_buf[32]; 1727 char gflags_buf[32];
1733 int error = 0;
1734 1728
1735 dtime = jiffies - gl->gl_demote_time; 1729 dtime = jiffies - gl->gl_demote_time;
1736 dtime *= 1000000/HZ; /* demote time in uSec */ 1730 dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1747,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1747 atomic_read(&gl->gl_revokes), 1741 atomic_read(&gl->gl_revokes),
1748 (int)gl->gl_lockref.count, gl->gl_hold_time); 1742 (int)gl->gl_lockref.count, gl->gl_hold_time);
1749 1743
1750 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1744 list_for_each_entry(gh, &gl->gl_holders, gh_list)
1751 error = dump_holder(seq, gh); 1745 dump_holder(seq, gh);
1752 if (error) 1746
1753 goto out;
1754 }
1755 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) 1747 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1756 error = glops->go_dump(seq, gl); 1748 glops->go_dump(seq, gl);
1757out:
1758 return error;
1759} 1749}
1760 1750
1761static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) 1751static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1953,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1953 1943
1954static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1944static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
1955{ 1945{
1956 return dump_glock(seq, iter_ptr); 1946 dump_glock(seq, iter_ptr);
1947 return 0;
1957} 1948}
1958 1949
1959static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) 1950static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
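
The int-to-void conversion is safe because seq_file tracks overflow itself: when seq_printf() overruns the buffer, the core notices, allocates a larger buffer and replays the ->show() callback, so the per-helper -ENOBUFS plumbing removed above never carried useful information. A show routine only needs:

	#include <linux/seq_file.h>

	/* Overflow is recorded inside the seq_file and handled by the
	 * core's retry-with-bigger-buffer loop, so we just return 0. */
	static int demo_seq_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "G: s:%s r:%d\n", "EX", 3);
		return 0;
	}
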
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 6647d77366ba..32572f71f027 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 202extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) 203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
204extern __printf(2, 3) 204extern __printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f88dcd925010..3bf0631b5d56 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
133 133
134static void rgrp_go_sync(struct gfs2_glock *gl) 134static void rgrp_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct address_space *metamapping = gfs2_glock2aspace(gl); 136 struct gfs2_sbd *sdp = gl->gl_sbd;
137 struct address_space *mapping = &sdp->sd_aspace;
137 struct gfs2_rgrpd *rgd; 138 struct gfs2_rgrpd *rgd;
138 int error; 139 int error;
139 140
@@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
141 return; 142 return;
142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); 143 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 144
144 gfs2_log_flush(gl->gl_sbd, gl); 145 gfs2_log_flush(sdp, gl);
145 filemap_fdatawrite(metamapping); 146 filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
146 error = filemap_fdatawait(metamapping); 147 error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
147 mapping_set_error(metamapping, error); 148 mapping_set_error(mapping, error);
148 gfs2_ail_empty_gl(gl); 149 gfs2_ail_empty_gl(gl);
149 150
150 spin_lock(&gl->gl_spin); 151 spin_lock(&gl->gl_spin);
@@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
166 167
167static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 168static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 169{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 170 struct gfs2_sbd *sdp = gl->gl_sbd;
171 struct address_space *mapping = &sdp->sd_aspace;
170 172
171 WARN_ON_ONCE(!(flags & DIO_METADATA)); 173 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 174 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 175 truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
174 176
175 if (gl->gl_object) { 177 if (gl->gl_object) {
176 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; 178 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -435,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh)
435 * @seq: The iterator 437 * @seq: The iterator
436 * @ip: the inode 438 * @ip: the inode
437 * 439 *
438 * Returns: 0 on success, -ENOBUFS when we run out of space
439 */ 440 */
440 441
441static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) 442static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
442{ 443{
443 const struct gfs2_inode *ip = gl->gl_object; 444 const struct gfs2_inode *ip = gl->gl_object;
444 if (ip == NULL) 445 if (ip == NULL)
445 return 0; 446 return;
446 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", 447 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
447 (unsigned long long)ip->i_no_formal_ino, 448 (unsigned long long)ip->i_no_formal_ino,
448 (unsigned long long)ip->i_no_addr, 449 (unsigned long long)ip->i_no_addr,
449 IF2DT(ip->i_inode.i_mode), ip->i_flags, 450 IF2DT(ip->i_inode.i_mode), ip->i_flags,
450 (unsigned int)ip->i_diskflags, 451 (unsigned int)ip->i_diskflags,
451 (unsigned long long)i_size_read(&ip->i_inode)); 452 (unsigned long long)i_size_read(&ip->i_inode));
452 return 0;
453} 453}
454 454
455/** 455/**
@@ -558,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
558 .go_unlock = gfs2_rgrp_go_unlock, 558 .go_unlock = gfs2_rgrp_go_unlock,
559 .go_dump = gfs2_rgrp_dump, 559 .go_dump = gfs2_rgrp_dump,
560 .go_type = LM_TYPE_RGRP, 560 .go_type = LM_TYPE_RGRP,
561 .go_flags = GLOF_ASPACE | GLOF_LVB, 561 .go_flags = GLOF_LVB,
562}; 562};
563 563
564const struct gfs2_glock_operations gfs2_trans_glops = { 564const struct gfs2_glock_operations gfs2_trans_glops = {
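
With GLOF_ASPACE gone, every rgrp glock now shares the single per-superblock sd_aspace mapping, so sync and invalidation must be confined to the glock's own byte range (gl_vm.start/end) instead of touching the whole mapping. The ranged write-and-wait idiom used above, in isolation (a sketch, not the filesystem code):

	#include <linux/fs.h>
	#include <linux/pagemap.h>

	/* Flush and wait on one object's slice of a shared mapping. */
	static int demo_sync_range(struct address_space *mapping,
				   loff_t start, loff_t end)
	{
		int error;

		filemap_fdatawrite_range(mapping, start, end);
		error = filemap_fdatawait_range(mapping, start, end);
		if (error)
			mapping_set_error(mapping, error);
		return error;
	}
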
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ba1ea67f4eeb..cf0e34400f71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -93,6 +93,7 @@ struct gfs2_rgrpd {
93 struct gfs2_rgrp_lvb *rd_rgl; 93 struct gfs2_rgrp_lvb *rd_rgl;
94 u32 rd_last_alloc; 94 u32 rd_last_alloc;
95 u32 rd_flags; 95 u32 rd_flags;
96 u32 rd_extfail_pt; /* extent failure point */
96#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ 97#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
97#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ 98#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
98#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ 99#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
@@ -217,7 +218,7 @@ struct gfs2_glock_operations {
217 int (*go_demote_ok) (const struct gfs2_glock *gl); 218 int (*go_demote_ok) (const struct gfs2_glock *gl);
218 int (*go_lock) (struct gfs2_holder *gh); 219 int (*go_lock) (struct gfs2_holder *gh);
219 void (*go_unlock) (struct gfs2_holder *gh); 220 void (*go_unlock) (struct gfs2_holder *gh);
220 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 221 void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
221 void (*go_callback)(struct gfs2_glock *gl, bool remote); 222 void (*go_callback)(struct gfs2_glock *gl, bool remote);
222 const int go_type; 223 const int go_type;
223 const unsigned long go_flags; 224 const unsigned long go_flags;
@@ -350,7 +351,15 @@ struct gfs2_glock {
350 atomic_t gl_ail_count; 351 atomic_t gl_ail_count;
351 atomic_t gl_revokes; 352 atomic_t gl_revokes;
352 struct delayed_work gl_work; 353 struct delayed_work gl_work;
353 struct work_struct gl_delete; 354 union {
355 /* For inode and iopen glocks only */
356 struct work_struct gl_delete;
357 /* For rgrp glocks only */
358 struct {
359 loff_t start;
360 loff_t end;
361 } gl_vm;
362 };
354 struct rcu_head gl_rcu; 363 struct rcu_head gl_rcu;
355}; 364};
356 365
@@ -419,10 +428,13 @@ enum {
419}; 428};
420 429
421struct gfs2_quota_data { 430struct gfs2_quota_data {
431 struct hlist_bl_node qd_hlist;
422 struct list_head qd_list; 432 struct list_head qd_list;
423 struct kqid qd_id; 433 struct kqid qd_id;
434 struct gfs2_sbd *qd_sbd;
424 struct lockref qd_lockref; 435 struct lockref qd_lockref;
425 struct list_head qd_lru; 436 struct list_head qd_lru;
437 unsigned qd_hash;
426 438
427 unsigned long qd_flags; /* QDF_... */ 439 unsigned long qd_flags; /* QDF_... */
428 440
@@ -441,6 +453,7 @@ struct gfs2_quota_data {
441 453
442 u64 qd_sync_gen; 454 u64 qd_sync_gen;
443 unsigned long qd_last_warn; 455 unsigned long qd_last_warn;
456 struct rcu_head qd_rcu;
444}; 457};
445 458
446struct gfs2_trans { 459struct gfs2_trans {
@@ -720,13 +733,15 @@ struct gfs2_sbd {
720 spinlock_t sd_trunc_lock; 733 spinlock_t sd_trunc_lock;
721 734
722 unsigned int sd_quota_slots; 735 unsigned int sd_quota_slots;
723 unsigned int sd_quota_chunks; 736 unsigned long *sd_quota_bitmap;
724 unsigned char **sd_quota_bitmap; 737 spinlock_t sd_bitmap_lock;
725 738
726 u64 sd_quota_sync_gen; 739 u64 sd_quota_sync_gen;
727 740
728 /* Log stuff */ 741 /* Log stuff */
729 742
743 struct address_space sd_aspace;
744
730 spinlock_t sd_log_lock; 745 spinlock_t sd_log_lock;
731 746
732 struct gfs2_trans *sd_log_tr; 747 struct gfs2_trans *sd_log_tr;
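
Swapping the old array of per-chunk byte maps for one flat unsigned long bitmap (sd_quota_bitmap, guarded by the new sd_bitmap_lock) lets the quota code use the standard bitmap helpers. A hedged sketch of slot allocation against such a bitmap, with illustrative names:

	#include <linux/bitmap.h>
	#include <linux/errno.h>

	/* Claim the first free slot in a flat bitmap of nslots bits. */
	static int demo_claim_slot(unsigned long *map, unsigned int nslots)
	{
		unsigned int bit = find_first_zero_bit(map, nslots);

		if (bit >= nslots)
			return -ENOSPC;	/* every slot in use */
		set_bit(bit, map);
		return bit;
	}
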
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7119504159f1..890588c7fb33 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
149 ip = GFS2_I(inode); 149 ip = GFS2_I(inode);
150 150
151 if (!inode) 151 if (!inode)
152 return ERR_PTR(-ENOBUFS); 152 return ERR_PTR(-ENOMEM);
153 153
154 if (inode->i_state & I_NEW) { 154 if (inode->i_state & I_NEW) {
155 struct gfs2_sbd *sdp = GFS2_SB(inode); 155 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
469 brelse(dibh); 469 brelse(dibh);
470} 470}
471 471
472/**
473 * gfs2_trans_da_blks - Calculate number of blocks to link inode
474 * @dip: The directory we are linking into
475 * @da: The dir add information
476 * @nr_inodes: The number of inodes involved
477 *
478 * This calculates the number of blocks we need to reserve in a
479 * transaction to link @nr_inodes into a directory. In most cases
480 * @nr_inodes will be 2 (the directory plus the inode being linked in)
481 * but in the case of a rename, 4 may be required.
482 *
483 * Returns: Number of blocks
484 */
485
486static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
487 const struct gfs2_diradd *da,
488 unsigned nr_inodes)
489{
490 return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
491 (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
492}
493
472static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, 494static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
473 struct gfs2_inode *ip, int arq) 495 struct gfs2_inode *ip, struct gfs2_diradd *da)
474{ 496{
475 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 497 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
476 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 498 struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
477 int error; 499 int error;
478 500
479 if (arq) { 501 if (da->nr_blocks) {
480 error = gfs2_quota_lock_check(dip); 502 error = gfs2_quota_lock_check(dip);
481 if (error) 503 if (error)
482 goto fail_quota_locks; 504 goto fail_quota_locks;
@@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
485 if (error) 507 if (error)
486 goto fail_quota_locks; 508 goto fail_quota_locks;
487 509
488 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 510 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
489 dip->i_rgd->rd_length +
490 2 * RES_DINODE +
491 RES_STATFS + RES_QUOTA, 0);
492 if (error) 511 if (error)
493 goto fail_ipreserv; 512 goto fail_ipreserv;
494 } else { 513 } else {
@@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
497 goto fail_quota_locks; 516 goto fail_quota_locks;
498 } 517 }
499 518
500 error = gfs2_dir_add(&dip->i_inode, name, ip); 519 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
501 if (error) 520 if (error)
502 goto fail_end_trans; 521 goto fail_end_trans;
503 522
@@ -560,7 +579,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
560 struct dentry *d; 579 struct dentry *d;
561 int error; 580 int error;
562 u32 aflags = 0; 581 u32 aflags = 0;
563 int arq; 582 struct gfs2_diradd da = { .bh = NULL, };
564 583
565 if (!name->len || name->len > GFS2_FNAMESIZE) 584 if (!name->len || name->len > GFS2_FNAMESIZE)
566 return -ENAMETOOLONG; 585 return -ENAMETOOLONG;
@@ -585,6 +604,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
585 error = PTR_ERR(inode); 604 error = PTR_ERR(inode);
586 if (!IS_ERR(inode)) { 605 if (!IS_ERR(inode)) {
587 d = d_splice_alias(inode, dentry); 606 d = d_splice_alias(inode, dentry);
607 error = PTR_ERR(d);
608 if (IS_ERR(d))
609 goto fail_gunlock;
588 error = 0; 610 error = 0;
589 if (file) { 611 if (file) {
590 if (S_ISREG(inode->i_mode)) { 612 if (S_ISREG(inode->i_mode)) {
@@ -602,7 +624,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
602 goto fail_gunlock; 624 goto fail_gunlock;
603 } 625 }
604 626
605 arq = error = gfs2_diradd_alloc_required(dir, name); 627 error = gfs2_diradd_alloc_required(dir, name, &da);
606 if (error < 0) 628 if (error < 0)
607 goto fail_gunlock; 629 goto fail_gunlock;
608 630
@@ -690,7 +712,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
690 if (error) 712 if (error)
691 goto fail_gunlock3; 713 goto fail_gunlock3;
692 714
693 error = link_dinode(dip, name, ip, arq); 715 error = link_dinode(dip, name, ip, &da);
694 if (error) 716 if (error)
695 goto fail_gunlock3; 717 goto fail_gunlock3;
696 718
@@ -719,6 +741,7 @@ fail_free_inode:
719 free_inode_nonrcu(inode); 741 free_inode_nonrcu(inode);
720 inode = NULL; 742 inode = NULL;
721fail_gunlock: 743fail_gunlock:
744 gfs2_dir_no_add(&da);
722 gfs2_glock_dq_uninit(ghs); 745 gfs2_glock_dq_uninit(ghs);
723 if (inode && !IS_ERR(inode)) { 746 if (inode && !IS_ERR(inode)) {
724 clear_nlink(inode); 747 clear_nlink(inode);
@@ -779,6 +802,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
779 } 802 }
780 803
781 d = d_splice_alias(inode, dentry); 804 d = d_splice_alias(inode, dentry);
805 if (IS_ERR(d)) {
806 iput(inode);
807 gfs2_glock_dq_uninit(&gh);
808 return d;
809 }
782 if (file && S_ISREG(inode->i_mode)) 810 if (file && S_ISREG(inode->i_mode))
783 error = finish_open(file, dentry, gfs2_open_common, opened); 811 error = finish_open(file, dentry, gfs2_open_common, opened);
784 812
@@ -817,7 +845,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
817 struct gfs2_inode *ip = GFS2_I(inode); 845 struct gfs2_inode *ip = GFS2_I(inode);
818 struct gfs2_holder ghs[2]; 846 struct gfs2_holder ghs[2];
819 struct buffer_head *dibh; 847 struct buffer_head *dibh;
820 int alloc_required; 848 struct gfs2_diradd da = { .bh = NULL, };
821 int error; 849 int error;
822 850
823 if (S_ISDIR(inode->i_mode)) 851 if (S_ISDIR(inode->i_mode))
@@ -872,13 +900,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
872 if (ip->i_inode.i_nlink == (u32)-1) 900 if (ip->i_inode.i_nlink == (u32)-1)
873 goto out_gunlock; 901 goto out_gunlock;
874 902
875 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); 903 error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
876 if (error < 0) 904 if (error < 0)
877 goto out_gunlock; 905 goto out_gunlock;
878 error = 0;
879 906
880 if (alloc_required) { 907 if (da.nr_blocks) {
881 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 908 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
882 error = gfs2_quota_lock_check(dip); 909 error = gfs2_quota_lock_check(dip);
883 if (error) 910 if (error)
884 goto out_gunlock; 911 goto out_gunlock;
@@ -887,10 +914,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
887 if (error) 914 if (error)
888 goto out_gunlock_q; 915 goto out_gunlock_q;
889 916
890 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 917 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
891 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
892 2 * RES_DINODE + RES_STATFS +
893 RES_QUOTA, 0);
894 if (error) 918 if (error)
895 goto out_ipres; 919 goto out_ipres;
896 } else { 920 } else {
@@ -903,7 +927,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
903 if (error) 927 if (error)
904 goto out_end_trans; 928 goto out_end_trans;
905 929
906 error = gfs2_dir_add(dir, &dentry->d_name, ip); 930 error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
907 if (error) 931 if (error)
908 goto out_brelse; 932 goto out_brelse;
909 933
@@ -919,12 +943,13 @@ out_brelse:
919out_end_trans: 943out_end_trans:
920 gfs2_trans_end(sdp); 944 gfs2_trans_end(sdp);
921out_ipres: 945out_ipres:
922 if (alloc_required) 946 if (da.nr_blocks)
923 gfs2_inplace_release(dip); 947 gfs2_inplace_release(dip);
924out_gunlock_q: 948out_gunlock_q:
925 if (alloc_required) 949 if (da.nr_blocks)
926 gfs2_quota_unlock(dip); 950 gfs2_quota_unlock(dip);
927out_gunlock: 951out_gunlock:
952 gfs2_dir_no_add(&da);
928 gfs2_glock_dq(ghs + 1); 953 gfs2_glock_dq(ghs + 1);
929out_child: 954out_child:
930 gfs2_glock_dq(ghs); 955 gfs2_glock_dq(ghs);
@@ -1254,7 +1279,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1254 struct gfs2_rgrpd *nrgd; 1279 struct gfs2_rgrpd *nrgd;
1255 unsigned int num_gh; 1280 unsigned int num_gh;
1256 int dir_rename = 0; 1281 int dir_rename = 0;
1257 int alloc_required = 0; 1282 struct gfs2_diradd da = { .nr_blocks = 0, };
1258 unsigned int x; 1283 unsigned int x;
1259 int error; 1284 int error;
1260 1285
@@ -1388,14 +1413,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1388 goto out_gunlock; 1413 goto out_gunlock;
1389 } 1414 }
1390 1415
1391 if (nip == NULL) 1416 if (nip == NULL) {
1392 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 1417 error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
1393 error = alloc_required; 1418 if (error)
1394 if (error < 0) 1419 goto out_gunlock;
1395 goto out_gunlock; 1420 }
1396 1421
1397 if (alloc_required) { 1422 if (da.nr_blocks) {
1398 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 1423 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
1399 error = gfs2_quota_lock_check(ndip); 1424 error = gfs2_quota_lock_check(ndip);
1400 if (error) 1425 if (error)
1401 goto out_gunlock; 1426 goto out_gunlock;
@@ -1404,10 +1429,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1404 if (error) 1429 if (error)
1405 goto out_gunlock_q; 1430 goto out_gunlock_q;
1406 1431
1407 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1432 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
1408 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + 1433 4 * RES_LEAF + 4, 0);
1409 4 * RES_DINODE + 4 * RES_LEAF +
1410 RES_STATFS + RES_QUOTA + 4, 0);
1411 if (error) 1434 if (error)
1412 goto out_ipreserv; 1435 goto out_ipreserv;
1413 } else { 1436 } else {
@@ -1441,19 +1464,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1441 if (error) 1464 if (error)
1442 goto out_end_trans; 1465 goto out_end_trans;
1443 1466
1444 error = gfs2_dir_add(ndir, &ndentry->d_name, ip); 1467 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
1445 if (error) 1468 if (error)
1446 goto out_end_trans; 1469 goto out_end_trans;
1447 1470
1448out_end_trans: 1471out_end_trans:
1449 gfs2_trans_end(sdp); 1472 gfs2_trans_end(sdp);
1450out_ipreserv: 1473out_ipreserv:
1451 if (alloc_required) 1474 if (da.nr_blocks)
1452 gfs2_inplace_release(ndip); 1475 gfs2_inplace_release(ndip);
1453out_gunlock_q: 1476out_gunlock_q:
1454 if (alloc_required) 1477 if (da.nr_blocks)
1455 gfs2_quota_unlock(ndip); 1478 gfs2_quota_unlock(ndip);
1456out_gunlock: 1479out_gunlock:
1480 gfs2_dir_no_add(&da);
1457 while (x--) { 1481 while (x--) {
1458 gfs2_glock_dq(ghs + x); 1482 gfs2_glock_dq(ghs + x);
1459 gfs2_holder_uninit(ghs + x); 1483 gfs2_holder_uninit(ghs + x);
@@ -1607,10 +1631,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1607 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) 1631 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
1608 ogid = ngid = NO_GID_QUOTA_CHANGE; 1632 ogid = ngid = NO_GID_QUOTA_CHANGE;
1609 1633
1610 error = gfs2_quota_lock(ip, nuid, ngid); 1634 error = get_write_access(inode);
1611 if (error) 1635 if (error)
1612 return error; 1636 return error;
1613 1637
1638 error = gfs2_rs_alloc(ip);
1639 if (error)
1640 goto out;
1641
1642 error = gfs2_rindex_update(sdp);
1643 if (error)
1644 goto out;
1645
1646 error = gfs2_quota_lock(ip, nuid, ngid);
1647 if (error)
1648 goto out;
1649
1614 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1650 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1615 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1651 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1616 error = gfs2_quota_check(ip, nuid, ngid); 1652 error = gfs2_quota_check(ip, nuid, ngid);
@@ -1637,6 +1673,8 @@ out_end_trans:
1637 gfs2_trans_end(sdp); 1673 gfs2_trans_end(sdp);
1638out_gunlock_q: 1674out_gunlock_q:
1639 gfs2_quota_unlock(ip); 1675 gfs2_quota_unlock(ip);
1676out:
1677 put_write_access(inode);
1640 return error; 1678 return error;
1641} 1679}
1642 1680
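
[Editor's note] The inode.c hunks above replace the old alloc_required integer with a struct gfs2_diradd: gfs2_diradd_alloc_required() now records how many blocks a directory insert will need, da.nr_blocks doubles as the "allocation required" flag and as the reservation target (ap.target = da.nr_blocks), and every unwind path calls the new gfs2_dir_no_add() so any state cached by the probe is released exactly once. A standalone sketch of that probe/commit/release shape; apart from the nr_blocks field and the function names echoed from the hunk, everything here is invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct diradd {
	unsigned nr_blocks;	/* 0 means the dirent fits in the leaf as-is */
	void *saved_leaf;	/* hypothetical state cached by the probe */
};

static int diradd_alloc_required(const char *name, struct diradd *da)
{
	/* toy rule: pretend long names overflow the leaf block */
	da->nr_blocks = strlen(name) > 8 ? 2 : 0;
	da->saved_leaf = NULL;
	return 0;
}

static void dir_no_add(struct diradd *da)
{
	/* idempotent: frees only whatever the probe cached */
	free(da->saved_leaf);
	da->saved_leaf = NULL;
}

int main(void)
{
	struct diradd da = { .nr_blocks = 0, };

	if (diradd_alloc_required("averylongname", &da) == 0 && da.nr_blocks)
		printf("reserve %u blocks before the insert\n", da.nr_blocks);
	dir_no_add(&da);	/* runs on success and error paths alike */
	return 0;
}

Keeping the release step idempotent is what lets gfs2_rename() call it unconditionally from out_gunlock, on success and failure alike.
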
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec6..58f06400b7b8 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); 83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
84 clear_bit(GBF_FULL, &bi->bi_flags); 84 clear_bit(GBF_FULL, &bi->bi_flags);
85 rgd->rd_free_clone = rgd->rd_free; 85 rgd->rd_free_clone = rgd->rd_free;
86 rgd->rd_extfail_pt = rgd->rd_free;
86} 87}
87 88
88/** 89/**
@@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
588static void gfs2_meta_sync(struct gfs2_glock *gl) 589static void gfs2_meta_sync(struct gfs2_glock *gl)
589{ 590{
590 struct address_space *mapping = gfs2_glock2aspace(gl); 591 struct address_space *mapping = gfs2_glock2aspace(gl);
592 struct gfs2_sbd *sdp = gl->gl_sbd;
591 int error; 593 int error;
592 594
595 if (mapping == NULL)
596 mapping = &sdp->sd_aspace;
597
593 filemap_fdatawrite(mapping); 598 filemap_fdatawrite(mapping);
594 error = filemap_fdatawait(mapping); 599 error = filemap_fdatawait(mapping);
595 600
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0650db2541ef..c272e73063de 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
76 76
77 gfs2_str2qstr(&gfs2_qdot, "."); 77 gfs2_str2qstr(&gfs2_qdot, ".");
78 gfs2_str2qstr(&gfs2_qdotdot, ".."); 78 gfs2_str2qstr(&gfs2_qdotdot, "..");
79 gfs2_quota_hash_init();
79 80
80 error = gfs2_sys_init(); 81 error = gfs2_sys_init();
81 if (error) 82 if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 52f177be3bf8..c7f24690ed05 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
116 unsigned long index; 116 unsigned long index;
117 unsigned int bufnum; 117 unsigned int bufnum;
118 118
119 if (mapping == NULL)
120 mapping = &sdp->sd_aspace;
121
119 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; 122 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
120 index = blkno >> shift; /* convert block to page */ 123 index = blkno >> shift; /* convert block to page */
121 bufnum = blkno - (index << shift); /* block buf index within page */ 124 bufnum = blkno - (index << shift); /* block buf index within page */
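
[Editor's note] The gfs2_meta_sync() and gfs2_getbuf() hunks above add the same guard: a glock whose type has no private address space now falls back to the shared per-superblock sd_aspace mapping (initialised in the ops_fstype.c hunk further down). The control flow is just pick-the-default-when-NULL; a minimal standalone illustration, with made-up names:

#include <stdio.h>

struct mapping { const char *name; };

static struct mapping shared_aspace = { "sd_aspace" };	/* per-sb default */

static struct mapping *pick_mapping(struct mapping *own)
{
	return own ? own : &shared_aspace;
}

int main(void)
{
	struct mapping inode_map = { "per-glock" };

	printf("%s\n", pick_mapping(&inode_map)->name);	/* per-glock */
	printf("%s\n", pick_mapping(NULL)->name);	/* sd_aspace */
	return 0;
}

With the default applied at the point of use, neither writeback nor buffer lookup needs a special case for mappingless glocks.
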
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fa88314f5c..1e712b566d76 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -36,6 +36,7 @@
36#include "log.h" 36#include "log.h"
37#include "quota.h" 37#include "quota.h"
38#include "dir.h" 38#include "dir.h"
39#include "meta_io.h"
39#include "trace_gfs2.h" 40#include "trace_gfs2.h"
40 41
41#define DO 0 42#define DO 0
@@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62static struct gfs2_sbd *init_sbd(struct super_block *sb) 63static struct gfs2_sbd *init_sbd(struct super_block *sb)
63{ 64{
64 struct gfs2_sbd *sdp; 65 struct gfs2_sbd *sdp;
66 struct address_space *mapping;
65 67
66 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); 68 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
67 if (!sdp) 69 if (!sdp)
@@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
97 init_waitqueue_head(&sdp->sd_quota_wait); 99 init_waitqueue_head(&sdp->sd_quota_wait);
98 INIT_LIST_HEAD(&sdp->sd_trunc_list); 100 INIT_LIST_HEAD(&sdp->sd_trunc_list);
99 spin_lock_init(&sdp->sd_trunc_lock); 101 spin_lock_init(&sdp->sd_trunc_lock);
102 spin_lock_init(&sdp->sd_bitmap_lock);
103
104 mapping = &sdp->sd_aspace;
105
106 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops;
108 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS);
111 mapping->private_data = NULL;
112 mapping->backing_dev_info = sb->s_bdi;
113 mapping->writeback_index = 0;
100 114
101 spin_lock_init(&sdp->sd_log_lock); 115 spin_lock_init(&sdp->sd_log_lock);
102 atomic_set(&sdp->sd_log_pinned, 0); 116 atomic_set(&sdp->sd_log_pinned, 0);
@@ -217,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
217 231
218 page = alloc_page(GFP_NOFS); 232 page = alloc_page(GFP_NOFS);
219 if (unlikely(!page)) 233 if (unlikely(!page))
220 return -ENOBUFS; 234 return -ENOMEM;
221 235
222 ClearPageUptodate(page); 236 ClearPageUptodate(page);
223 ClearPageDirty(page); 237 ClearPageDirty(page);
@@ -956,40 +970,6 @@ fail:
956 return error; 970 return error;
957} 971}
958 972
959static int init_threads(struct gfs2_sbd *sdp, int undo)
960{
961 struct task_struct *p;
962 int error = 0;
963
964 if (undo)
965 goto fail_quotad;
966
967 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
968 if (IS_ERR(p)) {
969 error = PTR_ERR(p);
970 fs_err(sdp, "can't start logd thread: %d\n", error);
971 return error;
972 }
973 sdp->sd_logd_process = p;
974
975 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
976 if (IS_ERR(p)) {
977 error = PTR_ERR(p);
978 fs_err(sdp, "can't start quotad thread: %d\n", error);
979 goto fail;
980 }
981 sdp->sd_quotad_process = p;
982
983 return 0;
984
985
986fail_quotad:
987 kthread_stop(sdp->sd_quotad_process);
988fail:
989 kthread_stop(sdp->sd_logd_process);
990 return error;
991}
992
993static const match_table_t nolock_tokens = { 973static const match_table_t nolock_tokens = {
994 { Opt_jid, "jid=%d\n", }, 974 { Opt_jid, "jid=%d\n", },
995 { Opt_err, NULL }, 975 { Opt_err, NULL },
@@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1254 goto fail_per_node; 1234 goto fail_per_node;
1255 } 1235 }
1256 1236
1257 error = init_threads(sdp, DO);
1258 if (error)
1259 goto fail_per_node;
1260
1261 if (!(sb->s_flags & MS_RDONLY)) { 1237 if (!(sb->s_flags & MS_RDONLY)) {
1262 error = gfs2_make_fs_rw(sdp); 1238 error = gfs2_make_fs_rw(sdp);
1263 if (error) { 1239 if (error) {
1264 fs_err(sdp, "can't make FS RW: %d\n", error); 1240 fs_err(sdp, "can't make FS RW: %d\n", error);
1265 goto fail_threads; 1241 goto fail_per_node;
1266 } 1242 }
1267 } 1243 }
1268 1244
@@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1270 gfs2_online_uevent(sdp); 1246 gfs2_online_uevent(sdp);
1271 return 0; 1247 return 0;
1272 1248
1273fail_threads:
1274 init_threads(sdp, UNDO);
1275fail_per_node: 1249fail_per_node:
1276 init_per_node(sdp, UNDO); 1250 init_per_node(sdp, UNDO);
1277fail_inodes: 1251fail_inodes:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 98236d0df3ca..8bec0e3192dd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -52,6 +52,11 @@
52#include <linux/dqblk_xfs.h> 52#include <linux/dqblk_xfs.h>
53#include <linux/lockref.h> 53#include <linux/lockref.h>
54#include <linux/list_lru.h> 54#include <linux/list_lru.h>
55#include <linux/rcupdate.h>
56#include <linux/rculist_bl.h>
57#include <linux/bit_spinlock.h>
58#include <linux/jhash.h>
59#include <linux/vmalloc.h>
55 60
56#include "gfs2.h" 61#include "gfs2.h"
57#include "incore.h" 62#include "incore.h"
@@ -67,16 +72,44 @@
67#include "inode.h" 72#include "inode.h"
68#include "util.h" 73#include "util.h"
69 74
70struct gfs2_quota_change_host { 75#define GFS2_QD_HASH_SHIFT 12
71 u64 qc_change; 76#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
72 u32 qc_flags; /* GFS2_QCF_... */ 77#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
73 struct kqid qc_id;
74};
75 78
76/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ 79/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
80/* -> sd_bitmap_lock */
77static DEFINE_SPINLOCK(qd_lock); 81static DEFINE_SPINLOCK(qd_lock);
78struct list_lru gfs2_qd_lru; 82struct list_lru gfs2_qd_lru;
79 83
84static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
85
86static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
87 const struct kqid qid)
88{
89 unsigned int h;
90
91 h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
92 h = jhash(&qid, sizeof(struct kqid), h);
93
94 return h & GFS2_QD_HASH_MASK;
95}
96
97static inline void spin_lock_bucket(unsigned int hash)
98{
99 hlist_bl_lock(&qd_hash_table[hash]);
100}
101
102static inline void spin_unlock_bucket(unsigned int hash)
103{
104 hlist_bl_unlock(&qd_hash_table[hash]);
105}
106
107static void gfs2_qd_dealloc(struct rcu_head *rcu)
108{
109 struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
110 kmem_cache_free(gfs2_quotad_cachep, qd);
111}
112
80static void gfs2_qd_dispose(struct list_head *list) 113static void gfs2_qd_dispose(struct list_head *list)
81{ 114{
82 struct gfs2_quota_data *qd; 115 struct gfs2_quota_data *qd;
@@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list)
93 list_del(&qd->qd_list); 126 list_del(&qd->qd_list);
94 spin_unlock(&qd_lock); 127 spin_unlock(&qd_lock);
95 128
129 spin_lock_bucket(qd->qd_hash);
130 hlist_bl_del_rcu(&qd->qd_hlist);
131 spin_unlock_bucket(qd->qd_hash);
132
96 gfs2_assert_warn(sdp, !qd->qd_change); 133 gfs2_assert_warn(sdp, !qd->qd_change);
97 gfs2_assert_warn(sdp, !qd->qd_slot_count); 134 gfs2_assert_warn(sdp, !qd->qd_slot_count);
98 gfs2_assert_warn(sdp, !qd->qd_bh_count); 135 gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list)
101 atomic_dec(&sdp->sd_quota_count); 138 atomic_dec(&sdp->sd_quota_count);
102 139
103 /* Delete it from the common reclaim list */ 140 /* Delete it from the common reclaim list */
104 kmem_cache_free(gfs2_quotad_cachep, qd); 141 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
105 } 142 }
106} 143}
107 144
@@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
171 return offset; 208 return offset;
172} 209}
173 210
174static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, 211static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
175 struct gfs2_quota_data **qdp)
176{ 212{
177 struct gfs2_quota_data *qd; 213 struct gfs2_quota_data *qd;
178 int error; 214 int error;
179 215
180 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); 216 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
181 if (!qd) 217 if (!qd)
182 return -ENOMEM; 218 return NULL;
183 219
220 qd->qd_sbd = sdp;
184 qd->qd_lockref.count = 1; 221 qd->qd_lockref.count = 1;
185 spin_lock_init(&qd->qd_lockref.lock); 222 spin_lock_init(&qd->qd_lockref.lock);
186 qd->qd_id = qid; 223 qd->qd_id = qid;
187 qd->qd_slot = -1; 224 qd->qd_slot = -1;
188 INIT_LIST_HEAD(&qd->qd_lru); 225 INIT_LIST_HEAD(&qd->qd_lru);
226 qd->qd_hash = hash;
189 227
190 error = gfs2_glock_get(sdp, qd2index(qd), 228 error = gfs2_glock_get(sdp, qd2index(qd),
191 &gfs2_quota_glops, CREATE, &qd->qd_gl); 229 &gfs2_quota_glops, CREATE, &qd->qd_gl);
192 if (error) 230 if (error)
193 goto fail; 231 goto fail;
194 232
195 *qdp = qd; 233 return qd;
196
197 return 0;
198 234
199fail: 235fail:
200 kmem_cache_free(gfs2_quotad_cachep, qd); 236 kmem_cache_free(gfs2_quotad_cachep, qd);
201 return error; 237 return NULL;
202} 238}
203 239
204static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, 240static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
205 struct gfs2_quota_data **qdp) 241 const struct gfs2_sbd *sdp,
242 struct kqid qid)
206{ 243{
207 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 244 struct gfs2_quota_data *qd;
208 int error, found; 245 struct hlist_bl_node *h;
209
210 *qdp = NULL;
211 246
212 for (;;) { 247 hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
213 found = 0; 248 if (!qid_eq(qd->qd_id, qid))
214 spin_lock(&qd_lock); 249 continue;
215 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 250 if (qd->qd_sbd != sdp)
216 if (qid_eq(qd->qd_id, qid) && 251 continue;
217 lockref_get_not_dead(&qd->qd_lockref)) { 252 if (lockref_get_not_dead(&qd->qd_lockref)) {
218 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 253 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
219 found = 1; 254 return qd;
220 break;
221 }
222 } 255 }
256 }
223 257
224 if (!found) 258 return NULL;
225 qd = NULL; 259}
226 260
227 if (!qd && new_qd) {
228 qd = new_qd;
229 list_add(&qd->qd_list, &sdp->sd_quota_list);
230 atomic_inc(&sdp->sd_quota_count);
231 new_qd = NULL;
232 }
233 261
234 spin_unlock(&qd_lock); 262static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
263 struct gfs2_quota_data **qdp)
264{
265 struct gfs2_quota_data *qd, *new_qd;
266 unsigned int hash = gfs2_qd_hash(sdp, qid);
235 267
236 if (qd) { 268 rcu_read_lock();
237 if (new_qd) { 269 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
238 gfs2_glock_put(new_qd->qd_gl); 270 rcu_read_unlock();
239 kmem_cache_free(gfs2_quotad_cachep, new_qd);
240 }
241 *qdp = qd;
242 return 0;
243 }
244 271
245 error = qd_alloc(sdp, qid, &new_qd); 272 if (qd)
246 if (error) 273 return 0;
247 return error; 274
275 new_qd = qd_alloc(hash, sdp, qid);
276 if (!new_qd)
277 return -ENOMEM;
278
279 spin_lock(&qd_lock);
280 spin_lock_bucket(hash);
281 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
282 if (qd == NULL) {
283 *qdp = new_qd;
284 list_add(&new_qd->qd_list, &sdp->sd_quota_list);
285 hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
286 atomic_inc(&sdp->sd_quota_count);
248 } 287 }
288 spin_unlock_bucket(hash);
289 spin_unlock(&qd_lock);
290
291 if (qd) {
292 gfs2_glock_put(new_qd->qd_gl);
293 kmem_cache_free(gfs2_quotad_cachep, new_qd);
294 }
295
296 return 0;
249} 297}
250 298
299
251static void qd_hold(struct gfs2_quota_data *qd) 300static void qd_hold(struct gfs2_quota_data *qd)
252{ 301{
253 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 302 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
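
[Editor's note] The rewritten qd_get() above is the classic optimistic-allocation pattern: look up locklessly under RCU, allocate a candidate outside all locks, then repeat the lookup under qd_lock plus the bucket lock and discard the candidate if another CPU inserted first. A user-space analogue, with a single mutex standing in for RCU and the bit-spinlocks:

#include <pthread.h>
#include <stdlib.h>

struct qd {
	int id;
	struct qd *next;
};

static struct qd *bucket;
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static struct qd *search(int id)
{
	struct qd *q;

	for (q = bucket; q; q = q->next)
		if (q->id == id)
			return q;
	return NULL;
}

static struct qd *qd_get(int id)
{
	struct qd *q, *new_q;

	q = search(id);			/* lockless (RCU) in the kernel */
	if (q)
		return q;

	new_q = calloc(1, sizeof(*new_q));	/* allocate outside the lock */
	if (!new_q)
		return NULL;
	new_q->id = id;

	pthread_mutex_lock(&bucket_lock);
	q = search(id);			/* recheck: another thread may have won */
	if (!q) {
		new_q->next = bucket;
		bucket = new_q;
		q = new_q;
		new_q = NULL;
	}
	pthread_mutex_unlock(&bucket_lock);

	free(new_q);			/* lost the race: discard our copy */
	return q;
}

int main(void)
{
	return qd_get(42) ? 0 : 1;	/* first call inserts, later calls hit */
}

In the kernel version the losing candidate also has to drop the glock reference taken in qd_alloc(), which is why both gfs2_glock_put() and kmem_cache_free() run on that path.
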
@@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd)
268 317
269static int slot_get(struct gfs2_quota_data *qd) 318static int slot_get(struct gfs2_quota_data *qd)
270{ 319{
271 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 320 struct gfs2_sbd *sdp = qd->qd_sbd;
272 unsigned int c, o = 0, b; 321 unsigned int bit;
273 unsigned char byte = 0; 322 int error = 0;
274 323
275 spin_lock(&qd_lock); 324 spin_lock(&sdp->sd_bitmap_lock);
325 if (qd->qd_slot_count != 0)
326 goto out;
276 327
277 if (qd->qd_slot_count++) { 328 error = -ENOSPC;
278 spin_unlock(&qd_lock); 329 bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
279 return 0; 330 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit;
333out:
334 qd->qd_slot_count++;
280 } 335 }
336 spin_unlock(&sdp->sd_bitmap_lock);
281 337
282 for (c = 0; c < sdp->sd_quota_chunks; c++) 338 return error;
283 for (o = 0; o < PAGE_SIZE; o++) {
284 byte = sdp->sd_quota_bitmap[c][o];
285 if (byte != 0xFF)
286 goto found;
287 }
288
289 goto fail;
290
291found:
292 for (b = 0; b < 8; b++)
293 if (!(byte & (1 << b)))
294 break;
295 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
296
297 if (qd->qd_slot >= sdp->sd_quota_slots)
298 goto fail;
299
300 sdp->sd_quota_bitmap[c][o] |= 1 << b;
301
302 spin_unlock(&qd_lock);
303
304 return 0;
305
306fail:
307 qd->qd_slot_count--;
308 spin_unlock(&qd_lock);
309 return -ENOSPC;
310} 339}
311 340
312static void slot_hold(struct gfs2_quota_data *qd) 341static void slot_hold(struct gfs2_quota_data *qd)
313{ 342{
314 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 343 struct gfs2_sbd *sdp = qd->qd_sbd;
315 344
316 spin_lock(&qd_lock); 345 spin_lock(&sdp->sd_bitmap_lock);
317 gfs2_assert(sdp, qd->qd_slot_count); 346 gfs2_assert(sdp, qd->qd_slot_count);
318 qd->qd_slot_count++; 347 qd->qd_slot_count++;
319 spin_unlock(&qd_lock); 348 spin_unlock(&sdp->sd_bitmap_lock);
320}
321
322static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
323 unsigned int bit, int new_value)
324{
325 unsigned int c, o, b = bit;
326 int old_value;
327
328 c = b / (8 * PAGE_SIZE);
329 b %= 8 * PAGE_SIZE;
330 o = b / 8;
331 b %= 8;
332
333 old_value = (bitmap[c][o] & (1 << b));
334 gfs2_assert_withdraw(sdp, !old_value != !new_value);
335
336 if (new_value)
337 bitmap[c][o] |= 1 << b;
338 else
339 bitmap[c][o] &= ~(1 << b);
340} 349}
341 350
342static void slot_put(struct gfs2_quota_data *qd) 351static void slot_put(struct gfs2_quota_data *qd)
343{ 352{
344 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 353 struct gfs2_sbd *sdp = qd->qd_sbd;
345 354
346 spin_lock(&qd_lock); 355 spin_lock(&sdp->sd_bitmap_lock);
347 gfs2_assert(sdp, qd->qd_slot_count); 356 gfs2_assert(sdp, qd->qd_slot_count);
348 if (!--qd->qd_slot_count) { 357 if (!--qd->qd_slot_count) {
349 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 358 BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
350 qd->qd_slot = -1; 359 qd->qd_slot = -1;
351 } 360 }
352 spin_unlock(&qd_lock); 361 spin_unlock(&sdp->sd_bitmap_lock);
353} 362}
354 363
355static int bh_get(struct gfs2_quota_data *qd) 364static int bh_get(struct gfs2_quota_data *qd)
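
[Editor's note] slot_get() now treats the quota slots as one flat unsigned long bitmap guarded by sd_bitmap_lock, replacing the page-of-bytes walk and gfs2_icbit_munge(). Note one subtlety in the hunk as rendered: nothing resets error to 0 when a free bit is found, so a first-time allocation would still report -ENOSPC; the success path needs an error = 0 before the out: label, and the sketch below includes the equivalent. A standalone sketch with a naive find_first_zero_bit (the real kernel helper scans word-at-a-time):

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
#define NSLOTS 128

static unsigned long bitmap[NSLOTS / BITS_PER_LONG];

static unsigned find_first_zero_bit(const unsigned long *map, unsigned size)
{
	unsigned bit;

	for (bit = 0; bit < size; bit++)
		if (!(map[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG))))
			return bit;
	return size;	/* the kernel helper also returns size when full */
}

static int slot_get(int *slot)
{
	unsigned bit = find_first_zero_bit(bitmap, NSLOTS);

	if (bit >= NSLOTS)
		return -1;		/* -ENOSPC in the kernel */
	bitmap[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
	*slot = (int)bit;
	return 0;			/* success must clear the error */
}

int main(void)
{
	int s;

	if (slot_get(&s) == 0)
		printf("got slot %d\n", s);	/* prints: got slot 0 */
	return 0;
}
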
@@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
427 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 436 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
428 set_bit(QDF_LOCKED, &qd->qd_flags); 437 set_bit(QDF_LOCKED, &qd->qd_flags);
429 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
430 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 slot_hold(qd);
431 qd->qd_slot_count++;
432 return 1; 440 return 1;
433} 441}
434 442
@@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1214 return error; 1222 return error;
1215} 1223}
1216 1224
1217static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
1218{
1219 const struct gfs2_quota_change *str = buf;
1220
1221 qc->qc_change = be64_to_cpu(str->qc_change);
1222 qc->qc_flags = be32_to_cpu(str->qc_flags);
1223 qc->qc_id = make_kqid(&init_user_ns,
1224 (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
1225 be32_to_cpu(str->qc_id));
1226}
1227
1228int gfs2_quota_init(struct gfs2_sbd *sdp) 1225int gfs2_quota_init(struct gfs2_sbd *sdp)
1229{ 1226{
1230 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1227 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1232 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; 1229 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1233 unsigned int x, slot = 0; 1230 unsigned int x, slot = 0;
1234 unsigned int found = 0; 1231 unsigned int found = 0;
1232 unsigned int hash;
1233 unsigned int bm_size;
1235 u64 dblock; 1234 u64 dblock;
1236 u32 extlen = 0; 1235 u32 extlen = 0;
1237 int error; 1236 int error;
@@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1240 return -EIO; 1239 return -EIO;
1241 1240
1242 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1241 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1243 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1244 1243 bm_size *= sizeof(unsigned long);
1245 error = -ENOMEM; 1244 error = -ENOMEM;
1246 1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN);
1247 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, 1246 if (sdp->sd_quota_bitmap == NULL)
1248 sizeof(unsigned char *), GFP_NOFS); 1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL);
1249 if (!sdp->sd_quota_bitmap) 1248 if (!sdp->sd_quota_bitmap)
1250 return error; 1249 return error;
1251 1250
1252 for (x = 0; x < sdp->sd_quota_chunks; x++) { 1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1253 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
1254 if (!sdp->sd_quota_bitmap[x])
1255 goto fail;
1256 }
1257 1252
1258 for (x = 0; x < blocks; x++) { 1253 for (x = 0; x < blocks; x++) {
1259 struct buffer_head *bh; 1254 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc;
1260 unsigned int y; 1256 unsigned int y;
1261 1257
1262 if (!extlen) { 1258 if (!extlen) {
@@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1274 goto fail; 1270 goto fail;
1275 } 1271 }
1276 1272
1273 qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
1277 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; 1274 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1278 y++, slot++) { 1275 y++, slot++) {
1279 struct gfs2_quota_change_host qc;
1280 struct gfs2_quota_data *qd; 1276 struct gfs2_quota_data *qd;
1281 1277 s64 qc_change = be64_to_cpu(qc->qc_change);
1282 gfs2_quota_change_in(&qc, bh->b_data + 1278 u32 qc_flags = be32_to_cpu(qc->qc_flags);
1283 sizeof(struct gfs2_meta_header) + 1279 enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
1284 y * sizeof(struct gfs2_quota_change)); 1280 USRQUOTA : GRPQUOTA;
1285 if (!qc.qc_change) 1281 struct kqid qc_id = make_kqid(&init_user_ns, qtype,
1282 be32_to_cpu(qc->qc_id));
1283 qc++;
1284 if (!qc_change)
1286 continue; 1285 continue;
1287 1286
1288 error = qd_alloc(sdp, qc.qc_id, &qd); 1287 hash = gfs2_qd_hash(sdp, qc_id);
1289 if (error) { 1288 qd = qd_alloc(hash, sdp, qc_id);
1289 if (qd == NULL) {
1290 brelse(bh); 1290 brelse(bh);
1291 goto fail; 1291 goto fail;
1292 } 1292 }
1293 1293
1294 set_bit(QDF_CHANGE, &qd->qd_flags); 1294 set_bit(QDF_CHANGE, &qd->qd_flags);
1295 qd->qd_change = qc.qc_change; 1295 qd->qd_change = qc_change;
1296 qd->qd_slot = slot; 1296 qd->qd_slot = slot;
1297 qd->qd_slot_count = 1; 1297 qd->qd_slot_count = 1;
1298 1298
1299 spin_lock(&qd_lock); 1299 spin_lock(&qd_lock);
1300 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1300 BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
1301 list_add(&qd->qd_list, &sdp->sd_quota_list); 1301 list_add(&qd->qd_list, &sdp->sd_quota_list);
1302 atomic_inc(&sdp->sd_quota_count); 1302 atomic_inc(&sdp->sd_quota_count);
1303 spin_unlock(&qd_lock); 1303 spin_unlock(&qd_lock);
1304 1304
1305 spin_lock_bucket(hash);
1306 hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
1307 spin_unlock_bucket(hash);
1308
1305 found++; 1309 found++;
1306 } 1310 }
1307 1311
@@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1324{ 1328{
1325 struct list_head *head = &sdp->sd_quota_list; 1329 struct list_head *head = &sdp->sd_quota_list;
1326 struct gfs2_quota_data *qd; 1330 struct gfs2_quota_data *qd;
1327 unsigned int x;
1328 1331
1329 spin_lock(&qd_lock); 1332 spin_lock(&qd_lock);
1330 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1331 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1334 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1332 1335
1333 /*
1334 * To be removed in due course... we should be able to
1335 * ensure that all refs to the qd have done by this point
1336 * so that this rather odd test is not required
1337 */
1338 spin_lock(&qd->qd_lockref.lock);
1339 if (qd->qd_lockref.count > 1 ||
1340 (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1341 spin_unlock(&qd->qd_lockref.lock);
1342 list_move(&qd->qd_list, head);
1343 spin_unlock(&qd_lock);
1344 schedule();
1345 spin_lock(&qd_lock);
1346 continue;
1347 }
1348 spin_unlock(&qd->qd_lockref.lock);
1349
1350 list_del(&qd->qd_list); 1336 list_del(&qd->qd_list);
1337
1351 /* Also remove if this qd exists in the reclaim list */ 1338 /* Also remove if this qd exists in the reclaim list */
1352 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 1339 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
1353 atomic_dec(&sdp->sd_quota_count); 1340 atomic_dec(&sdp->sd_quota_count);
1354 spin_unlock(&qd_lock); 1341 spin_unlock(&qd_lock);
1355 1342
1356 if (!qd->qd_lockref.count) { 1343 spin_lock_bucket(qd->qd_hash);
1357 gfs2_assert_warn(sdp, !qd->qd_change); 1344 hlist_bl_del_rcu(&qd->qd_hlist);
1358 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1345 spin_unlock_bucket(qd->qd_hash);
1359 } else 1346
1360 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1347 gfs2_assert_warn(sdp, !qd->qd_change);
1348 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1361 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1349 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1362 1350
1363 gfs2_glock_put(qd->qd_gl); 1351 gfs2_glock_put(qd->qd_gl);
1364 kmem_cache_free(gfs2_quotad_cachep, qd); 1352 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
1365 1353
1366 spin_lock(&qd_lock); 1354 spin_lock(&qd_lock);
1367 } 1355 }
@@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1370 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1358 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1371 1359
1372 if (sdp->sd_quota_bitmap) { 1360 if (sdp->sd_quota_bitmap) {
1373 for (x = 0; x < sdp->sd_quota_chunks; x++) 1361 if (is_vmalloc_addr(sdp->sd_quota_bitmap))
1374 kfree(sdp->sd_quota_bitmap[x]); 1362 vfree(sdp->sd_quota_bitmap);
1375 kfree(sdp->sd_quota_bitmap); 1363 else
1364 kfree(sdp->sd_quota_bitmap);
1365 sdp->sd_quota_bitmap = NULL;
1376 } 1366 }
1377} 1367}
1378 1368
@@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1656 .get_dqblk = gfs2_get_dqblk, 1646 .get_dqblk = gfs2_get_dqblk,
1657 .set_dqblk = gfs2_set_dqblk, 1647 .set_dqblk = gfs2_set_dqblk,
1658}; 1648};
1649
1650void __init gfs2_quota_hash_init(void)
1651{
1652 unsigned i;
1653
1654 for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
1655 INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
1656}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 96e4f34a03b0..55d506eb3c4a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
57extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
58extern struct shrinker gfs2_qd_shrinker; 58extern struct shrinker gfs2_qd_shrinker;
59extern struct list_lru gfs2_qd_lru; 59extern struct list_lru gfs2_qd_lru;
60extern void __init gfs2_quota_hash_init(void);
60 61
61#endif /* __QUOTA_DOT_H__ */ 62#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c8d6161bd682..a1da21349235 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -57,6 +57,11 @@
57 * 3 = Used (metadata) 57 * 3 = Used (metadata)
58 */ 58 */
59 59
60struct gfs2_extent {
61 struct gfs2_rbm rbm;
62 u32 len;
63};
64
60static const char valid_change[16] = { 65static const char valid_change[16] = {
61 /* current */ 66 /* current */
62 /* n */ 0, 1, 1, 1, 67 /* n */ 0, 1, 1, 1,
@@ -65,8 +70,9 @@ static const char valid_change[16] = {
65 1, 0, 0, 0 70 1, 0, 0, 0
66}; 71};
67 72
68static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 73static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
69 const struct gfs2_inode *ip, bool nowrap); 74 const struct gfs2_inode *ip, bool nowrap,
75 const struct gfs2_alloc_parms *ap);
70 76
71 77
72/** 78/**
@@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
635 /* return reserved blocks to the rgrp */ 641 /* return reserved blocks to the rgrp */
636 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); 642 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
637 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; 643 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
644 /* The rgrp extent failure point is likely not to increase;
645 it will only do so if the freed blocks are somehow
646 contiguous with a span of free blocks that follows. Still,
647 it will force the number to be recalculated later. */
648 rgd->rd_extfail_pt += rs->rs_free;
638 rs->rs_free = 0; 649 rs->rs_free = 0;
639 clear_bit(GBF_FULL, &bi->bi_flags); 650 clear_bit(GBF_FULL, &bi->bi_flags);
640 smp_mb__after_clear_bit();
641 } 651 }
642} 652}
643 653
@@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
876static int read_rindex_entry(struct gfs2_inode *ip) 886static int read_rindex_entry(struct gfs2_inode *ip)
877{ 887{
878 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 888 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
889 const unsigned bsize = sdp->sd_sb.sb_bsize;
879 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 890 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
880 struct gfs2_rindex buf; 891 struct gfs2_rindex buf;
881 int error; 892 int error;
@@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
913 goto fail; 924 goto fail;
914 925
915 rgd->rd_gl->gl_object = rgd; 926 rgd->rd_gl->gl_object = rgd;
927 rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
928 rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
916 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; 929 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
917 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 930 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
918 if (rgd->rd_data > sdp->sd_max_rg_data) 931 if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1126 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 1139 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 1140 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1128 rgd->rd_free_clone = rgd->rd_free; 1141 rgd->rd_free_clone = rgd->rd_free;
1142 /* max out the rgrp allocation failure point */
1143 rgd->rd_extfail_pt = rgd->rd_free;
1129 } 1144 }
1130 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { 1145 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); 1146 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
@@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1184 1199
1185 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) 1200 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1186 return 0; 1201 return 0;
1187 return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); 1202 return gfs2_rgrp_bh_get(rgd);
1188} 1203}
1189 1204
1190/** 1205/**
@@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1455 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) 1470 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1456 return; 1471 return;
1457 1472
1458 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); 1473 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
1459 if (ret == 0) { 1474 if (ret == 0) {
1460 rs->rs_rbm = rbm; 1475 rs->rs_rbm = rbm;
1461 rs->rs_free = extlen; 1476 rs->rs_free = extlen;
@@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1520 * @rbm: The current position in the resource group 1535 * @rbm: The current position in the resource group
1521 * @ip: The inode for which we are searching for blocks 1536 * @ip: The inode for which we are searching for blocks
1522 * @minext: The minimum extent length 1537 * @minext: The minimum extent length
1538 * @maxext: A pointer to the maximum extent structure
1523 * 1539 *
1524 * This checks the current position in the rgrp to see whether there is 1540 * This checks the current position in the rgrp to see whether there is
1525 * a reservation covering this block. If not then this function is a 1541 * a reservation covering this block. If not then this function is a
@@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1532 1548
1533static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, 1549static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1534 const struct gfs2_inode *ip, 1550 const struct gfs2_inode *ip,
1535 u32 minext) 1551 u32 minext,
1552 struct gfs2_extent *maxext)
1536{ 1553{
1537 u64 block = gfs2_rbm_to_block(rbm); 1554 u64 block = gfs2_rbm_to_block(rbm);
1538 u32 extlen = 1; 1555 u32 extlen = 1;
@@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1545 */ 1562 */
1546 if (minext) { 1563 if (minext) {
1547 extlen = gfs2_free_extlen(rbm, minext); 1564 extlen = gfs2_free_extlen(rbm, minext);
1548 nblock = block + extlen; 1565 if (extlen <= maxext->len)
1549 if (extlen < minext)
1550 goto fail; 1566 goto fail;
1551 } 1567 }
1552 1568
@@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1555 * and skip if parts of it are already reserved 1571 * and skip if parts of it are already reserved
1556 */ 1572 */
1557 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); 1573 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1558 if (nblock == block) 1574 if (nblock == block) {
1559 return 0; 1575 if (!minext || extlen >= minext)
1576 return 0;
1577
1578 if (extlen > maxext->len) {
1579 maxext->len = extlen;
1580 maxext->rbm = *rbm;
1581 }
1560fail: 1582fail:
1583 nblock = block + extlen;
1584 }
1561 ret = gfs2_rbm_from_block(rbm, nblock); 1585 ret = gfs2_rbm_from_block(rbm, nblock);
1562 if (ret < 0) 1586 if (ret < 0)
1563 return ret; 1587 return ret;
@@ -1568,30 +1592,38 @@ fail:
1568 * gfs2_rbm_find - Look for blocks of a particular state 1592 * gfs2_rbm_find - Look for blocks of a particular state
1569 * @rbm: Value/result starting position and final position 1593 * @rbm: Value/result starting position and final position
1570 * @state: The state which we want to find 1594 * @state: The state which we want to find
1571 * @minext: The requested extent length (0 for a single block) 1595 * @minext: Pointer to the requested extent length (NULL for a single block)
1596 * This is updated to be the actual reservation size.
1572 * @ip: If set, check for reservations 1597 * @ip: If set, check for reservations
1573 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping 1598 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1574 * around until we've reached the starting point. 1599 * around until we've reached the starting point.
1600 * @ap: the allocation parameters
1575 * 1601 *
1576 * Side effects: 1602 * Side effects:
1577 * - If looking for free blocks, we set GBF_FULL on each bitmap which 1603 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1578 * has no free blocks in it. 1604 * has no free blocks in it.
1605 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1606 * has come up short on a free block search.
1579 * 1607 *
1580 * Returns: 0 on success, -ENOSPC if there is no block of the requested state 1608 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1581 */ 1609 */
1582 1610
1583static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 1611static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1584 const struct gfs2_inode *ip, bool nowrap) 1612 const struct gfs2_inode *ip, bool nowrap,
1613 const struct gfs2_alloc_parms *ap)
1585{ 1614{
1586 struct buffer_head *bh; 1615 struct buffer_head *bh;
1587 int initial_bii; 1616 int initial_bii;
1588 u32 initial_offset; 1617 u32 initial_offset;
1618 int first_bii = rbm->bii;
1619 u32 first_offset = rbm->offset;
1589 u32 offset; 1620 u32 offset;
1590 u8 *buffer; 1621 u8 *buffer;
1591 int n = 0; 1622 int n = 0;
1592 int iters = rbm->rgd->rd_length; 1623 int iters = rbm->rgd->rd_length;
1593 int ret; 1624 int ret;
1594 struct gfs2_bitmap *bi; 1625 struct gfs2_bitmap *bi;
1626 struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1595 1627
1596 /* If we are not starting at the beginning of a bitmap, then we 1628 /* If we are not starting at the beginning of a bitmap, then we
1597 * need to add one to the bitmap count to ensure that we search 1629 * need to add one to the bitmap count to ensure that we search
@@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
1620 return 0; 1652 return 0;
1621 1653
1622 initial_bii = rbm->bii; 1654 initial_bii = rbm->bii;
1623 ret = gfs2_reservation_check_and_update(rbm, ip, minext); 1655 ret = gfs2_reservation_check_and_update(rbm, ip,
1656 minext ? *minext : 0,
1657 &maxext);
1624 if (ret == 0) 1658 if (ret == 0)
1625 return 0; 1659 return 0;
1626 if (ret > 0) { 1660 if (ret > 0) {
@@ -1655,6 +1689,24 @@ next_iter:
1655 break; 1689 break;
1656 } 1690 }
1657 1691
1692 if (minext == NULL || state != GFS2_BLKST_FREE)
1693 return -ENOSPC;
1694
1695 /* If the extent was too small, and it's smaller than the smallest
1696 to have failed before, remember for future reference that it's
1697 useless to search this rgrp again for this amount or more. */
1698 if ((first_offset == 0) && (first_bii == 0) &&
1699 (*minext < rbm->rgd->rd_extfail_pt))
1700 rbm->rgd->rd_extfail_pt = *minext;
1701
1702 /* If the maximum extent we found is big enough to fulfill the
1703 minimum requirements, use it anyway. */
1704 if (maxext.len) {
1705 *rbm = maxext.rbm;
1706 *minext = maxext.len;
1707 return 0;
1708 }
1709
1658 return -ENOSPC; 1710 return -ENOSPC;
1659} 1711}
1660 1712
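
[Editor's note] The tail of gfs2_rbm_find() above adds two pieces of best-effort behaviour: a whole-rgrp search that came up short records the requested size in rd_extfail_pt, so gfs2_inplace_reserve() can skip the rgrp for equal or larger requests (see the hunk further down), and the longest extent actually seen is returned instead of failing outright, with *minext shrunk to match. A standalone sketch of that search-with-fallback, modelling the bitmap as a character map rather than the kernel's two-bit-per-block encoding:

#include <stdio.h>

static int find_run(const char *map, int n, unsigned *minext, int *start)
{
	int i = 0, best_start = -1;
	unsigned best_len = 0;

	while (i < n) {
		int run_start = i;
		unsigned len = 0;

		while (i < n && map[i] == '.') {	/* '.' == free */
			len++;
			i++;
		}
		if (len >= *minext) {
			*start = run_start;
			return 0;		/* exact fit */
		}
		if (len > best_len) {		/* remember the best so far */
			best_len = len;
			best_start = run_start;
		}
		i++;
	}
	if (best_len) {				/* shrink the request */
		*minext = best_len;
		*start = best_start;
		return 0;
	}
	return -1;				/* -ENOSPC */
}

int main(void)
{
	unsigned want = 4;
	int at;

	if (find_run("..X...X..", 9, &want, &at) == 0)
		printf("got %u blocks at %d\n", want, at);	/* got 3 blocks at 3 */
	return 0;
}
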
@@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1680 1732
1681 while (1) { 1733 while (1) {
1682 down_write(&sdp->sd_log_flush_lock); 1734 down_write(&sdp->sd_log_flush_lock);
1683 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); 1735 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1736 true, NULL);
1684 up_write(&sdp->sd_log_flush_lock); 1737 up_write(&sdp->sd_log_flush_lock);
1685 if (error == -ENOSPC) 1738 if (error == -ENOSPC)
1686 break; 1739 break;
@@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1891 } 1944 }
1892 1945
 1893 /* Skip unusable resource groups */ 1946 /* Skip unusable resource groups */
1894 if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1947 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
1948 GFS2_RDF_ERROR)) ||
1949 (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
1895 goto skip_rgrp; 1950 goto skip_rgrp;
1896 1951
1897 if (sdp->sd_args.ar_rgrplvb) 1952 if (sdp->sd_args.ar_rgrplvb)
@@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1911 return 0; 1966 return 0;
1912 } 1967 }
1913 1968
1914 /* Drop reservation, if we couldn't use reserved rgrp */
1915 if (gfs2_rs_active(rs))
1916 gfs2_rs_deltree(rs);
1917check_rgrp: 1969check_rgrp:
1918 /* Check for unlinked inodes which can be reclaimed */ 1970 /* Check for unlinked inodes which can be reclaimed */
1919 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) 1971 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
1920 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, 1972 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
1921 ip->i_no_addr); 1973 ip->i_no_addr);
1922skip_rgrp: 1974skip_rgrp:
1975 /* Drop reservation, if we couldn't use reserved rgrp */
1976 if (gfs2_rs_active(rs))
1977 gfs2_rs_deltree(rs);
1978
1923 /* Unlock rgrp if required */ 1979 /* Unlock rgrp if required */
1924 if (!rg_locked) 1980 if (!rg_locked)
1925 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1981 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2064 * 2120 *
2065 */ 2121 */
2066 2122
2067int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) 2123void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2068{ 2124{
2069 struct gfs2_rgrpd *rgd = gl->gl_object; 2125 struct gfs2_rgrpd *rgd = gl->gl_object;
2070 struct gfs2_blkreserv *trs; 2126 struct gfs2_blkreserv *trs;
2071 const struct rb_node *n; 2127 const struct rb_node *n;
2072 2128
2073 if (rgd == NULL) 2129 if (rgd == NULL)
2074 return 0; 2130 return;
2075 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", 2131 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2076 (unsigned long long)rgd->rd_addr, rgd->rd_flags, 2132 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2077 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, 2133 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2078 rgd->rd_reserved); 2134 rgd->rd_reserved, rgd->rd_extfail_pt);
2079 spin_lock(&rgd->rd_rsspin); 2135 spin_lock(&rgd->rd_rsspin);
2080 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { 2136 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2081 trs = rb_entry(n, struct gfs2_blkreserv, rs_node); 2137 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2082 dump_rs(seq, trs); 2138 dump_rs(seq, trs);
2083 } 2139 }
2084 spin_unlock(&rgd->rd_rsspin); 2140 spin_unlock(&rgd->rd_rsspin);
2085 return 0;
2086} 2141}
2087 2142
2088static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) 2143static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2184 int error; 2239 int error;
2185 2240
2186 gfs2_set_alloc_start(&rbm, ip, dinode); 2241 gfs2_set_alloc_start(&rbm, ip, dinode);
2187 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); 2242 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
2188 2243
2189 if (error == -ENOSPC) { 2244 if (error == -ENOSPC) {
2190 gfs2_set_alloc_start(&rbm, ip, dinode); 2245 gfs2_set_alloc_start(&rbm, ip, dinode);
2191 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); 2246 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
2247 NULL);
2192 } 2248 }
2193 2249
2194 /* Since all blocks are reserved in advance, this shouldn't happen */ 2250 /* Since all blocks are reserved in advance, this shouldn't happen */
2195 if (error) { 2251 if (error) {
2196 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", 2252 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2197 (unsigned long long)ip->i_no_addr, error, *nblocks, 2253 (unsigned long long)ip->i_no_addr, error, *nblocks,
2198 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); 2254 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2255 rbm.rgd->rd_extfail_pt);
2199 goto rgrp_error; 2256 goto rgrp_error;
2200 } 2257 }
2201 2258
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3a10d2ffbbe7..463ab2e95d1c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
71extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 71extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
73 struct buffer_head *bh, 73 struct buffer_head *bh,
74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 35da5b19c0de..60f60f6181f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
369 return 0; 369 return 0;
370} 370}
371 371
372static int init_threads(struct gfs2_sbd *sdp)
373{
374 struct task_struct *p;
375 int error = 0;
376
377 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
378 if (IS_ERR(p)) {
379 error = PTR_ERR(p);
380 fs_err(sdp, "can't start logd thread: %d\n", error);
381 return error;
382 }
383 sdp->sd_logd_process = p;
384
385 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
386 if (IS_ERR(p)) {
387 error = PTR_ERR(p);
388 fs_err(sdp, "can't start quotad thread: %d\n", error);
389 goto fail;
390 }
391 sdp->sd_quotad_process = p;
392 return 0;
393
394fail:
395 kthread_stop(sdp->sd_logd_process);
396 return error;
397}
398
372/** 399/**
373 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one 400 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
374 * @sdp: the filesystem 401 * @sdp: the filesystem
@@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
384 struct gfs2_log_header_host head; 411 struct gfs2_log_header_host head;
385 int error; 412 int error;
386 413
387 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); 414 error = init_threads(sdp);
388 if (error) 415 if (error)
389 return error; 416 return error;
390 417
418 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
419 if (error)
420 goto fail_threads;
421
391 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 422 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
392 423
393 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 424 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
417fail: 448fail:
418 t_gh.gh_flags |= GL_NOCACHE; 449 t_gh.gh_flags |= GL_NOCACHE;
419 gfs2_glock_dq_uninit(&t_gh); 450 gfs2_glock_dq_uninit(&t_gh);
420 451fail_threads:
452 kthread_stop(sdp->sd_quotad_process);
453 kthread_stop(sdp->sd_logd_process);
421 return error; 454 return error;
422} 455}
423 456
@@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
800 struct gfs2_holder t_gh; 833 struct gfs2_holder t_gh;
801 int error; 834 int error;
802 835
836 kthread_stop(sdp->sd_quotad_process);
837 kthread_stop(sdp->sd_logd_process);
838
803 flush_workqueue(gfs2_delete_workqueue); 839 flush_workqueue(gfs2_delete_workqueue);
804 gfs2_quota_sync(sdp->sd_vfs, 0); 840 gfs2_quota_sync(sdp->sd_vfs, 0);
805 gfs2_statfs_sync(sdp->sd_vfs, 0); 841 gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -857,9 +893,6 @@ restart:
857 } 893 }
858 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
859 895
860 kthread_stop(sdp->sd_quotad_process);
861 kthread_stop(sdp->sd_logd_process);
862
863 if (!(sb->s_flags & MS_RDONLY)) { 896 if (!(sb->s_flags & MS_RDONLY)) {
864 error = gfs2_make_fs_ro(sdp); 897 error = gfs2_make_fs_ro(sdp);
865 if (error) 898 if (error)
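
[Editor's note] Moving init_threads() into gfs2_make_fs_rw() ties the logd/quotad kthreads to the filesystem's read-write lifetime: they start only when the fs goes rw, and they are stopped at the top of gfs2_make_fs_ro() (and on the rw failure path) rather than at unmount. A small pthread analogue of that start-in-order/unwind-in-reverse pairing:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <unistd.h>

static atomic_bool should_stop;

static void *daemon_loop(void *arg)
{
	/* stand-in for the gfs2_logd()/gfs2_quotad() main loops */
	while (!atomic_load(&should_stop))
		usleep(1000);
	return arg;
}

static int start_threads(pthread_t *logd, pthread_t *quotad)
{
	int err = pthread_create(logd, NULL, daemon_loop, "logd");

	if (err)
		return err;

	err = pthread_create(quotad, NULL, daemon_loop, "quotad");
	if (err) {
		/* unwind like the fail: label above */
		atomic_store(&should_stop, true);
		pthread_join(*logd, NULL);
	}
	return err;
}

static void stop_threads(pthread_t logd, pthread_t quotad)
{
	/* quotad first, then logd, matching gfs2_make_fs_ro() */
	atomic_store(&should_stop, true);
	pthread_join(quotad, NULL);
	pthread_join(logd, NULL);
}

int main(void)
{
	pthread_t logd, quotad;

	if (start_threads(&logd, &quotad) == 0)
		stop_threads(logd, quotad);
	return 0;
}
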
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 39c1d9469677..5c097596104b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -21,6 +21,7 @@
21#include <linux/xattr.h> 21#include <linux/xattr.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/percpu-refcount.h> 23#include <linux/percpu-refcount.h>
24#include <linux/seq_file.h>
24 25
25#ifdef CONFIG_CGROUPS 26#ifdef CONFIG_CGROUPS
26 27
@@ -28,8 +29,6 @@ struct cgroupfs_root;
28struct cgroup_subsys; 29struct cgroup_subsys;
29struct inode; 30struct inode;
30struct cgroup; 31struct cgroup;
31struct css_id;
32struct eventfd_ctx;
33 32
34extern int cgroup_init_early(void); 33extern int cgroup_init_early(void);
35extern int cgroup_init(void); 34extern int cgroup_init(void);
@@ -79,8 +78,6 @@ struct cgroup_subsys_state {
79 struct cgroup_subsys_state *parent; 78 struct cgroup_subsys_state *parent;
80 79
81 unsigned long flags; 80 unsigned long flags;
82 /* ID for this css, if possible */
83 struct css_id __rcu *id;
84 81
85 /* percpu_ref killing and RCU release */ 82 /* percpu_ref killing and RCU release */
86 struct rcu_head rcu_head; 83 struct rcu_head rcu_head;
@@ -239,10 +236,6 @@ struct cgroup {
239 struct rcu_head rcu_head; 236 struct rcu_head rcu_head;
240 struct work_struct destroy_work; 237 struct work_struct destroy_work;
241 238
242 /* List of events which userspace want to receive */
243 struct list_head event_list;
244 spinlock_t event_list_lock;
245
246 /* directory xattrs */ 239 /* directory xattrs */
247 struct simple_xattrs xattrs; 240 struct simple_xattrs xattrs;
248}; 241};
@@ -280,6 +273,9 @@ enum {
280 * - "tasks" is removed. Everything should be at process 273 * - "tasks" is removed. Everything should be at process
281 * granularity. Use "cgroup.procs" instead. 274 * granularity. Use "cgroup.procs" instead.
282 * 275 *
276 * - "cgroup.procs" is not sorted. pids will be unique unless they
277 * got recycled inbetween reads.
278 *
283 * - "release_agent" and "notify_on_release" are removed. 279 * - "release_agent" and "notify_on_release" are removed.
284 * Replacement notification mechanism will be implemented. 280 * Replacement notification mechanism will be implemented.
285 * 281 *
@@ -320,9 +316,6 @@ struct cgroupfs_root {
320 /* Unique id for this hierarchy. */ 316 /* Unique id for this hierarchy. */
321 int hierarchy_id; 317 int hierarchy_id;
322 318
323 /* A list running through the attached subsystems */
324 struct list_head subsys_list;
325
326 /* The root cgroup for this hierarchy */ 319 /* The root cgroup for this hierarchy */
327 struct cgroup top_cgroup; 320 struct cgroup top_cgroup;
328 321
@@ -389,16 +382,6 @@ struct css_set {
389}; 382};
390 383
391/* 384/*
392 * cgroup_map_cb is an abstract callback API for reporting map-valued
393 * control files
394 */
395
396struct cgroup_map_cb {
397 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
398 void *state;
399};
400
401/*
402 * struct cftype: handler definitions for cgroup control files 385 * struct cftype: handler definitions for cgroup control files
403 * 386 *
404 * When reading/writing to a file: 387 * When reading/writing to a file:
@@ -445,10 +428,6 @@ struct cftype {
445 */ 428 */
446 struct cgroup_subsys *ss; 429 struct cgroup_subsys *ss;
447 430
448 int (*open)(struct inode *inode, struct file *file);
449 ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
450 struct file *file,
451 char __user *buf, size_t nbytes, loff_t *ppos);
452 /* 431 /*
453 * read_u64() is a shortcut for the common case of returning a 432 * read_u64() is a shortcut for the common case of returning a
454 * single integer. Use it in place of read() 433 * single integer. Use it in place of read()
@@ -458,24 +437,14 @@ struct cftype {
458 * read_s64() is a signed version of read_u64() 437 * read_s64() is a signed version of read_u64()
459 */ 438 */
460 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 439 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
461 /*
462 * read_map() is used for defining a map of key/value
463 * pairs. It should call cb->fill(cb, key, value) for each
464 * entry. The key/value pairs (and their ordering) should not
465 * change between reboots.
466 */
467 int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
468 struct cgroup_map_cb *cb);
469 /*
470 * read_seq_string() is used for outputting a simple sequence
471 * using seqfile.
472 */
473 int (*read_seq_string)(struct cgroup_subsys_state *css,
474 struct cftype *cft, struct seq_file *m);
475 440
476 ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, 441 /* generic seq_file read interface */
477 struct file *file, 442 int (*seq_show)(struct seq_file *sf, void *v);
478 const char __user *buf, size_t nbytes, loff_t *ppos); 443
444 /* optional ops, implement all or none */
445 void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
446 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
447 void (*seq_stop)(struct seq_file *sf, void *v);
479 448
480 /* 449 /*
481 * write_u64() is a shortcut for the common case of accepting 450 * write_u64() is a shortcut for the common case of accepting
@@ -504,27 +473,6 @@ struct cftype {
504 * kick type for multiplexing. 473 * kick type for multiplexing.
505 */ 474 */
506 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 475 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
507
508 int (*release)(struct inode *inode, struct file *file);
509
510 /*
511 * register_event() callback will be used to add new userspace
512 * waiter for changes related to the cftype. Implement it if
513 * you want to provide this functionality. Use eventfd_signal()
514 * on eventfd to send notification to userspace.
515 */
516 int (*register_event)(struct cgroup_subsys_state *css,
517 struct cftype *cft, struct eventfd_ctx *eventfd,
518 const char *args);
519 /*
520 * unregister_event() callback will be called when userspace
521 * closes the eventfd or on cgroup removing.
522 * This callback must be implemented, if you want provide
523 * notification functionality.
524 */
525 void (*unregister_event)(struct cgroup_subsys_state *css,
526 struct cftype *cft,
527 struct eventfd_ctx *eventfd);
528}; 476};
529 477
530/* 478/*
@@ -538,6 +486,26 @@ struct cftype_set {
538}; 486};
539 487
540/* 488/*
489 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
490 * access directly.
491 */
492struct cfent {
493 struct list_head node;
494 struct dentry *dentry;
495 struct cftype *type;
496 struct cgroup_subsys_state *css;
497
498 /* file xattrs */
499 struct simple_xattrs xattrs;
500};
501
502/* seq_file->private points to the following, only ->priv is public */
503struct cgroup_open_file {
504 struct cfent *cfe;
505 void *priv;
506};
507
508/*
541 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This 509 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
542 * function can be called as long as @cgrp is accessible. 510 * function can be called as long as @cgrp is accessible.
543 */ 511 */
@@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
552 return rcu_dereference(cgrp->name)->name; 520 return rcu_dereference(cgrp->name)->name;
553} 521}
554 522
523static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
524{
525 struct cgroup_open_file *of = seq->private;
526 return of->cfe->css;
527}
528
529static inline struct cftype *seq_cft(struct seq_file *seq)
530{
531 struct cgroup_open_file *of = seq->private;
532 return of->cfe->type;
533}
534
555int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 535int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
556int cgroup_rm_cftypes(struct cftype *cfts); 536int cgroup_rm_cftypes(struct cftype *cfts);
557 537
@@ -631,12 +611,8 @@ struct cgroup_subsys {
631#define MAX_CGROUP_TYPE_NAMELEN 32 611#define MAX_CGROUP_TYPE_NAMELEN 32
632 const char *name; 612 const char *name;
633 613
634 /* 614 /* link to parent, protected by cgroup_lock() */
635 * Link to parent, and list entry in parent's children.
636 * Protected by cgroup_lock()
637 */
638 struct cgroupfs_root *root; 615 struct cgroupfs_root *root;
639 struct list_head sibling;
640 616
641 /* list of cftype_sets */ 617 /* list of cftype_sets */
642 struct list_head cftsets; 618 struct list_head cftsets;
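
[Editor's note] The cftype changes above fold the bespoke read hooks (read(), read_map(), read_seq_string(), plus the open/release pair) into the standard seq_file interface: a controller now implements seq_show() and recovers its context with the new seq_css()/seq_cft() helpers. A hedged sketch of what a converted read-only control file looks like after this change; the example_* names are invented, and this is the shape of the conversion, not a buildable controller:

/* Invented example: a read-only "example.stat" file.  seq_css() comes
 * from the hunk above; seq_printf() is the usual seq_file helper. */
static u64 example_read_stat(struct cgroup_subsys_state *css)
{
	return 0;	/* placeholder for real per-css state */
}

static int example_seq_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "stat %llu\n",
		   (unsigned long long)example_read_stat(seq_css(sf)));
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example.stat",
		.seq_show = example_seq_show,
	},
	{ }	/* terminator */
};

The old read_seq_string(css, cft, m) signature carried the css and cftype explicitly; after this change both are recovered from seq_file->private via the helpers, so every handler shares one prototype.
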
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9b503376738f..bec6dbe939a0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -419,6 +419,8 @@ enum {
419 ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ 419 ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */
420 ATA_HORKAGE_ATAPI_DMADIR = (1 << 18), /* device requires dmadir */ 420 ATA_HORKAGE_ATAPI_DMADIR = (1 << 18), /* device requires dmadir */
421 ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */ 421 ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */
422 ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */
423 ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */
422 424
423 /* DMA mask for user DMA control: User visible values; DO NOT 425 /* DMA mask for user DMA control: User visible values; DO NOT
424 renumber */ 426 renumber */
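
[Editor's note] The two new horkage bits follow libata's usual quirk mechanism: a device is matched against the ata_device_blacklist[] table in libata-core.c and the resulting flag then gates link power management. A hedged sketch of the sort of entries that would set these bits; the model globs below are invented for illustration, not real drive listings:

/* Illustrative only: model strings are globs matched against the
 * device's IDENTIFY data. */
static const struct ata_blacklist_entry ata_device_blacklist[] = {
	/* ... existing entries ... */
	{ "ExampleDisk *",	NULL,	ATA_HORKAGE_NOLPM },
	{ "WDC WD??????EX*",	NULL,	ATA_HORKAGE_WD_BROKEN_LPM },

	{ }	/* end of list */
};
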
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3f3788d49362..3e4535876d37 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -7,6 +7,7 @@
7#include <linux/gfp.h> 7#include <linux/gfp.h>
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/cgroup.h> 9#include <linux/cgroup.h>
10#include <linux/eventfd.h>
10 11
11struct vmpressure { 12struct vmpressure {
12 unsigned long scanned; 13 unsigned long scanned;
@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
33extern void vmpressure_cleanup(struct vmpressure *vmpr); 34extern void vmpressure_cleanup(struct vmpressure *vmpr);
34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 35extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 36extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 37extern int vmpressure_register_event(struct mem_cgroup *memcg,
37extern int vmpressure_register_event(struct cgroup_subsys_state *css,
38 struct cftype *cft,
39 struct eventfd_ctx *eventfd, 38 struct eventfd_ctx *eventfd,
40 const char *args); 39 const char *args);
41extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, 40extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
42 struct cftype *cft,
43 struct eventfd_ctx *eventfd); 41 struct eventfd_ctx *eventfd);
44#else 42#else
45static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
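
With the css and cftype parameters gone, the registration pair is now keyed directly by the mem_cgroup. A sketch of the new call shape from memcg's event plumbing (error handling trimmed; the surrounding code is not shown in this hunk):

	/* register: memcg, eventfd and args come from memcg's event code */
	ret = vmpressure_register_event(memcg, eventfd, args);
	if (ret)
		return ret;

	/* symmetric teardown when the eventfd is closed */
	vmpressure_unregister_event(memcg, eventfd);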
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index b2de1f9a88d6..0f24c07aed51 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -319,7 +319,16 @@ struct gfs2_leaf {
319 __be32 lf_dirent_format; /* Format of the dirents */ 319 __be32 lf_dirent_format; /* Format of the dirents */
320 __be64 lf_next; /* Next leaf, if overflow */ 320 __be64 lf_next; /* Next leaf, if overflow */
321 321
322 __u8 lf_reserved[64]; 322 union {
323 __u8 lf_reserved[64];
324 struct {
325 __be64 lf_inode; /* Dir inode number */
326 __be32 lf_dist; /* Dist from inode on chain */
327 __be32 lf_nsec; /* Last ins/del usecs */
328 __be64 lf_sec; /* Last ins/del in secs */
329 __u8 lf_reserved2[40];
330 };
331 };
323}; 332};
324 333
325/* 334/*
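
The union keeps struct gfs2_leaf at its on-disk size: the new fields overlay the first 24 bytes of the old 64-byte reserved area, with lf_reserved2 padding out the rest. On-disk values are big-endian, so readers convert as usual; a small illustrative helper (not from this patch):

	static void demo_read_leaf_meta(const struct gfs2_leaf *lf,
					u64 *dir_inode, u64 *last_sec)
	{
		/* anonymous struct members are accessed directly */
		*dir_inode = be64_to_cpu(lf->lf_inode);
		*last_sec = be64_to_cpu(lf->lf_sec);
	}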
diff --git a/init/Kconfig b/init/Kconfig
index 5236dc562a36..8d402e33b7fc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -854,7 +854,6 @@ config NUMA_BALANCING
854 854
855menuconfig CGROUPS 855menuconfig CGROUPS
856 boolean "Control Group support" 856 boolean "Control Group support"
857 depends on EVENTFD
858 help 857 help
859 This option adds support for grouping sets of processes together, for 858 This option adds support for grouping sets of processes together, for
860 use with process control subsystems such as Cpusets, CFS, memory 859 use with process control subsystems such as Cpusets, CFS, memory
@@ -921,6 +920,7 @@ config MEMCG
921 bool "Memory Resource Controller for Control Groups" 920 bool "Memory Resource Controller for Control Groups"
922 depends on RESOURCE_COUNTERS 921 depends on RESOURCE_COUNTERS
923 select MM_OWNER 922 select MM_OWNER
923 select EVENTFD
924 help 924 help
925 Provides a memory resource controller that manages both anonymous 925 Provides a memory resource controller that manages both anonymous
926 memory and page cache. (See Documentation/cgroups/memory.txt) 926 memory and page cache. (See Documentation/cgroups/memory.txt)
@@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS
1160 1160
1161config SCHED_AUTOGROUP 1161config SCHED_AUTOGROUP
1162 bool "Automatic process group scheduling" 1162 bool "Automatic process group scheduling"
1163 select EVENTFD
1164 select CGROUPS 1163 select CGROUPS
1165 select CGROUP_SCHED 1164 select CGROUP_SCHED
1166 select FAIR_GROUP_SCHED 1165 select FAIR_GROUP_SCHED
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe9217..e2f46ba37f72 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h> 44#include <linux/slab.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 58#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 59#include <linux/kthread.h>
63#include <linux/file.h>
64 60
65#include <linux/atomic.h> 61#include <linux/atomic.h>
66 62
67/* 63/*
64 * pidlists linger the following amount before being destroyed. The goal
 65 * is avoiding frequent destruction in the middle of consecutive read calls.
 66 * Expiring in the middle is a performance problem, not a correctness one.
67 * 1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70
71/*
68 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * cgroup_mutex is the master lock. Any modification to cgroup or its
69 * hierarchy must be performed while holding it. 73 * hierarchy must be performed while holding it.
70 * 74 *
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
89 93
90static DEFINE_MUTEX(cgroup_root_mutex); 94static DEFINE_MUTEX(cgroup_root_mutex);
91 95
96#define cgroup_assert_mutex_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \
98 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108
92/* 109/*
93 * cgroup destruction makes heavy use of work items and there can be a lot 110 * cgroup destruction makes heavy use of work items and there can be a lot
94 * of concurrent destructions. Use a separate workqueue so that cgroup 111 * of concurrent destructions. Use a separate workqueue so that cgroup
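
Both new assertions are meant for the top of functions that may legitimately be entered from more than one locking context. A sketch of the intended call-site pattern, with an illustrative function name:

	static void demo_walk(struct cgroup_subsys_state *root)
	{
		/* caller must hold cgroup_mutex or an RCU read-side section */
		cgroup_assert_mutex_or_rcu_locked();

		/* ... safe to follow RCU-protected css pointers here ... */
	}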
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
98static struct workqueue_struct *cgroup_destroy_wq; 115static struct workqueue_struct *cgroup_destroy_wq;
99 116
100/* 117/*
118 * pidlist destructions need to be flushed on cgroup destruction. Use a
119 * separate workqueue as flush domain.
120 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122
123/*
101 * Generate an array of cgroup subsystem pointers. At boot time, this is 124 * Generate an array of cgroup subsystem pointers. At boot time, this is
102 * populated with the built in subsystems, and modular subsystems are 125 * populated with the built in subsystems, and modular subsystems are
103 * registered after that. The mutable section of this array is protected by 126 * registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
119/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
120static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
121 144
122/*
123 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
124 */
125struct cfent {
126 struct list_head node;
127 struct dentry *dentry;
128 struct cftype *type;
129 struct cgroup_subsys_state *css;
130
131 /* file xattrs */
132 struct simple_xattrs xattrs;
133};
134
135/*
136 * cgroup_event represents events which userspace want to receive.
137 */
138struct cgroup_event {
139 /*
140 * css which the event belongs to.
141 */
142 struct cgroup_subsys_state *css;
143 /*
144 * Control file which the event associated.
145 */
146 struct cftype *cft;
147 /*
148 * eventfd to signal userspace about the event.
149 */
150 struct eventfd_ctx *eventfd;
151 /*
152 * Each of these stored in a list by the cgroup.
153 */
154 struct list_head list;
155 /*
156 * All fields below needed to unregister event when
157 * userspace closes eventfd.
158 */
159 poll_table pt;
160 wait_queue_head_t *wqh;
161 wait_queue_t wait;
162 struct work_struct remove;
163};
164
165/* The list of hierarchy roots */ 145/* The list of hierarchy roots */
166 146
167static LIST_HEAD(cgroup_roots); 147static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
201 bool is_add); 181 bool is_add);
202static int cgroup_file_release(struct inode *inode, struct file *file); 182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
203 184
204/** 185/**
205 * cgroup_css - obtain a cgroup's css for the specified subsystem 186 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
262} 243}
263 244
264/** 245/**
246 * for_each_css - iterate all css's of a cgroup
247 * @css: the iteration cursor
248 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 * @cgrp: the target cgroup to iterate css's of
250 *
251 * Should be called under cgroup_mutex.
252 */
253#define for_each_css(css, ssid, cgrp) \
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \
257 lockdep_is_held(&cgroup_mutex)))) { } \
258 else
259
260/**
265 * for_each_subsys - iterate all loaded cgroup subsystems 261 * for_each_subsys - iterate all loaded cgroup subsystems
266 * @ss: the iteration cursor 262 * @ss: the iteration cursor
267 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
268 * 264 *
269 * Should be called under cgroup_mutex. 265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
270 */ 267 */
271#define for_each_subsys(ss, i) \ 268#define for_each_subsys(ss, ssid) \
272 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
273 if (({ lockdep_assert_held(&cgroup_mutex); \ 270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
274 !((ss) = cgroup_subsys[i]); })) { } \ 271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
275 else 272 else
276 273
277/** 274/**
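
for_each_css() skips subsystems with no css attached to @cgrp, which is what lets the attach path below drop its per-root bookkeeping. A condensed sketch of its use, matching the can_attach loop later in this patch:

	struct cgroup_subsys_state *css;
	int ssid, ret;

	/* under cgroup_mutex; skips slots where cgrp->subsys[ssid] is NULL */
	for_each_css(css, ssid, cgrp) {
		if (css->ss->can_attach) {
			ret = css->ss->can_attach(css, &tset);
			if (ret)
				break;
		}
	}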
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
286 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
287 (((ss) = cgroup_subsys[i]) || true); (i)++) 284 (((ss) = cgroup_subsys[i]) || true); (i)++)
288 285
289/* iterate each subsystem attached to a hierarchy */
290#define for_each_root_subsys(root, ss) \
291 list_for_each_entry((ss), &(root)->subsys_list, sibling)
292
293/* iterate across the active hierarchies */ 286/* iterate across the active hierarchies */
294#define for_each_active_root(root) \ 287#define for_each_active_root(root) \
295 list_for_each_entry((root), &cgroup_roots, root_list) 288 list_for_each_entry((root), &cgroup_roots, root_list)
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
863 */ 856 */
864 deactivate_super(cgrp->root->sb); 857 deactivate_super(cgrp->root->sb);
865 858
866 /* 859 cgroup_pidlist_destroy_all(cgrp);
867 * if we're getting rid of the cgroup, refcount should ensure
868 * that there are no pidlists left.
869 */
870 BUG_ON(!list_empty(&cgrp->pidlists));
871 860
872 simple_xattrs_free(&cgrp->xattrs); 861 simple_xattrs_free(&cgrp->xattrs);
873 862
@@ -1050,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1050 cgroup_css(cgroup_dummy_top, ss)); 1039 cgroup_css(cgroup_dummy_top, ss));
1051 cgroup_css(cgrp, ss)->cgroup = cgrp; 1040 cgroup_css(cgrp, ss)->cgroup = cgrp;
1052 1041
1053 list_move(&ss->sibling, &root->subsys_list);
1054 ss->root = root; 1042 ss->root = root;
1055 if (ss->bind) 1043 if (ss->bind)
1056 ss->bind(cgroup_css(cgrp, ss)); 1044 ss->bind(cgroup_css(cgrp, ss));
@@ -1069,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1069 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1057 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1070 1058
1071 cgroup_subsys[i]->root = &cgroup_dummy_root; 1059 cgroup_subsys[i]->root = &cgroup_dummy_root;
1072 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1073 1060
1074 /* subsystem is now free - drop reference on module */ 1061 /* subsystem is now free - drop reference on module */
1075 module_put(ss->module); 1062 module_put(ss->module);
@@ -1096,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1096{ 1083{
1097 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1084 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1098 struct cgroup_subsys *ss; 1085 struct cgroup_subsys *ss;
1086 int ssid;
1099 1087
1100 mutex_lock(&cgroup_root_mutex); 1088 mutex_lock(&cgroup_root_mutex);
1101 for_each_root_subsys(root, ss) 1089 for_each_subsys(ss, ssid)
1102 seq_printf(seq, ",%s", ss->name); 1090 if (root->subsys_mask & (1 << ssid))
1091 seq_printf(seq, ",%s", ss->name);
1103 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1092 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1104 seq_puts(seq, ",sane_behavior"); 1093 seq_puts(seq, ",sane_behavior");
1105 if (root->flags & CGRP_ROOT_NOPREFIX) 1094 if (root->flags & CGRP_ROOT_NOPREFIX)
@@ -1362,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1362 INIT_LIST_HEAD(&cgrp->pidlists); 1351 INIT_LIST_HEAD(&cgrp->pidlists);
1363 mutex_init(&cgrp->pidlist_mutex); 1352 mutex_init(&cgrp->pidlist_mutex);
1364 cgrp->dummy_css.cgroup = cgrp; 1353 cgrp->dummy_css.cgroup = cgrp;
1365 INIT_LIST_HEAD(&cgrp->event_list);
1366 spin_lock_init(&cgrp->event_list_lock);
1367 simple_xattrs_init(&cgrp->xattrs); 1354 simple_xattrs_init(&cgrp->xattrs);
1368} 1355}
1369 1356
@@ -1371,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1371{ 1358{
1372 struct cgroup *cgrp = &root->top_cgroup; 1359 struct cgroup *cgrp = &root->top_cgroup;
1373 1360
1374 INIT_LIST_HEAD(&root->subsys_list);
1375 INIT_LIST_HEAD(&root->root_list); 1361 INIT_LIST_HEAD(&root->root_list);
1376 root->number_of_cgroups = 1; 1362 root->number_of_cgroups = 1;
1377 cgrp->root = root; 1363 cgrp->root = root;
@@ -1693,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1693 return ERR_PTR(ret); 1679 return ERR_PTR(ret);
1694} 1680}
1695 1681
1696static void cgroup_kill_sb(struct super_block *sb) { 1682static void cgroup_kill_sb(struct super_block *sb)
1683{
1697 struct cgroupfs_root *root = sb->s_fs_info; 1684 struct cgroupfs_root *root = sb->s_fs_info;
1698 struct cgroup *cgrp = &root->top_cgroup; 1685 struct cgroup *cgrp = &root->top_cgroup;
1699 struct cgrp_cset_link *link, *tmp_link; 1686 struct cgrp_cset_link *link, *tmp_link;
@@ -1976,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1976 bool threadgroup) 1963 bool threadgroup)
1977{ 1964{
1978 int retval, i, group_size; 1965 int retval, i, group_size;
1979 struct cgroup_subsys *ss, *failed_ss = NULL;
1980 struct cgroupfs_root *root = cgrp->root; 1966 struct cgroupfs_root *root = cgrp->root;
1967 struct cgroup_subsys_state *css, *failed_css = NULL;
1981 /* threadgroup list cursor and array */ 1968 /* threadgroup list cursor and array */
1982 struct task_struct *leader = tsk; 1969 struct task_struct *leader = tsk;
1983 struct task_and_cgroup *tc; 1970 struct task_and_cgroup *tc;
@@ -2050,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 /* 2037 /*
2051 * step 1: check that we can legitimately attach to the cgroup. 2038 * step 1: check that we can legitimately attach to the cgroup.
2052 */ 2039 */
2053 for_each_root_subsys(root, ss) { 2040 for_each_css(css, i, cgrp) {
2054 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2041 if (css->ss->can_attach) {
2055 2042 retval = css->ss->can_attach(css, &tset);
2056 if (ss->can_attach) {
2057 retval = ss->can_attach(css, &tset);
2058 if (retval) { 2043 if (retval) {
2059 failed_ss = ss; 2044 failed_css = css;
2060 goto out_cancel_attach; 2045 goto out_cancel_attach;
2061 } 2046 }
2062 } 2047 }
@@ -2092,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2092 /* 2077 /*
2093 * step 4: do subsystem attach callbacks. 2078 * step 4: do subsystem attach callbacks.
2094 */ 2079 */
2095 for_each_root_subsys(root, ss) { 2080 for_each_css(css, i, cgrp)
2096 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2081 if (css->ss->attach)
2097 2082 css->ss->attach(css, &tset);
2098 if (ss->attach)
2099 ss->attach(css, &tset);
2100 }
2101 2083
2102 /* 2084 /*
2103 * step 5: success! and cleanup 2085 * step 5: success! and cleanup
@@ -2114,13 +2096,11 @@ out_put_css_set_refs:
2114 } 2096 }
2115out_cancel_attach: 2097out_cancel_attach:
2116 if (retval) { 2098 if (retval) {
2117 for_each_root_subsys(root, ss) { 2099 for_each_css(css, i, cgrp) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2100 if (css == failed_css)
2119
2120 if (ss == failed_ss)
2121 break; 2101 break;
2122 if (ss->cancel_attach) 2102 if (css->ss->cancel_attach)
2123 ss->cancel_attach(css, &tset); 2103 css->ss->cancel_attach(css, &tset);
2124 } 2104 }
2125 } 2105 }
2126out_free_group_list: 2106out_free_group_list:
@@ -2148,7 +2128,7 @@ retry_find_task:
2148 tsk = find_task_by_vpid(pid); 2128 tsk = find_task_by_vpid(pid);
2149 if (!tsk) { 2129 if (!tsk) {
2150 rcu_read_unlock(); 2130 rcu_read_unlock();
2151 ret= -ESRCH; 2131 ret = -ESRCH;
2152 goto out_unlock_cgroup; 2132 goto out_unlock_cgroup;
2153 } 2133 }
2154 /* 2134 /*
@@ -2260,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2260 return 0; 2240 return 0;
2261} 2241}
2262 2242
2263static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2243static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2264 struct cftype *cft, struct seq_file *seq)
2265{ 2244{
2266 struct cgroup *cgrp = css->cgroup; 2245 struct cgroup *cgrp = seq_css(seq)->cgroup;
2267 2246
2268 if (!cgroup_lock_live_group(cgrp)) 2247 if (!cgroup_lock_live_group(cgrp))
2269 return -ENODEV; 2248 return -ENODEV;
@@ -2273,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2273 return 0; 2252 return 0;
2274} 2253}
2275 2254
2276static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2255static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2277 struct cftype *cft, struct seq_file *seq)
2278{ 2256{
2279 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2257 struct cgroup *cgrp = seq_css(seq)->cgroup;
2258
2259 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2280 return 0; 2260 return 0;
2281} 2261}
2282 2262
2283/* A buffer size big enough for numbers or short strings */ 2263/* A buffer size big enough for numbers or short strings */
2284#define CGROUP_LOCAL_BUFFER_SIZE 64 2264#define CGROUP_LOCAL_BUFFER_SIZE 64
2285 2265
2286static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2266static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2287 struct cftype *cft, struct file *file, 2267 size_t nbytes, loff_t *ppos)
2288 const char __user *userbuf, size_t nbytes,
2289 loff_t *unused_ppos)
2290{ 2268{
2291 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2269 struct cfent *cfe = __d_cfe(file->f_dentry);
2292 int retval = 0; 2270 struct cftype *cft = __d_cft(file->f_dentry);
2293 char *end; 2271 struct cgroup_subsys_state *css = cfe->css;
2272 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2273 char *buf;
2274 int ret;
2294 2275
2295 if (!nbytes) 2276 if (nbytes >= max_bytes)
2296 return -EINVAL;
2297 if (nbytes >= sizeof(buffer))
2298 return -E2BIG; 2277 return -E2BIG;
2299 if (copy_from_user(buffer, userbuf, nbytes))
2300 return -EFAULT;
2301 2278
2302 buffer[nbytes] = 0; /* nul-terminate */ 2279 buf = kmalloc(nbytes + 1, GFP_KERNEL);
2303 if (cft->write_u64) { 2280 if (!buf)
2304 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2281 return -ENOMEM;
2305 if (*end) 2282
2306 return -EINVAL; 2283 if (copy_from_user(buf, userbuf, nbytes)) {
2307 retval = cft->write_u64(css, cft, val); 2284 ret = -EFAULT;
2285 goto out_free;
2286 }
2287
2288 buf[nbytes] = '\0';
2289
2290 if (cft->write_string) {
2291 ret = cft->write_string(css, cft, strstrip(buf));
2292 } else if (cft->write_u64) {
2293 unsigned long long v;
2294 ret = kstrtoull(buf, 0, &v);
2295 if (!ret)
2296 ret = cft->write_u64(css, cft, v);
2297 } else if (cft->write_s64) {
2298 long long v;
2299 ret = kstrtoll(buf, 0, &v);
2300 if (!ret)
2301 ret = cft->write_s64(css, cft, v);
2302 } else if (cft->trigger) {
2303 ret = cft->trigger(css, (unsigned int)cft->private);
2308 } else { 2304 } else {
2309 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2305 ret = -EINVAL;
2310 if (*end)
2311 return -EINVAL;
2312 retval = cft->write_s64(css, cft, val);
2313 } 2306 }
2314 if (!retval) 2307out_free:
2315 retval = nbytes; 2308 kfree(buf);
2316 return retval; 2309 return ret ?: nbytes;
2317} 2310}
2318 2311
2319static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2312/*
2320 struct cftype *cft, struct file *file, 2313 * seqfile ops/methods for returning structured data. Currently just
2321 const char __user *userbuf, size_t nbytes, 2314 * supports string->u64 maps, but can be extended in future.
2322 loff_t *unused_ppos) 2315 */
2316
2317static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2323{ 2318{
2324 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2319 struct cftype *cft = seq_cft(seq);
2325 int retval = 0;
2326 size_t max_bytes = cft->max_write_len;
2327 char *buffer = local_buffer;
2328 2320
2329 if (!max_bytes) 2321 if (cft->seq_start) {
2330 max_bytes = sizeof(local_buffer) - 1; 2322 return cft->seq_start(seq, ppos);
2331 if (nbytes >= max_bytes) 2323 } else {
2332 return -E2BIG; 2324 /*
2333 /* Allocate a dynamic buffer if we need one */ 2325 * The same behavior and code as single_open(). Returns
2334 if (nbytes >= sizeof(local_buffer)) { 2326 * !NULL if pos is at the beginning; otherwise, NULL.
2335 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2327 */
2336 if (buffer == NULL) 2328 return NULL + !*ppos;
2337 return -ENOMEM;
2338 }
2339 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2340 retval = -EFAULT;
2341 goto out;
2342 } 2329 }
2343
2344 buffer[nbytes] = 0; /* nul-terminate */
2345 retval = cft->write_string(css, cft, strstrip(buffer));
2346 if (!retval)
2347 retval = nbytes;
2348out:
2349 if (buffer != local_buffer)
2350 kfree(buffer);
2351 return retval;
2352} 2330}
2353 2331
2354static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2332static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2355 size_t nbytes, loff_t *ppos)
2356{ 2333{
2357 struct cfent *cfe = __d_cfe(file->f_dentry); 2334 struct cftype *cft = seq_cft(seq);
2358 struct cftype *cft = __d_cft(file->f_dentry);
2359 struct cgroup_subsys_state *css = cfe->css;
2360 2335
2361 if (cft->write) 2336 if (cft->seq_next) {
2362 return cft->write(css, cft, file, buf, nbytes, ppos); 2337 return cft->seq_next(seq, v, ppos);
2363 if (cft->write_u64 || cft->write_s64) 2338 } else {
2364 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2339 /*
2365 if (cft->write_string) 2340 * The same behavior and code as single_open(), always
2366 		return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2341	 * terminates after the initial read.
2367 if (cft->trigger) { 2342 */
2368 int ret = cft->trigger(css, (unsigned int)cft->private); 2343 ++*ppos;
2369 return ret ? ret : nbytes; 2344 return NULL;
2370 } 2345 }
2371 return -EINVAL;
2372} 2346}
2373 2347
2374static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2348static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2375 struct cftype *cft, struct file *file,
2376 char __user *buf, size_t nbytes, loff_t *ppos)
2377{ 2349{
2378 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2350 struct cftype *cft = seq_cft(seq);
2379 u64 val = cft->read_u64(css, cft);
2380 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2381 2351
2382 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2352 if (cft->seq_stop)
2353 cft->seq_stop(seq, v);
2383} 2354}
2384 2355
2385static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2356static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2386 struct cftype *cft, struct file *file,
2387 char __user *buf, size_t nbytes, loff_t *ppos)
2388{ 2357{
2389 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2358 struct cftype *cft = seq_cft(m);
2390 s64 val = cft->read_s64(css, cft); 2359 struct cgroup_subsys_state *css = seq_css(m);
2391 int len = sprintf(tmp, "%lld\n", (long long) val);
2392 2360
2393 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2361 if (cft->seq_show)
2394} 2362 return cft->seq_show(m, arg);
2395 2363
2396static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2397 size_t nbytes, loff_t *ppos)
2398{
2399 struct cfent *cfe = __d_cfe(file->f_dentry);
2400 struct cftype *cft = __d_cft(file->f_dentry);
2401 struct cgroup_subsys_state *css = cfe->css;
2402
2403 if (cft->read)
2404 return cft->read(css, cft, file, buf, nbytes, ppos);
2405 if (cft->read_u64) 2364 if (cft->read_u64)
2406 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2365 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2407 if (cft->read_s64) 2366 else if (cft->read_s64)
2408 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2367 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2409 return -EINVAL; 2368 else
2410} 2369 return -EINVAL;
2411 2370 return 0;
2412/*
2413 * seqfile ops/methods for returning structured data. Currently just
2414 * supports string->u64 maps, but can be extended in future.
2415 */
2416
2417static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2418{
2419 struct seq_file *sf = cb->state;
2420 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2421}
2422
2423static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2424{
2425 struct cfent *cfe = m->private;
2426 struct cftype *cft = cfe->type;
2427 struct cgroup_subsys_state *css = cfe->css;
2428
2429 if (cft->read_map) {
2430 struct cgroup_map_cb cb = {
2431 .fill = cgroup_map_add,
2432 .state = m,
2433 };
2434 return cft->read_map(css, cft, &cb);
2435 }
2436 return cft->read_seq_string(css, cft, m);
2437} 2371}
2438 2372
2439static const struct file_operations cgroup_seqfile_operations = { 2373static struct seq_operations cgroup_seq_operations = {
2440 .read = seq_read, 2374 .start = cgroup_seqfile_start,
2441 .write = cgroup_file_write, 2375 .next = cgroup_seqfile_next,
2442 .llseek = seq_lseek, 2376 .stop = cgroup_seqfile_stop,
2443 .release = cgroup_file_release, 2377 .show = cgroup_seqfile_show,
2444}; 2378};
2445 2379
2446static int cgroup_file_open(struct inode *inode, struct file *file) 2380static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2449,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2449 struct cftype *cft = __d_cft(file->f_dentry); 2383 struct cftype *cft = __d_cft(file->f_dentry);
2450 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2384 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2451 struct cgroup_subsys_state *css; 2385 struct cgroup_subsys_state *css;
2386 struct cgroup_open_file *of;
2452 int err; 2387 int err;
2453 2388
2454 err = generic_file_open(inode, file); 2389 err = generic_file_open(inode, file);
@@ -2478,32 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2478 WARN_ON_ONCE(cfe->css && cfe->css != css); 2413 WARN_ON_ONCE(cfe->css && cfe->css != css);
2479 cfe->css = css; 2414 cfe->css = css;
2480 2415
2481 if (cft->read_map || cft->read_seq_string) { 2416 of = __seq_open_private(file, &cgroup_seq_operations,
2482 file->f_op = &cgroup_seqfile_operations; 2417 sizeof(struct cgroup_open_file));
2483 err = single_open(file, cgroup_seqfile_show, cfe); 2418 if (of) {
2484 } else if (cft->open) { 2419 of->cfe = cfe;
2485 err = cft->open(inode, file); 2420 return 0;
2486 } 2421 }
2487 2422
2488 if (css->ss && err) 2423 if (css->ss)
2489 css_put(css); 2424 css_put(css);
2490 return err; 2425 return -ENOMEM;
2491} 2426}
2492 2427
2493static int cgroup_file_release(struct inode *inode, struct file *file) 2428static int cgroup_file_release(struct inode *inode, struct file *file)
2494{ 2429{
2495 struct cfent *cfe = __d_cfe(file->f_dentry); 2430 struct cfent *cfe = __d_cfe(file->f_dentry);
2496 struct cftype *cft = __d_cft(file->f_dentry);
2497 struct cgroup_subsys_state *css = cfe->css; 2431 struct cgroup_subsys_state *css = cfe->css;
2498 int ret = 0;
2499 2432
2500 if (cft->release)
2501 ret = cft->release(inode, file);
2502 if (css->ss) 2433 if (css->ss)
2503 css_put(css); 2434 css_put(css);
2504 if (file->f_op == &cgroup_seqfile_operations) 2435 return seq_release_private(inode, file);
2505 single_release(inode, file);
2506 return ret;
2507} 2436}
2508 2437
2509/* 2438/*
@@ -2614,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2614} 2543}
2615 2544
2616static const struct file_operations cgroup_file_operations = { 2545static const struct file_operations cgroup_file_operations = {
2617 .read = cgroup_file_read, 2546 .read = seq_read,
2618 .write = cgroup_file_write, 2547 .write = cgroup_file_write,
2619 .llseek = generic_file_llseek, 2548 .llseek = generic_file_llseek,
2620 .open = cgroup_file_open, 2549 .open = cgroup_file_open,
@@ -2639,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2639 .removexattr = cgroup_removexattr, 2568 .removexattr = cgroup_removexattr,
2640}; 2569};
2641 2570
2642/*
2643 * Check if a file is a control file
2644 */
2645static inline struct cftype *__file_cft(struct file *file)
2646{
2647 if (file_inode(file)->i_fop != &cgroup_file_operations)
2648 return ERR_PTR(-EINVAL);
2649 return __d_cft(file->f_dentry);
2650}
2651
2652static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2571static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2653 struct super_block *sb) 2572 struct super_block *sb)
2654{ 2573{
@@ -2706,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2706 if (cft->mode) 2625 if (cft->mode)
2707 return cft->mode; 2626 return cft->mode;
2708 2627
2709 if (cft->read || cft->read_u64 || cft->read_s64 || 2628 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2710 cft->read_map || cft->read_seq_string)
2711 mode |= S_IRUGO; 2629 mode |= S_IRUGO;
2712 2630
2713 if (cft->write || cft->write_u64 || cft->write_s64 || 2631 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2714 cft->write_string || cft->trigger) 2632 cft->trigger)
2715 mode |= S_IWUSR; 2633 mode |= S_IWUSR;
2716 2634
2717 return mode; 2635 return mode;
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
3007 * @parent_css: css whose children to walk 2925 * @parent_css: css whose children to walk
3008 * 2926 *
3009 * This function returns the next child of @parent_css and should be called 2927 * This function returns the next child of @parent_css and should be called
3010 * under RCU read lock. The only requirement is that @parent_css and 2928 * under either cgroup_mutex or RCU read lock. The only requirement is
3011 * @pos_css are accessible. The next sibling is guaranteed to be returned 2929 * that @parent_css and @pos_css are accessible. The next sibling is
3012 * regardless of their states. 2930 * guaranteed to be returned regardless of their states.
3013 */ 2931 */
3014struct cgroup_subsys_state * 2932struct cgroup_subsys_state *
3015css_next_child(struct cgroup_subsys_state *pos_css, 2933css_next_child(struct cgroup_subsys_state *pos_css,
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
3019 struct cgroup *cgrp = parent_css->cgroup; 2937 struct cgroup *cgrp = parent_css->cgroup;
3020 struct cgroup *next; 2938 struct cgroup *next;
3021 2939
3022 WARN_ON_ONCE(!rcu_read_lock_held()); 2940 cgroup_assert_mutex_or_rcu_locked();
3023 2941
3024 /* 2942 /*
3025 * @pos could already have been removed. Once a cgroup is removed, 2943 * @pos could already have been removed. Once a cgroup is removed,
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
3066 * to visit for pre-order traversal of @root's descendants. @root is 2984 * to visit for pre-order traversal of @root's descendants. @root is
3067 * included in the iteration and the first node to be visited. 2985 * included in the iteration and the first node to be visited.
3068 * 2986 *
3069 * While this function requires RCU read locking, it doesn't require the 2987 * While this function requires cgroup_mutex or RCU read locking, it
3070 * whole traversal to be contained in a single RCU critical section. This 2988 * doesn't require the whole traversal to be contained in a single critical
3071 * function will return the correct next descendant as long as both @pos 2989 * section. This function will return the correct next descendant as long
3072 * and @root are accessible and @pos is a descendant of @root. 2990 * as both @pos and @root are accessible and @pos is a descendant of @root.
3073 */ 2991 */
3074struct cgroup_subsys_state * 2992struct cgroup_subsys_state *
3075css_next_descendant_pre(struct cgroup_subsys_state *pos, 2993css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3077{ 2995{
3078 struct cgroup_subsys_state *next; 2996 struct cgroup_subsys_state *next;
3079 2997
3080 WARN_ON_ONCE(!rcu_read_lock_held()); 2998 cgroup_assert_mutex_or_rcu_locked();
3081 2999
3082 /* if first iteration, visit @root */ 3000 /* if first iteration, visit @root */
3083 if (!pos) 3001 if (!pos)
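
The relaxed rule means the same walk is now valid under either cgroup_mutex or RCU. A sketch of a pre-order walk under RCU (illustrative; callers typically use the css_for_each_descendant_pre() wrapper rather than open-coding this):

	struct cgroup_subsys_state *pos = NULL;

	rcu_read_lock();
	while ((pos = css_next_descendant_pre(pos, root_css))) {
		/* @root_css is visited first; @pos may be mid-destruction */
		demo_visit(pos);	/* hypothetical per-css callback */
	}
	rcu_read_unlock();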
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3108 * is returned. This can be used during pre-order traversal to skip 3026 * is returned. This can be used during pre-order traversal to skip
3109 * subtree of @pos. 3027 * subtree of @pos.
3110 * 3028 *
3111 * While this function requires RCU read locking, it doesn't require the 3029 * While this function requires cgroup_mutex or RCU read locking, it
3112 * whole traversal to be contained in a single RCU critical section. This 3030 * doesn't require the whole traversal to be contained in a single critical
3113 * function will return the correct rightmost descendant as long as @pos is 3031 * section. This function will return the correct rightmost descendant as
3114 * accessible. 3032 * long as @pos is accessible.
3115 */ 3033 */
3116struct cgroup_subsys_state * 3034struct cgroup_subsys_state *
3117css_rightmost_descendant(struct cgroup_subsys_state *pos) 3035css_rightmost_descendant(struct cgroup_subsys_state *pos)
3118{ 3036{
3119 struct cgroup_subsys_state *last, *tmp; 3037 struct cgroup_subsys_state *last, *tmp;
3120 3038
3121 WARN_ON_ONCE(!rcu_read_lock_held()); 3039 cgroup_assert_mutex_or_rcu_locked();
3122 3040
3123 do { 3041 do {
3124 last = pos; 3042 last = pos;
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
3154 * to visit for post-order traversal of @root's descendants. @root is 3072 * to visit for post-order traversal of @root's descendants. @root is
3155 * included in the iteration and the last node to be visited. 3073 * included in the iteration and the last node to be visited.
3156 * 3074 *
3157 * While this function requires RCU read locking, it doesn't require the 3075 * While this function requires cgroup_mutex or RCU read locking, it
3158 * whole traversal to be contained in a single RCU critical section. This 3076 * doesn't require the whole traversal to be contained in a single critical
3159 * function will return the correct next descendant as long as both @pos 3077 * section. This function will return the correct next descendant as long
3160 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3078 * as both @pos and @cgroup are accessible and @pos is a descendant of
3079 * @cgroup.
3161 */ 3080 */
3162struct cgroup_subsys_state * 3081struct cgroup_subsys_state *
3163css_next_descendant_post(struct cgroup_subsys_state *pos, 3082css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3165{ 3084{
3166 struct cgroup_subsys_state *next; 3085 struct cgroup_subsys_state *next;
3167 3086
3168 WARN_ON_ONCE(!rcu_read_lock_held()); 3087 cgroup_assert_mutex_or_rcu_locked();
3169 3088
3170 /* if first iteration, visit leftmost descendant which may be @root */ 3089 /* if first iteration, visit leftmost descendant which may be @root */
3171 if (!pos) 3090 if (!pos)
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
3504 pid_t *list; 3423 pid_t *list;
3505 /* how many elements the above list has */ 3424 /* how many elements the above list has */
3506 int length; 3425 int length;
3507 /* how many files are using the current array */
3508 int use_count;
3509 /* each of these stored in a list by its cgroup */ 3426 /* each of these stored in a list by its cgroup */
3510 struct list_head links; 3427 struct list_head links;
3511 /* pointer to the cgroup we belong to, for list removal purposes */ 3428 /* pointer to the cgroup we belong to, for list removal purposes */
3512 struct cgroup *owner; 3429 struct cgroup *owner;
3513 /* protects the other fields */ 3430 /* for delayed destruction */
3514 struct rw_semaphore rwsem; 3431 struct delayed_work destroy_dwork;
3515}; 3432};
3516 3433
3517/* 3434/*
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
3527 else 3444 else
3528 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3445 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3529} 3446}
3447
3530static void pidlist_free(void *p) 3448static void pidlist_free(void *p)
3531{ 3449{
3532 if (is_vmalloc_addr(p)) 3450 if (is_vmalloc_addr(p))
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p)
3536} 3454}
3537 3455
3538/* 3456/*
 3457 * Used to destroy all pidlists lingering, waiting for the destroy timer. None
3458 * should be left afterwards.
3459 */
3460static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3461{
3462 struct cgroup_pidlist *l, *tmp_l;
3463
3464 mutex_lock(&cgrp->pidlist_mutex);
3465 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3466 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3467 mutex_unlock(&cgrp->pidlist_mutex);
3468
3469 flush_workqueue(cgroup_pidlist_destroy_wq);
3470 BUG_ON(!list_empty(&cgrp->pidlists));
3471}
3472
3473static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3474{
3475 struct delayed_work *dwork = to_delayed_work(work);
3476 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3477 destroy_dwork);
3478 struct cgroup_pidlist *tofree = NULL;
3479
3480 mutex_lock(&l->owner->pidlist_mutex);
3481
3482 /*
3483 * Destroy iff we didn't get queued again. The state won't change
3484 * as destroy_dwork can only be queued while locked.
3485 */
3486 if (!delayed_work_pending(dwork)) {
3487 list_del(&l->links);
3488 pidlist_free(l->list);
3489 put_pid_ns(l->key.ns);
3490 tofree = l;
3491 }
3492
3493 mutex_unlock(&l->owner->pidlist_mutex);
3494 kfree(tofree);
3495}
3496
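
The lifetime protocol around destroy_dwork is: every read path re-arms it with the full delay in cgroup_pidlist_stop(), cgroup_pidlist_destroy_all() re-arms it with zero delay and flushes, and the work function above frees only if nobody re-queued it in the meantime. Condensed from the call sites in this patch:

	/* reader done: keep the pidlist cached for another second */
	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
			 CGROUP_PIDLIST_DESTROY_DELAY);

	/* cgroup removal: expedite all lingering pidlists, then wait */
	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	flush_workqueue(cgroup_pidlist_destroy_wq);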
3497/*
3539 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3498 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3540 * Returns the number of unique elements. 3499 * Returns the number of unique elements.
3541 */ 3500 */
@@ -3565,52 +3524,92 @@ after:
3565 return dest; 3524 return dest;
3566} 3525}
3567 3526
3527/*
 3528 * The two pid files - tasks and cgroup.procs - guaranteed that the result
 3529 * is sorted, which forced this whole pidlist fiasco. As pid order is
 3530 * different per namespace, each namespace needs a differently sorted list,
 3531 * making it impossible to use, for example, a single rbtree of member tasks
 3532 * sorted by task pointer. As pidlists can be fairly large, allocating one
 3533 * per open file is dangerous, so cgroup had to implement a shared pool of
 3534 * pidlists keyed by cgroup and namespace.
 3535 *
 3536 * All this extra complexity was caused by the original implementation
 3537 * committing to an entirely unnecessary property. In the long term, we
 3538 * want to do away with it. Explicitly scramble the sort order under
 3539 * sane_behavior so that no such expectation exists in the new interface.
 3540 *
 3541 * Scrambling is done by swapping every two consecutive bits, a
 3542 * non-identity one-to-one mapping which disturbs the sort order sufficiently.
3543 */
3544static pid_t pid_fry(pid_t pid)
3545{
3546 unsigned a = pid & 0x55555555;
3547 unsigned b = pid & 0xAAAAAAAA;
3548
3549 return (a << 1) | (b >> 1);
3550}
3551
3552static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3553{
3554 if (cgroup_sane_behavior(cgrp))
3555 return pid_fry(pid);
3556 else
3557 return pid;
3558}
3559
3568static int cmppid(const void *a, const void *b) 3560static int cmppid(const void *a, const void *b)
3569{ 3561{
3570 return *(pid_t *)a - *(pid_t *)b; 3562 return *(pid_t *)a - *(pid_t *)b;
3571} 3563}
3572 3564
3565static int fried_cmppid(const void *a, const void *b)
3566{
3567 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3568}
3569
3570static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3571 enum cgroup_filetype type)
3572{
3573 struct cgroup_pidlist *l;
3574 /* don't need task_nsproxy() if we're looking at ourself */
3575 struct pid_namespace *ns = task_active_pid_ns(current);
3576
3577 lockdep_assert_held(&cgrp->pidlist_mutex);
3578
3579 list_for_each_entry(l, &cgrp->pidlists, links)
3580 if (l->key.type == type && l->key.ns == ns)
3581 return l;
3582 return NULL;
3583}
3584
3573/* 3585/*
3574 * find the appropriate pidlist for our purpose (given procs vs tasks) 3586 * find the appropriate pidlist for our purpose (given procs vs tasks)
3575 * returns with the lock on that pidlist already held, and takes care 3587 * returns with the lock on that pidlist already held, and takes care
3576 * of the use count, or returns NULL with no locks held if we're out of 3588 * of the use count, or returns NULL with no locks held if we're out of
3577 * memory. 3589 * memory.
3578 */ 3590 */
3579static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3591static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3580 enum cgroup_filetype type) 3592 enum cgroup_filetype type)
3581{ 3593{
3582 struct cgroup_pidlist *l; 3594 struct cgroup_pidlist *l;
3583 /* don't need task_nsproxy() if we're looking at ourself */
3584 struct pid_namespace *ns = task_active_pid_ns(current);
3585 3595
3586 /* 3596 lockdep_assert_held(&cgrp->pidlist_mutex);
3587 * We can't drop the pidlist_mutex before taking the l->rwsem in case 3597
3588 * the last ref-holder is trying to remove l from the list at the same 3598 l = cgroup_pidlist_find(cgrp, type);
3589 * time. Holding the pidlist_mutex precludes somebody taking whichever 3599 if (l)
3590 * list we find out from under us - compare release_pid_array(). 3600 return l;
3591 */ 3601
3592 mutex_lock(&cgrp->pidlist_mutex);
3593 list_for_each_entry(l, &cgrp->pidlists, links) {
3594 if (l->key.type == type && l->key.ns == ns) {
3595 /* make sure l doesn't vanish out from under us */
3596 down_write(&l->rwsem);
3597 mutex_unlock(&cgrp->pidlist_mutex);
3598 return l;
3599 }
3600 }
3601 /* entry not found; create a new one */ 3602 /* entry not found; create a new one */
3602 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3603 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3603 if (!l) { 3604 if (!l)
3604 mutex_unlock(&cgrp->pidlist_mutex);
3605 return l; 3605 return l;
3606 } 3606
3607 init_rwsem(&l->rwsem); 3607 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3608 down_write(&l->rwsem);
3609 l->key.type = type; 3608 l->key.type = type;
3610 l->key.ns = get_pid_ns(ns); 3609 /* don't need task_nsproxy() if we're looking at ourself */
3610 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3611 l->owner = cgrp; 3611 l->owner = cgrp;
3612 list_add(&l->links, &cgrp->pidlists); 3612 list_add(&l->links, &cgrp->pidlists);
3613 mutex_unlock(&cgrp->pidlist_mutex);
3614 return l; 3613 return l;
3615} 3614}
3616 3615
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3627 struct task_struct *tsk; 3626 struct task_struct *tsk;
3628 struct cgroup_pidlist *l; 3627 struct cgroup_pidlist *l;
3629 3628
3629 lockdep_assert_held(&cgrp->pidlist_mutex);
3630
3630 /* 3631 /*
3631 * If cgroup gets more users after we read count, we won't have 3632 * If cgroup gets more users after we read count, we won't have
3632 * enough space - tough. This race is indistinguishable to the 3633 * enough space - tough. This race is indistinguishable to the
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3653 css_task_iter_end(&it); 3654 css_task_iter_end(&it);
3654 length = n; 3655 length = n;
3655 /* now sort & (if procs) strip out duplicates */ 3656 /* now sort & (if procs) strip out duplicates */
3656 sort(array, length, sizeof(pid_t), cmppid, NULL); 3657 if (cgroup_sane_behavior(cgrp))
3658 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3659 else
3660 sort(array, length, sizeof(pid_t), cmppid, NULL);
3657 if (type == CGROUP_FILE_PROCS) 3661 if (type == CGROUP_FILE_PROCS)
3658 length = pidlist_uniq(array, length); 3662 length = pidlist_uniq(array, length);
3659 l = cgroup_pidlist_find(cgrp, type); 3663
3664 l = cgroup_pidlist_find_create(cgrp, type);
3660 if (!l) { 3665 if (!l) {
3666 mutex_unlock(&cgrp->pidlist_mutex);
3661 pidlist_free(array); 3667 pidlist_free(array);
3662 return -ENOMEM; 3668 return -ENOMEM;
3663 } 3669 }
3664 /* store array, freeing old if necessary - lock already held */ 3670
3671 /* store array, freeing old if necessary */
3665 pidlist_free(l->list); 3672 pidlist_free(l->list);
3666 l->list = array; 3673 l->list = array;
3667 l->length = length; 3674 l->length = length;
3668 l->use_count++;
3669 up_write(&l->rwsem);
3670 *lp = l; 3675 *lp = l;
3671 return 0; 3676 return 0;
3672} 3677}
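
pid_fry() swaps adjacent bit pairs, so applying it twice restores the original value: it is a self-inverse one-to-one mapping, which is all the scrambled sort needs. That property can be checked in plain userspace C (standalone demo, not part of the patch):

	#include <assert.h>
	#include <stdio.h>
	#include <sys/types.h>

	static pid_t pid_fry(pid_t pid)
	{
		unsigned a = pid & 0x55555555;	/* even bits */
		unsigned b = pid & 0xAAAAAAAA;	/* odd bits */

		return (a << 1) | (b >> 1);	/* swap each adjacent pair */
	}

	int main(void)
	{
		pid_t pid;

		/* self-inverse: frying twice is the identity */
		for (pid = 0; pid < 1 << 20; pid++)
			assert(pid_fry(pid_fry(pid)) == pid);

		printf("1000 -> %d -> %d\n", pid_fry(1000),
		       pid_fry(pid_fry(1000)));
		return 0;
	}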
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3740 * after a seek to the start). Use a binary-search to find the 3745 * after a seek to the start). Use a binary-search to find the
3741 * next pid to display, if any 3746 * next pid to display, if any
3742 */ 3747 */
3743 struct cgroup_pidlist *l = s->private; 3748 struct cgroup_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private;
3744 int index = 0, pid = *pos; 3752 int index = 0, pid = *pos;
3745 int *iter; 3753 int *iter, ret;
3754
3755 mutex_lock(&cgrp->pidlist_mutex);
3756
3757 /*
3758 * !NULL @of->priv indicates that this isn't the first start()
3759 * after open. If the matching pidlist is around, we can use that.
3760 * Look for it. Note that @of->priv can't be used directly. It
3761 * could already have been destroyed.
3762 */
3763 if (of->priv)
3764 of->priv = cgroup_pidlist_find(cgrp, type);
3765
3766 /*
3767 * Either this is the first start() after open or the matching
3768 * pidlist has been destroyed inbetween. Create a new one.
3769 */
3770 if (!of->priv) {
3771 ret = pidlist_array_load(cgrp, type,
3772 (struct cgroup_pidlist **)&of->priv);
3773 if (ret)
3774 return ERR_PTR(ret);
3775 }
3776 l = of->priv;
3746 3777
3747 down_read(&l->rwsem);
3748 if (pid) { 3778 if (pid) {
3749 int end = l->length; 3779 int end = l->length;
3750 3780
3751 while (index < end) { 3781 while (index < end) {
3752 int mid = (index + end) / 2; 3782 int mid = (index + end) / 2;
3753 if (l->list[mid] == pid) { 3783 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3754 index = mid; 3784 index = mid;
3755 break; 3785 break;
3756 } else if (l->list[mid] <= pid) 3786 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3757 index = mid + 1; 3787 index = mid + 1;
3758 else 3788 else
3759 end = mid; 3789 end = mid;
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3764 return NULL; 3794 return NULL;
3765 /* Update the abstract position to be the actual pid that we found */ 3795 /* Update the abstract position to be the actual pid that we found */
3766 iter = l->list + index; 3796 iter = l->list + index;
3767 *pos = *iter; 3797 *pos = cgroup_pid_fry(cgrp, *iter);
3768 return iter; 3798 return iter;
3769} 3799}
3770 3800
3771static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3801static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3772{ 3802{
3773 struct cgroup_pidlist *l = s->private; 3803 struct cgroup_open_file *of = s->private;
3774 up_read(&l->rwsem); 3804 struct cgroup_pidlist *l = of->priv;
3805
3806 if (l)
3807 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3808 CGROUP_PIDLIST_DESTROY_DELAY);
3809 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3775} 3810}
3776 3811
3777static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3778{ 3813{
3779 struct cgroup_pidlist *l = s->private; 3814 struct cgroup_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv;
3780 pid_t *p = v; 3816 pid_t *p = v;
3781 pid_t *end = l->list + l->length; 3817 pid_t *end = l->list + l->length;
3782 /* 3818 /*
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3787 if (p >= end) { 3823 if (p >= end) {
3788 return NULL; 3824 return NULL;
3789 } else { 3825 } else {
3790 *pos = *p; 3826 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3791 return p; 3827 return p;
3792 } 3828 }
3793} 3829}
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
3808 .show = cgroup_pidlist_show, 3844 .show = cgroup_pidlist_show,
3809}; 3845};
3810 3846
3811static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3812{
3813 /*
3814 * the case where we're the last user of this particular pidlist will
3815 * have us remove it from the cgroup's list, which entails taking the
3816 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3817 * pidlist_mutex, we have to take pidlist_mutex first.
3818 */
3819 mutex_lock(&l->owner->pidlist_mutex);
3820 down_write(&l->rwsem);
3821 BUG_ON(!l->use_count);
3822 if (!--l->use_count) {
3823 /* we're the last user if refcount is 0; remove and free */
3824 list_del(&l->links);
3825 mutex_unlock(&l->owner->pidlist_mutex);
3826 pidlist_free(l->list);
3827 put_pid_ns(l->key.ns);
3828 up_write(&l->rwsem);
3829 kfree(l);
3830 return;
3831 }
3832 mutex_unlock(&l->owner->pidlist_mutex);
3833 up_write(&l->rwsem);
3834}
3835
3836static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3837{
3838 struct cgroup_pidlist *l;
3839 if (!(file->f_mode & FMODE_READ))
3840 return 0;
3841 /*
3842 * the seq_file will only be initialized if the file was opened for
3843 * reading; hence we check if it's not null only in that case.
3844 */
3845 l = ((struct seq_file *)file->private_data)->private;
3846 cgroup_release_pid_array(l);
3847 return seq_release(inode, file);
3848}
3849
3850static const struct file_operations cgroup_pidlist_operations = {
3851 .read = seq_read,
3852 .llseek = seq_lseek,
3853 .write = cgroup_file_write,
3854 .release = cgroup_pidlist_release,
3855};
3856
3857/*
3858 * The following functions handle opens on a file that displays a pidlist
3859 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3860 * in the cgroup.
3861 */
3862/* helper function for the two below it */
3863static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3864{
3865 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3866 struct cgroup_pidlist *l;
3867 int retval;
3868
3869 /* Nothing to do for write-only files */
3870 if (!(file->f_mode & FMODE_READ))
3871 return 0;
3872
3873 /* have the array populated */
3874 retval = pidlist_array_load(cgrp, type, &l);
3875 if (retval)
3876 return retval;
3877 /* configure file information */
3878 file->f_op = &cgroup_pidlist_operations;
3879
3880 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3881 if (retval) {
3882 cgroup_release_pid_array(l);
3883 return retval;
3884 }
3885 ((struct seq_file *)file->private_data)->private = l;
3886 return 0;
3887}
3888static int cgroup_tasks_open(struct inode *unused, struct file *file)
3889{
3890 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3891}
3892static int cgroup_procs_open(struct inode *unused, struct file *file)
3893{
3894 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3895}
3896
3897static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3847static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3898 struct cftype *cft) 3848 struct cftype *cft)
3899{ 3849{
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3928 deactivate_super(sb); 3878 deactivate_super(sb);
3929} 3879}
3930 3880
3931/*
3932 * Unregister event and free resources.
3933 *
3934 * Gets called from workqueue.
3935 */
3936static void cgroup_event_remove(struct work_struct *work)
3937{
3938 struct cgroup_event *event = container_of(work, struct cgroup_event,
3939 remove);
3940 struct cgroup_subsys_state *css = event->css;
3941
3942 remove_wait_queue(event->wqh, &event->wait);
3943
3944 event->cft->unregister_event(css, event->cft, event->eventfd);
3945
3946 /* Notify userspace the event is going away. */
3947 eventfd_signal(event->eventfd, 1);
3948
3949 eventfd_ctx_put(event->eventfd);
3950 kfree(event);
3951 css_put(css);
3952}
3953
3954/*
3955 * Gets called on POLLHUP on eventfd when user closes it.
3956 *
3957 * Called with wqh->lock held and interrupts disabled.
3958 */
3959static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3960 int sync, void *key)
3961{
3962 struct cgroup_event *event = container_of(wait,
3963 struct cgroup_event, wait);
3964 struct cgroup *cgrp = event->css->cgroup;
3965 unsigned long flags = (unsigned long)key;
3966
3967 if (flags & POLLHUP) {
3968 /*
3969 * If the event has been detached at cgroup removal, we
3970 * can simply return knowing the other side will cleanup
3971 * for us.
3972 *
3973 * We can't race against event freeing since the other
3974 * side will require wqh->lock via remove_wait_queue(),
3975 * which we hold.
3976 */
3977 spin_lock(&cgrp->event_list_lock);
3978 if (!list_empty(&event->list)) {
3979 list_del_init(&event->list);
3980 /*
3981 * We are in atomic context, but cgroup_event_remove()
3982 * may sleep, so we have to call it in workqueue.
3983 */
3984 schedule_work(&event->remove);
3985 }
3986 spin_unlock(&cgrp->event_list_lock);
3987 }
3988
3989 return 0;
3990}
3991
3992static void cgroup_event_ptable_queue_proc(struct file *file,
3993 wait_queue_head_t *wqh, poll_table *pt)
3994{
3995 struct cgroup_event *event = container_of(pt,
3996 struct cgroup_event, pt);
3997
3998 event->wqh = wqh;
3999 add_wait_queue(wqh, &event->wait);
4000}
4001
4002/*
4003 * Parse input and register new cgroup event handler.
4004 *
4005 * Input must be in format '<event_fd> <control_fd> <args>'.
4006 * Interpretation of args is defined by control file implementation.
4007 */
4008static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4009 struct cftype *cft, const char *buffer)
4010{
4011 struct cgroup *cgrp = dummy_css->cgroup;
4012 struct cgroup_event *event;
4013 struct cgroup_subsys_state *cfile_css;
4014 unsigned int efd, cfd;
4015 struct fd efile;
4016 struct fd cfile;
4017 char *endp;
4018 int ret;
4019
4020 efd = simple_strtoul(buffer, &endp, 10);
4021 if (*endp != ' ')
4022 return -EINVAL;
4023 buffer = endp + 1;
4024
4025 cfd = simple_strtoul(buffer, &endp, 10);
4026 if ((*endp != ' ') && (*endp != '\0'))
4027 return -EINVAL;
4028 buffer = endp + 1;
4029
4030 event = kzalloc(sizeof(*event), GFP_KERNEL);
4031 if (!event)
4032 return -ENOMEM;
4033
4034 INIT_LIST_HEAD(&event->list);
4035 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4036 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4037 INIT_WORK(&event->remove, cgroup_event_remove);
4038
4039 efile = fdget(efd);
4040 if (!efile.file) {
4041 ret = -EBADF;
4042 goto out_kfree;
4043 }
4044
4045 event->eventfd = eventfd_ctx_fileget(efile.file);
4046 if (IS_ERR(event->eventfd)) {
4047 ret = PTR_ERR(event->eventfd);
4048 goto out_put_efile;
4049 }
4050
4051 cfile = fdget(cfd);
4052 if (!cfile.file) {
4053 ret = -EBADF;
4054 goto out_put_eventfd;
4055 }
4056
4057 /* the process need read permission on control file */
4058 /* AV: shouldn't we check that it's been opened for read instead? */
4059 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4060 if (ret < 0)
4061 goto out_put_cfile;
4062
4063 event->cft = __file_cft(cfile.file);
4064 if (IS_ERR(event->cft)) {
4065 ret = PTR_ERR(event->cft);
4066 goto out_put_cfile;
4067 }
4068
4069 if (!event->cft->ss) {
4070 ret = -EBADF;
4071 goto out_put_cfile;
4072 }
4073
4074 /*
4075 * Determine the css of @cfile, verify it belongs to the same
4076 * cgroup as cgroup.event_control, and associate @event with it.
4077 * Remaining events are automatically removed on cgroup destruction
4078 * but the removal is asynchronous, so take an extra ref.
4079 */
4080 rcu_read_lock();
4081
4082 ret = -EINVAL;
4083 event->css = cgroup_css(cgrp, event->cft->ss);
4084 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4085 if (event->css && event->css == cfile_css && css_tryget(event->css))
4086 ret = 0;
4087
4088 rcu_read_unlock();
4089 if (ret)
4090 goto out_put_cfile;
4091
4092 if (!event->cft->register_event || !event->cft->unregister_event) {
4093 ret = -EINVAL;
4094 goto out_put_css;
4095 }
4096
4097 ret = event->cft->register_event(event->css, event->cft,
4098 event->eventfd, buffer);
4099 if (ret)
4100 goto out_put_css;
4101
4102 efile.file->f_op->poll(efile.file, &event->pt);
4103
4104 spin_lock(&cgrp->event_list_lock);
4105 list_add(&event->list, &cgrp->event_list);
4106 spin_unlock(&cgrp->event_list_lock);
4107
4108 fdput(cfile);
4109 fdput(efile);
4110
4111 return 0;
4112
4113out_put_css:
4114 css_put(event->css);
4115out_put_cfile:
4116 fdput(cfile);
4117out_put_eventfd:
4118 eventfd_ctx_put(event->eventfd);
4119out_put_efile:
4120 fdput(efile);
4121out_kfree:
4122 kfree(event);
4123
4124 return ret;
4125}
4126
4127static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4128 struct cftype *cft) 3882 struct cftype *cft)
4129{ 3883{
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4143static struct cftype cgroup_base_files[] = { 3897static struct cftype cgroup_base_files[] = {
4144 { 3898 {
4145 .name = "cgroup.procs", 3899 .name = "cgroup.procs",
4146 .open = cgroup_procs_open, 3900 .seq_start = cgroup_pidlist_start,
3901 .seq_next = cgroup_pidlist_next,
3902 .seq_stop = cgroup_pidlist_stop,
3903 .seq_show = cgroup_pidlist_show,
3904 .private = CGROUP_FILE_PROCS,
4147 .write_u64 = cgroup_procs_write, 3905 .write_u64 = cgroup_procs_write,
4148 .release = cgroup_pidlist_release,
4149 .mode = S_IRUGO | S_IWUSR, 3906 .mode = S_IRUGO | S_IWUSR,
4150 }, 3907 },
4151 { 3908 {
4152 .name = "cgroup.event_control",
4153 .write_string = cgroup_write_event_control,
4154 .mode = S_IWUGO,
4155 },
4156 {
4157 .name = "cgroup.clone_children", 3909 .name = "cgroup.clone_children",
4158 .flags = CFTYPE_INSANE, 3910 .flags = CFTYPE_INSANE,
4159 .read_u64 = cgroup_clone_children_read, 3911 .read_u64 = cgroup_clone_children_read,
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
4162 { 3914 {
4163 .name = "cgroup.sane_behavior", 3915 .name = "cgroup.sane_behavior",
4164 .flags = CFTYPE_ONLY_ON_ROOT, 3916 .flags = CFTYPE_ONLY_ON_ROOT,
4165 .read_seq_string = cgroup_sane_behavior_show, 3917 .seq_show = cgroup_sane_behavior_show,
4166 }, 3918 },
4167 3919
4168 /* 3920 /*
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
4173 { 3925 {
4174 .name = "tasks", 3926 .name = "tasks",
4175 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3927 .flags = CFTYPE_INSANE, /* use "procs" instead */
4176 .open = cgroup_tasks_open, 3928 .seq_start = cgroup_pidlist_start,
3929 .seq_next = cgroup_pidlist_next,
3930 .seq_stop = cgroup_pidlist_stop,
3931 .seq_show = cgroup_pidlist_show,
3932 .private = CGROUP_FILE_TASKS,
4177 .write_u64 = cgroup_tasks_write, 3933 .write_u64 = cgroup_tasks_write,
4178 .release = cgroup_pidlist_release,
4179 .mode = S_IRUGO | S_IWUSR, 3934 .mode = S_IRUGO | S_IWUSR,
4180 }, 3935 },
4181 { 3936 {
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
4187 { 3942 {
4188 .name = "release_agent", 3943 .name = "release_agent",
4189 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4190 .read_seq_string = cgroup_release_agent_show, 3945 .seq_show = cgroup_release_agent_show,
4191 .write_string = cgroup_release_agent_write, 3946 .write_string = cgroup_release_agent_write,
4192 .max_write_len = PATH_MAX, 3947 .max_write_len = PATH_MAX,
4193 }, 3948 },
@@ -4333,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css)
4333 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4334} 4089}
4335 4090
4091/**
4092 * create_css - create a cgroup_subsys_state
4093 * @cgrp: the cgroup new css will be associated with
4094 * @ss: the subsys of new css
4095 *
4096 * Create a new css associated with @cgrp - @ss pair. On success, the new
4097 * css is online and installed in @cgrp with all interface files created.
4098 * Returns 0 on success, -errno on failure.
4099 */
4100static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4101{
4102 struct cgroup *parent = cgrp->parent;
4103 struct cgroup_subsys_state *css;
4104 int err;
4105
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex);
4108
4109 css = ss->css_alloc(cgroup_css(parent, ss));
4110 if (IS_ERR(css))
4111 return PTR_ERR(css);
4112
4113 err = percpu_ref_init(&css->refcnt, css_release);
4114 if (err)
4115 goto err_free;
4116
4117 init_css(css, ss, cgrp);
4118
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
4120 if (err)
4121 goto err_free;
4122
4123 err = online_css(css);
4124 if (err)
4125 goto err_free;
4126
4127 dget(cgrp->dentry);
4128 css_get(css->parent);
4129
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4133 current->comm, current->pid, ss->name);
4134 if (!strcmp(ss->name, "memory"))
4135 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4136 ss->warned_broken_hierarchy = true;
4137 }
4138
4139 return 0;
4140
4141err_free:
4142 percpu_ref_cancel_init(&css->refcnt);
4143 ss->css_free(css);
4144 return err;
4145}
4146
4336/* 4147/*
4337 * cgroup_create - create a cgroup 4148 * cgroup_create - create a cgroup
4338 * @parent: cgroup that will be parent of the new cgroup 4149 * @parent: cgroup that will be parent of the new cgroup
@@ -4344,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css)
4344static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4155static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4345 umode_t mode) 4156 umode_t mode)
4346{ 4157{
4347 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4348 struct cgroup *cgrp; 4158 struct cgroup *cgrp;
4349 struct cgroup_name *name; 4159 struct cgroup_name *name;
4350 struct cgroupfs_root *root = parent->root; 4160 struct cgroupfs_root *root = parent->root;
4351 int err = 0; 4161 int ssid, err = 0;
4352 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
4353 struct super_block *sb = root->sb; 4163 struct super_block *sb = root->sb;
4354 4164
@@ -4404,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4404 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4214 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4405 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4215 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4406 4216
4407 for_each_root_subsys(root, ss) {
4408 struct cgroup_subsys_state *css;
4409
4410 css = ss->css_alloc(cgroup_css(parent, ss));
4411 if (IS_ERR(css)) {
4412 err = PTR_ERR(css);
4413 goto err_free_all;
4414 }
4415 css_ar[ss->subsys_id] = css;
4416
4417 err = percpu_ref_init(&css->refcnt, css_release);
4418 if (err)
4419 goto err_free_all;
4420
4421 init_css(css, ss, cgrp);
4422 }
4423
4424 /* 4217 /*
4425 * Create directory. cgroup_create_file() returns with the new 4218 * Create directory. cgroup_create_file() returns with the new
4426 * directory locked on success so that it can be populated without 4219 * directory locked on success so that it can be populated without
@@ -4428,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4428 */ 4221 */
4429 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4222 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4430 if (err < 0) 4223 if (err < 0)
4431 goto err_free_all; 4224 goto err_unlock;
4432 lockdep_assert_held(&dentry->d_inode->i_mutex); 4225 lockdep_assert_held(&dentry->d_inode->i_mutex);
4433 4226
4434 cgrp->serial_nr = cgroup_serial_nr_next++; 4227 cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4440,55 +4233,31 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4440 /* hold a ref to the parent's dentry */ 4233 /* hold a ref to the parent's dentry */
4441 dget(parent->dentry); 4234 dget(parent->dentry);
4442 4235
4443 /* creation succeeded, notify subsystems */ 4236 /*
4444 for_each_root_subsys(root, ss) { 4237 * @cgrp is now fully operational. If something fails after this
4445 struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4238 * point, it'll be released via the normal destruction path.
4446 4239 */
4447 err = online_css(css);
4448 if (err)
4449 goto err_destroy;
4450
4451 /* each css holds a ref to the cgroup's dentry and parent css */
4452 dget(dentry);
4453 css_get(css->parent);
4454
4455 /* mark it consumed for error path */
4456 css_ar[ss->subsys_id] = NULL;
4457
4458 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4459 parent->parent) {
4460 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4461 current->comm, current->pid, ss->name);
4462 if (!strcmp(ss->name, "memory"))
4463 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4464 ss->warned_broken_hierarchy = true;
4465 }
4466 }
4467
4468 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4240 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4469 4241
4470 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4242 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4471 if (err) 4243 if (err)
4472 goto err_destroy; 4244 goto err_destroy;
4473 4245
4474 err = cgroup_populate_dir(cgrp, root->subsys_mask); 4246 /* let's create and online css's */
4475 if (err) 4247 for_each_subsys(ss, ssid) {
4476 goto err_destroy; 4248 if (root->subsys_mask & (1 << ssid)) {
4249 err = create_css(cgrp, ss);
4250 if (err)
4251 goto err_destroy;
4252 }
4253 }
4477 4254
4478 mutex_unlock(&cgroup_mutex); 4255 mutex_unlock(&cgroup_mutex);
4479 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4256 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4480 4257
4481 return 0; 4258 return 0;
4482 4259
4483err_free_all: 4260err_unlock:
4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4487 if (css) {
4488 percpu_ref_cancel_init(&css->refcnt);
4489 ss->css_free(css);
4490 }
4491 }
4492 mutex_unlock(&cgroup_mutex); 4261 mutex_unlock(&cgroup_mutex);
4493 /* Release the reference count that we took on the superblock */ 4262 /* Release the reference count that we took on the superblock */
4494 deactivate_super(sb); 4263 deactivate_super(sb);
@@ -4501,14 +4270,6 @@ err_free_cgrp:
4501 return err; 4270 return err;
4502 4271
4503err_destroy: 4272err_destroy:
4504 for_each_root_subsys(root, ss) {
4505 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4506
4507 if (css) {
4508 percpu_ref_cancel_init(&css->refcnt);
4509 ss->css_free(css);
4510 }
4511 }
4512 cgroup_destroy_locked(cgrp); 4273 cgroup_destroy_locked(cgrp);
4513 mutex_unlock(&cgroup_mutex); 4274 mutex_unlock(&cgroup_mutex);
4514 mutex_unlock(&dentry->d_inode->i_mutex); 4275 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4631,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4631 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4392 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4632{ 4393{
4633 struct dentry *d = cgrp->dentry; 4394 struct dentry *d = cgrp->dentry;
4634 struct cgroup_event *event, *tmp; 4395 struct cgroup_subsys_state *css;
4635 struct cgroup_subsys *ss;
4636 struct cgroup *child; 4396 struct cgroup *child;
4637 bool empty; 4397 bool empty;
4398 int ssid;
4638 4399
4639 lockdep_assert_held(&d->d_inode->i_mutex); 4400 lockdep_assert_held(&d->d_inode->i_mutex);
4640 lockdep_assert_held(&cgroup_mutex); 4401 lockdep_assert_held(&cgroup_mutex);
@@ -4670,12 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4670 * will be invoked to perform the rest of destruction once the 4431 * will be invoked to perform the rest of destruction once the
4671 * percpu refs of all css's are confirmed to be killed. 4432 * percpu refs of all css's are confirmed to be killed.
4672 */ 4433 */
4673 for_each_root_subsys(cgrp->root, ss) { 4434 for_each_css(css, ssid, cgrp)
4674 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 4435 kill_css(css);
4675
4676 if (css)
4677 kill_css(css);
4678 }
4679 4436
4680 /* 4437 /*
4681 * Mark @cgrp dead. This prevents further task migration and child 4438 * Mark @cgrp dead. This prevents further task migration and child
@@ -4710,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4710 dget(d); 4467 dget(d);
4711 cgroup_d_remove_dir(d); 4468 cgroup_d_remove_dir(d);
4712 4469
4713 /*
4714 * Unregister events and notify userspace.
4715 * Notify userspace about cgroup removing only after rmdir of cgroup
4716 * directory to avoid race between userspace and kernelspace.
4717 */
4718 spin_lock(&cgrp->event_list_lock);
4719 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4720 list_del_init(&event->list);
4721 schedule_work(&event->remove);
4722 }
4723 spin_unlock(&cgrp->event_list_lock);
4724
4725 return 0; 4470 return 0;
4726}; 4471};
4727 4472
@@ -4792,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4792 cgroup_init_cftsets(ss); 4537 cgroup_init_cftsets(ss);
4793 4538
4794 /* Create the top cgroup state for this subsystem */ 4539 /* Create the top cgroup state for this subsystem */
4795 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4796 ss->root = &cgroup_dummy_root; 4540 ss->root = &cgroup_dummy_root;
4797 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4541 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4798 /* We don't handle early failures gracefully */ 4542 /* We don't handle early failures gracefully */
@@ -4866,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4866 cgroup_init_cftsets(ss); 4610 cgroup_init_cftsets(ss);
4867 4611
4868 mutex_lock(&cgroup_mutex); 4612 mutex_lock(&cgroup_mutex);
4613 mutex_lock(&cgroup_root_mutex);
4869 cgroup_subsys[ss->subsys_id] = ss; 4614 cgroup_subsys[ss->subsys_id] = ss;
4870 4615
4871 /* 4616 /*
@@ -4877,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4877 if (IS_ERR(css)) { 4622 if (IS_ERR(css)) {
4878 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4623 /* failure case - need to deassign the cgroup_subsys[] slot. */
4879 cgroup_subsys[ss->subsys_id] = NULL; 4624 cgroup_subsys[ss->subsys_id] = NULL;
4625 mutex_unlock(&cgroup_root_mutex);
4880 mutex_unlock(&cgroup_mutex); 4626 mutex_unlock(&cgroup_mutex);
4881 return PTR_ERR(css); 4627 return PTR_ERR(css);
4882 } 4628 }
4883 4629
4884 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4885 ss->root = &cgroup_dummy_root; 4630 ss->root = &cgroup_dummy_root;
4886 4631
4887 /* our new subsystem will be attached to the dummy hierarchy. */ 4632 /* our new subsystem will be attached to the dummy hierarchy. */
@@ -4911,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4911 write_unlock(&css_set_lock); 4656 write_unlock(&css_set_lock);
4912 4657
4913 ret = online_css(css); 4658 ret = online_css(css);
4914 if (ret) 4659 if (ret) {
4660 ss->css_free(css);
4915 goto err_unload; 4661 goto err_unload;
4662 }
4916 4663
4917 /* success! */ 4664 /* success! */
4665 mutex_unlock(&cgroup_root_mutex);
4918 mutex_unlock(&cgroup_mutex); 4666 mutex_unlock(&cgroup_mutex);
4919 return 0; 4667 return 0;
4920 4668
4921err_unload: 4669err_unload:
4670 mutex_unlock(&cgroup_root_mutex);
4922 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4923 /* @ss can't be mounted here as try_module_get() would fail */ 4672 /* @ss can't be mounted here as try_module_get() would fail */
4924 cgroup_unload_subsys(ss); 4673 cgroup_unload_subsys(ss);
@@ -4937,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4937void cgroup_unload_subsys(struct cgroup_subsys *ss) 4686void cgroup_unload_subsys(struct cgroup_subsys *ss)
4938{ 4687{
4939 struct cgrp_cset_link *link; 4688 struct cgrp_cset_link *link;
4689 struct cgroup_subsys_state *css;
4940 4690
4941 BUG_ON(ss->module == NULL); 4691 BUG_ON(ss->module == NULL);
4942 4692
@@ -4948,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4948 BUG_ON(ss->root != &cgroup_dummy_root); 4698 BUG_ON(ss->root != &cgroup_dummy_root);
4949 4699
4950 mutex_lock(&cgroup_mutex); 4700 mutex_lock(&cgroup_mutex);
4701 mutex_lock(&cgroup_root_mutex);
4951 4702
4952 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4703 css = cgroup_css(cgroup_dummy_top, ss);
4704 if (css)
4705 offline_css(css);
4953 4706
4954 /* deassign the subsys_id */ 4707 /* deassign the subsys_id */
4955 cgroup_subsys[ss->subsys_id] = NULL; 4708 cgroup_subsys[ss->subsys_id] = NULL;
4956 4709
4957 /* remove subsystem from the dummy root's list of subsystems */
4958 list_del_init(&ss->sibling);
4959
4960 /* 4710 /*
4961 * disentangle the css from all css_sets attached to the dummy 4711 * disentangle the css from all css_sets attached to the dummy
4962 * top. as in loading, we need to pay our respects to the hashtable 4712 * top. as in loading, we need to pay our respects to the hashtable
@@ -4979,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4979 * need to free before marking as null because ss->css_free needs 4729 * need to free before marking as null because ss->css_free needs
4980 * the cgrp->subsys pointer to find their state. 4730 * the cgrp->subsys pointer to find their state.
4981 */ 4731 */
4982 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4732 if (css)
4733 ss->css_free(css);
4983 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4734 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4984 4735
4736 mutex_unlock(&cgroup_root_mutex);
4985 mutex_unlock(&cgroup_mutex); 4737 mutex_unlock(&cgroup_mutex);
4986} 4738}
4987EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4739EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
@@ -5100,6 +4852,15 @@ static int __init cgroup_wq_init(void)
5100 */ 4852 */
5101 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4853 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5102 BUG_ON(!cgroup_destroy_wq); 4854 BUG_ON(!cgroup_destroy_wq);
4855
4856 /*
4857 * Used to destroy pidlists; kept separate so it can serve as a flush domain.
4858 * Cap @max_active to 1 too.
4859 */
4860 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4861 0, 1);
4862 BUG_ON(!cgroup_pidlist_destroy_wq);
4863
5103 return 0; 4864 return 0;
5104} 4865}
5105core_initcall(cgroup_wq_init); 4866core_initcall(cgroup_wq_init);
@@ -5143,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5143 for_each_active_root(root) { 4904 for_each_active_root(root) {
5144 struct cgroup_subsys *ss; 4905 struct cgroup_subsys *ss;
5145 struct cgroup *cgrp; 4906 struct cgroup *cgrp;
5146 int count = 0; 4907 int ssid, count = 0;
5147 4908
5148 seq_printf(m, "%d:", root->hierarchy_id); 4909 seq_printf(m, "%d:", root->hierarchy_id);
5149 for_each_root_subsys(root, ss) 4910 for_each_subsys(ss, ssid)
5150 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4911 if (root->subsys_mask & (1 << ssid))
4912 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5151 if (strlen(root->name)) 4913 if (strlen(root->name))
5152 seq_printf(m, "%sname=%s", count ? "," : "", 4914 seq_printf(m, "%sname=%s", count ? "," : "",
5153 root->name); 4915 root->name);
@@ -5488,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable);
5488 * @dentry: directory dentry of interest 5250 * @dentry: directory dentry of interest
5489 * @ss: subsystem of interest 5251 * @ss: subsystem of interest
5490 * 5252 *
5491 * Must be called under RCU read lock. The caller is responsible for 5253 * Must be called under cgroup_mutex or RCU read lock. The caller is
5492 * pinning the returned css if it needs to be accessed outside the RCU 5254 * responsible for pinning the returned css if it needs to be accessed
5493 * critical section. 5255 * outside the critical section.
5494 */ 5256 */
5495struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 5257struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5496 struct cgroup_subsys *ss) 5258 struct cgroup_subsys *ss)
5497{ 5259{
5498 struct cgroup *cgrp; 5260 struct cgroup *cgrp;
5499 5261
5500 WARN_ON_ONCE(!rcu_read_lock_held()); 5262 cgroup_assert_mutex_or_rcu_locked();
5501 5263
5502 /* is @dentry a cgroup dir? */ 5264 /* is @dentry a cgroup dir? */
5503 if (!dentry->d_inode || 5265 if (!dentry->d_inode ||
@@ -5520,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5520{ 5282{
5521 struct cgroup *cgrp; 5283 struct cgroup *cgrp;
5522 5284
5523 rcu_lockdep_assert(rcu_read_lock_held() || 5285 cgroup_assert_mutex_or_rcu_locked();
5524 lockdep_is_held(&cgroup_mutex),
5525 "css_from_id() needs proper protection");
5526 5286
5527 cgrp = idr_find(&ss->root->cgroup_idr, id); 5287 cgrp = idr_find(&ss->root->cgroup_idr, id);
5528 if (cgrp) 5288 if (cgrp)
@@ -5570,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5570 return count; 5330 return count;
5571} 5331}
5572 5332
5573static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, 5333static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5574 struct cftype *cft,
5575 struct seq_file *seq)
5576{ 5334{
5577 struct cgrp_cset_link *link; 5335 struct cgrp_cset_link *link;
5578 struct css_set *cset; 5336 struct css_set *cset;
@@ -5597,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5597} 5355}
5598 5356
5599#define MAX_TASKS_SHOWN_PER_CSS 25 5357#define MAX_TASKS_SHOWN_PER_CSS 25
5600static int cgroup_css_links_read(struct cgroup_subsys_state *css, 5358static int cgroup_css_links_read(struct seq_file *seq, void *v)
5601 struct cftype *cft, struct seq_file *seq)
5602{ 5359{
5360 struct cgroup_subsys_state *css = seq_css(seq);
5603 struct cgrp_cset_link *link; 5361 struct cgrp_cset_link *link;
5604 5362
5605 read_lock(&css_set_lock); 5363 read_lock(&css_set_lock);
@@ -5645,12 +5403,12 @@ static struct cftype debug_files[] = {
5645 5403
5646 { 5404 {
5647 .name = "current_css_set_cg_links", 5405 .name = "current_css_set_cg_links",
5648 .read_seq_string = current_css_set_cg_links_read, 5406 .seq_show = current_css_set_cg_links_read,
5649 }, 5407 },
5650 5408
5651 { 5409 {
5652 .name = "cgroup_css_links", 5410 .name = "cgroup_css_links",
5653 .read_seq_string = cgroup_css_links_read, 5411 .seq_show = cgroup_css_links_read,
5654 }, 5412 },
5655 5413
5656 { 5414 {
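
Note on the proc_cgroup_show() hunk above: switching from for_each_root_subsys() to for_each_subsys() filtered by root->subsys_mask does not change the output format of /proc/<pid>/cgroup. Each active hierarchy still prints as "<id>:<comma-separated controllers>[,name=<name>]:<path>"; illustrative output of reading /proc/self/cgroup (ids and paths are examples only):

    4:memory:/user/test
    3:cpuacct,cpu:/user
    1:name=systemd:/user/1000.user/1.session
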
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f0ff64d0ebaa..6c3154e477f6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -301,10 +301,9 @@ out_unlock:
301 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
302} 302}
303 303
304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, 304static int freezer_read(struct seq_file *m, void *v)
305 struct seq_file *m)
306{ 305{
307 struct cgroup_subsys_state *pos; 306 struct cgroup_subsys_state *css = seq_css(m), *pos;
308 307
309 rcu_read_lock(); 308 rcu_read_lock();
310 309
@@ -458,7 +457,7 @@ static struct cftype files[] = {
458 { 457 {
459 .name = "state", 458 .name = "state",
460 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
461 .read_seq_string = freezer_read, 460 .seq_show = freezer_read,
462 .write_string = freezer_write, 461 .write_string = freezer_write,
463 }, 462 },
464 { 463 {
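
The freezer hunk above is representative of the read_map()/read_seq_string() to seq_show() conversion applied throughout this series: the callback now receives only the seq_file and recovers its css via seq_css(). A minimal sketch of the new-style cftype (foo_state_show, css_foo() and its state field are hypothetical, not part of this patch):

    static int foo_state_show(struct seq_file *m, void *v)
    {
    	/* seq_css() returns the css the open file belongs to */
    	struct cgroup_subsys_state *css = seq_css(m);

    	seq_printf(m, "state %d\n", css_foo(css)->state);
    	return 0;
    }

    static struct cftype foo_files[] = {
    	{
    		.name = "state",
    		.seq_show = foo_state_show,	/* replaces .read_seq_string */
    	},
    	{ }	/* terminate */
    };
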
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4772034b4b17..4410ac6a55f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1731,66 +1731,41 @@ out_unlock:
1731 * used, list of ranges of sequential numbers, is variable length, 1731 * used, list of ranges of sequential numbers, is variable length,
1732 * and since these maps can change value dynamically, one could read 1732 * and since these maps can change value dynamically, one could read
1733 * gibberish by doing partial reads while a list was changing. 1733 * gibberish by doing partial reads while a list was changing.
1734 * A single large read to a buffer that crosses a page boundary is
1735 * ok, because the result being copied to user land is not recomputed
1736 * across a page fault.
1737 */ 1734 */
1738 1735static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1739static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1740{ 1736{
1741 size_t count; 1737 struct cpuset *cs = css_cs(seq_css(sf));
1742 1738 cpuset_filetype_t type = seq_cft(sf)->private;
1743 mutex_lock(&callback_mutex); 1739 ssize_t count;
1744 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1740 char *buf, *s;
1745 mutex_unlock(&callback_mutex); 1741 int ret = 0;
1746 1742
1747 return count; 1743 count = seq_get_buf(sf, &buf);
1748} 1744 s = buf;
1749
1750static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1751{
1752 size_t count;
1753 1745
1754 mutex_lock(&callback_mutex); 1746 mutex_lock(&callback_mutex);
1755 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1756 mutex_unlock(&callback_mutex);
1757
1758 return count;
1759}
1760
1761static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1762 struct cftype *cft, struct file *file,
1763 char __user *buf, size_t nbytes,
1764 loff_t *ppos)
1765{
1766 struct cpuset *cs = css_cs(css);
1767 cpuset_filetype_t type = cft->private;
1768 char *page;
1769 ssize_t retval = 0;
1770 char *s;
1771
1772 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1773 return -ENOMEM;
1774
1775 s = page;
1776 1747
1777 switch (type) { 1748 switch (type) {
1778 case FILE_CPULIST: 1749 case FILE_CPULIST:
1779 s += cpuset_sprintf_cpulist(s, cs); 1750 s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1780 break; 1751 break;
1781 case FILE_MEMLIST: 1752 case FILE_MEMLIST:
1782 s += cpuset_sprintf_memlist(s, cs); 1753 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1783 break; 1754 break;
1784 default: 1755 default:
1785 retval = -EINVAL; 1756 ret = -EINVAL;
1786 goto out; 1757 goto out_unlock;
1787 } 1758 }
1788 *s++ = '\n';
1789 1759
1790 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1760 if (s < buf + count - 1) {
1791out: 1761 *s++ = '\n';
1792 free_page((unsigned long)page); 1762 seq_commit(sf, s - buf);
1793 return retval; 1763 } else {
1764 seq_commit(sf, -1);
1765 }
1766out_unlock:
1767 mutex_unlock(&callback_mutex);
1768 return ret;
1794} 1769}
1795 1770
1796static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 1771static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -1847,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1847static struct cftype files[] = { 1822static struct cftype files[] = {
1848 { 1823 {
1849 .name = "cpus", 1824 .name = "cpus",
1850 .read = cpuset_common_file_read, 1825 .seq_show = cpuset_common_seq_show,
1851 .write_string = cpuset_write_resmask, 1826 .write_string = cpuset_write_resmask,
1852 .max_write_len = (100U + 6 * NR_CPUS), 1827 .max_write_len = (100U + 6 * NR_CPUS),
1853 .private = FILE_CPULIST, 1828 .private = FILE_CPULIST,
@@ -1855,7 +1830,7 @@ static struct cftype files[] = {
1855 1830
1856 { 1831 {
1857 .name = "mems", 1832 .name = "mems",
1858 .read = cpuset_common_file_read, 1833 .seq_show = cpuset_common_seq_show,
1859 .write_string = cpuset_write_resmask, 1834 .write_string = cpuset_write_resmask,
1860 .max_write_len = (100U + 6 * MAX_NUMNODES), 1835 .max_write_len = (100U + 6 * MAX_NUMNODES),
1861 .private = FILE_MEMLIST, 1836 .private = FILE_MEMLIST,
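
The cpuset conversion cannot use seq_printf() directly because cpulist_scnprintf()/nodelist_scnprintf() format into a caller-supplied buffer, so it borrows seq_file's own buffer: seq_get_buf() exposes the unused tail of the seq buffer, and seq_commit() either accepts the bytes written or, when passed -1, marks the buffer overflowed so that seq_file retries with a larger one. The idiom in isolation (fmt_mask() is a stand-in for the scnprintf-style formatter):

    static int foo_mask_show(struct seq_file *sf, void *v)
    {
    	char *buf, *s;
    	ssize_t count;

    	count = seq_get_buf(sf, &buf);	/* bytes left in the seq buffer */
    	s = buf + fmt_mask(buf, count);	/* hypothetical formatter */

    	if (s < buf + count - 1) {
    		*s++ = '\n';
    		seq_commit(sf, s - buf);	/* accept what was written */
    	} else {
    		seq_commit(sf, -1);	/* overflow: retry with larger buf */
    	}
    	return 0;
    }
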
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ae36cc11fe5..4d6964e49711 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7854,15 +7854,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7854 return ret; 7854 return ret;
7855} 7855}
7856 7856
7857static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7857static int cpu_stats_show(struct seq_file *sf, void *v)
7858 struct cgroup_map_cb *cb)
7859{ 7858{
7860 struct task_group *tg = css_tg(css); 7859 struct task_group *tg = css_tg(seq_css(sf));
7861 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7860 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7862 7861
7863 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7862 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7864 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7863 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7865 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7864 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7866 7865
7867 return 0; 7866 return 0;
7868} 7867}
@@ -7916,7 +7915,7 @@ static struct cftype cpu_files[] = {
7916 }, 7915 },
7917 { 7916 {
7918 .name = "stat", 7917 .name = "stat",
7919 .read_map = cpu_stats_show, 7918 .seq_show = cpu_stats_show,
7920 }, 7919 },
7921#endif 7920#endif
7922#ifdef CONFIG_RT_GROUP_SCHED 7921#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *v)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b010eac595d2..82ef9f3b7473 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4789,6 +4789,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4789 4789
4790 /* wait for per-cpu unbinding to finish */ 4790 /* wait for per-cpu unbinding to finish */
4791 flush_work(&unbind_work); 4791 flush_work(&unbind_work);
4792 destroy_work_on_stack(&unbind_work);
4792 break; 4793 break;
4793 } 4794 }
4794 return NOTIFY_OK; 4795 return NOTIFY_OK;
@@ -4828,6 +4829,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4828 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4829 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4829 schedule_work_on(cpu, &wfc.work); 4830 schedule_work_on(cpu, &wfc.work);
4830 flush_work(&wfc.work); 4831 flush_work(&wfc.work);
4832 destroy_work_on_stack(&wfc.work);
4831 return wfc.ret; 4833 return wfc.ret;
4832} 4834}
4833EXPORT_SYMBOL_GPL(work_on_cpu); 4835EXPORT_SYMBOL_GPL(work_on_cpu);
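
Both workqueue hunks above fix the same pattern: a work item initialized with INIT_WORK_ONSTACK() must be retired with destroy_work_on_stack() once it is known to be idle, otherwise CONFIG_DEBUG_OBJECTS_WORK sees a stale annotation when the stack frame is reused. In outline (my_work_fn is illustrative):

    struct work_struct work;

    INIT_WORK_ONSTACK(&work, my_work_fn);	/* annotate on-stack object */
    schedule_work(&work);
    flush_work(&work);			/* guarantee the item is idle... */
    destroy_work_on_stack(&work);	/* ...then retire the annotation */
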
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 1a53d497a8c5..963b7034a51b 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -120,6 +120,9 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
120 120
121 atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); 121 atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);
122 122
123 WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)",
124 atomic_read(&ref->count));
125
123 /* @ref is viewed as dead on all CPUs, send out kill confirmation */ 126 /* @ref is viewed as dead on all CPUs, send out kill confirmation */
124 if (ref->confirm_kill) 127 if (ref->confirm_kill)
125 ref->confirm_kill(ref); 128 ref->confirm_kill(ref);
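
The WARN_ONCE added above fires when the count folded back from the percpu counters comes out non-positive, i.e. when percpu_ref_put() calls have outnumbered percpu_ref_get() calls plus the base reference taken at init time. Lifecycle sketch (my_release() is illustrative):

    percpu_ref_init(&obj->ref, my_release);	/* starts with one base ref */

    percpu_ref_get(&obj->ref);		/* +1, percpu fast path */
    percpu_ref_put(&obj->ref);		/* -1 */

    /*
     * Switches the ref to atomic mode and drops the base ref after an
     * RCU grace period; one unbalanced put anywhere above would make
     * the folded count go non-positive and trip the new WARN_ONCE.
     */
    percpu_ref_kill(&obj->ref);
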
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bda8e44f6fde..d747a84e09b0 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
242 return; 242 return;
243} 243}
244 244
245static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, 245static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
246 struct cftype *cft, struct file *file, 246 struct cftype *cft)
247 char __user *buf, size_t nbytes,
248 loff_t *ppos)
249{ 247{
250 u64 val; 248 int idx, name;
251 char str[64];
252 int idx, name, len;
253 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 249 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
254 250
255 idx = MEMFILE_IDX(cft->private); 251 idx = MEMFILE_IDX(cft->private);
256 name = MEMFILE_ATTR(cft->private); 252 name = MEMFILE_ATTR(cft->private);
257 253
258 val = res_counter_read_u64(&h_cg->hugepage[idx], name); 254 return res_counter_read_u64(&h_cg->hugepage[idx], name);
259 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
260 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
261} 255}
262 256
263static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 257static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
337 cft = &h->cgroup_files[0]; 331 cft = &h->cgroup_files[0];
338 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 332 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
339 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 333 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
340 cft->read = hugetlb_cgroup_read; 334 cft->read_u64 = hugetlb_cgroup_read_u64;
341 cft->write_string = hugetlb_cgroup_write; 335 cft->write_string = hugetlb_cgroup_write;
342 336
343 /* Add the usage file */ 337 /* Add the usage file */
344 cft = &h->cgroup_files[1]; 338 cft = &h->cgroup_files[1];
345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); 339 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
346 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 340 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
347 cft->read = hugetlb_cgroup_read; 341 cft->read_u64 = hugetlb_cgroup_read_u64;
348 342
349 /* Add the MAX usage file */ 343 /* Add the MAX usage file */
350 cft = &h->cgroup_files[2]; 344 cft = &h->cgroup_files[2];
351 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
352 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 346 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
353 cft->trigger = hugetlb_cgroup_reset; 347 cft->trigger = hugetlb_cgroup_reset;
354 cft->read = hugetlb_cgroup_read; 348 cft->read_u64 = hugetlb_cgroup_read_u64;
355 349
356 /* Add the failcntfile */ 350 /* Add the failcntfile */
357 cft = &h->cgroup_files[3]; 351 cft = &h->cgroup_files[3];
358 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); 352 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
359 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 353 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
360 cft->trigger = hugetlb_cgroup_reset; 354 cft->trigger = hugetlb_cgroup_reset;
361 cft->read = hugetlb_cgroup_read; 355 cft->read_u64 = hugetlb_cgroup_read_u64;
362 356
363 /* NULL terminate the last cft */ 357 /* NULL terminate the last cft */
364 cft = &h->cgroup_files[4]; 358 cft = &h->cgroup_files[4];
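
With .read_u64 the cgroup core does the "%llu\n" formatting and the copy to userspace itself, which is why the hugetlb_cgroup_read() boilerplate above collapses to a plain value return. The shape of such a callback (css_foo() and its res_counter field are assumptions for illustration):

    static u64 foo_read_u64(struct cgroup_subsys_state *css,
    			    struct cftype *cft)
    {
    	return res_counter_read_u64(&css_foo(css)->res, cft->private);
    }
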
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 57b16083f046..67dd2a881433 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
@@ -55,6 +56,7 @@
55#include <linux/cpu.h> 56#include <linux/cpu.h>
56#include <linux/oom.h> 57#include <linux/oom.h>
57#include <linux/lockdep.h> 58#include <linux/lockdep.h>
59#include <linux/file.h>
58#include "internal.h" 60#include "internal.h"
59#include <net/sock.h> 61#include <net/sock.h>
60#include <net/ip.h> 62#include <net/ip.h>
@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 229 struct eventfd_ctx *eventfd;
228}; 230};
229 231
232/*
233 * mem_cgroup_event represents events which userspace wants to receive.
234 */
235struct mem_cgroup_event {
236 /*
237 * memcg which the event belongs to.
238 */
239 struct mem_cgroup *memcg;
240 /*
241 * eventfd to signal userspace about the event.
242 */
243 struct eventfd_ctx *eventfd;
244 /*
245 * Each of these is stored in a list by the cgroup.
246 */
247 struct list_head list;
248 /*
249 * register_event() callback will be used to add a new userspace
250 * waiter for changes related to this event. Use eventfd_signal()
251 * on eventfd to send notification to userspace.
252 */
253 int (*register_event)(struct mem_cgroup *memcg,
254 struct eventfd_ctx *eventfd, const char *args);
255 /*
256 * unregister_event() callback will be called when userspace closes
257 * the eventfd or on cgroup removal. This callback must be set
258 * if you want to provide notification functionality.
259 */
260 void (*unregister_event)(struct mem_cgroup *memcg,
261 struct eventfd_ctx *eventfd);
262 /*
263 * All fields below are needed to unregister the event when
264 * userspace closes the eventfd.
265 */
266 poll_table pt;
267 wait_queue_head_t *wqh;
268 wait_queue_t wait;
269 struct work_struct remove;
270};
271
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 272static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 273static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 274
@@ -331,6 +373,10 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 373 atomic_t numainfo_updating;
332#endif 374#endif
333 375
376 /* List of events which userspace want to receive */
377 struct list_head event_list;
378 spinlock_t event_list_lock;
379
334 struct mem_cgroup_per_node *nodeinfo[0]; 380 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 381 /* WARNING: nodeinfo must be the last member here */
336}; 382};
@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 536 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 537}
492 538
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 539static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 540{
500 return (memcg == root_mem_cgroup); 541 return (memcg == root_mem_cgroup);
@@ -2979,10 +3020,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2979} 3020}
2980 3021
2981#ifdef CONFIG_SLABINFO 3022#ifdef CONFIG_SLABINFO
2982static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3023static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2983 struct cftype *cft, struct seq_file *m)
2984{ 3024{
2985 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3025 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2986 struct memcg_cache_params *params; 3026 struct memcg_cache_params *params;
2987 3027
2988 if (!memcg_can_account_kmem(memcg)) 3028 if (!memcg_can_account_kmem(memcg))
@@ -5115,14 +5155,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5115 return val << PAGE_SHIFT; 5155 return val << PAGE_SHIFT;
5116} 5156}
5117 5157
5118static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5158static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5119 struct cftype *cft, struct file *file, 5159 struct cftype *cft)
5120 char __user *buf, size_t nbytes, loff_t *ppos)
5121{ 5160{
5122 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5161 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5123 char str[64];
5124 u64 val; 5162 u64 val;
5125 int name, len; 5163 int name;
5126 enum res_type type; 5164 enum res_type type;
5127 5165
5128 type = MEMFILE_TYPE(cft->private); 5166 type = MEMFILE_TYPE(cft->private);
@@ -5148,8 +5186,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5148 BUG(); 5186 BUG();
5149 } 5187 }
5150 5188
5151 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5189 return val;
5152 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5153} 5190}
5154 5191
5155static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) 5192static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
@@ -5386,8 +5423,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5386#endif 5423#endif
5387 5424
5388#ifdef CONFIG_NUMA 5425#ifdef CONFIG_NUMA
5389static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5426static int memcg_numa_stat_show(struct seq_file *m, void *v)
5390 struct cftype *cft, struct seq_file *m)
5391{ 5427{
5392 struct numa_stat { 5428 struct numa_stat {
5393 const char *name; 5429 const char *name;
@@ -5403,7 +5439,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5403 const struct numa_stat *stat; 5439 const struct numa_stat *stat;
5404 int nid; 5440 int nid;
5405 unsigned long nr; 5441 unsigned long nr;
5406 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5442 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5407 5443
5408 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5444 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5409 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5445 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5442,10 +5478,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5442 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5478 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5443} 5479}
5444 5480
5445static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5481static int memcg_stat_show(struct seq_file *m, void *v)
5446 struct seq_file *m)
5447{ 5482{
5448 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5483 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5449 struct mem_cgroup *mi; 5484 struct mem_cgroup *mi;
5450 unsigned int i; 5485 unsigned int i;
5451 5486
@@ -5654,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5654 mem_cgroup_oom_notify_cb(iter); 5689 mem_cgroup_oom_notify_cb(iter);
5655} 5690}
5656 5691
5657static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5692static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5658 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5693 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5659{ 5694{
5660 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5661 struct mem_cgroup_thresholds *thresholds; 5695 struct mem_cgroup_thresholds *thresholds;
5662 struct mem_cgroup_threshold_ary *new; 5696 struct mem_cgroup_threshold_ary *new;
5663 enum res_type type = MEMFILE_TYPE(cft->private);
5664 u64 threshold, usage; 5697 u64 threshold, usage;
5665 int i, size, ret; 5698 int i, size, ret;
5666 5699
@@ -5737,13 +5770,23 @@ unlock:
5737 return ret; 5770 return ret;
5738} 5771}
5739 5772
5740static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5773static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5741 struct cftype *cft, struct eventfd_ctx *eventfd) 5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5777}
5778
5779static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, const char *args)
5781{
5782 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5783}
5784
5785static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5786 struct eventfd_ctx *eventfd, enum res_type type)
5742{ 5787{
5743 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5744 struct mem_cgroup_thresholds *thresholds; 5788 struct mem_cgroup_thresholds *thresholds;
5745 struct mem_cgroup_threshold_ary *new; 5789 struct mem_cgroup_threshold_ary *new;
5746 enum res_type type = MEMFILE_TYPE(cft->private);
5747 u64 usage; 5790 u64 usage;
5748 int i, j, size; 5791 int i, j, size;
5749 5792
@@ -5816,14 +5859,23 @@ unlock:
5816 mutex_unlock(&memcg->thresholds_lock); 5859 mutex_unlock(&memcg->thresholds_lock);
5817} 5860}
5818 5861
5819static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5862static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5820 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5866}
5867
5868static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd)
5870{
5871 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5872}
5873
5874static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5875 struct eventfd_ctx *eventfd, const char *args)
5821{ 5876{
5822 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5823 struct mem_cgroup_eventfd_list *event; 5877 struct mem_cgroup_eventfd_list *event;
5824 enum res_type type = MEMFILE_TYPE(cft->private);
5825 5878
5826 BUG_ON(type != _OOM_TYPE);
5827 event = kmalloc(sizeof(*event), GFP_KERNEL); 5879 event = kmalloc(sizeof(*event), GFP_KERNEL);
5828 if (!event) 5880 if (!event)
5829 return -ENOMEM; 5881 return -ENOMEM;
@@ -5841,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5841 return 0; 5893 return 0;
5842} 5894}
5843 5895
5844static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5896static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5845 struct cftype *cft, struct eventfd_ctx *eventfd) 5897 struct eventfd_ctx *eventfd)
5846{ 5898{
5847 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5848 struct mem_cgroup_eventfd_list *ev, *tmp; 5899 struct mem_cgroup_eventfd_list *ev, *tmp;
5849 enum res_type type = MEMFILE_TYPE(cft->private);
5850
5851 BUG_ON(type != _OOM_TYPE);
5852 5900
5853 spin_lock(&memcg_oom_lock); 5901 spin_lock(&memcg_oom_lock);
5854 5902
@@ -5862,17 +5910,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5862 spin_unlock(&memcg_oom_lock); 5910 spin_unlock(&memcg_oom_lock);
5863} 5911}
5864 5912
5865static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5913static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5866 struct cftype *cft, struct cgroup_map_cb *cb)
5867{ 5914{
5868 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5915 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5869
5870 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5871 5916
5872 if (atomic_read(&memcg->under_oom)) 5917 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5873 cb->fill(cb, "under_oom", 1); 5918 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5874 else
5875 cb->fill(cb, "under_oom", 0);
5876 return 0; 5919 return 0;
5877} 5920}
5878 5921
@@ -5965,41 +6008,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5965} 6008}
5966#endif 6009#endif
5967 6010
6011/*
6012 * DO NOT USE IN NEW FILES.
6013 *
6014 * "cgroup.event_control" implementation.
6015 *
6016 * This is way over-engineered. It tries to support fully configurable
6017 * events for each user. Such a level of flexibility is completely
6018 * unnecessary, especially in light of the planned unified hierarchy.
6019 *
6020 * Please deprecate this and replace with something simpler if at all
6021 * possible.
6022 */
6023
6024/*
6025 * Unregister event and free resources.
6026 *
6027 * Gets called from workqueue.
6028 */
6029static void memcg_event_remove(struct work_struct *work)
6030{
6031 struct mem_cgroup_event *event =
6032 container_of(work, struct mem_cgroup_event, remove);
6033 struct mem_cgroup *memcg = event->memcg;
6034
6035 remove_wait_queue(event->wqh, &event->wait);
6036
6037 event->unregister_event(memcg, event->eventfd);
6038
6039 /* Notify userspace the event is going away. */
6040 eventfd_signal(event->eventfd, 1);
6041
6042 eventfd_ctx_put(event->eventfd);
6043 kfree(event);
6044 css_put(&memcg->css);
6045}
6046
6047/*
6048 * Gets called on POLLHUP on the eventfd when the user closes it.
6049 *
6050 * Called with wqh->lock held and interrupts disabled.
6051 */
6052static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6053 int sync, void *key)
6054{
6055 struct mem_cgroup_event *event =
6056 container_of(wait, struct mem_cgroup_event, wait);
6057 struct mem_cgroup *memcg = event->memcg;
6058 unsigned long flags = (unsigned long)key;
6059
6060 if (flags & POLLHUP) {
6061 /*
6062 * If the event has been detached at cgroup removal, we
6063 * can simply return knowing the other side will clean up
6064 * for us.
6065 *
6066 * We can't race against event freeing since the other
6067 * side will require wqh->lock via remove_wait_queue(),
6068 * which we hold.
6069 */
6070 spin_lock(&memcg->event_list_lock);
6071 if (!list_empty(&event->list)) {
6072 list_del_init(&event->list);
6073 /*
6074 * We are in atomic context, but memcg_event_remove()
6075 * may sleep, so we have to call it from a workqueue.
6076 */
6077 schedule_work(&event->remove);
6078 }
6079 spin_unlock(&memcg->event_list_lock);
6080 }
6081
6082 return 0;
6083}
6084
6085static void memcg_event_ptable_queue_proc(struct file *file,
6086 wait_queue_head_t *wqh, poll_table *pt)
6087{
6088 struct mem_cgroup_event *event =
6089 container_of(pt, struct mem_cgroup_event, pt);
6090
6091 event->wqh = wqh;
6092 add_wait_queue(wqh, &event->wait);
6093}
6094
6095/*
6096 * DO NOT USE IN NEW FILES.
6097 *
6098 * Parse input and register new cgroup event handler.
6099 *
6100 * Input must be in the format '<event_fd> <control_fd> <args>'.
6101 * Interpretation of args is defined by control file implementation.
6102 */
6103static int memcg_write_event_control(struct cgroup_subsys_state *css,
6104 struct cftype *cft, const char *buffer)
6105{
6106 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6107 struct mem_cgroup_event *event;
6108 struct cgroup_subsys_state *cfile_css;
6109 unsigned int efd, cfd;
6110 struct fd efile;
6111 struct fd cfile;
6112 const char *name;
6113 char *endp;
6114 int ret;
6115
6116 efd = simple_strtoul(buffer, &endp, 10);
6117 if (*endp != ' ')
6118 return -EINVAL;
6119 buffer = endp + 1;
6120
6121 cfd = simple_strtoul(buffer, &endp, 10);
6122 if ((*endp != ' ') && (*endp != '\0'))
6123 return -EINVAL;
6124 buffer = endp + 1;
6125
6126 event = kzalloc(sizeof(*event), GFP_KERNEL);
6127 if (!event)
6128 return -ENOMEM;
6129
6130 event->memcg = memcg;
6131 INIT_LIST_HEAD(&event->list);
6132 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6133 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6134 INIT_WORK(&event->remove, memcg_event_remove);
6135
6136 efile = fdget(efd);
6137 if (!efile.file) {
6138 ret = -EBADF;
6139 goto out_kfree;
6140 }
6141
6142 event->eventfd = eventfd_ctx_fileget(efile.file);
6143 if (IS_ERR(event->eventfd)) {
6144 ret = PTR_ERR(event->eventfd);
6145 goto out_put_efile;
6146 }
6147
6148 cfile = fdget(cfd);
6149 if (!cfile.file) {
6150 ret = -EBADF;
6151 goto out_put_eventfd;
6152 }
6153
6154 /* the process needs read permission on the control file */
6155 /* AV: shouldn't we check that it's been opened for read instead? */
6156 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6157 if (ret < 0)
6158 goto out_put_cfile;
6159
6160 /*
6161 * Determine the event callbacks and set them in @event. This used
6162 * to be done via struct cftype but cgroup core no longer knows
6163 * about these events. The following is crude but the whole thing
6164 * is for compatibility anyway.
6165 *
6166 * DO NOT ADD NEW FILES.
6167 */
6168 name = cfile.file->f_dentry->d_name.name;
6169
6170 if (!strcmp(name, "memory.usage_in_bytes")) {
6171 event->register_event = mem_cgroup_usage_register_event;
6172 event->unregister_event = mem_cgroup_usage_unregister_event;
6173 } else if (!strcmp(name, "memory.oom_control")) {
6174 event->register_event = mem_cgroup_oom_register_event;
6175 event->unregister_event = mem_cgroup_oom_unregister_event;
6176 } else if (!strcmp(name, "memory.pressure_level")) {
6177 event->register_event = vmpressure_register_event;
6178 event->unregister_event = vmpressure_unregister_event;
6179 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6180 event->register_event = memsw_cgroup_usage_register_event;
6181 event->unregister_event = memsw_cgroup_usage_unregister_event;
6182 } else {
6183 ret = -EINVAL;
6184 goto out_put_cfile;
6185 }
6186
6187 /*
6188 * Verify that @cfile belongs to @css. Also, remaining events are
6189 * automatically removed on cgroup destruction but the removal is
6190 * asynchronous, so take an extra ref on @css.
6191 */
6192 rcu_read_lock();
6193
6194 ret = -EINVAL;
6195 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6196 &mem_cgroup_subsys);
6197 if (cfile_css == css && css_tryget(css))
6198 ret = 0;
6199
6200 rcu_read_unlock();
6201 if (ret)
6202 goto out_put_cfile;
6203
6204 ret = event->register_event(memcg, event->eventfd, buffer);
6205 if (ret)
6206 goto out_put_css;
6207
6208 efile.file->f_op->poll(efile.file, &event->pt);
6209
6210 spin_lock(&memcg->event_list_lock);
6211 list_add(&event->list, &memcg->event_list);
6212 spin_unlock(&memcg->event_list_lock);
6213
6214 fdput(cfile);
6215 fdput(efile);
6216
6217 return 0;
6218
6219out_put_css:
6220 css_put(css);
6221out_put_cfile:
6222 fdput(cfile);
6223out_put_eventfd:
6224 eventfd_ctx_put(event->eventfd);
6225out_put_efile:
6226 fdput(efile);
6227out_kfree:
6228 kfree(event);
6229
6230 return ret;
6231}
6232
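For context, the legacy interface preserved by memcg_write_event_control() is driven entirely from userspace: create an eventfd, open one of the four control files whitelisted above, and write the three fields to cgroup.event_control. A minimal sketch (error handling elided; the cgroup mount point and group name are examples):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/eventfd.h>

    int main(void)
    {
    	int efd = eventfd(0, 0);
    	int cfd = open("/sys/fs/cgroup/memory/test/memory.usage_in_bytes",
    		       O_RDONLY);
    	int wfd = open("/sys/fs/cgroup/memory/test/cgroup.event_control",
    		       O_WRONLY);
    	char line[64];
    	uint64_t ticks;

    	/* "<event_fd> <control_fd> <args>"; args is a byte threshold here */
    	snprintf(line, sizeof(line), "%d %d %llu", efd, cfd,
    		 (unsigned long long)(4 << 20));
    	write(wfd, line, strlen(line));

    	read(efd, &ticks, sizeof(ticks));	/* blocks until signaled */
    	printf("usage threshold crossed %llu time(s)\n",
    	       (unsigned long long)ticks);
    	return 0;
    }
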
5968static struct cftype mem_cgroup_files[] = { 6233static struct cftype mem_cgroup_files[] = {
5969 { 6234 {
5970 .name = "usage_in_bytes", 6235 .name = "usage_in_bytes",
5971 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6236 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5972 .read = mem_cgroup_read, 6237 .read_u64 = mem_cgroup_read_u64,
5973 .register_event = mem_cgroup_usage_register_event,
5974 .unregister_event = mem_cgroup_usage_unregister_event,
5975 }, 6238 },
5976 { 6239 {
5977 .name = "max_usage_in_bytes", 6240 .name = "max_usage_in_bytes",
5978 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5979 .trigger = mem_cgroup_reset, 6242 .trigger = mem_cgroup_reset,
5980 .read = mem_cgroup_read, 6243 .read_u64 = mem_cgroup_read_u64,
5981 }, 6244 },
5982 { 6245 {
5983 .name = "limit_in_bytes", 6246 .name = "limit_in_bytes",
5984 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6247 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5985 .write_string = mem_cgroup_write, 6248 .write_string = mem_cgroup_write,
5986 .read = mem_cgroup_read, 6249 .read_u64 = mem_cgroup_read_u64,
5987 }, 6250 },
5988 { 6251 {
5989 .name = "soft_limit_in_bytes", 6252 .name = "soft_limit_in_bytes",
5990 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6253 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5991 .write_string = mem_cgroup_write, 6254 .write_string = mem_cgroup_write,
5992 .read = mem_cgroup_read, 6255 .read_u64 = mem_cgroup_read_u64,
5993 }, 6256 },
5994 { 6257 {
5995 .name = "failcnt", 6258 .name = "failcnt",
5996 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6259 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5997 .trigger = mem_cgroup_reset, 6260 .trigger = mem_cgroup_reset,
5998 .read = mem_cgroup_read, 6261 .read_u64 = mem_cgroup_read_u64,
5999 }, 6262 },
6000 { 6263 {
6001 .name = "stat", 6264 .name = "stat",
6002 .read_seq_string = memcg_stat_show, 6265 .seq_show = memcg_stat_show,
6003 }, 6266 },
6004 { 6267 {
6005 .name = "force_empty", 6268 .name = "force_empty",
@@ -6012,6 +6275,12 @@ static struct cftype mem_cgroup_files[] = {
6012 .read_u64 = mem_cgroup_hierarchy_read, 6275 .read_u64 = mem_cgroup_hierarchy_read,
6013 }, 6276 },
6014 { 6277 {
6278 .name = "cgroup.event_control", /* XXX: for compat */
6279 .write_string = memcg_write_event_control,
6280 .flags = CFTYPE_NO_PREFIX,
6281 .mode = S_IWUGO,
6282 },
6283 {
6015 .name = "swappiness", 6284 .name = "swappiness",
6016 .read_u64 = mem_cgroup_swappiness_read, 6285 .read_u64 = mem_cgroup_swappiness_read,
6017 .write_u64 = mem_cgroup_swappiness_write, 6286 .write_u64 = mem_cgroup_swappiness_write,
@@ -6023,21 +6292,17 @@ static struct cftype mem_cgroup_files[] = {
6023 }, 6292 },
6024 { 6293 {
6025 .name = "oom_control", 6294 .name = "oom_control",
6026 .read_map = mem_cgroup_oom_control_read, 6295 .seq_show = mem_cgroup_oom_control_read,
6027 .write_u64 = mem_cgroup_oom_control_write, 6296 .write_u64 = mem_cgroup_oom_control_write,
6028 .register_event = mem_cgroup_oom_register_event,
6029 .unregister_event = mem_cgroup_oom_unregister_event,
6030 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6297 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6031 }, 6298 },
6032 { 6299 {
6033 .name = "pressure_level", 6300 .name = "pressure_level",
6034 .register_event = vmpressure_register_event,
6035 .unregister_event = vmpressure_unregister_event,
6036 }, 6301 },
6037#ifdef CONFIG_NUMA 6302#ifdef CONFIG_NUMA
6038 { 6303 {
6039 .name = "numa_stat", 6304 .name = "numa_stat",
6040 .read_seq_string = memcg_numa_stat_show, 6305 .seq_show = memcg_numa_stat_show,
6041 }, 6306 },
6042#endif 6307#endif
6043#ifdef CONFIG_MEMCG_KMEM 6308#ifdef CONFIG_MEMCG_KMEM
@@ -6045,29 +6310,29 @@ static struct cftype mem_cgroup_files[] = {
6045 .name = "kmem.limit_in_bytes", 6310 .name = "kmem.limit_in_bytes",
6046 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6311 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6047 .write_string = mem_cgroup_write, 6312 .write_string = mem_cgroup_write,
6048 .read = mem_cgroup_read, 6313 .read_u64 = mem_cgroup_read_u64,
6049 }, 6314 },
6050 { 6315 {
6051 .name = "kmem.usage_in_bytes", 6316 .name = "kmem.usage_in_bytes",
6052 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6317 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6053 .read = mem_cgroup_read, 6318 .read_u64 = mem_cgroup_read_u64,
6054 }, 6319 },
6055 { 6320 {
6056 .name = "kmem.failcnt", 6321 .name = "kmem.failcnt",
6057 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6322 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6058 .trigger = mem_cgroup_reset, 6323 .trigger = mem_cgroup_reset,
6059 .read = mem_cgroup_read, 6324 .read_u64 = mem_cgroup_read_u64,
6060 }, 6325 },
6061 { 6326 {
6062 .name = "kmem.max_usage_in_bytes", 6327 .name = "kmem.max_usage_in_bytes",
6063 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6328 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6064 .trigger = mem_cgroup_reset, 6329 .trigger = mem_cgroup_reset,
6065 .read = mem_cgroup_read, 6330 .read_u64 = mem_cgroup_read_u64,
6066 }, 6331 },
6067#ifdef CONFIG_SLABINFO 6332#ifdef CONFIG_SLABINFO
6068 { 6333 {
6069 .name = "kmem.slabinfo", 6334 .name = "kmem.slabinfo",
6070 .read_seq_string = mem_cgroup_slabinfo_read, 6335 .seq_show = mem_cgroup_slabinfo_read,
6071 }, 6336 },
6072#endif 6337#endif
6073#endif 6338#endif
@@ -6079,27 +6344,25 @@ static struct cftype memsw_cgroup_files[] = {
6079 { 6344 {
6080 .name = "memsw.usage_in_bytes", 6345 .name = "memsw.usage_in_bytes",
6081 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6346 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6082 .read = mem_cgroup_read, 6347 .read_u64 = mem_cgroup_read_u64,
6083 .register_event = mem_cgroup_usage_register_event,
6084 .unregister_event = mem_cgroup_usage_unregister_event,
6085 }, 6348 },
6086 { 6349 {
6087 .name = "memsw.max_usage_in_bytes", 6350 .name = "memsw.max_usage_in_bytes",
6088 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6089 .trigger = mem_cgroup_reset, 6352 .trigger = mem_cgroup_reset,
6090 .read = mem_cgroup_read, 6353 .read_u64 = mem_cgroup_read_u64,
6091 }, 6354 },
6092 { 6355 {
6093 .name = "memsw.limit_in_bytes", 6356 .name = "memsw.limit_in_bytes",
6094 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6095 .write_string = mem_cgroup_write, 6358 .write_string = mem_cgroup_write,
6096 .read = mem_cgroup_read, 6359 .read_u64 = mem_cgroup_read_u64,
6097 }, 6360 },
6098 { 6361 {
6099 .name = "memsw.failcnt", 6362 .name = "memsw.failcnt",
6100 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6363 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6101 .trigger = mem_cgroup_reset, 6364 .trigger = mem_cgroup_reset,
6102 .read = mem_cgroup_read, 6365 .read_u64 = mem_cgroup_read_u64,
6103 }, 6366 },
6104 { }, /* terminate */ 6367 { }, /* terminate */
6105}; 6368};
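
The .read -> .read_u64 swaps above move these files onto cgroup core's typed read path. A hedged sketch of the resulting handler shape (the signature matches read_prioidx() further down in this diff; the body is illustrative only):

/* Illustrative only: the shape of a .read_u64 handler after this
 * conversion.  A real handler such as mem_cgroup_read_u64 decodes
 * cft->private -- set via MEMFILE_PRIVATE() -- to pick the counter. */
static u64 example_read_u64(struct cgroup_subsys_state *css,
			    struct cftype *cft)
{
	unsigned long long val = 0;

	/* ... look up the resource counter selected by cft->private ... */
	return (u64)val;
}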
@@ -6271,6 +6534,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6271 mutex_init(&memcg->thresholds_lock); 6534 mutex_init(&memcg->thresholds_lock);
6272 spin_lock_init(&memcg->move_lock); 6535 spin_lock_init(&memcg->move_lock);
6273 vmpressure_init(&memcg->vmpressure); 6536 vmpressure_init(&memcg->vmpressure);
6537 INIT_LIST_HEAD(&memcg->event_list);
6538 spin_lock_init(&memcg->event_list_lock);
6274 6539
6275 return &memcg->css; 6540 return &memcg->css;
6276 6541
@@ -6346,6 +6611,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6346static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6611static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6347{ 6612{
6348 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6613 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6614 struct mem_cgroup_event *event, *tmp;
6615
6616 /*
6617 * Unregister events and notify userspace.
6618	 * Notify userspace about cgroup removal only after rmdir of the cgroup
6619	 * directory, to avoid a race between userspace and kernelspace.
6620 */
6621 spin_lock(&memcg->event_list_lock);
6622 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6623 list_del_init(&event->list);
6624 schedule_work(&event->remove);
6625 }
6626 spin_unlock(&memcg->event_list_lock);
6349 6627
6350 kmem_cgroup_css_offline(memcg); 6628 kmem_cgroup_css_offline(memcg);
6351 6629
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index d8bd2c500aa4..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -452,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
452 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry 452 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
453 * @ent: swap entry to be looked up. 453 * @ent: swap entry to be looked up.
454 * 454 *
455 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) 455 * Returns the mem_cgroup ID on success, 0 on failure (0 is an invalid ID)
456 */ 456 */
457unsigned short lookup_swap_cgroup_id(swp_entry_t ent) 457unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
458{ 458{
diff --git a/mm/percpu.c b/mm/percpu.c
index 65fd8a749712..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1689,10 +1689,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1689 max_distance += ai->unit_size; 1689 max_distance += ai->unit_size;
1690 1690
1691 /* warn if maximum distance is further than 75% of vmalloc space */ 1691 /* warn if maximum distance is further than 75% of vmalloc space */
1692 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1692 if (max_distance > VMALLOC_TOTAL * 3 / 4) {
1693 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1693 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1694 "space 0x%lx\n", max_distance, 1694 "space 0x%lx\n", max_distance,
1695 (unsigned long)(VMALLOC_END - VMALLOC_START)); 1695 VMALLOC_TOTAL);
1696#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1696#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1697 /* and fail if we have fallback */ 1697 /* and fail if we have fallback */
1698 rc = -EINVAL; 1698 rc = -EINVAL;
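
VMALLOC_TOTAL is, as the warning text suggests, just the vmalloc span under one name; presumably something like the following (assumed to live in include/linux/vmalloc.h, shown for context only):

/* Assumed definition; not part of this hunk. */
#ifdef CONFIG_MMU
#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
#else
#define VMALLOC_TOTAL 0UL
#endif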
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index e0f62837c3f4..196970a4541f 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
278 278
279/** 279/**
280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
281 * @css: css that is interested in vmpressure notifications 281 * @memcg: memcg that is interested in vmpressure notifications
282 * @cft: cgroup control files handle
283 * @eventfd: eventfd context to link notifications with 282 * @eventfd: eventfd context to link notifications with
284 * @args: event arguments (used to set up a pressure level threshold) 283 * @args: event arguments (used to set up a pressure level threshold)
285 * 284 *
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 288 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
290 * "critical"). 289 * "critical").
291 * 290 *
292 * This function should not be used directly, just pass it to (struct 291 * To be used as a memcg event method.
293 * cftype).register_event, and then cgroup core will handle everything by
294 * itself.
295 */ 292 */
296int vmpressure_register_event(struct cgroup_subsys_state *css, 293int vmpressure_register_event(struct mem_cgroup *memcg,
297 struct cftype *cft, struct eventfd_ctx *eventfd, 294 struct eventfd_ctx *eventfd, const char *args)
298 const char *args)
299{ 295{
300 struct vmpressure *vmpr = css_to_vmpressure(css); 296 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
301 struct vmpressure_event *ev; 297 struct vmpressure_event *ev;
302 int level; 298 int level;
303 299
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
325 321
326/** 322/**
327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 323 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
328 * @css: css handle 324 * @memcg: memcg handle
329 * @cft: cgroup control files handle
330 * @eventfd: eventfd context that was used to link vmpressure with the @cg 325 * @eventfd: eventfd context that was used to link vmpressure with the @memcg
331 * 326 *
332 * This function does internal manipulations to detach the @eventfd from 327 * This function does internal manipulations to detach the @eventfd from
333 * the vmpressure notifications, and then frees internal resources 328 * the vmpressure notifications, and then frees internal resources
334 * associated with the @eventfd (but the @eventfd itself is not freed). 329 * associated with the @eventfd (but the @eventfd itself is not freed).
335 * 330 *
336 * This function should not be used directly, just pass it to (struct 331 * To be used as a memcg event method.
337 * cftype).unregister_event, and then cgroup core will handle everything
338 * by itself.
339 */ 332 */
340void vmpressure_unregister_event(struct cgroup_subsys_state *css, 333void vmpressure_unregister_event(struct mem_cgroup *memcg,
341 struct cftype *cft,
342 struct eventfd_ctx *eventfd) 334 struct eventfd_ctx *eventfd)
343{ 335{
344 struct vmpressure *vmpr = css_to_vmpressure(css); 336 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
345 struct vmpressure_event *ev; 337 struct vmpressure_event *ev;
346 338
347 mutex_lock(&vmpr->events_lock); 339 mutex_lock(&vmpr->events_lock);
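
The @args string accepted by vmpressure_register_event() is matched against vmpressure_str_levels, per the doc comment above. A hedged sketch of that lookup (table contents taken from the comment; the helper name is invented for illustration):

/* Sketch only: match @args against the documented level names. */
static const char * const levels_sketch[] = { "low", "medium", "critical" };

static int level_from_args_sketch(const char *args)
{
	int i;

	for (i = 0; i < 3; i++)
		if (!strcmp(levels_sketch[i], args))
			return i;	/* index serves as the level */
	return -EINVAL;			/* unrecognized level string */
}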
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9b7cf6c85f82..56cbb69ba024 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -173,14 +173,14 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
173 return css->cgroup->id; 173 return css->cgroup->id;
174} 174}
175 175
176static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 176static int read_priomap(struct seq_file *sf, void *v)
177 struct cgroup_map_cb *cb)
178{ 177{
179 struct net_device *dev; 178 struct net_device *dev;
180 179
181 rcu_read_lock(); 180 rcu_read_lock();
182 for_each_netdev_rcu(&init_net, dev) 181 for_each_netdev_rcu(&init_net, dev)
183 cb->fill(cb, dev->name, netprio_prio(css, dev)); 182 seq_printf(sf, "%s %u\n", dev->name,
183 netprio_prio(seq_css(sf), dev));
184 rcu_read_unlock(); 184 rcu_read_unlock();
185 return 0; 185 return 0;
186} 186}
@@ -238,7 +238,7 @@ static struct cftype ss_files[] = {
238 }, 238 },
239 { 239 {
240 .name = "ifpriomap", 240 .name = "ifpriomap",
241 .read_map = read_priomap, 241 .seq_show = read_priomap,
242 .write_string = write_priomap, 242 .write_string = write_priomap,
243 }, 243 },
244 { } /* terminate */ 244 { } /* terminate */
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 7c2a0a71049e..d3b6d2cd3a06 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -274,10 +274,9 @@ static void set_majmin(char *str, unsigned m)
274 sprintf(str, "%u", m); 274 sprintf(str, "%u", m);
275} 275}
276 276
277static int devcgroup_seq_read(struct cgroup_subsys_state *css, 277static int devcgroup_seq_show(struct seq_file *m, void *v)
278 struct cftype *cft, struct seq_file *m)
279{ 278{
280 struct dev_cgroup *devcgroup = css_to_devcgroup(css); 279 struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m));
281 struct dev_exception_item *ex; 280 struct dev_exception_item *ex;
282 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 281 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
283 282
@@ -679,7 +678,7 @@ static struct cftype dev_cgroup_files[] = {
679 }, 678 },
680 { 679 {
681 .name = "list", 680 .name = "list",
682 .read_seq_string = devcgroup_seq_read, 681 .seq_show = devcgroup_seq_show,
683 .private = DEVCG_LIST, 682 .private = DEVCG_LIST,
684 }, 683 },
685 { } /* terminate */ 684 { } /* terminate */
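
read_priomap() and devcgroup_seq_show() above illustrate the conversion this whole series applies: .read_map/.read_seq_string handlers that took (css, cft, ...) plus a fill callback become .seq_show handlers that take only the seq_file and recover their css via seq_css(). In sketch form (names illustrative):

/* Illustrative conversion target, mirroring the handlers above. */
static int example_seq_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);	/* owning css */

	seq_printf(sf, "id %d\n", css->cgroup->id);	/* emit one line */
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example",
		.seq_show = example_seq_show,
	},
	{ }	/* terminate */
};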