-rw-r--r--  Documentation/ABI/stable/sysfs-devices-node | 96
-rw-r--r--  Documentation/cgroups/memory.txt | 66
-rw-r--r--  Documentation/cgroups/resource_counter.txt | 7
-rw-r--r--  arch/cris/include/asm/io.h | 39
-rw-r--r--  arch/h8300/Kconfig | 1
-rw-r--r--  arch/x86/platform/iris/iris.c | 67
-rw-r--r--  drivers/message/fusion/mptscsih.c | 1
-rw-r--r--  drivers/video/backlight/locomolcd.c | 38
-rw-r--r--  fs/ceph/export.c | 4
-rw-r--r--  include/linux/gfp.h | 5
-rw-r--r--  include/linux/hugetlb_cgroup.h | 5
-rw-r--r--  include/linux/memcontrol.h | 209
-rw-r--r--  include/linux/res_counter.h | 12
-rw-r--r--  include/linux/sched.h | 1
-rw-r--r--  include/linux/slab.h | 48
-rw-r--r--  include/linux/slab_def.h | 3
-rw-r--r--  include/linux/slub_def.h | 9
-rw-r--r--  include/linux/thread_info.h | 2
-rw-r--r--  include/trace/events/gfpflags.h | 1
-rw-r--r--  init/Kconfig | 2
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/res_counter.c | 20
-rw-r--r--  mm/Kconfig | 13
-rw-r--r--  mm/hugetlb.c | 11
-rw-r--r--  mm/hugetlb_cgroup.c | 19
-rw-r--r--  mm/kmemleak.c | 3
-rw-r--r--  mm/memcontrol.c | 1242
-rw-r--r--  mm/memory_hotplug.c | 18
-rw-r--r--  mm/mprotect.c | 30
-rw-r--r--  mm/page_alloc.c | 38
-rw-r--r--  mm/slab.c | 94
-rw-r--r--  mm/slab.h | 137
-rw-r--r--  mm/slab_common.c | 118
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 150
-rw-r--r--  mm/vmscan.c | 14
-rw-r--r--  scripts/coccinelle/api/d_find_alias.cocci | 80
38 files changed, 2407 insertions(+), 204 deletions(-)
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 49b82cad7003..ce259c13c36a 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -1,7 +1,101 @@
1What: /sys/devices/system/node/possible
2Date: October 2002
3Contact: Linux Memory Management list <linux-mm@kvack.org>
4Description:
5 Nodes that could be possibly become online at some point.
6
7What: /sys/devices/system/node/online
8Date: October 2002
9Contact: Linux Memory Management list <linux-mm@kvack.org>
10Description:
11 Nodes that are online.
12
13What: /sys/devices/system/node/has_normal_memory
14Date: October 2002
15Contact: Linux Memory Management list <linux-mm@kvack.org>
16Description:
17 Nodes that have regular memory.
18
19What: /sys/devices/system/node/has_cpu
20Date: October 2002
21Contact: Linux Memory Management list <linux-mm@kvack.org>
22Description:
23 Nodes that have one or more CPUs.
24
25What: /sys/devices/system/node/has_high_memory
26Date: October 2002
27Contact: Linux Memory Management list <linux-mm@kvack.org>
28Description:
29 Nodes that have regular or high memory.
30 Depends on CONFIG_HIGHMEM.
31
1What: /sys/devices/system/node/nodeX 32What: /sys/devices/system/node/nodeX
2Date: October 2002 33Date: October 2002
3Contact: Linux Memory Management list <linux-mm@kvack.org> 34Contact: Linux Memory Management list <linux-mm@kvack.org>
4Description: 35Description:
5 When CONFIG_NUMA is enabled, this is a directory containing 36 When CONFIG_NUMA is enabled, this is a directory containing
6 information on node X such as what CPUs are local to the 37 information on node X such as what CPUs are local to the
7 node. 38 node. Each file is detailed next.
39
40What: /sys/devices/system/node/nodeX/cpumap
41Date: October 2002
42Contact: Linux Memory Management list <linux-mm@kvack.org>
43Description:
44 The node's cpumap.
45
46What: /sys/devices/system/node/nodeX/cpulist
47Date: October 2002
48Contact: Linux Memory Management list <linux-mm@kvack.org>
49Description:
50 The CPUs associated to the node.
51
52What: /sys/devices/system/node/nodeX/meminfo
53Date: October 2002
54Contact: Linux Memory Management list <linux-mm@kvack.org>
55Description:
56 Provides information about the node's distribution and memory
57 utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt
58
59What: /sys/devices/system/node/nodeX/numastat
60Date: October 2002
61Contact: Linux Memory Management list <linux-mm@kvack.org>
62Description:
63 The node's hit/miss statistics, in units of pages.
64 See Documentation/numastat.txt
65
66What: /sys/devices/system/node/nodeX/distance
67Date: October 2002
68Contact: Linux Memory Management list <linux-mm@kvack.org>
69Description:
70 Distance between the node and all the other nodes
71 in the system.
72
73What: /sys/devices/system/node/nodeX/vmstat
74Date: October 2002
75Contact: Linux Memory Management list <linux-mm@kvack.org>
76Description:
77 The node's zoned virtual memory statistics.
78 This is a superset of numastat.
79
80What: /sys/devices/system/node/nodeX/compact
81Date: February 2010
82Contact: Mel Gorman <mel@csn.ul.ie>
83Description:
84 When this file is written to, all memory within that node
85 will be compacted. When it completes, memory will be freed
86 into blocks which have as many contiguous pages as possible
87
88What: /sys/devices/system/node/nodeX/scan_unevictable_pages
89Date: October 2008
90Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
91Description:
92 When set, it triggers scanning the node's unevictable lists
93 and move any pages that have become evictable onto the respective
94 zone's inactive list. See mm/vmscan.c
95
96What: /sys/devices/system/node/nodeX/hugepages/hugepages-<size>/
97Date: December 2009
98Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
99Description:
100 The node's huge page size control/query attributes.
101		See Documentation/vm/hugetlbpage.txt
\ No newline at end of file
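
The node attributes documented above are plain-text sysfs files. As a quick illustration (not part of this patch), the short C program below prints the online node mask and each node's meminfo; it assumes a Linux system with CONFIG_NUMA and sysfs mounted at /sys.

  /* Illustrative only: reads the sysfs node attributes documented above. */
  #include <stdio.h>

  static int dump_file(const char *path)
  {
      char buf[4096];
      size_t n;
      FILE *f = fopen(path, "r");

      if (!f)
          return -1;
      while ((n = fread(buf, 1, sizeof(buf) - 1, f)) > 0) {
          buf[n] = '\0';
          fputs(buf, stdout);
      }
      fclose(f);
      return 0;
  }

  int main(void)
  {
      char path[128];
      int nid;

      printf("online nodes: ");
      dump_file("/sys/devices/system/node/online");

      /* Each meminfo line already carries a "Node N" prefix. */
      for (nid = 0; ; nid++) {
          snprintf(path, sizeof(path),
                   "/sys/devices/system/node/node%d/meminfo", nid);
          if (dump_file(path) < 0)
              break;  /* no more nodes */
      }
      return 0;
  }
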
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index a25cb3fafeba..8b8c28b9864c 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -71,6 +71,11 @@ Brief summary of control files.
71 memory.oom_control # set/show oom controls. 71 memory.oom_control # set/show oom controls.
72 memory.numa_stat # show the number of memory usage per numa node 72 memory.numa_stat # show the number of memory usage per numa node
73 73
74 memory.kmem.limit_in_bytes # set/show hard limit for kernel memory
75 memory.kmem.usage_in_bytes # show current kernel memory allocation
76 memory.kmem.failcnt # show the number of kernel memory usage hits limits
77 memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded
78
74 memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory 79 memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory
75 memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation 80 memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation
76 memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits 81 memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits
@@ -268,20 +273,73 @@ the amount of kernel memory used by the system. Kernel memory is fundamentally
268different than user memory, since it can't be swapped out, which makes it 273different than user memory, since it can't be swapped out, which makes it
269possible to DoS the system by consuming too much of this precious resource. 274possible to DoS the system by consuming too much of this precious resource.
270 275
276Kernel memory won't be accounted at all until limit on a group is set. This
277allows for existing setups to continue working without disruption. The limit
278cannot be set if the cgroup have children, or if there are already tasks in the
279cgroup. Attempting to set the limit under those conditions will return -EBUSY.
280When use_hierarchy == 1 and a group is accounted, its children will
281automatically be accounted regardless of their limit value.
282
283After a group is first limited, it will be kept being accounted until it
284is removed. The memory limitation itself, can of course be removed by writing
285-1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not
286limited.
287
271Kernel memory limits are not imposed for the root cgroup. Usage for the root 288Kernel memory limits are not imposed for the root cgroup. Usage for the root
272cgroup may or may not be accounted. 289cgroup may or may not be accounted. The memory used is accumulated into
290memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
291(currently only for tcp).
292The main "kmem" counter is fed into the main counter, so kmem charges will
293also be visible from the user counter.
273 294
274Currently no soft limit is implemented for kernel memory. It is future work 295Currently no soft limit is implemented for kernel memory. It is future work
275to trigger slab reclaim when those limits are reached. 296to trigger slab reclaim when those limits are reached.
276 297
2772.7.1 Current Kernel Memory resources accounted 2982.7.1 Current Kernel Memory resources accounted
278 299
300* stack pages: every process consumes some stack pages. By accounting into
301kernel memory, we prevent new processes from being created when the kernel
302memory usage is too high.
303
304* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy
305of each kmem_cache is created everytime the cache is touched by the first time
306from inside the memcg. The creation is done lazily, so some objects can still be
307skipped while the cache is being created. All objects in a slab page should
308belong to the same memcg. This only fails to hold when a task is migrated to a
309different memcg during the page allocation by the cache.
310
279* sockets memory pressure: some sockets protocols have memory pressure 311* sockets memory pressure: some sockets protocols have memory pressure
280thresholds. The Memory Controller allows them to be controlled individually 312thresholds. The Memory Controller allows them to be controlled individually
281per cgroup, instead of globally. 313per cgroup, instead of globally.
282 314
283* tcp memory pressure: sockets memory pressure for the tcp protocol. 315* tcp memory pressure: sockets memory pressure for the tcp protocol.
284 316
3172.7.3 Common use cases
318
319Because the "kmem" counter is fed to the main user counter, kernel memory can
320never be limited completely independently of user memory. Say "U" is the user
321limit, and "K" the kernel limit. There are three possible ways limits can be
322set:
323
324 U != 0, K = unlimited:
325 This is the standard memcg limitation mechanism already present before kmem
326 accounting. Kernel memory is completely ignored.
327
328 U != 0, K < U:
329 Kernel memory is a subset of the user memory. This setup is useful in
330 deployments where the total amount of memory per-cgroup is overcommited.
331 Overcommiting kernel memory limits is definitely not recommended, since the
332 box can still run out of non-reclaimable memory.
333 In this case, the admin could set up K so that the sum of all groups is
334 never greater than the total memory, and freely set U at the cost of his
335 QoS.
336
337 U != 0, K >= U:
338 Since kmem charges will also be fed to the user counter and reclaim will be
339 triggered for the cgroup for both kinds of memory. This setup gives the
340 admin a unified view of memory, and it is also useful for people who just
341 want to track kernel memory usage.
342
2853. User Interface 3433. User Interface
286 344
2870. Configuration 3450. Configuration
@@ -290,6 +348,7 @@ a. Enable CONFIG_CGROUPS
290b. Enable CONFIG_RESOURCE_COUNTERS 348b. Enable CONFIG_RESOURCE_COUNTERS
291c. Enable CONFIG_MEMCG 349c. Enable CONFIG_MEMCG
292d. Enable CONFIG_MEMCG_SWAP (to use swap extension) 350d. Enable CONFIG_MEMCG_SWAP (to use swap extension)
351d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
293 352
2941. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 3531. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
295# mount -t tmpfs none /sys/fs/cgroup 354# mount -t tmpfs none /sys/fs/cgroup
@@ -406,6 +465,11 @@ About use_hierarchy, see Section 6.
406 Because rmdir() moves all pages to parent, some out-of-use page caches can be 465 Because rmdir() moves all pages to parent, some out-of-use page caches can be
407 moved to the parent. If you want to avoid that, force_empty will be useful. 466 moved to the parent. If you want to avoid that, force_empty will be useful.
408 467
468 Also, note that when memory.kmem.limit_in_bytes is set the charges due to
469 kernel pages will still be seen. This is not considered a failure and the
470 write will still return success. In this case, it is expected that
471 memory.kmem.usage_in_bytes == memory.usage_in_bytes.
472
409 About use_hierarchy, see Section 6. 473 About use_hierarchy, see Section 6.
410 474
4115.2 stat file 4755.2 stat file
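
For illustration of the new memory.kmem.* files (not part of the patch): the program below sets memory.kmem.limit_in_bytes on an existing group and reads the kmem counters back. The group path is an assumption following the /sys/fs/cgroup/memory/0 layout used in the examples earlier in this document; per the text above, the limit write fails with EBUSY if the group already has tasks or children.

  /* Illustrative only; exercises the memory.kmem.* files described above.
   * Assumes the memcg hierarchy is mounted and an empty group "0" exists,
   * e.g. /sys/fs/cgroup/memory/0 as in the earlier examples in this file. */
  #include <stdio.h>

  #define GROUP "/sys/fs/cgroup/memory/0"

  static int write_str(const char *path, const char *val)
  {
      FILE *f = fopen(path, "w");

      if (!f || fputs(val, f) == EOF) {
          perror(path);
          if (f)
              fclose(f);
          return -1;
      }
      return fclose(f);
  }

  static void show(const char *path)
  {
      char line[128];
      FILE *f = fopen(path, "r");

      if (!f) {
          perror(path);
          return;
      }
      if (fgets(line, sizeof(line), f))
          printf("%s: %s", path, line);
      fclose(f);
  }

  int main(void)
  {
      /* Set a 64M kernel-memory limit; may fail with EBUSY as noted above. */
      write_str(GROUP "/memory.kmem.limit_in_bytes", "67108864");

      show(GROUP "/memory.kmem.limit_in_bytes");
      show(GROUP "/memory.kmem.usage_in_bytes");
      show(GROUP "/memory.kmem.failcnt");
      return 0;
  }

Writing -1 to the same limit file afterwards removes the limit while the group keeps being accounted, as described above.
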
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index 0c4a344e78fa..c4d99ed0b418 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -83,16 +83,17 @@ to work with it.
83 res_counter->lock internally (it must be called with res_counter->lock 83 res_counter->lock internally (it must be called with res_counter->lock
84 held). The force parameter indicates whether we can bypass the limit. 84 held). The force parameter indicates whether we can bypass the limit.
85 85
86 e. void res_counter_uncharge[_locked] 86 e. u64 res_counter_uncharge[_locked]
87 (struct res_counter *rc, unsigned long val) 87 (struct res_counter *rc, unsigned long val)
88 88
89 When a resource is released (freed) it should be de-accounted 89 When a resource is released (freed) it should be de-accounted
90 from the resource counter it was accounted to. This is called 90 from the resource counter it was accounted to. This is called
91 "uncharging". 91 "uncharging". The return value of this function indicate the amount
92 of charges still present in the counter.
92 93
93 The _locked routines imply that the res_counter->lock is taken. 94 The _locked routines imply that the res_counter->lock is taken.
94 95
95 f. void res_counter_uncharge_until 96 f. u64 res_counter_uncharge_until
96 (struct res_counter *rc, struct res_counter *top, 97 (struct res_counter *rc, struct res_counter *top,
97 unsinged long val) 98 unsinged long val)
98 99
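
Since res_counter_uncharge() and friends now return the residual charge, a caller can learn whether it released the last outstanding charge without taking the counter lock again. The snippet below is a minimal userspace model of that return-value semantics, not the kernel implementation.

  /* Userspace model of the new uncharge return value; illustration only. */
  #include <stdio.h>

  struct res_counter {
      unsigned long long usage;
  };

  /* Mirrors the documented semantics: uncharge and report what is left. */
  static unsigned long long res_counter_uncharge(struct res_counter *rc,
                                                 unsigned long val)
  {
      if (rc->usage < val)  /* the kernel WARNs in this case */
          val = rc->usage;
      rc->usage -= val;
      return rc->usage;
  }

  int main(void)
  {
      struct res_counter kmem = { .usage = 3 * 4096 };

      res_counter_uncharge(&kmem, 4096);
      /* The caller learns, in the same locked operation in the kernel,
       * whether this uncharge dropped usage to zero. */
      if (res_counter_uncharge(&kmem, 2 * 4096) == 0)
          printf("last charge gone; safe to release the group\n");
      return 0;
  }
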
diff --git a/arch/cris/include/asm/io.h b/arch/cris/include/asm/io.h
index 32567bc2a421..ac12ae2b9286 100644
--- a/arch/cris/include/asm/io.h
+++ b/arch/cris/include/asm/io.h
@@ -133,12 +133,39 @@ static inline void writel(unsigned int b, volatile void __iomem *addr)
133#define insb(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,1,count) : 0) 133#define insb(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,1,count) : 0)
134#define insw(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,2,count) : 0) 134#define insw(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,2,count) : 0)
135#define insl(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,4,count) : 0) 135#define insl(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,4,count) : 0)
136#define outb(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,1,1) 136static inline void outb(unsigned char data, unsigned int port)
137#define outw(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,2,1) 137{
138#define outl(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,4,1) 138 if (cris_iops)
139#define outsb(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,1,count) 139 cris_iops->write_io(port, (void *) &data, 1, 1);
140#define outsw(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,2,count) 140}
141#define outsl(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,3,count) 141static inline void outw(unsigned short data, unsigned int port)
142{
143 if (cris_iops)
144 cris_iops->write_io(port, (void *) &data, 2, 1);
145}
146static inline void outl(unsigned int data, unsigned int port)
147{
148 if (cris_iops)
149 cris_iops->write_io(port, (void *) &data, 4, 1);
150}
151static inline void outsb(unsigned int port, const void *addr,
152 unsigned long count)
153{
154 if (cris_iops)
155 cris_iops->write_io(port, (void *)addr, 1, count);
156}
157static inline void outsw(unsigned int port, const void *addr,
158 unsigned long count)
159{
160 if (cris_iops)
161 cris_iops->write_io(port, (void *)addr, 2, count);
162}
163static inline void outsl(unsigned int port, const void *addr,
164 unsigned long count)
165{
166 if (cris_iops)
167 cris_iops->write_io(port, (void *)addr, 4, count);
168}
142 169
143/* 170/*
144 * Convert a physical pointer to a virtual kernel pointer for /dev/mem 171 * Convert a physical pointer to a virtual kernel pointer for /dev/mem
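
The hunk above replaces statement-like macros with static inline functions, which gives argument type checking and normal statement semantics. The toy program below (generic C, unrelated to the CRIS hardware) shows the classic hazard of the old form: a macro that expands to a bare `if' silently captures a following `else'; most compilers only warn about the ambiguity.

  /* Illustration of the macro pitfall fixed by the conversion above. */
  #include <stdio.h>

  static int enabled = 1;

  /* Old style: expands to a bare `if' statement. */
  #define OUTB_MACRO(data, port) \
      if (enabled) printf("outb %d -> %d\n", (data), (port))

  /* New style: a real function; its `if' stays private to the body. */
  static inline void outb_inline(unsigned char data, unsigned int port)
  {
      if (enabled)
          printf("outb %d -> %u\n", data, port);
  }

  int main(void)
  {
      int ready = 0;

      if (ready)
          OUTB_MACRO(1, 0x80);
      else
          /* Never reached: this `else' binds to the macro's hidden `if'. */
          printf("device not ready (macro version)\n");

      if (ready)
          outb_inline(1, 0x80);
      else
          /* Pairs with `if (ready)' as intended. */
          printf("device not ready (inline version)\n");

      return 0;
  }

Only the inline version prints here; the macro's else arm is silently skipped, which is exactly the kind of bug the conversion avoids.
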
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 04bef4d25b4a..0ae445087607 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -3,6 +3,7 @@ config H8300
3 default y 3 default y
4 select HAVE_IDE 4 select HAVE_IDE
5 select HAVE_GENERIC_HARDIRQS 5 select HAVE_GENERIC_HARDIRQS
6 select GENERIC_ATOMIC64
6 select HAVE_UID16 7 select HAVE_UID16
7 select ARCH_WANT_IPC_PARSE_VERSION 8 select ARCH_WANT_IPC_PARSE_VERSION
8 select GENERIC_IRQ_SHOW 9 select GENERIC_IRQ_SHOW
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 5917eb56b313..e6cb80f620af 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/moduleparam.h> 24#include <linux/moduleparam.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/platform_device.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/delay.h> 29#include <linux/delay.h>
@@ -62,29 +63,75 @@ static void iris_power_off(void)
62 * by reading its input port and seeing whether the read value is 63 * by reading its input port and seeing whether the read value is
63 * meaningful. 64 * meaningful.
64 */ 65 */
65static int iris_init(void) 66static int iris_probe(struct platform_device *pdev)
66{ 67{
67 unsigned char status; 68 unsigned char status = inb(IRIS_GIO_INPUT);
68 if (force != 1) {
69 printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
70 return -ENODEV;
71 }
72 status = inb(IRIS_GIO_INPUT);
73 if (status == IRIS_GIO_NODEV) { 69 if (status == IRIS_GIO_NODEV) {
74 printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); 70 printk(KERN_ERR "This machine does not seem to be an Iris. "
71 "Power off handler not installed.\n");
75 return -ENODEV; 72 return -ENODEV;
76 } 73 }
77 old_pm_power_off = pm_power_off; 74 old_pm_power_off = pm_power_off;
78 pm_power_off = &iris_power_off; 75 pm_power_off = &iris_power_off;
79 printk(KERN_INFO "Iris power_off handler installed.\n"); 76 printk(KERN_INFO "Iris power_off handler installed.\n");
80
81 return 0; 77 return 0;
82} 78}
83 79
84static void iris_exit(void) 80static int iris_remove(struct platform_device *pdev)
85{ 81{
86 pm_power_off = old_pm_power_off; 82 pm_power_off = old_pm_power_off;
87 printk(KERN_INFO "Iris power_off handler uninstalled.\n"); 83 printk(KERN_INFO "Iris power_off handler uninstalled.\n");
84 return 0;
85}
86
87static struct platform_driver iris_driver = {
88 .driver = {
89 .name = "iris",
90 .owner = THIS_MODULE,
91 },
92 .probe = iris_probe,
93 .remove = iris_remove,
94};
95
96static struct resource iris_resources[] = {
97 {
98 .start = IRIS_GIO_BASE,
99 .end = IRIS_GIO_OUTPUT,
100 .flags = IORESOURCE_IO,
101 .name = "address"
102 }
103};
104
105static struct platform_device *iris_device;
106
107static int iris_init(void)
108{
109 int ret;
110 if (force != 1) {
111 printk(KERN_ERR "The force parameter has not been set to 1."
112 " The Iris poweroff handler will not be installed.\n");
113 return -ENODEV;
114 }
115 ret = platform_driver_register(&iris_driver);
116 if (ret < 0) {
117 printk(KERN_ERR "Failed to register iris platform driver: %d\n",
118 ret);
119 return ret;
120 }
121 iris_device = platform_device_register_simple("iris", (-1),
122 iris_resources, ARRAY_SIZE(iris_resources));
123 if (IS_ERR(iris_device)) {
124 printk(KERN_ERR "Failed to register iris platform device\n");
125 platform_driver_unregister(&iris_driver);
126 return PTR_ERR(iris_device);
127 }
128 return 0;
129}
130
131static void iris_exit(void)
132{
133 platform_device_unregister(iris_device);
134 platform_driver_unregister(&iris_driver);
88} 135}
89 136
90module_init(iris_init); 137module_init(iris_init);
diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c
index 0c3ced70707b..164afa71bba7 100644
--- a/drivers/message/fusion/mptscsih.c
+++ b/drivers/message/fusion/mptscsih.c
@@ -792,6 +792,7 @@ mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *mr)
792 * than an unsolicited DID_ABORT. 792 * than an unsolicited DID_ABORT.
793 */ 793 */
794 sc->result = DID_RESET << 16; 794 sc->result = DID_RESET << 16;
795 break;
795 796
796 case MPI_IOCSTATUS_SCSI_EXT_TERMINATED: /* 0x004C */ 797 case MPI_IOCSTATUS_SCSI_EXT_TERMINATED: /* 0x004C */
797 if (ioc->bus_type == FC) 798 if (ioc->bus_type == FC)
diff --git a/drivers/video/backlight/locomolcd.c b/drivers/video/backlight/locomolcd.c
index 3a6d5419e3e3..146fea8aa431 100644
--- a/drivers/video/backlight/locomolcd.c
+++ b/drivers/video/backlight/locomolcd.c
@@ -107,7 +107,6 @@ void locomolcd_power(int on)
107} 107}
108EXPORT_SYMBOL(locomolcd_power); 108EXPORT_SYMBOL(locomolcd_power);
109 109
110
111static int current_intensity; 110static int current_intensity;
112 111
113static int locomolcd_set_intensity(struct backlight_device *bd) 112static int locomolcd_set_intensity(struct backlight_device *bd)
@@ -122,13 +121,25 @@ static int locomolcd_set_intensity(struct backlight_device *bd)
122 intensity = 0; 121 intensity = 0;
123 122
124 switch (intensity) { 123 switch (intensity) {
125 /* AC and non-AC are handled differently, but produce same results in sharp code? */ 124 /*
126 case 0: locomo_frontlight_set(locomolcd_dev, 0, 0, 161); break; 125 * AC and non-AC are handled differently,
127 case 1: locomo_frontlight_set(locomolcd_dev, 117, 0, 161); break; 126 * but produce same results in sharp code?
128 case 2: locomo_frontlight_set(locomolcd_dev, 163, 0, 148); break; 127 */
129 case 3: locomo_frontlight_set(locomolcd_dev, 194, 0, 161); break; 128 case 0:
130 case 4: locomo_frontlight_set(locomolcd_dev, 194, 1, 161); break; 129 locomo_frontlight_set(locomolcd_dev, 0, 0, 161);
131 130 break;
131 case 1:
132 locomo_frontlight_set(locomolcd_dev, 117, 0, 161);
133 break;
134 case 2:
135 locomo_frontlight_set(locomolcd_dev, 163, 0, 148);
136 break;
137 case 3:
138 locomo_frontlight_set(locomolcd_dev, 194, 0, 161);
139 break;
140 case 4:
141 locomo_frontlight_set(locomolcd_dev, 194, 1, 161);
142 break;
132 default: 143 default:
133 return -ENODEV; 144 return -ENODEV;
134 } 145 }
@@ -175,9 +186,11 @@ static int locomolcd_probe(struct locomo_dev *ldev)
175 186
176 locomo_gpio_set_dir(ldev->dev.parent, LOCOMO_GPIO_FL_VR, 0); 187 locomo_gpio_set_dir(ldev->dev.parent, LOCOMO_GPIO_FL_VR, 0);
177 188
178 /* the poodle_lcd_power function is called for the first time 189 /*
190 * the poodle_lcd_power function is called for the first time
179 * from fs_initcall, which is before locomo is activated. 191 * from fs_initcall, which is before locomo is activated.
180 * We need to recall poodle_lcd_power here*/ 192 * We need to recall poodle_lcd_power here
193 */
181 if (machine_is_poodle()) 194 if (machine_is_poodle())
182 locomolcd_power(1); 195 locomolcd_power(1);
183 196
@@ -190,8 +203,8 @@ static int locomolcd_probe(struct locomo_dev *ldev)
190 &ldev->dev, NULL, 203 &ldev->dev, NULL,
191 &locomobl_data, &props); 204 &locomobl_data, &props);
192 205
193 if (IS_ERR (locomolcd_bl_device)) 206 if (IS_ERR(locomolcd_bl_device))
194 return PTR_ERR (locomolcd_bl_device); 207 return PTR_ERR(locomolcd_bl_device);
195 208
196 /* Set up frontlight so that screen is readable */ 209 /* Set up frontlight so that screen is readable */
197 locomolcd_bl_device->props.brightness = 2; 210 locomolcd_bl_device->props.brightness = 2;
@@ -226,7 +239,6 @@ static struct locomo_driver poodle_lcd_driver = {
226 .resume = locomolcd_resume, 239 .resume = locomolcd_resume,
227}; 240};
228 241
229
230static int __init locomolcd_init(void) 242static int __init locomolcd_init(void)
231{ 243{
232 return locomo_driver_register(&poodle_lcd_driver); 244 return locomo_driver_register(&poodle_lcd_driver);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9349bb37a2fe..ca3ab3f9ca70 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
56 struct ceph_nfs_confh *cfh = (void *)rawfh; 56 struct ceph_nfs_confh *cfh = (void *)rawfh;
57 int connected_handle_length = sizeof(*cfh)/4; 57 int connected_handle_length = sizeof(*cfh)/4;
58 int handle_length = sizeof(*fh)/4; 58 int handle_length = sizeof(*fh)/4;
59 struct dentry *dentry = d_find_alias(inode); 59 struct dentry *dentry;
60 struct dentry *parent; 60 struct dentry *parent;
61 61
62 /* don't re-export snaps */ 62 /* don't re-export snaps */
63 if (ceph_snap(inode) != CEPH_NOSNAP) 63 if (ceph_snap(inode) != CEPH_NOSNAP)
64 return -EINVAL; 64 return -EINVAL;
65 65
66 dentry = d_find_alias(inode);
67
66 /* if we found an alias, generate a connectable fh */ 68 /* if we found an alias, generate a connectable fh */
67 if (*max_len >= connected_handle_length && dentry) { 69 if (*max_len >= connected_handle_length && dentry) {
68 dout("encode_fh %p connectable\n", dentry); 70 dout("encode_fh %p connectable\n", dentry);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f74856e17e48..0f615eb23d05 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,6 +30,7 @@ struct vm_area_struct;
30#define ___GFP_HARDWALL 0x20000u 30#define ___GFP_HARDWALL 0x20000u
31#define ___GFP_THISNODE 0x40000u 31#define ___GFP_THISNODE 0x40000u
32#define ___GFP_RECLAIMABLE 0x80000u 32#define ___GFP_RECLAIMABLE 0x80000u
33#define ___GFP_KMEMCG 0x100000u
33#define ___GFP_NOTRACK 0x200000u 34#define ___GFP_NOTRACK 0x200000u
34#define ___GFP_NO_KSWAPD 0x400000u 35#define ___GFP_NO_KSWAPD 0x400000u
35#define ___GFP_OTHER_NODE 0x800000u 36#define ___GFP_OTHER_NODE 0x800000u
@@ -89,6 +90,7 @@ struct vm_area_struct;
89 90
90#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) 91#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
91#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ 92#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
93#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
92#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ 94#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
93 95
94/* 96/*
@@ -365,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order);
365extern void free_hot_cold_page(struct page *page, int cold); 367extern void free_hot_cold_page(struct page *page, int cold);
366extern void free_hot_cold_page_list(struct list_head *list, int cold); 368extern void free_hot_cold_page_list(struct list_head *list, int cold);
367 369
370extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
371extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
372
368#define __free_page(page) __free_pages((page), 0) 373#define __free_page(page) __free_pages((page), 0)
369#define free_page(addr) free_pages((addr), 0) 374#define free_page(addr) free_pages((addr), 0)
370 375
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index d73878c694b3..ce8217f7b5c2 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -62,7 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
62 struct page *page); 62 struct page *page);
63extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 63extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
64 struct hugetlb_cgroup *h_cg); 64 struct hugetlb_cgroup *h_cg);
65extern int hugetlb_cgroup_file_init(int idx) __init; 65extern void hugetlb_cgroup_file_init(void) __init;
66extern void hugetlb_cgroup_migrate(struct page *oldhpage, 66extern void hugetlb_cgroup_migrate(struct page *oldhpage,
67 struct page *newhpage); 67 struct page *newhpage);
68 68
@@ -111,9 +111,8 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
111 return; 111 return;
112} 112}
113 113
114static inline int __init hugetlb_cgroup_file_init(int idx) 114static inline void hugetlb_cgroup_file_init(void)
115{ 115{
116 return 0;
117} 116}
118 117
119static inline void hugetlb_cgroup_migrate(struct page *oldhpage, 118static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e98a74c0c9c0..0108a56f814e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,11 +21,14 @@
21#define _LINUX_MEMCONTROL_H 21#define _LINUX_MEMCONTROL_H
22#include <linux/cgroup.h> 22#include <linux/cgroup.h>
23#include <linux/vm_event_item.h> 23#include <linux/vm_event_item.h>
24#include <linux/hardirq.h>
25#include <linux/jump_label.h>
24 26
25struct mem_cgroup; 27struct mem_cgroup;
26struct page_cgroup; 28struct page_cgroup;
27struct page; 29struct page;
28struct mm_struct; 30struct mm_struct;
31struct kmem_cache;
29 32
30/* Stats that can be updated by kernel. */ 33/* Stats that can be updated by kernel. */
31enum mem_cgroup_page_stat_item { 34enum mem_cgroup_page_stat_item {
@@ -414,5 +417,211 @@ static inline void sock_release_memcg(struct sock *sk)
414{ 417{
415} 418}
416#endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ 419#endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
420
421#ifdef CONFIG_MEMCG_KMEM
422extern struct static_key memcg_kmem_enabled_key;
423
424extern int memcg_limited_groups_array_size;
425
426/*
427 * Helper macro to loop through all memcg-specific caches. Callers must still
428 * check if the cache is valid (it is either valid or NULL).
429 * the slab_mutex must be held when looping through those caches
430 */
431#define for_each_memcg_cache_index(_idx) \
432 for ((_idx) = 0; i < memcg_limited_groups_array_size; (_idx)++)
433
434static inline bool memcg_kmem_enabled(void)
435{
436 return static_key_false(&memcg_kmem_enabled_key);
437}
438
439/*
440 * In general, we'll do everything in our power to not incur in any overhead
441 * for non-memcg users for the kmem functions. Not even a function call, if we
442 * can avoid it.
443 *
444 * Therefore, we'll inline all those functions so that in the best case, we'll
445 * see that kmemcg is off for everybody and proceed quickly. If it is on,
446 * we'll still do most of the flag checking inline. We check a lot of
447 * conditions, but because they are pretty simple, they are expected to be
448 * fast.
449 */
450bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
451 int order);
452void __memcg_kmem_commit_charge(struct page *page,
453 struct mem_cgroup *memcg, int order);
454void __memcg_kmem_uncharge_pages(struct page *page, int order);
455
456int memcg_cache_id(struct mem_cgroup *memcg);
457int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
458 struct kmem_cache *root_cache);
459void memcg_release_cache(struct kmem_cache *cachep);
460void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
461
462int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
463void memcg_update_array_size(int num_groups);
464
465struct kmem_cache *
466__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
467
468void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
469void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
470
471/**
472 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
473 * @gfp: the gfp allocation flags.
474 * @memcg: a pointer to the memcg this was charged against.
475 * @order: allocation order.
476 *
477 * returns true if the memcg where the current task belongs can hold this
478 * allocation.
479 *
480 * We return true automatically if this allocation is not to be accounted to
481 * any memcg.
482 */
483static inline bool
484memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
485{
486 if (!memcg_kmem_enabled())
487 return true;
488
489 /*
490 * __GFP_NOFAIL allocations will move on even if charging is not
491 * possible. Therefore we don't even try, and have this allocation
492 * unaccounted. We could in theory charge it with
493 * res_counter_charge_nofail, but we hope those allocations are rare,
494 * and won't be worth the trouble.
495 */
496 if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
497 return true;
498 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
499 return true;
500
501 /* If the test is dying, just let it go. */
502 if (unlikely(fatal_signal_pending(current)))
503 return true;
504
505 return __memcg_kmem_newpage_charge(gfp, memcg, order);
506}
507
508/**
509 * memcg_kmem_uncharge_pages: uncharge pages from memcg
510 * @page: pointer to struct page being freed
511 * @order: allocation order.
512 *
513 * there is no need to specify memcg here, since it is embedded in page_cgroup
514 */
515static inline void
516memcg_kmem_uncharge_pages(struct page *page, int order)
517{
518 if (memcg_kmem_enabled())
519 __memcg_kmem_uncharge_pages(page, order);
520}
521
522/**
523 * memcg_kmem_commit_charge: embeds correct memcg in a page
524 * @page: pointer to struct page recently allocated
525 * @memcg: the memcg structure we charged against
526 * @order: allocation order.
527 *
528 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
529 * failure of the allocation. if @page is NULL, this function will revert the
530 * charges. Otherwise, it will commit the memcg given by @memcg to the
531 * corresponding page_cgroup.
532 */
533static inline void
534memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
535{
536 if (memcg_kmem_enabled() && memcg)
537 __memcg_kmem_commit_charge(page, memcg, order);
538}
539
540/**
541 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
542 * @cachep: the original global kmem cache
543 * @gfp: allocation flags.
544 *
545 * This function assumes that the task allocating, which determines the memcg
546 * in the page allocator, belongs to the same cgroup throughout the whole
547 * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
548 * while belonging to a cgroup, and later on changes. This is considered
549 * acceptable, and should only happen upon task migration.
550 *
551 * Before the cache is created by the memcg core, there is also a possible
552 * imbalance: the task belongs to a memcg, but the cache being allocated from
553 * is the global cache, since the child cache is not yet guaranteed to be
554 * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
555 * passed and the page allocator will not attempt any cgroup accounting.
556 */
557static __always_inline struct kmem_cache *
558memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
559{
560 if (!memcg_kmem_enabled())
561 return cachep;
562 if (gfp & __GFP_NOFAIL)
563 return cachep;
564 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
565 return cachep;
566 if (unlikely(fatal_signal_pending(current)))
567 return cachep;
568
569 return __memcg_kmem_get_cache(cachep, gfp);
570}
571#else
572#define for_each_memcg_cache_index(_idx) \
573 for (; NULL; )
574
575static inline bool memcg_kmem_enabled(void)
576{
577 return false;
578}
579
580static inline bool
581memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
582{
583 return true;
584}
585
586static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
587{
588}
589
590static inline void
591memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
592{
593}
594
595static inline int memcg_cache_id(struct mem_cgroup *memcg)
596{
597 return -1;
598}
599
600static inline int
601memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
602 struct kmem_cache *root_cache)
603{
604 return 0;
605}
606
607static inline void memcg_release_cache(struct kmem_cache *cachep)
608{
609}
610
611static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
612 struct kmem_cache *s)
613{
614}
615
616static inline struct kmem_cache *
617memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
618{
619 return cachep;
620}
621
622static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
623{
624}
625#endif /* CONFIG_MEMCG_KMEM */
417#endif /* _LINUX_MEMCONTROL_H */ 626#endif /* _LINUX_MEMCONTROL_H */
418 627
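
The inline wrappers above describe a charge/commit/uncharge protocol around page allocation. The sketch below is a self-contained userspace model of that ordering only; the struct fields and helpers are stand-ins, not kernel code.

  /* Userspace model of the charge/commit/uncharge protocol documented above.
   * Only the call ordering mirrors the header comments. */
  #include <stdio.h>
  #include <stdlib.h>
  #include <stdbool.h>

  struct mem_cgroup { long kmem_usage, kmem_limit; };
  struct page { struct mem_cgroup *memcg; };

  static bool kmem_newpage_charge(struct mem_cgroup *memcg, int nr_pages)
  {
      if (memcg->kmem_usage + nr_pages > memcg->kmem_limit)
          return false;                   /* over the kmem limit */
      memcg->kmem_usage += nr_pages;      /* charge up front */
      return true;
  }

  static void kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
                                 int nr_pages)
  {
      if (!page) {                        /* allocation failed: revert */
          memcg->kmem_usage -= nr_pages;
          return;
      }
      page->memcg = memcg;                /* bind the charge to the page */
  }

  static void kmem_uncharge_pages(struct page *page, int nr_pages)
  {
      page->memcg->kmem_usage -= nr_pages;
  }

  int main(void)
  {
      struct mem_cgroup cg = { .kmem_usage = 0, .kmem_limit = 4 };
      struct page *page;

      /* 1) ask permission before allocating */
      if (!kmem_newpage_charge(&cg, 1))
          return 1;
      /* 2) allocate; 3) commit (or revert) regardless of the outcome */
      page = calloc(1, sizeof(*page));
      kmem_commit_charge(page, &cg, 1);
      printf("usage after alloc: %ld\n", cg.kmem_usage);
      /* 4) uncharge when the page is freed */
      if (page) {
          kmem_uncharge_pages(page, 1);
          free(page);
      }
      printf("usage after free:  %ld\n", cg.kmem_usage);
      return 0;
  }

The point mirrored from the comments: commit must be called even when the allocation fails, so that the pre-charge can be reverted.
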
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 6f54e40fa218..5ae8456d9670 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -125,14 +125,16 @@ int res_counter_charge_nofail(struct res_counter *counter,
125 * 125 *
126 * these calls check for usage underflow and show a warning on the console 126 * these calls check for usage underflow and show a warning on the console
127 * _locked call expects the counter->lock to be taken 127 * _locked call expects the counter->lock to be taken
128 *
129 * returns the total charges still present in @counter.
128 */ 130 */
129 131
130void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); 132u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
131void res_counter_uncharge(struct res_counter *counter, unsigned long val); 133u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
132 134
133void res_counter_uncharge_until(struct res_counter *counter, 135u64 res_counter_uncharge_until(struct res_counter *counter,
134 struct res_counter *top, 136 struct res_counter *top,
135 unsigned long val); 137 unsigned long val);
136/** 138/**
137 * res_counter_margin - calculate chargeable space of a counter 139 * res_counter_margin - calculate chargeable space of a counter
138 * @cnt: the counter 140 * @cnt: the counter
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9914c662ed7b..f712465b05c5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1597,6 +1597,7 @@ struct task_struct {
1597 unsigned long nr_pages; /* uncharged usage */ 1597 unsigned long nr_pages; /* uncharged usage */
1598 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ 1598 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
1599 } memcg_batch; 1599 } memcg_batch;
1600 unsigned int memcg_kmem_skip_account;
1600#endif 1601#endif
1601#ifdef CONFIG_HAVE_HW_BREAKPOINT 1602#ifdef CONFIG_HAVE_HW_BREAKPOINT
1602 atomic_t ptrace_bp_refcnt; 1603 atomic_t ptrace_bp_refcnt;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 743a10415122..5d168d7e0a28 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -11,6 +11,8 @@
11 11
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/workqueue.h>
15
14 16
15/* 17/*
16 * Flags to pass to kmem_cache_create(). 18 * Flags to pass to kmem_cache_create().
@@ -116,6 +118,7 @@ struct kmem_cache {
116}; 118};
117#endif 119#endif
118 120
121struct mem_cgroup;
119/* 122/*
120 * struct kmem_cache related prototypes 123 * struct kmem_cache related prototypes
121 */ 124 */
@@ -125,6 +128,9 @@ int slab_is_available(void);
125struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, 128struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
126 unsigned long, 129 unsigned long,
127 void (*)(void *)); 130 void (*)(void *));
131struct kmem_cache *
132kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t,
133 unsigned long, void (*)(void *), struct kmem_cache *);
128void kmem_cache_destroy(struct kmem_cache *); 134void kmem_cache_destroy(struct kmem_cache *);
129int kmem_cache_shrink(struct kmem_cache *); 135int kmem_cache_shrink(struct kmem_cache *);
130void kmem_cache_free(struct kmem_cache *, void *); 136void kmem_cache_free(struct kmem_cache *, void *);
@@ -175,6 +181,48 @@ void kmem_cache_free(struct kmem_cache *, void *);
175#ifndef ARCH_SLAB_MINALIGN 181#ifndef ARCH_SLAB_MINALIGN
176#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 182#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
177#endif 183#endif
184/*
185 * This is the main placeholder for memcg-related information in kmem caches.
186 * struct kmem_cache will hold a pointer to it, so the memory cost while
187 * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it
188 * would otherwise be if that would be bundled in kmem_cache: we'll need an
189 * extra pointer chase. But the trade off clearly lays in favor of not
190 * penalizing non-users.
191 *
192 * Both the root cache and the child caches will have it. For the root cache,
193 * this will hold a dynamically allocated array large enough to hold
194 * information about the currently limited memcgs in the system.
195 *
196 * Child caches will hold extra metadata needed for its operation. Fields are:
197 *
198 * @memcg: pointer to the memcg this cache belongs to
199 * @list: list_head for the list of all caches in this memcg
200 * @root_cache: pointer to the global, root cache, this cache was derived from
201 * @dead: set to true after the memcg dies; the cache may still be around.
202 * @nr_pages: number of pages that belongs to this cache.
203 * @destroy: worker to be called whenever we are ready, or believe we may be
204 * ready, to destroy this cache.
205 */
206struct memcg_cache_params {
207 bool is_root_cache;
208 union {
209 struct kmem_cache *memcg_caches[0];
210 struct {
211 struct mem_cgroup *memcg;
212 struct list_head list;
213 struct kmem_cache *root_cache;
214 bool dead;
215 atomic_t nr_pages;
216 struct work_struct destroy;
217 };
218 };
219};
220
221int memcg_update_all_caches(int num_memcgs);
222
223struct seq_file;
224int cache_show(struct kmem_cache *s, struct seq_file *m);
225void print_slabinfo_header(struct seq_file *m);
178 226
179/* 227/*
180 * Common kmalloc functions provided by all allocators 228 * Common kmalloc functions provided by all allocators
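
The memcg_cache_params layout above means a root cache carries an array of per-memcg clones indexed by the memcg's kmem id, while each clone points back to its root. Below is a minimal userspace model of that lookup; the fallback-to-root behaviour follows the memcg_kmem_get_cache() comment in memcontrol.h, and all names and sizes are illustrative.

  /* Userspace model of the root-cache / per-memcg child-cache relationship. */
  #include <stdio.h>
  #include <stdlib.h>

  struct mem_cgroup { int kmemcg_id; };      /* index into the root's array */

  struct kmem_cache {
      const char *name;
      struct kmem_cache *root_cache;         /* NULL for the root itself */
      struct kmem_cache **memcg_caches;      /* root only: per-memcg copies */
  };

  /* Pick the cache to allocate from, as memcg_kmem_get_cache() would. */
  static struct kmem_cache *pick_cache(struct kmem_cache *root,
                                       struct mem_cgroup *memcg)
  {
      struct kmem_cache *c;

      if (!memcg)
          return root;
      c = root->memcg_caches[memcg->kmemcg_id];
      return c ? c : root;   /* child not created yet: fall back to root */
  }

  int main(void)
  {
      struct kmem_cache root = { .name = "dentry" };
      struct kmem_cache child = { .name = "dentry(0:A)", .root_cache = &root };
      struct mem_cgroup cg = { .kmemcg_id = 0 };

      root.memcg_caches = calloc(4, sizeof(*root.memcg_caches));
      root.memcg_caches[cg.kmemcg_id] = &child;

      printf("no memcg -> %s\n", pick_cache(&root, NULL)->name);
      printf("memcg A  -> %s\n", pick_cache(&root, &cg)->name);

      free(root.memcg_caches);
      return 0;
  }
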
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 45c0356fdc8c..8bb6e0eaf3c6 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -81,6 +81,9 @@ struct kmem_cache {
81 */ 81 */
82 int obj_offset; 82 int obj_offset;
83#endif /* CONFIG_DEBUG_SLAB */ 83#endif /* CONFIG_DEBUG_SLAB */
84#ifdef CONFIG_MEMCG_KMEM
85 struct memcg_cache_params *memcg_params;
86#endif
84 87
85/* 6) per-cpu/per-node data, touched during every alloc/free */ 88/* 6) per-cpu/per-node data, touched during every alloc/free */
86 /* 89 /*
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index df448adb7283..9db4825cd393 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -101,6 +101,10 @@ struct kmem_cache {
101#ifdef CONFIG_SYSFS 101#ifdef CONFIG_SYSFS
102 struct kobject kobj; /* For sysfs */ 102 struct kobject kobj; /* For sysfs */
103#endif 103#endif
104#ifdef CONFIG_MEMCG_KMEM
105 struct memcg_cache_params *memcg_params;
106 int max_attr_size; /* for propagation, maximum size of a stored attr */
107#endif
104 108
105#ifdef CONFIG_NUMA 109#ifdef CONFIG_NUMA
106 /* 110 /*
@@ -222,7 +226,10 @@ void *__kmalloc(size_t size, gfp_t flags);
222static __always_inline void * 226static __always_inline void *
223kmalloc_order(size_t size, gfp_t flags, unsigned int order) 227kmalloc_order(size_t size, gfp_t flags, unsigned int order)
224{ 228{
225 void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); 229 void *ret;
230
231 flags |= (__GFP_COMP | __GFP_KMEMCG);
232 ret = (void *) __get_free_pages(flags, order);
226 kmemleak_alloc(ret, size, 1, flags); 233 kmemleak_alloc(ret, size, 1, flags);
227 return ret; 234 return ret;
228} 235}
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ccc1899bd62e..e7e04736802f 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm);
61# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) 61# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK)
62#endif 62#endif
63 63
64#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
65
64/* 66/*
65 * flag set/clear/test wrappers 67 * flag set/clear/test wrappers
66 * - pass TIF_xxxx constants to these functions 68 * - pass TIF_xxxx constants to these functions
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index d6fd8e5b14b7..1eddbf1557f2 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,6 +34,7 @@
34 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ 34 {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
35 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ 35 {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ 36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
37 {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \
37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ 38 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ 39 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
39 {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ 40 {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
diff --git a/init/Kconfig b/init/Kconfig
index 675d8a2326cf..7d30240e5bfe 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -882,7 +882,7 @@ config MEMCG_SWAP_ENABLED
882config MEMCG_KMEM 882config MEMCG_KMEM
883 bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" 883 bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
884 depends on MEMCG && EXPERIMENTAL 884 depends on MEMCG && EXPERIMENTAL
885 default n 885 depends on SLUB || SLAB
886 help 886 help
887 The Kernel Memory extension for Memory Resource Controller can limit 887 The Kernel Memory extension for Memory Resource Controller can limit
888 the amount of memory used by kernel objects in the system. Those are 888 the amount of memory used by kernel objects in the system. Those are
diff --git a/kernel/fork.c b/kernel/fork.c
index c36c4e301efe..85f6d536608d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 147 int node)
148{ 148{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
150 THREAD_SIZE_ORDER); 150 THREAD_SIZE_ORDER);
151 151
152 return page ? page_address(page) : NULL; 152 return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
154 154
155static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
156{ 156{
157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158} 158}
159# else 159# else
160static struct kmem_cache *thread_info_cache; 160static struct kmem_cache *thread_info_cache;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 35c70c9e24d8..e49a288fa479 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused)
818 action = kthread_data(tsk); 818 action = kthread_data(tsk);
819 819
820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
821 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 821 tsk->comm, tsk->pid, action->irq);
822 822
823 823
824 desc = irq_to_desc(action->irq); 824 desc = irq_to_desc(action->irq);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 3920d593e63c..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 86 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 87}
88 88
89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 90{
91 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 92 val = counter->usage;
93 93
94 counter->usage -= val; 94 counter->usage -= val;
95 return counter->usage;
95} 96}
96 97
97void res_counter_uncharge_until(struct res_counter *counter, 98u64 res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top, 99 struct res_counter *top,
99 unsigned long val) 100 unsigned long val)
100{ 101{
101 unsigned long flags; 102 unsigned long flags;
102 struct res_counter *c; 103 struct res_counter *c;
104 u64 ret = 0;
103 105
104 local_irq_save(flags); 106 local_irq_save(flags);
105 for (c = counter; c != top; c = c->parent) { 107 for (c = counter; c != top; c = c->parent) {
108 u64 r;
106 spin_lock(&c->lock); 109 spin_lock(&c->lock);
107 res_counter_uncharge_locked(c, val); 110 r = res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
108 spin_unlock(&c->lock); 113 spin_unlock(&c->lock);
109 } 114 }
110 local_irq_restore(flags); 115 local_irq_restore(flags);
116 return ret;
111} 117}
112 118
113void res_counter_uncharge(struct res_counter *counter, unsigned long val) 119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{ 120{
115 res_counter_uncharge_until(counter, NULL, val); 121 return res_counter_uncharge_until(counter, NULL, val);
116} 122}
117 123
118static inline unsigned long long * 124static inline unsigned long long *
diff --git a/mm/Kconfig b/mm/Kconfig
index 71259e052ce8..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -149,7 +149,18 @@ config MOVABLE_NODE
149 depends on NO_BOOTMEM 149 depends on NO_BOOTMEM
150 depends on X86_64 150 depends on X86_64
151 depends on NUMA 151 depends on NUMA
152 depends on BROKEN 152 default n
153 help
154 Allow a node to have only movable memory. Pages used by the kernel,
155 such as direct mapping pages cannot be migrated. So the corresponding
156 memory device cannot be hotplugged. This option allows users to
157 online all the memory of a node as movable memory so that the whole
158 node can be hotplugged. Users who don't use the memory hotplug
159 feature are fine with this option on since they don't online memory
160 as movable.
161
162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly.
153 164
154# eventually, we can have this option just 'select SPARSEMEM' 165# eventually, we can have this option just 'select SPARSEMEM'
155config MEMORY_HOTPLUG 166config MEMORY_HOTPLUG
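
To use MOVABLE_NODE as the help text describes, hot-added memory blocks are onlined as movable through the memory-hotplug sysfs interface. A small illustrative helper follows, assuming /sys/devices/system/memory/memoryN/state accepts the "online_movable" keyword (see Documentation/memory-hotplug.txt) and that the program runs as root.

  /* Illustrative only: online one memory block as movable. */
  #include <stdio.h>

  int main(int argc, char **argv)
  {
      char path[128];
      FILE *f;

      if (argc < 2) {
          fprintf(stderr, "usage: %s <memory-block-number>\n", argv[0]);
          return 1;
      }
      snprintf(path, sizeof(path),
               "/sys/devices/system/memory/memory%s/state", argv[1]);
      f = fopen(path, "w");
      if (!f) {
          perror(path);
          return 1;
      }
      if (fputs("online_movable", f) == EOF)
          perror(path);
      return fclose(f) ? 1 : 0;
  }
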
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e5318c7793ae..4f3ea0b1e57c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1907 1907
1908 hugetlb_init_hstates(); 1908 hugetlb_init_hstates();
1909
1910 gather_bootmem_prealloc(); 1909 gather_bootmem_prealloc();
1911
1912 report_hugepages(); 1910 report_hugepages();
1913 1911
1914 hugetlb_sysfs_init(); 1912 hugetlb_sysfs_init();
1915
1916 hugetlb_register_all_nodes(); 1913 hugetlb_register_all_nodes();
1914 hugetlb_cgroup_file_init();
1917 1915
1918 return 0; 1916 return 0;
1919} 1917}
@@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order)
1943 h->next_nid_to_free = first_node(node_states[N_MEMORY]); 1941 h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1942 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1945 huge_page_size(h)/1024); 1943 huge_page_size(h)/1024);
1946 /*
1947 * Add cgroup control files only if the huge page consists
1948 * of more than two normal pages. This is because we use
1949 * page[2].lru.next for storing cgoup details.
1950 */
1951 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1952 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1953 1944
1954 parsed_hstate = h; 1945 parsed_hstate = h;
1955} 1946}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index b5bde7a5c017..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
333 return buf; 333 return buf;
334} 334}
335 335
336int __init hugetlb_cgroup_file_init(int idx) 336static void __init __hugetlb_cgroup_file_init(int idx)
337{ 337{
338 char buf[32]; 338 char buf[32];
339 struct cftype *cft; 339 struct cftype *cft;
@@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
375 375
376 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 376 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
377 377
378 return 0; 378 return;
379}
380
381void __init hugetlb_cgroup_file_init(void)
382{
383 struct hstate *h;
384
385 for_each_hstate(h) {
386 /*
387 * Add cgroup control files only if the huge page consists
388 * of more than two normal pages. This is because we use
389 * page[2].lru.next for storing cgroup details.
390 */
391 if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
392 __hugetlb_cgroup_file_init(hstate_index(h));
393 }
379} 394}
380 395
381/* 396/*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc544060..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
1556 struct kmemleak_object *object; 1556 struct kmemleak_object *object;
1557 unsigned long addr; 1557 unsigned long addr;
1558 1558
1559 addr= simple_strtoul(str, NULL, 0); 1559 if (kstrtoul(str, 0, &addr))
1560 return -EINVAL;
1560 object = find_and_get_object(addr, 0); 1561 object = find_and_get_object(addr, 0);
1561 if (!object) { 1562 if (!object) {
1562 pr_info("Unknown object at 0x%08lx\n", addr); 1563 pr_info("Unknown object at 0x%08lx\n", addr);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bbfac5063ca8..f3009b4bae51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal
16 *
13 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
@@ -268,6 +272,10 @@ struct mem_cgroup {
268 }; 272 };
269 273
270 /* 274 /*
275 * the counter to account for kernel memory usage.
276 */
277 struct res_counter kmem;
278 /*
271 * Per cgroup active and inactive list, similar to the 279 * Per cgroup active and inactive list, similar to the
272 * per zone LRU lists. 280 * per zone LRU lists.
273 */ 281 */
@@ -282,6 +290,7 @@ struct mem_cgroup {
282 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
283 */ 291 */
284 bool use_hierarchy; 292 bool use_hierarchy;
293 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
285 294
286 bool oom_lock; 295 bool oom_lock;
287 atomic_t under_oom; 296 atomic_t under_oom;
@@ -332,8 +341,61 @@ struct mem_cgroup {
332#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
333 struct tcp_memcontrol tcp_mem; 342 struct tcp_memcontrol tcp_mem;
334#endif 343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345 /* analogous to slab_common's slab_caches list. per-memcg */
346 struct list_head memcg_slab_caches;
347 /* Not a spinlock, we can take a lot of time walking the list */
348 struct mutex slab_caches_mutex;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id;
351#endif
335}; 352};
336 353
354/* internal only representation about the status of kmem accounting. */
355enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
357 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
358 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
359};
360
361/* We account when limit is on, but only after call sites are patched */
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
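For orientation, a compressed view of how these bits are expected to flow, inferred from the helpers above and their callers later in this patch (a sketch, not an authoritative state machine):

/*
 * kmem limit written  -> KMEM_ACCOUNTED_ACTIVATED set under set_limit_mutex
 * call sites patched  -> KMEM_ACCOUNTED_ACTIVE set, accounting starts
 * cgroup destroyed    -> KMEM_ACCOUNTED_DEAD set, but only if it was ACTIVE
 * last kmem uncharge  -> DEAD tested-and-cleared, final mem_cgroup_put()
 */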
398
337/* Stuffs for move charges at task migration. */ 399/* Stuffs for move charges at task migration. */
338/* 400/*
339 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -388,9 +450,13 @@ enum charge_type {
388}; 450};
389 451
390/* for encoding cft->private value on file */ 452/* for encoding cft->private value on file */
391#define _MEM (0) 453enum res_type {
392#define _MEMSWAP (1) 454 _MEM,
393#define _OOM_TYPE (2) 455 _MEMSWAP,
456 _OOM_TYPE,
457 _KMEM,
458};
459
394#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
395#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
396#define MEMFILE_ATTR(val) ((val) & 0xffff) 462#define MEMFILE_ATTR(val) ((val) & 0xffff)
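As a quick illustration (not part of the patch; the function name is made up) of how the new enum rides on the existing encoding: cft->private packs the resource type into the upper 16 bits and the res_counter member into the lower 16, so the pair round-trips cleanly:

static void memfile_encoding_demo(void)
{
	int priv = MEMFILE_PRIVATE(_KMEM, RES_LIMIT);

	BUG_ON(MEMFILE_TYPE(priv) != _KMEM);		/* upper 16 bits: enum res_type */
	BUG_ON(MEMFILE_ATTR(priv) != RES_LIMIT);	/* lower 16 bits: res_counter id */
}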
@@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
487} 553}
488#endif 554#endif
489 555
556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
561 * but only a few kmem-limited. Or also, if we have, for instance, 200
562 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
563 * 200 entry array for that.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577/*
 578 * MIN_SIZE is different from 1, because we would like to avoid going through
579 * the alloc/free process all the time. In a small machine, 4 kmem-limited
580 * cgroups is a reasonable guess. In the future, it could be a parameter or
581 * tunable, but that is strictly not necessary.
582 *
583 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
584 * this constant directly from cgroup, but it is understandable that this is
585 * better kept as an internal representation in cgroup.c. In any case, the
586 * css_id space is not getting any smaller, and we don't have to necessarily
587 * increase ours as well if it increases.
588 */
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592/*
593 * A lot of the calls to the cache allocation functions are expected to be
594 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
595 * conditional to this static branch, we'll have to allow modules that does
596 * kmem_cache_alloc and the such to see this symbol as well
597 */
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607 /*
 608 * This check can't live in the kmem destruction function,
 609 * since the charges will outlive the cgroup.
610 */
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
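The reader side of memcg_kmem_enabled_key lives in include/linux/memcontrol.h (part of this series, not of this hunk); roughly, and shown here only as a sketch of the jump-label pattern, the fast path stays a patched-out no-op until the first kmem limit is set:

static inline bool memcg_kmem_enabled(void)
{
	/* false, and compiled to a jump-label no-op, until some memcg sets
	 * a kmem limit and static_key_slow_inc() gets called */
	return static_key_false(&memcg_kmem_enabled_key);
}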
624
490static void drain_all_stock_async(struct mem_cgroup *memcg); 625static void drain_all_stock_async(struct mem_cgroup *memcg);
491 626
492static struct mem_cgroup_per_zone * 627static struct mem_cgroup_per_zone *
@@ -1453,6 +1588,10 @@ done:
1453 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1454 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1456} 1595}
1457 1596
1458/* 1597/*
@@ -2060,20 +2199,28 @@ struct memcg_stock_pcp {
2060static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2061static DEFINE_MUTEX(percpu_charge_mutex); 2200static DEFINE_MUTEX(percpu_charge_mutex);
2062 2201
2063/* 2202/**
2064 * Try to consume stocked charge on this cpu. If success, one page is consumed 2203 * consume_stock: Try to consume stocked charge on this cpu.
2065 * from local stock and true is returned. If the stock is 0 or charges from a 2204 * @memcg: memcg to consume from.
2066 * cgroup which is not current target, returns false. This stock will be 2205 * @nr_pages: how many pages to charge.
2067 * refilled. 2206 *
2207 * The charges will only happen if @memcg matches the current cpu's memcg
2208 * stock, and at least @nr_pages are available in that stock. Failure to
2209 * service an allocation will refill the stock.
2210 *
2211 * returns true if successful, false otherwise.
2068 */ 2212 */
2069static bool consume_stock(struct mem_cgroup *memcg) 2213static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2070{ 2214{
2071 struct memcg_stock_pcp *stock; 2215 struct memcg_stock_pcp *stock;
2072 bool ret = true; 2216 bool ret = true;
2073 2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2074 stock = &get_cpu_var(memcg_stock); 2221 stock = &get_cpu_var(memcg_stock);
2075 if (memcg == stock->cached && stock->nr_pages) 2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2076 stock->nr_pages--; 2223 stock->nr_pages -= nr_pages;
2077 else /* need to call res_counter_charge */ 2224 else /* need to call res_counter_charge */
2078 ret = false; 2225 ret = false;
2079 put_cpu_var(memcg_stock); 2226 put_cpu_var(memcg_stock);
@@ -2250,7 +2397,8 @@ enum {
2250}; 2397};
2251 2398
2252static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2253 unsigned int nr_pages, bool oom_check) 2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2254{ 2402{
2255 unsigned long csize = nr_pages * PAGE_SIZE; 2403 unsigned long csize = nr_pages * PAGE_SIZE;
2256 struct mem_cgroup *mem_over_limit; 2404 struct mem_cgroup *mem_over_limit;
@@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2273 } else 2421 } else
2274 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2275 /* 2423 /*
2276 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2277 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2278 *
2279 * Never reclaim on behalf of optional batching, retry with a 2424 * Never reclaim on behalf of optional batching, retry with a
2280 * single page instead. 2425 * single page instead.
2281 */ 2426 */
2282 if (nr_pages == CHARGE_BATCH) 2427 if (nr_pages > min_pages)
2283 return CHARGE_RETRY; 2428 return CHARGE_RETRY;
2284 2429
2285 if (!(gfp_mask & __GFP_WAIT)) 2430 if (!(gfp_mask & __GFP_WAIT))
2286 return CHARGE_WOULDBLOCK; 2431 return CHARGE_WOULDBLOCK;
2287 2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2288 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2289 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2290 return CHARGE_RETRY; 2438 return CHARGE_RETRY;
@@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2297 * unlikely to succeed so close to the limit, and we fall back 2445 * unlikely to succeed so close to the limit, and we fall back
2298 * to regular pages anyway in case of failure. 2446 * to regular pages anyway in case of failure.
2299 */ 2447 */
2300 if (nr_pages == 1 && ret) 2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2301 return CHARGE_RETRY; 2449 return CHARGE_RETRY;
2302 2450
2303 /* 2451 /*
@@ -2371,7 +2519,7 @@ again:
2371 memcg = *ptr; 2519 memcg = *ptr;
2372 if (mem_cgroup_is_root(memcg)) 2520 if (mem_cgroup_is_root(memcg))
2373 goto done; 2521 goto done;
2374 if (nr_pages == 1 && consume_stock(memcg)) 2522 if (consume_stock(memcg, nr_pages))
2375 goto done; 2523 goto done;
2376 css_get(&memcg->css); 2524 css_get(&memcg->css);
2377 } else { 2525 } else {
@@ -2396,7 +2544,7 @@ again:
2396 rcu_read_unlock(); 2544 rcu_read_unlock();
2397 goto done; 2545 goto done;
2398 } 2546 }
2399 if (nr_pages == 1 && consume_stock(memcg)) { 2547 if (consume_stock(memcg, nr_pages)) {
2400 /* 2548 /*
2401 * It seems dangerous to access memcg without css_get(). 2549 * It seems dangerous to access memcg without css_get().
2402 * But considering how consume_stock works, it's not 2550 * But considering how consume_stock works, it's not
@@ -2431,7 +2579,8 @@ again:
2431 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2432 } 2580 }
2433 2581
2434 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2435 switch (ret) { 2584 switch (ret) {
2436 case CHARGE_OK: 2585 case CHARGE_OK:
2437 break; 2586 break;
@@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2624 memcg_check_events(memcg, page); 2773 memcg_check_events(memcg, page);
2625} 2774}
2626 2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
2779static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2780{
2781 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2782 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2783}
2784
2785/*
2786 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2787 * in the memcg_cache_params struct.
2788 */
2789static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2790{
2791 struct kmem_cache *cachep;
2792
2793 VM_BUG_ON(p->is_root_cache);
2794 cachep = p->root_cache;
2795 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2796}
2797
2798#ifdef CONFIG_SLABINFO
2799static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2800 struct seq_file *m)
2801{
2802 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2803 struct memcg_cache_params *params;
2804
2805 if (!memcg_can_account_kmem(memcg))
2806 return -EIO;
2807
2808 print_slabinfo_header(m);
2809
2810 mutex_lock(&memcg->slab_caches_mutex);
2811 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2812 cache_show(memcg_params_to_cache(params), m);
2813 mutex_unlock(&memcg->slab_caches_mutex);
2814
2815 return 0;
2816}
2817#endif
2818
2819static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2820{
2821 struct res_counter *fail_res;
2822 struct mem_cgroup *_memcg;
2823 int ret = 0;
2824 bool may_oom;
2825
2826 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2827 if (ret)
2828 return ret;
2829
2830 /*
2831 * Conditions under which we can wait for the oom_killer. Those are
2832 * the same conditions tested by the core page allocator
2833 */
2834 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2835
2836 _memcg = memcg;
2837 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2838 &_memcg, may_oom);
2839
2840 if (ret == -EINTR) {
2841 /*
2842 * __mem_cgroup_try_charge() chosed to bypass to root due to
2843 * OOM kill or fatal signal. Since our only options are to
2844 * either fail the allocation or charge it to this cgroup, do
2845 * it as a temporary condition. But we can't fail. From a
2846 * kmem/slab perspective, the cache has already been selected,
2847 * by mem_cgroup_kmem_get_cache(), so it is too late to change
2848 * our minds.
2849 *
2850 * This condition will only trigger if the task entered
2851 * memcg_charge_kmem in a sane state, but was OOM-killed during
2852 * __mem_cgroup_try_charge() above. Tasks that were already
2853 * dying when the allocation triggers should have been already
2854 * directed to the root cgroup in memcontrol.h
2855 */
2856 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2857 if (do_swap_account)
2858 res_counter_charge_nofail(&memcg->memsw, size,
2859 &fail_res);
2860 ret = 0;
2861 } else if (ret)
2862 res_counter_uncharge(&memcg->kmem, size);
2863
2864 return ret;
2865}
2866
2867static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2868{
2869 res_counter_uncharge(&memcg->res, size);
2870 if (do_swap_account)
2871 res_counter_uncharge(&memcg->memsw, size);
2872
2873 /* Not down to 0 */
2874 if (res_counter_uncharge(&memcg->kmem, size))
2875 return;
2876
2877 if (memcg_kmem_test_and_clear_dead(memcg))
2878 mem_cgroup_put(memcg);
2879}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891/*
 2892 * Helper for accessing a memcg's index. It will be used as an index in the
2893 * child cache array in kmem_cache, and also to derive its name. This function
2894 * will return -1 when this is not a kmem-limited memcg.
2895 */
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
2901/*
2902 * This ends up being protected by the set_limit mutex, during normal
2903 * operation, because that is its main call site.
2904 *
2905 * But when we create a new cache, we can call this as well if its parent
2906 * is kmem-limited. That will have to hold set_limit_mutex as well.
2907 */
2908int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2909{
2910 int num, ret;
2911
2912 num = ida_simple_get(&kmem_limited_groups,
2913 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2914 if (num < 0)
2915 return num;
2916 /*
 2917 * After this point, kmem_accounted (which we test atomically at
 2918 * the beginning of this conditional) is no longer 0. This
2919 * guarantees only one process will set the following boolean
2920 * to true. We don't need test_and_set because we're protected
2921 * by the set_limit_mutex anyway.
2922 */
2923 memcg_kmem_set_activated(memcg);
2924
2925 ret = memcg_update_all_caches(num+1);
2926 if (ret) {
2927 ida_simple_remove(&kmem_limited_groups, num);
2928 memcg_kmem_clear_activated(memcg);
2929 return ret;
2930 }
2931
2932 memcg->kmemcg_id = num;
2933 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
2934 mutex_init(&memcg->slab_caches_mutex);
2935 return 0;
2936}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
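Worked through with a few values (these follow directly from the function above): memcg_caches_array_size(1) and (2) clamp up to MEMCG_CACHES_MIN_SIZE and return 4, (3) returns 6, and anything above 32767 groups clamps down to MEMCG_CACHES_MAX_SIZE, i.e. 65535.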
2952
2953/*
 2954 * We should update the current array size iff all cache updates succeed. This
2955 * can only be done from the slab side. The slab mutex needs to be held when
2956 * calling this.
2957 */
2958void memcg_update_array_size(int num)
2959{
2960 if (num > memcg_limited_groups_array_size)
2961 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2962}
2963
2964int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2965{
2966 struct memcg_cache_params *cur_params = s->memcg_params;
2967
2968 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
2969
2970 if (num_groups > memcg_limited_groups_array_size) {
2971 int i;
2972 ssize_t size = memcg_caches_array_size(num_groups);
2973
2974 size *= sizeof(void *);
2975 size += sizeof(struct memcg_cache_params);
2976
2977 s->memcg_params = kzalloc(size, GFP_KERNEL);
2978 if (!s->memcg_params) {
2979 s->memcg_params = cur_params;
2980 return -ENOMEM;
2981 }
2982
2983 s->memcg_params->is_root_cache = true;
2984
2985 /*
 2986 * There is a chance it will be bigger than
 2987 * memcg_limited_groups_array_size if we failed an allocation
 2988 * in a cache, in which case all caches updated before it will
2989 * have a bigger array.
2990 *
2991 * But if that is the case, the data after
2992 * memcg_limited_groups_array_size is certainly unused
2993 */
2994 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2995 if (!cur_params->memcg_caches[i])
2996 continue;
2997 s->memcg_params->memcg_caches[i] =
2998 cur_params->memcg_caches[i];
2999 }
3000
3001 /*
3002 * Ideally, we would wait until all caches succeed, and only
3003 * then free the old one. But this is not worth the extra
3004 * pointer per-cache we'd have to have for this.
3005 *
3006 * It is not a big deal if some caches are left with a size
3007 * bigger than the others. And all updates will reset this
3008 * anyway.
3009 */
3010 kfree(cur_params);
3011 }
3012 return 0;
3013}
3014
3015int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3016 struct kmem_cache *root_cache)
3017{
3018 size_t size = sizeof(struct memcg_cache_params);
3019
3020 if (!memcg_kmem_enabled())
3021 return 0;
3022
3023 if (!memcg)
3024 size += memcg_limited_groups_array_size * sizeof(void *);
3025
3026 s->memcg_params = kzalloc(size, GFP_KERNEL);
3027 if (!s->memcg_params)
3028 return -ENOMEM;
3029
3030 if (memcg) {
3031 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache;
3033 }
3034 return 0;
3035}
3036
3037void memcg_release_cache(struct kmem_cache *s)
3038{
3039 struct kmem_cache *root;
3040 struct mem_cgroup *memcg;
3041 int id;
3042
3043 /*
3044 * This happens, for instance, when a root cache goes away before we
3045 * add any memcg.
3046 */
3047 if (!s->memcg_params)
3048 return;
3049
3050 if (s->memcg_params->is_root_cache)
3051 goto out;
3052
3053 memcg = s->memcg_params->memcg;
3054 id = memcg_cache_id(memcg);
3055
3056 root = s->memcg_params->root_cache;
3057 root->memcg_params->memcg_caches[id] = NULL;
3058 mem_cgroup_put(memcg);
3059
3060 mutex_lock(&memcg->slab_caches_mutex);
3061 list_del(&s->memcg_params->list);
3062 mutex_unlock(&memcg->slab_caches_mutex);
3063
3064out:
3065 kfree(s->memcg_params);
3066}
3067
3068/*
 3069 * During the creation of a new cache, we need to disable our accounting mechanism
3070 * altogether. This is true even if we are not creating, but rather just
3071 * enqueing new caches to be created.
3072 *
3073 * This is because that process will trigger allocations; some visible, like
3074 * explicit kmallocs to auxiliary data structures, name strings and internal
3075 * cache structures; some well concealed, like INIT_WORK() that can allocate
3076 * objects during debug.
3077 *
3078 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3079 * to it. This may not be a bounded recursion: since the first cache creation
3080 * failed to complete (waiting on the allocation), we'll just try to create the
3081 * cache again, failing at the same point.
3082 *
3083 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3084 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3085 * inside the following two functions.
3086 */
3087static inline void memcg_stop_kmem_account(void)
3088{
3089 VM_BUG_ON(!current->mm);
3090 current->memcg_kmem_skip_account++;
3091}
3092
3093static inline void memcg_resume_kmem_account(void)
3094{
3095 VM_BUG_ON(!current->mm);
3096 current->memcg_kmem_skip_account--;
3097}
3098
3099static void kmem_cache_destroy_work_func(struct work_struct *w)
3100{
3101 struct kmem_cache *cachep;
3102 struct memcg_cache_params *p;
3103
3104 p = container_of(w, struct memcg_cache_params, destroy);
3105
3106 cachep = memcg_params_to_cache(p);
3107
3108 /*
3109 * If we get down to 0 after shrink, we could delete right away.
3110 * However, memcg_release_pages() already puts us back in the workqueue
3111 * in that case. If we proceed deleting, we'll get a dangling
3112 * reference, and removing the object from the workqueue in that case
3113 * is unnecessary complication. We are not a fast path.
3114 *
3115 * Note that this case is fundamentally different from racing with
3116 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
 3118 * kmem_cache_shrink, not only would we be reinserting a dead cache
3118 * into the queue, but doing so from inside the worker racing to
3119 * destroy it.
3120 *
3121 * So if we aren't down to zero, we'll just schedule a worker and try
3122 * again
3123 */
3124 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3125 kmem_cache_shrink(cachep);
3126 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3127 return;
3128 } else
3129 kmem_cache_destroy(cachep);
3130}
3131
3132void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3133{
3134 if (!cachep->memcg_params->dead)
3135 return;
3136
3137 /*
3138 * There are many ways in which we can get here.
3139 *
3140 * We can get to a memory-pressure situation while the delayed work is
3141 * still pending to run. The vmscan shrinkers can then release all
3142 * cache memory and get us to destruction. If this is the case, we'll
3143 * be executed twice, which is a bug (the second time will execute over
3144 * bogus data). In this case, cancelling the work should be fine.
3145 *
3146 * But we can also get here from the worker itself, if
3147 * kmem_cache_shrink is enough to shake all the remaining objects and
3148 * get the page count to 0. In this case, we'll deadlock if we try to
3149 * cancel the work (the worker runs with an internal lock held, which
3150 * is the same lock we would hold for cancel_work_sync().)
3151 *
3152 * Since we can't possibly know who got us here, just refrain from
3153 * running if there is already work pending
3154 */
3155 if (work_pending(&cachep->memcg_params->destroy))
3156 return;
3157 /*
3158 * We have to defer the actual destroying to a workqueue, because
3159 * we might currently be in a context that cannot sleep.
3160 */
3161 schedule_work(&cachep->memcg_params->destroy);
3162}
3163
3164static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
3165{
3166 char *name;
3167 struct dentry *dentry;
3168
3169 rcu_read_lock();
3170 dentry = rcu_dereference(memcg->css.cgroup->dentry);
3171 rcu_read_unlock();
3172
3173 BUG_ON(dentry == NULL);
3174
3175 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3176 memcg_cache_id(memcg), dentry->d_name.name);
3177
3178 return name;
3179}
3180
3181static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3182 struct kmem_cache *s)
3183{
3184 char *name;
3185 struct kmem_cache *new;
3186
3187 name = memcg_cache_name(memcg, s);
3188 if (!name)
3189 return NULL;
3190
3191 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
3192 (s->flags & ~SLAB_PANIC), s->ctor, s);
3193
3194 if (new)
3195 new->allocflags |= __GFP_KMEMCG;
3196
3197 kfree(name);
3198 return new;
3199}
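Concretely, given the "%s(%d:%s)" format used above, duplicating a "kmalloc-256" cache for a memcg with kmemcg_id 3 whose cgroup directory is named "foo" (both values hypothetical) yields a child cache named "kmalloc-256(3:foo)", which is also the name that later shows up in the per-memcg kmem.slabinfo output.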
3200
3201/*
3202 * This lock protects updaters, not readers. We want readers to be as fast as
3203 * they can, and they will either see NULL or a valid cache value. Our model
 3204 * allows them to see NULL, in which case the root memcg will be selected.
3205 *
 3206 * We need this lock because multiple allocations to the same cache may
 3207 * span more than one worker. Only one of them can create the cache.
3208 */
3209static DEFINE_MUTEX(memcg_cache_mutex);
3210static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3211 struct kmem_cache *cachep)
3212{
3213 struct kmem_cache *new_cachep;
3214 int idx;
3215
3216 BUG_ON(!memcg_can_account_kmem(memcg));
3217
3218 idx = memcg_cache_id(memcg);
3219
3220 mutex_lock(&memcg_cache_mutex);
3221 new_cachep = cachep->memcg_params->memcg_caches[idx];
3222 if (new_cachep)
3223 goto out;
3224
3225 new_cachep = kmem_cache_dup(memcg, cachep);
3226 if (new_cachep == NULL) {
3227 new_cachep = cachep;
3228 goto out;
3229 }
3230
3231 mem_cgroup_get(memcg);
3232 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3233
3234 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3235 /*
3236 * the readers won't lock, make sure everybody sees the updated value,
3237 * so they won't put stuff in the queue again for no reason
3238 */
3239 wmb();
3240out:
3241 mutex_unlock(&memcg_cache_mutex);
3242 return new_cachep;
3243}
3244
3245void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3246{
3247 struct kmem_cache *c;
3248 int i;
3249
3250 if (!s->memcg_params)
3251 return;
3252 if (!s->memcg_params->is_root_cache)
3253 return;
3254
3255 /*
3256 * If the cache is being destroyed, we trust that there is no one else
3257 * requesting objects from it. Even if there are, the sanity checks in
 3258 * kmem_cache_destroy should catch this ill case.
3259 *
3260 * Still, we don't want anyone else freeing memcg_caches under our
3261 * noses, which can happen if a new memcg comes to life. As usual,
3262 * we'll take the set_limit_mutex to protect ourselves against this.
3263 */
3264 mutex_lock(&set_limit_mutex);
3265 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3266 c = s->memcg_params->memcg_caches[i];
3267 if (!c)
3268 continue;
3269
3270 /*
3271 * We will now manually delete the caches, so to avoid races
3272 * we need to cancel all pending destruction workers and
3273 * proceed with destruction ourselves.
3274 *
3275 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3276 * and that could spawn the workers again: it is likely that
 3277 * the cache still has active pages until this very moment.
3278 * This would lead us back to mem_cgroup_destroy_cache.
3279 *
3280 * But that will not execute at all if the "dead" flag is not
3281 * set, so flip it down to guarantee we are in control.
3282 */
3283 c->memcg_params->dead = false;
3284 cancel_work_sync(&c->memcg_params->destroy);
3285 kmem_cache_destroy(c);
3286 }
3287 mutex_unlock(&set_limit_mutex);
3288}
3289
3290struct create_work {
3291 struct mem_cgroup *memcg;
3292 struct kmem_cache *cachep;
3293 struct work_struct work;
3294};
3295
3296static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3297{
3298 struct kmem_cache *cachep;
3299 struct memcg_cache_params *params;
3300
3301 if (!memcg_kmem_is_active(memcg))
3302 return;
3303
3304 mutex_lock(&memcg->slab_caches_mutex);
3305 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3306 cachep = memcg_params_to_cache(params);
3307 cachep->memcg_params->dead = true;
3308 INIT_WORK(&cachep->memcg_params->destroy,
3309 kmem_cache_destroy_work_func);
3310 schedule_work(&cachep->memcg_params->destroy);
3311 }
3312 mutex_unlock(&memcg->slab_caches_mutex);
3313}
3314
3315static void memcg_create_cache_work_func(struct work_struct *w)
3316{
3317 struct create_work *cw;
3318
3319 cw = container_of(w, struct create_work, work);
3320 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3321 /* Drop the reference gotten when we enqueued. */
3322 css_put(&cw->memcg->css);
3323 kfree(cw);
3324}
3325
3326/*
3327 * Enqueue the creation of a per-memcg kmem_cache.
3328 * Called with rcu_read_lock.
3329 */
3330static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3331 struct kmem_cache *cachep)
3332{
3333 struct create_work *cw;
3334
3335 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3336 if (cw == NULL)
3337 return;
3338
3339 /* The corresponding put will be done in the workqueue. */
3340 if (!css_tryget(&memcg->css)) {
3341 kfree(cw);
3342 return;
3343 }
3344
3345 cw->memcg = memcg;
3346 cw->cachep = cachep;
3347
3348 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3349 schedule_work(&cw->work);
3350}
3351
3352static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3353 struct kmem_cache *cachep)
3354{
3355 /*
3356 * We need to stop accounting when we kmalloc, because if the
3357 * corresponding kmalloc cache is not yet created, the first allocation
3358 * in __memcg_create_cache_enqueue will recurse.
3359 *
3360 * However, it is better to enclose the whole function. Depending on
3361 * the debugging options enabled, INIT_WORK(), for instance, can
 3362 * trigger an allocation. This, too, will make us recurse. Because at
3363 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3364 * the safest choice is to do it like this, wrapping the whole function.
3365 */
3366 memcg_stop_kmem_account();
3367 __memcg_create_cache_enqueue(memcg, cachep);
3368 memcg_resume_kmem_account();
3369}
3370/*
3371 * Return the kmem_cache we're supposed to use for a slab allocation.
3372 * We try to use the current memcg's version of the cache.
3373 *
 3374 * If the cache does not exist yet and we are the first user of it,
3375 * we either create it immediately, if possible, or create it asynchronously
3376 * in a workqueue.
3377 * In the latter case, we will let the current allocation go through with
3378 * the original cache.
3379 *
3380 * Can't be called in interrupt context or from kernel threads.
3381 * This function needs to be called with rcu_read_lock() held.
3382 */
3383struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3384 gfp_t gfp)
3385{
3386 struct mem_cgroup *memcg;
3387 int idx;
3388
3389 VM_BUG_ON(!cachep->memcg_params);
3390 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3391
3392 if (!current->mm || current->memcg_kmem_skip_account)
3393 return cachep;
3394
3395 rcu_read_lock();
3396 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3397 rcu_read_unlock();
3398
3399 if (!memcg_can_account_kmem(memcg))
3400 return cachep;
3401
3402 idx = memcg_cache_id(memcg);
3403
3404 /*
 3405 * barrier to make sure we're always seeing the up-to-date value. The
3406 * code updating memcg_caches will issue a write barrier to match this.
3407 */
3408 read_barrier_depends();
3409 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
3410 /*
3411 * If we are in a safe context (can wait, and not in interrupt
 3412 * context), we could be predictable and return right away.
3413 * This would guarantee that the allocation being performed
3414 * already belongs in the new cache.
3415 *
3416 * However, there are some clashes that can arrive from locking.
3417 * For instance, because we acquire the slab_mutex while doing
3418 * kmem_cache_dup, this means no further allocation could happen
3419 * with the slab_mutex held.
3420 *
 3421 * Also, because cache creation issues get_online_cpus(), this
3422 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3423 * that ends up reversed during cpu hotplug. (cpuset allocates
3424 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3425 * better to defer everything.
3426 */
3427 memcg_create_cache_enqueue(memcg, cachep);
3428 return cachep;
3429 }
3430
3431 return cachep->memcg_params->memcg_caches[idx];
3432}
3433EXPORT_SYMBOL(__memcg_kmem_get_cache);
3434
3435/*
3436 * We need to verify if the allocation against current->mm->owner's memcg is
3437 * possible for the given order. But the page is not allocated yet, so we'll
3438 * need a further commit step to do the final arrangements.
3439 *
 3440 * It is possible for the task to switch cgroups in the meantime, so at
3441 * commit time, we can't rely on task conversion any longer. We'll then use
3442 * the handle argument to return to the caller which cgroup we should commit
3443 * against. We could also return the memcg directly and avoid the pointer
3444 * passing, but a boolean return value gives better semantics considering
3445 * the compiled-out case as well.
3446 *
3447 * Returning true means the allocation is possible.
3448 */
3449bool
3450__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3451{
3452 struct mem_cgroup *memcg;
3453 int ret;
3454
3455 *_memcg = NULL;
3456 memcg = try_get_mem_cgroup_from_mm(current->mm);
3457
3458 /*
3459 * very rare case described in mem_cgroup_from_task. Unfortunately there
3460 * isn't much we can do without complicating this too much, and it would
3461 * be gfp-dependent anyway. Just let it go
3462 */
3463 if (unlikely(!memcg))
3464 return true;
3465
3466 if (!memcg_can_account_kmem(memcg)) {
3467 css_put(&memcg->css);
3468 return true;
3469 }
3470
3471 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3472 if (!ret)
3473 *_memcg = memcg;
3474
3475 css_put(&memcg->css);
3476 return (ret == 0);
3477}
3478
3479void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3480 int order)
3481{
3482 struct page_cgroup *pc;
3483
3484 VM_BUG_ON(mem_cgroup_is_root(memcg));
3485
3486 /* The page allocation failed. Revert */
3487 if (!page) {
3488 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3489 return;
3490 }
3491
3492 pc = lookup_page_cgroup(page);
3493 lock_page_cgroup(pc);
3494 pc->mem_cgroup = memcg;
3495 SetPageCgroupUsed(pc);
3496 unlock_page_cgroup(pc);
3497}
3498
3499void __memcg_kmem_uncharge_pages(struct page *page, int order)
3500{
3501 struct mem_cgroup *memcg = NULL;
3502 struct page_cgroup *pc;
3503
3504
3505 pc = lookup_page_cgroup(page);
3506 /*
 3507 * Fast unlocked return. The flag might theoretically have changed, so
 3508 * we have to check again after locking.
3509 */
3510 if (!PageCgroupUsed(pc))
3511 return;
3512
3513 lock_page_cgroup(pc);
3514 if (PageCgroupUsed(pc)) {
3515 memcg = pc->mem_cgroup;
3516 ClearPageCgroupUsed(pc);
3517 }
3518 unlock_page_cgroup(pc);
3519
3520 /*
 3521 * We trust that the page is a valid kmem allocation only if there is
 3522 * a memcg associated with it.
3523 */
3524 if (!memcg)
3525 return;
3526
3527 VM_BUG_ON(mem_cgroup_is_root(memcg));
3528 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3529}
3530#else
3531static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3532{
3533}
3534#endif /* CONFIG_MEMCG_KMEM */
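To make the calling convention of the three page hooks above concrete, here is a hypothetical allocator-side sketch (the function name is invented; the real call sites are added elsewhere in this series): the charge is reserved before the page exists, then either bound to the page or rolled back by the commit step.

static struct page *alloc_kmem_page_sketch(gfp_t gfp, int order)
{
	struct mem_cgroup *memcg;
	struct page *page;

	/* Reserve the charge first; false means we are over the kmem limit. */
	if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;

	page = alloc_pages(gfp, order);

	/* memcg may be NULL (root or unaccounted context): nothing to commit.
	 * If the allocation failed, the commit step uncharges for us. */
	if (memcg)
		__memcg_kmem_commit_charge(page, memcg, order);

	return page;	/* the free path pairs with __memcg_kmem_uncharge_pages() */
}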
3535
2627#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2628 3537
2629#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3538#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
@@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
3486} 4395}
3487#endif 4396#endif
3488 4397
3489static DEFINE_MUTEX(set_limit_mutex);
3490
3491static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4398static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3492 unsigned long long val) 4399 unsigned long long val)
3493{ 4400{
@@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3772static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4679static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 4680{
3774 int node, zid; 4681 int node, zid;
4682 u64 usage;
3775 4683
3776 do { 4684 do {
3777 /* This is for making all *used* pages to be on LRU. */ 4685 /* This is for making all *used* pages to be on LRU. */
@@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3792 cond_resched(); 4700 cond_resched();
3793 4701
3794 /* 4702 /*
4703 * Kernel memory may not necessarily be trackable to a specific
 4704 * process, so such charges are not migrated and therefore we can't
4705 * expect their value to drop to 0 here.
4706 * Having res filled up with kmem only is enough.
4707 *
3795 * This is a safety check because mem_cgroup_force_empty_list 4708 * This is a safety check because mem_cgroup_force_empty_list
3796 * could have raced with mem_cgroup_replace_page_cache callers 4709 * could have raced with mem_cgroup_replace_page_cache callers
3797 * so the lru seemed empty but the page could have been added 4710 * so the lru seemed empty but the page could have been added
3798 * right after the check. RES_USAGE should be safe as we always 4711 * right after the check. RES_USAGE should be safe as we always
3799 * charge before adding to the LRU. 4712 * charge before adding to the LRU.
3800 */ 4713 */
3801 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); 4714 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4715 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4716 } while (usage > 0);
3802} 4717}
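As a worked example of the new loop condition (hypothetical numbers): if 8 MB is still charged to memcg->res but all of it is kmem usage, the computed usage is 0 and reparenting finishes, whereas the old RES_USAGE > 0 test would keep looping on charges that can never be moved off the LRU.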
3803 4718
3804/* 4719/*
@@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3942 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4857 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3943 char str[64]; 4858 char str[64];
3944 u64 val; 4859 u64 val;
3945 int type, name, len; 4860 int name, len;
4861 enum res_type type;
3946 4862
3947 type = MEMFILE_TYPE(cft->private); 4863 type = MEMFILE_TYPE(cft->private);
3948 name = MEMFILE_ATTR(cft->private); 4864 name = MEMFILE_ATTR(cft->private);
@@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3963 else 4879 else
3964 val = res_counter_read_u64(&memcg->memsw, name); 4880 val = res_counter_read_u64(&memcg->memsw, name);
3965 break; 4881 break;
4882 case _KMEM:
4883 val = res_counter_read_u64(&memcg->kmem, name);
4884 break;
3966 default: 4885 default:
3967 BUG(); 4886 BUG();
3968 } 4887 }
@@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3970 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4889 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3971 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4890 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3972} 4891}
4892
4893static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{
4895 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't
4902 * be changed if the cgroup has children already, or if tasks had
4903 * already joined.
4904 *
4905 * If tasks join before we set the limit, a person looking at
4906 * kmem.usage_in_bytes will have no way to determine when it took
4907 * place, which makes the value quite meaningless.
4908 *
4909 * After it first became limited, changes in the value of the limit are
4910 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */
4918 cgroup_lock();
4919 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY;
4924 goto out;
4925 }
4926 ret = res_counter_set_limit(&memcg->kmem, val);
4927 VM_BUG_ON(ret);
4928
4929 ret = memcg_update_cache_sizes(memcg);
4930 if (ret) {
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out;
4933 }
4934 must_inc_static_branch = true;
4935 /*
4936 * kmem charges can outlive the cgroup. In the case of slab
 4937 * pages, for instance, a page may contain objects from various
4938 * processes, so it is unfeasible to migrate them away. We
4939 * need to reference count the memcg because of that.
4940 */
4941 mem_cgroup_get(memcg);
4942 } else
4943 ret = res_counter_set_limit(&memcg->kmem, val);
4944out:
4945 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock();
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
 4957 * able to set it to true.
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif
4969 return ret;
4970}
4971
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{
4974 int ret = 0;
4975 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4976 if (!parent)
4977 goto out;
4978
4979 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /*
 4982 * When that happens, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to
4984 * complicate the code by keeping track of which memcgs were the ones
 4985 * that actually enabled limits, and which ones got it from their
4986 * parents.
4987 *
4988 * It is a lot simpler just to do static_key_slow_inc() on every child
4989 * that is accounted.
4990 */
4991 if (!memcg_kmem_is_active(memcg))
4992 goto out;
4993
4994 /*
 4995 * destroy(), called if we fail, will issue static_key_slow_dec() and
4996 * mem_cgroup_put() if kmem is enabled. We have to either call them
4997 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
4998 * this more consistent, since it always leads to the same destroy path
4999 */
5000 mem_cgroup_get(memcg);
5001 static_key_slow_inc(&memcg_kmem_enabled_key);
5002
5003 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex);
5006#endif
5007out:
5008 return ret;
5009}
5010
3973/* 5011/*
3974 * The user of this function is... 5012 * The user of this function is...
3975 * RES_LIMIT. 5013 * RES_LIMIT.
@@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3978 const char *buffer) 5016 const char *buffer)
3979{ 5017{
3980 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5018 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3981 int type, name; 5019 enum res_type type;
5020 int name;
3982 unsigned long long val; 5021 unsigned long long val;
3983 int ret; 5022 int ret;
3984 5023
@@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
4000 break; 5039 break;
4001 if (type == _MEM) 5040 if (type == _MEM)
4002 ret = mem_cgroup_resize_limit(memcg, val); 5041 ret = mem_cgroup_resize_limit(memcg, val);
4003 else 5042 else if (type == _MEMSWAP)
4004 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5043 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5044 else if (type == _KMEM)
5045 ret = memcg_update_kmem_limit(cont, val);
5046 else
5047 return -EINVAL;
4005 break; 5048 break;
4006 case RES_SOFT_LIMIT: 5049 case RES_SOFT_LIMIT:
4007 ret = res_counter_memparse_write_strategy(buffer, &val); 5050 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4054,7 +5097,8 @@ out:
4054static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5097static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4055{ 5098{
4056 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5099 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4057 int type, name; 5100 int name;
5101 enum res_type type;
4058 5102
4059 type = MEMFILE_TYPE(event); 5103 type = MEMFILE_TYPE(event);
4060 name = MEMFILE_ATTR(event); 5104 name = MEMFILE_ATTR(event);
@@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4066 case RES_MAX_USAGE: 5110 case RES_MAX_USAGE:
4067 if (type == _MEM) 5111 if (type == _MEM)
4068 res_counter_reset_max(&memcg->res); 5112 res_counter_reset_max(&memcg->res);
4069 else 5113 else if (type == _MEMSWAP)
4070 res_counter_reset_max(&memcg->memsw); 5114 res_counter_reset_max(&memcg->memsw);
5115 else if (type == _KMEM)
5116 res_counter_reset_max(&memcg->kmem);
5117 else
5118 return -EINVAL;
4071 break; 5119 break;
4072 case RES_FAILCNT: 5120 case RES_FAILCNT:
4073 if (type == _MEM) 5121 if (type == _MEM)
4074 res_counter_reset_failcnt(&memcg->res); 5122 res_counter_reset_failcnt(&memcg->res);
4075 else 5123 else if (type == _MEMSWAP)
4076 res_counter_reset_failcnt(&memcg->memsw); 5124 res_counter_reset_failcnt(&memcg->memsw);
5125 else if (type == _KMEM)
5126 res_counter_reset_failcnt(&memcg->kmem);
5127 else
5128 return -EINVAL;
4077 break; 5129 break;
4078 } 5130 }
4079 5131
@@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4390 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5442 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4391 struct mem_cgroup_thresholds *thresholds; 5443 struct mem_cgroup_thresholds *thresholds;
4392 struct mem_cgroup_threshold_ary *new; 5444 struct mem_cgroup_threshold_ary *new;
4393 int type = MEMFILE_TYPE(cft->private); 5445 enum res_type type = MEMFILE_TYPE(cft->private);
4394 u64 threshold, usage; 5446 u64 threshold, usage;
4395 int i, size, ret; 5447 int i, size, ret;
4396 5448
@@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4473 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5525 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4474 struct mem_cgroup_thresholds *thresholds; 5526 struct mem_cgroup_thresholds *thresholds;
4475 struct mem_cgroup_threshold_ary *new; 5527 struct mem_cgroup_threshold_ary *new;
4476 int type = MEMFILE_TYPE(cft->private); 5528 enum res_type type = MEMFILE_TYPE(cft->private);
4477 u64 usage; 5529 u64 usage;
4478 int i, j, size; 5530 int i, j, size;
4479 5531
@@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4551{ 5603{
4552 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5604 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4553 struct mem_cgroup_eventfd_list *event; 5605 struct mem_cgroup_eventfd_list *event;
4554 int type = MEMFILE_TYPE(cft->private); 5606 enum res_type type = MEMFILE_TYPE(cft->private);
4555 5607
4556 BUG_ON(type != _OOM_TYPE); 5608 BUG_ON(type != _OOM_TYPE);
4557 event = kmalloc(sizeof(*event), GFP_KERNEL); 5609 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4576{ 5628{
4577 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4578 struct mem_cgroup_eventfd_list *ev, *tmp; 5630 struct mem_cgroup_eventfd_list *ev, *tmp;
4579 int type = MEMFILE_TYPE(cft->private); 5631 enum res_type type = MEMFILE_TYPE(cft->private);
4580 5632
4581 BUG_ON(type != _OOM_TYPE); 5633 BUG_ON(type != _OOM_TYPE);
4582 5634
@@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4635#ifdef CONFIG_MEMCG_KMEM 5687#ifdef CONFIG_MEMCG_KMEM
4636static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5688static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4637{ 5689{
5690 int ret;
5691
5692 memcg->kmemcg_id = -1;
5693 ret = memcg_propagate_kmem(memcg);
5694 if (ret)
5695 return ret;
5696
4638 return mem_cgroup_sockets_init(memcg, ss); 5697 return mem_cgroup_sockets_init(memcg, ss);
4639}; 5698};
4640 5699
4641static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5700static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4642{ 5701{
4643 mem_cgroup_sockets_destroy(memcg); 5702 mem_cgroup_sockets_destroy(memcg);
5703
5704 memcg_kmem_mark_dead(memcg);
5705
5706 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5707 return;
5708
5709 /*
5710 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5711 * path here, being careful not to race with memcg_uncharge_kmem: it is
5712 * possible that the charges went down to 0 between mark_dead and the
5713 * res_counter read, so in that case, we don't need the put
5714 */
5715 if (memcg_kmem_test_and_clear_dead(memcg))
5716 mem_cgroup_put(memcg);
4644} 5717}
4645#else 5718#else
4646static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5719static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4749,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
4749 .read = mem_cgroup_read, 5822 .read = mem_cgroup_read,
4750 }, 5823 },
4751#endif 5824#endif
5825#ifdef CONFIG_MEMCG_KMEM
5826 {
5827 .name = "kmem.limit_in_bytes",
5828 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5829 .write_string = mem_cgroup_write,
5830 .read = mem_cgroup_read,
5831 },
5832 {
5833 .name = "kmem.usage_in_bytes",
5834 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5835 .read = mem_cgroup_read,
5836 },
5837 {
5838 .name = "kmem.failcnt",
5839 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5840 .trigger = mem_cgroup_reset,
5841 .read = mem_cgroup_read,
5842 },
5843 {
5844 .name = "kmem.max_usage_in_bytes",
5845 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5846 .trigger = mem_cgroup_reset,
5847 .read = mem_cgroup_read,
5848 },
5849#ifdef CONFIG_SLABINFO
5850 {
5851 .name = "kmem.slabinfo",
5852 .read_seq_string = mem_cgroup_slabinfo_read,
5853 },
5854#endif
5855#endif
4752 { }, /* terminate */ 5856 { }, /* terminate */
4753}; 5857};
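For reference, once the memory controller is mounted these entries surface as per-cgroup control files (conventionally memory.kmem.limit_in_bytes, memory.kmem.usage_in_bytes, memory.kmem.failcnt, memory.kmem.max_usage_in_bytes and, with CONFIG_SLABINFO, memory.kmem.slabinfo). Per memcg_update_kmem_limit() above, writing a first limit, e.g. 64M, only succeeds while the group is still empty and childless; after that the limit can be changed freely.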
4754 5858
@@ -4816,16 +5920,29 @@ out_free:
4816} 5920}
4817 5921
4818/* 5922/*
4819 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5923 * At destroying mem_cgroup, references from swap_cgroup can remain.
4820 * but in process context. The work_freeing structure is overlaid 5924 * (scanning all at force_empty is too costly...)
4821 * on the rcu_freeing structure, which itself is overlaid on memsw. 5925 *
5926 * Instead of clearing all references at force_empty, we remember
5927 * the number of reference from swap_cgroup and free mem_cgroup when
5928 * it goes down to 0.
5929 *
5930 * Removal of cgroup itself succeeds regardless of refs from swap.
4822 */ 5931 */
4823static void free_work(struct work_struct *work) 5932
5933static void __mem_cgroup_free(struct mem_cgroup *memcg)
4824{ 5934{
4825 struct mem_cgroup *memcg; 5935 int node;
4826 int size = sizeof(struct mem_cgroup); 5936 int size = sizeof(struct mem_cgroup);
4827 5937
4828 memcg = container_of(work, struct mem_cgroup, work_freeing); 5938 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css);
5940
5941 for_each_node(node)
5942 free_mem_cgroup_per_zone_info(memcg, node);
5943
5944 free_percpu(memcg->stat);
5945
4829 /* 5946 /*
4830 * We need to make sure that (at least for now), the jump label 5947 * We need to make sure that (at least for now), the jump label
4831 * destruction code runs outside of the cgroup lock. This is because 5948 * destruction code runs outside of the cgroup lock. This is because
@@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work)
4837 * to move this code around, and make sure it is outside 5954 * to move this code around, and make sure it is outside
4838 * the cgroup_lock. 5955 * the cgroup_lock.
4839 */ 5956 */
4840 disarm_sock_keys(memcg); 5957 disarm_static_keys(memcg);
4841 if (size < PAGE_SIZE) 5958 if (size < PAGE_SIZE)
4842 kfree(memcg); 5959 kfree(memcg);
4843 else 5960 else
4844 vfree(memcg); 5961 vfree(memcg);
4845} 5962}
4846 5963
4847static void free_rcu(struct rcu_head *rcu_head)
4848{
4849 struct mem_cgroup *memcg;
4850
4851 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4852 INIT_WORK(&memcg->work_freeing, free_work);
4853 schedule_work(&memcg->work_freeing);
4854}
4855 5964
4856/* 5965/*
4857 * At destroying mem_cgroup, references from swap_cgroup can remain. 5966 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4858 * (scanning all at force_empty is too costly...) 5967 * but in process context. The work_freeing structure is overlaid
4859 * 5968 * on the rcu_freeing structure, which itself is overlaid on memsw.
4860 * Instead of clearing all references at force_empty, we remember
4861 * the number of reference from swap_cgroup and free mem_cgroup when
4862 * it goes down to 0.
4863 *
4864 * Removal of cgroup itself succeeds regardless of refs from swap.
4865 */ 5969 */
4866 5970static void free_work(struct work_struct *work)
4867static void __mem_cgroup_free(struct mem_cgroup *memcg)
4868{ 5971{
4869 int node; 5972 struct mem_cgroup *memcg;
4870 5973
4871 mem_cgroup_remove_from_trees(memcg); 5974 memcg = container_of(work, struct mem_cgroup, work_freeing);
4872 free_css_id(&mem_cgroup_subsys, &memcg->css); 5975 __mem_cgroup_free(memcg);
5976}
4873 5977
4874 for_each_node(node) 5978static void free_rcu(struct rcu_head *rcu_head)
4875 free_mem_cgroup_per_zone_info(memcg, node); 5979{
5980 struct mem_cgroup *memcg;
4876 5981
4877 free_percpu(memcg->stat); 5982 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4878 call_rcu(&memcg->rcu_freeing, free_rcu); 5983 INIT_WORK(&memcg->work_freeing, free_work);
5984 schedule_work(&memcg->work_freeing);
4879} 5985}
4880 5986
4881static void mem_cgroup_get(struct mem_cgroup *memcg) 5987static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4887{ 5993{
4888 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5994 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4889 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5995 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4890 __mem_cgroup_free(memcg); 5996 call_rcu(&memcg->rcu_freeing, free_rcu);
4891 if (parent) 5997 if (parent)
4892 mem_cgroup_put(parent); 5998 mem_cgroup_put(parent);
4893 } 5999 }
@@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont)
4994 if (parent && parent->use_hierarchy) { 6100 if (parent && parent->use_hierarchy) {
4995 res_counter_init(&memcg->res, &parent->res); 6101 res_counter_init(&memcg->res, &parent->res);
4996 res_counter_init(&memcg->memsw, &parent->memsw); 6102 res_counter_init(&memcg->memsw, &parent->memsw);
6103 res_counter_init(&memcg->kmem, &parent->kmem);
6104
4997 /* 6105 /*
4998 * We increment refcnt of the parent to ensure that we can 6106 * We increment refcnt of the parent to ensure that we can
4999 * safely access it on res_counter_charge/uncharge. 6107 * safely access it on res_counter_charge/uncharge.
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
5004 } else { 6112 } else {
5005 res_counter_init(&memcg->res, NULL); 6113 res_counter_init(&memcg->res, NULL);
5006 res_counter_init(&memcg->memsw, NULL); 6114 res_counter_init(&memcg->memsw, NULL);
6115 res_counter_init(&memcg->kmem, NULL);
5007 /* 6116 /*
5008 * Deeper hierachy with use_hierarchy == false doesn't make 6117 * Deeper hierachy with use_hierarchy == false doesn't make
5009 * much sense so let cgroup subsystem know about this 6118 * much sense so let cgroup subsystem know about this
@@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
5043 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6152 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5044 6153
5045 mem_cgroup_reparent_charges(memcg); 6154 mem_cgroup_reparent_charges(memcg);
6155 mem_cgroup_destroy_all_caches(memcg);
5046} 6156}
5047 6157
5048static void mem_cgroup_css_free(struct cgroup *cont) 6158static void mem_cgroup_css_free(struct cgroup *cont)
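The memcontrol.c hunk above moves the final freeing of a mem_cgroup out of the RCU callback and into process context, with the work item overlaid on the rcu_head, and makes the last put trigger call_rcu() instead of calling __mem_cgroup_free() directly. The userspace C sketch below illustrates only that two-stage handoff; fake_rcu_head, fake_work and the immediate invocation of the callbacks are invented stand-ins for call_rcu() and schedule_work(), not the kernel API.

/* Minimal sketch (not kernel code): last reference drop hands the object to
 * an "RCU" stage, whose callback defers the real free to "process context".
 * The work callback is overlaid on the RCU head, as in the patch. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_rcu_head { void (*func)(struct fake_rcu_head *); };
struct fake_work     { void (*func)(struct fake_work *); };

struct obj {
    atomic_int refcnt;
    union {                          /* work overlaid on the rcu head */
        struct fake_rcu_head rcu;
        struct fake_work work;
    };
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

static void free_stage2(struct fake_work *w)          /* "process context" */
{
    struct obj *o = container_of(w, struct obj, work);
    printf("really freeing %p\n", (void *)o);
    free(o);
}

static void free_stage1(struct fake_rcu_head *r)      /* "RCU callback" */
{
    struct obj *o = container_of(r, struct obj, rcu);
    o->work.func = free_stage2;      /* INIT_WORK + schedule_work stand-in */
    o->work.func(&o->work);          /* run immediately in this sketch */
}

static void obj_put(struct obj *o)
{
    if (atomic_fetch_sub(&o->refcnt, 1) == 1) {        /* last reference */
        o->rcu.func = free_stage1;                     /* call_rcu stand-in */
        o->rcu.func(&o->rcu);
    }
}

int main(void)
{
    struct obj *o = calloc(1, sizeof(*o));
    atomic_init(&o->refcnt, 2);
    obj_put(o);     /* still referenced */
    obj_put(o);     /* last put: goes through both stages */
    return 0;
}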
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 962e353aa86f..d04ed87bfacb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
590} 590}
591 591
592#ifdef CONFIG_MOVABLE_NODE 592#ifdef CONFIG_MOVABLE_NODE
593/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ 593/*
594 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
595 * normal memory.
596 */
594static bool can_online_high_movable(struct zone *zone) 597static bool can_online_high_movable(struct zone *zone)
595{ 598{
596 return true; 599 return true;
597} 600}
598#else /* #ifdef CONFIG_MOVABLE_NODE */ 601#else /* CONFIG_MOVABLE_NODE */
599/* ensure every online node has NORMAL memory */ 602/* ensure every online node has NORMAL memory */
600static bool can_online_high_movable(struct zone *zone) 603static bool can_online_high_movable(struct zone *zone)
601{ 604{
602 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 605 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
603} 606}
604#endif /* #ifdef CONFIG_MOVABLE_NODE */ 607#endif /* CONFIG_MOVABLE_NODE */
605 608
606/* check which state of node_states will be changed when online memory */ 609/* check which state of node_states will be changed when online memory */
607static void node_states_check_changes_online(unsigned long nr_pages, 610static void node_states_check_changes_online(unsigned long nr_pages,
@@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1112} 1115}
1113 1116
1114#ifdef CONFIG_MOVABLE_NODE 1117#ifdef CONFIG_MOVABLE_NODE
1115/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ 1118/*
1119 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
1120 * normal memory.
1121 */
1116static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1122static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1117{ 1123{
1118 return true; 1124 return true;
1119} 1125}
1120#else /* #ifdef CONFIG_MOVABLE_NODE */ 1126#else /* CONFIG_MOVABLE_NODE */
1121/* ensure the node has NORMAL memory if it is still online */ 1127/* ensure the node has NORMAL memory if it is still online */
1122static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1128static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1123{ 1129{
@@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1141 */ 1147 */
1142 return present_pages == 0; 1148 return present_pages == 0;
1143} 1149}
1144#endif /* #ifdef CONFIG_MOVABLE_NODE */ 1150#endif /* CONFIG_MOVABLE_NODE */
1145 1151
1146/* check which state of node_states will be changed when offline memory */ 1152/* check which state of node_states will be changed when offline memory */
1147static void node_states_check_changes_offline(unsigned long nr_pages, 1153static void node_states_check_changes_offline(unsigned long nr_pages,
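The memory_hotplug.c hunks above only reword the comments, but the underlying split is worth spelling out: with CONFIG_MOVABLE_NODE the online/offline predicates are trivially true, otherwise a node must hold normal memory. The sketch below models that compile-time switch; node_has_normal[] and the MOVABLE_NODE macro are mock stand-ins for node_state(..., N_NORMAL_MEMORY) and the Kconfig option.

/* Userspace sketch of the CONFIG_MOVABLE_NODE split, under the assumptions
 * stated above. */
#include <stdbool.h>
#include <stdio.h>

#define MOVABLE_NODE 0   /* flip to 1 to model CONFIG_MOVABLE_NODE=y */

static bool node_has_normal[4] = { true, false, true, false };

static bool can_online_high_movable(int nid)
{
#if MOVABLE_NODE
    return true;                     /* movable-only nodes are acceptable */
#else
    return node_has_normal[nid];     /* require N_NORMAL_MEMORY otherwise */
#endif
}

int main(void)
{
    for (int nid = 0; nid < 4; nid++)
        printf("node %d: online allowed = %d\n", nid,
               can_online_high_movable(nid));
    return 0;
}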
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3dca970367db..94722a4d6b43 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
114 114
115#ifdef CONFIG_NUMA_BALANCING 115#ifdef CONFIG_NUMA_BALANCING
116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, 116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
117 pmd_t *pmd) 117 pmd_t *pmd)
118{ 118{
119 spin_lock(&mm->page_table_lock); 119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); 120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
@@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
122} 122}
123#else 123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, 124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd) 125 pmd_t *pmd)
126{ 126{
127 BUG(); 127 BUG();
128} 128}
129#endif /* CONFIG_NUMA_BALANCING */ 129#endif /* CONFIG_NUMA_BALANCING */
130 130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 131static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
132 unsigned long addr, unsigned long end, pgprot_t newprot, 132 pud_t *pud, unsigned long addr, unsigned long end,
133 int dirty_accountable, int prot_numa) 133 pgprot_t newprot, int dirty_accountable, int prot_numa)
134{ 134{
135 pmd_t *pmd; 135 pmd_t *pmd;
136 unsigned long next; 136 unsigned long next;
@@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
143 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
144 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
145 split_huge_page_pmd(vma, addr, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
146 else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { 146 else if (change_huge_pmd(vma, pmd, addr, newprot,
147 prot_numa)) {
147 pages += HPAGE_PMD_NR; 148 pages += HPAGE_PMD_NR;
148 continue; 149 continue;
149 } 150 }
@@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
167 return pages; 168 return pages;
168} 169}
169 170
170static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 171static inline unsigned long change_pud_range(struct vm_area_struct *vma,
171 unsigned long addr, unsigned long end, pgprot_t newprot, 172 pgd_t *pgd, unsigned long addr, unsigned long end,
172 int dirty_accountable, int prot_numa) 173 pgprot_t newprot, int dirty_accountable, int prot_numa)
173{ 174{
174 pud_t *pud; 175 pud_t *pud;
175 unsigned long next; 176 unsigned long next;
@@ -304,7 +305,8 @@ success:
304 dirty_accountable = 1; 305 dirty_accountable = 1;
305 } 306 }
306 307
307 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); 308 change_protection(vma, start, end, vma->vm_page_prot,
309 dirty_accountable, 0);
308 310
309 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 311 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
310 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 312 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
@@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
361 error = -EINVAL; 363 error = -EINVAL;
362 if (!(vma->vm_flags & VM_GROWSDOWN)) 364 if (!(vma->vm_flags & VM_GROWSDOWN))
363 goto out; 365 goto out;
364 } 366 } else {
365 else {
366 if (vma->vm_start > start) 367 if (vma->vm_start > start)
367 goto out; 368 goto out;
368 if (unlikely(grows & PROT_GROWSUP)) { 369 if (unlikely(grows & PROT_GROWSUP)) {
@@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
378 for (nstart = start ; ; ) { 379 for (nstart = start ; ; ) {
379 unsigned long newflags; 380 unsigned long newflags;
380 381
381 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 382 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
382 383
383 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 384 newflags = vm_flags;
385 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
384 386
385 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 387 /* newflags >> 4 shift VM_MAY% in place of VM_% */
386 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { 388 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
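The mprotect.c hunk above only re-wraps the newflags computation, but the check it feeds is a compact bit trick: VM_MAYREAD/MAYWRITE/MAYEXEC sit exactly four bits above VM_READ/WRITE/EXEC, so shifting newflags right by four lines the VM_MAY* bits up with the VM_* bits, and any requested permission whose VM_MAY* counterpart is clear is rejected. The flag values in this self-contained sketch follow the conventional kernel encoding but are redefined locally.

/* Sketch of the VM_MAY* permission check, assuming the flag layout above. */
#include <stdio.h>

#define VM_READ     0x0001UL
#define VM_WRITE    0x0002UL
#define VM_EXEC     0x0004UL
#define VM_MAYREAD  0x0010UL
#define VM_MAYWRITE 0x0020UL
#define VM_MAYEXEC  0x0040UL

static int prot_allowed(unsigned long vm_flags, unsigned long requested)
{
    unsigned long newflags;

    /* same shape as the patched code: requested bits plus everything the
     * VMA already had except its old VM_READ/WRITE/EXEC */
    newflags = requested;
    newflags |= vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC);

    /* newflags >> 4 shifts VM_MAY% into the place of VM_% */
    if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC))
        return 0;    /* asked for something the mapping may never get */
    return 1;
}

int main(void)
{
    unsigned long vma = VM_READ | VM_MAYREAD | VM_MAYWRITE;  /* no MAYEXEC */

    printf("PROT_WRITE: %s\n", prot_allowed(vma, VM_WRITE) ? "ok" : "denied");
    printf("PROT_EXEC:  %s\n", prot_allowed(vma, VM_EXEC)  ? "ok" : "denied");
    return 0;
}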
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d037c8bc1512..2ad2ad168efe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
371 int nr_pages = 1 << order; 371 int nr_pages = 1 << order;
372 int bad = 0; 372 int bad = 0;
373 373
374 if (unlikely(compound_order(page) != order) || 374 if (unlikely(compound_order(page) != order)) {
375 unlikely(!PageHead(page))) {
376 bad_page(page); 375 bad_page(page);
377 bad++; 376 bad++;
378 } 377 }
@@ -2613,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2613 int migratetype = allocflags_to_migratetype(gfp_mask); 2612 int migratetype = allocflags_to_migratetype(gfp_mask);
2614 unsigned int cpuset_mems_cookie; 2613 unsigned int cpuset_mems_cookie;
2615 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2614 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2615 struct mem_cgroup *memcg = NULL;
2616 2616
2617 gfp_mask &= gfp_allowed_mask; 2617 gfp_mask &= gfp_allowed_mask;
2618 2618
@@ -2631,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2631 if (unlikely(!zonelist->_zonerefs->zone)) 2631 if (unlikely(!zonelist->_zonerefs->zone))
2632 return NULL; 2632 return NULL;
2633 2633
2634 /*
 2635 * This will only have an effect when __GFP_KMEMCG is set, which is
 2636 * verified in the (always inlined) callee.
2637 */
2638 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2639 return NULL;
2640
2634retry_cpuset: 2641retry_cpuset:
2635 cpuset_mems_cookie = get_mems_allowed(); 2642 cpuset_mems_cookie = get_mems_allowed();
2636 2643
@@ -2666,6 +2673,8 @@ out:
2666 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2673 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2667 goto retry_cpuset; 2674 goto retry_cpuset;
2668 2675
2676 memcg_kmem_commit_charge(page, memcg, order);
2677
2669 return page; 2678 return page;
2670} 2679}
2671EXPORT_SYMBOL(__alloc_pages_nodemask); 2680EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2718,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order)
2718 2727
2719EXPORT_SYMBOL(free_pages); 2728EXPORT_SYMBOL(free_pages);
2720 2729
2730/*
2731 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2732 * pages allocated with __GFP_KMEMCG.
2733 *
2734 * Those pages are accounted to a particular memcg, embedded in the
 2735 * corresponding page_cgroup. To avoid adding a lookup in the generic free path
 2736 * that would only return NULL for callers with no interest in kmem accounting,
 2737 * we provide these dedicated functions.
2738 *
2739 * The caller knows better which flags it relies on.
2740 */
2741void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2742{
2743 memcg_kmem_uncharge_pages(page, order);
2744 __free_pages(page, order);
2745}
2746
2747void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2748{
2749 if (addr != 0) {
2750 VM_BUG_ON(!virt_addr_valid((void *)addr));
2751 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2752 }
2753}
2754
2721static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2755static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2722{ 2756{
2723 if (addr) { 2757 if (addr) {
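The page_alloc.c changes above bracket the allocator with a charge/commit pair (memcg_kmem_newpage_charge / memcg_kmem_commit_charge) and add memcg-aware free helpers that uncharge. The sketch below mirrors only that control flow: the charge may veto the allocation up front, a failed allocation drops the reservation, and the free path returns it. The budget counter and kmemcg_* helpers are invented for the illustration.

/* Charge-before / commit-after sketch, under the assumptions stated above. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static long budget = 2;                  /* pretend kmem limit, in "pages" */

static bool kmemcg_charge(int order)
{
    if (budget < (1L << order))
        return false;                    /* over limit: fail the allocation */
    budget -= 1L << order;
    return true;
}

static void kmemcg_uncharge(int order)
{
    budget += 1L << order;
}

static void *alloc_pages_charged(int order)
{
    void *page;

    if (!kmemcg_charge(order))
        return NULL;                     /* vetoed before hitting the allocator */

    page = malloc((size_t)4096 << order);
    if (!page)
        kmemcg_uncharge(order);          /* commit with no page: drop the charge */
    return page;
}

static void free_pages_charged(void *page, int order)
{
    if (!page)
        return;
    kmemcg_uncharge(order);
    free(page);
}

int main(void)
{
    void *a = alloc_pages_charged(1);    /* uses the whole budget */
    void *b = alloc_pages_charged(0);    /* charged out: returns NULL */

    printf("a=%p b=%p budget=%ld\n", a, b, budget);
    free_pages_charged(a, 1);
    printf("after free, budget=%ld\n", budget);
    return 0;
}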
diff --git a/mm/slab.c b/mm/slab.c
index 2c3a2e0394db..e7667a3584bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
91#include <linux/mm.h> 90#include <linux/mm.h>
92#include <linux/poison.h> 91#include <linux/poison.h>
93#include <linux/swap.h> 92#include <linux/swap.h>
@@ -128,6 +127,8 @@
128 127
129#include "internal.h" 128#include "internal.h"
130 129
130#include "slab.h"
131
131/* 132/*
132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
133 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
@@ -641,6 +642,26 @@ static void init_node_lock_keys(int q)
641 } 642 }
642} 643}
643 644
645static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
646{
647 struct kmem_list3 *l3;
648 l3 = cachep->nodelists[q];
649 if (!l3)
650 return;
651
652 slab_set_lock_classes(cachep, &on_slab_l3_key,
653 &on_slab_alc_key, q);
654}
655
656static inline void on_slab_lock_classes(struct kmem_cache *cachep)
657{
658 int node;
659
660 VM_BUG_ON(OFF_SLAB(cachep));
661 for_each_node(node)
662 on_slab_lock_classes_node(cachep, node);
663}
664
644static inline void init_lock_keys(void) 665static inline void init_lock_keys(void)
645{ 666{
646 int node; 667 int node;
@@ -657,6 +678,14 @@ static inline void init_lock_keys(void)
657{ 678{
658} 679}
659 680
681static inline void on_slab_lock_classes(struct kmem_cache *cachep)
682{
683}
684
685static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
686{
687}
688
660static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 689static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
661{ 690{
662} 691}
@@ -1385,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1385 free_alien_cache(alien); 1414 free_alien_cache(alien);
1386 if (cachep->flags & SLAB_DEBUG_OBJECTS) 1415 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1387 slab_set_debugobj_lock_classes_node(cachep, node); 1416 slab_set_debugobj_lock_classes_node(cachep, node);
1417 else if (!OFF_SLAB(cachep) &&
1418 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1419 on_slab_lock_classes_node(cachep, node);
1388 } 1420 }
1389 init_node_lock_keys(node); 1421 init_node_lock_keys(node);
1390 1422
@@ -1863,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1863 if (page->pfmemalloc) 1895 if (page->pfmemalloc)
1864 SetPageSlabPfmemalloc(page + i); 1896 SetPageSlabPfmemalloc(page + i);
1865 } 1897 }
1898 memcg_bind_pages(cachep, cachep->gfporder);
1866 1899
1867 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1900 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1868 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1901 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1899,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1899 __ClearPageSlab(page); 1932 __ClearPageSlab(page);
1900 page++; 1933 page++;
1901 } 1934 }
1935
1936 memcg_release_pages(cachep, cachep->gfporder);
1902 if (current->reclaim_state) 1937 if (current->reclaim_state)
1903 current->reclaim_state->reclaimed_slab += nr_freed; 1938 current->reclaim_state->reclaimed_slab += nr_freed;
1904 free_pages((unsigned long)addr, cachep->gfporder); 1939 free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
1905} 1940}
1906 1941
1907static void kmem_rcu_free(struct rcu_head *head) 1942static void kmem_rcu_free(struct rcu_head *head)
@@ -2489,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2489 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2524 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2490 2525
2491 slab_set_debugobj_lock_classes(cachep); 2526 slab_set_debugobj_lock_classes(cachep);
2492 } 2527 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2528 on_slab_lock_classes(cachep);
2493 2529
2494 return 0; 2530 return 0;
2495} 2531}
@@ -3453,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3453 if (slab_should_failslab(cachep, flags)) 3489 if (slab_should_failslab(cachep, flags))
3454 return NULL; 3490 return NULL;
3455 3491
3492 cachep = memcg_kmem_get_cache(cachep, flags);
3493
3456 cache_alloc_debugcheck_before(cachep, flags); 3494 cache_alloc_debugcheck_before(cachep, flags);
3457 local_irq_save(save_flags); 3495 local_irq_save(save_flags);
3458 3496
@@ -3538,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3538 if (slab_should_failslab(cachep, flags)) 3576 if (slab_should_failslab(cachep, flags))
3539 return NULL; 3577 return NULL;
3540 3578
3579 cachep = memcg_kmem_get_cache(cachep, flags);
3580
3541 cache_alloc_debugcheck_before(cachep, flags); 3581 cache_alloc_debugcheck_before(cachep, flags);
3542 local_irq_save(save_flags); 3582 local_irq_save(save_flags);
3543 objp = __do_cache_alloc(cachep, flags); 3583 objp = __do_cache_alloc(cachep, flags);
@@ -3851,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
3851void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3891void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3852{ 3892{
3853 unsigned long flags; 3893 unsigned long flags;
3894 cachep = cache_from_obj(cachep, objp);
3895 if (!cachep)
3896 return;
3854 3897
3855 local_irq_save(flags); 3898 local_irq_save(flags);
3856 debug_check_no_locks_freed(objp, cachep->object_size); 3899 debug_check_no_locks_freed(objp, cachep->object_size);
@@ -3998,7 +4041,7 @@ static void do_ccupdate_local(void *info)
3998} 4041}
3999 4042
4000/* Always called with the slab_mutex held */ 4043/* Always called with the slab_mutex held */
4001static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4044static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
4002 int batchcount, int shared, gfp_t gfp) 4045 int batchcount, int shared, gfp_t gfp)
4003{ 4046{
4004 struct ccupdate_struct *new; 4047 struct ccupdate_struct *new;
@@ -4041,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4041 return alloc_kmemlist(cachep, gfp); 4084 return alloc_kmemlist(cachep, gfp);
4042} 4085}
4043 4086
4087static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4088 int batchcount, int shared, gfp_t gfp)
4089{
4090 int ret;
4091 struct kmem_cache *c = NULL;
4092 int i = 0;
4093
4094 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4095
4096 if (slab_state < FULL)
4097 return ret;
4098
4099 if ((ret < 0) || !is_root_cache(cachep))
4100 return ret;
4101
4102 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
4103 for_each_memcg_cache_index(i) {
4104 c = cache_from_memcg(cachep, i);
4105 if (c)
4106 /* return value determined by the parent cache only */
4107 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
4108 }
4109
4110 return ret;
4111}
4112
4044/* Called with slab_mutex held always */ 4113/* Called with slab_mutex held always */
4045static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4114static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4046{ 4115{
4047 int err; 4116 int err;
4048 int limit, shared; 4117 int limit = 0;
4118 int shared = 0;
4119 int batchcount = 0;
4120
4121 if (!is_root_cache(cachep)) {
4122 struct kmem_cache *root = memcg_root_cache(cachep);
4123 limit = root->limit;
4124 shared = root->shared;
4125 batchcount = root->batchcount;
4126 }
4049 4127
4128 if (limit && shared && batchcount)
4129 goto skip_setup;
4050 /* 4130 /*
4051 * The head array serves three purposes: 4131 * The head array serves three purposes:
4052 * - create a LIFO ordering, i.e. return objects that are cache-warm 4132 * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4088,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4088 if (limit > 32) 4168 if (limit > 32)
4089 limit = 32; 4169 limit = 32;
4090#endif 4170#endif
4091 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); 4171 batchcount = (limit + 1) / 2;
4172skip_setup:
4173 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4092 if (err) 4174 if (err)
4093 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 4175 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4094 cachep->name, -err); 4176 cachep->name, -err);
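The slab.c hunk above makes do_tune_cpucache() apply new limit/batchcount/shared values to the root cache first and then walk its per-memcg copies, while the return value is taken from the root only. This is the same propagate-to-children pattern used elsewhere in the series; the sketch below models it with mock types (struct cache, MAX_MEMCG, tune_one() standing in for kmem_cache, the memcg cache array and __do_tune_cpucache()).

/* Best-effort propagation of tunables from a root cache to its children. */
#include <stdio.h>

#define MAX_MEMCG 4

struct cache {
    const char   *name;
    int           limit, batchcount, shared;
    struct cache *memcg_caches[MAX_MEMCG];   /* children; NULL if unused */
};

static int tune_one(struct cache *c, int limit, int batchcount, int shared)
{
    c->limit = limit;
    c->batchcount = batchcount;
    c->shared = shared;
    return 0;
}

static int tune_cache(struct cache *root, int limit, int batchcount, int shared)
{
    int ret = tune_one(root, limit, batchcount, shared);

    if (ret < 0)
        return ret;
    for (int i = 0; i < MAX_MEMCG; i++)
        if (root->memcg_caches[i])
            /* best effort: children do not affect the return value */
            tune_one(root->memcg_caches[i], limit, batchcount, shared);
    return ret;
}

int main(void)
{
    struct cache child = { .name = "dentry(2:foo)" };
    struct cache root  = { .name = "dentry", .memcg_caches = { &child } };

    tune_cache(&root, 120, 60, 8);
    printf("%s limit=%d, %s limit=%d\n",
           root.name, root.limit, child.name, child.limit);
    return 0;
}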
diff --git a/mm/slab.h b/mm/slab.h
index 1cb9c9ee0e6f..34a98d642196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -43,12 +43,15 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
43extern void create_boot_cache(struct kmem_cache *, const char *name, 43extern void create_boot_cache(struct kmem_cache *, const char *name,
44 size_t size, unsigned long flags); 44 size_t size, unsigned long flags);
45 45
46struct mem_cgroup;
46#ifdef CONFIG_SLUB 47#ifdef CONFIG_SLUB
47struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 48struct kmem_cache *
48 size_t align, unsigned long flags, void (*ctor)(void *)); 49__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
50 size_t align, unsigned long flags, void (*ctor)(void *));
49#else 51#else
50static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 52static inline struct kmem_cache *
51 size_t align, unsigned long flags, void (*ctor)(void *)) 53__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
54 size_t align, unsigned long flags, void (*ctor)(void *))
52{ return NULL; } 55{ return NULL; }
53#endif 56#endif
54 57
@@ -100,4 +103,130 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
100void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); 103void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
101ssize_t slabinfo_write(struct file *file, const char __user *buffer, 104ssize_t slabinfo_write(struct file *file, const char __user *buffer,
102 size_t count, loff_t *ppos); 105 size_t count, loff_t *ppos);
106
107#ifdef CONFIG_MEMCG_KMEM
108static inline bool is_root_cache(struct kmem_cache *s)
109{
110 return !s->memcg_params || s->memcg_params->is_root_cache;
111}
112
113static inline bool cache_match_memcg(struct kmem_cache *cachep,
114 struct mem_cgroup *memcg)
115{
116 return (is_root_cache(cachep) && !memcg) ||
117 (cachep->memcg_params->memcg == memcg);
118}
119
120static inline void memcg_bind_pages(struct kmem_cache *s, int order)
121{
122 if (!is_root_cache(s))
123 atomic_add(1 << order, &s->memcg_params->nr_pages);
124}
125
126static inline void memcg_release_pages(struct kmem_cache *s, int order)
127{
128 if (is_root_cache(s))
129 return;
130
131 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
132 mem_cgroup_destroy_cache(s);
133}
134
135static inline bool slab_equal_or_root(struct kmem_cache *s,
136 struct kmem_cache *p)
137{
138 return (p == s) ||
139 (s->memcg_params && (p == s->memcg_params->root_cache));
140}
141
142/*
143 * We use suffixes to the name in memcg because we can't have caches
144 * created in the system with the same name. But when we print them
 145 * locally, it is better to refer to them by their base name.
146 */
147static inline const char *cache_name(struct kmem_cache *s)
148{
149 if (!is_root_cache(s))
150 return s->memcg_params->root_cache->name;
151 return s->name;
152}
153
154static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
155{
156 return s->memcg_params->memcg_caches[idx];
157}
158
159static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
160{
161 if (is_root_cache(s))
162 return s;
163 return s->memcg_params->root_cache;
164}
165#else
166static inline bool is_root_cache(struct kmem_cache *s)
167{
168 return true;
169}
170
171static inline bool cache_match_memcg(struct kmem_cache *cachep,
172 struct mem_cgroup *memcg)
173{
174 return true;
175}
176
177static inline void memcg_bind_pages(struct kmem_cache *s, int order)
178{
179}
180
181static inline void memcg_release_pages(struct kmem_cache *s, int order)
182{
183}
184
185static inline bool slab_equal_or_root(struct kmem_cache *s,
186 struct kmem_cache *p)
187{
188 return true;
189}
190
191static inline const char *cache_name(struct kmem_cache *s)
192{
193 return s->name;
194}
195
196static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
197{
198 return NULL;
199}
200
201static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
202{
203 return s;
204}
205#endif
206
207static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
208{
209 struct kmem_cache *cachep;
210 struct page *page;
211
212 /*
213 * When kmemcg is not being used, both assignments should return the
214 * same value. but we don't want to pay the assignment price in that
215 * case. If it is not compiled in, the compiler should be smart enough
216 * to not do even the assignment. In that case, slab_equal_or_root
217 * will also be a constant.
218 */
219 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
220 return s;
221
222 page = virt_to_head_page(x);
223 cachep = page->slab_cache;
224 if (slab_equal_or_root(cachep, s))
225 return cachep;
226
227 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
228 __FUNCTION__, cachep->name, s->name);
229 WARN_ON_ONCE(1);
230 return s;
231}
103#endif 232#endif
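The cache_from_obj() helper added above recovers the cache from the object's page and accepts it only if it is the cache the caller passed or a per-memcg child of it; otherwise it warns and falls back. The sketch below keeps just that validation step; struct cache and the direct "actual" parameter are mock stand-ins for kmem_cache and virt_to_head_page()->slab_cache.

/* Validate that an object belongs to the expected cache or one of its
 * memcg children, under the assumptions stated above. */
#include <stdbool.h>
#include <stdio.h>

struct cache {
    const char   *name;
    struct cache *root;          /* NULL for a root cache */
};

static bool equal_or_root(struct cache *actual, struct cache *expected)
{
    return actual == expected || actual->root == expected;
}

static struct cache *cache_from_obj(struct cache *expected, struct cache *actual)
{
    if (equal_or_root(actual, expected))
        return actual;           /* free into the cache the object came from */

    fprintf(stderr, "wrong slab cache: %s but object is from %s\n",
            expected->name, actual->name);
    return expected;             /* the kernel warns and falls back similarly */
}

int main(void)
{
    struct cache root  = { .name = "kmalloc-64" };
    struct cache child = { .name = "kmalloc-64(2:foo)", .root = &root };
    struct cache other = { .name = "dentry" };

    cache_from_obj(&root, &child);   /* accepted: per-memcg child of the root */
    cache_from_obj(&root, &other);   /* mismatch: warning printed */
    return 0;
}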
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a8e76d79ee65..3f3cd97d3fdf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -18,6 +18,7 @@
18#include <asm/cacheflush.h> 18#include <asm/cacheflush.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h>
21 22
22#include "slab.h" 23#include "slab.h"
23 24
@@ -27,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
27struct kmem_cache *kmem_cache; 28struct kmem_cache *kmem_cache;
28 29
29#ifdef CONFIG_DEBUG_VM 30#ifdef CONFIG_DEBUG_VM
30static int kmem_cache_sanity_check(const char *name, size_t size) 31static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
32 size_t size)
31{ 33{
32 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
33 35
@@ -53,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
53 continue; 55 continue;
54 } 56 }
55 57
56 if (!strcmp(s->name, name)) { 58 /*
59 * For simplicity, we won't check this in the list of memcg
60 * caches. We have control over memcg naming, and if there
61 * aren't duplicates in the global list, there won't be any
 62 * duplicates in the memcg lists either.
63 */
64 if (!memcg && !strcmp(s->name, name)) {
57 pr_err("%s (%s): Cache name already exists.\n", 65 pr_err("%s (%s): Cache name already exists.\n",
58 __func__, name); 66 __func__, name);
59 dump_stack(); 67 dump_stack();
@@ -66,12 +74,41 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
66 return 0; 74 return 0;
67} 75}
68#else 76#else
69static inline int kmem_cache_sanity_check(const char *name, size_t size) 77static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
78 const char *name, size_t size)
70{ 79{
71 return 0; 80 return 0;
72} 81}
73#endif 82#endif
74 83
84#ifdef CONFIG_MEMCG_KMEM
85int memcg_update_all_caches(int num_memcgs)
86{
87 struct kmem_cache *s;
88 int ret = 0;
89 mutex_lock(&slab_mutex);
90
91 list_for_each_entry(s, &slab_caches, list) {
92 if (!is_root_cache(s))
93 continue;
94
95 ret = memcg_update_cache_size(s, num_memcgs);
96 /*
97 * See comment in memcontrol.c, memcg_update_cache_size:
98 * Instead of freeing the memory, we'll just leave the caches
99 * up to this point in an updated state.
100 */
101 if (ret)
102 goto out;
103 }
104
105 memcg_update_array_size(num_memcgs);
106out:
107 mutex_unlock(&slab_mutex);
108 return ret;
109}
110#endif
111
75/* 112/*
76 * Figure out what the alignment of the objects will be given a set of 113 * Figure out what the alignment of the objects will be given a set of
77 * flags, a user specified alignment and the size of the objects. 114 * flags, a user specified alignment and the size of the objects.
@@ -125,8 +162,10 @@ unsigned long calculate_alignment(unsigned long flags,
125 * as davem. 162 * as davem.
126 */ 163 */
127 164
128struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, 165struct kmem_cache *
129 unsigned long flags, void (*ctor)(void *)) 166kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
167 size_t align, unsigned long flags, void (*ctor)(void *),
168 struct kmem_cache *parent_cache)
130{ 169{
131 struct kmem_cache *s = NULL; 170 struct kmem_cache *s = NULL;
132 int err = 0; 171 int err = 0;
@@ -134,7 +173,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
134 get_online_cpus(); 173 get_online_cpus();
135 mutex_lock(&slab_mutex); 174 mutex_lock(&slab_mutex);
136 175
137 if (!kmem_cache_sanity_check(name, size) == 0) 176 if (!kmem_cache_sanity_check(memcg, name, size) == 0)
138 goto out_locked; 177 goto out_locked;
139 178
140 /* 179 /*
@@ -145,7 +184,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
145 */ 184 */
146 flags &= CACHE_CREATE_MASK; 185 flags &= CACHE_CREATE_MASK;
147 186
148 s = __kmem_cache_alias(name, size, align, flags, ctor); 187 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
149 if (s) 188 if (s)
150 goto out_locked; 189 goto out_locked;
151 190
@@ -154,6 +193,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
154 s->object_size = s->size = size; 193 s->object_size = s->size = size;
155 s->align = calculate_alignment(flags, align, size); 194 s->align = calculate_alignment(flags, align, size);
156 s->ctor = ctor; 195 s->ctor = ctor;
196
197 if (memcg_register_cache(memcg, s, parent_cache)) {
198 kmem_cache_free(kmem_cache, s);
199 err = -ENOMEM;
200 goto out_locked;
201 }
202
157 s->name = kstrdup(name, GFP_KERNEL); 203 s->name = kstrdup(name, GFP_KERNEL);
158 if (!s->name) { 204 if (!s->name) {
159 kmem_cache_free(kmem_cache, s); 205 kmem_cache_free(kmem_cache, s);
@@ -163,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
163 209
164 err = __kmem_cache_create(s, flags); 210 err = __kmem_cache_create(s, flags);
165 if (!err) { 211 if (!err) {
166
167 s->refcount = 1; 212 s->refcount = 1;
168 list_add(&s->list, &slab_caches); 213 list_add(&s->list, &slab_caches);
169 214 memcg_cache_list_add(memcg, s);
170 } else { 215 } else {
171 kfree(s->name); 216 kfree(s->name);
172 kmem_cache_free(kmem_cache, s); 217 kmem_cache_free(kmem_cache, s);
@@ -194,10 +239,20 @@ out_locked:
194 239
195 return s; 240 return s;
196} 241}
242
243struct kmem_cache *
244kmem_cache_create(const char *name, size_t size, size_t align,
245 unsigned long flags, void (*ctor)(void *))
246{
247 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
248}
197EXPORT_SYMBOL(kmem_cache_create); 249EXPORT_SYMBOL(kmem_cache_create);
198 250
199void kmem_cache_destroy(struct kmem_cache *s) 251void kmem_cache_destroy(struct kmem_cache *s)
200{ 252{
253 /* Destroy all the children caches if we aren't a memcg cache */
254 kmem_cache_destroy_memcg_children(s);
255
201 get_online_cpus(); 256 get_online_cpus();
202 mutex_lock(&slab_mutex); 257 mutex_lock(&slab_mutex);
203 s->refcount--; 258 s->refcount--;
@@ -209,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
209 if (s->flags & SLAB_DESTROY_BY_RCU) 264 if (s->flags & SLAB_DESTROY_BY_RCU)
210 rcu_barrier(); 265 rcu_barrier();
211 266
267 memcg_release_cache(s);
212 kfree(s->name); 268 kfree(s->name);
213 kmem_cache_free(kmem_cache, s); 269 kmem_cache_free(kmem_cache, s);
214 } else { 270 } else {
@@ -267,7 +323,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
267 323
268 324
269#ifdef CONFIG_SLABINFO 325#ifdef CONFIG_SLABINFO
270static void print_slabinfo_header(struct seq_file *m) 326void print_slabinfo_header(struct seq_file *m)
271{ 327{
272 /* 328 /*
273 * Output format version, so at least we can change it 329 * Output format version, so at least we can change it
@@ -311,16 +367,43 @@ static void s_stop(struct seq_file *m, void *p)
311 mutex_unlock(&slab_mutex); 367 mutex_unlock(&slab_mutex);
312} 368}
313 369
314static int s_show(struct seq_file *m, void *p) 370static void
371memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
372{
373 struct kmem_cache *c;
374 struct slabinfo sinfo;
375 int i;
376
377 if (!is_root_cache(s))
378 return;
379
380 for_each_memcg_cache_index(i) {
381 c = cache_from_memcg(s, i);
382 if (!c)
383 continue;
384
385 memset(&sinfo, 0, sizeof(sinfo));
386 get_slabinfo(c, &sinfo);
387
388 info->active_slabs += sinfo.active_slabs;
389 info->num_slabs += sinfo.num_slabs;
390 info->shared_avail += sinfo.shared_avail;
391 info->active_objs += sinfo.active_objs;
392 info->num_objs += sinfo.num_objs;
393 }
394}
395
396int cache_show(struct kmem_cache *s, struct seq_file *m)
315{ 397{
316 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
317 struct slabinfo sinfo; 398 struct slabinfo sinfo;
318 399
319 memset(&sinfo, 0, sizeof(sinfo)); 400 memset(&sinfo, 0, sizeof(sinfo));
320 get_slabinfo(s, &sinfo); 401 get_slabinfo(s, &sinfo);
321 402
403 memcg_accumulate_slabinfo(s, &sinfo);
404
322 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 405 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
323 s->name, sinfo.active_objs, sinfo.num_objs, s->size, 406 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
324 sinfo.objects_per_slab, (1 << sinfo.cache_order)); 407 sinfo.objects_per_slab, (1 << sinfo.cache_order));
325 408
326 seq_printf(m, " : tunables %4u %4u %4u", 409 seq_printf(m, " : tunables %4u %4u %4u",
@@ -332,6 +415,15 @@ static int s_show(struct seq_file *m, void *p)
332 return 0; 415 return 0;
333} 416}
334 417
418static int s_show(struct seq_file *m, void *p)
419{
420 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
421
422 if (!is_root_cache(s))
423 return 0;
424 return cache_show(s, m);
425}
426
335/* 427/*
336 * slabinfo_op - iterator that generates /proc/slabinfo 428 * slabinfo_op - iterator that generates /proc/slabinfo
337 * 429 *
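The slab_common.c changes above keep one /proc/slabinfo line per root cache and fold the counters of its per-memcg children into that line (memcg_accumulate_slabinfo), while s_show() skips non-root caches entirely. The sketch below reproduces only that aggregation; the structures are invented mocks of kmem_cache/slabinfo with just enough fields for the illustration.

/* Aggregate per-memcg child stats into the root cache's slabinfo line. */
#include <stdio.h>

#define MAX_MEMCG 4

struct stats { unsigned long active_objs, num_objs; };

struct cache {
    const char   *name;
    struct stats  stats;
    struct cache *memcg_caches[MAX_MEMCG];   /* NULL when unused */
    struct cache *root;                      /* NULL for root caches */
};

static void accumulate(const struct cache *root, struct stats *out)
{
    *out = root->stats;
    for (int i = 0; i < MAX_MEMCG; i++) {
        const struct cache *c = root->memcg_caches[i];
        if (!c)
            continue;
        out->active_objs += c->stats.active_objs;
        out->num_objs    += c->stats.num_objs;
    }
}

static void show(const struct cache *c)
{
    struct stats sum;

    if (c->root)
        return;                  /* children never get their own line */
    accumulate(c, &sum);
    printf("%-17s %6lu %6lu\n", c->name, sum.active_objs, sum.num_objs);
}

int main(void)
{
    struct cache root  = { .name = "radix_tree_node", .stats = { 100, 128 } };
    struct cache child = { .name = "radix_tree_node(2:foo)",
                           .stats = { 10, 16 }, .root = &root };

    root.memcg_caches[0] = &child;
    show(&root);     /* prints the combined 110 / 144 */
    show(&child);    /* prints nothing */
    return 0;
}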
diff --git a/mm/slob.c b/mm/slob.c
index 795bab7d391d..a99fdf7a0907 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -58,7 +58,6 @@
58 58
59#include <linux/kernel.h> 59#include <linux/kernel.h>
60#include <linux/slab.h> 60#include <linux/slab.h>
61#include "slab.h"
62 61
63#include <linux/mm.h> 62#include <linux/mm.h>
64#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
@@ -73,6 +72,7 @@
73 72
74#include <linux/atomic.h> 73#include <linux/atomic.h>
75 74
75#include "slab.h"
76/* 76/*
77 * slob_block has a field 'units', which indicates size of block if +ve, 77 * slob_block has a field 'units', which indicates size of block if +ve,
78 * or offset of next block if -ve (in SLOB_UNITs). 78 * or offset of next block if -ve (in SLOB_UNITs).
diff --git a/mm/slub.c b/mm/slub.c
index 87f9f32bf0cd..ba2ca53f6c3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
31#include <linux/fault-inject.h> 31#include <linux/fault-inject.h>
32#include <linux/stacktrace.h> 32#include <linux/stacktrace.h>
33#include <linux/prefetch.h> 33#include <linux/prefetch.h>
34#include <linux/memcontrol.h>
34 35
35#include <trace/events/kmem.h> 36#include <trace/events/kmem.h>
36 37
@@ -200,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
200static int sysfs_slab_add(struct kmem_cache *); 201static int sysfs_slab_add(struct kmem_cache *);
201static int sysfs_slab_alias(struct kmem_cache *, const char *); 202static int sysfs_slab_alias(struct kmem_cache *, const char *);
202static void sysfs_slab_remove(struct kmem_cache *); 203static void sysfs_slab_remove(struct kmem_cache *);
203 204static void memcg_propagate_slab_attrs(struct kmem_cache *s);
204#else 205#else
205static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 206static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
206static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 207static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
207 { return 0; } 208 { return 0; }
208static inline void sysfs_slab_remove(struct kmem_cache *s) { } 209static inline void sysfs_slab_remove(struct kmem_cache *s) { }
209 210
211static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
210#endif 212#endif
211 213
212static inline void stat(const struct kmem_cache *s, enum stat_item si) 214static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1343,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1343 void *start; 1345 void *start;
1344 void *last; 1346 void *last;
1345 void *p; 1347 void *p;
1348 int order;
1346 1349
1347 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1350 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1348 1351
@@ -1351,7 +1354,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1351 if (!page) 1354 if (!page)
1352 goto out; 1355 goto out;
1353 1356
1357 order = compound_order(page);
1354 inc_slabs_node(s, page_to_nid(page), page->objects); 1358 inc_slabs_node(s, page_to_nid(page), page->objects);
1359 memcg_bind_pages(s, order);
1355 page->slab_cache = s; 1360 page->slab_cache = s;
1356 __SetPageSlab(page); 1361 __SetPageSlab(page);
1357 if (page->pfmemalloc) 1362 if (page->pfmemalloc)
@@ -1360,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1360 start = page_address(page); 1365 start = page_address(page);
1361 1366
1362 if (unlikely(s->flags & SLAB_POISON)) 1367 if (unlikely(s->flags & SLAB_POISON))
1363 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1368 memset(start, POISON_INUSE, PAGE_SIZE << order);
1364 1369
1365 last = start; 1370 last = start;
1366 for_each_object(p, s, start, page->objects) { 1371 for_each_object(p, s, start, page->objects) {
@@ -1401,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1401 1406
1402 __ClearPageSlabPfmemalloc(page); 1407 __ClearPageSlabPfmemalloc(page);
1403 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409
1410 memcg_release_pages(s, order);
1404 reset_page_mapcount(page); 1411 reset_page_mapcount(page);
1405 if (current->reclaim_state) 1412 if (current->reclaim_state)
1406 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1407 __free_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
1408} 1415}
1409 1416
1410#define need_reserve_slab_rcu \ 1417#define need_reserve_slab_rcu \
@@ -2322,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2322 if (slab_pre_alloc_hook(s, gfpflags)) 2329 if (slab_pre_alloc_hook(s, gfpflags))
2323 return NULL; 2330 return NULL;
2324 2331
2332 s = memcg_kmem_get_cache(s, gfpflags);
2325redo: 2333redo:
2326 2334
2327 /* 2335 /*
@@ -2610,19 +2618,10 @@ redo:
2610 2618
2611void kmem_cache_free(struct kmem_cache *s, void *x) 2619void kmem_cache_free(struct kmem_cache *s, void *x)
2612{ 2620{
2613 struct page *page; 2621 s = cache_from_obj(s, x);
2614 2622 if (!s)
2615 page = virt_to_head_page(x);
2616
2617 if (kmem_cache_debug(s) && page->slab_cache != s) {
2618 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2619 " is from %s\n", page->slab_cache->name, s->name);
2620 WARN_ON_ONCE(1);
2621 return; 2623 return;
2622 } 2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2623
2624 slab_free(s, page, x, _RET_IP_);
2625
2626 trace_kmem_cache_free(_RET_IP_, x); 2625 trace_kmem_cache_free(_RET_IP_, x);
2627} 2626}
2628EXPORT_SYMBOL(kmem_cache_free); 2627EXPORT_SYMBOL(kmem_cache_free);
@@ -3154,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3154{ 3153{
3155 int rc = kmem_cache_close(s); 3154 int rc = kmem_cache_close(s);
3156 3155
3157 if (!rc) 3156 if (!rc) {
3157 /*
 3158 * We use the same locking strategy as around sysfs_slab_add; see
3159 * __kmem_cache_create. Because this is pretty much the last
3160 * operation we do and the lock will be released shortly after
3161 * that in slab_common.c, we could just move sysfs_slab_remove
3162 * to a later point in common code. We should do that when we
3163 * have a common sysfs framework for all allocators.
3164 */
3165 mutex_unlock(&slab_mutex);
3158 sysfs_slab_remove(s); 3166 sysfs_slab_remove(s);
3167 mutex_lock(&slab_mutex);
3168 }
3159 3169
3160 return rc; 3170 return rc;
3161} 3171}
@@ -3292,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3292 struct page *page; 3302 struct page *page;
3293 void *ptr = NULL; 3303 void *ptr = NULL;
3294 3304
3295 flags |= __GFP_COMP | __GFP_NOTRACK; 3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3296 page = alloc_pages_node(node, flags, get_order(size)); 3306 page = alloc_pages_node(node, flags, get_order(size));
3297 if (page) 3307 if (page)
3298 ptr = page_address(page); 3308 ptr = page_address(page);
@@ -3398,7 +3408,7 @@ void kfree(const void *x)
3398 if (unlikely(!PageSlab(page))) { 3408 if (unlikely(!PageSlab(page))) {
3399 BUG_ON(!PageCompound(page)); 3409 BUG_ON(!PageCompound(page));
3400 kmemleak_free(x); 3410 kmemleak_free(x);
3401 __free_pages(page, compound_order(page)); 3411 __free_memcg_kmem_pages(page, compound_order(page));
3402 return; 3412 return;
3403 } 3413 }
3404 slab_free(page->slab_cache, page, object, _RET_IP_); 3414 slab_free(page->slab_cache, page, object, _RET_IP_);
@@ -3786,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3786 return 0; 3796 return 0;
3787} 3797}
3788 3798
3789static struct kmem_cache *find_mergeable(size_t size, 3799static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3790 size_t align, unsigned long flags, const char *name, 3800 size_t align, unsigned long flags, const char *name,
3791 void (*ctor)(void *)) 3801 void (*ctor)(void *))
3792{ 3802{
@@ -3822,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
3822 if (s->size - size >= sizeof(void *)) 3832 if (s->size - size >= sizeof(void *))
3823 continue; 3833 continue;
3824 3834
3835 if (!cache_match_memcg(s, memcg))
3836 continue;
3837
3825 return s; 3838 return s;
3826 } 3839 }
3827 return NULL; 3840 return NULL;
3828} 3841}
3829 3842
3830struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 3843struct kmem_cache *
3831 size_t align, unsigned long flags, void (*ctor)(void *)) 3844__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3845 size_t align, unsigned long flags, void (*ctor)(void *))
3832{ 3846{
3833 struct kmem_cache *s; 3847 struct kmem_cache *s;
3834 3848
3835 s = find_mergeable(size, align, flags, name, ctor); 3849 s = find_mergeable(memcg, size, align, flags, name, ctor);
3836 if (s) { 3850 if (s) {
3837 s->refcount++; 3851 s->refcount++;
3838 /* 3852 /*
@@ -3863,6 +3877,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3863 if (slab_state <= UP) 3877 if (slab_state <= UP)
3864 return 0; 3878 return 0;
3865 3879
3880 memcg_propagate_slab_attrs(s);
3866 mutex_unlock(&slab_mutex); 3881 mutex_unlock(&slab_mutex);
3867 err = sysfs_slab_add(s); 3882 err = sysfs_slab_add(s);
3868 mutex_lock(&slab_mutex); 3883 mutex_lock(&slab_mutex);
@@ -5096,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5096 return -EIO; 5111 return -EIO;
5097 5112
5098 err = attribute->store(s, buf, len); 5113 err = attribute->store(s, buf, len);
5114#ifdef CONFIG_MEMCG_KMEM
5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5116 int i;
5099 5117
5118 mutex_lock(&slab_mutex);
5119 if (s->max_attr_size < len)
5120 s->max_attr_size = len;
5121
5122 /*
5123 * This is a best effort propagation, so this function's return
5124 * value will be determined by the parent cache only. This is
5125 * basically because not all attributes will have a well
5126 * defined semantics for rollbacks - most of the actions will
5127 * have permanent effects.
5128 *
5129 * Returning the error value of any of the children that fail
5130 * is not 100 % defined, in the sense that users seeing the
5131 * error code won't be able to know anything about the state of
5132 * the cache.
5133 *
5134 * Only returning the error code for the parent cache at least
5135 * has well defined semantics. The cache being written to
5136 * directly either failed or succeeded, in which case we loop
5137 * through the descendants with best-effort propagation.
5138 */
5139 for_each_memcg_cache_index(i) {
5140 struct kmem_cache *c = cache_from_memcg(s, i);
5141 if (c)
5142 attribute->store(c, buf, len);
5143 }
5144 mutex_unlock(&slab_mutex);
5145 }
5146#endif
5100 return err; 5147 return err;
5101} 5148}
5102 5149
5150static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5151{
5152#ifdef CONFIG_MEMCG_KMEM
5153 int i;
5154 char *buffer = NULL;
5155
5156 if (!is_root_cache(s))
5157 return;
5158
5159 /*
 5160 * This means this cache had no attribute written. Therefore, there is
 5161 * no point in copying default values around.
5162 */
5163 if (!s->max_attr_size)
5164 return;
5165
5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5167 char mbuf[64];
5168 char *buf;
5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5170
5171 if (!attr || !attr->store || !attr->show)
5172 continue;
5173
5174 /*
5175 * It is really bad that we have to allocate here, so we will
5176 * do it only as a fallback. If we actually allocate, though,
5177 * we can just use the allocated buffer until the end.
5178 *
5179 * Most of the slub attributes will tend to be very small in
 5180 * size, but sysfs allows buffers up to a page, so page-sized values can
 5181 * theoretically occur.
5182 */
5183 if (buffer)
5184 buf = buffer;
5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5186 buf = mbuf;
5187 else {
5188 buffer = (char *) get_zeroed_page(GFP_KERNEL);
5189 if (WARN_ON(!buffer))
5190 continue;
5191 buf = buffer;
5192 }
5193
5194 attr->show(s->memcg_params->root_cache, buf);
5195 attr->store(s, buf, strlen(buf));
5196 }
5197
5198 if (buffer)
5199 free_page((unsigned long)buffer);
5200#endif
5201}
5202
5103static const struct sysfs_ops slab_sysfs_ops = { 5203static const struct sysfs_ops slab_sysfs_ops = {
5104 .show = slab_attr_show, 5204 .show = slab_attr_show,
5105 .store = slab_attr_store, 5205 .store = slab_attr_store,
@@ -5156,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
5156 if (p != name + 1) 5256 if (p != name + 1)
5157 *p++ = '-'; 5257 *p++ = '-';
5158 p += sprintf(p, "%07d", s->size); 5258 p += sprintf(p, "%07d", s->size);
5259
5260#ifdef CONFIG_MEMCG_KMEM
5261 if (!is_root_cache(s))
5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5263#endif
5264
5159 BUG_ON(p > name + ID_STR_LENGTH - 1); 5265 BUG_ON(p > name + ID_STR_LENGTH - 1);
5160 return name; 5266 return name;
5161} 5267}
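The slub.c changes above propagate sysfs attributes in two directions: a value written to a root cache's attribute is replayed into each per-memcg copy, and a newly created copy inherits the root's current values by round-tripping them through the attribute's show()/store() pair (memcg_propagate_slab_attrs). The sketch below models that round-trip with mock struct attr/struct cache types and one example tunable; it is an illustration of the mechanism, not the slub sysfs code.

/* show()/store() round-trip propagation of one tunable, as assumed above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_MEMCG 4

struct cache {
    const char   *name;
    int           cpu_partial;               /* one example tunable */
    struct cache *memcg_caches[MAX_MEMCG];
};

struct attr {
    int  (*show)(struct cache *c, char *buf);
    void (*store)(struct cache *c, const char *buf);
};

static int cpu_partial_show(struct cache *c, char *buf)
{
    return sprintf(buf, "%d", c->cpu_partial);
}

static void cpu_partial_store(struct cache *c, const char *buf)
{
    c->cpu_partial = atoi(buf);
}

static const struct attr cpu_partial_attr = { cpu_partial_show, cpu_partial_store };

/* store on the root, then best-effort replay into every existing child */
static void attr_store(struct cache *root, const struct attr *a, const char *buf)
{
    a->store(root, buf);
    for (int i = 0; i < MAX_MEMCG; i++)
        if (root->memcg_caches[i])
            a->store(root->memcg_caches[i], buf);
}

/* a new child copies the root's current value via show() + store() */
static void propagate_to_new_child(struct cache *root, struct cache *child,
                                   const struct attr *a)
{
    char buf[64];

    a->show(root, buf);
    a->store(child, buf);
}

int main(void)
{
    struct cache root  = { .name = "anon_vma" };
    struct cache child = { .name = "anon_vma(2:foo)" };

    attr_store(&root, &cpu_partial_attr, "30");   /* only the root exists yet */
    propagate_to_new_child(&root, &child, &cpu_partial_attr);
    root.memcg_caches[0] = &child;

    printf("root=%d child=%d\n", root.cpu_partial, child.cpu_partial);
    return 0;
}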
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7f3096137b8a..828530e2794a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
1177} 1177}
1178 1178
1179/* 1179/*
1180 * Are there way too many processes in the direct reclaim path already? 1180 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 1181 * then get rescheduled. When there is a massive number of tasks doing page
1182 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1183 * the LRU list will go small and be scanned faster than necessary, leading to
1184 * unnecessary swapping, thrashing and OOM.
1181 */ 1185 */
1182static int too_many_isolated(struct zone *zone, int file, 1186static int too_many_isolated(struct zone *zone, int file,
1183 struct scan_control *sc) 1187 struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
1198 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1202 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1199 } 1203 }
1200 1204
1205 /*
1206 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1207 * won't get blocked by normal direct-reclaimers, forming a circular
1208 * deadlock.
1209 */
1210 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1211 inactive >>= 3;
1212
1201 return isolated > inactive; 1213 return isolated > inactive;
1202} 1214}
1203 1215
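The vmscan.c hunk above throttles a direct reclaimer once the isolated pages outnumber the inactive ones, and tightens that bound to inactive/8 for callers that can do both IO and FS, so GFP_NOIO/GFP_NOFS reclaimers keep the full budget and cannot deadlock behind the ordinary ones. The sketch below reproduces just that comparison; the GFP bit values are local mocks.

/* Throttling heuristic sketch, under the flag-encoding assumption above. */
#include <stdbool.h>
#include <stdio.h>

#define MY_GFP_IO   0x1u
#define MY_GFP_FS   0x2u
#define MY_GFP_IOFS (MY_GFP_IO | MY_GFP_FS)

static bool too_many_isolated(unsigned long isolated, unsigned long inactive,
                              unsigned int gfp_mask)
{
    if ((gfp_mask & MY_GFP_IOFS) == MY_GFP_IOFS)
        inactive >>= 3;          /* ordinary reclaimers get 1/8 of the budget */
    return isolated > inactive;
}

int main(void)
{
    unsigned long inactive = 800, isolated = 150;

    printf("GFP_KERNEL-like caller throttled: %d\n",
           too_many_isolated(isolated, inactive, MY_GFP_IOFS));  /* 150 > 100 */
    printf("GFP_NOFS-like caller throttled:   %d\n",
           too_many_isolated(isolated, inactive, MY_GFP_IO));    /* 150 > 800? no */
    return 0;
}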
diff --git a/scripts/coccinelle/api/d_find_alias.cocci b/scripts/coccinelle/api/d_find_alias.cocci
new file mode 100644
index 000000000000..a9694a8d3e5a
--- /dev/null
+++ b/scripts/coccinelle/api/d_find_alias.cocci
@@ -0,0 +1,80 @@
1/// Make sure calls to d_find_alias() have a corresponding call to dput().
2//
3// Keywords: d_find_alias, dput
4//
5// Confidence: Moderate
6// URL: http://coccinelle.lip6.fr/
7// Options: -include_headers
8
9virtual context
10virtual org
11virtual patch
12virtual report
13
14@r exists@
15local idexpression struct dentry *dent;
16expression E, E1;
17statement S1, S2;
18position p1, p2;
19@@
20(
21 if (!(dent@p1 = d_find_alias(...))) S1
22|
23 dent@p1 = d_find_alias(...)
24)
25
26<...when != dput(dent)
27 when != if (...) { <+... dput(dent) ...+> }
28 when != true !dent || ...
29 when != dent = E
30 when != E = dent
31if (!dent || ...) S2
32...>
33(
34 return <+...dent...+>;
35|
36 return @p2 ...;
37|
38 dent@p2 = E1;
39|
40 E1 = dent;
41)
42
43@depends on context@
44local idexpression struct dentry *r.dent;
45position r.p1,r.p2;
46@@
47* dent@p1 = ...
48 ...
49(
50* return@p2 ...;
51|
52* dent@p2
53)
54
55
56@script:python depends on org@
57p1 << r.p1;
58p2 << r.p2;
59@@
60cocci.print_main("Missing call to dput()",p1)
61cocci.print_secs("",p2)
62
63@depends on patch@
64local idexpression struct dentry *r.dent;
65position r.p2;
66@@
67(
68+ dput(dent);
69 return @p2 ...;
70|
71+ dput(dent);
72 dent@p2 = ...;
73)
74
75@script:python depends on report@
76p1 << r.p1;
77p2 << r.p2;
78@@
79msg = "Missing call to dput() at line %s."
80coccilib.report.print_report(p1[0], msg % (p2[0].line))
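For reference, the kind of code the semantic patch above reports and repairs is a d_find_alias() caller that returns without a matching dput(). The fragment below is a self-contained illustration only: the struct dentry, d_find_alias() and dput() definitions are stubs standing in for the real VFS API (the real d_find_alias() takes an inode), and example_fixed() has the shape the rule's patch mode would produce.

/* Reference-must-be-dropped-on-every-path illustration, with stubbed VFS API. */
#include <stdio.h>
#include <stdlib.h>

struct dentry { int refcount; };

static struct dentry *d_find_alias(void)        /* stub: always "finds" one */
{
    struct dentry *d = calloc(1, sizeof(*d));
    if (d)
        d->refcount = 1;
    return d;
}

static void dput(struct dentry *d)              /* stub for the real dput() */
{
    if (d && --d->refcount == 0)
        free(d);
}

static int example_fixed(void)
{
    struct dentry *dent = d_find_alias();

    if (!dent)
        return -1;

    /* ... use dent ... */

    dput(dent);          /* the call the patch mode of the rule inserts */
    return 0;
}

int main(void)
{
    return example_fixed();
}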