Diffstat (limited to 'include')
-rw-r--r--  include/linux/gfp.h              5
-rw-r--r--  include/linux/hugetlb_cgroup.h   5
-rw-r--r--  include/linux/memcontrol.h     209
-rw-r--r--  include/linux/res_counter.h     12
-rw-r--r--  include/linux/sched.h            1
-rw-r--r--  include/linux/slab.h            48
-rw-r--r--  include/linux/slab_def.h         3
-rw-r--r--  include/linux/slub_def.h         9
-rw-r--r--  include/linux/thread_info.h      2
-rw-r--r--  include/trace/events/gfpflags.h  1
10 files changed, 286 insertions(+), 9 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f74856e17e48..0f615eb23d05 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,6 +30,7 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
+#define ___GFP_KMEMCG		0x100000u
 #define ___GFP_NOTRACK		0x200000u
 #define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
@@ -89,6 +90,7 @@ struct vm_area_struct;
 
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE	((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_KMEMCG	((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
 /*
@@ -365,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_cold_page(struct page *page, int cold);
 extern void free_hot_cold_page_list(struct list_head *list, int cold);
 
+extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
+extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
+
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
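The new flag and helpers above pair up: an allocation that should be charged to the current task's memcg passes __GFP_KMEMCG down to the page allocator, and the matching free goes through free_memcg_kmem_pages()/__free_memcg_kmem_pages() so the charge is dropped with the pages. A minimal sketch of that pairing, assuming a caller that wants one accounted page (the wrapper names below are illustrative, not part of this patch):

/* Sketch: allocate one page charged to the current task's memcg, then free it. */
static unsigned long alloc_accounted_page(void)
{
	/* __GFP_KMEMCG asks the page allocator to account the page to the memcg. */
	return __get_free_pages(GFP_KERNEL | __GFP_KMEMCG, 0);
}

static void free_accounted_page(unsigned long addr)
{
	/* Free through the memcg-aware helper so the charge is dropped as well. */
	free_memcg_kmem_pages(addr, 0);
}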
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index d73878c694b3..ce8217f7b5c2 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -62,7 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
 					 struct page *page);
 extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 					   struct hugetlb_cgroup *h_cg);
-extern int hugetlb_cgroup_file_init(int idx) __init;
+extern void hugetlb_cgroup_file_init(void) __init;
 extern void hugetlb_cgroup_migrate(struct page *oldhpage,
 				   struct page *newhpage);
 
@@ -111,9 +111,8 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 	return;
 }
 
-static inline int __init hugetlb_cgroup_file_init(int idx)
+static inline void hugetlb_cgroup_file_init(void)
 {
-	return 0;
 }
 
 static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e98a74c0c9c0..0108a56f814e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,11 +21,14 @@
 #define _LINUX_MEMCONTROL_H
 #include <linux/cgroup.h>
 #include <linux/vm_event_item.h>
+#include <linux/hardirq.h>
+#include <linux/jump_label.h>
 
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
+struct kmem_cache;
 
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
@@ -414,5 +417,211 @@ static inline void sock_release_memcg(struct sock *sk)
 {
 }
 #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+extern struct static_key memcg_kmem_enabled_key;
+
+extern int memcg_limited_groups_array_size;
+
+/*
+ * Helper macro to loop through all memcg-specific caches. Callers must still
+ * check if the cache is valid (it is either valid or NULL).
+ * The slab_mutex must be held when looping through these caches.
+ */
+#define for_each_memcg_cache_index(_idx)	\
+	for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
+
+static inline bool memcg_kmem_enabled(void)
+{
+	return static_key_false(&memcg_kmem_enabled_key);
+}
+
+/*
+ * In general, we'll do everything in our power not to incur any overhead
+ * for non-memcg users of the kmem functions: not even a function call, if
+ * we can avoid it.
+ *
+ * Therefore, we'll inline all those functions so that in the best case, we'll
+ * see that kmemcg is off for everybody and proceed quickly. If it is on,
+ * we'll still do most of the flag checking inline. We check a lot of
+ * conditions, but because they are pretty simple, they are expected to be
+ * fast.
+ */
+bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
+				 int order);
+void __memcg_kmem_commit_charge(struct page *page,
+				struct mem_cgroup *memcg, int order);
+void __memcg_kmem_uncharge_pages(struct page *page, int order);
+
+int memcg_cache_id(struct mem_cgroup *memcg);
+int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
+			 struct kmem_cache *root_cache);
+void memcg_release_cache(struct kmem_cache *cachep);
+void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
+void memcg_update_array_size(int num_groups);
+
+struct kmem_cache *
+__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
+void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
+
+/**
+ * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
+ * @gfp: the gfp allocation flags.
+ * @memcg: a pointer to the memcg this was charged against.
+ * @order: allocation order.
+ *
+ * Returns true if the memcg to which the current task belongs can hold this
+ * allocation.
+ *
+ * We return true automatically if this allocation is not to be accounted to
+ * any memcg.
+ */
+static inline bool
+memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+{
+	if (!memcg_kmem_enabled())
+		return true;
+
+	/*
+	 * __GFP_NOFAIL allocations will move on even if charging is not
+	 * possible. Therefore we don't even try, and have this allocation
+	 * unaccounted. We could in theory charge it with
+	 * res_counter_charge_nofail, but we hope those allocations are rare,
+	 * and won't be worth the trouble.
+	 */
+	if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+		return true;
+	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
+		return true;
+
+	/* If the task is dying, just let it go. */
+	if (unlikely(fatal_signal_pending(current)))
+		return true;
+
+	return __memcg_kmem_newpage_charge(gfp, memcg, order);
+}
+
+/**
+ * memcg_kmem_uncharge_pages: uncharge pages from memcg
+ * @page: pointer to struct page being freed
+ * @order: allocation order.
+ *
+ * There is no need to specify the memcg here, since it is embedded in page_cgroup.
+ */
+static inline void
+memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_uncharge_pages(page, order);
+}
+
+/**
+ * memcg_kmem_commit_charge: embeds correct memcg in a page
+ * @page: pointer to struct page recently allocated
+ * @memcg: the memcg structure we charged against
+ * @order: allocation order.
+ *
+ * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
+ * failure of the allocation. If @page is NULL, this function will revert the
+ * charges. Otherwise, it will commit the memcg given by @memcg to the
+ * corresponding page_cgroup.
+ */
+static inline void
+memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+{
+	if (memcg_kmem_enabled() && memcg)
+		__memcg_kmem_commit_charge(page, memcg, order);
+}
+
+/**
+ * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
+ * @cachep: the original global kmem cache
+ * @gfp: allocation flags.
+ *
+ * This function assumes that the task allocating, which determines the memcg
+ * in the page allocator, belongs to the same cgroup throughout the whole
+ * process. Misaccounting can happen if the task calls memcg_kmem_get_cache()
+ * while belonging to a cgroup, and later changes cgroup. This is considered
+ * acceptable, and should only happen upon task migration.
+ *
+ * Before the cache is created by the memcg core, there is also a possible
+ * imbalance: the task belongs to a memcg, but the cache being allocated from
+ * is the global cache, since the child cache is not yet guaranteed to be
+ * ready. This case is also fine, since in this case __GFP_KMEMCG will not be
+ * passed and the page allocator will not attempt any cgroup accounting.
+ */
+static __always_inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+	if (!memcg_kmem_enabled())
+		return cachep;
+	if (gfp & __GFP_NOFAIL)
+		return cachep;
+	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
+		return cachep;
+	if (unlikely(fatal_signal_pending(current)))
+		return cachep;
+
+	return __memcg_kmem_get_cache(cachep, gfp);
+}
+#else
+#define for_each_memcg_cache_index(_idx) \
+	for (; NULL; )
+
+static inline bool memcg_kmem_enabled(void)
+{
+	return false;
+}
+
+static inline bool
+memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+{
+	return true;
+}
+
+static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+}
+
+static inline void
+memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+{
+}
+
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+	return -1;
+}
+
+static inline int
+memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
+		     struct kmem_cache *root_cache)
+{
+	return 0;
+}
+
+static inline void memcg_release_cache(struct kmem_cache *cachep)
+{
+}
+
+static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
+					struct kmem_cache *s)
+{
+}
+
+static inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+	return cachep;
+}
+
+static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
 
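The inline wrappers above describe a three-step protocol: charge before the allocation, commit exactly once after it (whether or not a page came back), and uncharge when the pages are freed. A sketch of that sequence, assuming an allocator-side caller (the function below is illustrative; the real hook sits in the page allocator, which is not part of this header):

/* Sketch of the charge/commit/uncharge sequence around one page allocation. */
static struct page *accounted_alloc_sketch(gfp_t gfp, unsigned int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page *page;

	/* Reserve against the memcg kmem limit; a no-op when kmemcg is disabled. */
	if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;

	page = NULL; /* ... the raw page allocation would happen here ... */

	/* Always commit: records @memcg in the page_cgroup, or reverts the
	 * charge if the allocation failed and @page is NULL. */
	memcg_kmem_commit_charge(page, memcg, order);
	return page;
}

At free time the memcg is recovered from the page_cgroup, so memcg_kmem_uncharge_pages(page, order) is all a caller needs.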
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 6f54e40fa218..5ae8456d9670 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -125,14 +125,16 @@ int res_counter_charge_nofail(struct res_counter *counter,
  *
  * these calls check for usage underflow and show a warning on the console
  * _locked call expects the counter->lock to be taken
+ *
+ * returns the total charges still present in @counter.
  */
 
-void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
+u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
-void res_counter_uncharge_until(struct res_counter *counter,
+u64 res_counter_uncharge_until(struct res_counter *counter,
 				struct res_counter *top,
 				unsigned long val);
 /**
  * res_counter_margin - calculate chargeable space of a counter
  * @cnt: the counter
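Since the uncharge functions now return the charges still present in the counter, a caller can detect that the last charge is gone without a separate read of the counter. A hedged sketch of that pattern (the cleanup callback is hypothetical, not part of this patch):

/* Sketch: trigger cleanup once an uncharge drains the counter to zero. */
static void uncharge_and_maybe_release(struct res_counter *counter,
				       unsigned long val,
				       void (*release)(struct res_counter *))
{
	if (res_counter_uncharge(counter, val) == 0)
		release(counter);	/* hypothetical cleanup hook */
}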
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9914c662ed7b..f712465b05c5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1597,6 +1597,7 @@ struct task_struct {
 		unsigned long nr_pages;	/* uncharged usage */
 		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
 	} memcg_batch;
+	unsigned int memcg_kmem_skip_account;
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	atomic_t ptrace_bp_refcnt;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 743a10415122..5d168d7e0a28 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -11,6 +11,8 @@
 
 #include <linux/gfp.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
+
 
 /*
  * Flags to pass to kmem_cache_create().
@@ -116,6 +118,7 @@ struct kmem_cache {
 };
 #endif
 
+struct mem_cgroup;
 /*
  * struct kmem_cache related prototypes
  */
@@ -125,6 +128,9 @@ int slab_is_available(void);
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
+struct kmem_cache *
+kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t,
+			unsigned long, void (*)(void *), struct kmem_cache *);
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
 void kmem_cache_free(struct kmem_cache *, void *);
@@ -175,6 +181,48 @@ void kmem_cache_free(struct kmem_cache *, void *);
 #ifndef ARCH_SLAB_MINALIGN
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 #endif
+/*
+ * This is the main placeholder for memcg-related information in kmem caches.
+ * struct kmem_cache will hold a pointer to it, so the memory cost while
+ * disabled is one pointer. The runtime cost while enabled is bigger than it
+ * would be if this were bundled into kmem_cache: we'll need an extra
+ * pointer chase. But the trade-off clearly lies in favor of not
+ * penalizing non-users.
+ *
+ * Both the root cache and the child caches will have it. For the root cache,
+ * this will hold a dynamically allocated array large enough to hold
+ * information about the currently limited memcgs in the system.
+ *
+ * Child caches will hold extra metadata needed for their operation. Fields are:
+ *
+ * @memcg: pointer to the memcg this cache belongs to
+ * @list: list_head for the list of all caches in this memcg
+ * @root_cache: pointer to the global, root cache this cache was derived from
+ * @dead: set to true after the memcg dies; the cache may still be around.
+ * @nr_pages: number of pages that belong to this cache.
+ * @destroy: worker to be called whenever we are ready, or believe we may be
+ *           ready, to destroy this cache.
+ */
+struct memcg_cache_params {
+	bool is_root_cache;
+	union {
+		struct kmem_cache *memcg_caches[0];
+		struct {
+			struct mem_cgroup *memcg;
+			struct list_head list;
+			struct kmem_cache *root_cache;
+			bool dead;
+			atomic_t nr_pages;
+			struct work_struct destroy;
+		};
+	};
+};
+
+int memcg_update_all_caches(int num_memcgs);
+
+struct seq_file;
+int cache_show(struct kmem_cache *s, struct seq_file *m);
+void print_slabinfo_header(struct seq_file *m);
 
 /*
  * Common kmalloc functions provided by all allocators
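The union in memcg_cache_params encodes the two roles: a root (global) cache uses memcg_caches[] as a per-memcg index, while a child cache uses the named fields for its back-pointers and destruction state. A hedged lookup sketch built on that layout (the helper is illustrative; the real accessors live in the slab allocators, and memcg_limited_groups_array_size is declared in memcontrol.h earlier in this series):

/* Sketch: find the per-memcg child of a root cache by memcg index.
 * Assumes struct kmem_cache carries the memcg_params pointer added below. */
static struct kmem_cache *memcg_child_cache_sketch(struct kmem_cache *root, int idx)
{
	if (!root->memcg_params || !root->memcg_params->is_root_cache)
		return NULL;
	if (idx < 0 || idx >= memcg_limited_groups_array_size)
		return NULL;
	/* NULL until the memcg core has actually created the child cache. */
	return root->memcg_params->memcg_caches[idx];
}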
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 45c0356fdc8c..8bb6e0eaf3c6 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -81,6 +81,9 @@ struct kmem_cache {
 	 */
 	int obj_offset;
 #endif /* CONFIG_DEBUG_SLAB */
+#ifdef CONFIG_MEMCG_KMEM
+	struct memcg_cache_params *memcg_params;
+#endif
 
 /* 6) per-cpu/per-node data, touched during every alloc/free */
 	/*
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index df448adb7283..9db4825cd393 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -101,6 +101,10 @@ struct kmem_cache {
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;	/* For sysfs */
 #endif
+#ifdef CONFIG_MEMCG_KMEM
+	struct memcg_cache_params *memcg_params;
+	int max_attr_size; /* for propagation, maximum size of a stored attr */
+#endif
 
 #ifdef CONFIG_NUMA
 	/*
@@ -222,7 +226,10 @@ void *__kmalloc(size_t size, gfp_t flags);
 static __always_inline void *
 kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 {
-	void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order);
+	void *ret;
+
+	flags |= (__GFP_COMP | __GFP_KMEMCG);
+	ret = (void *) __get_free_pages(flags, order);
 	kmemleak_alloc(ret, size, 1, flags);
 	return ret;
 }
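With __GFP_KMEMCG folded into kmalloc_order(), large kmalloc() requests that SLUB forwards straight to the page allocator become accountable without any change in the caller. A small sketch (the 64 KiB size is only an example; the cutoff above which SLUB hands a request to kmalloc_order() depends on the page size):

/* Sketch: a large allocation that SLUB satisfies via kmalloc_order(); with this
 * patch it is implicitly tagged __GFP_KMEMCG and charged to the caller's memcg
 * whenever kmem accounting is enabled for that memcg. */
static void *alloc_big_buffer(void)
{
	return kmalloc(64 * 1024, GFP_KERNEL);
}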
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ccc1899bd62e..e7e04736802f 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm);
 # define THREADINFO_GFP		(GFP_KERNEL | __GFP_NOTRACK)
 #endif
 
+#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
+
 /*
  * flag set/clear/test wrappers
  * - pass TIF_xxxx constants to these functions
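THREADINFO_GFP_ACCOUNTED exists so that kernel-stack/thread_info allocations can be charged to the allocating task's memcg. A hedged sketch of how an allocator might use it (the function names are illustrative; the real allocation site is outside this header, and THREAD_SIZE_ORDER is architecture-defined):

/* Sketch: allocate and free a thread_info block with memcg kmem accounting. */
static struct thread_info *alloc_thread_info_sketch(int node)
{
	struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
					     THREAD_SIZE_ORDER);

	return page ? page_address(page) : NULL;
}

static void free_thread_info_sketch(struct thread_info *ti)
{
	/* Pages allocated with __GFP_KMEMCG must go back via the memcg-aware helper. */
	free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}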
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index d6fd8e5b14b7..1eddbf1557f2 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,6 +34,7 @@
 	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
 	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
+	{(unsigned long)__GFP_KMEMCG,		"GFP_KMEMCG"},		\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
 	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\