Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile       |   2
-rw-r--r--  mm/allocpercpu.c  |  15
-rw-r--r--  mm/hugetlb.c      |  43
-rw-r--r--  mm/memcontrol.c   | 365
-rw-r--r--  mm/memory.c       |  13
-rw-r--r--  mm/migrate.c      |  19
-rw-r--r--  mm/oom_kill.c     |   2
-rw-r--r--  mm/page_alloc.c   |  21
-rw-r--r--  mm/rmap.c         |   4
-rw-r--r--  mm/shmem.c        |   9
-rw-r--r--  mm/slab.c         |   9
-rw-r--r--  mm/slub.c         | 217
-rw-r--r--  mm/swap.c         |   2
-rw-r--r--  mm/truncate.c     |   3
-rw-r--r--  mm/vmscan.c       |   9
15 files changed, 338 insertions, 395 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 9f117bab5322..a5b0dd93427a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 32obj-$(CONFIG_MIGRATION) += migrate.o
33obj-$(CONFIG_SMP) += allocpercpu.o 33obj-$(CONFIG_SMP) += allocpercpu.o
34obj-$(CONFIG_QUICKLIST) += quicklist.o 34obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o 35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
36 36
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 7e58322b7134..b0012e27fea8 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -6,6 +6,10 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8 8
9#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES
11#endif
12
9/** 13/**
10 * percpu_depopulate - depopulate per-cpu data for given cpu 14 * percpu_depopulate - depopulate per-cpu data for given cpu
11 * @__pdata: per-cpu data to depopulate 15 * @__pdata: per-cpu data to depopulate
@@ -52,6 +56,11 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
52 struct percpu_data *pdata = __percpu_disguise(__pdata); 56 struct percpu_data *pdata = __percpu_disguise(__pdata);
53 int node = cpu_to_node(cpu); 57 int node = cpu_to_node(cpu);
54 58
59 /*
60 * We should make sure each CPU gets private memory.
61 */
62 size = roundup(size, cache_line_size());
63
55 BUG_ON(pdata->ptrs[cpu]); 64 BUG_ON(pdata->ptrs[cpu]);
56 if (node_online(node)) 65 if (node_online(node))
57 pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); 66 pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
@@ -98,7 +107,11 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
98 */ 107 */
99void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) 108void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
100{ 109{
101 void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp); 110 /*
111 * We allocate whole cache lines to avoid false sharing
112 */
113 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
114 void *pdata = kzalloc(sz, gfp);
102 void *__pdata = __percpu_disguise(pdata); 115 void *__pdata = __percpu_disguise(pdata);
103 116
104 if (unlikely(!pdata)) 117 if (unlikely(!pdata))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 89e6286a7f57..dcacc811e70e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -71,7 +71,25 @@ static void enqueue_huge_page(struct page *page)
71 free_huge_pages_node[nid]++; 71 free_huge_pages_node[nid]++;
72} 72}
73 73
74static struct page *dequeue_huge_page(struct vm_area_struct *vma, 74static struct page *dequeue_huge_page(void)
75{
76 int nid;
77 struct page *page = NULL;
78
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next,
82 struct page, lru);
83 list_del(&page->lru);
84 free_huge_pages--;
85 free_huge_pages_node[nid]--;
86 break;
87 }
88 }
89 return page;
90}
91
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
75 unsigned long address) 93 unsigned long address)
76{ 94{
77 int nid; 95 int nid;
@@ -296,8 +314,10 @@ static int gather_surplus_pages(int delta)
296 int needed, allocated; 314 int needed, allocated;
297 315
298 needed = (resv_huge_pages + delta) - free_huge_pages; 316 needed = (resv_huge_pages + delta) - free_huge_pages;
299 if (needed <= 0) 317 if (needed <= 0) {
318 resv_huge_pages += delta;
300 return 0; 319 return 0;
320 }
301 321
302 allocated = 0; 322 allocated = 0;
303 INIT_LIST_HEAD(&surplus_list); 323 INIT_LIST_HEAD(&surplus_list);
@@ -335,9 +355,12 @@ retry:
335 * The surplus_list now contains _at_least_ the number of extra pages 355 * The surplus_list now contains _at_least_ the number of extra pages
336 * needed to accomodate the reservation. Add the appropriate number 356 * needed to accomodate the reservation. Add the appropriate number
337 * of pages to the hugetlb pool and free the extras back to the buddy 357 * of pages to the hugetlb pool and free the extras back to the buddy
338 * allocator. 358 * allocator. Commit the entire reservation here to prevent another
359 * process from stealing the pages as they are added to the pool but
360 * before they are reserved.
339 */ 361 */
340 needed += allocated; 362 needed += allocated;
363 resv_huge_pages += delta;
341 ret = 0; 364 ret = 0;
342free: 365free:
343 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 366 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
@@ -371,6 +394,9 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
371 struct page *page; 394 struct page *page;
372 unsigned long nr_pages; 395 unsigned long nr_pages;
373 396
397 /* Uncommit the reservation */
398 resv_huge_pages -= unused_resv_pages;
399
374 nr_pages = min(unused_resv_pages, surplus_huge_pages); 400 nr_pages = min(unused_resv_pages, surplus_huge_pages);
375 401
376 while (nr_pages) { 402 while (nr_pages) {
@@ -402,7 +428,7 @@ static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
402 struct page *page; 428 struct page *page;
403 429
404 spin_lock(&hugetlb_lock); 430 spin_lock(&hugetlb_lock);
405 page = dequeue_huge_page(vma, addr); 431 page = dequeue_huge_page_vma(vma, addr);
406 spin_unlock(&hugetlb_lock); 432 spin_unlock(&hugetlb_lock);
407 return page ? page : ERR_PTR(-VM_FAULT_OOM); 433 return page ? page : ERR_PTR(-VM_FAULT_OOM);
408} 434}
@@ -417,7 +443,7 @@ static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
417 443
418 spin_lock(&hugetlb_lock); 444 spin_lock(&hugetlb_lock);
419 if (free_huge_pages > resv_huge_pages) 445 if (free_huge_pages > resv_huge_pages)
420 page = dequeue_huge_page(vma, addr); 446 page = dequeue_huge_page_vma(vma, addr);
421 spin_unlock(&hugetlb_lock); 447 spin_unlock(&hugetlb_lock);
422 if (!page) { 448 if (!page) {
423 page = alloc_buddy_huge_page(vma, addr); 449 page = alloc_buddy_huge_page(vma, addr);
@@ -570,7 +596,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
570 min_count = max(count, min_count); 596 min_count = max(count, min_count);
571 try_to_free_low(min_count); 597 try_to_free_low(min_count);
572 while (min_count < persistent_huge_pages) { 598 while (min_count < persistent_huge_pages) {
573 struct page *page = dequeue_huge_page(NULL, 0); 599 struct page *page = dequeue_huge_page();
574 if (!page) 600 if (!page)
575 break; 601 break;
576 update_and_free_page(page); 602 update_and_free_page(page);
@@ -1205,12 +1231,13 @@ static int hugetlb_acct_memory(long delta)
1205 if (gather_surplus_pages(delta) < 0) 1231 if (gather_surplus_pages(delta) < 0)
1206 goto out; 1232 goto out;
1207 1233
1208 if (delta > cpuset_mems_nr(free_huge_pages_node)) 1234 if (delta > cpuset_mems_nr(free_huge_pages_node)) {
1235 return_unused_surplus_pages(delta);
1209 goto out; 1236 goto out;
1237 }
1210 } 1238 }
1211 1239
1212 ret = 0; 1240 ret = 0;
1213 resv_huge_pages += delta;
1214 if (delta < 0) 1241 if (delta < 0)
1215 return_unused_surplus_pages((unsigned long) -delta); 1242 return_unused_surplus_pages((unsigned long) -delta);
1216 1243
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 631002d085d1..8b9f6cae938e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -137,14 +137,21 @@ struct mem_cgroup {
137 */ 137 */
138 struct mem_cgroup_stat stat; 138 struct mem_cgroup_stat stat;
139}; 139};
140static struct mem_cgroup init_mem_cgroup;
140 141
141/* 142/*
142 * We use the lower bit of the page->page_cgroup pointer as a bit spin 143 * We use the lower bit of the page->page_cgroup pointer as a bit spin
143 * lock. We need to ensure that page->page_cgroup is atleast two 144 * lock. We need to ensure that page->page_cgroup is at least two
144 * byte aligned (based on comments from Nick Piggin) 145 * byte aligned (based on comments from Nick Piggin). But since
146 * bit_spin_lock doesn't actually set that lock bit in a non-debug
147 * uniprocessor kernel, we should avoid setting it here too.
145 */ 148 */
146#define PAGE_CGROUP_LOCK_BIT 0x0 149#define PAGE_CGROUP_LOCK_BIT 0x0
147#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) 150#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
151#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
152#else
153#define PAGE_CGROUP_LOCK 0x0
154#endif
148 155
149/* 156/*
150 * A page_cgroup page is associated with every page descriptor. The 157 * A page_cgroup page is associated with every page descriptor. The
@@ -154,37 +161,27 @@ struct page_cgroup {
154 struct list_head lru; /* per cgroup LRU list */ 161 struct list_head lru; /* per cgroup LRU list */
155 struct page *page; 162 struct page *page;
156 struct mem_cgroup *mem_cgroup; 163 struct mem_cgroup *mem_cgroup;
157 atomic_t ref_cnt; /* Helpful when pages move b/w */ 164 int ref_cnt; /* cached, mapped, migrating */
158 /* mapped and cached states */ 165 int flags;
159 int flags;
160}; 166};
161#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 167#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
162#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ 168#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
163 169
164static inline int page_cgroup_nid(struct page_cgroup *pc) 170static int page_cgroup_nid(struct page_cgroup *pc)
165{ 171{
166 return page_to_nid(pc->page); 172 return page_to_nid(pc->page);
167} 173}
168 174
169static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) 175static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
170{ 176{
171 return page_zonenum(pc->page); 177 return page_zonenum(pc->page);
172} 178}
173 179
174enum {
175 MEM_CGROUP_TYPE_UNSPEC = 0,
176 MEM_CGROUP_TYPE_MAPPED,
177 MEM_CGROUP_TYPE_CACHED,
178 MEM_CGROUP_TYPE_ALL,
179 MEM_CGROUP_TYPE_MAX,
180};
181
182enum charge_type { 180enum charge_type {
183 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 181 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
184 MEM_CGROUP_CHARGE_TYPE_MAPPED, 182 MEM_CGROUP_CHARGE_TYPE_MAPPED,
185}; 183};
186 184
187
188/* 185/*
189 * Always modified under lru lock. Then, not necessary to preempt_disable() 186 * Always modified under lru lock. Then, not necessary to preempt_disable()
190 */ 187 */
@@ -193,23 +190,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
193{ 190{
194 int val = (charge)? 1 : -1; 191 int val = (charge)? 1 : -1;
195 struct mem_cgroup_stat *stat = &mem->stat; 192 struct mem_cgroup_stat *stat = &mem->stat;
196 VM_BUG_ON(!irqs_disabled());
197 193
194 VM_BUG_ON(!irqs_disabled());
198 if (flags & PAGE_CGROUP_FLAG_CACHE) 195 if (flags & PAGE_CGROUP_FLAG_CACHE)
199 __mem_cgroup_stat_add_safe(stat, 196 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
200 MEM_CGROUP_STAT_CACHE, val);
201 else 197 else
202 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 198 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
203} 199}
204 200
205static inline struct mem_cgroup_per_zone * 201static struct mem_cgroup_per_zone *
206mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 202mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
207{ 203{
208 BUG_ON(!mem->info.nodeinfo[nid]);
209 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 204 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
210} 205}
211 206
212static inline struct mem_cgroup_per_zone * 207static struct mem_cgroup_per_zone *
213page_cgroup_zoneinfo(struct page_cgroup *pc) 208page_cgroup_zoneinfo(struct page_cgroup *pc)
214{ 209{
215 struct mem_cgroup *mem = pc->mem_cgroup; 210 struct mem_cgroup *mem = pc->mem_cgroup;
@@ -234,18 +229,14 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
234 return total; 229 return total;
235} 230}
236 231
237static struct mem_cgroup init_mem_cgroup; 232static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
238
239static inline
240struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
241{ 233{
242 return container_of(cgroup_subsys_state(cont, 234 return container_of(cgroup_subsys_state(cont,
243 mem_cgroup_subsys_id), struct mem_cgroup, 235 mem_cgroup_subsys_id), struct mem_cgroup,
244 css); 236 css);
245} 237}
246 238
247static inline 239static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
248struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
249{ 240{
250 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 241 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
251 struct mem_cgroup, css); 242 struct mem_cgroup, css);
@@ -267,81 +258,33 @@ void mm_free_cgroup(struct mm_struct *mm)
267 258
268static inline int page_cgroup_locked(struct page *page) 259static inline int page_cgroup_locked(struct page *page)
269{ 260{
270 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, 261 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
271 &page->page_cgroup);
272} 262}
273 263
274void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) 264static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
275{ 265{
276 int locked; 266 VM_BUG_ON(!page_cgroup_locked(page));
277 267 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
278 /*
279 * While resetting the page_cgroup we might not hold the
280 * page_cgroup lock. free_hot_cold_page() is an example
281 * of such a scenario
282 */
283 if (pc)
284 VM_BUG_ON(!page_cgroup_locked(page));
285 locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
286 page->page_cgroup = ((unsigned long)pc | locked);
287} 268}
288 269
289struct page_cgroup *page_get_page_cgroup(struct page *page) 270struct page_cgroup *page_get_page_cgroup(struct page *page)
290{ 271{
291 return (struct page_cgroup *) 272 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
292 (page->page_cgroup & ~PAGE_CGROUP_LOCK);
293} 273}
294 274
295static void __always_inline lock_page_cgroup(struct page *page) 275static void lock_page_cgroup(struct page *page)
296{ 276{
297 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 277 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
298 VM_BUG_ON(!page_cgroup_locked(page));
299}
300
301static void __always_inline unlock_page_cgroup(struct page *page)
302{
303 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
304} 278}
305 279
306/* 280static int try_lock_page_cgroup(struct page *page)
307 * Tie new page_cgroup to struct page under lock_page_cgroup()
308 * This can fail if the page has been tied to a page_cgroup.
309 * If success, returns 0.
310 */
311static int page_cgroup_assign_new_page_cgroup(struct page *page,
312 struct page_cgroup *pc)
313{ 281{
314 int ret = 0; 282 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
315
316 lock_page_cgroup(page);
317 if (!page_get_page_cgroup(page))
318 page_assign_page_cgroup(page, pc);
319 else /* A page is tied to other pc. */
320 ret = 1;
321 unlock_page_cgroup(page);
322 return ret;
323} 283}
324 284
325/* 285static void unlock_page_cgroup(struct page *page)
326 * Clear page->page_cgroup member under lock_page_cgroup().
327 * If given "pc" value is different from one page->page_cgroup,
328 * page->cgroup is not cleared.
329 * Returns a value of page->page_cgroup at lock taken.
330 * A can can detect failure of clearing by following
331 * clear_page_cgroup(page, pc) == pc
332 */
333
334static struct page_cgroup *clear_page_cgroup(struct page *page,
335 struct page_cgroup *pc)
336{ 286{
337 struct page_cgroup *ret; 287 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
338 /* lock and clear */
339 lock_page_cgroup(page);
340 ret = page_get_page_cgroup(page);
341 if (likely(ret == pc))
342 page_assign_page_cgroup(page, NULL);
343 unlock_page_cgroup(page);
344 return ret;
345} 288}
346 289
347static void __mem_cgroup_remove_list(struct page_cgroup *pc) 290static void __mem_cgroup_remove_list(struct page_cgroup *pc)
@@ -399,7 +342,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
399 int ret; 342 int ret;
400 343
401 task_lock(task); 344 task_lock(task);
402 ret = task->mm && vm_match_cgroup(task->mm, mem); 345 ret = task->mm && mm_match_cgroup(task->mm, mem);
403 task_unlock(task); 346 task_unlock(task);
404 return ret; 347 return ret;
405} 348}
@@ -407,18 +350,30 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
407/* 350/*
408 * This routine assumes that the appropriate zone's lru lock is already held 351 * This routine assumes that the appropriate zone's lru lock is already held
409 */ 352 */
410void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 353void mem_cgroup_move_lists(struct page *page, bool active)
411{ 354{
355 struct page_cgroup *pc;
412 struct mem_cgroup_per_zone *mz; 356 struct mem_cgroup_per_zone *mz;
413 unsigned long flags; 357 unsigned long flags;
414 358
415 if (!pc) 359 /*
360 * We cannot lock_page_cgroup while holding zone's lru_lock,
361 * because other holders of lock_page_cgroup can be interrupted
362 * with an attempt to rotate_reclaimable_page. But we cannot
363 * safely get to page_cgroup without it, so just try_lock it:
364 * mem_cgroup_isolate_pages allows for page left on wrong list.
365 */
366 if (!try_lock_page_cgroup(page))
416 return; 367 return;
417 368
418 mz = page_cgroup_zoneinfo(pc); 369 pc = page_get_page_cgroup(page);
419 spin_lock_irqsave(&mz->lru_lock, flags); 370 if (pc) {
420 __mem_cgroup_move_lists(pc, active); 371 mz = page_cgroup_zoneinfo(pc);
421 spin_unlock_irqrestore(&mz->lru_lock, flags); 372 spin_lock_irqsave(&mz->lru_lock, flags);
373 __mem_cgroup_move_lists(pc, active);
374 spin_unlock_irqrestore(&mz->lru_lock, flags);
375 }
376 unlock_page_cgroup(page);
422} 377}
423 378
424/* 379/*
@@ -437,6 +392,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
437 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 392 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
438 return (int)((rss * 100L) / total); 393 return (int)((rss * 100L) / total);
439} 394}
395
440/* 396/*
441 * This function is called from vmscan.c. In page reclaiming loop. balance 397 * This function is called from vmscan.c. In page reclaiming loop. balance
442 * between active and inactive list is calculated. For memory controller 398 * between active and inactive list is calculated. For memory controller
@@ -500,7 +456,6 @@ long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
500 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 456 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
501 457
502 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); 458 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
503
504 return (nr_inactive >> priority); 459 return (nr_inactive >> priority);
505} 460}
506 461
@@ -586,26 +541,21 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
586 * with it 541 * with it
587 */ 542 */
588retry: 543retry:
589 if (page) { 544 lock_page_cgroup(page);
590 lock_page_cgroup(page); 545 pc = page_get_page_cgroup(page);
591 pc = page_get_page_cgroup(page); 546 /*
592 /* 547 * The page_cgroup exists and
593 * The page_cgroup exists and 548 * the page has already been accounted.
594 * the page has already been accounted. 549 */
595 */ 550 if (pc) {
596 if (pc) { 551 VM_BUG_ON(pc->page != page);
597 if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { 552 VM_BUG_ON(pc->ref_cnt <= 0);
598 /* this page is under being uncharged ? */ 553
599 unlock_page_cgroup(page); 554 pc->ref_cnt++;
600 cpu_relax();
601 goto retry;
602 } else {
603 unlock_page_cgroup(page);
604 goto done;
605 }
606 }
607 unlock_page_cgroup(page); 555 unlock_page_cgroup(page);
556 goto done;
608 } 557 }
558 unlock_page_cgroup(page);
609 559
610 pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); 560 pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
611 if (pc == NULL) 561 if (pc == NULL)
@@ -623,16 +573,11 @@ retry:
623 rcu_read_lock(); 573 rcu_read_lock();
624 mem = rcu_dereference(mm->mem_cgroup); 574 mem = rcu_dereference(mm->mem_cgroup);
625 /* 575 /*
626 * For every charge from the cgroup, increment reference 576 * For every charge from the cgroup, increment reference count
627 * count
628 */ 577 */
629 css_get(&mem->css); 578 css_get(&mem->css);
630 rcu_read_unlock(); 579 rcu_read_unlock();
631 580
632 /*
633 * If we created the page_cgroup, we should free it on exceeding
634 * the cgroup limit.
635 */
636 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 581 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
637 if (!(gfp_mask & __GFP_WAIT)) 582 if (!(gfp_mask & __GFP_WAIT))
638 goto out; 583 goto out;
@@ -641,12 +586,12 @@ retry:
641 continue; 586 continue;
642 587
643 /* 588 /*
644 * try_to_free_mem_cgroup_pages() might not give us a full 589 * try_to_free_mem_cgroup_pages() might not give us a full
645 * picture of reclaim. Some pages are reclaimed and might be 590 * picture of reclaim. Some pages are reclaimed and might be
646 * moved to swap cache or just unmapped from the cgroup. 591 * moved to swap cache or just unmapped from the cgroup.
647 * Check the limit again to see if the reclaim reduced the 592 * Check the limit again to see if the reclaim reduced the
648 * current usage of the cgroup before giving up 593 * current usage of the cgroup before giving up
649 */ 594 */
650 if (res_counter_check_under_limit(&mem->res)) 595 if (res_counter_check_under_limit(&mem->res))
651 continue; 596 continue;
652 597
@@ -657,14 +602,16 @@ retry:
657 congestion_wait(WRITE, HZ/10); 602 congestion_wait(WRITE, HZ/10);
658 } 603 }
659 604
660 atomic_set(&pc->ref_cnt, 1); 605 pc->ref_cnt = 1;
661 pc->mem_cgroup = mem; 606 pc->mem_cgroup = mem;
662 pc->page = page; 607 pc->page = page;
663 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 608 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
664 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 609 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
665 pc->flags |= PAGE_CGROUP_FLAG_CACHE; 610 pc->flags |= PAGE_CGROUP_FLAG_CACHE;
666 611
667 if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) { 612 lock_page_cgroup(page);
613 if (page_get_page_cgroup(page)) {
614 unlock_page_cgroup(page);
668 /* 615 /*
669 * Another charge has been added to this page already. 616 * Another charge has been added to this page already.
670 * We take lock_page_cgroup(page) again and read 617 * We take lock_page_cgroup(page) again and read
@@ -673,17 +620,16 @@ retry:
673 res_counter_uncharge(&mem->res, PAGE_SIZE); 620 res_counter_uncharge(&mem->res, PAGE_SIZE);
674 css_put(&mem->css); 621 css_put(&mem->css);
675 kfree(pc); 622 kfree(pc);
676 if (!page)
677 goto done;
678 goto retry; 623 goto retry;
679 } 624 }
625 page_assign_page_cgroup(page, pc);
680 626
681 mz = page_cgroup_zoneinfo(pc); 627 mz = page_cgroup_zoneinfo(pc);
682 spin_lock_irqsave(&mz->lru_lock, flags); 628 spin_lock_irqsave(&mz->lru_lock, flags);
683 /* Update statistics vector */
684 __mem_cgroup_add_list(pc); 629 __mem_cgroup_add_list(pc);
685 spin_unlock_irqrestore(&mz->lru_lock, flags); 630 spin_unlock_irqrestore(&mz->lru_lock, flags);
686 631
632 unlock_page_cgroup(page);
687done: 633done:
688 return 0; 634 return 0;
689out: 635out:
@@ -693,70 +639,61 @@ err:
693 return -ENOMEM; 639 return -ENOMEM;
694} 640}
695 641
696int mem_cgroup_charge(struct page *page, struct mm_struct *mm, 642int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
697 gfp_t gfp_mask)
698{ 643{
699 return mem_cgroup_charge_common(page, mm, gfp_mask, 644 return mem_cgroup_charge_common(page, mm, gfp_mask,
700 MEM_CGROUP_CHARGE_TYPE_MAPPED); 645 MEM_CGROUP_CHARGE_TYPE_MAPPED);
701} 646}
702 647
703/*
704 * See if the cached pages should be charged at all?
705 */
706int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 648int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
707 gfp_t gfp_mask) 649 gfp_t gfp_mask)
708{ 650{
709 int ret = 0;
710 if (!mm) 651 if (!mm)
711 mm = &init_mm; 652 mm = &init_mm;
712 653 return mem_cgroup_charge_common(page, mm, gfp_mask,
713 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
714 MEM_CGROUP_CHARGE_TYPE_CACHE); 654 MEM_CGROUP_CHARGE_TYPE_CACHE);
715 return ret;
716} 655}
717 656
718/* 657/*
719 * Uncharging is always a welcome operation, we never complain, simply 658 * Uncharging is always a welcome operation, we never complain, simply
720 * uncharge. This routine should be called with lock_page_cgroup held 659 * uncharge.
721 */ 660 */
722void mem_cgroup_uncharge(struct page_cgroup *pc) 661void mem_cgroup_uncharge_page(struct page *page)
723{ 662{
663 struct page_cgroup *pc;
724 struct mem_cgroup *mem; 664 struct mem_cgroup *mem;
725 struct mem_cgroup_per_zone *mz; 665 struct mem_cgroup_per_zone *mz;
726 struct page *page;
727 unsigned long flags; 666 unsigned long flags;
728 667
729 /* 668 /*
730 * Check if our page_cgroup is valid 669 * Check if our page_cgroup is valid
731 */ 670 */
671 lock_page_cgroup(page);
672 pc = page_get_page_cgroup(page);
732 if (!pc) 673 if (!pc)
733 return; 674 goto unlock;
734 675
735 if (atomic_dec_and_test(&pc->ref_cnt)) { 676 VM_BUG_ON(pc->page != page);
736 page = pc->page; 677 VM_BUG_ON(pc->ref_cnt <= 0);
678
679 if (--(pc->ref_cnt) == 0) {
737 mz = page_cgroup_zoneinfo(pc); 680 mz = page_cgroup_zoneinfo(pc);
738 /* 681 spin_lock_irqsave(&mz->lru_lock, flags);
739 * get page->cgroup and clear it under lock. 682 __mem_cgroup_remove_list(pc);
740 * force_empty can drop page->cgroup without checking refcnt. 683 spin_unlock_irqrestore(&mz->lru_lock, flags);
741 */ 684
685 page_assign_page_cgroup(page, NULL);
742 unlock_page_cgroup(page); 686 unlock_page_cgroup(page);
743 if (clear_page_cgroup(page, pc) == pc) { 687
744 mem = pc->mem_cgroup; 688 mem = pc->mem_cgroup;
745 css_put(&mem->css); 689 res_counter_uncharge(&mem->res, PAGE_SIZE);
746 res_counter_uncharge(&mem->res, PAGE_SIZE); 690 css_put(&mem->css);
747 spin_lock_irqsave(&mz->lru_lock, flags); 691
748 __mem_cgroup_remove_list(pc); 692 kfree(pc);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 693 return;
750 kfree(pc);
751 }
752 lock_page_cgroup(page);
753 } 694 }
754}
755 695
756void mem_cgroup_uncharge_page(struct page *page) 696unlock:
757{
758 lock_page_cgroup(page);
759 mem_cgroup_uncharge(page_get_page_cgroup(page));
760 unlock_page_cgroup(page); 697 unlock_page_cgroup(page);
761} 698}
762 699
@@ -764,63 +701,59 @@ void mem_cgroup_uncharge_page(struct page *page)
764 * Returns non-zero if a page (under migration) has valid page_cgroup member. 701 * Returns non-zero if a page (under migration) has valid page_cgroup member.
765 * Refcnt of page_cgroup is incremented. 702 * Refcnt of page_cgroup is incremented.
766 */ 703 */
767
768int mem_cgroup_prepare_migration(struct page *page) 704int mem_cgroup_prepare_migration(struct page *page)
769{ 705{
770 struct page_cgroup *pc; 706 struct page_cgroup *pc;
771 int ret = 0; 707
772 lock_page_cgroup(page); 708 lock_page_cgroup(page);
773 pc = page_get_page_cgroup(page); 709 pc = page_get_page_cgroup(page);
774 if (pc && atomic_inc_not_zero(&pc->ref_cnt)) 710 if (pc)
775 ret = 1; 711 pc->ref_cnt++;
776 unlock_page_cgroup(page); 712 unlock_page_cgroup(page);
777 return ret; 713 return pc != NULL;
778} 714}
779 715
780void mem_cgroup_end_migration(struct page *page) 716void mem_cgroup_end_migration(struct page *page)
781{ 717{
782 struct page_cgroup *pc; 718 mem_cgroup_uncharge_page(page);
783
784 lock_page_cgroup(page);
785 pc = page_get_page_cgroup(page);
786 mem_cgroup_uncharge(pc);
787 unlock_page_cgroup(page);
788} 719}
720
789/* 721/*
790 * We know both *page* and *newpage* are now not-on-LRU and Pg_locked. 722 * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
791 * And no race with uncharge() routines because page_cgroup for *page* 723 * And no race with uncharge() routines because page_cgroup for *page*
792 * has extra one reference by mem_cgroup_prepare_migration. 724 * has extra one reference by mem_cgroup_prepare_migration.
793 */ 725 */
794
795void mem_cgroup_page_migration(struct page *page, struct page *newpage) 726void mem_cgroup_page_migration(struct page *page, struct page *newpage)
796{ 727{
797 struct page_cgroup *pc; 728 struct page_cgroup *pc;
798 struct mem_cgroup *mem;
799 unsigned long flags;
800 struct mem_cgroup_per_zone *mz; 729 struct mem_cgroup_per_zone *mz;
801retry: 730 unsigned long flags;
731
732 lock_page_cgroup(page);
802 pc = page_get_page_cgroup(page); 733 pc = page_get_page_cgroup(page);
803 if (!pc) 734 if (!pc) {
735 unlock_page_cgroup(page);
804 return; 736 return;
805 mem = pc->mem_cgroup; 737 }
738
806 mz = page_cgroup_zoneinfo(pc); 739 mz = page_cgroup_zoneinfo(pc);
807 if (clear_page_cgroup(page, pc) != pc)
808 goto retry;
809 spin_lock_irqsave(&mz->lru_lock, flags); 740 spin_lock_irqsave(&mz->lru_lock, flags);
810
811 __mem_cgroup_remove_list(pc); 741 __mem_cgroup_remove_list(pc);
812 spin_unlock_irqrestore(&mz->lru_lock, flags); 742 spin_unlock_irqrestore(&mz->lru_lock, flags);
813 743
744 page_assign_page_cgroup(page, NULL);
745 unlock_page_cgroup(page);
746
814 pc->page = newpage; 747 pc->page = newpage;
815 lock_page_cgroup(newpage); 748 lock_page_cgroup(newpage);
816 page_assign_page_cgroup(newpage, pc); 749 page_assign_page_cgroup(newpage, pc);
817 unlock_page_cgroup(newpage);
818 750
819 mz = page_cgroup_zoneinfo(pc); 751 mz = page_cgroup_zoneinfo(pc);
820 spin_lock_irqsave(&mz->lru_lock, flags); 752 spin_lock_irqsave(&mz->lru_lock, flags);
821 __mem_cgroup_add_list(pc); 753 __mem_cgroup_add_list(pc);
822 spin_unlock_irqrestore(&mz->lru_lock, flags); 754 spin_unlock_irqrestore(&mz->lru_lock, flags);
823 return; 755
756 unlock_page_cgroup(newpage);
824} 757}
825 758
826/* 759/*
@@ -829,14 +762,13 @@ retry:
829 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 762 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
830 */ 763 */
831#define FORCE_UNCHARGE_BATCH (128) 764#define FORCE_UNCHARGE_BATCH (128)
832static void 765static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
833mem_cgroup_force_empty_list(struct mem_cgroup *mem,
834 struct mem_cgroup_per_zone *mz, 766 struct mem_cgroup_per_zone *mz,
835 int active) 767 int active)
836{ 768{
837 struct page_cgroup *pc; 769 struct page_cgroup *pc;
838 struct page *page; 770 struct page *page;
839 int count; 771 int count = FORCE_UNCHARGE_BATCH;
840 unsigned long flags; 772 unsigned long flags;
841 struct list_head *list; 773 struct list_head *list;
842 774
@@ -845,46 +777,36 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem,
845 else 777 else
846 list = &mz->inactive_list; 778 list = &mz->inactive_list;
847 779
848 if (list_empty(list))
849 return;
850retry:
851 count = FORCE_UNCHARGE_BATCH;
852 spin_lock_irqsave(&mz->lru_lock, flags); 780 spin_lock_irqsave(&mz->lru_lock, flags);
853 781 while (!list_empty(list)) {
854 while (--count && !list_empty(list)) {
855 pc = list_entry(list->prev, struct page_cgroup, lru); 782 pc = list_entry(list->prev, struct page_cgroup, lru);
856 page = pc->page; 783 page = pc->page;
857 /* Avoid race with charge */ 784 get_page(page);
858 atomic_set(&pc->ref_cnt, 0); 785 spin_unlock_irqrestore(&mz->lru_lock, flags);
859 if (clear_page_cgroup(page, pc) == pc) { 786 mem_cgroup_uncharge_page(page);
860 css_put(&mem->css); 787 put_page(page);
861 res_counter_uncharge(&mem->res, PAGE_SIZE); 788 if (--count <= 0) {
862 __mem_cgroup_remove_list(pc); 789 count = FORCE_UNCHARGE_BATCH;
863 kfree(pc); 790 cond_resched();
864 } else /* being uncharged ? ...do relax */ 791 }
865 break; 792 spin_lock_irqsave(&mz->lru_lock, flags);
866 } 793 }
867 spin_unlock_irqrestore(&mz->lru_lock, flags); 794 spin_unlock_irqrestore(&mz->lru_lock, flags);
868 if (!list_empty(list)) {
869 cond_resched();
870 goto retry;
871 }
872 return;
873} 795}
874 796
875/* 797/*
876 * make mem_cgroup's charge to be 0 if there is no task. 798 * make mem_cgroup's charge to be 0 if there is no task.
877 * This enables deleting this mem_cgroup. 799 * This enables deleting this mem_cgroup.
878 */ 800 */
879 801static int mem_cgroup_force_empty(struct mem_cgroup *mem)
880int mem_cgroup_force_empty(struct mem_cgroup *mem)
881{ 802{
882 int ret = -EBUSY; 803 int ret = -EBUSY;
883 int node, zid; 804 int node, zid;
805
884 css_get(&mem->css); 806 css_get(&mem->css);
885 /* 807 /*
886 * page reclaim code (kswapd etc..) will move pages between 808 * page reclaim code (kswapd etc..) will move pages between
887 * active_list <-> inactive_list while we don't take a lock. 809 * active_list <-> inactive_list while we don't take a lock.
888 * So, we have to do loop here until all lists are empty. 810 * So, we have to do loop here until all lists are empty.
889 */ 811 */
890 while (mem->res.usage > 0) { 812 while (mem->res.usage > 0) {
@@ -906,9 +828,7 @@ out:
906 return ret; 828 return ret;
907} 829}
908 830
909 831static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
910
911int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
912{ 832{
913 *tmp = memparse(buf, &buf); 833 *tmp = memparse(buf, &buf);
914 if (*buf != '\0') 834 if (*buf != '\0')
@@ -945,8 +865,7 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
945 size_t nbytes, loff_t *ppos) 865 size_t nbytes, loff_t *ppos)
946{ 866{
947 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 867 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
948 int ret; 868 int ret = mem_cgroup_force_empty(mem);
949 ret = mem_cgroup_force_empty(mem);
950 if (!ret) 869 if (!ret)
951 ret = nbytes; 870 ret = nbytes;
952 return ret; 871 return ret;
@@ -955,7 +874,6 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
955/* 874/*
956 * Note: This should be removed if cgroup supports write-only file. 875 * Note: This should be removed if cgroup supports write-only file.
957 */ 876 */
958
959static ssize_t mem_force_empty_read(struct cgroup *cont, 877static ssize_t mem_force_empty_read(struct cgroup *cont,
960 struct cftype *cft, 878 struct cftype *cft,
961 struct file *file, char __user *userbuf, 879 struct file *file, char __user *userbuf,
@@ -964,7 +882,6 @@ static ssize_t mem_force_empty_read(struct cgroup *cont,
964 return -EINVAL; 882 return -EINVAL;
965} 883}
966 884
967
968static const struct mem_cgroup_stat_desc { 885static const struct mem_cgroup_stat_desc {
969 const char *msg; 886 const char *msg;
970 u64 unit; 887 u64 unit;
@@ -1017,8 +934,6 @@ static int mem_control_stat_open(struct inode *unused, struct file *file)
1017 return single_open(file, mem_control_stat_show, cont); 934 return single_open(file, mem_control_stat_show, cont);
1018} 935}
1019 936
1020
1021
1022static struct cftype mem_cgroup_files[] = { 937static struct cftype mem_cgroup_files[] = {
1023 { 938 {
1024 .name = "usage_in_bytes", 939 .name = "usage_in_bytes",
@@ -1084,9 +999,6 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1084 kfree(mem->info.nodeinfo[node]); 999 kfree(mem->info.nodeinfo[node]);
1085} 1000}
1086 1001
1087
1088static struct mem_cgroup init_mem_cgroup;
1089
1090static struct cgroup_subsys_state * 1002static struct cgroup_subsys_state *
1091mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 1003mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1092{ 1004{
@@ -1176,7 +1088,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1176 1088
1177out: 1089out:
1178 mmput(mm); 1090 mmput(mm);
1179 return;
1180} 1091}
1181 1092
1182struct cgroup_subsys mem_cgroup_subsys = { 1093struct cgroup_subsys mem_cgroup_subsys = {
diff --git a/mm/memory.c b/mm/memory.c
index ce3c9e4492d8..0d14d1e58a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1711,7 +1711,7 @@ unlock:
1711 } 1711 }
1712 return ret; 1712 return ret;
1713oom_free_new: 1713oom_free_new:
1714 __free_page(new_page); 1714 page_cache_release(new_page);
1715oom: 1715oom:
1716 if (old_page) 1716 if (old_page)
1717 page_cache_release(old_page); 1717 page_cache_release(old_page);
@@ -2093,12 +2093,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2093 unlock_page(page); 2093 unlock_page(page);
2094 2094
2095 if (write_access) { 2095 if (write_access) {
2096 /* XXX: We could OR the do_wp_page code with this one? */ 2096 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2097 if (do_wp_page(mm, vma, address, 2097 if (ret & VM_FAULT_ERROR)
2098 page_table, pmd, ptl, pte) & VM_FAULT_OOM) { 2098 ret &= VM_FAULT_ERROR;
2099 mem_cgroup_uncharge_page(page);
2100 ret = VM_FAULT_OOM;
2101 }
2102 goto out; 2099 goto out;
2103 } 2100 }
2104 2101
@@ -2163,7 +2160,7 @@ release:
2163 page_cache_release(page); 2160 page_cache_release(page);
2164 goto unlock; 2161 goto unlock;
2165oom_free_page: 2162oom_free_page:
2166 __free_page(page); 2163 page_cache_release(page);
2167oom: 2164oom:
2168 return VM_FAULT_OOM; 2165 return VM_FAULT_OOM;
2169} 2166}
diff --git a/mm/migrate.c b/mm/migrate.c
index a73504ff5ab9..4e0eccca5e26 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -153,11 +153,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
153 return; 153 return;
154 } 154 }
155 155
156 if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
157 pte_unmap(ptep);
158 return;
159 }
160
161 ptl = pte_lockptr(mm, pmd); 156 ptl = pte_lockptr(mm, pmd);
162 spin_lock(ptl); 157 spin_lock(ptl);
163 pte = *ptep; 158 pte = *ptep;
@@ -169,6 +164,20 @@ static void remove_migration_pte(struct vm_area_struct *vma,
169 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 164 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
170 goto out; 165 goto out;
171 166
167 /*
168 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
169 * Failure is not an option here: we're now expected to remove every
170 * migration pte, and will cause crashes otherwise. Normally this
171 * is not an issue: mem_cgroup_prepare_migration bumped up the old
172 * page_cgroup count for safety, that's now attached to the new page,
173 * so this charge should just be another incrementation of the count,
174 * to keep in balance with rmap.c's mem_cgroup_uncharging. But if
175 * there's been a force_empty, those reference counts may no longer
176 * be reliable, and this charge can actually fail: oh well, we don't
177 * make the situation any worse by proceeding as if it had succeeded.
178 */
179 mem_cgroup_charge(new, mm, GFP_ATOMIC);
180
172 get_page(new); 181 get_page(new);
173 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 182 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
174 if (is_write_migration_entry(entry)) 183 if (is_write_migration_entry(entry))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4194b9db0104..44b2da11bf43 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,7 +412,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
412 return oom_kill_task(p); 412 return oom_kill_task(p);
413} 413}
414 414
415#ifdef CONFIG_CGROUP_MEM_CONT 415#ifdef CONFIG_CGROUP_MEM_RES_CTLR
416void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 416void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
417{ 417{
418 unsigned long points = 0; 418 unsigned long points = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8896e874a67d..402a504f1228 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,6 +19,7 @@
19#include <linux/swap.h> 19#include <linux/swap.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/compiler.h> 24#include <linux/compiler.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -221,13 +222,19 @@ static inline int bad_range(struct zone *zone, struct page *page)
221 222
222static void bad_page(struct page *page) 223static void bad_page(struct page *page)
223{ 224{
224 printk(KERN_EMERG "Bad page state in process '%s'\n" 225 void *pc = page_get_page_cgroup(page);
225 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" 226
226 KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 227 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
227 KERN_EMERG "Backtrace:\n", 228 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
228 current->comm, page, (int)(2*sizeof(unsigned long)), 229 current->comm, page, (int)(2*sizeof(unsigned long)),
229 (unsigned long)page->flags, page->mapping, 230 (unsigned long)page->flags, page->mapping,
230 page_mapcount(page), page_count(page)); 231 page_mapcount(page), page_count(page));
232 if (pc) {
233 printk(KERN_EMERG "cgroup:%p\n", pc);
234 page_reset_bad_cgroup(page);
235 }
236 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
237 KERN_EMERG "Backtrace:\n");
231 dump_stack(); 238 dump_stack();
232 page->flags &= ~(1 << PG_lru | 239 page->flags &= ~(1 << PG_lru |
233 1 << PG_private | 240 1 << PG_private |
@@ -453,6 +460,7 @@ static inline int free_pages_check(struct page *page)
453{ 460{
454 if (unlikely(page_mapcount(page) | 461 if (unlikely(page_mapcount(page) |
455 (page->mapping != NULL) | 462 (page->mapping != NULL) |
463 (page_get_page_cgroup(page) != NULL) |
456 (page_count(page) != 0) | 464 (page_count(page) != 0) |
457 (page->flags & ( 465 (page->flags & (
458 1 << PG_lru | 466 1 << PG_lru |
@@ -602,6 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
602{ 610{
603 if (unlikely(page_mapcount(page) | 611 if (unlikely(page_mapcount(page) |
604 (page->mapping != NULL) | 612 (page->mapping != NULL) |
613 (page_get_page_cgroup(page) != NULL) |
605 (page_count(page) != 0) | 614 (page_count(page) != 0) |
606 (page->flags & ( 615 (page->flags & (
607 1 << PG_lru | 616 1 << PG_lru |
@@ -988,7 +997,6 @@ static void free_hot_cold_page(struct page *page, int cold)
988 997
989 if (!PageHighMem(page)) 998 if (!PageHighMem(page))
990 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 999 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
991 VM_BUG_ON(page_get_page_cgroup(page));
992 arch_free_page(page, 0); 1000 arch_free_page(page, 0);
993 kernel_map_pages(page, 1, 0); 1001 kernel_map_pages(page, 1, 0);
994 1002
@@ -1276,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1276 if (!zlc) 1284 if (!zlc)
1277 return NULL; 1285 return NULL;
1278 1286
1279 if (jiffies - zlc->last_full_zap > 1 * HZ) { 1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1280 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1281 zlc->last_full_zap = jiffies; 1289 zlc->last_full_zap = jiffies;
1282 } 1290 }
@@ -2527,7 +2535,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2527 set_page_links(page, zone, nid, pfn); 2535 set_page_links(page, zone, nid, pfn);
2528 init_page_count(page); 2536 init_page_count(page);
2529 reset_page_mapcount(page); 2537 reset_page_mapcount(page);
2530 page_assign_page_cgroup(page, NULL);
2531 SetPageReserved(page); 2538 SetPageReserved(page);
2532 2539
2533 /* 2540 /*
diff --git a/mm/rmap.c b/mm/rmap.c
index 8fd527c4e2bf..0c9a2df06c39 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -321,7 +321,7 @@ static int page_referenced_anon(struct page *page,
321 * counting on behalf of references from different 321 * counting on behalf of references from different
322 * cgroups 322 * cgroups
323 */ 323 */
324 if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont)) 324 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
325 continue; 325 continue;
326 referenced += page_referenced_one(page, vma, &mapcount); 326 referenced += page_referenced_one(page, vma, &mapcount);
327 if (!mapcount) 327 if (!mapcount)
@@ -382,7 +382,7 @@ static int page_referenced_file(struct page *page,
382 * counting on behalf of references from different 382 * counting on behalf of references from different
383 * cgroups 383 * cgroups
384 */ 384 */
385 if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont)) 385 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
386 continue; 386 continue;
387 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) 387 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
388 == (VM_LOCKED|VM_MAYSHARE)) { 388 == (VM_LOCKED|VM_MAYSHARE)) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 90b576cbc06e..3372bc579e89 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1370,14 +1370,17 @@ repeat:
1370 shmem_swp_unmap(entry); 1370 shmem_swp_unmap(entry);
1371 spin_unlock(&info->lock); 1371 spin_unlock(&info->lock);
1372 unlock_page(swappage); 1372 unlock_page(swappage);
1373 page_cache_release(swappage);
1374 if (error == -ENOMEM) { 1373 if (error == -ENOMEM) {
1375 /* allow reclaim from this memory cgroup */ 1374 /* allow reclaim from this memory cgroup */
1376 error = mem_cgroup_cache_charge(NULL, 1375 error = mem_cgroup_cache_charge(swappage,
1377 current->mm, gfp & ~__GFP_HIGHMEM); 1376 current->mm, gfp & ~__GFP_HIGHMEM);
1378 if (error) 1377 if (error) {
1378 page_cache_release(swappage);
1379 goto failed; 1379 goto failed;
1380 }
1381 mem_cgroup_uncharge_page(swappage);
1380 } 1382 }
1383 page_cache_release(swappage);
1381 goto repeat; 1384 goto repeat;
1382 } 1385 }
1383 } else if (sgp == SGP_READ && !filepage) { 1386 } else if (sgp == SGP_READ && !filepage) {
diff --git a/mm/slab.c b/mm/slab.c
index 473e6c2eaefb..e6c698f55674 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -333,7 +333,7 @@ static __always_inline int index_of(const size_t size)
333 return i; \ 333 return i; \
334 else \ 334 else \
335 i++; 335 i++;
336#include "linux/kmalloc_sizes.h" 336#include <linux/kmalloc_sizes.h>
337#undef CACHE 337#undef CACHE
338 __bad_size(); 338 __bad_size();
339 } else 339 } else
@@ -2964,11 +2964,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2964 struct array_cache *ac; 2964 struct array_cache *ac;
2965 int node; 2965 int node;
2966 2966
2967 node = numa_node_id(); 2967retry:
2968
2969 check_irq_off(); 2968 check_irq_off();
2969 node = numa_node_id();
2970 ac = cpu_cache_get(cachep); 2970 ac = cpu_cache_get(cachep);
2971retry:
2972 batchcount = ac->batchcount; 2971 batchcount = ac->batchcount;
2973 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2972 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2974 /* 2973 /*
@@ -3280,7 +3279,7 @@ retry:
3280 if (local_flags & __GFP_WAIT) 3279 if (local_flags & __GFP_WAIT)
3281 local_irq_enable(); 3280 local_irq_enable();
3282 kmem_flagcheck(cache, flags); 3281 kmem_flagcheck(cache, flags);
3283 obj = kmem_getpages(cache, flags, -1); 3282 obj = kmem_getpages(cache, local_flags, -1);
3284 if (local_flags & __GFP_WAIT) 3283 if (local_flags & __GFP_WAIT)
3285 local_irq_disable(); 3284 local_irq_disable();
3286 if (obj) { 3285 if (obj) {
diff --git a/mm/slub.c b/mm/slub.c
index 74c65af0a54f..96d63eb3ab17 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -291,32 +291,16 @@ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
291#endif 291#endif
292} 292}
293 293
294/* 294/* Verify that a pointer has an address that is valid within a slab page */
295 * The end pointer in a slab is special. It points to the first object in the
296 * slab but has bit 0 set to mark it.
297 *
298 * Note that SLUB relies on page_mapping returning NULL for pages with bit 0
299 * in the mapping set.
300 */
301static inline int is_end(void *addr)
302{
303 return (unsigned long)addr & PAGE_MAPPING_ANON;
304}
305
306static void *slab_address(struct page *page)
307{
308 return page->end - PAGE_MAPPING_ANON;
309}
310
311static inline int check_valid_pointer(struct kmem_cache *s, 295static inline int check_valid_pointer(struct kmem_cache *s,
312 struct page *page, const void *object) 296 struct page *page, const void *object)
313{ 297{
314 void *base; 298 void *base;
315 299
316 if (object == page->end) 300 if (!object)
317 return 1; 301 return 1;
318 302
319 base = slab_address(page); 303 base = page_address(page);
320 if (object < base || object >= base + s->objects * s->size || 304 if (object < base || object >= base + s->objects * s->size ||
321 (object - base) % s->size) { 305 (object - base) % s->size) {
322 return 0; 306 return 0;
@@ -349,8 +333,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
349 333
350/* Scan freelist */ 334/* Scan freelist */
351#define for_each_free_object(__p, __s, __free) \ 335#define for_each_free_object(__p, __s, __free) \
352 for (__p = (__free); (__p) != page->end; __p = get_freepointer((__s),\ 336 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
353 __p))
354 337
355/* Determine object index from a given position */ 338/* Determine object index from a given position */
356static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 339static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -502,7 +485,7 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
502static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 485static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
503{ 486{
504 unsigned int off; /* Offset of last byte */ 487 unsigned int off; /* Offset of last byte */
505 u8 *addr = slab_address(page); 488 u8 *addr = page_address(page);
506 489
507 print_tracking(s, p); 490 print_tracking(s, p);
508 491
@@ -637,7 +620,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
637 * A. Free pointer (if we cannot overwrite object on free) 620 * A. Free pointer (if we cannot overwrite object on free)
638 * B. Tracking data for SLAB_STORE_USER 621 * B. Tracking data for SLAB_STORE_USER
639 * C. Padding to reach required alignment boundary or at mininum 622 * C. Padding to reach required alignment boundary or at mininum
640 * one word if debuggin is on to be able to detect writes 623 * one word if debugging is on to be able to detect writes
641 * before the word boundary. 624 * before the word boundary.
642 * 625 *
643 * Padding is done using 0x5a (POISON_INUSE) 626 * Padding is done using 0x5a (POISON_INUSE)
@@ -680,7 +663,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
680 if (!(s->flags & SLAB_POISON)) 663 if (!(s->flags & SLAB_POISON))
681 return 1; 664 return 1;
682 665
683 start = slab_address(page); 666 start = page_address(page);
684 end = start + (PAGE_SIZE << s->order); 667 end = start + (PAGE_SIZE << s->order);
685 length = s->objects * s->size; 668 length = s->objects * s->size;
686 remainder = end - (start + length); 669 remainder = end - (start + length);
@@ -748,7 +731,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
748 * of the free objects in this slab. May cause 731 * of the free objects in this slab. May cause
749 * another error because the object count is now wrong. 732 * another error because the object count is now wrong.
750 */ 733 */
751 set_freepointer(s, p, page->end); 734 set_freepointer(s, p, NULL);
752 return 0; 735 return 0;
753 } 736 }
754 return 1; 737 return 1;
@@ -782,18 +765,18 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
782 void *fp = page->freelist; 765 void *fp = page->freelist;
783 void *object = NULL; 766 void *object = NULL;
784 767
785 while (fp != page->end && nr <= s->objects) { 768 while (fp && nr <= s->objects) {
786 if (fp == search) 769 if (fp == search)
787 return 1; 770 return 1;
788 if (!check_valid_pointer(s, page, fp)) { 771 if (!check_valid_pointer(s, page, fp)) {
789 if (object) { 772 if (object) {
790 object_err(s, page, object, 773 object_err(s, page, object,
791 "Freechain corrupt"); 774 "Freechain corrupt");
792 set_freepointer(s, object, page->end); 775 set_freepointer(s, object, NULL);
793 break; 776 break;
794 } else { 777 } else {
795 slab_err(s, page, "Freepointer corrupt"); 778 slab_err(s, page, "Freepointer corrupt");
796 page->freelist = page->end; 779 page->freelist = NULL;
797 page->inuse = s->objects; 780 page->inuse = s->objects;
798 slab_fix(s, "Freelist cleared"); 781 slab_fix(s, "Freelist cleared");
799 return 0; 782 return 0;
@@ -870,7 +853,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
870 if (!check_slab(s, page)) 853 if (!check_slab(s, page))
871 goto bad; 854 goto bad;
872 855
873 if (object && !on_freelist(s, page, object)) { 856 if (!on_freelist(s, page, object)) {
874 object_err(s, page, object, "Object already allocated"); 857 object_err(s, page, object, "Object already allocated");
875 goto bad; 858 goto bad;
876 } 859 }
@@ -880,7 +863,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
880 goto bad; 863 goto bad;
881 } 864 }
882 865
883 if (object && !check_object(s, page, object, 0)) 866 if (!check_object(s, page, object, 0))
884 goto bad; 867 goto bad;
885 868
886 /* Success perform special debug activities for allocs */ 869 /* Success perform special debug activities for allocs */
@@ -899,7 +882,7 @@ bad:
899 */ 882 */
900 slab_fix(s, "Marking all objects used"); 883 slab_fix(s, "Marking all objects used");
901 page->inuse = s->objects; 884 page->inuse = s->objects;
902 page->freelist = page->end; 885 page->freelist = NULL;
903 } 886 }
904 return 0; 887 return 0;
905} 888}
@@ -939,7 +922,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
939 } 922 }
940 923
941 /* Special debug activities for freeing objects */ 924 /* Special debug activities for freeing objects */
942 if (!SlabFrozen(page) && page->freelist == page->end) 925 if (!SlabFrozen(page) && !page->freelist)
943 remove_full(s, page); 926 remove_full(s, page);
944 if (s->flags & SLAB_STORE_USER) 927 if (s->flags & SLAB_STORE_USER)
945 set_track(s, object, TRACK_FREE, addr); 928 set_track(s, object, TRACK_FREE, addr);
@@ -1015,30 +998,11 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
1015 void (*ctor)(struct kmem_cache *, void *)) 998 void (*ctor)(struct kmem_cache *, void *))
1016{ 999{
1017 /* 1000 /*
1018 * The page->offset field is only 16 bit wide. This is an offset 1001 * Enable debugging if selected on the kernel commandline.
1019 * in units of words from the beginning of an object. If the slab
1020 * size is bigger then we cannot move the free pointer behind the
1021 * object anymore.
1022 *
1023 * On 32 bit platforms the limit is 256k. On 64bit platforms
1024 * the limit is 512k.
1025 *
1026 * Debugging or ctor may create a need to move the free
1027 * pointer. Fail if this happens.
1028 */ 1002 */
1029 if (objsize >= 65535 * sizeof(void *)) { 1003 if (slub_debug && (!slub_debug_slabs ||
1030 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | 1004 strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
1031 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1005 flags |= slub_debug;
1032 BUG_ON(ctor);
1033 } else {
1034 /*
1035 * Enable debugging if selected on the kernel commandline.
1036 */
1037 if (slub_debug && (!slub_debug_slabs ||
1038 strncmp(slub_debug_slabs, name,
1039 strlen(slub_debug_slabs)) == 0))
1040 flags |= slub_debug;
1041 }
1042 1006
1043 return flags; 1007 return flags;
1044} 1008}
@@ -1124,7 +1088,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1124 SetSlabDebug(page); 1088 SetSlabDebug(page);
1125 1089
1126 start = page_address(page); 1090 start = page_address(page);
1127 page->end = start + 1;
1128 1091
1129 if (unlikely(s->flags & SLAB_POISON)) 1092 if (unlikely(s->flags & SLAB_POISON))
1130 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1093 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
@@ -1136,7 +1099,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1136 last = p; 1099 last = p;
1137 } 1100 }
1138 setup_object(s, page, last); 1101 setup_object(s, page, last);
1139 set_freepointer(s, last, page->end); 1102 set_freepointer(s, last, NULL);
1140 1103
1141 page->freelist = start; 1104 page->freelist = start;
1142 page->inuse = 0; 1105 page->inuse = 0;
@@ -1152,7 +1115,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1152 void *p; 1115 void *p;
1153 1116
1154 slab_pad_check(s, page); 1117 slab_pad_check(s, page);
1155 for_each_object(p, s, slab_address(page)) 1118 for_each_object(p, s, page_address(page))
1156 check_object(s, page, p, 0); 1119 check_object(s, page, p, 0);
1157 ClearSlabDebug(page); 1120 ClearSlabDebug(page);
1158 } 1121 }
@@ -1162,7 +1125,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1162 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1125 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1163 -pages); 1126 -pages);
1164 1127
1165 page->mapping = NULL;
1166 __free_pages(page, s->order); 1128 __free_pages(page, s->order);
1167} 1129}
1168 1130
@@ -1307,7 +1269,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1307 * may return off node objects because partial slabs are obtained 1269 * may return off node objects because partial slabs are obtained
1308 * from other nodes and filled up. 1270 * from other nodes and filled up.
1309 * 1271 *
1310 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1272 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1311 * defrag_ratio = 1000) then every (well almost) allocation will 1273 * defrag_ratio = 1000) then every (well almost) allocation will
1312 * first attempt to defrag slab caches on other nodes. This means 1274 * first attempt to defrag slab caches on other nodes. This means
1313 * scanning over all nodes to look for partial slabs which may be 1275 * scanning over all nodes to look for partial slabs which may be
@@ -1366,7 +1328,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1366 ClearSlabFrozen(page); 1328 ClearSlabFrozen(page);
1367 if (page->inuse) { 1329 if (page->inuse) {
1368 1330
1369 if (page->freelist != page->end) { 1331 if (page->freelist) {
1370 add_partial(n, page, tail); 1332 add_partial(n, page, tail);
1371 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1333 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1372 } else { 1334 } else {
@@ -1382,9 +1344,11 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1382 * Adding an empty slab to the partial slabs in order 1344 * Adding an empty slab to the partial slabs in order
1383 * to avoid page allocator overhead. This slab needs 1345 * to avoid page allocator overhead. This slab needs
1384 * to come after the other slabs with objects in 1346 * to come after the other slabs with objects in
1385 * order to fill them up. That way the size of the 1347 * so that the others get filled first. That way the
1386 * partial list stays small. kmem_cache_shrink can 1348 * size of the partial list stays small.
1387 * reclaim empty slabs from the partial list. 1349 *
1350 * kmem_cache_shrink can reclaim any empty slabs from the
1351 * partial list.
1388 */ 1352 */
1389 add_partial(n, page, 1); 1353 add_partial(n, page, 1);
1390 slab_unlock(page); 1354 slab_unlock(page);
@@ -1404,18 +1368,14 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1404 struct page *page = c->page; 1368 struct page *page = c->page;
1405 int tail = 1; 1369 int tail = 1;
1406 1370
1407 if (c->freelist) 1371 if (page->freelist)
1408 stat(c, DEACTIVATE_REMOTE_FREES); 1372 stat(c, DEACTIVATE_REMOTE_FREES);
1409 /* 1373 /*
1410 * Merge cpu freelist into freelist. Typically we get here 1374 * Merge cpu freelist into slab freelist. Typically we get here
1411 * because both freelists are empty. So this is unlikely 1375 * because both freelists are empty. So this is unlikely
1412 * to occur. 1376 * to occur.
1413 *
1414 * We need to use _is_end here because deactivate slab may
1415 * be called for a debug slab. Then c->freelist may contain
1416 * a dummy pointer.
1417 */ 1377 */
1418 while (unlikely(!is_end(c->freelist))) { 1378 while (unlikely(c->freelist)) {
1419 void **object; 1379 void **object;
1420 1380
1421 tail = 0; /* Hot objects. Put the slab first */ 1381 tail = 0; /* Hot objects. Put the slab first */
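
With the dummy end pointer gone, deactivate_slab()'s merge loop is a plain NULL test: every object still on the per-cpu lockless freelist is popped and pushed back onto the page's own freelist before the slab is unfrozen. A self-contained sketch of that merge, using simplified stand-in lists rather than the kernel's structures:

#include <assert.h>
#include <stdio.h>

/*
 * Two NULL-terminated freelists threaded through the free objects: the
 * per-cpu (lockless) one and the one owned by the slab page.
 */
static void *cpu_freelist;
static void *page_freelist;
static unsigned int page_inuse;

static void merge_cpu_freelist(void)
{
        while (cpu_freelist) {                  /* plain NULL test, no is_end() */
                void **object = cpu_freelist;

                cpu_freelist = object[0];       /* unlink from the cpu list... */
                object[0] = page_freelist;      /* ...push onto the slab's list */
                page_freelist = object;
                page_inuse--;
        }
}

int main(void)
{
        static void *objs[3][4];
        void **p;
        int i;

        /* Pretend three freed objects are sitting on the cpu freelist. */
        for (i = 0; i < 2; i++)
                objs[i][0] = objs[i + 1];
        objs[2][0] = NULL;
        cpu_freelist = objs[0];
        page_inuse = 3;

        merge_cpu_freelist();
        assert(!cpu_freelist && page_inuse == 0);

        for (p = page_freelist; p; p = p[0])    /* walk the merged list */
                printf("free object at %p\n", (void *)p);
        return 0;
}
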
@@ -1442,6 +1402,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1442 1402
1443/* 1403/*
1444 * Flush cpu slab. 1404 * Flush cpu slab.
1405 *
1445 * Called from IPI handler with interrupts disabled. 1406 * Called from IPI handler with interrupts disabled.
1446 */ 1407 */
1447static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1408static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
@@ -1500,7 +1461,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
1500 * rest of the freelist to the lockless freelist. 1461 * rest of the freelist to the lockless freelist.
1501 * 1462 *
1502 * And if we were unable to get a new slab from the partial slab lists then 1463 * And if we were unable to get a new slab from the partial slab lists then
1503 * we need to allocate a new slab. This is slowest path since we may sleep. 1464 * we need to allocate a new slab. This is the slowest path since it involves
1465 * a call to the page allocator and the setup of a new slab.
1504 */ 1466 */
1505static void *__slab_alloc(struct kmem_cache *s, 1467static void *__slab_alloc(struct kmem_cache *s,
1506 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) 1468 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
@@ -1514,18 +1476,19 @@ static void *__slab_alloc(struct kmem_cache *s,
1514 slab_lock(c->page); 1476 slab_lock(c->page);
1515 if (unlikely(!node_match(c, node))) 1477 if (unlikely(!node_match(c, node)))
1516 goto another_slab; 1478 goto another_slab;
1479
1517 stat(c, ALLOC_REFILL); 1480 stat(c, ALLOC_REFILL);
1481
1518load_freelist: 1482load_freelist:
1519 object = c->page->freelist; 1483 object = c->page->freelist;
1520 if (unlikely(object == c->page->end)) 1484 if (unlikely(!object))
1521 goto another_slab; 1485 goto another_slab;
1522 if (unlikely(SlabDebug(c->page))) 1486 if (unlikely(SlabDebug(c->page)))
1523 goto debug; 1487 goto debug;
1524 1488
1525 object = c->page->freelist;
1526 c->freelist = object[c->offset]; 1489 c->freelist = object[c->offset];
1527 c->page->inuse = s->objects; 1490 c->page->inuse = s->objects;
1528 c->page->freelist = c->page->end; 1491 c->page->freelist = NULL;
1529 c->node = page_to_nid(c->page); 1492 c->node = page_to_nid(c->page);
1530unlock_out: 1493unlock_out:
1531 slab_unlock(c->page); 1494 slab_unlock(c->page);
@@ -1578,7 +1541,6 @@ new_slab:
1578 1541
1579 return NULL; 1542 return NULL;
1580debug: 1543debug:
1581 object = c->page->freelist;
1582 if (!alloc_debug_processing(s, c->page, object, addr)) 1544 if (!alloc_debug_processing(s, c->page, object, addr))
1583 goto another_slab; 1545 goto another_slab;
1584 1546
@@ -1607,7 +1569,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1607 1569
1608 local_irq_save(flags); 1570 local_irq_save(flags);
1609 c = get_cpu_slab(s, smp_processor_id()); 1571 c = get_cpu_slab(s, smp_processor_id());
1610 if (unlikely(is_end(c->freelist) || !node_match(c, node))) 1572 if (unlikely(!c->freelist || !node_match(c, node)))
1611 1573
1612 object = __slab_alloc(s, gfpflags, node, addr, c); 1574 object = __slab_alloc(s, gfpflags, node, addr, c);
1613 1575
@@ -1659,6 +1621,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1659 1621
1660 if (unlikely(SlabDebug(page))) 1622 if (unlikely(SlabDebug(page)))
1661 goto debug; 1623 goto debug;
1624
1662checks_ok: 1625checks_ok:
1663 prior = object[offset] = page->freelist; 1626 prior = object[offset] = page->freelist;
1664 page->freelist = object; 1627 page->freelist = object;
@@ -1673,11 +1636,10 @@ checks_ok:
1673 goto slab_empty; 1636 goto slab_empty;
1674 1637
1675 /* 1638 /*
1676 * Objects left in the slab. If it 1639 * Objects left in the slab. If it was not on the partial list before
1677 * was not on the partial list before
1678 * then add it. 1640 * then add it.
1679 */ 1641 */
1680 if (unlikely(prior == page->end)) { 1642 if (unlikely(!prior)) {
1681 add_partial(get_node(s, page_to_nid(page)), page, 1); 1643 add_partial(get_node(s, page_to_nid(page)), page, 1);
1682 stat(c, FREE_ADD_PARTIAL); 1644 stat(c, FREE_ADD_PARTIAL);
1683 } 1645 }
@@ -1687,7 +1649,7 @@ out_unlock:
1687 return; 1649 return;
1688 1650
1689slab_empty: 1651slab_empty:
1690 if (prior != page->end) { 1652 if (prior) {
1691 /* 1653 /*
1692 * Slab still on the partial list. 1654 * Slab still on the partial list.
1693 */ 1655 */
@@ -1724,8 +1686,8 @@ static __always_inline void slab_free(struct kmem_cache *s,
1724 unsigned long flags; 1686 unsigned long flags;
1725 1687
1726 local_irq_save(flags); 1688 local_irq_save(flags);
1727 debug_check_no_locks_freed(object, s->objsize);
1728 c = get_cpu_slab(s, smp_processor_id()); 1689 c = get_cpu_slab(s, smp_processor_id());
1690 debug_check_no_locks_freed(object, c->objsize);
1729 if (likely(page == c->page && c->node >= 0)) { 1691 if (likely(page == c->page && c->node >= 0)) {
1730 object[c->offset] = c->freelist; 1692 object[c->offset] = c->freelist;
1731 c->freelist = object; 1693 c->freelist = object;
@@ -1888,20 +1850,21 @@ static unsigned long calculate_alignment(unsigned long flags,
1888 unsigned long align, unsigned long size) 1850 unsigned long align, unsigned long size)
1889{ 1851{
1890 /* 1852 /*
1891 * If the user wants hardware cache aligned objects then 1853 * If the user wants hardware cache aligned objects then follow that
1892 * follow that suggestion if the object is sufficiently 1854 * suggestion if the object is sufficiently large.
1893 * large.
1894 * 1855 *
1895 * The hardware cache alignment cannot override the 1856 * The hardware cache alignment cannot override the specified
1896 * specified alignment though. If that is greater 1857 * alignment though. If that is greater then use it.
1897 * then use it.
1898 */ 1858 */
1899 if ((flags & SLAB_HWCACHE_ALIGN) && 1859 if (flags & SLAB_HWCACHE_ALIGN) {
1900 size > cache_line_size() / 2) 1860 unsigned long ralign = cache_line_size();
1901 return max_t(unsigned long, align, cache_line_size()); 1861 while (size <= ralign / 2)
1862 ralign /= 2;
1863 align = max(align, ralign);
1864 }
1902 1865
1903 if (align < ARCH_SLAB_MINALIGN) 1866 if (align < ARCH_SLAB_MINALIGN)
1904 return ARCH_SLAB_MINALIGN; 1867 align = ARCH_SLAB_MINALIGN;
1905 1868
1906 return ALIGN(align, sizeof(void *)); 1869 return ALIGN(align, sizeof(void *));
1907} 1870}
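
The rewritten calculate_alignment() no longer treats SLAB_HWCACHE_ALIGN as all-or-nothing: the cache line size is halved until the object more than half fills it, which in effect aligns the object to the smallest power of two that holds it, capped at one cache line. A quick userspace rendering of the same arithmetic, with the cache line size assumed to be 64 bytes:

#include <stdio.h>

#define CACHE_LINE_SIZE 64UL    /* assumed here; the kernel asks the architecture */

/* Same arithmetic as the patched SLAB_HWCACHE_ALIGN branch. */
static unsigned long hwcache_align(unsigned long align, unsigned long size)
{
        unsigned long ralign = CACHE_LINE_SIZE;

        while (size <= ralign / 2)
                ralign /= 2;
        return align > ralign ? align : ralign;
}

int main(void)
{
        unsigned long sizes[] = { 8, 24, 40, 100 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("object size %3lu -> alignment %2lu\n",
                       sizes[i], hwcache_align(0, sizes[i]));
        return 0;
}

An 8- or 24-byte object now gets 8- or 32-byte alignment from this branch instead of none, while anything bigger than half a line still gets a full cache line.
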
@@ -1910,7 +1873,7 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1910 struct kmem_cache_cpu *c) 1873 struct kmem_cache_cpu *c)
1911{ 1874{
1912 c->page = NULL; 1875 c->page = NULL;
1913 c->freelist = (void *)PAGE_MAPPING_ANON; 1876 c->freelist = NULL;
1914 c->node = 0; 1877 c->node = 0;
1915 c->offset = s->offset / sizeof(void *); 1878 c->offset = s->offset / sizeof(void *);
1916 c->objsize = s->objsize; 1879 c->objsize = s->objsize;
@@ -2092,6 +2055,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2092#endif 2055#endif
2093 init_kmem_cache_node(n); 2056 init_kmem_cache_node(n);
2094 atomic_long_inc(&n->nr_slabs); 2057 atomic_long_inc(&n->nr_slabs);
2058
2095 /* 2059 /*
2096 * lockdep requires consistent irq usage for each lock 2060 * lockdep requires consistent irq usage for each lock
2097 * so even though there cannot be a race this early in 2061 * so even though there cannot be a race this early in
@@ -2173,6 +2137,14 @@ static int calculate_sizes(struct kmem_cache *s)
2173 unsigned long align = s->align; 2137 unsigned long align = s->align;
2174 2138
2175 /* 2139 /*
2140 * Round up object size to the next word boundary. We can only
2141 * place the free pointer at word boundaries and this determines
2142 * the possible location of the free pointer.
2143 */
2144 size = ALIGN(size, sizeof(void *));
2145
2146#ifdef CONFIG_SLUB_DEBUG
2147 /*
2176 * Determine if we can poison the object itself. If the user of 2148 * Determine if we can poison the object itself. If the user of
2177 * the slab may touch the object after free or before allocation 2149 * the slab may touch the object after free or before allocation
2178 * then we should never poison the object itself. 2150 * then we should never poison the object itself.
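
The word-boundary rounding of the object size now happens up front, before any of the CONFIG_SLUB_DEBUG-only decisions, because the free pointer has to land on a word boundary whether or not debugging is built in. The rounding is the usual power-of-two round-up; a tiny illustration (ALIGN_UP mirrors the kernel's ALIGN but is defined locally here):

#include <stdio.h>

/* Round x up to the next multiple of the power-of-two a. */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        /*
         * e.g. a 13-byte object reserves 16 bytes before the free pointer
         * (assuming 8-byte pointers) so the pointer sits on a word boundary.
         */
        printf("ALIGN_UP(13, 8) = %lu\n", ALIGN_UP(13UL, 8));
        printf("ALIGN_UP(24, 8) = %lu\n", ALIGN_UP(24UL, 8));
        return 0;
}
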
@@ -2183,14 +2155,7 @@ static int calculate_sizes(struct kmem_cache *s)
2183 else 2155 else
2184 s->flags &= ~__OBJECT_POISON; 2156 s->flags &= ~__OBJECT_POISON;
2185 2157
2186 /*
2187 * Round up object size to the next word boundary. We can only
2188 * place the free pointer at word boundaries and this determines
2189 * the possible location of the free pointer.
2190 */
2191 size = ALIGN(size, sizeof(void *));
2192 2158
2193#ifdef CONFIG_SLUB_DEBUG
2194 /* 2159 /*
2195 * If we are Redzoning then check if there is some space between the 2160 * If we are Redzoning then check if there is some space between the
2196 * end of the object and the free pointer. If not then add an 2161 * end of the object and the free pointer. If not then add an
@@ -2343,7 +2308,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2343 /* 2308 /*
2344 * We could also check if the object is on the slabs freelist. 2309 * We could also check if the object is on the slabs freelist.
2345 * But this would be too expensive and it seems that the main 2310 * But this would be too expensive and it seems that the main
2346 * purpose of kmem_ptr_valid is to check if the object belongs 2311 * purpose of kmem_ptr_valid() is to check if the object belongs
2347 * to a certain slab. 2312 * to a certain slab.
2348 */ 2313 */
2349 return 1; 2314 return 1;
@@ -2630,13 +2595,24 @@ void *__kmalloc(size_t size, gfp_t flags)
2630} 2595}
2631EXPORT_SYMBOL(__kmalloc); 2596EXPORT_SYMBOL(__kmalloc);
2632 2597
2598static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2599{
2600 struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
2601 get_order(size));
2602
2603 if (page)
2604 return page_address(page);
2605 else
2606 return NULL;
2607}
2608
2633#ifdef CONFIG_NUMA 2609#ifdef CONFIG_NUMA
2634void *__kmalloc_node(size_t size, gfp_t flags, int node) 2610void *__kmalloc_node(size_t size, gfp_t flags, int node)
2635{ 2611{
2636 struct kmem_cache *s; 2612 struct kmem_cache *s;
2637 2613
2638 if (unlikely(size > PAGE_SIZE)) 2614 if (unlikely(size > PAGE_SIZE))
2639 return kmalloc_large(size, flags); 2615 return kmalloc_large_node(size, flags, node);
2640 2616
2641 s = get_slab(size, flags); 2617 s = get_slab(size, flags);
2642 2618
@@ -2653,19 +2629,17 @@ size_t ksize(const void *object)
2653 struct page *page; 2629 struct page *page;
2654 struct kmem_cache *s; 2630 struct kmem_cache *s;
2655 2631
2656 BUG_ON(!object);
2657 if (unlikely(object == ZERO_SIZE_PTR)) 2632 if (unlikely(object == ZERO_SIZE_PTR))
2658 return 0; 2633 return 0;
2659 2634
2660 page = virt_to_head_page(object); 2635 page = virt_to_head_page(object);
2661 BUG_ON(!page);
2662 2636
2663 if (unlikely(!PageSlab(page))) 2637 if (unlikely(!PageSlab(page)))
2664 return PAGE_SIZE << compound_order(page); 2638 return PAGE_SIZE << compound_order(page);
2665 2639
2666 s = page->slab; 2640 s = page->slab;
2667 BUG_ON(!s);
2668 2641
2642#ifdef CONFIG_SLUB_DEBUG
2669 /* 2643 /*
2670 * Debugging requires use of the padding between object 2644 * Debugging requires use of the padding between object
2671 * and whatever may come after it. 2645 * and whatever may come after it.
@@ -2673,6 +2647,7 @@ size_t ksize(const void *object)
2673 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2647 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2674 return s->objsize; 2648 return s->objsize;
2675 2649
2650#endif
2676 /* 2651 /*
2677 * If we have the need to store the freelist pointer 2652 * If we have the need to store the freelist pointer
2678 * back there or track user information then we can 2653 * back there or track user information then we can
@@ -2680,7 +2655,6 @@ size_t ksize(const void *object)
2680 */ 2655 */
2681 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2656 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2682 return s->inuse; 2657 return s->inuse;
2683
2684 /* 2658 /*
2685 * Else we can use all the padding etc for the allocation 2659 * Else we can use all the padding etc for the allocation
2686 */ 2660 */
@@ -2957,7 +2931,7 @@ void __init kmem_cache_init(void)
2957 /* 2931 /*
2958 * Patch up the size_index table if we have strange large alignment 2932 * Patch up the size_index table if we have strange large alignment
2959 * requirements for the kmalloc array. This is only the case for 2933 * requirements for the kmalloc array. This is only the case for
2960 * mips it seems. The standard arches will not generate any code here. 2934 * MIPS it seems. The standard arches will not generate any code here.
2961 * 2935 *
2962 * Largest permitted alignment is 256 bytes due to the way we 2936 * Largest permitted alignment is 256 bytes due to the way we
2963 * handle the index determination for the smaller caches. 2937 * handle the index determination for the smaller caches.
@@ -2986,7 +2960,6 @@ void __init kmem_cache_init(void)
2986 kmem_size = sizeof(struct kmem_cache); 2960 kmem_size = sizeof(struct kmem_cache);
2987#endif 2961#endif
2988 2962
2989
2990 printk(KERN_INFO 2963 printk(KERN_INFO
2991 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2964 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2992 " CPUs=%d, Nodes=%d\n", 2965 " CPUs=%d, Nodes=%d\n",
@@ -3083,12 +3056,15 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3083 */ 3056 */
3084 for_each_online_cpu(cpu) 3057 for_each_online_cpu(cpu)
3085 get_cpu_slab(s, cpu)->objsize = s->objsize; 3058 get_cpu_slab(s, cpu)->objsize = s->objsize;
3059
3086 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3060 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3087 up_write(&slub_lock); 3061 up_write(&slub_lock);
3062
3088 if (sysfs_slab_alias(s, name)) 3063 if (sysfs_slab_alias(s, name))
3089 goto err; 3064 goto err;
3090 return s; 3065 return s;
3091 } 3066 }
3067
3092 s = kmalloc(kmem_size, GFP_KERNEL); 3068 s = kmalloc(kmem_size, GFP_KERNEL);
3093 if (s) { 3069 if (s) {
3094 if (kmem_cache_open(s, GFP_KERNEL, name, 3070 if (kmem_cache_open(s, GFP_KERNEL, name,
@@ -3184,7 +3160,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3184 struct kmem_cache *s; 3160 struct kmem_cache *s;
3185 3161
3186 if (unlikely(size > PAGE_SIZE)) 3162 if (unlikely(size > PAGE_SIZE))
3187 return kmalloc_large(size, gfpflags); 3163 return kmalloc_large_node(size, gfpflags, node);
3188 3164
3189 s = get_slab(size, gfpflags); 3165 s = get_slab(size, gfpflags);
3190 3166
@@ -3199,7 +3175,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3199 unsigned long *map) 3175 unsigned long *map)
3200{ 3176{
3201 void *p; 3177 void *p;
3202 void *addr = slab_address(page); 3178 void *addr = page_address(page);
3203 3179
3204 if (!check_slab(s, page) || 3180 if (!check_slab(s, page) ||
3205 !on_freelist(s, page, NULL)) 3181 !on_freelist(s, page, NULL))
@@ -3482,7 +3458,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3482static void process_slab(struct loc_track *t, struct kmem_cache *s, 3458static void process_slab(struct loc_track *t, struct kmem_cache *s,
3483 struct page *page, enum track_item alloc) 3459 struct page *page, enum track_item alloc)
3484{ 3460{
3485 void *addr = slab_address(page); 3461 void *addr = page_address(page);
3486 DECLARE_BITMAP(map, s->objects); 3462 DECLARE_BITMAP(map, s->objects);
3487 void *p; 3463 void *p;
3488 3464
@@ -3591,8 +3567,8 @@ enum slab_stat_type {
3591#define SO_CPU (1 << SL_CPU) 3567#define SO_CPU (1 << SL_CPU)
3592#define SO_OBJECTS (1 << SL_OBJECTS) 3568#define SO_OBJECTS (1 << SL_OBJECTS)
3593 3569
3594static unsigned long slab_objects(struct kmem_cache *s, 3570static ssize_t show_slab_objects(struct kmem_cache *s,
3595 char *buf, unsigned long flags) 3571 char *buf, unsigned long flags)
3596{ 3572{
3597 unsigned long total = 0; 3573 unsigned long total = 0;
3598 int cpu; 3574 int cpu;
@@ -3602,6 +3578,8 @@ static unsigned long slab_objects(struct kmem_cache *s,
3602 unsigned long *per_cpu; 3578 unsigned long *per_cpu;
3603 3579
3604 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 3580 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3581 if (!nodes)
3582 return -ENOMEM;
3605 per_cpu = nodes + nr_node_ids; 3583 per_cpu = nodes + nr_node_ids;
3606 3584
3607 for_each_possible_cpu(cpu) { 3585 for_each_possible_cpu(cpu) {
@@ -3754,25 +3732,25 @@ SLAB_ATTR_RO(aliases);
3754 3732
3755static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3733static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3756{ 3734{
3757 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3735 return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3758} 3736}
3759SLAB_ATTR_RO(slabs); 3737SLAB_ATTR_RO(slabs);
3760 3738
3761static ssize_t partial_show(struct kmem_cache *s, char *buf) 3739static ssize_t partial_show(struct kmem_cache *s, char *buf)
3762{ 3740{
3763 return slab_objects(s, buf, SO_PARTIAL); 3741 return show_slab_objects(s, buf, SO_PARTIAL);
3764} 3742}
3765SLAB_ATTR_RO(partial); 3743SLAB_ATTR_RO(partial);
3766 3744
3767static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3745static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3768{ 3746{
3769 return slab_objects(s, buf, SO_CPU); 3747 return show_slab_objects(s, buf, SO_CPU);
3770} 3748}
3771SLAB_ATTR_RO(cpu_slabs); 3749SLAB_ATTR_RO(cpu_slabs);
3772 3750
3773static ssize_t objects_show(struct kmem_cache *s, char *buf) 3751static ssize_t objects_show(struct kmem_cache *s, char *buf)
3774{ 3752{
3775 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3753 return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3776} 3754}
3777SLAB_ATTR_RO(objects); 3755SLAB_ATTR_RO(objects);
3778 3756
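
show_slab_objects() (renamed from slab_objects so the sysfs handlers above can propagate an error) does all of its counting in a single zeroed allocation: the first nr_node_ids counters are the per-node totals and per_cpu simply points at the second half, and the allocation is now checked so that -ENOMEM can be returned when it fails. A small userspace sketch of that one-buffer layout, with an arbitrary nr_node_ids:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned long nr_node_ids = 4;  /* arbitrary value for the example */
        unsigned long *nodes, *per_cpu;

        /* One zeroed allocation, split into two equally sized arrays. */
        nodes = calloc(2 * nr_node_ids, sizeof(unsigned long));
        if (!nodes)
                return 1;               /* the sysfs handler now returns -ENOMEM here */
        per_cpu = nodes + nr_node_ids;

        nodes[1] += 3;                  /* e.g. three slabs counted on node 1 */
        per_cpu[1] += 1;                /* one of them is a cpu slab */

        printf("node 1: %lu slabs, %lu per-cpu\n", nodes[1], per_cpu[1]);
        free(nodes);
        return 0;
}
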
@@ -3971,7 +3949,6 @@ SLAB_ATTR(remote_node_defrag_ratio);
3971#endif 3949#endif
3972 3950
3973#ifdef CONFIG_SLUB_STATS 3951#ifdef CONFIG_SLUB_STATS
3974
3975static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 3952static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3976{ 3953{
3977 unsigned long sum = 0; 3954 unsigned long sum = 0;
@@ -4155,8 +4132,8 @@ static struct kset *slab_kset;
4155#define ID_STR_LENGTH 64 4132#define ID_STR_LENGTH 64
4156 4133
4157/* Create a unique string id for a slab cache: 4134/* Create a unique string id for a slab cache:
4158 * format 4135 *
4159 * :[flags-]size:[memory address of kmemcache] 4136 * Format :[flags-]size
4160 */ 4137 */
4161static char *create_unique_id(struct kmem_cache *s) 4138static char *create_unique_id(struct kmem_cache *s)
4162{ 4139{
diff --git a/mm/swap.c b/mm/swap.c
index 710a20bb9749..d4ec59aa5c46 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -176,7 +176,7 @@ void activate_page(struct page *page)
176 SetPageActive(page); 176 SetPageActive(page);
177 add_page_to_active_list(zone, page); 177 add_page_to_active_list(zone, page);
178 __count_vm_event(PGACTIVATE); 178 __count_vm_event(PGACTIVATE);
179 mem_cgroup_move_lists(page_get_page_cgroup(page), true); 179 mem_cgroup_move_lists(page, true);
180 } 180 }
181 spin_unlock_irq(&zone->lru_lock); 181 spin_unlock_irq(&zone->lru_lock);
182} 182}
diff --git a/mm/truncate.c b/mm/truncate.c
index c35c49e54fb6..7d20ce41ecf5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -134,8 +134,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
134} 134}
135 135
136/** 136/**
137 * truncate_inode_pages - truncate range of pages specified by start and 137 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
138 * end byte offsets
139 * @mapping: mapping to truncate 138 * @mapping: mapping to truncate
140 * @lstart: offset from which to truncate 139 * @lstart: offset from which to truncate
141 * @lend: offset to which to truncate 140 * @lend: offset to which to truncate
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a26dabd62fed..45711585684e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -126,7 +126,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
126static LIST_HEAD(shrinker_list); 126static LIST_HEAD(shrinker_list);
127static DECLARE_RWSEM(shrinker_rwsem); 127static DECLARE_RWSEM(shrinker_rwsem);
128 128
129#ifdef CONFIG_CGROUP_MEM_CONT 129#ifdef CONFIG_CGROUP_MEM_RES_CTLR
130#define scan_global_lru(sc) (!(sc)->mem_cgroup) 130#define scan_global_lru(sc) (!(sc)->mem_cgroup)
131#else 131#else
132#define scan_global_lru(sc) (1) 132#define scan_global_lru(sc) (1)
@@ -1128,7 +1128,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1128 ClearPageActive(page); 1128 ClearPageActive(page);
1129 1129
1130 list_move(&page->lru, &zone->inactive_list); 1130 list_move(&page->lru, &zone->inactive_list);
1131 mem_cgroup_move_lists(page_get_page_cgroup(page), false); 1131 mem_cgroup_move_lists(page, false);
1132 pgmoved++; 1132 pgmoved++;
1133 if (!pagevec_add(&pvec, page)) { 1133 if (!pagevec_add(&pvec, page)) {
1134 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1134 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1156,8 +1156,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1156 VM_BUG_ON(PageLRU(page)); 1156 VM_BUG_ON(PageLRU(page));
1157 SetPageLRU(page); 1157 SetPageLRU(page);
1158 VM_BUG_ON(!PageActive(page)); 1158 VM_BUG_ON(!PageActive(page));
1159
1159 list_move(&page->lru, &zone->active_list); 1160 list_move(&page->lru, &zone->active_list);
1160 mem_cgroup_move_lists(page_get_page_cgroup(page), true); 1161 mem_cgroup_move_lists(page, true);
1161 pgmoved++; 1162 pgmoved++;
1162 if (!pagevec_add(&pvec, page)) { 1163 if (!pagevec_add(&pvec, page)) {
1163 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1164 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1427,7 +1428,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1427 return do_try_to_free_pages(zones, gfp_mask, &sc); 1428 return do_try_to_free_pages(zones, gfp_mask, &sc);
1428} 1429}
1429 1430
1430#ifdef CONFIG_CGROUP_MEM_CONT 1431#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1431 1432
1432unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1433unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1433 gfp_t gfp_mask) 1434 gfp_t gfp_mask)