Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    5
-rw-r--r--  mm/Makefile           |    7
-rw-r--r--  mm/backing-dev.c      |   78
-rw-r--r--  mm/bootmem.c          |    4
-rw-r--r--  mm/bounce.c           |    8
-rw-r--r--  mm/compaction.c       |  167
-rw-r--r--  mm/fadvise.c          |   18
-rw-r--r--  mm/filemap.c          |   38
-rw-r--r--  mm/filemap_xip.c      |    6
-rw-r--r--  mm/frontswap.c        |  150
-rw-r--r--  mm/highmem.c          |   12
-rw-r--r--  mm/hugetlb.c          |  195
-rw-r--r--  mm/hugetlb_cgroup.c   |  418
-rw-r--r--  mm/hwpoison-inject.c  |    2
-rw-r--r--  mm/internal.h         |    9
-rw-r--r--  mm/memblock.c         |   35
-rw-r--r--  mm/memcontrol.c       |  390
-rw-r--r--  mm/memory-failure.c   |   35
-rw-r--r--  mm/memory.c           |   32
-rw-r--r--  mm/memory_hotplug.c   |   20
-rw-r--r--  mm/mempolicy.c        |   10
-rw-r--r--  mm/mempool.c          |   12
-rw-r--r--  mm/migrate.c          |   81
-rw-r--r--  mm/mmap.c             |   18
-rw-r--r--  mm/mmu_notifier.c     |   45
-rw-r--r--  mm/mmzone.c           |    2
-rw-r--r--  mm/mremap.c           |    2
-rw-r--r--  mm/oom_kill.c         |  223
-rw-r--r--  mm/page-writeback.c   |  108
-rw-r--r--  mm/page_alloc.c       |  336
-rw-r--r--  mm/page_cgroup.c      |    2
-rw-r--r--  mm/page_io.c          |  145
-rw-r--r--  mm/page_isolation.c   |   93
-rw-r--r--  mm/shmem.c            |    8
-rw-r--r--  mm/slab.c             |  623
-rw-r--r--  mm/slab.h             |   33
-rw-r--r--  mm/slab_common.c      |  120
-rw-r--r--  mm/slob.c             |  152
-rw-r--r--  mm/slub.c             |  464
-rw-r--r--  mm/sparse.c           |   29
-rw-r--r--  mm/swap.c             |   52
-rw-r--r--  mm/swap_state.c       |    7
-rw-r--r--  mm/swapfile.c         |  145
-rw-r--r--  mm/vmalloc.c          |   52
-rw-r--r--  mm/vmscan.c           |  185
-rw-r--r--  mm/vmstat.c           |    1
46 files changed, 2961 insertions(+), 1616 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6..d5c8019c662 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
140config NO_BOOTMEM 140config NO_BOOTMEM
141 boolean 141 boolean
142 142
143config MEMORY_ISOLATION
144 boolean
145
143# eventually, we can have this option just 'select SPARSEMEM' 146# eventually, we can have this option just 'select SPARSEMEM'
144config MEMORY_HOTPLUG 147config MEMORY_HOTPLUG
145 bool "Allow for memory hot-add" 148 bool "Allow for memory hot-add"
149 select MEMORY_ISOLATION
146 depends on SPARSEMEM || X86_64_ACPI_NUMA 150 depends on SPARSEMEM || X86_64_ACPI_NUMA
147 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 151 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
148 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 152 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
272 depends on MMU 276 depends on MMU
273 depends on ARCH_SUPPORTS_MEMORY_FAILURE 277 depends on ARCH_SUPPORTS_MEMORY_FAILURE
274 bool "Enable recovery from hardware memory errors" 278 bool "Enable recovery from hardware memory errors"
279 select MEMORY_ISOLATION
275 help 280 help
276 Enables code to recover from some memory failures on systems 281 Enables code to recover from some memory failures on systems
277 with MCA recovery. This allows a system to continue running 282 with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 2e2fbbefb99..92753e2d82d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
18 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o $(mmu-y) 19 compaction.o $(mmu-y)
20
20obj-y += init-mm.o 21obj-y += init-mm.o
21 22
22ifdef CONFIG_NO_BOOTMEM 23ifdef CONFIG_NO_BOOTMEM
@@ -48,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
48obj-$(CONFIG_MIGRATION) += migrate.o 49obj-$(CONFIG_MIGRATION) += migrate.o
49obj-$(CONFIG_QUICKLIST) += quicklist.o 50obj-$(CONFIG_QUICKLIST) += quicklist.o
50obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
51obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 52obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
53obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
52obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 54obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
53obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 55obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
54obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 56obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
55obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 57obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
56obj-$(CONFIG_CLEANCACHE) += cleancache.o 58obj-$(CONFIG_CLEANCACHE) += cleancache.o
59obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07..b41823cc05e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock);
39LIST_HEAD(bdi_list); 39LIST_HEAD(bdi_list);
40LIST_HEAD(bdi_pending_list); 40LIST_HEAD(bdi_pending_list);
41 41
42static struct task_struct *sync_supers_tsk;
43static struct timer_list sync_supers_timer;
44
45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long);
47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 42void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{ 43{
50 if (wb1 < wb2) { 44 if (wb1 < wb2) {
@@ -250,12 +244,6 @@ static int __init default_bdi_init(void)
250{ 244{
251 int err; 245 int err;
252 246
253 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
254 BUG_ON(IS_ERR(sync_supers_tsk));
255
256 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
257 bdi_arm_supers_timer();
258
259 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
260 if (!err) 248 if (!err)
261 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
270 return wb_has_dirty_io(&bdi->wb); 258 return wb_has_dirty_io(&bdi->wb);
271} 259}
272 260
273/*
274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
275 * or we risk deadlocking on ->s_umount. The longer term solution would be
276 * to implement sync_supers_bdi() or similar and simply do it from the
277 * bdi writeback thread individually.
278 */
279static int bdi_sync_supers(void *unused)
280{
281 set_user_nice(current, 0);
282
283 while (!kthread_should_stop()) {
284 set_current_state(TASK_INTERRUPTIBLE);
285 schedule();
286
287 /*
288 * Do this periodically, like kupdated() did before.
289 */
290 sync_supers();
291 }
292
293 return 0;
294}
295
296void bdi_arm_supers_timer(void)
297{
298 unsigned long next;
299
300 if (!dirty_writeback_interval)
301 return;
302
303 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
304 mod_timer(&sync_supers_timer, round_jiffies_up(next));
305}
306
307static void sync_supers_timer_fn(unsigned long unused)
308{
309 wake_up_process(sync_supers_tsk);
310 bdi_arm_supers_timer();
311}
312
313static void wakeup_timer_fn(unsigned long data) 261static void wakeup_timer_fn(unsigned long data)
314{ 262{
315 struct backing_dev_info *bdi = (struct backing_dev_info *)data; 263 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
@@ -677,7 +625,7 @@ int bdi_init(struct backing_dev_info *bdi)
677 625
678 bdi->min_ratio = 0; 626 bdi->min_ratio = 0;
679 bdi->max_ratio = 100; 627 bdi->max_ratio = 100;
680 bdi->max_prop_frac = PROP_FRAC_BASE; 628 bdi->max_prop_frac = FPROP_FRAC_BASE;
681 spin_lock_init(&bdi->wb_lock); 629 spin_lock_init(&bdi->wb_lock);
682 INIT_LIST_HEAD(&bdi->bdi_list); 630 INIT_LIST_HEAD(&bdi->bdi_list);
683 INIT_LIST_HEAD(&bdi->work_list); 631 INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +648,7 @@ int bdi_init(struct backing_dev_info *bdi)
700 bdi->write_bandwidth = INIT_BW; 648 bdi->write_bandwidth = INIT_BW;
701 bdi->avg_write_bandwidth = INIT_BW; 649 bdi->avg_write_bandwidth = INIT_BW;
702 650
703 err = prop_local_init_percpu(&bdi->completions); 651 err = fprop_local_init_percpu(&bdi->completions);
704 652
705 if (err) { 653 if (err) {
706err: 654err:
@@ -744,7 +692,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 692 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
745 percpu_counter_destroy(&bdi->bdi_stat[i]); 693 percpu_counter_destroy(&bdi->bdi_stat[i]);
746 694
747 prop_local_destroy_percpu(&bdi->completions); 695 fprop_local_destroy_percpu(&bdi->completions);
748} 696}
749EXPORT_SYMBOL(bdi_destroy); 697EXPORT_SYMBOL(bdi_destroy);
750 698
@@ -886,3 +834,23 @@ out:
886 return ret; 834 return ret;
887} 835}
888EXPORT_SYMBOL(wait_iff_congested); 836EXPORT_SYMBOL(wait_iff_congested);
837
838int pdflush_proc_obsolete(struct ctl_table *table, int write,
839 void __user *buffer, size_t *lenp, loff_t *ppos)
840{
841 char kbuf[] = "0\n";
842
843 if (*ppos) {
844 *lenp = 0;
845 return 0;
846 }
847
848 if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
849 return -EFAULT;
850 printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
851 table->procname);
852
853 *lenp = 2;
854 *ppos += *lenp;
855 return 2;
856}
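
Note: the sync_supers kthread and timer are gone; pdflush_proc_obsolete() above is the stub handler left behind for writeback sysctls that no longer control anything. A minimal sketch of how a sysctl entry might be routed through it; the table and knob name shown here are illustrative, not part of this diff:

    static struct ctl_table obsolete_writeback_table[] = {
    	{
    		/* reads back "0\n" and warns once that the knob is
    		 * scheduled for removal, which is exactly what
    		 * pdflush_proc_obsolete() does */
    		.procname	= "nr_pdflush_threads",
    		.mode		= 0444,
    		.proc_handler	= pdflush_proc_obsolete,
    	},
    	{ }
    };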
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 73096630cb3..bcb63ac48cc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -710,6 +710,10 @@ again:
710 if (ptr) 710 if (ptr)
711 return ptr; 711 return ptr;
712 712
713 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit)
715 limit = 0;
716
713 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); 717 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
714 if (ptr) 718 if (ptr)
715 return ptr; 719 return ptr;
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca188..04208677556 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
24 24
25static mempool_t *page_pool, *isa_page_pool; 25static mempool_t *page_pool, *isa_page_pool;
26 26
27#ifdef CONFIG_HIGHMEM 27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
29{ 29{
30#ifndef CONFIG_MEMORY_HOTPLUG 30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn) 31 if (max_pfn <= max_low_pfn)
32 return 0; 32 return 0;
33#endif 33#endif
34 34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool); 36 BUG_ON(!page_pool);
37 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38 38
39 return 0; 39 return 0;
40} 40}
41 41
42__initcall(init_emergency_pool); 42__initcall(init_emergency_pool);
43#endif
43 44
45#ifdef CONFIG_HIGHMEM
44/* 46/*
45 * highmem version, map in to vec 47 * highmem version, map in to vec
46 */ 48 */
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d952853..7fcd3a52e68 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype)
51} 51}
52 52
53/* 53/*
54 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or
56 * if the lock is contended. For async compaction, back out in the event
57 * if contention is severe. For sync compaction, schedule.
58 *
59 * Returns true if the lock is held.
60 * Returns false if the lock is released and compaction should abort
61 */
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc)
64{
65 if (need_resched() || spin_is_contended(lock)) {
66 if (locked) {
67 spin_unlock_irqrestore(lock, *flags);
68 locked = false;
69 }
70
71 /* async aborts if taking too long or contended */
72 if (!cc->sync) {
73 if (cc->contended)
74 *cc->contended = true;
75 return false;
76 }
77
78 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
81 }
82
83 if (!locked)
84 spin_lock_irqsave(lock, *flags);
85 return true;
86}
87
88static inline bool compact_trylock_irqsave(spinlock_t *lock,
89 unsigned long *flags, struct compact_control *cc)
90{
91 return compact_checklock_irqsave(lock, flags, false, cc);
92}
93
94/*
54 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 95 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 * pages inside of the pageblock (even though it may still end up isolating 97 * pages inside of the pageblock (even though it may still end up isolating
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
173} 214}
174 215
175/* Update the number of anon and file isolated pages in the zone */ 216/* Update the number of anon and file isolated pages in the zone */
176static void acct_isolated(struct zone *zone, struct compact_control *cc) 217static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
177{ 218{
178 struct page *page; 219 struct page *page;
179 unsigned int count[2] = { 0, }; 220 unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
181 list_for_each_entry(page, &cc->migratepages, lru) 222 list_for_each_entry(page, &cc->migratepages, lru)
182 count[!!page_is_file_cache(page)]++; 223 count[!!page_is_file_cache(page)]++;
183 224
184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 225 /* If locked we can use the interrupt unsafe versions */
185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 226 if (locked) {
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
229 } else {
230 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
231 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
232 }
186} 233}
187 234
188/* Similar to reclaim, but different enough that they don't share logic */ 235/* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
228 struct list_head *migratelist = &cc->migratepages; 275 struct list_head *migratelist = &cc->migratepages;
229 isolate_mode_t mode = 0; 276 isolate_mode_t mode = 0;
230 struct lruvec *lruvec; 277 struct lruvec *lruvec;
278 unsigned long flags;
279 bool locked;
231 280
232 /* 281 /*
233 * Ensure that there are not too many pages isolated from the LRU 282 * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
247 296
248 /* Time to isolate some pages for migration */ 297 /* Time to isolate some pages for migration */
249 cond_resched(); 298 cond_resched();
250 spin_lock_irq(&zone->lru_lock); 299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
251 for (; low_pfn < end_pfn; low_pfn++) { 301 for (; low_pfn < end_pfn; low_pfn++) {
252 struct page *page; 302 struct page *page;
253 bool locked = true;
254 303
255 /* give a chance to irqs before checking need_resched() */ 304 /* give a chance to irqs before checking need_resched() */
256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257 spin_unlock_irq(&zone->lru_lock); 306 spin_unlock_irqrestore(&zone->lru_lock, flags);
258 locked = false; 307 locked = false;
259 } 308 }
260 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 309
261 if (locked) 310 /* Check if it is ok to still hold the lock */
262 spin_unlock_irq(&zone->lru_lock); 311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
263 cond_resched(); 312 locked, cc);
264 spin_lock_irq(&zone->lru_lock); 313 if (!locked)
265 if (fatal_signal_pending(current)) 314 break;
266 break;
267 } else if (!locked)
268 spin_lock_irq(&zone->lru_lock);
269 315
270 /* 316 /*
271 * migrate_pfn does not necessarily start aligned to a 317 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 } 395 }
350 } 396 }
351 397
352 acct_isolated(zone, cc); 398 acct_isolated(zone, locked, cc);
353 399
354 spin_unlock_irq(&zone->lru_lock); 400 if (locked)
401 spin_unlock_irqrestore(&zone->lru_lock, flags);
355 402
356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357 404
@@ -384,6 +431,20 @@ static bool suitable_migration_target(struct page *page)
384} 431}
385 432
386/* 433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/*
387 * Based on information in the current compact_control, find blocks 448 * Based on information in the current compact_control, find blocks
388 * suitable for isolating free pages from and then isolate them. 449 * suitable for isolating free pages from and then isolate them.
389 */ 450 */
@@ -447,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
447 * are disabled 508 * are disabled
448 */ 509 */
449 isolated = 0; 510 isolated = 0;
450 spin_lock_irqsave(&zone->lock, flags); 511
512 /*
513 * The zone lock must be held to isolate freepages. This
514 * unfortunately this is a very coarse lock and can be
515 * heavily contended if there are parallel allocations
516 * or parallel compactions. For async compaction do not
517 * spin on the lock
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
451 if (suitable_migration_target(page)) { 521 if (suitable_migration_target(page)) {
452 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
453 isolated = isolate_freepages_block(pfn, end_pfn, 523 isolated = isolate_freepages_block(pfn, end_pfn,
@@ -461,8 +531,19 @@ static void isolate_freepages(struct zone *zone,
461 * looking for free pages, the search will restart here as 531 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator 532 * page migration may have returned some pages to the allocator
463 */ 533 */
464 if (isolated) 534 if (isolated) {
465 high_pfn = max(high_pfn, pfn); 535 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 }
466 } 547 }
467 548
468 /* split_free_page does not map the pages */ 549 /* split_free_page does not map the pages */
@@ -470,6 +551,11 @@ static void isolate_freepages(struct zone *zone,
470 551
471 cc->free_pfn = high_pfn; 552 cc->free_pfn = high_pfn;
472 cc->nr_freepages = nr_freepages; 553 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
473} 559}
474 560
475/* 561/*
@@ -565,8 +651,26 @@ static int compact_finished(struct zone *zone,
565 if (fatal_signal_pending(current)) 651 if (fatal_signal_pending(current))
566 return COMPACT_PARTIAL; 652 return COMPACT_PARTIAL;
567 653
568 /* Compaction run completes if the migrate and free scanner meet */ 654 /*
569 if (cc->free_pfn <= cc->migrate_pfn) 655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) {
662 /* We started partway through; restart at the end. */
663 unsigned long free_pfn = start_free_pfn(zone);
664 zone->compact_cached_free_pfn = free_pfn;
665 cc->free_pfn = free_pfn;
666 cc->wrapped = 1;
667 return COMPACT_CONTINUE;
668 }
669 return COMPACT_COMPLETE;
670 }
671
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
570 return COMPACT_COMPLETE; 674 return COMPACT_COMPLETE;
571 675
572 /* 676 /*
@@ -664,8 +768,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
664 768
665 /* Setup to move all movable pages to the end of the zone */ 769 /* Setup to move all movable pages to the end of the zone */
666 cc->migrate_pfn = zone->zone_start_pfn; 770 cc->migrate_pfn = zone->zone_start_pfn;
667 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 771
668 cc->free_pfn &= ~(pageblock_nr_pages-1); 772 if (cc->order > 0) {
773 /* Incremental compaction. Start where the last one stopped. */
774 cc->free_pfn = zone->compact_cached_free_pfn;
775 cc->start_free_pfn = cc->free_pfn;
776 } else {
777 /* Order == -1 starts at the end of the zone. */
778 cc->free_pfn = start_free_pfn(zone);
779 }
669 780
670 migrate_prep_local(); 781 migrate_prep_local();
671 782
@@ -718,7 +829,7 @@ out:
718 829
719static unsigned long compact_zone_order(struct zone *zone, 830static unsigned long compact_zone_order(struct zone *zone,
720 int order, gfp_t gfp_mask, 831 int order, gfp_t gfp_mask,
721 bool sync) 832 bool sync, bool *contended)
722{ 833{
723 struct compact_control cc = { 834 struct compact_control cc = {
724 .nr_freepages = 0, 835 .nr_freepages = 0,
@@ -727,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
727 .migratetype = allocflags_to_migratetype(gfp_mask), 838 .migratetype = allocflags_to_migratetype(gfp_mask),
728 .zone = zone, 839 .zone = zone,
729 .sync = sync, 840 .sync = sync,
841 .contended = contended,
730 }; 842 };
731 INIT_LIST_HEAD(&cc.freepages); 843 INIT_LIST_HEAD(&cc.freepages);
732 INIT_LIST_HEAD(&cc.migratepages); 844 INIT_LIST_HEAD(&cc.migratepages);
@@ -748,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
748 */ 860 */
749unsigned long try_to_compact_pages(struct zonelist *zonelist, 861unsigned long try_to_compact_pages(struct zonelist *zonelist,
750 int order, gfp_t gfp_mask, nodemask_t *nodemask, 862 int order, gfp_t gfp_mask, nodemask_t *nodemask,
751 bool sync) 863 bool sync, bool *contended)
752{ 864{
753 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 865 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
754 int may_enter_fs = gfp_mask & __GFP_FS; 866 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -772,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
772 nodemask) { 884 nodemask) {
773 int status; 885 int status;
774 886
775 status = compact_zone_order(zone, order, gfp_mask, sync); 887 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended);
776 rc = max(status, rc); 889 rc = max(status, rc);
777 890
778 /* If a normal allocation would succeed, stop compacting */ 891 /* If a normal allocation would succeed, stop compacting */
@@ -808,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
808 if (cc->order > 0) { 921 if (cc->order > 0) {
809 int ok = zone_watermark_ok(zone, cc->order, 922 int ok = zone_watermark_ok(zone, cc->order,
810 low_wmark_pages(zone), 0, 0); 923 low_wmark_pages(zone), 0, 0);
811 if (ok && cc->order > zone->compact_order_failed) 924 if (ok && cc->order >= zone->compact_order_failed)
812 zone->compact_order_failed = cc->order + 1; 925 zone->compact_order_failed = cc->order + 1;
813 /* Currently async compaction is never deferred. */ 926 /* Currently async compaction is never deferred. */
814 else if (!ok && cc->sync) 927 else if (!ok && cc->sync)
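
Note: compact_zone_order() and try_to_compact_pages() now take a bool *contended out-parameter so async compaction can tell its caller that it backed out because zone->lru_lock or zone->lock was contended. A minimal caller sketch, assuming an allocator-side retry loop; the back-off policy shown is illustrative only:

    bool contended = false;
    unsigned long rc;

    /* async compaction (sync == false) aborts early on lock contention
     * and reports it through the new out-parameter */
    rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
    			      false, &contended);
    if (rc == COMPACT_SKIPPED || contended) {
    	/* illustrative: skip further async compaction for this
    	 * allocation attempt instead of spinning on the locks */
    }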
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af7..9b75a045dbf 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
93 spin_unlock(&file->f_lock); 93 spin_unlock(&file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 if (!mapping->a_ops->readpage) {
97 ret = -EINVAL;
98 break;
99 }
100
101 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
102 start_index = offset >> PAGE_CACHE_SHIFT; 97 start_index = offset >> PAGE_CACHE_SHIFT;
103 end_index = endbyte >> PAGE_CACHE_SHIFT; 98 end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 nrpages = end_index - start_index + 1; 101 nrpages = end_index - start_index + 1;
107 if (!nrpages) 102 if (!nrpages)
108 nrpages = ~0UL; 103 nrpages = ~0UL;
109 104
110 ret = force_page_cache_readahead(mapping, file, 105 /*
111 start_index, 106 * Ignore return value because fadvise() shall return
112 nrpages); 107 * success even if filesystem can't retrieve a hint,
113 if (ret > 0) 108 */
114 ret = 0; 109 force_page_cache_readahead(mapping, file, start_index,
110 nrpages);
115 break; 111 break;
116 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
117 break; 113 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index a4a5260b027..384344575c3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1412 retval = filemap_write_and_wait_range(mapping, pos, 1412 retval = filemap_write_and_wait_range(mapping, pos,
1413 pos + iov_length(iov, nr_segs) - 1); 1413 pos + iov_length(iov, nr_segs) - 1);
1414 if (!retval) { 1414 if (!retval) {
1415 struct blk_plug plug;
1416
1417 blk_start_plug(&plug);
1418 retval = mapping->a_ops->direct_IO(READ, iocb, 1415 retval = mapping->a_ops->direct_IO(READ, iocb,
1419 iov, pos, nr_segs); 1416 iov, pos, nr_segs);
1420 blk_finish_plug(&plug);
1421 } 1417 }
1422 if (retval > 0) { 1418 if (retval > 0) {
1423 *ppos = pos + retval; 1419 *ppos = pos + retval;
@@ -1712,8 +1708,35 @@ page_not_uptodate:
1712} 1708}
1713EXPORT_SYMBOL(filemap_fault); 1709EXPORT_SYMBOL(filemap_fault);
1714 1710
1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712{
1713 struct page *page = vmf->page;
1714 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1715 int ret = VM_FAULT_LOCKED;
1716
1717 sb_start_pagefault(inode->i_sb);
1718 file_update_time(vma->vm_file);
1719 lock_page(page);
1720 if (page->mapping != inode->i_mapping) {
1721 unlock_page(page);
1722 ret = VM_FAULT_NOPAGE;
1723 goto out;
1724 }
1725 /*
1726 * We mark the page dirty already here so that when freeze is in
1727 * progress, we are guaranteed that writeback during freezing will
1728 * see the dirty page and writeprotect it again.
1729 */
1730 set_page_dirty(page);
1731out:
1732 sb_end_pagefault(inode->i_sb);
1733 return ret;
1734}
1735EXPORT_SYMBOL(filemap_page_mkwrite);
1736
1715const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1716 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite,
1717}; 1740};
1718 1741
1719/* This is used for a general mmap of a disk file */ 1742/* This is used for a general mmap of a disk file */
@@ -2407,8 +2430,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2407 count = ocount; 2430 count = ocount;
2408 pos = *ppos; 2431 pos = *ppos;
2409 2432
2410 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2411
2412 /* We can write back this queue in page reclaim */ 2433 /* We can write back this queue in page reclaim */
2413 current->backing_dev_info = mapping->backing_dev_info; 2434 current->backing_dev_info = mapping->backing_dev_info;
2414 written = 0; 2435 written = 0;
@@ -2502,13 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2502{ 2523{
2503 struct file *file = iocb->ki_filp; 2524 struct file *file = iocb->ki_filp;
2504 struct inode *inode = file->f_mapping->host; 2525 struct inode *inode = file->f_mapping->host;
2505 struct blk_plug plug;
2506 ssize_t ret; 2526 ssize_t ret;
2507 2527
2508 BUG_ON(iocb->ki_pos != pos); 2528 BUG_ON(iocb->ki_pos != pos);
2509 2529
2530 sb_start_write(inode->i_sb);
2510 mutex_lock(&inode->i_mutex); 2531 mutex_lock(&inode->i_mutex);
2511 blk_start_plug(&plug);
2512 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2532 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2513 mutex_unlock(&inode->i_mutex); 2533 mutex_unlock(&inode->i_mutex);
2514 2534
@@ -2519,7 +2539,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2519 if (err < 0 && ret > 0) 2539 if (err < 0 && ret > 0)
2520 ret = err; 2540 ret = err;
2521 } 2541 }
2522 blk_finish_plug(&plug); 2542 sb_end_write(inode->i_sb);
2523 return ret; 2543 return ret;
2524} 2544}
2525EXPORT_SYMBOL(generic_file_aio_write); 2545EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 213ca1f5340..13e013b1270 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -304,6 +304,7 @@ out:
304 304
305static const struct vm_operations_struct xip_file_vm_ops = { 305static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 306 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite,
307}; 308};
308 309
309int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 310int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
401 loff_t pos; 402 loff_t pos;
402 ssize_t ret; 403 ssize_t ret;
403 404
405 sb_start_write(inode->i_sb);
406
404 mutex_lock(&inode->i_mutex); 407 mutex_lock(&inode->i_mutex);
405 408
406 if (!access_ok(VERIFY_READ, buf, len)) { 409 if (!access_ok(VERIFY_READ, buf, len)) {
@@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
411 pos = *ppos; 414 pos = *ppos;
412 count = len; 415 count = len;
413 416
414 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
415
416 /* We can write back this queue in page reclaim */ 417 /* We can write back this queue in page reclaim */
417 current->backing_dev_info = mapping->backing_dev_info; 418 current->backing_dev_info = mapping->backing_dev_info;
418 419
@@ -436,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
436 current->backing_dev_info = NULL; 437 current->backing_dev_info = NULL;
437 out_up: 438 out_up:
438 mutex_unlock(&inode->i_mutex); 439 mutex_unlock(&inode->i_mutex);
440 sb_end_write(inode->i_sb);
439 return ret; 441 return ret;
440} 442}
441EXPORT_SYMBOL_GPL(xip_file_write); 443EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index e25025574a0..6b3e71a2cd4 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -11,15 +11,11 @@
11 * This work is licensed under the terms of the GNU GPL, version 2. 11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */ 12 */
13 13
14#include <linux/mm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
16#include <linux/swap.h> 15#include <linux/swap.h>
17#include <linux/swapops.h> 16#include <linux/swapops.h>
18#include <linux/proc_fs.h>
19#include <linux/security.h> 17#include <linux/security.h>
20#include <linux/capability.h>
21#include <linux/module.h> 18#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h> 19#include <linux/debugfs.h>
24#include <linux/frontswap.h> 20#include <linux/frontswap.h>
25#include <linux/swapfile.h> 21#include <linux/swapfile.h>
@@ -110,16 +106,21 @@ void __frontswap_init(unsigned type)
110 BUG_ON(sis == NULL); 106 BUG_ON(sis == NULL);
111 if (sis->frontswap_map == NULL) 107 if (sis->frontswap_map == NULL)
112 return; 108 return;
113 if (frontswap_enabled) 109 frontswap_ops.init(type);
114 (*frontswap_ops.init)(type);
115} 110}
116EXPORT_SYMBOL(__frontswap_init); 111EXPORT_SYMBOL(__frontswap_init);
117 112
113static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
114{
115 frontswap_clear(sis, offset);
116 atomic_dec(&sis->frontswap_pages);
117}
118
118/* 119/*
119 * "Store" data from a page to frontswap and associate it with the page's 120 * "Store" data from a page to frontswap and associate it with the page's
120 * swaptype and offset. Page must be locked and in the swap cache. 121 * swaptype and offset. Page must be locked and in the swap cache.
121 * If frontswap already contains a page with matching swaptype and 122 * If frontswap already contains a page with matching swaptype and
122 * offset, the frontswap implmentation may either overwrite the data and 123 * offset, the frontswap implementation may either overwrite the data and
123 * return success or invalidate the page from frontswap and return failure. 124 * return success or invalidate the page from frontswap and return failure.
124 */ 125 */
125int __frontswap_store(struct page *page) 126int __frontswap_store(struct page *page)
@@ -134,22 +135,21 @@ int __frontswap_store(struct page *page)
134 BUG_ON(sis == NULL); 135 BUG_ON(sis == NULL);
135 if (frontswap_test(sis, offset)) 136 if (frontswap_test(sis, offset))
136 dup = 1; 137 dup = 1;
137 ret = (*frontswap_ops.store)(type, offset, page); 138 ret = frontswap_ops.store(type, offset, page);
138 if (ret == 0) { 139 if (ret == 0) {
139 frontswap_set(sis, offset); 140 frontswap_set(sis, offset);
140 inc_frontswap_succ_stores(); 141 inc_frontswap_succ_stores();
141 if (!dup) 142 if (!dup)
142 atomic_inc(&sis->frontswap_pages); 143 atomic_inc(&sis->frontswap_pages);
143 } else if (dup) { 144 } else {
144 /* 145 /*
145 failed dup always results in automatic invalidate of 146 failed dup always results in automatic invalidate of
146 the (older) page from frontswap 147 the (older) page from frontswap
147 */ 148 */
148 frontswap_clear(sis, offset);
149 atomic_dec(&sis->frontswap_pages);
150 inc_frontswap_failed_stores();
151 } else
152 inc_frontswap_failed_stores(); 149 inc_frontswap_failed_stores();
150 if (dup)
151 __frontswap_clear(sis, offset);
152 }
153 if (frontswap_writethrough_enabled) 153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */ 154 /* report failure so swap also writes to swap device */
155 ret = -1; 155 ret = -1;
@@ -173,7 +173,7 @@ int __frontswap_load(struct page *page)
173 BUG_ON(!PageLocked(page)); 173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL); 174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset)) 175 if (frontswap_test(sis, offset))
176 ret = (*frontswap_ops.load)(type, offset, page); 176 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0) 177 if (ret == 0)
178 inc_frontswap_loads(); 178 inc_frontswap_loads();
179 return ret; 179 return ret;
@@ -190,9 +190,8 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
190 190
191 BUG_ON(sis == NULL); 191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) { 192 if (frontswap_test(sis, offset)) {
193 (*frontswap_ops.invalidate_page)(type, offset); 193 frontswap_ops.invalidate_page(type, offset);
194 atomic_dec(&sis->frontswap_pages); 194 __frontswap_clear(sis, offset);
195 frontswap_clear(sis, offset);
196 inc_frontswap_invalidates(); 195 inc_frontswap_invalidates();
197 } 196 }
198} 197}
@@ -209,67 +208,102 @@ void __frontswap_invalidate_area(unsigned type)
209 BUG_ON(sis == NULL); 208 BUG_ON(sis == NULL);
210 if (sis->frontswap_map == NULL) 209 if (sis->frontswap_map == NULL)
211 return; 210 return;
212 (*frontswap_ops.invalidate_area)(type); 211 frontswap_ops.invalidate_area(type);
213 atomic_set(&sis->frontswap_pages, 0); 212 atomic_set(&sis->frontswap_pages, 0);
214 memset(sis->frontswap_map, 0, sis->max / sizeof(long)); 213 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
215} 214}
216EXPORT_SYMBOL(__frontswap_invalidate_area); 215EXPORT_SYMBOL(__frontswap_invalidate_area);
217 216
218/* 217static unsigned long __frontswap_curr_pages(void)
219 * Frontswap, like a true swap device, may unnecessarily retain pages
220 * under certain circumstances; "shrink" frontswap is essentially a
221 * "partial swapoff" and works by calling try_to_unuse to attempt to
222 * unuse enough frontswap pages to attempt to -- subject to memory
223 * constraints -- reduce the number of pages in frontswap to the
224 * number given in the parameter target_pages.
225 */
226void frontswap_shrink(unsigned long target_pages)
227{ 218{
228 struct swap_info_struct *si = NULL;
229 int si_frontswap_pages;
230 unsigned long total_pages = 0, total_pages_to_unuse;
231 unsigned long pages = 0, pages_to_unuse = 0;
232 int type; 219 int type;
233 bool locked = false; 220 unsigned long totalpages = 0;
221 struct swap_info_struct *si = NULL;
234 222
235 /* 223 assert_spin_locked(&swap_lock);
236 * we don't want to hold swap_lock while doing a very
237 * lengthy try_to_unuse, but swap_list may change
238 * so restart scan from swap_list.head each time
239 */
240 spin_lock(&swap_lock);
241 locked = true;
242 total_pages = 0;
243 for (type = swap_list.head; type >= 0; type = si->next) { 224 for (type = swap_list.head; type >= 0; type = si->next) {
244 si = swap_info[type]; 225 si = swap_info[type];
245 total_pages += atomic_read(&si->frontswap_pages); 226 totalpages += atomic_read(&si->frontswap_pages);
246 } 227 }
247 if (total_pages <= target_pages) 228 return totalpages;
248 goto out; 229}
249 total_pages_to_unuse = total_pages - target_pages; 230
231static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
232 int *swapid)
233{
234 int ret = -EINVAL;
235 struct swap_info_struct *si = NULL;
236 int si_frontswap_pages;
237 unsigned long total_pages_to_unuse = total;
238 unsigned long pages = 0, pages_to_unuse = 0;
239 int type;
240
241 assert_spin_locked(&swap_lock);
250 for (type = swap_list.head; type >= 0; type = si->next) { 242 for (type = swap_list.head; type >= 0; type = si->next) {
251 si = swap_info[type]; 243 si = swap_info[type];
252 si_frontswap_pages = atomic_read(&si->frontswap_pages); 244 si_frontswap_pages = atomic_read(&si->frontswap_pages);
253 if (total_pages_to_unuse < si_frontswap_pages) 245 if (total_pages_to_unuse < si_frontswap_pages) {
254 pages = pages_to_unuse = total_pages_to_unuse; 246 pages = pages_to_unuse = total_pages_to_unuse;
255 else { 247 } else {
256 pages = si_frontswap_pages; 248 pages = si_frontswap_pages;
257 pages_to_unuse = 0; /* unuse all */ 249 pages_to_unuse = 0; /* unuse all */
258 } 250 }
259 /* ensure there is enough RAM to fetch pages from frontswap */ 251 /* ensure there is enough RAM to fetch pages from frontswap */
260 if (security_vm_enough_memory_mm(current->mm, pages)) 252 if (security_vm_enough_memory_mm(current->mm, pages)) {
253 ret = -ENOMEM;
261 continue; 254 continue;
255 }
262 vm_unacct_memory(pages); 256 vm_unacct_memory(pages);
257 *unused = pages_to_unuse;
258 *swapid = type;
259 ret = 0;
263 break; 260 break;
264 } 261 }
265 if (type < 0) 262
266 goto out; 263 return ret;
267 locked = false; 264}
265
266static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse,
268 int *type)
269{
270 unsigned long total_pages = 0, total_pages_to_unuse;
271
272 assert_spin_locked(&swap_lock);
273
274 total_pages = __frontswap_curr_pages();
275 if (total_pages <= target_pages) {
276 /* Nothing to do */
277 *pages_to_unuse = 0;
278 return 0;
279 }
280 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
282}
283
284/*
285 * Frontswap, like a true swap device, may unnecessarily retain pages
286 * under certain circumstances; "shrink" frontswap is essentially a
287 * "partial swapoff" and works by calling try_to_unuse to attempt to
288 * unuse enough frontswap pages to attempt to -- subject to memory
289 * constraints -- reduce the number of pages in frontswap to the
290 * number given in the parameter target_pages.
291 */
292void frontswap_shrink(unsigned long target_pages)
293{
294 unsigned long pages_to_unuse = 0;
295 int type, ret;
296
297 /*
298 * we don't want to hold swap_lock while doing a very
299 * lengthy try_to_unuse, but swap_list may change
300 * so restart scan from swap_list.head each time
301 */
302 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
268 spin_unlock(&swap_lock); 304 spin_unlock(&swap_lock);
269 try_to_unuse(type, true, pages_to_unuse); 305 if (ret == 0 && pages_to_unuse)
270out: 306 try_to_unuse(type, true, pages_to_unuse);
271 if (locked)
272 spin_unlock(&swap_lock);
273 return; 307 return;
274} 308}
275EXPORT_SYMBOL(frontswap_shrink); 309EXPORT_SYMBOL(frontswap_shrink);
@@ -281,16 +315,12 @@ EXPORT_SYMBOL(frontswap_shrink);
281 */ 315 */
282unsigned long frontswap_curr_pages(void) 316unsigned long frontswap_curr_pages(void)
283{ 317{
284 int type;
285 unsigned long totalpages = 0; 318 unsigned long totalpages = 0;
286 struct swap_info_struct *si = NULL;
287 319
288 spin_lock(&swap_lock); 320 spin_lock(&swap_lock);
289 for (type = swap_list.head; type >= 0; type = si->next) { 321 totalpages = __frontswap_curr_pages();
290 si = swap_info[type];
291 totalpages += atomic_read(&si->frontswap_pages);
292 }
293 spin_unlock(&swap_lock); 322 spin_unlock(&swap_lock);
323
294 return totalpages; 324 return totalpages;
295} 325}
296EXPORT_SYMBOL(frontswap_curr_pages); 326EXPORT_SYMBOL(frontswap_curr_pages);
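
Note: frontswap_shrink() keeps its external behaviour but now only holds swap_lock for the bookkeeping in __frontswap_shrink(), dropping it before the long try_to_unuse(). A hedged sketch of how a frontswap backend might drive it together with frontswap_curr_pages(); the watermark policy and variable are invented for illustration:

    unsigned long cur = frontswap_curr_pages();

    /* illustrative policy: under backend memory pressure, ask frontswap
     * to shed half of the pages it currently holds */
    if (cur > backend_high_watermark)
    	frontswap_shrink(cur / 2);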
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c..d517cd16a6e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0) 94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
95#endif 95#endif
96 96
97struct page *kmap_to_page(void *vaddr)
98{
99 unsigned long addr = (unsigned long)vaddr;
100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]);
104 }
105
106 return virt_to_page(addr);
107}
108
97static void flush_all_zero_pkmaps(void) 109static void flush_all_zero_pkmaps(void)
98{ 110{
99 int i; 111 int i;
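
Note: kmap_to_page() maps a kmap()'d virtual address back to its struct page, walking pkmap_page_table for PKMAP addresses and falling back to virt_to_page() otherwise. A minimal usage sketch, assuming some highmem struct page *page is already in hand:

    void *vaddr = kmap(page);

    /* the reverse lookup must hand back the page we just mapped */
    BUG_ON(kmap_to_page(vaddr) != page);
    kunmap(page);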
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a..bc727122dd4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <linux/io.h> 27#include <asm/tlb.h>
28 28
29#include <linux/io.h>
29#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
30#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
31#include "internal.h" 34#include "internal.h"
32 35
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 37static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
36 39
37static int max_hstate; 40int hugetlb_max_hstate __read_mostly;
38unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
40 43
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages; 48static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size; 49static unsigned long __initdata default_hstate_size;
47 50
48#define for_each_hstate(h) \
49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/* 51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{ 57{
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
509static void enqueue_huge_page(struct hstate *h, struct page *page) 509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{ 510{
511 int nid = page_to_nid(page); 511 int nid = page_to_nid(page);
512 list_add(&page->lru, &h->hugepage_freelists[nid]); 512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++; 513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++; 514 h->free_huge_pages_node[nid]++;
515} 515}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
521 if (list_empty(&h->hugepage_freelists[nid])) 521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL; 522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_del(&page->lru); 524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page); 525 set_page_refcounted(page);
526 h->free_huge_pages--; 526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--; 527 h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
593 1 << PG_active | 1 << PG_reserved | 593 1 << PG_active | 1 << PG_reserved |
594 1 << PG_private | 1 << PG_writeback); 594 1 << PG_private | 1 << PG_writeback);
595 } 595 }
596 VM_BUG_ON(hugetlb_cgroup_from_page(page));
596 set_compound_page_dtor(page, NULL); 597 set_compound_page_dtor(page, NULL);
597 set_page_refcounted(page); 598 set_page_refcounted(page);
598 arch_release_hugepage(page); 599 arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
625 page->mapping = NULL; 626 page->mapping = NULL;
626 BUG_ON(page_count(page)); 627 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 628 BUG_ON(page_mapcount(page));
628 INIT_LIST_HEAD(&page->lru);
629 629
630 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
631 hugetlb_cgroup_uncharge_page(hstate_index(h),
632 pages_per_huge_page(h), page);
631 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 633 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
634 /* remove the page from active list */
635 list_del(&page->lru);
632 update_and_free_page(h, page); 636 update_and_free_page(h, page);
633 h->surplus_huge_pages--; 637 h->surplus_huge_pages--;
634 h->surplus_huge_pages_node[nid]--; 638 h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
641 645
642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 646static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
643{ 647{
648 INIT_LIST_HEAD(&page->lru);
644 set_compound_page_dtor(page, free_huge_page); 649 set_compound_page_dtor(page, free_huge_page);
645 spin_lock(&hugetlb_lock); 650 spin_lock(&hugetlb_lock);
651 set_hugetlb_cgroup(page, NULL);
646 h->nr_huge_pages++; 652 h->nr_huge_pages++;
647 h->nr_huge_pages_node[nid]++; 653 h->nr_huge_pages_node[nid]++;
648 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
889 895
890 spin_lock(&hugetlb_lock); 896 spin_lock(&hugetlb_lock);
891 if (page) { 897 if (page) {
898 INIT_LIST_HEAD(&page->lru);
892 r_nid = page_to_nid(page); 899 r_nid = page_to_nid(page);
893 set_compound_page_dtor(page, free_huge_page); 900 set_compound_page_dtor(page, free_huge_page);
901 set_hugetlb_cgroup(page, NULL);
894 /* 902 /*
895 * We incremented the global counters already 903 * We incremented the global counters already
896 */ 904 */
@@ -993,7 +1001,6 @@ retry:
993 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1001 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
994 if ((--needed) < 0) 1002 if ((--needed) < 0)
995 break; 1003 break;
996 list_del(&page->lru);
997 /* 1004 /*
998 * This page is now managed by the hugetlb allocator and has 1005 * This page is now managed by the hugetlb allocator and has
999 * no users -- drop the buddy allocator's reference. 1006 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
1008 /* Free unnecessary surplus pages to the buddy allocator */ 1015 /* Free unnecessary surplus pages to the buddy allocator */
1009 if (!list_empty(&surplus_list)) { 1016 if (!list_empty(&surplus_list)) {
1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1017 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1011 list_del(&page->lru);
1012 put_page(page); 1018 put_page(page);
1013 } 1019 }
1014 } 1020 }
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1112 struct hstate *h = hstate_vma(vma); 1118 struct hstate *h = hstate_vma(vma);
1113 struct page *page; 1119 struct page *page;
1114 long chg; 1120 long chg;
1121 int ret, idx;
1122 struct hugetlb_cgroup *h_cg;
1115 1123
1124 idx = hstate_index(h);
1116 /* 1125 /*
1117 * Processes that did not create the mapping will have no 1126 * Processes that did not create the mapping will have no
1118 * reserves and will not have accounted against subpool 1127 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1123 */ 1132 */
1124 chg = vma_needs_reservation(h, vma, addr); 1133 chg = vma_needs_reservation(h, vma, addr);
1125 if (chg < 0) 1134 if (chg < 0)
1126 return ERR_PTR(-VM_FAULT_OOM); 1135 return ERR_PTR(-ENOMEM);
1127 if (chg) 1136 if (chg)
1128 if (hugepage_subpool_get_pages(spool, chg)) 1137 if (hugepage_subpool_get_pages(spool, chg))
1129 return ERR_PTR(-VM_FAULT_SIGBUS); 1138 return ERR_PTR(-ENOSPC);
1130 1139
1140 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1141 if (ret) {
1142 hugepage_subpool_put_pages(spool, chg);
1143 return ERR_PTR(-ENOSPC);
1144 }
1131 spin_lock(&hugetlb_lock); 1145 spin_lock(&hugetlb_lock);
1132 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1146 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1133 spin_unlock(&hugetlb_lock); 1147 if (page) {
1134 1148 /* update page cgroup details */
1135 if (!page) { 1149 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1150 h_cg, page);
1151 spin_unlock(&hugetlb_lock);
1152 } else {
1153 spin_unlock(&hugetlb_lock);
1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1154 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1137 if (!page) { 1155 if (!page) {
1156 hugetlb_cgroup_uncharge_cgroup(idx,
1157 pages_per_huge_page(h),
1158 h_cg);
1138 hugepage_subpool_put_pages(spool, chg); 1159 hugepage_subpool_put_pages(spool, chg);
1139 return ERR_PTR(-VM_FAULT_SIGBUS); 1160 return ERR_PTR(-ENOSPC);
1140 } 1161 }
1162 spin_lock(&hugetlb_lock);
1163 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1164 h_cg, page);
1165 list_move(&page->lru, &h->hugepage_activelist);
1166 spin_unlock(&hugetlb_lock);
1141 } 1167 }
1142 1168
1143 set_page_private(page, (unsigned long)spool); 1169 set_page_private(page, (unsigned long)spool);
1144 1170
1145 vma_commit_reservation(h, vma, addr); 1171 vma_commit_reservation(h, vma, addr);
1146
1147 return page; 1172 return page;
1148} 1173}
1149 1174
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1646 struct attribute_group *hstate_attr_group) 1671 struct attribute_group *hstate_attr_group)
1647{ 1672{
1648 int retval; 1673 int retval;
1649 int hi = h - hstates; 1674 int hi = hstate_index(h);
1650 1675
1651 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1652 if (!hstate_kobjs[hi]) 1677 if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
1741 if (!nhs->hugepages_kobj) 1766 if (!nhs->hugepages_kobj)
1742 return; /* no hstate attributes */ 1767 return; /* no hstate attributes */
1743 1768
1744 for_each_hstate(h) 1769 for_each_hstate(h) {
1745 if (nhs->hstate_kobjs[h - hstates]) { 1770 int idx = hstate_index(h);
1746 kobject_put(nhs->hstate_kobjs[h - hstates]); 1771 if (nhs->hstate_kobjs[idx]) {
1747 nhs->hstate_kobjs[h - hstates] = NULL; 1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1748 } 1774 }
1775 }
1749 1776
1750 kobject_put(nhs->hugepages_kobj); 1777 kobject_put(nhs->hugepages_kobj);
1751 nhs->hugepages_kobj = NULL; 1778 nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
1848 hugetlb_unregister_all_nodes(); 1875 hugetlb_unregister_all_nodes();
1849 1876
1850 for_each_hstate(h) { 1877 for_each_hstate(h) {
1851 kobject_put(hstate_kobjs[h - hstates]); 1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1852 } 1879 }
1853 1880
1854 kobject_put(hugepages_kobj); 1881 kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
1869 if (!size_to_hstate(default_hstate_size)) 1896 if (!size_to_hstate(default_hstate_size))
1870 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1897 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1871 } 1898 }
1872 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1899 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1873 if (default_hstate_max_huge_pages) 1900 if (default_hstate_max_huge_pages)
1874 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1901 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1875 1902
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
1897 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1898 return; 1925 return;
1899 } 1926 }
1900 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1901 BUG_ON(order == 0); 1928 BUG_ON(order == 0);
1902 h = &hstates[max_hstate++]; 1929 h = &hstates[hugetlb_max_hstate++];
1903 h->order = order; 1930 h->order = order;
1904 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1931 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1905 h->nr_huge_pages = 0; 1932 h->nr_huge_pages = 0;
1906 h->free_huge_pages = 0; 1933 h->free_huge_pages = 0;
1907 for (i = 0; i < MAX_NUMNODES; ++i) 1934 for (i = 0; i < MAX_NUMNODES; ++i)
1908 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1935 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1936 INIT_LIST_HEAD(&h->hugepage_activelist);
1909 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1937 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1910 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1938 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1911 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1939 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1912 huge_page_size(h)/1024); 1940 huge_page_size(h)/1024);
1941 /*
1942 * Add cgroup control files only if the huge page consists
1943 * of more than two normal pages. This is because we use
1944 * page[2].lru.next for storing cgoup details.
1945 */
1946 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1947 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1913 1948
1914 parsed_hstate = h; 1949 parsed_hstate = h;
1915} 1950}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
1920 static unsigned long *last_mhp; 1955 static unsigned long *last_mhp;
1921 1956
1922 /* 1957 /*
1923 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1958 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1924 * so this hugepages= parameter goes to the "default hstate". 1959 * so this hugepages= parameter goes to the "default hstate".
1925 */ 1960 */
1926 if (!max_hstate) 1961 if (!hugetlb_max_hstate)
1927 mhp = &default_hstate_max_huge_pages; 1962 mhp = &default_hstate_max_huge_pages;
1928 else 1963 else
1929 mhp = &parsed_hstate->max_huge_pages; 1964 mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
1942 * But we need to allocate >= MAX_ORDER hstates here early to still 1977 * But we need to allocate >= MAX_ORDER hstates here early to still
1943 * use the bootmem allocator. 1978 * use the bootmem allocator.
1944 */ 1979 */
1945 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1980 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
1946 hugetlb_hstate_alloc_pages(parsed_hstate); 1981 hugetlb_hstate_alloc_pages(parsed_hstate);
1947 1982
1948 last_mhp = mhp; 1983 last_mhp = mhp;
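Both parsers now key off hugetlb_max_hstate instead of the old max_hstate name, so the ordering rule on the kernel command line is unchanged: each hugepages= count applies to the hugepagesz= that precedes it, and a hugepages= seen before any hugepagesz= sizes the default hstate. For example (illustrative values only):

    hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=512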
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2308 return 0; 2343 return 0;
2309} 2344}
2310 2345
2311void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2346void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2312 unsigned long end, struct page *ref_page) 2347 unsigned long start, unsigned long end,
2348 struct page *ref_page)
2313{ 2349{
2350 int force_flush = 0;
2314 struct mm_struct *mm = vma->vm_mm; 2351 struct mm_struct *mm = vma->vm_mm;
2315 unsigned long address; 2352 unsigned long address;
2316 pte_t *ptep; 2353 pte_t *ptep;
2317 pte_t pte; 2354 pte_t pte;
2318 struct page *page; 2355 struct page *page;
2319 struct page *tmp;
2320 struct hstate *h = hstate_vma(vma); 2356 struct hstate *h = hstate_vma(vma);
2321 unsigned long sz = huge_page_size(h); 2357 unsigned long sz = huge_page_size(h);
2322 2358
2323 /*
2324 * A page gathering list, protected by per file i_mmap_mutex. The
2325 * lock is used to avoid list corruption from multiple unmapping
2326 * of the same page since we are using page->lru.
2327 */
2328 LIST_HEAD(page_list);
2329
2330 WARN_ON(!is_vm_hugetlb_page(vma)); 2359 WARN_ON(!is_vm_hugetlb_page(vma));
2331 BUG_ON(start & ~huge_page_mask(h)); 2360 BUG_ON(start & ~huge_page_mask(h));
2332 BUG_ON(end & ~huge_page_mask(h)); 2361 BUG_ON(end & ~huge_page_mask(h));
2333 2362
2363 tlb_start_vma(tlb, vma);
2334 mmu_notifier_invalidate_range_start(mm, start, end); 2364 mmu_notifier_invalidate_range_start(mm, start, end);
2365again:
2335 spin_lock(&mm->page_table_lock); 2366 spin_lock(&mm->page_table_lock);
2336 for (address = start; address < end; address += sz) { 2367 for (address = start; address < end; address += sz) {
2337 ptep = huge_pte_offset(mm, address); 2368 ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2370 } 2401 }
2371 2402
2372 pte = huge_ptep_get_and_clear(mm, address, ptep); 2403 pte = huge_ptep_get_and_clear(mm, address, ptep);
2404 tlb_remove_tlb_entry(tlb, ptep, address);
2373 if (pte_dirty(pte)) 2405 if (pte_dirty(pte))
2374 set_page_dirty(page); 2406 set_page_dirty(page);
2375 list_add(&page->lru, &page_list);
2376 2407
2408 page_remove_rmap(page);
2409 force_flush = !__tlb_remove_page(tlb, page);
2410 if (force_flush)
2411 break;
2377 /* Bail out after unmapping reference page if supplied */ 2412 /* Bail out after unmapping reference page if supplied */
2378 if (ref_page) 2413 if (ref_page)
2379 break; 2414 break;
2380 } 2415 }
2381 flush_tlb_range(vma, start, end);
2382 spin_unlock(&mm->page_table_lock); 2416 spin_unlock(&mm->page_table_lock);
2383 mmu_notifier_invalidate_range_end(mm, start, end); 2417 /*
2384 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2418 * mmu_gather ran out of room to batch pages, we break out of
2385 page_remove_rmap(page); 2419 * the PTE lock to avoid doing the potentially expensive TLB invalidate
2386 list_del(&page->lru); 2420 * and page-free while holding it.
2387 put_page(page); 2421 */
2422 if (force_flush) {
2423 force_flush = 0;
2424 tlb_flush_mmu(tlb);
2425 if (address < end && !ref_page)
2426 goto again;
2388 } 2427 }
2428 mmu_notifier_invalidate_range_end(mm, start, end);
2429 tlb_end_vma(tlb, vma);
2430}
2431
2432void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2433 struct vm_area_struct *vma, unsigned long start,
2434 unsigned long end, struct page *ref_page)
2435{
2436 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2437
2438 /*
2439 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2440 * test will fail on a vma being torn down, and not grab a page table
2441 * on its way out. We're lucky that the flag has such an appropriate
2442 * name, and can in fact be safely cleared here. We could clear it
2443 * before the __unmap_hugepage_range above, but all that's necessary
2444 * is to clear it before releasing the i_mmap_mutex. This works
2445 * because in the context this is called, the VMA is about to be
2446 * destroyed and the i_mmap_mutex is held.
2447 */
2448 vma->vm_flags &= ~VM_MAYSHARE;
2389} 2449}
2390 2450
2391void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2392 unsigned long end, struct page *ref_page) 2452 unsigned long end, struct page *ref_page)
2393{ 2453{
2394 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2454 struct mm_struct *mm;
2395 __unmap_hugepage_range(vma, start, end, ref_page); 2455 struct mmu_gather tlb;
2396 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2397} 2462}
2398 2463
2399/* 2464/*
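The rewritten unmap path above follows the standard mmu_gather batching protocol. A condensed, purely illustrative sketch of that protocol (generic, not the hugetlb code itself; the inner loop is summarized in comments):

#include <linux/mm.h>
#include <asm/tlb.h>

/* Illustrative sketch of the mmu_gather lifecycle used by the hunk above. */
static void zap_range_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not a full-mm teardown */
	tlb_start_vma(&tlb, vma);
	/*
	 * For each present pte in [start, end):
	 *	pte = ptep_get_and_clear(mm, addr, ptep);
	 *	tlb_remove_tlb_entry(&tlb, ptep, addr);   queue the TLB invalidate
	 *	if (!__tlb_remove_page(&tlb, page))       batch full?
	 *		tlb_flush_mmu(&tlb);              flush now, then continue
	 */
	tlb_end_vma(&tlb, vma);
	tlb_finish_mmu(&tlb, start, end);	/* final flush and page freeing */
}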
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2438 * from the time of fork. This would look like data corruption 2503 * from the time of fork. This would look like data corruption
2439 */ 2504 */
2440 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2505 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2441 __unmap_hugepage_range(iter_vma, 2506 unmap_hugepage_range(iter_vma, address,
2442 address, address + huge_page_size(h), 2507 address + huge_page_size(h), page);
2443 page);
2444 } 2508 }
2445 mutex_unlock(&mapping->i_mmap_mutex); 2509 mutex_unlock(&mapping->i_mmap_mutex);
2446 2510
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
2496 new_page = alloc_huge_page(vma, address, outside_reserve); 2560 new_page = alloc_huge_page(vma, address, outside_reserve);
2497 2561
2498 if (IS_ERR(new_page)) { 2562 if (IS_ERR(new_page)) {
2563 long err = PTR_ERR(new_page);
2499 page_cache_release(old_page); 2564 page_cache_release(old_page);
2500 2565
2501 /* 2566 /*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
2524 2589
2525 /* Caller expects lock to be held */ 2590 /* Caller expects lock to be held */
2526 spin_lock(&mm->page_table_lock); 2591 spin_lock(&mm->page_table_lock);
2527 return -PTR_ERR(new_page); 2592 if (err == -ENOMEM)
2593 return VM_FAULT_OOM;
2594 else
2595 return VM_FAULT_SIGBUS;
2528 } 2596 }
2529 2597
2530 /* 2598 /*
@@ -2642,7 +2710,11 @@ retry:
2642 goto out; 2710 goto out;
2643 page = alloc_huge_page(vma, address, 0); 2711 page = alloc_huge_page(vma, address, 0);
2644 if (IS_ERR(page)) { 2712 if (IS_ERR(page)) {
2645 ret = -PTR_ERR(page); 2713 ret = PTR_ERR(page);
2714 if (ret == -ENOMEM)
2715 ret = VM_FAULT_OOM;
2716 else
2717 ret = VM_FAULT_SIGBUS;
2646 goto out; 2718 goto out;
2647 } 2719 }
2648 clear_huge_page(page, address, pages_per_huge_page(h)); 2720 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
2679 */ 2751 */
2680 if (unlikely(PageHWPoison(page))) { 2752 if (unlikely(PageHWPoison(page))) {
2681 ret = VM_FAULT_HWPOISON | 2753 ret = VM_FAULT_HWPOISON |
2682 VM_FAULT_SET_HINDEX(h - hstates); 2754 VM_FAULT_SET_HINDEX(hstate_index(h));
2683 goto backout_unlocked; 2755 goto backout_unlocked;
2684 } 2756 }
2685 } 2757 }
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2752 return 0; 2824 return 0;
2753 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2825 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2754 return VM_FAULT_HWPOISON_LARGE | 2826 return VM_FAULT_HWPOISON_LARGE |
2755 VM_FAULT_SET_HINDEX(h - hstates); 2827 VM_FAULT_SET_HINDEX(hstate_index(h));
2756 } 2828 }
2757 2829
2758 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2830 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2959 } 3031 }
2960 } 3032 }
2961 spin_unlock(&mm->page_table_lock); 3033 spin_unlock(&mm->page_table_lock);
2962 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3034 /*
2963 3035 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3036 * may have cleared our pud entry and done put_page on the page table:
3037 * once we release i_mmap_mutex, another task can do the final put_page
3038 * and that page table be reused and filled with junk.
3039 */
2964 flush_tlb_range(vma, start, end); 3040 flush_tlb_range(vma, start, end);
3041 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2965} 3042}
2966 3043
2967int hugetlb_reserve_pages(struct inode *inode, 3044int hugetlb_reserve_pages(struct inode *inode,
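Several hunks above replace the `h - hstates` pointer arithmetic with hstate_index(). The helper is added to include/linux/hugetlb.h elsewhere in this series; it is presumably nothing more than the same arithmetic behind a name, roughly:

/* Rough sketch of the helper the hugetlb.c hunks now call. */
static inline int hstate_index(struct hstate *h)
{
	return h - hstates;	/* index of this hstate in the hstates[] array */
}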
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 00000000000..a3f358fb8a0
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
1/*
2 *
3 * Copyright IBM Corporation, 2012
4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 */
15
16#include <linux/cgroup.h>
17#include <linux/slab.h>
18#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h>
20
21struct hugetlb_cgroup {
22 struct cgroup_subsys_state css;
23 /*
24 * the counter to account for hugepages from hugetlb.
25 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE];
27};
28
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35
36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{
39 return container_of(s, struct hugetlb_cgroup, css);
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47}
48
49static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{
52 return hugetlb_cgroup_from_css(task_subsys_state(task,
53 hugetlb_subsys_id));
54}
55
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
57{
58 return (h_cg == root_h_cgroup);
59}
60
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
62{
63 if (!cg->parent)
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66}
67
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
69{
70 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
75 return true;
76 }
77 return false;
78}
79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
81{
82 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM);
89
90 parent_cgroup = cgroup->parent;
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]);
96 } else {
97 root_h_cgroup = h_cgroup;
98 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
99 res_counter_init(&h_cgroup->hugepage[idx], NULL);
100 }
101 return &h_cgroup->css;
102}
103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
105{
106 struct hugetlb_cgroup *h_cgroup;
107
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
109 kfree(h_cgroup);
110}
111
112
113/*
114 * Should be called with hugetlb_lock held.
115 * Since we are holding hugetlb_lock, pages cannot get moved from
116 * active list or uncharged from the cgroup, so there is no need to take a
117 * page reference or test whether the page is active here. This function
118 * cannot fail.
119 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
121 struct page *page)
122{
123 int csize;
124 struct res_counter *counter;
125 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129
130 page_hcg = hugetlb_cgroup_from_page(page);
131 /*
132 * We can have pages on the active list without any cgroup
133 * attached, i.e. hugepages with fewer than 3 pages. We can safely
134 * ignore those pages.
135 */
136 if (!page_hcg || page_hcg != h_cg)
137 goto out;
138
139 csize = PAGE_SIZE << compound_order(page);
140 if (!parent) {
141 parent = root_h_cgroup;
142 /* root has no limit */
143 res_counter_charge_nofail(&parent->hugepage[idx],
144 csize, &fail_res);
145 }
146 counter = &h_cg->hugepage[idx];
147 res_counter_uncharge_until(counter, counter->parent, csize);
148
149 set_hugetlb_cgroup(page, parent);
150out:
151 return;
152}
153
154/*
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup.
157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
159{
160 struct hstate *h;
161 struct page *page;
162 int ret = 0, idx = 0;
163
164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru)
173 hugetlb_cgroup_move_parent(idx, cgroup, page);
174
175 spin_unlock(&hugetlb_lock);
176 idx++;
177 }
178 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182}
183
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
185 struct hugetlb_cgroup **ptr)
186{
187 int ret = 0;
188 struct res_counter *fail_res;
189 struct hugetlb_cgroup *h_cg = NULL;
190 unsigned long csize = nr_pages * PAGE_SIZE;
191
192 if (hugetlb_cgroup_disabled())
193 goto done;
194 /*
195 * We don't charge any cgroup if the compound page has fewer
196 * than 3 pages.
197 */
198 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
199 goto done;
200again:
201 rcu_read_lock();
202 h_cg = hugetlb_cgroup_from_task(current);
203 if (!css_tryget(&h_cg->css)) {
204 rcu_read_unlock();
205 goto again;
206 }
207 rcu_read_unlock();
208
209 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
210 css_put(&h_cg->css);
211done:
212 *ptr = h_cg;
213 return ret;
214}
215
216/* Should be called with hugetlb_lock held */
217void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
218 struct hugetlb_cgroup *h_cg,
219 struct page *page)
220{
221 if (hugetlb_cgroup_disabled() || !h_cg)
222 return;
223
224 set_hugetlb_cgroup(page, h_cg);
225 return;
226}
227
228/*
229 * Should be called with hugetlb_lock held
230 */
231void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
232 struct page *page)
233{
234 struct hugetlb_cgroup *h_cg;
235 unsigned long csize = nr_pages * PAGE_SIZE;
236
237 if (hugetlb_cgroup_disabled())
238 return;
239 VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
240 h_cg = hugetlb_cgroup_from_page(page);
241 if (unlikely(!h_cg))
242 return;
243 set_hugetlb_cgroup(page, NULL);
244 res_counter_uncharge(&h_cg->hugepage[idx], csize);
245 return;
246}
247
248void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
249 struct hugetlb_cgroup *h_cg)
250{
251 unsigned long csize = nr_pages * PAGE_SIZE;
252
253 if (hugetlb_cgroup_disabled() || !h_cg)
254 return;
255
256 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
257 return;
258
259 res_counter_uncharge(&h_cg->hugepage[idx], csize);
260 return;
261}
262
263static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
264 struct file *file, char __user *buf,
265 size_t nbytes, loff_t *ppos)
266{
267 u64 val;
268 char str[64];
269 int idx, name, len;
270 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
271
272 idx = MEMFILE_IDX(cft->private);
273 name = MEMFILE_ATTR(cft->private);
274
275 val = res_counter_read_u64(&h_cg->hugepage[idx], name);
276 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
277 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
278}
279
280static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
281 const char *buffer)
282{
283 int idx, name, ret;
284 unsigned long long val;
285 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
286
287 idx = MEMFILE_IDX(cft->private);
288 name = MEMFILE_ATTR(cft->private);
289
290 switch (name) {
291 case RES_LIMIT:
292 if (hugetlb_cgroup_is_root(h_cg)) {
293 /* Can't set limit on root */
294 ret = -EINVAL;
295 break;
296 }
297 /* This function does all the necessary parsing, so reuse it */
298 ret = res_counter_memparse_write_strategy(buffer, &val);
299 if (ret)
300 break;
301 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
302 break;
303 default:
304 ret = -EINVAL;
305 break;
306 }
307 return ret;
308}
309
310static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
311{
312 int idx, name, ret = 0;
313 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
314
315 idx = MEMFILE_IDX(event);
316 name = MEMFILE_ATTR(event);
317
318 switch (name) {
319 case RES_MAX_USAGE:
320 res_counter_reset_max(&h_cg->hugepage[idx]);
321 break;
322 case RES_FAILCNT:
323 res_counter_reset_failcnt(&h_cg->hugepage[idx]);
324 break;
325 default:
326 ret = -EINVAL;
327 break;
328 }
329 return ret;
330}
331
332static char *mem_fmt(char *buf, int size, unsigned long hsize)
333{
334 if (hsize >= (1UL << 30))
335 snprintf(buf, size, "%luGB", hsize >> 30);
336 else if (hsize >= (1UL << 20))
337 snprintf(buf, size, "%luMB", hsize >> 20);
338 else
339 snprintf(buf, size, "%luKB", hsize >> 10);
340 return buf;
341}
342
343int __init hugetlb_cgroup_file_init(int idx)
344{
345 char buf[32];
346 struct cftype *cft;
347 struct hstate *h = &hstates[idx];
348
349 /* format the size */
350 mem_fmt(buf, 32, huge_page_size(h));
351
352 /* Add the limit file */
353 cft = &h->cgroup_files[0];
354 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
355 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
356 cft->read = hugetlb_cgroup_read;
357 cft->write_string = hugetlb_cgroup_write;
358
359 /* Add the usage file */
360 cft = &h->cgroup_files[1];
361 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
362 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
363 cft->read = hugetlb_cgroup_read;
364
365 /* Add the MAX usage file */
366 cft = &h->cgroup_files[2];
367 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
368 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
369 cft->trigger = hugetlb_cgroup_reset;
370 cft->read = hugetlb_cgroup_read;
371
372 /* Add the failcnt file */
373 cft = &h->cgroup_files[3];
374 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
375 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
376 cft->trigger = hugetlb_cgroup_reset;
377 cft->read = hugetlb_cgroup_read;
378
379 /* NULL terminate the last cft */
380 cft = &h->cgroup_files[4];
381 memset(cft, 0, sizeof(*cft));
382
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384
385 return 0;
386}
387
388/*
389 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
390 * when we migrate hugepages
391 */
392void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
393{
394 struct hugetlb_cgroup *h_cg;
395 struct hstate *h = page_hstate(oldhpage);
396
397 if (hugetlb_cgroup_disabled())
398 return;
399
400 VM_BUG_ON(!PageHuge(oldhpage));
401 spin_lock(&hugetlb_lock);
402 h_cg = hugetlb_cgroup_from_page(oldhpage);
403 set_hugetlb_cgroup(oldhpage, NULL);
404
405 /* move the h_cg details to new cgroup */
406 set_hugetlb_cgroup(newhpage, h_cg);
407 list_move(&newhpage->lru, &h->hugepage_activelist);
408 spin_unlock(&hugetlb_lock);
409 return;
410}
411
412struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb",
414 .create = hugetlb_cgroup_create,
415 .pre_destroy = hugetlb_cgroup_pre_destroy,
416 .destroy = hugetlb_cgroup_destroy,
417 .subsys_id = hugetlb_subsys_id,
418};
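For context, the charge_cgroup/commit_charge/uncharge entry points above are hooked into the hugepage allocator by other patches in this series. A hedged sketch of the caller-side pairing, using symbols from mm/hugetlb.c; dequeue_huge_page_node() is assumed here as the allocator step and error handling is reduced to the essentials:

#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

/* Illustrative pairing of the hugetlb_cgroup charge API; not the real
 * alloc_huge_page()/free_huge_page() code. */
static struct page *alloc_and_charge(struct hstate *h, int nid)
{
	struct hugetlb_cgroup *h_cg;
	struct page *page;
	int idx = hstate_index(h);
	unsigned long nr = pages_per_huge_page(h);

	/* reserve room in the current task's cgroup before taking a page */
	if (hugetlb_cgroup_charge_cgroup(idx, nr, &h_cg))
		return NULL;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_node(h, nid);	/* assumed allocator helper */
	if (page)
		hugetlb_cgroup_commit_charge(idx, nr, h_cg, page);
	spin_unlock(&hugetlb_lock);

	if (!page)
		hugetlb_cgroup_uncharge_cgroup(idx, nr, h_cg);
	return page;
}

On the free side, free_huge_page() would call hugetlb_cgroup_uncharge_page() under hugetlb_lock before returning the page to the pool.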
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983b..3a61efc518d 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
123 if (!dentry) 123 if (!dentry)
124 goto fail; 124 goto fail;
125 125
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 126#ifdef CONFIG_MEMCG_SWAP
127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
128 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
129 if (!dentry) 129 if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75..b8c91b342e2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,12 +118,19 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 122 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
123 129
124 int order; /* order a direct compactor needs */ 130 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone; 132 struct zone *zone;
133 bool *contended; /* True if a lock was contended */
127}; 134};
128 135
129unsigned long 136unsigned long
@@ -347,3 +354,5 @@ extern u32 hwpoison_filter_enable;
347extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 354extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long, 355 unsigned long, unsigned long,
349 unsigned long, unsigned long); 356 unsigned long, unsigned long);
357
358extern void set_pageblock_order(void);
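The new start_free_pfn and wrapped fields let an order > 0 compaction resume where the previous pass stopped while still knowing when it has covered the whole zone. A purely illustrative sketch of that termination idea, not the actual compact_finished() logic:

#include <linux/types.h>

/* Illustrative only: a scan that starts mid-zone is complete once it has
 * wrapped around the zone and caught back up to its original start point. */
static bool resumable_scan_done(unsigned long free_pfn,
				unsigned long start_free_pfn, bool wrapped)
{
	return wrapped && free_pfn >= start_free_pfn;
}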
diff --git a/mm/memblock.c b/mm/memblock.c
index 5cc6731b00c..4d9393c7edc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
222 /* Try to find some space for it. 222 /* Try to find some space for it.
223 * 223 *
224 * WARNING: We assume that either slab_is_available() and we use it or 224 * WARNING: We assume that either slab_is_available() and we use it or
225 * we use MEMBLOCK for allocations. That means that this is unsafe to use 225 * we use MEMBLOCK for allocations. That means that this is unsafe to
226 * when bootmem is currently active (unless bootmem itself is implemented 226 * use when bootmem is currently active (unless bootmem itself is
227 * on top of MEMBLOCK which isn't the case yet) 227 * implemented on top of MEMBLOCK which isn't the case yet)
228 * 228 *
229 * This should however not be an issue for now, as we currently only 229 * This should however not be an issue for now, as we currently only
230 * call into MEMBLOCK while it's still active, or much later when slab is 230 * call into MEMBLOCK while it's still active, or much later when slab
231 * active for memory hotplug operations 231 * is active for memory hotplug operations
232 */ 232 */
233 if (use_slab) { 233 if (use_slab) {
234 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
243 new_alloc_size, PAGE_SIZE); 243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size) 244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0, 245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit), 246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE); 247 new_alloc_size, PAGE_SIZE);
248 248
249 new_array = addr ? __va(addr) : 0; 249 new_array = addr ? __va(addr) : 0;
250 } 250 }
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
254 return -1; 254 return -1;
255 } 255 }
256 256
257 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
258 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr,
259 (u64)addr + new_size - 1);
259 260
260 /* Found space, we now need to move the array over before 261 /*
261 * we add the reserved region since it may be our reserved 262 * Found space, we now need to move the array over before we add the
262 * array itself that is full. 263 * reserved region since it may be our reserved array itself that is
264 * full.
263 */ 265 */
264 memcpy(new_array, type->regions, old_size); 266 memcpy(new_array, type->regions, old_size);
265 memset(new_array + type->max, 0, old_size); 267 memset(new_array + type->max, 0, old_size);
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
267 type->regions = new_array; 269 type->regions = new_array;
268 type->max <<= 1; 270 type->max <<= 1;
269 271
270 /* Free old array. We needn't free it if the array is the 272 /* Free old array. We needn't free it if the array is the static one */
271 * static one
272 */
273 if (*in_slab) 273 if (*in_slab)
274 kfree(old_array); 274 kfree(old_array);
275 else if (old_array != memblock_memory_init_regions && 275 else if (old_array != memblock_memory_init_regions &&
276 old_array != memblock_reserved_init_regions) 276 old_array != memblock_reserved_init_regions)
277 memblock_free(__pa(old_array), old_alloc_size); 277 memblock_free(__pa(old_array), old_alloc_size);
278 278
279 /* Reserve the new array if that comes from the memblock. 279 /*
280 * Otherwise, we needn't do it 280 * Reserve the new array if that comes from the memblock. Otherwise, we
281 * needn't do it
281 */ 282 */
282 if (!use_slab) 283 if (!use_slab)
283 BUG_ON(memblock_reserve(addr, new_alloc_size)); 284 BUG_ON(memblock_reserve(addr, new_alloc_size));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f72b5e52451..795e525afab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62static struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_MEMCG_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly; 66int do_swap_account __read_mostly;
67 67
68/* for remember boot option*/ 68/* for remember boot option*/
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 69#ifdef CONFIG_MEMCG_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1; 70static int really_do_swap_account __initdata = 1;
71#else 71#else
72static int really_do_swap_account __initdata = 0; 72static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
92}; 92};
93 93
@@ -378,9 +378,7 @@ static bool move_file(void)
378 378
379enum charge_type { 379enum charge_type {
380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
381 MEM_CGROUP_CHARGE_TYPE_MAPPED, 381 MEM_CGROUP_CHARGE_TYPE_ANON,
382 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
383 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
384 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 382 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
385 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 383 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
386 NR_CHARGE_TYPE, 384 NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
407static void mem_cgroup_get(struct mem_cgroup *memcg); 405static void mem_cgroup_get(struct mem_cgroup *memcg);
408static void mem_cgroup_put(struct mem_cgroup *memcg); 406static void mem_cgroup_put(struct mem_cgroup *memcg);
409 407
408static inline
409struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
410{
411 return container_of(s, struct mem_cgroup, css);
412}
413
410/* Writing them here to avoid exposing memcg's inner layout */ 414/* Writing them here to avoid exposing memcg's inner layout */
411#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 415#ifdef CONFIG_MEMCG_KMEM
412#include <net/sock.h> 416#include <net/sock.h>
413#include <net/ip.h> 417#include <net/ip.h>
414 418
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
467} 471}
468EXPORT_SYMBOL(tcp_proto_cgroup); 472EXPORT_SYMBOL(tcp_proto_cgroup);
469#endif /* CONFIG_INET */ 473#endif /* CONFIG_INET */
470#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 474#endif /* CONFIG_MEMCG_KMEM */
471 475
472#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) 476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
473static void disarm_sock_keys(struct mem_cgroup *memcg) 477static void disarm_sock_keys(struct mem_cgroup *memcg)
474{ 478{
475 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
703 bool charge) 707 bool charge)
704{ 708{
705 int val = (charge) ? 1 : -1; 709 int val = (charge) ? 1 : -1;
706 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
707} 711}
708 712
709static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
864 868
865struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
866{ 870{
867 return container_of(cgroup_subsys_state(cont, 871 return mem_cgroup_from_css(
868 mem_cgroup_subsys_id), struct mem_cgroup, 872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
869 css);
870} 873}
871 874
872struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
879 if (unlikely(!p)) 882 if (unlikely(!p))
880 return NULL; 883 return NULL;
881 884
882 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
883 struct mem_cgroup, css);
884} 886}
885 887
886struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
966 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
967 if (css) { 969 if (css) {
968 if (css == &root->css || css_tryget(css)) 970 if (css == &root->css || css_tryget(css))
969 memcg = container_of(css, 971 memcg = mem_cgroup_from_css(css);
970 struct mem_cgroup, css);
971 } else 972 } else
972 id = 0; 973 id = 0;
973 rcu_read_unlock(); 974 rcu_read_unlock();
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1454/* 1455/*
1455 * Return the memory (and swap, if configured) limit for a memcg. 1456 * Return the memory (and swap, if configured) limit for a memcg.
1456 */ 1457 */
1457u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{ 1459{
1459 u64 limit; 1460 u64 limit;
1460 u64 memsw; 1461 u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1470 return min(limit, memsw); 1471 return min(limit, memsw);
1471} 1472}
1472 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1475 unsigned long flags) 1543 unsigned long flags)
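The last step above rescales the winner's score into the 0..1000 range that oom_kill_process() reports: oom_badness() returns points roughly in units of pages, bounded by totalpages, so chosen_points * 1000 / totalpages expresses the victim's usage as a fraction of the memcg limit. As a worked example with assumed numbers, a memcg capped at 1 GiB has totalpages = 262144 (4 KiB pages); a task whose badness comes to 131072 pages is reported as 131072 * 1000 / 262144 = 500, i.e. about half the allowed memory.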
@@ -1899,7 +1967,7 @@ again:
1899 return; 1967 return;
1900 /* 1968 /*
1901 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1902 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1903 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1904 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1905 */ 1973 */
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1921 /* 1989 /*
1922 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1923 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1924 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1925 */ 1993 */
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927} 1995}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2268 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2269 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2270 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2271 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2272 */ 2340 */
2273 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2429 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css) 2498 if (!css)
2431 return NULL; 2499 return NULL;
2432 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2433} 2501}
2434 2502
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2473 bool anon; 2541 bool anon;
2474 2542
2475 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481 /* 2545 /*
2482 * we don't need page_cgroup_lock about tail pages, becase they are not 2546 * we don't need page_cgroup_lock about tail pages, becase they are not
2483 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2519 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2520 } 2584 }
2521 2585
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2523 anon = true; 2587 anon = true;
2524 else 2588 else
2525 anon = false; 2589 anon = false;
@@ -2644,8 +2708,7 @@ out:
2644 2708
2645static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2647 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2648 gfp_t gfp_mask)
2649{ 2712{
2650 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2651 unsigned int nr_pages; 2714 unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2728 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else { /* page is swapcache/shmem */
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763} 2795}
2764 2796
2765/* 2797/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2768 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2769 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2770 */ 2802 */
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page, 2804 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2774{ 2807{
2775 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2776 int ret; 2810 int ret;
2777 2811
2778 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785 /* 2813 /*
2786 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2787 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2788 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2789 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2790 */ 2819 */
2791 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2792 goto charge_cur_mm; 2823 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg) 2825 if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2800 ret = 0; 2831 ret = 0;
2801 return ret; 2832 return ret;
2802charge_cur_mm: 2833charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR) 2835 if (ret == -EINTR)
2807 ret = 0; 2836 ret = 0;
2808 return ret; 2837 return ret;
2809} 2838}
2810 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
2811static void 2872static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype) 2874 enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2843{ 2904{
2844 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2846} 2907}
2847 2908
2848void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2849{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2850 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2851 return; 2917 return 0;
2852 if (!memcg) 2918 if (PageCompound(page))
2853 return; 2919 return 0;
2854 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2855} 2930}
2856 2931
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
2911 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2912 */ 2987 */
2913static struct mem_cgroup * 2988static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2915{ 2991{
2916 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2921 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2922 return NULL; 2998 return NULL;
2923 2999
2924 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2925 return NULL;
2926 3001
2927 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2945 anon = PageAnon(page); 3020 anon = PageAnon(page);
2946 3021
2947 switch (ctype) { 3022 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2949 /* 3024 /*
2950 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2951 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2955 /* fallthrough */ 3030 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2959 goto unlock_out; 3043 goto unlock_out;
2960 break; 3044 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2989 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
2991 } 3075 }
2992 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994 3083
2995 return memcg; 3084 return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3005 if (page_mapped(page)) 3094 if (page_mapped(page))
3006 return; 3095 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3009} 3100}
3010 3101
3011void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3012{ 3103{
3013 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3016} 3107}
3017 3108
3018/* 3109/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3076 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078 3169
3079 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3080 3171
3081 /* 3172 /*
3082 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3087} 3178}
3088#endif 3179#endif
3089 3180
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3091/* 3182/*
3092 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3093 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3166 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3167 * page belongs to. 3258 * page belongs to.
3168 */ 3259 */
3169int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3170 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3171{ 3262{
3172 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3173 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3174 enum charge_type ctype; 3265 enum charge_type ctype;
3175 int ret = 0;
3176 3266
3177 *memcgp = NULL; 3267 *memcgp = NULL;
3178 3268
3179 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3180 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3181 return 0; 3271 return;
3182 3272
3183 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3184 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
3223 * we return here. 3313 * we return here.
3224 */ 3314 */
3225 if (!memcg) 3315 if (!memcg)
3226 return 0; 3316 return;
3227 3317
3228 *memcgp = memcg; 3318 *memcgp = memcg;
3229 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3230 css_put(&memcg->css);/* drop extra refcnt */
3231 if (ret) {
3232 if (PageAnon(page)) {
3233 lock_page_cgroup(pc);
3234 ClearPageCgroupMigration(pc);
3235 unlock_page_cgroup(pc);
3236 /*
3237 * The old page may be fully unmapped while we kept it.
3238 */
3239 mem_cgroup_uncharge_page(page);
3240 }
3241 /* we'll need to revisit this error code (we have -EINTR) */
3242 return -ENOMEM;
3243 }
3244 /* 3319 /*
3245 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3246 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3249 */ 3324 */
3250 if (PageAnon(page)) 3325 if (PageAnon(page))
3251 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3252 else if (page_is_file_cache(page))
3253 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3254 else 3327 else
3255 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329 /*
3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3256 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3257 return ret;
3258} 3335}
3259 3336
3260/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276 used = newpage; 3353 used = newpage;
3277 unused = oldpage; 3354 unused = oldpage;
3278 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3279 /* 3362 /*
3280 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3281 * of the page goes down to zero, temporarly. 3364 * of the page goes down to zero, temporarly.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3285 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3286 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3287 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3288 anon = PageAnon(used);
3289 __mem_cgroup_uncharge_common(unused,
3290 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3291 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3292 3371
3293 /* 3372 /*
3294 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
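Since the res_counter charge no longer happens in the prepare step, the two migration hooks now pair up roughly as sketched below. The caller shape is assumed; the real caller is unmap_and_move() in mm/migrate.c, updated elsewhere in this series, and move_page_contents() stands in for the actual copy/migrate step:

#include <linux/memcontrol.h>

/* Hedged sketch of how a migration caller pairs the two hooks above. */
static int migrate_one_page(struct page *page, struct page *newpage)
{
	struct mem_cgroup *memcg;
	int rc;

	/* commit newpage to the old page's memcg; no res_counter charge yet */
	mem_cgroup_prepare_migration(page, newpage, &memcg);

	rc = move_page_contents(page, newpage);		/* assumed helper */

	/* uncharges whichever page is left unused: old on success, new on failure */
	mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
	return rc;
}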
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3340 */ 3419 */
3341 if (!memcg) 3420 if (!memcg)
3342 return; 3421 return;
3343
3344 if (PageSwapBacked(oldpage))
3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3346
3347 /* 3422 /*
3348 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3349 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3418 /* 3493 /*
3419 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3420 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3421 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3422 */ 3497 */
3423 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3479 /* 3554 /*
3480 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3481 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3482 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3483 */ 3558 */
3484 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3611} 3686}
3612 3687
3613/* 3688/*
3614 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3615 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3616 */ 3693 */
3617static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3618 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3619{ 3696{
3620 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3622 struct list_head *list; 3699 struct list_head *list;
3623 struct page *busy; 3700 struct page *busy;
3624 struct zone *zone; 3701 struct zone *zone;
3625 int ret = 0;
3626 3702
3627 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3628 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3636 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3637 struct page *page; 3713 struct page *page;
3638 3714
3639 ret = 0;
3640 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3641 if (list_empty(list)) { 3716 if (list_empty(list)) {
3642 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3653 3728
3654 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3655 3730
3656 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3657 if (ret == -ENOMEM || ret == -EINTR)
3658 break;
3659
3660 if (ret == -EBUSY || ret == -EINVAL) {
3661 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3662 busy = page; 3733 busy = page;
3663 cond_resched(); 3734 cond_resched();
3664 } else 3735 } else
3665 busy = NULL; 3736 busy = NULL;
3666 } 3737 }
3667 3738 return !list_empty(list);
3668 if (!ret && !list_empty(list))
3669 return -EBUSY;
3670 return ret;
3671} 3739}
3672 3740
3673/* 3741/*
@@ -3692,9 +3760,6 @@ move_account:
3692 ret = -EBUSY; 3760 ret = -EBUSY;
3693 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3694 goto out; 3762 goto out;
3695 ret = -EINTR;
3696 if (signal_pending(current))
3697 goto out;
3698 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3699 lru_add_drain_all(); 3764 lru_add_drain_all();
3700 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
3715 } 3780 }
3716 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3717 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3718 /* it seems parent cgroup doesn't have enough mem */
3719 if (ret == -ENOMEM)
3720 goto try_to_free;
3721 cond_resched(); 3783 cond_resched();
3722 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3723 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3779 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3780 3842
3781 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3782 /* 3848 /*
3783 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3784 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3795 retval = -EBUSY; 3861 retval = -EBUSY;
3796 } else 3862 } else
3797 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3798 cgroup_unlock(); 3866 cgroup_unlock();
3799 3867
3800 return retval; 3868 return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832 3900
3833 if (swap) 3901 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3835 3903
3836 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3837} 3905}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4015#endif 4083#endif
4016 4084
4017#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m) 4087 struct seq_file *m)
4020{ 4088{
4021 int nid; 4089 int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075} 4143}
4076 4144
4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4078 struct seq_file *m) 4146 struct seq_file *m)
4079{ 4147{
4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4082 unsigned int i; 4150 unsigned int i;
4083 4151
4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4086 continue; 4154 continue;
4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4110 long long val = 0; 4178 long long val = 0;
4111 4179
4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4113 continue; 4181 continue;
4114 for_each_mem_cgroup_tree(mi, memcg) 4182 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4533 return 0; 4601 return 0;
4534} 4602}
4535 4603
4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4604#ifdef CONFIG_MEMCG_KMEM
4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4538{ 4606{
4539 return mem_cgroup_sockets_init(memcg, ss); 4607 return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
4588 }, 4656 },
4589 { 4657 {
4590 .name = "stat", 4658 .name = "stat",
4591 .read_seq_string = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4592 }, 4660 },
4593 { 4661 {
4594 .name = "force_empty", 4662 .name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
4620#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4621 { 4689 {
4622 .name = "numa_stat", 4690 .name = "numa_stat",
4623 .read_seq_string = mem_control_numa_stat_show, 4691 .read_seq_string = memcg_numa_stat_show,
4624 }, 4692 },
4625#endif 4693#endif
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4694#ifdef CONFIG_MEMCG_SWAP
4627 { 4695 {
4628 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4810} 4878}
4811EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4812 4880
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4814static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4815{ 4883{
4816 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5541 .__DEPRECATED_clear_css_refs = true, 5609 .__DEPRECATED_clear_css_refs = true,
5542}; 5610};
5543 5611
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5545static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5546{ 5614{
5547 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
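The force_empty path above now treats mem_cgroup_force_empty_list() as a plain "is this list still busy?" predicate instead of juggling -EBUSY/-ENOMEM/-EINTR. A minimal user-space model of that retry convention follows; every name in it is invented for illustration, only the bool-means-retry contract comes from the hunk.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for mem_cgroup_force_empty_list(): moves one entry per call
 * and reports whether the caller has to come back for more. */
static bool drain_one(int *entries)
{
        if (*entries > 0)
                (*entries)--;
        return *entries > 0;            /* true => caller must retry */
}

int main(void)
{
        int entries = 3;
        bool busy;

        do {
                busy = drain_one(&entries);     /* mirrors the retry loop in force_empty */
        } while (busy);

        printf("all entries drained\n");
        return 0;
}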
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ab1e7145e29..a6e2141a661 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
345 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
346 * wrong earlier. 346 * wrong earlier.
347 */ 347 */
348static void kill_procs(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
349 int fail, struct page *page, unsigned long pfn, 349 int fail, struct page *page, unsigned long pfn,
350 int flags) 350 int flags)
351{ 351{
352 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
353 353
354 list_for_each_entry_safe (tk, next, to_kill, nd) { 354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (doit) { 355 if (forcekill) {
356 /* 356 /*
357 * In case something went wrong with munmapping 357 * In case something went wrong with munmapping
358 * make sure the process doesn't catch the 358 * make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
858 struct address_space *mapping; 858 struct address_space *mapping;
859 LIST_HEAD(tokill); 859 LIST_HEAD(tokill);
860 int ret; 860 int ret;
861 int kill = 1; 861 int kill = 1, forcekill;
862 struct page *hpage = compound_head(p); 862 struct page *hpage = compound_head(p);
863 struct page *ppage; 863 struct page *ppage;
864 864
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
888 * be called inside page lock (it's recommended but not enforced). 888 * be called inside page lock (it's recommended but not enforced).
889 */ 889 */
890 mapping = page_mapping(hpage); 890 mapping = page_mapping(hpage);
891 if (!PageDirty(hpage) && mapping && 891 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
892 mapping_cap_writeback_dirty(mapping)) { 892 mapping_cap_writeback_dirty(mapping)) {
893 if (page_mkclean(hpage)) { 893 if (page_mkclean(hpage)) {
894 SetPageDirty(hpage); 894 SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
965 * Now that the dirty bit has been propagated to the 965 * Now that the dirty bit has been propagated to the
966 * struct page and all unmaps done we can decide if 966 * struct page and all unmaps done we can decide if
967 * killing is needed or not. Only kill when the page 967 * killing is needed or not. Only kill when the page
968 * was dirty, otherwise the tokill list is merely 968 * was dirty or the process is not restartable,
969 * otherwise the tokill list is merely
969 * freed. When there was a problem unmapping earlier 970 * freed. When there was a problem unmapping earlier
970 * use a more forceful uncatchable kill to prevent 971 * use a more forceful uncatchable kill to prevent
971 * any accesses to the poisoned memory. 972 * any accesses to the poisoned memory.
972 */ 973 */
973 kill_procs(&tokill, !!PageDirty(ppage), trapno, 974 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
975 kill_procs(&tokill, forcekill, trapno,
974 ret != SWAP_SUCCESS, p, pfn, flags); 976 ret != SWAP_SUCCESS, p, pfn, flags);
975 977
976 return ret; 978 return ret;
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1414 int ret; 1416 int ret;
1415 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1416 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1417 LIST_HEAD(pagelist);
1418 1419
1419 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1420 if (ret < 0) 1421 if (ret < 0)
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1429 } 1430 }
1430 1431
1431 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1432 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1433 list_add(&hpage->lru, &pagelist); 1434 MIGRATE_SYNC);
1434 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1435 put_page(hpage);
1435 true);
1436 if (ret) { 1436 if (ret) {
1437 struct page *page1, *page2;
1438 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1439 put_page(page1);
1440
1441 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1442 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1443 if (ret > 0)
1444 ret = -EIO;
1445 return ret; 1439 return ret;
1446 } 1440 }
1447done: 1441done:
1448 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1449 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1450 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1451 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1452 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags)
1561 page_is_file_cache(page)); 1556 page_is_file_cache(page));
1562 list_add(&page->lru, &pagelist); 1557 list_add(&page->lru, &pagelist);
1563 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1558 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1564 0, MIGRATE_SYNC); 1559 false, MIGRATE_SYNC);
1565 if (ret) { 1560 if (ret) {
1566 putback_lru_pages(&pagelist); 1561 putback_lru_pages(&pagelist);
1567 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1562 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
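The hwpoison change above boils down to one predicate: a forced, uncatchable kill is now sent when the page was dirty or when the caller passed MF_MUST_KILL, instead of looking at PageDirty() alone. A small stand-alone model of that decision; the MF_MUST_KILL value and the helper below are assumptions made only for this sketch.

#include <stdbool.h>
#include <stdio.h>

#define MF_MUST_KILL 0x4        /* bit value assumed for this sketch */

/* Mirrors: forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); */
static bool want_forcekill(bool page_dirty, int flags)
{
        return page_dirty || (flags & MF_MUST_KILL);
}

int main(void)
{
        printf("clean page, no flag   -> %d\n", want_forcekill(false, 0));
        printf("clean page, MUST_KILL -> %d\n", want_forcekill(false, MF_MUST_KILL));
        printf("dirty page, no flag   -> %d\n", want_forcekill(true, 0));
        return 0;
}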
diff --git a/mm/memory.c b/mm/memory.c
index 2466d125023..57361708d1a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
206 tlb->mm = mm; 206 tlb->mm = mm;
207 207
208 tlb->fullmm = fullmm; 208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
209 tlb->need_flush = 0; 211 tlb->need_flush = 0;
210 tlb->fast_mode = (num_possible_cpus() == 1); 212 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL; 213 tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
248{ 250{
249 struct mmu_gather_batch *batch, *next; 251 struct mmu_gather_batch *batch, *next;
250 252
253 tlb->start = start;
254 tlb->end = end;
251 tlb_flush_mmu(tlb); 255 tlb_flush_mmu(tlb);
252 256
253 /* keep the page table cache within bounds */ 257 /* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
1204 */ 1208 */
1205 if (force_flush) { 1209 if (force_flush) {
1206 force_flush = 0; 1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1207 tlb_flush_mmu(tlb); 1216 tlb_flush_mmu(tlb);
1208 if (addr != end) 1217 if (addr != end)
1209 goto again; 1218 goto again;
@@ -1334,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1334 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1335 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1336 */ 1345 */
1337 if (vma->vm_file) 1346 if (vma->vm_file) {
1338 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1339 } else 1351 } else
1340 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1341 } 1353 }
@@ -2638,6 +2650,9 @@ reuse:
2638 if (!page_mkwrite) { 2650 if (!page_mkwrite) {
2639 wait_on_page_locked(dirty_page); 2651 wait_on_page_locked(dirty_page);
2640 set_page_dirty_balance(dirty_page, page_mkwrite); 2652 set_page_dirty_balance(dirty_page, page_mkwrite);
2653 /* file_update_time outside page_lock */
2654 if (vma->vm_file)
2655 file_update_time(vma->vm_file);
2641 } 2656 }
2642 put_page(dirty_page); 2657 put_page(dirty_page);
2643 if (page_mkwrite) { 2658 if (page_mkwrite) {
@@ -2655,10 +2670,6 @@ reuse:
2655 } 2670 }
2656 } 2671 }
2657 2672
2658 /* file_update_time outside page_lock */
2659 if (vma->vm_file)
2660 file_update_time(vma->vm_file);
2661
2662 return ret; 2673 return ret;
2663 } 2674 }
2664 2675
@@ -3327,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3327 3338
3328 if (dirty_page) { 3339 if (dirty_page) {
3329 struct address_space *mapping = page->mapping; 3340 struct address_space *mapping = page->mapping;
3341 int dirtied = 0;
3330 3342
3331 if (set_page_dirty(dirty_page)) 3343 if (set_page_dirty(dirty_page))
3332 page_mkwrite = 1; 3344 dirtied = 1;
3333 unlock_page(dirty_page); 3345 unlock_page(dirty_page);
3334 put_page(dirty_page); 3346 put_page(dirty_page);
3335 if (page_mkwrite && mapping) { 3347 if ((dirtied || page_mkwrite) && mapping) {
3336 /* 3348 /*
3337 * Some device drivers do not set page.mapping but still 3349 * Some device drivers do not set page.mapping but still
3338 * dirty their pages 3350 * dirty their pages
@@ -3341,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3341 } 3353 }
3342 3354
3343 /* file_update_time outside page_lock */ 3355 /* file_update_time outside page_lock */
3344 if (vma->vm_file) 3356 if (vma->vm_file && !page_mkwrite)
3345 file_update_time(vma->vm_file); 3357 file_update_time(vma->vm_file);
3346 } else { 3358 } else {
3347 unlock_page(vmf.page); 3359 unlock_page(vmf.page);
@@ -3929,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3929 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3930 } 3942 }
3931 } 3943 }
3932 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3933} 3945}
3934 3946
3935#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
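The mmu_gather hunks seed tlb->start with -1UL and tlb->end with 0, then record the real range just before each flush so only the span that was actually unmapped needs flushing. A toy user-space model of that bookkeeping; the struct and helpers are invented, only the start/end convention comes from the hunks.

#include <stdio.h>

struct toy_gather {
        unsigned long start;    /* -1UL until a range is recorded */
        unsigned long end;      /* 0 until a range is recorded */
};

static void toy_gather_init(struct toy_gather *tlb)
{
        tlb->start = -1UL;
        tlb->end = 0;
}

/* Record the range about to be flushed, as the zap loop now does
 * right before calling tlb_flush_mmu(). */
static void toy_set_range(struct toy_gather *tlb,
                          unsigned long addr, unsigned long end)
{
        tlb->start = addr;
        tlb->end = end;
}

static void toy_flush(const struct toy_gather *tlb)
{
        if (tlb->end)   /* end == 0 means nothing was unmapped */
                printf("flush [%#lx, %#lx)\n", tlb->start, tlb->end);
}

int main(void)
{
        struct toy_gather tlb;

        toy_gather_init(&tlb);
        toy_set_range(&tlb, 0x400000UL, 0x600000UL);
        toy_flush(&tlb);
        return 0;
}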
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 427bb291dd0..3ad25f9d1fc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
512 512
513 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
514 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
515 if (need_zonelists_rebuild) 515 if (onlined_pages) {
516 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
517 else 517 if (need_zonelists_rebuild)
518 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
519 522
520 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
521 524
522 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
523 526
524 if (onlined_pages) { 527 if (onlined_pages)
525 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
526 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
527 }
528 529
529 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
530 531
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
562 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
563 */ 564 */
564 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
565 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
566 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
567 568
568 return pgdat; 569 return pgdat;
@@ -965,6 +966,9 @@ repeat:
965 966
966 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
967 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
968 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
969 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
970 kswapd_stop(node); 974 kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d771e4200d..4ada3be6e25 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1602,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1602 * task can change it's policy. The system default policy requires no 1602 * task can change it's policy. The system default policy requires no
1603 * such protection. 1603 * such protection.
1604 */ 1604 */
1605unsigned slab_node(struct mempolicy *policy) 1605unsigned slab_node(void)
1606{ 1606{
1607 struct mempolicy *policy;
1608
1609 if (in_interrupt())
1610 return numa_node_id();
1611
1612 policy = current->mempolicy;
1607 if (!policy || policy->flags & MPOL_F_LOCAL) 1613 if (!policy || policy->flags & MPOL_F_LOCAL)
1608 return numa_node_id(); 1614 return numa_node_id();
1609 1615
@@ -2556,7 +2562,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2556 break; 2562 break;
2557 2563
2558 default: 2564 default:
2559 BUG(); 2565 return -EINVAL;
2560 } 2566 }
2561 2567
2562 l = strlen(policy_modes[mode]); 2568 l = strlen(policy_modes[mode]);
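With slab_node() taking no argument, the policy lookup and the interrupt-context fallback live inside the callee instead of in every slab allocator. A user-space model of that shape change; all types and the thread-local stand-ins for current->mempolicy and in_interrupt() are invented for the sketch.

struct toy_policy { int preferred_node; };

static __thread struct toy_policy *toy_current_policy; /* stands in for current->mempolicy */
static __thread int toy_in_interrupt;                  /* stands in for in_interrupt() */

static int toy_local_node(void) { return 0; }          /* stands in for numa_node_id() */

/* Old shape: int toy_slab_node(struct toy_policy *policy);
 * New shape: the callee fetches the policy itself and never touches it
 * from interrupt context. */
static int toy_slab_node(void)
{
        struct toy_policy *policy;

        if (toy_in_interrupt)
                return toy_local_node();

        policy = toy_current_policy;
        if (!policy)
                return toy_local_node();
        return policy->preferred_node;
}

int main(void)
{
        struct toy_policy pol = { .preferred_node = 1 };

        toy_current_policy = &pol;
        return toy_slab_node();         /* 1: policy honoured in process context */
}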
diff --git a/mm/mempool.c b/mm/mempool.c
index d9049811f35..54990476c04 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
64 mempool_free_t *free_fn, void *pool_data) 64 mempool_free_t *free_fn, void *pool_data)
65{ 65{
66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); 66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
67 GFP_KERNEL, NUMA_NO_NODE);
67} 68}
68EXPORT_SYMBOL(mempool_create); 69EXPORT_SYMBOL(mempool_create);
69 70
70mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 71mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
71 mempool_free_t *free_fn, void *pool_data, int node_id) 72 mempool_free_t *free_fn, void *pool_data,
73 gfp_t gfp_mask, int node_id)
72{ 74{
73 mempool_t *pool; 75 mempool_t *pool;
74 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); 76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
75 if (!pool) 77 if (!pool)
76 return NULL; 78 return NULL;
77 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
78 GFP_KERNEL, node_id); 80 gfp_mask, node_id);
79 if (!pool->elements) { 81 if (!pool->elements) {
80 kfree(pool); 82 kfree(pool);
81 return NULL; 83 return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
93 while (pool->curr_nr < pool->min_nr) { 95 while (pool->curr_nr < pool->min_nr) {
94 void *element; 96 void *element;
95 97
96 element = pool->alloc(GFP_KERNEL, pool->pool_data); 98 element = pool->alloc(gfp_mask, pool->pool_data);
97 if (unlikely(!element)) { 99 if (unlikely(!element)) {
98 mempool_destroy(pool); 100 mempool_destroy(pool);
99 return NULL; 101 return NULL;
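For callers, the visible effect of the mempool change is two extra arguments to mempool_create_node(): the allocation mask used while the pool is filled and the NUMA node to fill it from, with mempool_create() itself now just wrapping the call with GFP_KERNEL and NUMA_NO_NODE. A hypothetical caller sketch; the demo_* names and element type are invented, only the mempool signatures come from the hunk.

#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/gfp.h>

struct demo_elem { char buf[256]; };

static void *demo_alloc(gfp_t gfp_mask, void *pool_data)
{
        return kmalloc(sizeof(struct demo_elem), gfp_mask);
}

static void demo_free(void *element, void *pool_data)
{
        kfree(element);
}

static mempool_t *demo_create_pool(int nid)
{
        /* The old API took (min_nr, alloc_fn, free_fn, pool_data, node_id);
         * the new one also takes the gfp mask used for the initial fill. */
        return mempool_create_node(16, demo_alloc, demo_free, NULL,
                                   GFP_KERNEL, nid);
}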
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56..77ed2d77370 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
682{ 683{
683 int rc = -EAGAIN; 684 int rc = -EAGAIN;
684 int remap_swapcache = 1; 685 int remap_swapcache = 1;
685 int charge = 0;
686 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
687 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
688 688
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
724 } 724 }
725 725
726 /* charge against new page */ 726 /* charge against new page */
727 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
728 if (charge == -ENOMEM) {
729 rc = -ENOMEM;
730 goto unlock;
731 }
732 BUG_ON(charge);
733 728
734 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
735 /* 730 /*
@@ -819,8 +814,7 @@ skip_unmap:
819 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
820 815
821uncharge: 816uncharge:
822 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
823 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
824unlock: 818unlock:
825 unlock_page(page); 819 unlock_page(page);
826out: 820out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
931 925
932 if (anon_vma) 926 if (anon_vma)
933 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
934 unlock_page(hpage);
935 928
936out: 929 if (!rc)
937 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
938 list_del(&hpage->lru);
939 put_page(hpage);
940 }
941 931
932 unlock_page(hpage);
933out:
942 put_page(new_hpage); 934 put_page(new_hpage);
943
944 if (result) { 935 if (result) {
945 if (rc) 936 if (rc)
946 *result = rc; 937 *result = rc;
@@ -1016,48 +1007,32 @@ out:
1016 return nr_failed + retry; 1007 return nr_failed + retry;
1017} 1008}
1018 1009
1019int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1020 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1021 enum migrate_mode mode) 1012 enum migrate_mode mode)
1022{ 1013{
1023 int retry = 1; 1014 int pass, rc;
1024 int nr_failed = 0; 1015
1025 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1026 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1027 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1028 int rc; 1019 mode);
1029 1020 switch (rc) {
1030 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1031 retry = 0; 1022 goto out;
1032 1023 case -EAGAIN:
1033 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1034 cond_resched(); 1025 cond_resched();
1035 1026 break;
1036 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1037 private, page, pass > 2, offlining, 1028 goto out;
1038 mode); 1029 default:
1039 1030 rc = -EIO;
1040 switch(rc) { 1031 goto out;
1041 case -ENOMEM:
1042 goto out;
1043 case -EAGAIN:
1044 retry++;
1045 break;
1046 case 0:
1047 break;
1048 default:
1049 /* Permanent failure */
1050 nr_failed++;
1051 break;
1052 }
1053 } 1032 }
1054 } 1033 }
1055 rc = 0;
1056out: 1034out:
1057 if (rc) 1035 return rc;
1058 return rc;
1059
1060 return nr_failed + retry;
1061} 1036}
1062 1037
1063#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
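Callers of the old migrate_huge_pages() built a one-entry list and cleaned it up on failure; with migrate_huge_page() they hand the page over directly and keep managing their own reference, as the soft_offline_huge_page() hunk in memory-failure.c above now does. A hypothetical caller sketch; the demo_* names are invented, and the migrate_huge_page() arguments mirror that soft-offline call.

#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>

static struct page *demo_new_huge_page(struct page *page,
                                       unsigned long private, int **result)
{
        return alloc_huge_page_node(page_hstate(compound_head(page)),
                                    page_to_nid(page));
}

static int demo_migrate_one_huge_page(struct page *hpage)
{
        int ret;

        /* No temporary list, no putback: a single page goes straight in. */
        ret = migrate_huge_page(hpage, demo_new_huge_page, MPOL_MF_MOVE_ALL,
                                false, MIGRATE_SYNC);
        put_page(hpage);        /* drop the reference the caller took earlier */
        return ret;
}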
diff --git a/mm/mmap.c b/mm/mmap.c
index 3edfcdfa42d..ae18a48e7e4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
943 const unsigned long stack_flags 943 const unsigned long stack_flags
944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
945 945
946 mm->total_vm += pages;
947
946 if (file) { 948 if (file) {
947 mm->shared_vm += pages; 949 mm->shared_vm += pages;
948 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
1347out: 1349out:
1348 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1349 1351
1350 mm->total_vm += len >> PAGE_SHIFT;
1351 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1352 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1353 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1355,9 +1356,8 @@ out:
1355 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1356 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1356 make_pages_present(addr, addr + len); 1357 make_pages_present(addr, addr + len);
1357 1358
1358 if (file && uprobe_mmap(vma)) 1359 if (file)
1359 /* matching probes but cannot insert */ 1360 uprobe_mmap(vma);
1360 goto unmap_and_free_vma;
1361 1361
1362 return addr; 1362 return addr;
1363 1363
@@ -1707,7 +1707,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1707 return -ENOMEM; 1707 return -ENOMEM;
1708 1708
1709 /* Ok, everything looks good - let it rip */ 1709 /* Ok, everything looks good - let it rip */
1710 mm->total_vm += grow;
1711 if (vma->vm_flags & VM_LOCKED) 1710 if (vma->vm_flags & VM_LOCKED)
1712 mm->locked_vm += grow; 1711 mm->locked_vm += grow;
1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1712 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1888,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1889 1888
1890 if (vma->vm_flags & VM_ACCOUNT) 1889 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages; 1890 nr_accounted += nrpages;
1892 mm->total_vm -= nrpages;
1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1891 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1894 vma = remove_vma(vma); 1892 vma = remove_vma(vma);
1895 } while (vma); 1893 } while (vma);
@@ -2310,7 +2308,7 @@ void exit_mmap(struct mm_struct *mm)
2310 } 2308 }
2311 vm_unacct_memory(nr_accounted); 2309 vm_unacct_memory(nr_accounted);
2312 2310
2313 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2311 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2314} 2312}
2315 2313
2316/* Insert vm structure into process list sorted by address 2314/* Insert vm structure into process list sorted by address
@@ -2345,9 +2343,6 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2345 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2343 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2346 return -ENOMEM; 2344 return -ENOMEM;
2347 2345
2348 if (vma->vm_file && uprobe_mmap(vma))
2349 return -EINVAL;
2350
2351 vma_link(mm, vma, prev, rb_link, rb_parent); 2346 vma_link(mm, vma, prev, rb_link, rb_parent);
2352 return 0; 2347 return 0;
2353} 2348}
@@ -2418,9 +2413,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2418 if (new_vma->vm_file) { 2413 if (new_vma->vm_file) {
2419 get_file(new_vma->vm_file); 2414 get_file(new_vma->vm_file);
2420 2415
2421 if (uprobe_mmap(new_vma))
2422 goto out_free_mempol;
2423
2424 if (vma->vm_flags & VM_EXECUTABLE) 2416 if (vma->vm_flags & VM_EXECUTABLE)
2425 added_exe_file_vma(mm); 2417 added_exe_file_vma(mm);
2426 } 2418 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a184..862b60822d9 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf0..3cef80f6ac7 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98 98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR 99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone; 100 lruvec->zone = zone;
101#endif 101#endif
102} 102}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202dda..cc06d0e48d0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
497 goto out; 496 goto out;
498 } 497 }
499 498
500 mm->total_vm += pages;
501 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
502 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
503 mm->locked_vm += pages; 501 mm->locked_vm += pages;
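The mmap.c and mremap.c hunks above drop their open-coded mm->total_vm updates because vm_stat_account() now adjusts total_vm itself. A hypothetical caller fragment showing the resulting pattern; demo_account_growth() is invented, only the vm_stat_account() signature comes from the hunks.

#include <linux/mm.h>

static void demo_account_growth(struct mm_struct *mm,
                                struct vm_area_struct *vma, long pages)
{
        /* Previously: mm->total_vm += pages; followed by vm_stat_account().
         * Now a single call keeps total_vm and the shared/exec/stack
         * counters consistent with one signed delta. */
        vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
}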
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac300c99baf..19860086163 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
288} 288}
289#endif 289#endif
290 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
291/* 337/*
292 * Simple selection loop. We choose the process with the highest 338 * Simple selection loop. We choose the process with the highest
293 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
294 * 340 *
295 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
296 */ 342 */
297static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
298 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
299 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
300{ 346{
301 struct task_struct *g, *p; 347 struct task_struct *g, *p;
302 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
303 unsigned long chosen_points = 0; 349 unsigned long chosen_points = 0;
304 350
351 rcu_read_lock();
305 do_each_thread(g, p) { 352 do_each_thread(g, p) {
306 unsigned int points; 353 unsigned int points;
307 354
308 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
309 continue; 356 force_kill)) {
310 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
311 continue; 358 chosen = p;
312 359 chosen_points = ULONG_MAX;
313 /* 360 /* fall through */
314 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
315 * being killed. Don't allow any other task access to the
316 * memory reserve.
317 *
318 * Note: this may have a chance of deadlock if it gets
319 * blocked waiting for another task which itself is waiting
320 * for memory. Is there a better alternative?
321 */
322 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
323 if (unlikely(frozen(p)))
324 __thaw_task(p);
325 if (!force_kill)
326 return ERR_PTR(-1UL);
327 }
328 if (!p->mm)
329 continue; 362 continue;
330 363 case OOM_SCAN_ABORT:
331 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
332 /* 365 return ERR_PTR(-1UL);
333 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
334 * releasing memory, we allow the "kill" to set 367 break;
335 * TIF_MEMDIE, which will allow it to gain access to 368 };
336 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
337 *
338 * The loop isn't broken here, however, in case other
339 * threads are found to have already been oom killed.
340 */
341 if (p == current) {
342 chosen = p;
343 chosen_points = ULONG_MAX;
344 } else if (!force_kill) {
345 /*
346 * If this task is not being ptraced on exit,
347 * then wait for it to finish before killing
348 * some other task unnecessarily.
349 */
350 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
351 return ERR_PTR(-1UL);
352 }
353 }
354
355 points = oom_badness(p, memcg, nodemask, totalpages);
356 if (points > chosen_points) { 370 if (points > chosen_points) {
357 chosen = p; 371 chosen = p;
358 chosen_points = points; 372 chosen_points = points;
359 } 373 }
360 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
361 378
362 *ppoints = chosen_points * 1000 / totalpages; 379 *ppoints = chosen_points * 1000 / totalpages;
363 return chosen; 380 return chosen;
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
371 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
372 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
373 * are not shown. 390 * are not shown.
374 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
375 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
376 *
377 * Call with tasklist_lock read-locked.
378 */ 393 */
379static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
380{ 395{
381 struct task_struct *p; 396 struct task_struct *p;
382 struct task_struct *task; 397 struct task_struct *task;
383 398
384 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
385 for_each_process(p) { 401 for_each_process(p) {
386 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
387 continue; 403 continue;
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
396 continue; 412 continue;
397 } 413 }
398 414
399 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
400 task->pid, from_kuid(&init_user_ns, task_uid(task)), 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
401 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
402 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
403 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
404 task_unlock(task); 421 task_unlock(task);
405 } 422 }
423 rcu_read_unlock();
406} 424}
407 425
408static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
423} 441}
424 442
425#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
426static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
427 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
428 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
429 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
430{ 452{
431 struct task_struct *victim = p; 453 struct task_struct *victim = p;
432 struct task_struct *child; 454 struct task_struct *child;
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
442 */ 464 */
443 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
444 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
445 return; 468 return;
446 } 469 }
447 470
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
460 * still freeing memory. 483 * still freeing memory.
461 */ 484 */
485 read_lock(&tasklist_lock);
462 do { 486 do {
463 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points; 488 unsigned int child_points;
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
471 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
472 totalpages); 496 totalpages);
473 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
474 victim = child; 499 victim = child;
475 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
476 } 502 }
477 } 503 }
478 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
479 506
480 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
481 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
482 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
483 518
484 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
485 mm = victim->mm; 520 mm = victim->mm;
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
510 task_unlock(p); 545 task_unlock(p);
511 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
512 } 547 }
548 rcu_read_unlock();
513 549
514 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
515 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
516} 553}
517#undef K 554#undef K
518 555
519/* 556/*
520 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
521 */ 558 */
522static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
523 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
524{ 561{
525 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
526 return; 563 return;
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
533 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
534 return; 571 return;
535 } 572 }
536 read_lock(&tasklist_lock);
537 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
538 read_unlock(&tasklist_lock);
539 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
540 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
541} 576}
542 577
543#ifdef CONFIG_CGROUP_MEM_RES_CTLR
544void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
545 int order)
546{
547 unsigned long limit;
548 unsigned int points = 0;
549 struct task_struct *p;
550
551 /*
552 * If current has a pending SIGKILL, then automatically select it. The
553 * goal is to allow it to allocate so that it may quickly exit and free
554 * its memory.
555 */
556 if (fatal_signal_pending(current)) {
557 set_thread_flag(TIF_MEMDIE);
558 return;
559 }
560
561 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
562 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
563 read_lock(&tasklist_lock);
564 p = select_bad_process(&points, limit, memcg, NULL, false);
565 if (p && PTR_ERR(p) != -1UL)
566 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
567 "Memory cgroup out of memory");
568 read_unlock(&tasklist_lock);
569}
570#endif
571
572static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
573 579
574int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
690 struct task_struct *p; 696 struct task_struct *p;
691 unsigned long totalpages; 697 unsigned long totalpages;
692 unsigned long freed = 0; 698 unsigned long freed = 0;
693 unsigned int points; 699 unsigned int uninitialized_var(points);
694 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
695 int killed = 0; 701 int killed = 0;
696 702
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
718 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
719 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
720 726
721 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
722 if (sysctl_oom_kill_allocating_task &&
723 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
724 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
725 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
726 nodemask, 732 nodemask,
727 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
728 goto out; 734 goto out;
729 } 735 }
730 736
731 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
732 force_kill);
733 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
734 if (!p) { 739 if (!p) {
735 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
736 read_unlock(&tasklist_lock);
737 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
738 } 742 }
739 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
742 killed = 1; 746 killed = 1;
743 } 747 }
744out: 748out:
745 read_unlock(&tasklist_lock);
746
747 /* 749 /*
748 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
749 * retry to allocate memory unless "p" is current 751 * allocate memory again.
750 */ 752 */
751 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
752 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
753} 755}
754 756
755/* 757/*
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void)
764 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
765 clear_system_oom(); 767 clear_system_oom();
766 } 768 }
767 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
768 schedule_timeout_uninterruptible(1);
769} 770}
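After this rewrite the victim-selection path hands out referenced tasks: select_bad_process() takes its reference under rcu_read_lock(), and oom_kill_process() is documented as entered with that reference held, dropping it on every exit path. Any other caller has to follow the same contract, as the oom_kill_allocating_task branch above now does with get_task_struct(current). A hypothetical caller sketch; demo_kill_current() is invented, the oom_kill_process() arguments mirror that branch, and the declaration is assumed to be exported via <linux/oom.h> as the now non-static definition implies.

#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/nodemask.h>

static void demo_kill_current(gfp_t gfp_mask, int order,
                              unsigned long totalpages, nodemask_t *nodemask)
{
        get_task_struct(current);       /* reference consumed by oom_kill_process() */
        oom_kill_process(current, gfp_mask, order, 0, totalpages,
                         NULL, nodemask, "demo: out of memory");
}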
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108..5ad5ce23c1e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h>
37#include <trace/events/writeback.h> 38#include <trace/events/writeback.h>
38 39
39/* 40/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
135 * measured in page writeback completions. 136 * measured in page writeback completions.
136 * 137 *
137 */ 138 */
138static struct prop_descriptor vm_completions; 139static struct fprop_global writeout_completions;
140
141static void writeout_period(unsigned long t);
142/* Timer for aging of writeout_completions */
143static struct timer_list writeout_period_timer =
144 TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
145static unsigned long writeout_period_time = 0;
146
147/*
148 * Length of period for aging writeout fractions of bdis. This is an
149 * arbitrarily chosen number. The longer the period, the slower fractions will
150 * reflect changes in current writeout rate.
151 */
152#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
139 153
140/* 154/*
141 * Work out the current dirty-memory clamping and background writeout 155 * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
322 zone_page_state(zone, NR_WRITEBACK) <= limit; 336 zone_page_state(zone, NR_WRITEBACK) <= limit;
323} 337}
324 338
325/*
326 * couple the period to the dirty_ratio:
327 *
328 * period/2 ~ roundup_pow_of_two(dirty limit)
329 */
330static int calc_period_shift(void)
331{
332 unsigned long dirty_total;
333
334 if (vm_dirty_bytes)
335 dirty_total = vm_dirty_bytes / PAGE_SIZE;
336 else
337 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
338 100;
339 return 2 + ilog2(dirty_total - 1);
340}
341
342/*
343 * update the period when the dirty threshold changes.
344 */
345static void update_completion_period(void)
346{
347 int shift = calc_period_shift();
348 prop_change_shift(&vm_completions, shift);
349
350 writeback_set_ratelimit();
351}
352
353int dirty_background_ratio_handler(struct ctl_table *table, int write, 339int dirty_background_ratio_handler(struct ctl_table *table, int write,
354 void __user *buffer, size_t *lenp, 340 void __user *buffer, size_t *lenp,
355 loff_t *ppos) 341 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
383 369
384 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 370 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 371 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
386 update_completion_period(); 372 writeback_set_ratelimit();
387 vm_dirty_bytes = 0; 373 vm_dirty_bytes = 0;
388 } 374 }
389 return ret; 375 return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
398 384
399 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 385 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
400 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 386 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
401 update_completion_period(); 387 writeback_set_ratelimit();
402 vm_dirty_ratio = 0; 388 vm_dirty_ratio = 0;
403 } 389 }
404 return ret; 390 return ret;
405} 391}
406 392
393static unsigned long wp_next_time(unsigned long cur_time)
394{
395 cur_time += VM_COMPLETIONS_PERIOD_LEN;
396 /* 0 has a special meaning... */
397 if (!cur_time)
398 return 1;
399 return cur_time;
400}
401
407/* 402/*
408 * Increment the BDI's writeout completion count and the global writeout 403 * Increment the BDI's writeout completion count and the global writeout
409 * completion count. Called from test_clear_page_writeback(). 404 * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
411static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 406static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
412{ 407{
413 __inc_bdi_stat(bdi, BDI_WRITTEN); 408 __inc_bdi_stat(bdi, BDI_WRITTEN);
414 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 409 __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
415 bdi->max_prop_frac); 410 bdi->max_prop_frac);
411 /* First event after period switching was turned off? */
412 if (!unlikely(writeout_period_time)) {
413 /*
414 * We can race with other __bdi_writeout_inc calls here but
415 * it does not cause any harm since the resulting time when
416 * timer will fire and what is in writeout_period_time will be
417 * roughly the same.
418 */
419 writeout_period_time = wp_next_time(jiffies);
420 mod_timer(&writeout_period_timer, writeout_period_time);
421 }
416} 422}
417 423
418void bdi_writeout_inc(struct backing_dev_info *bdi) 424void bdi_writeout_inc(struct backing_dev_info *bdi)
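The aging machinery introduced above arms its deferred timer lazily from __bdi_writeout_inc() and treats writeout_period_time == 0 as the "timer not running" sentinel, which is why wp_next_time() never returns 0 even when the jiffies arithmetic wraps. A minimal user-space C model of that sentinel handling (the function name and the 3*HZ period come from the hunk; HZ and the sample values are assumptions for the demo):

#include <stdio.h>

#define HZ                        1000UL
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

/* 0 means "aging timer not armed", so never return it as a timestamp. */
static unsigned long wp_next_time(unsigned long cur_time)
{
        cur_time += VM_COMPLETIONS_PERIOD_LEN;
        if (!cur_time)
                return 1;
        return cur_time;
}

int main(void)
{
        /* Normal case: the next firing time is simply one period later. */
        printf("%lu\n", wp_next_time(10 * HZ));                         /* 13000 */
        /* Wrap case: the addition lands exactly on 0, so nudge to 1. */
        printf("%lu\n", wp_next_time(0UL - VM_COMPLETIONS_PERIOD_LEN)); /* 1 */
        return 0;
}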
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
431static void bdi_writeout_fraction(struct backing_dev_info *bdi, 437static void bdi_writeout_fraction(struct backing_dev_info *bdi,
432 long *numerator, long *denominator) 438 long *numerator, long *denominator)
433{ 439{
434 prop_fraction_percpu(&vm_completions, &bdi->completions, 440 fprop_fraction_percpu(&writeout_completions, &bdi->completions,
435 numerator, denominator); 441 numerator, denominator);
436} 442}
437 443
438/* 444/*
 445 * On an idle system, we can be called long after we scheduled because we use
 446 * deferred timers, so account for missed periods.
447 */
448static void writeout_period(unsigned long t)
449{
450 int miss_periods = (jiffies - writeout_period_time) /
451 VM_COMPLETIONS_PERIOD_LEN;
452
453 if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
454 writeout_period_time = wp_next_time(writeout_period_time +
455 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
456 mod_timer(&writeout_period_timer, writeout_period_time);
457 } else {
458 /*
459 * Aging has zeroed all fractions. Stop wasting CPU on period
460 * updates.
461 */
462 writeout_period_time = 0;
463 }
464}
465
466/*
439 * bdi_min_ratio keeps the sum of the minimum dirty shares of all 467 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
440 * registered backing devices, which, for obvious reasons, can not 468 * registered backing devices, which, for obvious reasons, can not
441 * exceed 100%. 469 * exceed 100%.
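Back in writeout_period() above: because the timer is deferrable, the handler can run long after its nominal expiry, so it folds every missed period into a single fprop_new_period() call and re-arms relative to when it should have fired, or parks the timer once all fractions have aged to zero. A rough stand-alone C sketch of that catch-up arithmetic (fprop_new_period() is stubbed out and the clock values are invented; only the scheduling logic follows the patch):

#include <stdbool.h>
#include <stdio.h>

#define HZ                        1000UL
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

static unsigned long jiffies;              /* fake clock for the demo */
static unsigned long writeout_period_time; /* 0 == timer parked */

/* Stand-in for fprop_new_period(): pretend events remain after aging. */
static bool fprop_new_period_stub(int periods)
{
        printf("aging %d period(s)\n", periods);
        return true;
}

static void writeout_period(void)
{
        int miss_periods = (jiffies - writeout_period_time) /
                           VM_COMPLETIONS_PERIOD_LEN;

        if (fprop_new_period_stub(miss_periods + 1)) {
                /* Re-arm relative to when we should have run, not to now. */
                writeout_period_time += (miss_periods + 1) *
                                        VM_COMPLETIONS_PERIOD_LEN;
                printf("next firing at %lu\n", writeout_period_time);
        } else {
                writeout_period_time = 0;  /* nothing left to age: park timer */
        }
}

int main(void)
{
        writeout_period_time = 3 * HZ;  /* the timer was due at t = 3s */
        jiffies = 11 * HZ;              /* but it actually runs at t = 11s */
        writeout_period();              /* ages 3 periods, re-arms at 12s */
        return 0;
}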
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
475 ret = -EINVAL; 503 ret = -EINVAL;
476 } else { 504 } else {
477 bdi->max_ratio = max_ratio; 505 bdi->max_ratio = max_ratio;
478 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 506 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
479 } 507 }
480 spin_unlock_bh(&bdi_lock); 508 spin_unlock_bh(&bdi_lock);
481 509
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
918 * bdi->dirty_ratelimit = balanced_dirty_ratelimit; 946 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
919 * 947 *
920 * However to get a more stable dirty_ratelimit, the below elaborated 948 * However to get a more stable dirty_ratelimit, the below elaborated
921 * code makes use of task_ratelimit to filter out sigular points and 949 * code makes use of task_ratelimit to filter out singular points and
922 * limit the step size. 950 * limit the step size.
923 * 951 *
924 * The below code essentially only uses the relative value of 952 * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
941 * feel and care are stable dirty rate and small position error. 969 * feel and care are stable dirty rate and small position error.
942 * 970 *
943 * |task_ratelimit - dirty_ratelimit| is used to limit the step size 971 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
944 * and filter out the sigular points of balanced_dirty_ratelimit. Which 972 * and filter out the singular points of balanced_dirty_ratelimit. Which
945 * keeps jumping around randomly and can even leap far away at times 973 * keeps jumping around randomly and can even leap far away at times
946 * due to the small 200ms estimation period of dirty_rate (we want to 974 * due to the small 200ms estimation period of dirty_rate (we want to
947 * keep that period small to reduce time lags). 975 * keep that period small to reduce time lags).
@@ -1504,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
1504 void __user *buffer, size_t *length, loff_t *ppos) 1532 void __user *buffer, size_t *length, loff_t *ppos)
1505{ 1533{
1506 proc_dointvec(table, write, buffer, length, ppos); 1534 proc_dointvec(table, write, buffer, length, ppos);
1507 bdi_arm_supers_timer();
1508 return 0; 1535 return 0;
1509} 1536}
1510 1537
@@ -1606,13 +1633,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
1606 */ 1633 */
1607void __init page_writeback_init(void) 1634void __init page_writeback_init(void)
1608{ 1635{
1609 int shift;
1610
1611 writeback_set_ratelimit(); 1636 writeback_set_ratelimit();
1612 register_cpu_notifier(&ratelimit_nb); 1637 register_cpu_notifier(&ratelimit_nb);
1613 1638
1614 shift = calc_period_shift(); 1639 fprop_global_init(&writeout_completions);
1615 prop_descriptor_init(&vm_completions, shift);
1616} 1640}
1617 1641
1618/** 1642/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683..c66fb875104 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
219 218
220int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
221 220
222static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 227{
224 228
225 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 958 return pages_moved;
955} 959}
956 960
957static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 962 int migratetype)
959{ 963{
960 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1158 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1159 else 1163 else
1160 to_drain = pcp->count; 1164 to_drain = pcp->count;
1161 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1162 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1163 local_irq_restore(flags); 1169 local_irq_restore(flags);
1164} 1170}
1165#endif 1171#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1529} 1535}
1530__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1531 1537
1532static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1533{ 1539{
1534 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1535 return 0; 1541 return false;
1536 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1537 return 0; 1543 return false;
1538 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1539 return 0; 1545 return false;
1540 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1541 return 0; 1547 return false;
1542 1548
1543 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1544} 1550}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1578 1584
1579#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1580 1586
1581static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1582{ 1588{
1583 return 0; 1589 return false;
1584} 1590}
1585 1591
1586#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1594{ 1600{
1595 /* free_pages my go negative - that's OK */ 1601 /* free_pages my go negative - that's OK */
1596 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1597 int o; 1604 int o;
1598 1605
1599 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1602 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1603 min -= min / 4; 1610 min -= min / 4;
1604 1611
1605 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1606 return false; 1613 return false;
1607 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1608 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1617 return true; 1624 return true;
1618} 1625}
1619 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1620bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1621 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1622{ 1643{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1632 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1633 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1634 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1635 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1636 free_pages); 1665 free_pages);
1637} 1666}
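The net effect of nr_zone_isolate_freepages() above is that free pages sitting in MIGRATE_ISOLATE pageblocks no longer count toward the watermark. A small illustrative calculation in C (all numbers are made up; only the subtraction before the check mirrors the patch):

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL  /* pageblock_nr_pages on many x86 configs */

/* Simplified order-0 watermark test: enough free pages above min + reserve? */
static bool zone_watermark_ok(long free_pages, long min, long lowmem_reserve)
{
        return free_pages > min + lowmem_reserve;
}

int main(void)
{
        long free_pages = 4096, min = 1024, lowmem_reserve = 256;
        unsigned long nr_pageblock_isolate = 6;  /* blocks under isolation */

        /* Counting isolated free pages, the watermark still looks fine... */
        printf("raw:      %d\n", zone_watermark_ok(free_pages, min,
                                                   lowmem_reserve));
        /* ...but those pages cannot be allocated from, so discount them. */
        free_pages -= nr_pageblock_isolate * PAGEBLOCK_NR_PAGES;
        printf("adjusted: %d\n", zone_watermark_ok(free_pages, min,
                                                   lowmem_reserve));
        return 0;
}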
@@ -1899,6 +1928,17 @@ this_zone_full:
1899 zlc_active = 0; 1928 zlc_active = 0;
1900 goto zonelist_scan; 1929 goto zonelist_scan;
1901 } 1930 }
1931
1932 if (page)
1933 /*
1934 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
1935 * necessary to allocate the page. The expectation is
1936 * that the caller is taking steps that will free more
1937 * memory. The caller should avoid the page being used
1938 * for !PFMEMALLOC purposes.
1939 */
1940 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1941
1902 return page; 1942 return page;
1903} 1943}
1904 1944
@@ -2062,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2062 struct zonelist *zonelist, enum zone_type high_zoneidx, 2102 struct zonelist *zonelist, enum zone_type high_zoneidx,
2063 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2064 int migratetype, bool sync_migration, 2104 int migratetype, bool sync_migration,
2065 bool *deferred_compaction, 2105 bool *contended_compaction, bool *deferred_compaction,
2066 unsigned long *did_some_progress) 2106 unsigned long *did_some_progress)
2067{ 2107{
2068 struct page *page; 2108 struct page *page;
@@ -2077,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2077 2117
2078 current->flags |= PF_MEMALLOC; 2118 current->flags |= PF_MEMALLOC;
2079 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2080 nodemask, sync_migration); 2120 nodemask, sync_migration,
2121 contended_compaction);
2081 current->flags &= ~PF_MEMALLOC; 2122 current->flags &= ~PF_MEMALLOC;
2082 if (*did_some_progress != COMPACT_SKIPPED) { 2123 if (*did_some_progress != COMPACT_SKIPPED) {
2083 2124
@@ -2087,8 +2128,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2087 2128
2088 page = get_page_from_freelist(gfp_mask, nodemask, 2129 page = get_page_from_freelist(gfp_mask, nodemask,
2089 order, zonelist, high_zoneidx, 2130 order, zonelist, high_zoneidx,
2090 alloc_flags, preferred_zone, 2131 alloc_flags & ~ALLOC_NO_WATERMARKS,
2091 migratetype); 2132 preferred_zone, migratetype);
2092 if (page) { 2133 if (page) {
2093 preferred_zone->compact_considered = 0; 2134 preferred_zone->compact_considered = 0;
2094 preferred_zone->compact_defer_shift = 0; 2135 preferred_zone->compact_defer_shift = 0;
@@ -2123,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2123 struct zonelist *zonelist, enum zone_type high_zoneidx, 2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2124 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2125 int migratetype, bool sync_migration, 2166 int migratetype, bool sync_migration,
2126 bool *deferred_compaction, 2167 bool *contended_compaction, bool *deferred_compaction,
2127 unsigned long *did_some_progress) 2168 unsigned long *did_some_progress)
2128{ 2169{
2129 return NULL; 2170 return NULL;
@@ -2180,8 +2221,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2180retry: 2221retry:
2181 page = get_page_from_freelist(gfp_mask, nodemask, order, 2222 page = get_page_from_freelist(gfp_mask, nodemask, order,
2182 zonelist, high_zoneidx, 2223 zonelist, high_zoneidx,
2183 alloc_flags, preferred_zone, 2224 alloc_flags & ~ALLOC_NO_WATERMARKS,
2184 migratetype); 2225 preferred_zone, migratetype);
2185 2226
2186 /* 2227 /*
2187 * If an allocation failed after direct reclaim, it could be because 2228 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2306,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2265 alloc_flags |= ALLOC_HARDER; 2306 alloc_flags |= ALLOC_HARDER;
2266 2307
2267 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2308 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2268 if (!in_interrupt() && 2309 if (gfp_mask & __GFP_MEMALLOC)
2269 ((current->flags & PF_MEMALLOC) || 2310 alloc_flags |= ALLOC_NO_WATERMARKS;
2270 unlikely(test_thread_flag(TIF_MEMDIE)))) 2311 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2312 alloc_flags |= ALLOC_NO_WATERMARKS;
2313 else if (!in_interrupt() &&
2314 ((current->flags & PF_MEMALLOC) ||
2315 unlikely(test_thread_flag(TIF_MEMDIE))))
2271 alloc_flags |= ALLOC_NO_WATERMARKS; 2316 alloc_flags |= ALLOC_NO_WATERMARKS;
2272 } 2317 }
2273 2318
2274 return alloc_flags; 2319 return alloc_flags;
2275} 2320}
2276 2321
2322bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2323{
2324 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2325}
2326
2277static inline struct page * 2327static inline struct page *
2278__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2328__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2279 struct zonelist *zonelist, enum zone_type high_zoneidx, 2329 struct zonelist *zonelist, enum zone_type high_zoneidx,
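The gfp_to_alloc_flags() hunk above now grants ALLOC_NO_WATERMARKS in three situations, and the new gfp_pfmemalloc_allowed() merely reports whether any of them applies. A condensed user-space restatement of that decision chain (the flag bits and the context booleans are stand-ins; the ordering of the tests follows the hunk):

#include <stdbool.h>
#include <stdio.h>

#define __GFP_MEMALLOC   0x1u   /* illustrative bit values, not the kernel's */
#define __GFP_NOMEMALLOC 0x2u

struct ctx {                      /* stands in for current/task context */
        bool in_serving_softirq;
        bool in_interrupt;
        bool pf_memalloc;         /* current->flags & PF_MEMALLOC */
        bool tif_memdie;          /* OOM victim */
};

/* Returns true when the caller may dip below the zone watermarks. */
static bool gfp_pfmemalloc_allowed(unsigned int gfp_mask, const struct ctx *c)
{
        if (gfp_mask & __GFP_NOMEMALLOC)
                return false;                       /* explicitly forbidden */
        if (gfp_mask & __GFP_MEMALLOC)
                return true;                        /* caller asked for it */
        if (c->in_serving_softirq && c->pf_memalloc)
                return true;                        /* e.g. swap-over-network */
        if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
                return true;                        /* reclaimer or OOM victim */
        return false;
}

int main(void)
{
        struct ctx softirq = { .in_serving_softirq = true, .in_interrupt = true,
                               .pf_memalloc = true };

        printf("%d\n", gfp_pfmemalloc_allowed(0, &softirq));                /* 1 */
        printf("%d\n", gfp_pfmemalloc_allowed(__GFP_NOMEMALLOC, &softirq)); /* 0 */
        return 0;
}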
@@ -2287,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2287 unsigned long did_some_progress; 2337 unsigned long did_some_progress;
2288 bool sync_migration = false; 2338 bool sync_migration = false;
2289 bool deferred_compaction = false; 2339 bool deferred_compaction = false;
2340 bool contended_compaction = false;
2290 2341
2291 /* 2342 /*
2292 * In the slowpath, we sanity check order to avoid ever trying to 2343 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2340,11 +2391,19 @@ rebalance:
2340 2391
2341 /* Allocate without watermarks if the context allows */ 2392 /* Allocate without watermarks if the context allows */
2342 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2393 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2394 /*
2395 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
 2396 * the allocation is high priority and these types of
 2397 * allocations are system rather than user oriented
2398 */
2399 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2400
2343 page = __alloc_pages_high_priority(gfp_mask, order, 2401 page = __alloc_pages_high_priority(gfp_mask, order,
2344 zonelist, high_zoneidx, nodemask, 2402 zonelist, high_zoneidx, nodemask,
2345 preferred_zone, migratetype); 2403 preferred_zone, migratetype);
2346 if (page) 2404 if (page) {
2347 goto got_pg; 2405 goto got_pg;
2406 }
2348 } 2407 }
2349 2408
2350 /* Atomic allocations - we can't balance anything */ 2409 /* Atomic allocations - we can't balance anything */
@@ -2368,6 +2427,7 @@ rebalance:
2368 nodemask, 2427 nodemask,
2369 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2370 migratetype, sync_migration, 2429 migratetype, sync_migration,
2430 &contended_compaction,
2371 &deferred_compaction, 2431 &deferred_compaction,
2372 &did_some_progress); 2432 &did_some_progress);
2373 if (page) 2433 if (page)
@@ -2377,10 +2437,11 @@ rebalance:
2377 /* 2437 /*
2378 * If compaction is deferred for high-order allocations, it is because 2438 * If compaction is deferred for high-order allocations, it is because
2379 * sync compaction recently failed. In this is the case and the caller 2439 * sync compaction recently failed. In this is the case and the caller
2380 * has requested the system not be heavily disrupted, fail the 2440 * requested a movable allocation that does not heavily disrupt the
2381 * allocation now instead of entering direct reclaim 2441 * system then fail the allocation instead of entering direct reclaim.
2382 */ 2442 */
2383 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2443 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD))
2384 goto nopage; 2445 goto nopage;
2385 2446
2386 /* Try direct reclaim and then allocating */ 2447 /* Try direct reclaim and then allocating */
@@ -2451,6 +2512,7 @@ rebalance:
2451 nodemask, 2512 nodemask,
2452 alloc_flags, preferred_zone, 2513 alloc_flags, preferred_zone,
2453 migratetype, sync_migration, 2514 migratetype, sync_migration,
2515 &contended_compaction,
2454 &deferred_compaction, 2516 &deferred_compaction,
2455 &did_some_progress); 2517 &did_some_progress);
2456 if (page) 2518 if (page)
@@ -2463,8 +2525,8 @@ nopage:
2463got_pg: 2525got_pg:
2464 if (kmemcheck_enabled) 2526 if (kmemcheck_enabled)
2465 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2527 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2466 return page;
2467 2528
2529 return page;
2468} 2530}
2469 2531
2470/* 2532/*
@@ -3030,7 +3092,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3030 user_zonelist_order = oldval; 3092 user_zonelist_order = oldval;
3031 } else if (oldval != user_zonelist_order) { 3093 } else if (oldval != user_zonelist_order) {
3032 mutex_lock(&zonelists_mutex); 3094 mutex_lock(&zonelists_mutex);
3033 build_all_zonelists(NULL); 3095 build_all_zonelists(NULL, NULL);
3034 mutex_unlock(&zonelists_mutex); 3096 mutex_unlock(&zonelists_mutex);
3035 } 3097 }
3036 } 3098 }
@@ -3409,14 +3471,21 @@ static void setup_zone_pageset(struct zone *zone);
3409DEFINE_MUTEX(zonelists_mutex); 3471DEFINE_MUTEX(zonelists_mutex);
3410 3472
3411/* return values int ....just for stop_machine() */ 3473/* return values int ....just for stop_machine() */
3412static __init_refok int __build_all_zonelists(void *data) 3474static int __build_all_zonelists(void *data)
3413{ 3475{
3414 int nid; 3476 int nid;
3415 int cpu; 3477 int cpu;
3478 pg_data_t *self = data;
3416 3479
3417#ifdef CONFIG_NUMA 3480#ifdef CONFIG_NUMA
3418 memset(node_load, 0, sizeof(node_load)); 3481 memset(node_load, 0, sizeof(node_load));
3419#endif 3482#endif
3483
3484 if (self && !node_online(self->node_id)) {
3485 build_zonelists(self);
3486 build_zonelist_cache(self);
3487 }
3488
3420 for_each_online_node(nid) { 3489 for_each_online_node(nid) {
3421 pg_data_t *pgdat = NODE_DATA(nid); 3490 pg_data_t *pgdat = NODE_DATA(nid);
3422 3491
@@ -3461,7 +3530,7 @@ static __init_refok int __build_all_zonelists(void *data)
3461 * Called with zonelists_mutex held always 3530 * Called with zonelists_mutex held always
3462 * unless system_state == SYSTEM_BOOTING. 3531 * unless system_state == SYSTEM_BOOTING.
3463 */ 3532 */
3464void __ref build_all_zonelists(void *data) 3533void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3465{ 3534{
3466 set_zonelist_order(); 3535 set_zonelist_order();
3467 3536
@@ -3473,10 +3542,10 @@ void __ref build_all_zonelists(void *data)
3473 /* we have to stop all cpus to guarantee there is no user 3542 /* we have to stop all cpus to guarantee there is no user
3474 of zonelist */ 3543 of zonelist */
3475#ifdef CONFIG_MEMORY_HOTPLUG 3544#ifdef CONFIG_MEMORY_HOTPLUG
3476 if (data) 3545 if (zone)
3477 setup_zone_pageset((struct zone *)data); 3546 setup_zone_pageset(zone);
3478#endif 3547#endif
3479 stop_machine(__build_all_zonelists, NULL, NULL); 3548 stop_machine(__build_all_zonelists, pgdat, NULL);
3480 /* cpuset refresh routine should be here */ 3549 /* cpuset refresh routine should be here */
3481 } 3550 }
3482 vm_total_pages = nr_free_pagecache_pages(); 3551 vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3815,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3746 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3815 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3747#endif 3816#endif
3748 3817
3749static int zone_batchsize(struct zone *zone) 3818static int __meminit zone_batchsize(struct zone *zone)
3750{ 3819{
3751#ifdef CONFIG_MMU 3820#ifdef CONFIG_MMU
3752 int batch; 3821 int batch;
@@ -3828,7 +3897,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3828 pcp->batch = PAGE_SHIFT * 8; 3897 pcp->batch = PAGE_SHIFT * 8;
3829} 3898}
3830 3899
3831static void setup_zone_pageset(struct zone *zone) 3900static void __meminit setup_zone_pageset(struct zone *zone)
3832{ 3901{
3833 int cpu; 3902 int cpu;
3834 3903
@@ -3901,32 +3970,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3901 return 0; 3970 return 0;
3902} 3971}
3903 3972
3904static int __zone_pcp_update(void *data)
3905{
3906 struct zone *zone = data;
3907 int cpu;
3908 unsigned long batch = zone_batchsize(zone), flags;
3909
3910 for_each_possible_cpu(cpu) {
3911 struct per_cpu_pageset *pset;
3912 struct per_cpu_pages *pcp;
3913
3914 pset = per_cpu_ptr(zone->pageset, cpu);
3915 pcp = &pset->pcp;
3916
3917 local_irq_save(flags);
3918 free_pcppages_bulk(zone, pcp->count, pcp);
3919 setup_pageset(pset, batch);
3920 local_irq_restore(flags);
3921 }
3922 return 0;
3923}
3924
3925void zone_pcp_update(struct zone *zone)
3926{
3927 stop_machine(__zone_pcp_update, zone, NULL);
3928}
3929
3930static __meminit void zone_pcp_init(struct zone *zone) 3973static __meminit void zone_pcp_init(struct zone *zone)
3931{ 3974{
3932 /* 3975 /*
@@ -3942,7 +3985,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3942 zone_batchsize(zone)); 3985 zone_batchsize(zone));
3943} 3986}
3944 3987
3945__meminit int init_currently_empty_zone(struct zone *zone, 3988int __meminit init_currently_empty_zone(struct zone *zone,
3946 unsigned long zone_start_pfn, 3989 unsigned long zone_start_pfn,
3947 unsigned long size, 3990 unsigned long size,
3948 enum memmap_context context) 3991 enum memmap_context context)
@@ -4301,7 +4344,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4344#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4345
4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4346/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4304static inline void __init set_pageblock_order(void) 4347void __init set_pageblock_order(void)
4305{ 4348{
4306 unsigned int order; 4349 unsigned int order;
4307 4350
@@ -4329,7 +4372,7 @@ static inline void __init set_pageblock_order(void)
4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4372 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4330 * the kernel config 4373 * the kernel config
4331 */ 4374 */
4332static inline void set_pageblock_order(void) 4375void __init set_pageblock_order(void)
4333{ 4376{
4334} 4377}
4335 4378
@@ -4340,6 +4383,8 @@ static inline void set_pageblock_order(void)
4340 * - mark all pages reserved 4383 * - mark all pages reserved
4341 * - mark all memory queues empty 4384 * - mark all memory queues empty
4342 * - clear the memory bitmaps 4385 * - clear the memory bitmaps
4386 *
4387 * NOTE: pgdat should get zeroed by caller.
4343 */ 4388 */
4344static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4389static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4345 unsigned long *zones_size, unsigned long *zholes_size) 4390 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4395,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4350 int ret; 4395 int ret;
4351 4396
4352 pgdat_resize_init(pgdat); 4397 pgdat_resize_init(pgdat);
4353 pgdat->nr_zones = 0;
4354 init_waitqueue_head(&pgdat->kswapd_wait); 4398 init_waitqueue_head(&pgdat->kswapd_wait);
4355 pgdat->kswapd_max_order = 0; 4399 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4356 pgdat_page_cgroup_init(pgdat); 4400 pgdat_page_cgroup_init(pgdat);
4357 4401
4358 for (j = 0; j < MAX_NR_ZONES; j++) { 4402 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4438,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4394 4438
4395 zone->spanned_pages = size; 4439 zone->spanned_pages = size;
4396 zone->present_pages = realsize; 4440 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4397#ifdef CONFIG_NUMA 4446#ifdef CONFIG_NUMA
4398 zone->node = nid; 4447 zone->node = nid;
4399 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4457,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4408 4457
4409 zone_pcp_init(zone); 4458 zone_pcp_init(zone);
4410 lruvec_init(&zone->lruvec, zone); 4459 lruvec_init(&zone->lruvec, zone);
4411 zap_zone_vm_stats(zone);
4412 zone->flags = 0;
4413 if (!size) 4460 if (!size)
4414 continue; 4461 continue;
4415 4462
@@ -4469,6 +4516,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4469{ 4516{
4470 pg_data_t *pgdat = NODE_DATA(nid); 4517 pg_data_t *pgdat = NODE_DATA(nid);
4471 4518
4519 /* pg_data_t should be reset to zero when it's allocated */
4520 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4521
4472 pgdat->node_id = nid; 4522 pgdat->node_id = nid;
4473 pgdat->node_start_pfn = node_start_pfn; 4523 pgdat->node_start_pfn = node_start_pfn;
4474 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4524 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4800,7 @@ out:
4750} 4800}
4751 4801
4752/* Any regular memory on that node ? */ 4802/* Any regular memory on that node ? */
4753static void check_for_regular_memory(pg_data_t *pgdat) 4803static void __init check_for_regular_memory(pg_data_t *pgdat)
4754{ 4804{
4755#ifdef CONFIG_HIGHMEM 4805#ifdef CONFIG_HIGHMEM
4756 enum zone_type zone_type; 4806 enum zone_type zone_type;
@@ -5468,26 +5518,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5468} 5518}
5469 5519
5470/* 5520/*
5471 * This is designed as sub function...plz see page_isolation.c also. 5521 * This function checks whether pageblock includes unmovable pages or not.
5472 * set/clear page block's type to be ISOLATE. 5522 * If @count is not zero, it is okay to include fewer than @count unmovable pages
5473 * page allocater never alloc memory from ISOLATE block. 5523 *
 5524 * PageLRU check without isolation or lru_lock could race so that
5525 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
 5526 * expect this function to be exact.
5474 */ 5527 */
5475 5528bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5476static int
5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5478{ 5529{
5479 unsigned long pfn, iter, found; 5530 unsigned long pfn, iter, found;
5480 int mt; 5531 int mt;
5481 5532
5482 /* 5533 /*
5483 * For avoiding noise data, lru_add_drain_all() should be called 5534 * For avoiding noise data, lru_add_drain_all() should be called
5484 * If ZONE_MOVABLE, the zone never contains immobile pages 5535 * If ZONE_MOVABLE, the zone never contains unmovable pages
5485 */ 5536 */
5486 if (zone_idx(zone) == ZONE_MOVABLE) 5537 if (zone_idx(zone) == ZONE_MOVABLE)
5487 return true; 5538 return false;
5488 mt = get_pageblock_migratetype(page); 5539 mt = get_pageblock_migratetype(page);
5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5540 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5490 return true; 5541 return false;
5491 5542
5492 pfn = page_to_pfn(page); 5543 pfn = page_to_pfn(page);
5493 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5544 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5548,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5497 continue; 5548 continue;
5498 5549
5499 page = pfn_to_page(check); 5550 page = pfn_to_page(check);
5500 if (!page_count(page)) { 5551 /*
 5552 * We can't use page_count without pinning the page
 5553 * because another CPU can free the compound page.
 5554 * This check already skips compound tails of THP
 5555 * because their page->_count is zero at all times.
5556 */
5557 if (!atomic_read(&page->_count)) {
5501 if (PageBuddy(page)) 5558 if (PageBuddy(page))
5502 iter += (1 << page_order(page)) - 1; 5559 iter += (1 << page_order(page)) - 1;
5503 continue; 5560 continue;
5504 } 5561 }
5562
5505 if (!PageLRU(page)) 5563 if (!PageLRU(page))
5506 found++; 5564 found++;
5507 /* 5565 /*
@@ -5518,9 +5576,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5518 * page at boot. 5576 * page at boot.
5519 */ 5577 */
5520 if (found > count) 5578 if (found > count)
5521 return false; 5579 return true;
5522 } 5580 }
5523 return true; 5581 return false;
5524} 5582}
5525 5583
5526bool is_pageblock_removable_nolock(struct page *page) 5584bool is_pageblock_removable_nolock(struct page *page)
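The rewritten helper above answers "does this pageblock contain unmovable pages?" directly: free buddy chunks are skipped by their order, LRU pages count as movable, and up to @count other pages are tolerated. A toy C model of that scan over an array standing in for a pageblock (the page states and tolerance values are invented for the demo):

#include <stdbool.h>
#include <stdio.h>

enum fake_page_state { FREE_BUDDY, ON_LRU, UNMOVABLE };

struct fake_page {
        enum fake_page_state state;
        int order;              /* only meaningful for FREE_BUDDY */
};

/* true if more than 'count' pages in the block are neither free nor on LRU */
static bool has_unmovable_pages(const struct fake_page *blk, int nr, int count)
{
        int found = 0;

        for (int i = 0; i < nr; i++) {
                if (blk[i].state == FREE_BUDDY) {
                        /* the whole buddy chunk is free: skip it in one step */
                        i += (1 << blk[i].order) - 1;
                        continue;
                }
                if (blk[i].state != ON_LRU)
                        found++;
                if (found > count)
                        return true;
        }
        return false;
}

int main(void)
{
        struct fake_page blk[8] = {
                { FREE_BUDDY, 2 }, {0}, {0}, {0},   /* one order-2 free chunk */
                { ON_LRU, 0 }, { ON_LRU, 0 },
                { UNMOVABLE, 0 }, { ON_LRU, 0 },
        };

        printf("%d\n", has_unmovable_pages(blk, 8, 0));  /* 1: one pinned page */
        printf("%d\n", has_unmovable_pages(blk, 8, 1));  /* 0: within tolerance */
        return 0;
}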
@@ -5544,77 +5602,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5544 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5602 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5545 return false; 5603 return false;
5546 5604
5547 return __count_immobile_pages(zone, page, 0); 5605 return !has_unmovable_pages(zone, page, 0);
5548}
5549
5550int set_migratetype_isolate(struct page *page)
5551{
5552 struct zone *zone;
5553 unsigned long flags, pfn;
5554 struct memory_isolate_notify arg;
5555 int notifier_ret;
5556 int ret = -EBUSY;
5557
5558 zone = page_zone(page);
5559
5560 spin_lock_irqsave(&zone->lock, flags);
5561
5562 pfn = page_to_pfn(page);
5563 arg.start_pfn = pfn;
5564 arg.nr_pages = pageblock_nr_pages;
5565 arg.pages_found = 0;
5566
5567 /*
5568 * It may be possible to isolate a pageblock even if the
5569 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5570 * notifier chain is used by balloon drivers to return the
5571 * number of pages in a range that are held by the balloon
5572 * driver to shrink memory. If all the pages are accounted for
5573 * by balloons, are free, or on the LRU, isolation can continue.
5574 * Later, for example, when memory hotplug notifier runs, these
5575 * pages reported as "can be isolated" should be isolated(freed)
5576 * by the balloon driver through the memory notifier chain.
5577 */
5578 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5579 notifier_ret = notifier_to_errno(notifier_ret);
5580 if (notifier_ret)
5581 goto out;
5582 /*
5583 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5584 * We just check MOVABLE pages.
5585 */
5586 if (__count_immobile_pages(zone, page, arg.pages_found))
5587 ret = 0;
5588
5589 /*
5590 * immobile means "not-on-lru" paes. If immobile is larger than
5591 * removable-by-driver pages reported by notifier, we'll fail.
5592 */
5593
5594out:
5595 if (!ret) {
5596 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5597 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5598 }
5599
5600 spin_unlock_irqrestore(&zone->lock, flags);
5601 if (!ret)
5602 drain_all_pages();
5603 return ret;
5604}
5605
5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5607{
5608 struct zone *zone;
5609 unsigned long flags;
5610 zone = page_zone(page);
5611 spin_lock_irqsave(&zone->lock, flags);
5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5613 goto out;
5614 set_pageblock_migratetype(page, migratetype);
5615 move_freepages_block(zone, page, migratetype);
5616out:
5617 spin_unlock_irqrestore(&zone->lock, flags);
5618} 5606}
5619 5607
5620#ifdef CONFIG_CMA 5608#ifdef CONFIG_CMA
@@ -5869,7 +5857,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
5869} 5857}
5870#endif 5858#endif
5871 5859
5860#ifdef CONFIG_MEMORY_HOTPLUG
5861static int __meminit __zone_pcp_update(void *data)
5862{
5863 struct zone *zone = data;
5864 int cpu;
5865 unsigned long batch = zone_batchsize(zone), flags;
5866
5867 for_each_possible_cpu(cpu) {
5868 struct per_cpu_pageset *pset;
5869 struct per_cpu_pages *pcp;
5870
5871 pset = per_cpu_ptr(zone->pageset, cpu);
5872 pcp = &pset->pcp;
5873
5874 local_irq_save(flags);
5875 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp);
5877 setup_pageset(pset, batch);
5878 local_irq_restore(flags);
5879 }
5880 return 0;
5881}
5882
5883void __meminit zone_pcp_update(struct zone *zone)
5884{
5885 stop_machine(__zone_pcp_update, zone, NULL);
5886}
5887#endif
5888
5872#ifdef CONFIG_MEMORY_HOTREMOVE 5889#ifdef CONFIG_MEMORY_HOTREMOVE
5890void zone_pcp_reset(struct zone *zone)
5891{
5892 unsigned long flags;
5893
5894 /* avoid races with drain_pages() */
5895 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) {
5897 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset;
5899 }
5900 local_irq_restore(flags);
5901}
5902
5873/* 5903/*
5874 * All pages in the range must be isolated before calling this. 5904 * All pages in the range must be isolated before calling this.
5875 */ 5905 */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index eb750f85139..5ddad0c6daa 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
diff --git a/mm/page_io.c b/mm/page_io.c
index 34f02923744..78eee32ee48 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,6 +17,7 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
21#include <linux/frontswap.h> 22#include <linux/frontswap.h>
22#include <asm/pgtable.h> 23#include <asm/pgtable.h>
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
86 bio_put(bio); 87 bio_put(bio);
87} 88}
88 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
89/* 182/*
90 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
91 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
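generic_swapfile_activate() above walks the swap file with bmap(), rejecting any page whose backing blocks are not PAGE_SIZE aligned or not contiguous, and records each good run as one swap extent. A compact user-space model of that alignment and contiguity test, driven by a fake block map instead of bmap() (the array contents and block size are invented):

#include <stdio.h>

#define BLOCKS_PER_PAGE 8   /* e.g. 4096-byte page on a 512-byte block device */

/* Fake bmap(): logical block index -> physical block (0 would mean a hole). */
static const unsigned long block_map[] = {
        /* page 0: aligned and contiguous     */ 64, 65, 66, 67, 68, 69, 70, 71,
        /* page 1: start not a multiple of 8  */ 73, 74, 75, 76, 77, 78, 79, 80,
};

int main(void)
{
        unsigned long nr_blocks = sizeof(block_map) / sizeof(block_map[0]);

        for (unsigned long probe = 0; probe + BLOCKS_PER_PAGE <= nr_blocks;
             probe += BLOCKS_PER_PAGE) {
                unsigned long first = block_map[probe];
                int ok = (first % BLOCKS_PER_PAGE) == 0;  /* PAGE_SIZE aligned */

                for (int i = 1; ok && i < BLOCKS_PER_PAGE; i++)
                        if (block_map[probe + i] != first + i)
                                ok = 0;                   /* discontiguity */

                printf("page %lu: %s\n", probe / BLOCKS_PER_PAGE,
                       ok ? "usable as one extent" : "skipped");
        }
        return 0;
}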
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
94{ 187{
95 struct bio *bio; 188 struct bio *bio;
96 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
97 191
98 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
99 unlock_page(page); 193 unlock_page(page);
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
105 end_page_writeback(page); 199 end_page_writeback(page);
106 goto out; 200 goto out;
107 } 201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
109 if (bio == NULL) { 230 if (bio == NULL) {
110 set_page_dirty(page); 231 set_page_dirty(page);
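For SWP_FILE areas the write path added above bypasses the bio layer and hands the page to the filesystem's ->direct_IO at the page's file offset, counting anything short of a full PAGE_SIZE transfer as failure. A loose user-space analogue using pwrite() (the file name, slot numbers and PAGE_SZ constant are invented; only the "full write or error" shape mirrors the patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SZ 4096

/* Write one "page" of a fake swap file at the slot's byte offset. */
static int swapfile_write_page(int fd, const void *page, off_t slot)
{
        ssize_t ret = pwrite(fd, page, PAGE_SZ, slot * PAGE_SZ);

        /* Mirror the patch: only a full page-sized transfer counts as success. */
        return ret == PAGE_SZ ? 0 : -1;
}

int main(void)
{
        char page[PAGE_SZ];
        int fd = open("fake_swapfile.img", O_RDWR | O_CREAT, 0600);

        if (fd < 0)
                return 1;
        memset(page, 0xaa, sizeof(page));
        printf("slot 3: %s\n",
               swapfile_write_page(fd, page, 3) ? "failed" : "written");
        close(fd);
        return 0;
}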
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page)
126{ 247{
127 struct bio *bio; 248 struct bio *bio;
128 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
129 251
130 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
131 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page)
134 unlock_page(page); 256 unlock_page(page);
135 goto out; 257 goto out;
136 } 258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
138 if (bio == NULL) { 271 if (bio == NULL) {
139 unlock_page(page); 272 unlock_page(page);
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page)
145out: 278out:
146 return ret; 279 return ret;
147} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b..247d1f17573 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
 73 * immobile means "not-on-lru" pages. If immobile is larger than
74 * removable-by-driver pages reported by notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
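The two helpers moved into this file keep zone->nr_pageblock_isolate in step with the pageblock's migratetype, always under zone->lock, so the watermark code can discount isolated free pages cheaply. A stripped-down C model of that counter discipline, with a mutex standing in for the zone spinlock (the types and field names are simplified):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct fake_zone {
        pthread_mutex_t lock;          /* stands in for zone->lock */
        long nr_pageblock_isolate;
};

struct fake_block {
        enum migratetype mt;
};

static void set_pageblock_isolate(struct fake_zone *z, struct fake_block *b)
{
        pthread_mutex_lock(&z->lock);
        if (b->mt != MIGRATE_ISOLATE) {     /* count each block only once */
                b->mt = MIGRATE_ISOLATE;
                z->nr_pageblock_isolate++;
        }
        pthread_mutex_unlock(&z->lock);
}

static void restore_pageblock_isolate(struct fake_zone *z, struct fake_block *b,
                                      enum migratetype mt)
{
        pthread_mutex_lock(&z->lock);
        if (b->mt == MIGRATE_ISOLATE) {     /* only undo a real isolation */
                assert(z->nr_pageblock_isolate > 0);
                b->mt = mt;
                z->nr_pageblock_isolate--;
        }
        pthread_mutex_unlock(&z->lock);
}

int main(void)
{
        struct fake_zone z = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct fake_block b = { MIGRATE_MOVABLE };

        set_pageblock_isolate(&z, &b);
        set_pageblock_isolate(&z, &b);            /* second call is a no-op */
        printf("isolated blocks: %ld\n", z.nr_pageblock_isolate);  /* 1 */
        restore_pageblock_isolate(&z, &b, MIGRATE_MOVABLE);
        printf("isolated blocks: %ld\n", z.nr_pageblock_isolate);  /* 0 */
        return 0;
}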
diff --git a/mm/shmem.c b/mm/shmem.c
index bd106361be4..d4e184e2a38 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
929 929
930 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
931 pvma.vm_start = 0; 931 pvma.vm_start = 0;
932 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
933 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
934 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
935 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
942 943
943 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
944 pvma.vm_start = 0; 945 pvma.vm_start = 0;
945 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
946 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
947 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
948 950
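Biasing vm_pgoff by the inode number shifts where each file's interleaving starts, so many small tmpfs files no longer all begin allocating on the same node. A back-of-the-envelope C illustration of the effect (the node count, inode numbers and the simple modulo are assumptions; real interleaving goes through the mempolicy code):

#include <stdio.h>

#define NR_NODES 4

/* Crude stand-in for interleave node selection: node = pgoff % nr_nodes. */
static int interleave_node(unsigned long index, unsigned long ino)
{
        return (index + ino) % NR_NODES;    /* with the bias from the patch */
}

int main(void)
{
        /* Page 0 of four different single-page files: */
        for (unsigned long ino = 100; ino < 104; ino++)
                printf("inode %lu -> node %d\n", ino, interleave_node(0, ino));
        /* Without the bias (index alone) all four would land on node 0. */
        return 0;
}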
@@ -1877,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1877} 1879}
1878 1880
1879static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1881static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1880 struct nameidata *nd) 1882 bool excl)
1881{ 1883{
1882 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1884 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1883} 1885}
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e252..811af03a14e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -87,6 +87,7 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
90#include <linux/mm.h> 91#include <linux/mm.h>
91#include <linux/poison.h> 92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
@@ -117,12 +118,16 @@
117#include <linux/memory.h> 118#include <linux/memory.h>
118#include <linux/prefetch.h> 119#include <linux/prefetch.h>
119 120
121#include <net/sock.h>
122
120#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
121#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
122#include <asm/page.h> 125#include <asm/page.h>
123 126
124#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
125 128
129#include "internal.h"
130
126/* 131/*
127 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
128 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -151,6 +156,12 @@
151#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
152#endif 157#endif
153 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
154/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
155#if DEBUG 166#if DEBUG
156# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -256,9 +267,30 @@ struct array_cache {
256 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
257 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
258 * the entries. 269 * the entries.
270 *
271 * Entries should not be directly dereferenced as
272 * entries belonging to slabs marked pfmemalloc will
 273 * have their low bit set to SLAB_OBJ_PFMEMALLOC
259 */ 274 */
260}; 275};
261 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
262/* 294/*
263 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
264 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
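Tagging pfmemalloc objects by setting bit 0 of the pointer stored in the array_cache works because slab objects are at least word aligned, so that bit is otherwise always clear. A self-contained C demo of the same tag, test and untag helpers applied to an ordinary allocation (only the bit trick itself is taken from the hunk):

#include <stdio.h>
#include <stdlib.h>

#define SLAB_OBJ_PFMEMALLOC 1UL   /* stored in the otherwise-unused low bit */

static int is_obj_pfmemalloc(void *objp)
{
        return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static void set_obj_pfmemalloc(void **objp)
{
        *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
}

static void clear_obj_pfmemalloc(void **objp)
{
        *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

int main(void)
{
        /* malloc() returns suitably aligned memory, so bit 0 starts out 0. */
        void *obj = malloc(64);
        void *entry = obj;                 /* what the array_cache would hold */

        set_obj_pfmemalloc(&entry);
        printf("tagged:   %d\n", is_obj_pfmemalloc(entry));   /* 1 */
        clear_obj_pfmemalloc(&entry);
        printf("untagged: %d (%s)\n", is_obj_pfmemalloc(entry),
               entry == obj ? "pointer restored" : "corrupted");
        free(obj);
        return 0;
}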
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)
424 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 456 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
425 * redzone word. 457 * redzone word.
426 * cachep->obj_offset: The real object. 458 * cachep->obj_offset: The real object.
427 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 459 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
428 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 460 * cachep->size - 1* BYTES_PER_WORD: last caller address
429 * [BYTES_PER_WORD long] 461 * [BYTES_PER_WORD long]
430 */ 462 */
431static int obj_offset(struct kmem_cache *cachep) 463static int obj_offset(struct kmem_cache *cachep)
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep)
433 return cachep->obj_offset; 465 return cachep->obj_offset;
434} 466}
435 467
436static int obj_size(struct kmem_cache *cachep)
437{
438 return cachep->obj_size;
439}
440
441static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 468static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
442{ 469{
443 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 470 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
449{ 476{
450 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 477 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
451 if (cachep->flags & SLAB_STORE_USER) 478 if (cachep->flags & SLAB_STORE_USER)
452 return (unsigned long long *)(objp + cachep->buffer_size - 479 return (unsigned long long *)(objp + cachep->size -
453 sizeof(unsigned long long) - 480 sizeof(unsigned long long) -
454 REDZONE_ALIGN); 481 REDZONE_ALIGN);
455 return (unsigned long long *) (objp + cachep->buffer_size - 482 return (unsigned long long *) (objp + cachep->size -
456 sizeof(unsigned long long)); 483 sizeof(unsigned long long));
457} 484}
458 485
459static void **dbg_userword(struct kmem_cache *cachep, void *objp) 486static void **dbg_userword(struct kmem_cache *cachep, void *objp)
460{ 487{
461 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 488 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
462 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 489 return (void **)(objp + cachep->size - BYTES_PER_WORD);
463} 490}
464 491
465#else 492#else
466 493
467#define obj_offset(x) 0 494#define obj_offset(x) 0
468#define obj_size(cachep) (cachep->buffer_size)
469#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 495#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
470#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 496#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
471#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 497#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
475#ifdef CONFIG_TRACING 501#ifdef CONFIG_TRACING
476size_t slab_buffer_size(struct kmem_cache *cachep) 502size_t slab_buffer_size(struct kmem_cache *cachep)
477{ 503{
478 return cachep->buffer_size; 504 return cachep->size;
479} 505}
480EXPORT_SYMBOL(slab_buffer_size); 506EXPORT_SYMBOL(slab_buffer_size);
481#endif 507#endif
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size);
489static int slab_max_order = SLAB_MAX_ORDER_LO; 515static int slab_max_order = SLAB_MAX_ORDER_LO;
490static bool slab_max_order_set __initdata; 516static bool slab_max_order_set __initdata;
491 517
492/*
493 * Functions for storing/retrieving the cachep and or slab from the page
494 * allocator. These are used to find the slab an obj belongs to. With kfree(),
495 * these are used to find the cache which an obj belongs to.
496 */
497static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
498{
499 page->lru.next = (struct list_head *)cache;
500}
501
502static inline struct kmem_cache *page_get_cache(struct page *page) 518static inline struct kmem_cache *page_get_cache(struct page *page)
503{ 519{
504 page = compound_head(page); 520 page = compound_head(page);
505 BUG_ON(!PageSlab(page)); 521 BUG_ON(!PageSlab(page));
506 return (struct kmem_cache *)page->lru.next; 522 return page->slab_cache;
507}
508
509static inline void page_set_slab(struct page *page, struct slab *slab)
510{
511 page->lru.prev = (struct list_head *)slab;
512}
513
514static inline struct slab *page_get_slab(struct page *page)
515{
516 BUG_ON(!PageSlab(page));
517 return (struct slab *)page->lru.prev;
518} 523}
519 524
520static inline struct kmem_cache *virt_to_cache(const void *obj) 525static inline struct kmem_cache *virt_to_cache(const void *obj)
521{ 526{
522 struct page *page = virt_to_head_page(obj); 527 struct page *page = virt_to_head_page(obj);
523 return page_get_cache(page); 528 return page->slab_cache;
524} 529}
525 530
526static inline struct slab *virt_to_slab(const void *obj) 531static inline struct slab *virt_to_slab(const void *obj)
527{ 532{
528 struct page *page = virt_to_head_page(obj); 533 struct page *page = virt_to_head_page(obj);
529 return page_get_slab(page); 534
535 VM_BUG_ON(!PageSlab(page));
536 return page->slab_page;
530} 537}
531 538
532static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 539static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
533 unsigned int idx) 540 unsigned int idx)
534{ 541{
535 return slab->s_mem + cache->buffer_size * idx; 542 return slab->s_mem + cache->size * idx;
536} 543}
537 544
538/* 545/*
539 * We want to avoid an expensive divide : (offset / cache->buffer_size) 546 * We want to avoid an expensive divide : (offset / cache->size)
540 * Using the fact that buffer_size is a constant for a particular cache, 547 * Using the fact that size is a constant for a particular cache,
541 * we can replace (offset / cache->buffer_size) by 548 * we can replace (offset / cache->size) by
542 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 549 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
543 */ 550 */
544static inline unsigned int obj_to_index(const struct kmem_cache *cache, 551static inline unsigned int obj_to_index(const struct kmem_cache *cache,
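The comment above obj_to_index() explains why the reciprocal_buffer_size field exists: a per-object division is replaced by a multiply and shift against a precomputed reciprocal. A minimal standalone sketch of that trick, assuming 32-bit dividends, a constant divisor of at least 2, and the small offsets a slab actually deals with (the kernel's own helpers live in include/linux/reciprocal_div.h and handle more corner cases):

#include <stdint.h>
#include <assert.h>

/* Precompute once per cache: R = ceil(2^32 / divisor); divisor must be >= 2. */
static uint32_t reciprocal_of(uint32_t divisor)
{
	return (uint32_t)((((uint64_t)1 << 32) + divisor - 1) / divisor);
}

/* Replace a / divisor by a multiply and a shift. */
static uint32_t fast_divide(uint32_t a, uint32_t reciprocal)
{
	return (uint32_t)(((uint64_t)a * reciprocal) >> 32);
}

int main(void)
{
	uint32_t obj_size = 192;		/* hypothetical cache->size      */
	uint32_t r = reciprocal_of(obj_size);
	uint32_t offset = 5 * obj_size;		/* offset of the sixth object    */

	assert(fast_divide(offset, r) == offset / obj_size);
	return 0;
}

In the hunk above, reciprocal_buffer_size caches such a precomputed value so obj_to_index() never issues a hardware divide on the allocation path.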
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = {
584 .batchcount = 1, 591 .batchcount = 1,
585 .limit = BOOT_CPUCACHE_ENTRIES, 592 .limit = BOOT_CPUCACHE_ENTRIES,
586 .shared = 1, 593 .shared = 1,
587 .buffer_size = sizeof(struct kmem_cache), 594 .size = sizeof(struct kmem_cache),
588 .name = "kmem_cache", 595 .name = "kmem_cache",
589}; 596};
590 597
591#define BAD_ALIEN_MAGIC 0x01020304ul 598#define BAD_ALIEN_MAGIC 0x01020304ul
592 599
593/*
594 * chicken and egg problem: delay the per-cpu array allocation
595 * until the general caches are up.
596 */
597static enum {
598 NONE,
599 PARTIAL_AC,
600 PARTIAL_L3,
601 EARLY,
602 LATE,
603 FULL
604} g_cpucache_up;
605
606/*
607 * used by boot code to determine if it can use slab based allocator
608 */
609int slab_is_available(void)
610{
611 return g_cpucache_up >= EARLY;
612}
613
614#ifdef CONFIG_LOCKDEP 600#ifdef CONFIG_LOCKDEP
615 601
616/* 602/*
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q)
676{ 662{
677 struct cache_sizes *s = malloc_sizes; 663 struct cache_sizes *s = malloc_sizes;
678 664
679 if (g_cpucache_up < LATE) 665 if (slab_state < UP)
680 return; 666 return;
681 667
682 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 668 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
716} 702}
717#endif 703#endif
718 704
719/*
720 * Guard access to the cache-chain.
721 */
722static DEFINE_MUTEX(cache_chain_mutex);
723static struct list_head cache_chain;
724
725static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 705static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
726 706
727static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 707static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
951 return nc; 931 return nc;
952} 932}
953 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
 941/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
954/* 1052/*
955 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
956 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
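The new __ac_get_obj()/__ac_put_obj() helpers above depend on is_obj_pfmemalloc(), set_obj_pfmemalloc() and clear_obj_pfmemalloc(), which this hunk does not show; they remember the PFMEMALLOC property in the object pointer itself. A hedged, standalone sketch of that low-bit pointer-tagging idea (names here are hypothetical):

#include <stdint.h>
#include <stdbool.h>

#define OBJ_PFMEMALLOC 0x1UL

/*
 * Slab objects are at least word aligned, so bit 0 of a valid object
 * address is always zero and can carry a boolean flag.
 */
static inline bool obj_is_pfmemalloc(void *objp)
{
	return (uintptr_t)objp & OBJ_PFMEMALLOC;
}

static inline void obj_set_pfmemalloc(void **objp)
{
	*objp = (void *)((uintptr_t)*objp | OBJ_PFMEMALLOC);
}

static inline void obj_clear_pfmemalloc(void **objp)
{
	*objp = (void *)((uintptr_t)*objp & ~OBJ_PFMEMALLOC);
}

The tag must be stripped (as free_block() does later in this patch with clear_obj_pfmemalloc()) before the pointer is dereferenced or handed back to a caller.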
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1127 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1128 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1129 } 1227 }
1130 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1131 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1132 } else { 1230 } else {
1133 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1145 * When hotplugging memory or a cpu, existing nodelists are not replaced if 1243 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1146 * already in use. 1244 * already in use.
1147 * 1245 *
1148 * Must hold cache_chain_mutex. 1246 * Must hold slab_mutex.
1149 */ 1247 */
1150static int init_cache_nodelists_node(int node) 1248static int init_cache_nodelists_node(int node)
1151{ 1249{
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node)
1153 struct kmem_list3 *l3; 1251 struct kmem_list3 *l3;
1154 const int memsize = sizeof(struct kmem_list3); 1252 const int memsize = sizeof(struct kmem_list3);
1155 1253
1156 list_for_each_entry(cachep, &cache_chain, next) { 1254 list_for_each_entry(cachep, &slab_caches, list) {
1157 /* 1255 /*
1158 * Set up the size64 kmemlist for cpu before we can 1256 * Set up the size64 kmemlist for cpu before we can
1159 * begin anything. Make sure some other cpu on this 1257 * begin anything. Make sure some other cpu on this
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node)
1169 1267
1170 /* 1268 /*
1171 * The l3s don't come and go as CPUs come and 1269 * The l3s don't come and go as CPUs come and
1172 * go. cache_chain_mutex is sufficient 1270 * go. slab_mutex is sufficient
1173 * protection here. 1271 * protection here.
1174 */ 1272 */
1175 cachep->nodelists[node] = l3; 1273 cachep->nodelists[node] = l3;
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1191 int node = cpu_to_mem(cpu); 1289 int node = cpu_to_mem(cpu);
1192 const struct cpumask *mask = cpumask_of_node(node); 1290 const struct cpumask *mask = cpumask_of_node(node);
1193 1291
1194 list_for_each_entry(cachep, &cache_chain, next) { 1292 list_for_each_entry(cachep, &slab_caches, list) {
1195 struct array_cache *nc; 1293 struct array_cache *nc;
1196 struct array_cache *shared; 1294 struct array_cache *shared;
1197 struct array_cache **alien; 1295 struct array_cache **alien;
@@ -1241,7 +1339,7 @@ free_array_cache:
1241 * the respective cache's slabs, now we can go ahead and 1339 * the respective cache's slabs, now we can go ahead and
1242 * shrink each nodelist to its limit. 1340 * shrink each nodelist to its limit.
1243 */ 1341 */
1244 list_for_each_entry(cachep, &cache_chain, next) { 1342 list_for_each_entry(cachep, &slab_caches, list) {
1245 l3 = cachep->nodelists[node]; 1343 l3 = cachep->nodelists[node];
1246 if (!l3) 1344 if (!l3)
1247 continue; 1345 continue;
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1270 * Now we can go ahead with allocating the shared arrays and 1368 * Now we can go ahead with allocating the shared arrays and
1271 * array caches 1369 * array caches
1272 */ 1370 */
1273 list_for_each_entry(cachep, &cache_chain, next) { 1371 list_for_each_entry(cachep, &slab_caches, list) {
1274 struct array_cache *nc; 1372 struct array_cache *nc;
1275 struct array_cache *shared = NULL; 1373 struct array_cache *shared = NULL;
1276 struct array_cache **alien = NULL; 1374 struct array_cache **alien = NULL;
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1338 switch (action) { 1436 switch (action) {
1339 case CPU_UP_PREPARE: 1437 case CPU_UP_PREPARE:
1340 case CPU_UP_PREPARE_FROZEN: 1438 case CPU_UP_PREPARE_FROZEN:
1341 mutex_lock(&cache_chain_mutex); 1439 mutex_lock(&slab_mutex);
1342 err = cpuup_prepare(cpu); 1440 err = cpuup_prepare(cpu);
1343 mutex_unlock(&cache_chain_mutex); 1441 mutex_unlock(&slab_mutex);
1344 break; 1442 break;
1345 case CPU_ONLINE: 1443 case CPU_ONLINE:
1346 case CPU_ONLINE_FROZEN: 1444 case CPU_ONLINE_FROZEN:
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1350 case CPU_DOWN_PREPARE: 1448 case CPU_DOWN_PREPARE:
1351 case CPU_DOWN_PREPARE_FROZEN: 1449 case CPU_DOWN_PREPARE_FROZEN:
1352 /* 1450 /*
1353 * Shutdown cache reaper. Note that the cache_chain_mutex is 1451 * Shutdown cache reaper. Note that the slab_mutex is
1354 * held so that if cache_reap() is invoked it cannot do 1452 * held so that if cache_reap() is invoked it cannot do
1355 * anything expensive but will only modify reap_work 1453 * anything expensive but will only modify reap_work
1356 * and reschedule the timer. 1454 * and reschedule the timer.
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1377#endif 1475#endif
1378 case CPU_UP_CANCELED: 1476 case CPU_UP_CANCELED:
1379 case CPU_UP_CANCELED_FROZEN: 1477 case CPU_UP_CANCELED_FROZEN:
1380 mutex_lock(&cache_chain_mutex); 1478 mutex_lock(&slab_mutex);
1381 cpuup_canceled(cpu); 1479 cpuup_canceled(cpu);
1382 mutex_unlock(&cache_chain_mutex); 1480 mutex_unlock(&slab_mutex);
1383 break; 1481 break;
1384 } 1482 }
1385 return notifier_from_errno(err); 1483 return notifier_from_errno(err);
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1395 * Returns -EBUSY if all objects cannot be drained so that the node is not 1493 * Returns -EBUSY if all objects cannot be drained so that the node is not
1396 * removed. 1494 * removed.
1397 * 1495 *
1398 * Must hold cache_chain_mutex. 1496 * Must hold slab_mutex.
1399 */ 1497 */
1400static int __meminit drain_cache_nodelists_node(int node) 1498static int __meminit drain_cache_nodelists_node(int node)
1401{ 1499{
1402 struct kmem_cache *cachep; 1500 struct kmem_cache *cachep;
1403 int ret = 0; 1501 int ret = 0;
1404 1502
1405 list_for_each_entry(cachep, &cache_chain, next) { 1503 list_for_each_entry(cachep, &slab_caches, list) {
1406 struct kmem_list3 *l3; 1504 struct kmem_list3 *l3;
1407 1505
1408 l3 = cachep->nodelists[node]; 1506 l3 = cachep->nodelists[node];
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1433 1531
1434 switch (action) { 1532 switch (action) {
1435 case MEM_GOING_ONLINE: 1533 case MEM_GOING_ONLINE:
1436 mutex_lock(&cache_chain_mutex); 1534 mutex_lock(&slab_mutex);
1437 ret = init_cache_nodelists_node(nid); 1535 ret = init_cache_nodelists_node(nid);
1438 mutex_unlock(&cache_chain_mutex); 1536 mutex_unlock(&slab_mutex);
1439 break; 1537 break;
1440 case MEM_GOING_OFFLINE: 1538 case MEM_GOING_OFFLINE:
1441 mutex_lock(&cache_chain_mutex); 1539 mutex_lock(&slab_mutex);
1442 ret = drain_cache_nodelists_node(nid); 1540 ret = drain_cache_nodelists_node(nid);
1443 mutex_unlock(&cache_chain_mutex); 1541 mutex_unlock(&slab_mutex);
1444 break; 1542 break;
1445 case MEM_ONLINE: 1543 case MEM_ONLINE:
1446 case MEM_OFFLINE: 1544 case MEM_OFFLINE:
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void)
1544 node = numa_mem_id(); 1642 node = numa_mem_id();
1545 1643
1546 /* 1) create the cache_cache */ 1644 /* 1) create the cache_cache */
1547 INIT_LIST_HEAD(&cache_chain); 1645 INIT_LIST_HEAD(&slab_caches);
1548 list_add(&cache_cache.next, &cache_chain); 1646 list_add(&cache_cache.list, &slab_caches);
1549 cache_cache.colour_off = cache_line_size(); 1647 cache_cache.colour_off = cache_line_size();
1550 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1551 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void)
1553 /* 1651 /*
1554 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1555 */ 1653 */
1556 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1557 nr_node_ids * sizeof(struct kmem_list3 *); 1655 nr_node_ids * sizeof(struct kmem_list3 *);
1558#if DEBUG 1656 cache_cache.object_size = cache_cache.size;
1559 cache_cache.obj_size = cache_cache.buffer_size; 1657 cache_cache.size = ALIGN(cache_cache.size,
1560#endif
1561 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1562 cache_line_size()); 1658 cache_line_size());
1563 cache_cache.reciprocal_buffer_size = 1659 cache_cache.reciprocal_buffer_size =
1564 reciprocal_value(cache_cache.buffer_size); 1660 reciprocal_value(cache_cache.size);
1565 1661
1566 for (order = 0; order < MAX_ORDER; order++) { 1662 for (order = 0; order < MAX_ORDER; order++) {
1567 cache_estimate(order, cache_cache.buffer_size, 1663 cache_estimate(order, cache_cache.size,
1568 cache_line_size(), 0, &left_over, &cache_cache.num); 1664 cache_line_size(), 0, &left_over, &cache_cache.num);
1569 if (cache_cache.num) 1665 if (cache_cache.num)
1570 break; 1666 break;
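The cache_cache sizing above combines a trailing per-cpu array, a per-node pointer block, and a cache-line round-up. A minimal userspace sketch of that sizing pattern (hypothetical struct and counts; ALIGN is re-implemented here as ALIGN_UP):

#include <stdio.h>
#include <stddef.h>

/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

struct node_list;			/* opaque per-node bookkeeping */

struct fake_cache {
	unsigned int limit;
	unsigned int batchcount;
	void *array[1];			/* per-cpu slots; really nr_cpu_ids long */
};

int main(void)
{
	size_t nr_cpu_ids = 4, nr_node_ids = 2, cache_line = 64;

	/* offsetof(.., array[nr_cpu_ids]) == offsetof(.., array) + n * slot */
	size_t sz = offsetof(struct fake_cache, array) +
		    nr_cpu_ids * sizeof(void *) +
		    nr_node_ids * sizeof(struct node_list *);

	printf("raw %zu bytes, cache-line aligned %zu bytes\n",
	       sz, ALIGN_UP(sz, cache_line));
	return 0;
}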
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void)
1585 * bug. 1681 * bug.
1586 */ 1682 */
1587 1683
1588 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
1589 sizes[INDEX_AC].cs_size, 1685 sizes[INDEX_AC].cs_size,
1590 ARCH_KMALLOC_MINALIGN, 1686 ARCH_KMALLOC_MINALIGN,
1591 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void)
1593 1689
1594 if (INDEX_AC != INDEX_L3) { 1690 if (INDEX_AC != INDEX_L3) {
1595 sizes[INDEX_L3].cs_cachep = 1691 sizes[INDEX_L3].cs_cachep =
1596 kmem_cache_create(names[INDEX_L3].name, 1692 __kmem_cache_create(names[INDEX_L3].name,
1597 sizes[INDEX_L3].cs_size, 1693 sizes[INDEX_L3].cs_size,
1598 ARCH_KMALLOC_MINALIGN, 1694 ARCH_KMALLOC_MINALIGN,
1599 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void)
1611 * allow tighter packing of the smaller caches. 1707 * allow tighter packing of the smaller caches.
1612 */ 1708 */
1613 if (!sizes->cs_cachep) { 1709 if (!sizes->cs_cachep) {
1614 sizes->cs_cachep = kmem_cache_create(names->name, 1710 sizes->cs_cachep = __kmem_cache_create(names->name,
1615 sizes->cs_size, 1711 sizes->cs_size,
1616 ARCH_KMALLOC_MINALIGN, 1712 ARCH_KMALLOC_MINALIGN,
1617 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1618 NULL); 1714 NULL);
1619 } 1715 }
1620#ifdef CONFIG_ZONE_DMA 1716#ifdef CONFIG_ZONE_DMA
1621 sizes->cs_dmacachep = kmem_cache_create( 1717 sizes->cs_dmacachep = __kmem_cache_create(
1622 names->name_dma, 1718 names->name_dma,
1623 sizes->cs_size, 1719 sizes->cs_size,
1624 ARCH_KMALLOC_MINALIGN, 1720 ARCH_KMALLOC_MINALIGN,
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void)
1676 } 1772 }
1677 } 1773 }
1678 1774
1679 g_cpucache_up = EARLY; 1775 slab_state = UP;
1680} 1776}
1681 1777
1682void __init kmem_cache_init_late(void) 1778void __init kmem_cache_init_late(void)
1683{ 1779{
1684 struct kmem_cache *cachep; 1780 struct kmem_cache *cachep;
1685 1781
1686 g_cpucache_up = LATE; 1782 slab_state = UP;
1687 1783
1688 /* Annotate slab for lockdep -- annotate the malloc caches */ 1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1689 init_lock_keys(); 1785 init_lock_keys();
1690 1786
1691 /* 6) resize the head arrays to their final sizes */ 1787 /* 6) resize the head arrays to their final sizes */
1692 mutex_lock(&cache_chain_mutex); 1788 mutex_lock(&slab_mutex);
1693 list_for_each_entry(cachep, &cache_chain, next) 1789 list_for_each_entry(cachep, &slab_caches, list)
1694 if (enable_cpucache(cachep, GFP_NOWAIT)) 1790 if (enable_cpucache(cachep, GFP_NOWAIT))
1695 BUG(); 1791 BUG();
1696 mutex_unlock(&cache_chain_mutex); 1792 mutex_unlock(&slab_mutex);
1697 1793
1698 /* Done! */ 1794 /* Done! */
1699 g_cpucache_up = FULL; 1795 slab_state = FULL;
1700 1796
1701 /* 1797 /*
1702 * Register a cpu startup notifier callback that initializes 1798 * Register a cpu startup notifier callback that initializes
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void)
1727 */ 1823 */
1728 for_each_online_cpu(cpu) 1824 for_each_online_cpu(cpu)
1729 start_cpu_timer(cpu); 1825 start_cpu_timer(cpu);
1826
1827 /* Done! */
1828 slab_state = FULL;
1730 return 0; 1829 return 0;
1731} 1830}
1732__initcall(cpucache_init); 1831__initcall(cpucache_init);
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1743 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1842 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1744 nodeid, gfpflags); 1843 nodeid, gfpflags);
1745 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1844 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1746 cachep->name, cachep->buffer_size, cachep->gfporder); 1845 cachep->name, cachep->size, cachep->gfporder);
1747 1846
1748 for_each_online_node(node) { 1847 for_each_online_node(node) {
1749 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1848 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1798 flags |= __GFP_COMP; 1897 flags |= __GFP_COMP;
1799#endif 1898#endif
1800 1899
1801 flags |= cachep->gfpflags; 1900 flags |= cachep->allocflags;
1802 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1901 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1803 flags |= __GFP_RECLAIMABLE; 1902 flags |= __GFP_RECLAIMABLE;
1804 1903
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1809 return NULL; 1908 return NULL;
1810 } 1909 }
1811 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1812 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1813 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1814 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1816 else 1919 else
1817 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1818 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1819 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1820 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1821 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1822 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1823 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1824 1931
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1850 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1851 while (i--) { 1958 while (i--) {
1852 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1853 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1854 page++; 1962 page++;
1855 } 1963 }
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1874static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1982static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1875 unsigned long caller) 1983 unsigned long caller)
1876{ 1984{
1877 int size = obj_size(cachep); 1985 int size = cachep->object_size;
1878 1986
1879 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1987 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1880 1988
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1906 2014
1907static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 2015static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1908{ 2016{
1909 int size = obj_size(cachep); 2017 int size = cachep->object_size;
1910 addr = &((char *)addr)[obj_offset(cachep)]; 2018 addr = &((char *)addr)[obj_offset(cachep)];
1911 2019
1912 memset(addr, val, size); 2020 memset(addr, val, size);
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1966 printk("\n"); 2074 printk("\n");
1967 } 2075 }
1968 realobj = (char *)objp + obj_offset(cachep); 2076 realobj = (char *)objp + obj_offset(cachep);
1969 size = obj_size(cachep); 2077 size = cachep->object_size;
1970 for (i = 0; i < size && lines; i += 16, lines--) { 2078 for (i = 0; i < size && lines; i += 16, lines--) {
1971 int limit; 2079 int limit;
1972 limit = 16; 2080 limit = 16;
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1983 int lines = 0; 2091 int lines = 0;
1984 2092
1985 realobj = (char *)objp + obj_offset(cachep); 2093 realobj = (char *)objp + obj_offset(cachep);
1986 size = obj_size(cachep); 2094 size = cachep->object_size;
1987 2095
1988 for (i = 0; i < size; i++) { 2096 for (i = 0; i < size; i++) {
1989 char exp = POISON_FREE; 2097 char exp = POISON_FREE;
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2047 2155
2048 if (cachep->flags & SLAB_POISON) { 2156 if (cachep->flags & SLAB_POISON) {
2049#ifdef CONFIG_DEBUG_PAGEALLOC 2157#ifdef CONFIG_DEBUG_PAGEALLOC
2050 if (cachep->buffer_size % PAGE_SIZE == 0 && 2158 if (cachep->size % PAGE_SIZE == 0 &&
2051 OFF_SLAB(cachep)) 2159 OFF_SLAB(cachep))
2052 kernel_map_pages(virt_to_page(objp), 2160 kernel_map_pages(virt_to_page(objp),
2053 cachep->buffer_size / PAGE_SIZE, 1); 2161 cachep->size / PAGE_SIZE, 1);
2054 else 2162 else
2055 check_poison_obj(cachep, objp); 2163 check_poison_obj(cachep, objp);
2056#else 2164#else
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2194 2302
2195static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2303static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2196{ 2304{
2197 if (g_cpucache_up == FULL) 2305 if (slab_state >= FULL)
2198 return enable_cpucache(cachep, gfp); 2306 return enable_cpucache(cachep, gfp);
2199 2307
2200 if (g_cpucache_up == NONE) { 2308 if (slab_state == DOWN) {
2201 /* 2309 /*
2202 * Note: the first kmem_cache_create must create the cache 2310 * Note: the first kmem_cache_create must create the cache
2203 * that's used by kmalloc(24), otherwise the creation of 2311 * that's used by kmalloc(24), otherwise the creation of
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2212 */ 2320 */
2213 set_up_list3s(cachep, SIZE_AC); 2321 set_up_list3s(cachep, SIZE_AC);
2214 if (INDEX_AC == INDEX_L3) 2322 if (INDEX_AC == INDEX_L3)
2215 g_cpucache_up = PARTIAL_L3; 2323 slab_state = PARTIAL_L3;
2216 else 2324 else
2217 g_cpucache_up = PARTIAL_AC; 2325 slab_state = PARTIAL_ARRAYCACHE;
2218 } else { 2326 } else {
2219 cachep->array[smp_processor_id()] = 2327 cachep->array[smp_processor_id()] =
2220 kmalloc(sizeof(struct arraycache_init), gfp); 2328 kmalloc(sizeof(struct arraycache_init), gfp);
2221 2329
2222 if (g_cpucache_up == PARTIAL_AC) { 2330 if (slab_state == PARTIAL_ARRAYCACHE) {
2223 set_up_list3s(cachep, SIZE_L3); 2331 set_up_list3s(cachep, SIZE_L3);
2224 g_cpucache_up = PARTIAL_L3; 2332 slab_state = PARTIAL_L3;
2225 } else { 2333 } else {
2226 int node; 2334 int node;
2227 for_each_online_node(node) { 2335 for_each_online_node(node) {
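setup_cpu_cache() above gates its work on how far slab bootstrap has progressed, stepping the shared slab_state through the partial stages as each prerequisite cache comes up. A toy sketch of that staged-bootstrap pattern (stage names are made up; the real enum lives in mm/slab.h):

#include <stdio.h>

enum boot_stage { STAGE_DOWN, STAGE_ARRAYS, STAGE_NODELISTS, STAGE_FULL };

static enum boot_stage stage = STAGE_DOWN;

/* Each cache-creation step checks how far bootstrap has progressed. */
static void create_cache(const char *name)
{
	switch (stage) {
	case STAGE_DOWN:	/* first cache: use static bootstrap arrays  */
		printf("%s: static arrays\n", name);
		stage = STAGE_ARRAYS;
		break;
	case STAGE_ARRAYS:	/* array cache exists: allocate node lists   */
		printf("%s: kmalloc'd node lists\n", name);
		stage = STAGE_NODELISTS;
		break;
	default:		/* everything needed is already available    */
		printf("%s: normal path\n", name);
		break;
	}
}

int main(void)
{
	create_cache("size-32");
	create_cache("size-64");
	create_cache("size-128");
	return 0;
}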
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2247} 2355}
2248 2356
2249/** 2357/**
2250 * kmem_cache_create - Create a cache. 2358 * __kmem_cache_create - Create a cache.
2251 * @name: A string which is used in /proc/slabinfo to identify this cache. 2359 * @name: A string which is used in /proc/slabinfo to identify this cache.
2252 * @size: The size of objects to be created in this cache. 2360 * @size: The size of objects to be created in this cache.
2253 * @align: The required alignment for the objects. 2361 * @align: The required alignment for the objects.
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2274 * as davem. 2382 * as davem.
2275 */ 2383 */
2276struct kmem_cache * 2384struct kmem_cache *
2277kmem_cache_create (const char *name, size_t size, size_t align, 2385__kmem_cache_create (const char *name, size_t size, size_t align,
2278 unsigned long flags, void (*ctor)(void *)) 2386 unsigned long flags, void (*ctor)(void *))
2279{ 2387{
2280 size_t left_over, slab_size, ralign; 2388 size_t left_over, slab_size, ralign;
2281 struct kmem_cache *cachep = NULL, *pc; 2389 struct kmem_cache *cachep = NULL;
2282 gfp_t gfp; 2390 gfp_t gfp;
2283 2391
2284 /*
2285 * Sanity checks... these are all serious usage bugs.
2286 */
2287 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2288 size > KMALLOC_MAX_SIZE) {
2289 printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2290 name);
2291 BUG();
2292 }
2293
2294 /*
2295 * We use cache_chain_mutex to ensure a consistent view of
2296 * cpu_online_mask as well. Please see cpuup_callback
2297 */
2298 if (slab_is_available()) {
2299 get_online_cpus();
2300 mutex_lock(&cache_chain_mutex);
2301 }
2302
2303 list_for_each_entry(pc, &cache_chain, next) {
2304 char tmp;
2305 int res;
2306
2307 /*
2308 * This happens when the module gets unloaded and doesn't
2309 * destroy its slab cache and no-one else reuses the vmalloc
2310 * area of the module. Print a warning.
2311 */
2312 res = probe_kernel_address(pc->name, tmp);
2313 if (res) {
2314 printk(KERN_ERR
2315 "SLAB: cache with size %d has lost its name\n",
2316 pc->buffer_size);
2317 continue;
2318 }
2319
2320 if (!strcmp(pc->name, name)) {
2321 printk(KERN_ERR
2322 "kmem_cache_create: duplicate cache %s\n", name);
2323 dump_stack();
2324 goto oops;
2325 }
2326 }
2327
2328#if DEBUG 2392#if DEBUG
2329 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2330#if FORCED_DEBUG 2393#if FORCED_DEBUG
2331 /* 2394 /*
2332 * Enable redzoning and last user accounting, except for caches with 2395 * Enable redzoning and last user accounting, except for caches with
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2415 /* Get cache's description obj. */ 2478 /* Get cache's description obj. */
2416 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2417 if (!cachep) 2480 if (!cachep)
2418 goto oops; 2481 return NULL;
2419 2482
2420 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2421#if DEBUG 2486#if DEBUG
2422 cachep->obj_size = size;
2423 2487
2424 /* 2488 /*
2425 * Both debugging options require word-alignment which is calculated 2489 * Both debugging options require word-alignment which is calculated
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2442 } 2506 }
2443#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2444 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2445 && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2446 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2447 size = PAGE_SIZE; 2511 size = PAGE_SIZE;
2448 } 2512 }
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2471 printk(KERN_ERR 2535 printk(KERN_ERR
2472 "kmem_cache_create: couldn't create cache %s.\n", name); 2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2473 kmem_cache_free(&cache_cache, cachep); 2537 kmem_cache_free(&cache_cache, cachep);
2474 cachep = NULL; 2538 return NULL;
2475 goto oops;
2476 } 2539 }
2477 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2478 + sizeof(struct slab), align); 2541 + sizeof(struct slab), align);
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2508 cachep->colour = left_over / cachep->colour_off; 2571 cachep->colour = left_over / cachep->colour_off;
2509 cachep->slab_size = slab_size; 2572 cachep->slab_size = slab_size;
2510 cachep->flags = flags; 2573 cachep->flags = flags;
2511 cachep->gfpflags = 0; 2574 cachep->allocflags = 0;
2512 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2575 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2513 cachep->gfpflags |= GFP_DMA; 2576 cachep->allocflags |= GFP_DMA;
2514 cachep->buffer_size = size; 2577 cachep->size = size;
2515 cachep->reciprocal_buffer_size = reciprocal_value(size); 2578 cachep->reciprocal_buffer_size = reciprocal_value(size);
2516 2579
2517 if (flags & CFLGS_OFF_SLAB) { 2580 if (flags & CFLGS_OFF_SLAB) {
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2530 2593
2531 if (setup_cpu_cache(cachep, gfp)) { 2594 if (setup_cpu_cache(cachep, gfp)) {
2532 __kmem_cache_destroy(cachep); 2595 __kmem_cache_destroy(cachep);
2533 cachep = NULL; 2596 return NULL;
2534 goto oops;
2535 } 2597 }
2536 2598
2537 if (flags & SLAB_DEBUG_OBJECTS) { 2599 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2545 } 2607 }
2546 2608
2547 /* cache setup completed, link it into the list */ 2609 /* cache setup completed, link it into the list */
2548 list_add(&cachep->next, &cache_chain); 2610 list_add(&cachep->list, &slab_caches);
2549oops:
2550 if (!cachep && (flags & SLAB_PANIC))
2551 panic("kmem_cache_create(): failed to create slab `%s'\n",
2552 name);
2553 if (slab_is_available()) {
2554 mutex_unlock(&cache_chain_mutex);
2555 put_online_cpus();
2556 }
2557 return cachep; 2611 return cachep;
2558} 2612}
2559EXPORT_SYMBOL(kmem_cache_create);
2560 2613
2561#if DEBUG 2614#if DEBUG
2562static void check_irq_off(void) 2615static void check_irq_off(void)
@@ -2671,7 +2724,7 @@ out:
2671 return nr_freed; 2724 return nr_freed;
2672} 2725}
2673 2726
2674/* Called with cache_chain_mutex held to protect against cpu hotplug */ 2727/* Called with slab_mutex held to protect against cpu hotplug */
2675static int __cache_shrink(struct kmem_cache *cachep) 2728static int __cache_shrink(struct kmem_cache *cachep)
2676{ 2729{
2677 int ret = 0, i = 0; 2730 int ret = 0, i = 0;
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2706 BUG_ON(!cachep || in_interrupt()); 2759 BUG_ON(!cachep || in_interrupt());
2707 2760
2708 get_online_cpus(); 2761 get_online_cpus();
2709 mutex_lock(&cache_chain_mutex); 2762 mutex_lock(&slab_mutex);
2710 ret = __cache_shrink(cachep); 2763 ret = __cache_shrink(cachep);
2711 mutex_unlock(&cache_chain_mutex); 2764 mutex_unlock(&slab_mutex);
2712 put_online_cpus(); 2765 put_online_cpus();
2713 return ret; 2766 return ret;
2714} 2767}
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2736 2789
2737 /* Find the cache in the chain of caches. */ 2790 /* Find the cache in the chain of caches. */
2738 get_online_cpus(); 2791 get_online_cpus();
2739 mutex_lock(&cache_chain_mutex); 2792 mutex_lock(&slab_mutex);
2740 /* 2793 /*
2741 * the chain is never empty, cache_cache is never destroyed 2794 * the chain is never empty, cache_cache is never destroyed
2742 */ 2795 */
2743 list_del(&cachep->next); 2796 list_del(&cachep->list);
2744 if (__cache_shrink(cachep)) { 2797 if (__cache_shrink(cachep)) {
2745 slab_error(cachep, "Can't free all objects"); 2798 slab_error(cachep, "Can't free all objects");
2746 list_add(&cachep->next, &cache_chain); 2799 list_add(&cachep->list, &slab_caches);
2747 mutex_unlock(&cache_chain_mutex); 2800 mutex_unlock(&slab_mutex);
2748 put_online_cpus(); 2801 put_online_cpus();
2749 return; 2802 return;
2750 } 2803 }
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2753 rcu_barrier(); 2806 rcu_barrier();
2754 2807
2755 __kmem_cache_destroy(cachep); 2808 __kmem_cache_destroy(cachep);
2756 mutex_unlock(&cache_chain_mutex); 2809 mutex_unlock(&slab_mutex);
2757 put_online_cpus(); 2810 put_online_cpus();
2758} 2811}
2759EXPORT_SYMBOL(kmem_cache_destroy); 2812EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep,
2840 slab_error(cachep, "constructor overwrote the" 2893 slab_error(cachep, "constructor overwrote the"
2841 " start of an object"); 2894 " start of an object");
2842 } 2895 }
2843 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2896 if ((cachep->size % PAGE_SIZE) == 0 &&
2844 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2897 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2845 kernel_map_pages(virt_to_page(objp), 2898 kernel_map_pages(virt_to_page(objp),
2846 cachep->buffer_size / PAGE_SIZE, 0); 2899 cachep->size / PAGE_SIZE, 0);
2847#else 2900#else
2848 if (cachep->ctor) 2901 if (cachep->ctor)
2849 cachep->ctor(objp); 2902 cachep->ctor(objp);
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2857{ 2910{
2858 if (CONFIG_ZONE_DMA_FLAG) { 2911 if (CONFIG_ZONE_DMA_FLAG) {
2859 if (flags & GFP_DMA) 2912 if (flags & GFP_DMA)
2860 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2913 BUG_ON(!(cachep->allocflags & GFP_DMA));
2861 else 2914 else
2862 BUG_ON(cachep->gfpflags & GFP_DMA); 2915 BUG_ON(cachep->allocflags & GFP_DMA);
2863 } 2916 }
2864} 2917}
2865 2918
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2918 nr_pages <<= cache->gfporder; 2971 nr_pages <<= cache->gfporder;
2919 2972
2920 do { 2973 do {
2921 page_set_cache(page, cache); 2974 page->slab_cache = cache;
2922 page_set_slab(page, slab); 2975 page->slab_page = slab;
2923 page++; 2976 page++;
2924 } while (--nr_pages); 2977 } while (--nr_pages);
2925} 2978}
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3057 kfree_debugcheck(objp); 3110 kfree_debugcheck(objp);
3058 page = virt_to_head_page(objp); 3111 page = virt_to_head_page(objp);
3059 3112
3060 slabp = page_get_slab(page); 3113 slabp = page->slab_page;
3061 3114
3062 if (cachep->flags & SLAB_RED_ZONE) { 3115 if (cachep->flags & SLAB_RED_ZONE) {
3063 verify_redzone_free(cachep, objp); 3116 verify_redzone_free(cachep, objp);
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3077#endif 3130#endif
3078 if (cachep->flags & SLAB_POISON) { 3131 if (cachep->flags & SLAB_POISON) {
3079#ifdef CONFIG_DEBUG_PAGEALLOC 3132#ifdef CONFIG_DEBUG_PAGEALLOC
3080 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3081 store_stackinfo(cachep, objp, (unsigned long)caller); 3134 store_stackinfo(cachep, objp, (unsigned long)caller);
3082 kernel_map_pages(virt_to_page(objp), 3135 kernel_map_pages(virt_to_page(objp),
3083 cachep->buffer_size / PAGE_SIZE, 0); 3136 cachep->size / PAGE_SIZE, 0);
3084 } else { 3137 } else {
3085 poison_obj(cachep, objp, POISON_FREE); 3138 poison_obj(cachep, objp, POISON_FREE);
3086 } 3139 }
@@ -3120,16 +3173,19 @@ bad:
3120#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3121#endif 3174#endif
3122 3175
3123static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3124{ 3178{
3125 int batchcount; 3179 int batchcount;
3126 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3127 struct array_cache *ac; 3181 struct array_cache *ac;
3128 int node; 3182 int node;
3129 3183
3130retry:
3131 check_irq_off(); 3184 check_irq_off();
3132 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3133 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3134 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3135 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3179,8 +3235,8 @@ retry:
3179 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3180 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3181 3237
3182 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3183 node); 3239 node));
3184 } 3240 }
3185 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3186 3242
@@ -3199,18 +3255,23 @@ alloc_done:
3199 3255
3200 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3201 int x; 3257 int x;
3258force_grow:
3202 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3203 3260
3204 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3205 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3206 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263 node = numa_mem_id();
3264
3265 /* no objects in sight? abort */
3266 if (!x && (ac->avail == 0 || force_refill))
3207 return NULL; 3267 return NULL;
3208 3268
3209 if (!ac->avail) /* objects refilled by interrupt? */ 3269 if (!ac->avail) /* objects refilled by interrupt? */
3210 goto retry; 3270 goto retry;
3211 } 3271 }
3212 ac->touched = 1; 3272 ac->touched = 1;
3213 return ac->entry[--ac->avail]; 3273
3274 return ac_get_obj(cachep, ac, flags, force_refill);
3214} 3275}
3215 3276
3216static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3277static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3230,9 +3291,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3230 return objp; 3291 return objp;
3231 if (cachep->flags & SLAB_POISON) { 3292 if (cachep->flags & SLAB_POISON) {
3232#ifdef CONFIG_DEBUG_PAGEALLOC 3293#ifdef CONFIG_DEBUG_PAGEALLOC
3233 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3294 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3234 kernel_map_pages(virt_to_page(objp), 3295 kernel_map_pages(virt_to_page(objp),
3235 cachep->buffer_size / PAGE_SIZE, 1); 3296 cachep->size / PAGE_SIZE, 1);
3236 else 3297 else
3237 check_poison_obj(cachep, objp); 3298 check_poison_obj(cachep, objp);
3238#else 3299#else
@@ -3261,8 +3322,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3261 struct slab *slabp; 3322 struct slab *slabp;
3262 unsigned objnr; 3323 unsigned objnr;
3263 3324
3264 slabp = page_get_slab(virt_to_head_page(objp)); 3325 slabp = virt_to_head_page(objp)->slab_page;
3265 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3326 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3266 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3327 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3267 } 3328 }
3268#endif 3329#endif
@@ -3285,30 +3346,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3285 if (cachep == &cache_cache) 3346 if (cachep == &cache_cache)
3286 return false; 3347 return false;
3287 3348
3288 return should_failslab(obj_size(cachep), flags, cachep->flags); 3349 return should_failslab(cachep->object_size, flags, cachep->flags);
3289} 3350}
3290 3351
3291static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3352static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3292{ 3353{
3293 void *objp; 3354 void *objp;
3294 struct array_cache *ac; 3355 struct array_cache *ac;
3356 bool force_refill = false;
3295 3357
3296 check_irq_off(); 3358 check_irq_off();
3297 3359
3298 ac = cpu_cache_get(cachep); 3360 ac = cpu_cache_get(cachep);
3299 if (likely(ac->avail)) { 3361 if (likely(ac->avail)) {
3300 STATS_INC_ALLOCHIT(cachep);
3301 ac->touched = 1; 3362 ac->touched = 1;
3302 objp = ac->entry[--ac->avail]; 3363 objp = ac_get_obj(cachep, ac, flags, false);
3303 } else { 3364
3304 STATS_INC_ALLOCMISS(cachep);
3305 objp = cache_alloc_refill(cachep, flags);
3306 /* 3365 /*
3307 * the 'ac' may be updated by cache_alloc_refill(), 3366 * Allow for the possibility all avail objects are not allowed
3308 * and kmemleak_erase() requires its correct value. 3367 * by the current flags
3309 */ 3368 */
3310 ac = cpu_cache_get(cachep); 3369 if (objp) {
3370 STATS_INC_ALLOCHIT(cachep);
3371 goto out;
3372 }
3373 force_refill = true;
3311 } 3374 }
3375
3376 STATS_INC_ALLOCMISS(cachep);
3377 objp = cache_alloc_refill(cachep, flags, force_refill);
3378 /*
3379 * the 'ac' may be updated by cache_alloc_refill(),
3380 * and kmemleak_erase() requires its correct value.
3381 */
3382 ac = cpu_cache_get(cachep);
3383
3384out:
3312 /* 3385 /*
3313 * To avoid a false negative, if an object that is in one of the 3386 * To avoid a false negative, if an object that is in one of the
3314 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3387 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
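____cache_alloc() now tries the per-CPU array first and only counts a hit if ac_get_obj() actually returns an object the caller may use; otherwise it falls through to cache_alloc_refill() with force_refill set. A simplified sketch of that fast-path/slow-path split (hypothetical types; no locking, NUMA handling, or pfmemalloc filtering):

struct obj_cache {
	unsigned int avail;		/* objects currently cached      */
	unsigned int limit;		/* capacity of entry[]           */
	void *entry[64];		/* per-CPU stack of free objects */
};

/* Slow path stub: the real code grows a slab, refills entry[], retries. */
static void *cache_refill(struct obj_cache *ac)
{
	(void)ac;
	return (void *)0;
}

/* Fast path: pop the most recently freed object if one is cached. */
static void *cache_alloc_fast(struct obj_cache *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];
	return cache_refill(ac);
}

/* Fast path for free: push the object back unless the array is full. */
static void cache_free_fast(struct obj_cache *ac, void *objp)
{
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = objp;
	/* a real implementation would flush a batch to shared lists first */
}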
@@ -3336,7 +3409,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3336 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3409 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3337 nid_alloc = cpuset_slab_spread_node(); 3410 nid_alloc = cpuset_slab_spread_node();
3338 else if (current->mempolicy) 3411 else if (current->mempolicy)
3339 nid_alloc = slab_node(current->mempolicy); 3412 nid_alloc = slab_node();
3340 if (nid_alloc != nid_here) 3413 if (nid_alloc != nid_here)
3341 return ____cache_alloc_node(cachep, flags, nid_alloc); 3414 return ____cache_alloc_node(cachep, flags, nid_alloc);
3342 return NULL; 3415 return NULL;
@@ -3368,7 +3441,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3368 3441
3369retry_cpuset: 3442retry_cpuset:
3370 cpuset_mems_cookie = get_mems_allowed(); 3443 cpuset_mems_cookie = get_mems_allowed();
3371 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3444 zonelist = node_zonelist(slab_node(), flags);
3372 3445
3373retry: 3446retry:
3374 /* 3447 /*
@@ -3545,14 +3618,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3545 out: 3618 out:
3546 local_irq_restore(save_flags); 3619 local_irq_restore(save_flags);
3547 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3620 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3548 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3621 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3549 flags); 3622 flags);
3550 3623
3551 if (likely(ptr)) 3624 if (likely(ptr))
3552 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); 3625 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3553 3626
3554 if (unlikely((flags & __GFP_ZERO) && ptr)) 3627 if (unlikely((flags & __GFP_ZERO) && ptr))
3555 memset(ptr, 0, obj_size(cachep)); 3628 memset(ptr, 0, cachep->object_size);
3556 3629
3557 return ptr; 3630 return ptr;
3558} 3631}
@@ -3607,15 +3680,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3607 objp = __do_cache_alloc(cachep, flags); 3680 objp = __do_cache_alloc(cachep, flags);
3608 local_irq_restore(save_flags); 3681 local_irq_restore(save_flags);
3609 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3682 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3610 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3683 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3611 flags); 3684 flags);
3612 prefetchw(objp); 3685 prefetchw(objp);
3613 3686
3614 if (likely(objp)) 3687 if (likely(objp))
3615 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); 3688 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3616 3689
3617 if (unlikely((flags & __GFP_ZERO) && objp)) 3690 if (unlikely((flags & __GFP_ZERO) && objp))
3618 memset(objp, 0, obj_size(cachep)); 3691 memset(objp, 0, cachep->object_size);
3619 3692
3620 return objp; 3693 return objp;
3621} 3694}
@@ -3630,9 +3703,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3630 struct kmem_list3 *l3; 3703 struct kmem_list3 *l3;
3631 3704
3632 for (i = 0; i < nr_objects; i++) { 3705 for (i = 0; i < nr_objects; i++) {
3633 void *objp = objpp[i]; 3706 void *objp;
3634 struct slab *slabp; 3707 struct slab *slabp;
3635 3708
3709 clear_obj_pfmemalloc(&objpp[i]);
3710 objp = objpp[i];
3711
3636 slabp = virt_to_slab(objp); 3712 slabp = virt_to_slab(objp);
3637 l3 = cachep->nodelists[node]; 3713 l3 = cachep->nodelists[node];
3638 list_del(&slabp->list); 3714 list_del(&slabp->list);
@@ -3731,7 +3807,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3731 kmemleak_free_recursive(objp, cachep->flags); 3807 kmemleak_free_recursive(objp, cachep->flags);
3732 objp = cache_free_debugcheck(cachep, objp, caller); 3808 objp = cache_free_debugcheck(cachep, objp, caller);
3733 3809
3734 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3810 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3735 3811
3736 /* 3812 /*
3737 * Skip calling cache_free_alien() when the platform is not numa. 3813 * Skip calling cache_free_alien() when the platform is not numa.
@@ -3750,7 +3826,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3750 cache_flusharray(cachep, ac); 3826 cache_flusharray(cachep, ac);
3751 } 3827 }
3752 3828
3753 ac->entry[ac->avail++] = objp; 3829 ac_put_obj(cachep, ac, objp);
3754} 3830}
3755 3831
3756/** 3832/**
@@ -3766,7 +3842,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3766 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3842 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3767 3843
3768 trace_kmem_cache_alloc(_RET_IP_, ret, 3844 trace_kmem_cache_alloc(_RET_IP_, ret,
3769 obj_size(cachep), cachep->buffer_size, flags); 3845 cachep->object_size, cachep->size, flags);
3770 3846
3771 return ret; 3847 return ret;
3772} 3848}
@@ -3794,7 +3870,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3794 __builtin_return_address(0)); 3870 __builtin_return_address(0));
3795 3871
3796 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3872 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3797 obj_size(cachep), cachep->buffer_size, 3873 cachep->object_size, cachep->size,
3798 flags, nodeid); 3874 flags, nodeid);
3799 3875
3800 return ret; 3876 return ret;
@@ -3876,7 +3952,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3876 ret = __cache_alloc(cachep, flags, caller); 3952 ret = __cache_alloc(cachep, flags, caller);
3877 3953
3878 trace_kmalloc((unsigned long) caller, ret, 3954 trace_kmalloc((unsigned long) caller, ret,
3879 size, cachep->buffer_size, flags); 3955 size, cachep->size, flags);
3880 3956
3881 return ret; 3957 return ret;
3882} 3958}
@@ -3916,9 +3992,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3916 unsigned long flags; 3992 unsigned long flags;
3917 3993
3918 local_irq_save(flags); 3994 local_irq_save(flags);
3919 debug_check_no_locks_freed(objp, obj_size(cachep)); 3995 debug_check_no_locks_freed(objp, cachep->object_size);
3920 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3996 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3921 debug_check_no_obj_freed(objp, obj_size(cachep)); 3997 debug_check_no_obj_freed(objp, cachep->object_size);
3922 __cache_free(cachep, objp, __builtin_return_address(0)); 3998 __cache_free(cachep, objp, __builtin_return_address(0));
3923 local_irq_restore(flags); 3999 local_irq_restore(flags);
3924 4000
@@ -3947,8 +4023,9 @@ void kfree(const void *objp)
3947 local_irq_save(flags); 4023 local_irq_save(flags);
3948 kfree_debugcheck(objp); 4024 kfree_debugcheck(objp);
3949 c = virt_to_cache(objp); 4025 c = virt_to_cache(objp);
3950 debug_check_no_locks_freed(objp, obj_size(c)); 4026 debug_check_no_locks_freed(objp, c->object_size);
3951 debug_check_no_obj_freed(objp, obj_size(c)); 4027
4028 debug_check_no_obj_freed(objp, c->object_size);
3952 __cache_free(c, (void *)objp, __builtin_return_address(0)); 4029 __cache_free(c, (void *)objp, __builtin_return_address(0));
3953 local_irq_restore(flags); 4030 local_irq_restore(flags);
3954} 4031}
@@ -3956,7 +4033,7 @@ EXPORT_SYMBOL(kfree);
3956 4033
3957unsigned int kmem_cache_size(struct kmem_cache *cachep) 4034unsigned int kmem_cache_size(struct kmem_cache *cachep)
3958{ 4035{
3959 return obj_size(cachep); 4036 return cachep->object_size;
3960} 4037}
3961EXPORT_SYMBOL(kmem_cache_size); 4038EXPORT_SYMBOL(kmem_cache_size);
3962 4039
@@ -4030,7 +4107,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
4030 return 0; 4107 return 0;
4031 4108
4032fail: 4109fail:
4033 if (!cachep->next.next) { 4110 if (!cachep->list.next) {
4034 /* Cache is not active yet. Roll back what we did */ 4111 /* Cache is not active yet. Roll back what we did */
4035 node--; 4112 node--;
4036 while (node >= 0) { 4113 while (node >= 0) {
@@ -4065,7 +4142,7 @@ static void do_ccupdate_local(void *info)
4065 new->new[smp_processor_id()] = old; 4142 new->new[smp_processor_id()] = old;
4066} 4143}
4067 4144
4068/* Always called with the cache_chain_mutex held */ 4145/* Always called with the slab_mutex held */
4069static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4146static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4070 int batchcount, int shared, gfp_t gfp) 4147 int batchcount, int shared, gfp_t gfp)
4071{ 4148{
@@ -4109,7 +4186,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4109 return alloc_kmemlist(cachep, gfp); 4186 return alloc_kmemlist(cachep, gfp);
4110} 4187}
4111 4188
4112/* Called with cache_chain_mutex held always */ 4189/* Called with slab_mutex held always */
4113static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4190static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4114{ 4191{
4115 int err; 4192 int err;
@@ -4124,13 +4201,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4124 * The numbers are guessed, we should auto-tune as described by 4201 * The numbers are guessed, we should auto-tune as described by
4125 * Bonwick. 4202 * Bonwick.
4126 */ 4203 */
4127 if (cachep->buffer_size > 131072) 4204 if (cachep->size > 131072)
4128 limit = 1; 4205 limit = 1;
4129 else if (cachep->buffer_size > PAGE_SIZE) 4206 else if (cachep->size > PAGE_SIZE)
4130 limit = 8; 4207 limit = 8;
4131 else if (cachep->buffer_size > 1024) 4208 else if (cachep->size > 1024)
4132 limit = 24; 4209 limit = 24;
4133 else if (cachep->buffer_size > 256) 4210 else if (cachep->size > 256)
4134 limit = 54; 4211 limit = 54;
4135 else 4212 else
4136 limit = 120; 4213 limit = 120;
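The staircase above maps object size to the per-CPU array depth. The same heuristic as a standalone helper (thresholds copied from the hunk; PAGE_SIZE assumed to be 4096):

#include <stddef.h>

/* Per-CPU cache depth as a function of object size (values from above). */
static unsigned int pick_limit(size_t size)
{
	if (size > 131072)
		return 1;
	if (size > 4096)	/* PAGE_SIZE on most configurations */
		return 8;
	if (size > 1024)
		return 24;
	if (size > 256)
		return 54;
	return 120;
}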
@@ -4145,7 +4222,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4145 * to a larger limit. Thus disabled by default. 4222 * to a larger limit. Thus disabled by default.
4146 */ 4223 */
4147 shared = 0; 4224 shared = 0;
4148 if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) 4225 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4149 shared = 8; 4226 shared = 8;
4150 4227
4151#if DEBUG 4228#if DEBUG
@@ -4211,11 +4288,11 @@ static void cache_reap(struct work_struct *w)
4211 int node = numa_mem_id(); 4288 int node = numa_mem_id();
4212 struct delayed_work *work = to_delayed_work(w); 4289 struct delayed_work *work = to_delayed_work(w);
4213 4290
4214 if (!mutex_trylock(&cache_chain_mutex)) 4291 if (!mutex_trylock(&slab_mutex))
4215 /* Give up. Setup the next iteration. */ 4292 /* Give up. Setup the next iteration. */
4216 goto out; 4293 goto out;
4217 4294
4218 list_for_each_entry(searchp, &cache_chain, next) { 4295 list_for_each_entry(searchp, &slab_caches, list) {
4219 check_irq_on(); 4296 check_irq_on();
4220 4297
4221 /* 4298 /*
@@ -4253,7 +4330,7 @@ next:
4253 cond_resched(); 4330 cond_resched();
4254 } 4331 }
4255 check_irq_on(); 4332 check_irq_on();
4256 mutex_unlock(&cache_chain_mutex); 4333 mutex_unlock(&slab_mutex);
4257 next_reap_node(); 4334 next_reap_node();
4258out: 4335out:
4259 /* Set up the next iteration */ 4336 /* Set up the next iteration */
@@ -4289,26 +4366,26 @@ static void *s_start(struct seq_file *m, loff_t *pos)
4289{ 4366{
4290 loff_t n = *pos; 4367 loff_t n = *pos;
4291 4368
4292 mutex_lock(&cache_chain_mutex); 4369 mutex_lock(&slab_mutex);
4293 if (!n) 4370 if (!n)
4294 print_slabinfo_header(m); 4371 print_slabinfo_header(m);
4295 4372
4296 return seq_list_start(&cache_chain, *pos); 4373 return seq_list_start(&slab_caches, *pos);
4297} 4374}
4298 4375
4299static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4376static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4300{ 4377{
4301 return seq_list_next(p, &cache_chain, pos); 4378 return seq_list_next(p, &slab_caches, pos);
4302} 4379}
4303 4380
4304static void s_stop(struct seq_file *m, void *p) 4381static void s_stop(struct seq_file *m, void *p)
4305{ 4382{
4306 mutex_unlock(&cache_chain_mutex); 4383 mutex_unlock(&slab_mutex);
4307} 4384}
4308 4385
4309static int s_show(struct seq_file *m, void *p) 4386static int s_show(struct seq_file *m, void *p)
4310{ 4387{
4311 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4388 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4312 struct slab *slabp; 4389 struct slab *slabp;
4313 unsigned long active_objs; 4390 unsigned long active_objs;
4314 unsigned long num_objs; 4391 unsigned long num_objs;
@@ -4364,7 +4441,7 @@ static int s_show(struct seq_file *m, void *p)
4364 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4441 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4365 4442
4366 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4443 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4367 name, active_objs, num_objs, cachep->buffer_size, 4444 name, active_objs, num_objs, cachep->size,
4368 cachep->num, (1 << cachep->gfporder)); 4445 cachep->num, (1 << cachep->gfporder));
4369 seq_printf(m, " : tunables %4u %4u %4u", 4446 seq_printf(m, " : tunables %4u %4u %4u",
4370 cachep->limit, cachep->batchcount, cachep->shared); 4447 cachep->limit, cachep->batchcount, cachep->shared);
@@ -4454,9 +4531,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4454 return -EINVAL; 4531 return -EINVAL;
4455 4532
4456 /* Find the cache in the chain of caches. */ 4533 /* Find the cache in the chain of caches. */
4457 mutex_lock(&cache_chain_mutex); 4534 mutex_lock(&slab_mutex);
4458 res = -EINVAL; 4535 res = -EINVAL;
4459 list_for_each_entry(cachep, &cache_chain, next) { 4536 list_for_each_entry(cachep, &slab_caches, list) {
4460 if (!strcmp(cachep->name, kbuf)) { 4537 if (!strcmp(cachep->name, kbuf)) {
4461 if (limit < 1 || batchcount < 1 || 4538 if (limit < 1 || batchcount < 1 ||
4462 batchcount > limit || shared < 0) { 4539 batchcount > limit || shared < 0) {
@@ -4469,7 +4546,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4469 break; 4546 break;
4470 } 4547 }
4471 } 4548 }
4472 mutex_unlock(&cache_chain_mutex); 4549 mutex_unlock(&slab_mutex);
4473 if (res >= 0) 4550 if (res >= 0)
4474 res = count; 4551 res = count;
4475 return res; 4552 return res;
@@ -4492,8 +4569,8 @@ static const struct file_operations proc_slabinfo_operations = {
4492 4569
4493static void *leaks_start(struct seq_file *m, loff_t *pos) 4570static void *leaks_start(struct seq_file *m, loff_t *pos)
4494{ 4571{
4495 mutex_lock(&cache_chain_mutex); 4572 mutex_lock(&slab_mutex);
4496 return seq_list_start(&cache_chain, *pos); 4573 return seq_list_start(&slab_caches, *pos);
4497} 4574}
4498 4575
4499static inline int add_caller(unsigned long *n, unsigned long v) 4576static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4532,7 +4609,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4532 int i; 4609 int i;
4533 if (n[0] == n[1]) 4610 if (n[0] == n[1])
4534 return; 4611 return;
4535 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4612 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4536 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4613 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4537 continue; 4614 continue;
4538 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4615 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
@@ -4558,7 +4635,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4558 4635
4559static int leaks_show(struct seq_file *m, void *p) 4636static int leaks_show(struct seq_file *m, void *p)
4560{ 4637{
4561 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4638 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4562 struct slab *slabp; 4639 struct slab *slabp;
4563 struct kmem_list3 *l3; 4640 struct kmem_list3 *l3;
4564 const char *name; 4641 const char *name;
@@ -4592,17 +4669,17 @@ static int leaks_show(struct seq_file *m, void *p)
4592 name = cachep->name; 4669 name = cachep->name;
4593 if (n[0] == n[1]) { 4670 if (n[0] == n[1]) {
4594 /* Increase the buffer size */ 4671 /* Increase the buffer size */
4595 mutex_unlock(&cache_chain_mutex); 4672 mutex_unlock(&slab_mutex);
4596 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4673 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4597 if (!m->private) { 4674 if (!m->private) {
4598 /* Too bad, we are really out */ 4675 /* Too bad, we are really out */
4599 m->private = n; 4676 m->private = n;
4600 mutex_lock(&cache_chain_mutex); 4677 mutex_lock(&slab_mutex);
4601 return -ENOMEM; 4678 return -ENOMEM;
4602 } 4679 }
4603 *(unsigned long *)m->private = n[0] * 2; 4680 *(unsigned long *)m->private = n[0] * 2;
4604 kfree(n); 4681 kfree(n);
4605 mutex_lock(&cache_chain_mutex); 4682 mutex_lock(&slab_mutex);
4606 /* Now make sure this entry will be retried */ 4683 /* Now make sure this entry will be retried */
4607 m->count = m->size; 4684 m->count = m->size;
4608 return 0; 4685 return 0;
@@ -4677,6 +4754,6 @@ size_t ksize(const void *objp)
4677 if (unlikely(objp == ZERO_SIZE_PTR)) 4754 if (unlikely(objp == ZERO_SIZE_PTR))
4678 return 0; 4755 return 0;
4679 4756
4680 return obj_size(virt_to_cache(objp)); 4757 return virt_to_cache(objp)->object_size;
4681} 4758}
4682EXPORT_SYMBOL(ksize); 4759EXPORT_SYMBOL(ksize);
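Throughout the mm/slab.c hunks above, obj_size()/buffer_size give way to the common field names: object_size is the caller-visible payload that kmem_cache_size() and ksize() now report, while size is the full per-object stride printed by /proc/slabinfo. As a small illustration of the slabinfo line that s_show() emits with these fields, the sketch below reuses the exact format strings from the hunk; every value in it is made up:

#include <stdio.h>

/* Format one /proc/slabinfo line the way s_show() does above:
 * name, active objects, total objects, object size (cachep->size),
 * objects per slab, pages per slab, then the tunables. */
int main(void)
{
	const char *name = "dentry";            /* example cache name */
	unsigned long active_objs = 10234, num_objs = 11088;
	unsigned int size = 192, num = 21;      /* cachep->size, cachep->num */
	int gfporder = 0;
	unsigned int limit = 120, batchcount = 60, shared = 8;

	printf("%-17s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs,
	       size, num, 1 << gfporder);
	printf(" : tunables %4u %4u %4u\n", limit, batchcount, shared);
	return 0;
}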
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 00000000000..db7848caaa2
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,33 @@
1#ifndef MM_SLAB_H
2#define MM_SLAB_H
3/*
4 * Internal slab definitions
5 */
6
7/*
8 * State of the slab allocator.
9 *
10 * This is used to describe the states of the allocator during bootup.
11 * Allocators use this to gradually bootstrap themselves. Most allocators
12 * have the problem that the structures used for managing slab caches are
13 * allocated from slab caches themselves.
14 */
15enum slab_state {
16 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
20 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */
22};
23
24extern enum slab_state slab_state;
25
26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex;
28extern struct list_head slab_caches;
29
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *));
32
33#endif
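The new mm/slab.h shares one bootstrap state machine between the three allocators, and slab_common.c below defines slab_is_available() simply as slab_state >= UP. A minimal userspace model of how early code gates itself on that state (the static fallback pool is purely illustrative; the kernel uses bootmem/memblock for that job, and malloc() stands in for kmalloc()):

#include <stdio.h>
#include <stdlib.h>

/* Mirror of the enum from mm/slab.h and the test from mm/slab_common.c. */
enum slab_state { DOWN, PARTIAL, PARTIAL_ARRAYCACHE, PARTIAL_L3, UP, FULL };

static enum slab_state slab_state = DOWN;

static int slab_is_available(void)
{
	return slab_state >= UP;
}

static char early_pool[256];   /* illustrative early-boot fallback */
static size_t early_used;

static void *early_or_slab_alloc(size_t size)
{
	if (slab_is_available())
		return malloc(size);        /* stands in for kmalloc() */
	if (early_used + size > sizeof(early_pool))
		return NULL;
	early_used += size;
	return early_pool + early_used - size;
}

int main(void)
{
	void *a = early_or_slab_alloc(64);  /* served from the static pool */
	slab_state = UP;                    /* kmem_cache_init() reached UP */
	void *b = early_or_slab_alloc(64);  /* now served by the allocator */
	printf("early=%p late=%p\n", a, b);
	free(b);
	return 0;
}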
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 00000000000..aa3ca5bb01b
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,120 @@
1/*
2 * Slab allocator functions that are independent of the allocator strategy
3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */
6#include <linux/slab.h>
7
8#include <linux/mm.h>
9#include <linux/poison.h>
10#include <linux/interrupt.h>
11#include <linux/memory.h>
12#include <linux/compiler.h>
13#include <linux/module.h>
14#include <linux/cpu.h>
15#include <linux/uaccess.h>
16#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/page.h>
19
20#include "slab.h"
21
22enum slab_state slab_state;
23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex);
25
26/*
27 * kmem_cache_create - Create a cache.
28 * @name: A string which is used in /proc/slabinfo to identify this cache.
29 * @size: The size of objects to be created in this cache.
30 * @align: The required alignment for the objects.
31 * @flags: SLAB flags
32 * @ctor: A constructor for the objects.
33 *
34 * Returns a ptr to the cache on success, NULL on failure.
 35 * Cannot be called within an interrupt, but can be interrupted.
36 * The @ctor is run when new pages are allocated by the cache.
37 *
38 * The flags are
39 *
40 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
41 * to catch references to uninitialised memory.
42 *
43 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
44 * for buffer overruns.
45 *
46 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
47 * cacheline. This can be beneficial if you're counting cycles as closely
48 * as davem.
49 */
50
51struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
52 unsigned long flags, void (*ctor)(void *))
53{
54 struct kmem_cache *s = NULL;
55
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64
65 get_online_cpus();
66 mutex_lock(&slab_mutex);
67
68#ifdef CONFIG_DEBUG_VM
69 list_for_each_entry(s, &slab_caches, list) {
70 char tmp;
71 int res;
72
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85
86 if (!strcmp(s->name, name)) {
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name"
88 " already exists.\n",
89 name);
90 dump_stack();
91 s = NULL;
92 goto oops;
93 }
94 }
95
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
97#endif
98
99 s = __kmem_cache_create(name, size, align, flags, ctor);
100
101#ifdef CONFIG_DEBUG_VM
102oops:
103#endif
104 mutex_unlock(&slab_mutex);
105 put_online_cpus();
106
107#ifdef CONFIG_DEBUG_VM
108out:
109#endif
110 if (!s && (flags & SLAB_PANIC))
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name);
112
113 return s;
114}
115EXPORT_SYMBOL(kmem_cache_create);
116
117int slab_is_available(void)
118{
119 return slab_state >= UP;
120}
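The kerneldoc above describes the one entry point every allocator now shares. As a reminder of how a client uses it, here is a minimal module-style sketch; struct foo, the cache name and the error handling are made up, and only the calls documented in the header (kmem_cache_create/alloc/free/destroy) are assumed:

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int a, b;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	struct foo *f;

	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;

	/* Objects come back sized and aligned as requested at create time. */
	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
	if (f)
		kmem_cache_free(foo_cache, f);
	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);  /* must not be called with live objects */
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");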
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42cad..45d4ca79933 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -59,6 +59,8 @@
59 59
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include "slab.h"
63
62#include <linux/mm.h> 64#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */ 65#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 66#include <linux/cache.h>
@@ -92,36 +94,6 @@ struct slob_block {
92typedef struct slob_block slob_t; 94typedef struct slob_block slob_t;
93 95
94/* 96/*
95 * We use struct page fields to manage some slob allocation aspects,
96 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
97 * just define our own struct page type variant here.
98 */
99struct slob_page {
100 union {
101 struct {
102 unsigned long flags; /* mandatory */
103 atomic_t _count; /* mandatory */
104 slobidx_t units; /* free units left in page */
105 unsigned long pad[2];
106 slob_t *free; /* first free slob_t in page */
107 struct list_head list; /* linked list of free pages */
108 };
109 struct page page;
110 };
111};
112static inline void struct_slob_page_wrong_size(void)
113{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
114
115/*
116 * free_slob_page: call before a slob_page is returned to the page allocator.
117 */
118static inline void free_slob_page(struct slob_page *sp)
119{
120 reset_page_mapcount(&sp->page);
121 sp->page.mapping = NULL;
122}
123
124/*
125 * All partially free slob pages go on these lists. 97 * All partially free slob pages go on these lists.
126 */ 98 */
127#define SLOB_BREAK1 256 99#define SLOB_BREAK1 256
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium);
131static LIST_HEAD(free_slob_large); 103static LIST_HEAD(free_slob_large);
132 104
133/* 105/*
134 * is_slob_page: True for all slob pages (false for bigblock pages)
135 */
136static inline int is_slob_page(struct slob_page *sp)
137{
138 return PageSlab((struct page *)sp);
139}
140
141static inline void set_slob_page(struct slob_page *sp)
142{
143 __SetPageSlab((struct page *)sp);
144}
145
146static inline void clear_slob_page(struct slob_page *sp)
147{
148 __ClearPageSlab((struct page *)sp);
149}
150
151static inline struct slob_page *slob_page(const void *addr)
152{
153 return (struct slob_page *)virt_to_page(addr);
154}
155
156/*
157 * slob_page_free: true for pages on free_slob_pages list. 106 * slob_page_free: true for pages on free_slob_pages list.
158 */ 107 */
159static inline int slob_page_free(struct slob_page *sp) 108static inline int slob_page_free(struct page *sp)
160{ 109{
161 return PageSlobFree((struct page *)sp); 110 return PageSlobFree(sp);
162} 111}
163 112
164static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 113static void set_slob_page_free(struct page *sp, struct list_head *list)
165{ 114{
166 list_add(&sp->list, list); 115 list_add(&sp->list, list);
167 __SetPageSlobFree((struct page *)sp); 116 __SetPageSlobFree(sp);
168} 117}
169 118
170static inline void clear_slob_page_free(struct slob_page *sp) 119static inline void clear_slob_page_free(struct page *sp)
171{ 120{
172 list_del(&sp->list); 121 list_del(&sp->list);
173 __ClearPageSlobFree((struct page *)sp); 122 __ClearPageSlobFree(sp);
174} 123}
175 124
176#define SLOB_UNIT sizeof(slob_t) 125#define SLOB_UNIT sizeof(slob_t)
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order)
267/* 216/*
268 * Allocate a slob block within a given slob_page sp. 217 * Allocate a slob block within a given slob_page sp.
269 */ 218 */
270static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 219static void *slob_page_alloc(struct page *sp, size_t size, int align)
271{ 220{
272 slob_t *prev, *cur, *aligned = NULL; 221 slob_t *prev, *cur, *aligned = NULL;
273 int delta = 0, units = SLOB_UNITS(size); 222 int delta = 0, units = SLOB_UNITS(size);
274 223
275 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 224 for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
276 slobidx_t avail = slob_units(cur); 225 slobidx_t avail = slob_units(cur);
277 226
278 if (align) { 227 if (align) {
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
296 if (prev) 245 if (prev)
297 set_slob(prev, slob_units(prev), next); 246 set_slob(prev, slob_units(prev), next);
298 else 247 else
299 sp->free = next; 248 sp->freelist = next;
300 } else { /* fragment */ 249 } else { /* fragment */
301 if (prev) 250 if (prev)
302 set_slob(prev, slob_units(prev), cur + units); 251 set_slob(prev, slob_units(prev), cur + units);
303 else 252 else
304 sp->free = cur + units; 253 sp->freelist = cur + units;
305 set_slob(cur + units, avail - units, next); 254 set_slob(cur + units, avail - units, next);
306 } 255 }
307 256
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
320 */ 269 */
321static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) 270static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
322{ 271{
323 struct slob_page *sp; 272 struct page *sp;
324 struct list_head *prev; 273 struct list_head *prev;
325 struct list_head *slob_list; 274 struct list_head *slob_list;
326 slob_t *b = NULL; 275 slob_t *b = NULL;
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
342 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
343 */ 292 */
344 if (node != -1 && page_to_nid(&sp->page) != node) 293 if (node != -1 && page_to_nid(sp) != node)
345 continue; 294 continue;
346#endif 295#endif
347 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
369 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); 318 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
370 if (!b) 319 if (!b)
371 return NULL; 320 return NULL;
372 sp = slob_page(b); 321 sp = virt_to_page(b);
373 set_slob_page(sp); 322 __SetPageSlab(sp);
374 323
375 spin_lock_irqsave(&slob_lock, flags); 324 spin_lock_irqsave(&slob_lock, flags);
376 sp->units = SLOB_UNITS(PAGE_SIZE); 325 sp->units = SLOB_UNITS(PAGE_SIZE);
377 sp->free = b; 326 sp->freelist = b;
378 INIT_LIST_HEAD(&sp->list); 327 INIT_LIST_HEAD(&sp->list);
379 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 328 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
380 set_slob_page_free(sp, slob_list); 329 set_slob_page_free(sp, slob_list);
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
392 */ 341 */
393static void slob_free(void *block, int size) 342static void slob_free(void *block, int size)
394{ 343{
395 struct slob_page *sp; 344 struct page *sp;
396 slob_t *prev, *next, *b = (slob_t *)block; 345 slob_t *prev, *next, *b = (slob_t *)block;
397 slobidx_t units; 346 slobidx_t units;
398 unsigned long flags; 347 unsigned long flags;
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size)
402 return; 351 return;
403 BUG_ON(!size); 352 BUG_ON(!size);
404 353
405 sp = slob_page(block); 354 sp = virt_to_page(block);
406 units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
407 356
408 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size)
412 if (slob_page_free(sp)) 361 if (slob_page_free(sp))
413 clear_slob_page_free(sp); 362 clear_slob_page_free(sp);
414 spin_unlock_irqrestore(&slob_lock, flags); 363 spin_unlock_irqrestore(&slob_lock, flags);
415 clear_slob_page(sp); 364 __ClearPageSlab(sp);
416 free_slob_page(sp); 365 reset_page_mapcount(sp);
417 slob_free_pages(b, 0); 366 slob_free_pages(b, 0);
418 return; 367 return;
419 } 368 }
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size)
421 if (!slob_page_free(sp)) { 370 if (!slob_page_free(sp)) {
422 /* This slob page is about to become partially free. Easy! */ 371 /* This slob page is about to become partially free. Easy! */
423 sp->units = units; 372 sp->units = units;
424 sp->free = b; 373 sp->freelist = b;
425 set_slob(b, units, 374 set_slob(b, units,
426 (void *)((unsigned long)(b + 375 (void *)((unsigned long)(b +
427 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 376 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size)
441 */ 390 */
442 sp->units += units; 391 sp->units += units;
443 392
444 if (b < sp->free) { 393 if (b < (slob_t *)sp->freelist) {
445 if (b + units == sp->free) { 394 if (b + units == sp->freelist) {
446 units += slob_units(sp->free); 395 units += slob_units(sp->freelist);
447 sp->free = slob_next(sp->free); 396 sp->freelist = slob_next(sp->freelist);
448 } 397 }
449 set_slob(b, units, sp->free); 398 set_slob(b, units, sp->freelist);
450 sp->free = b; 399 sp->freelist = b;
451 } else { 400 } else {
452 prev = sp->free; 401 prev = sp->freelist;
453 next = slob_next(prev); 402 next = slob_next(prev);
454 while (b > next) { 403 while (b > next) {
455 prev = next; 404 prev = next;
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node);
522 471
523void kfree(const void *block) 472void kfree(const void *block)
524{ 473{
525 struct slob_page *sp; 474 struct page *sp;
526 475
527 trace_kfree(_RET_IP_, block); 476 trace_kfree(_RET_IP_, block);
528 477
@@ -530,43 +479,36 @@ void kfree(const void *block)
530 return; 479 return;
531 kmemleak_free(block); 480 kmemleak_free(block);
532 481
533 sp = slob_page(block); 482 sp = virt_to_page(block);
534 if (is_slob_page(sp)) { 483 if (PageSlab(sp)) {
535 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
536 unsigned int *m = (unsigned int *)(block - align); 485 unsigned int *m = (unsigned int *)(block - align);
537 slob_free(m, *m + align); 486 slob_free(m, *m + align);
538 } else 487 } else
539 put_page(&sp->page); 488 put_page(sp);
540} 489}
541EXPORT_SYMBOL(kfree); 490EXPORT_SYMBOL(kfree);
542 491
543/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ 492/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
544size_t ksize(const void *block) 493size_t ksize(const void *block)
545{ 494{
546 struct slob_page *sp; 495 struct page *sp;
547 496
548 BUG_ON(!block); 497 BUG_ON(!block);
549 if (unlikely(block == ZERO_SIZE_PTR)) 498 if (unlikely(block == ZERO_SIZE_PTR))
550 return 0; 499 return 0;
551 500
552 sp = slob_page(block); 501 sp = virt_to_page(block);
553 if (is_slob_page(sp)) { 502 if (PageSlab(sp)) {
554 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
555 unsigned int *m = (unsigned int *)(block - align); 504 unsigned int *m = (unsigned int *)(block - align);
556 return SLOB_UNITS(*m) * SLOB_UNIT; 505 return SLOB_UNITS(*m) * SLOB_UNIT;
557 } else 506 } else
558 return sp->page.private; 507 return sp->private;
559} 508}
560EXPORT_SYMBOL(ksize); 509EXPORT_SYMBOL(ksize);
561 510
562struct kmem_cache { 511struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
563 unsigned int size, align;
564 unsigned long flags;
565 const char *name;
566 void (*ctor)(void *);
567};
568
569struct kmem_cache *kmem_cache_create(const char *name, size_t size,
570 size_t align, unsigned long flags, void (*ctor)(void *)) 512 size_t align, unsigned long flags, void (*ctor)(void *))
571{ 513{
572 struct kmem_cache *c; 514 struct kmem_cache *c;
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
589 c->align = ARCH_SLAB_MINALIGN; 531 c->align = ARCH_SLAB_MINALIGN;
590 if (c->align < align) 532 if (c->align < align)
591 c->align = align; 533 c->align = align;
592 } else if (flags & SLAB_PANIC)
593 panic("Cannot create slab cache %s\n", name);
594 534
595 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); 535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 }
596 return c; 538 return c;
597} 539}
598EXPORT_SYMBOL(kmem_cache_create);
599 540
600void kmem_cache_destroy(struct kmem_cache *c) 541void kmem_cache_destroy(struct kmem_cache *c)
601{ 542{
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 619}
679EXPORT_SYMBOL(kmem_cache_shrink); 620EXPORT_SYMBOL(kmem_cache_shrink);
680 621
681static unsigned int slob_ready __read_mostly;
682
683int slab_is_available(void)
684{
685 return slob_ready;
686}
687
688void __init kmem_cache_init(void) 622void __init kmem_cache_init(void)
689{ 623{
690 slob_ready = 1; 624 slab_state = UP;
691} 625}
692 626
693void __init kmem_cache_init_late(void) 627void __init kmem_cache_init_late(void)
694{ 628{
695 /* Nothing to do */ 629 slab_state = FULL;
696} 630}
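The SLOB hunks above drop the private struct slob_page and work on struct page directly, but the kfree()/ksize() paths still rely on the same trick: kmalloc stores the request size in a word placed 'align' bytes before the pointer it hands out, so the free path can recover the size with no other metadata. A userspace sketch of that header scheme (toy_* names and the use of malloc() are illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MINALIGN sizeof(unsigned long)

static void *toy_kmalloc(size_t size)
{
	unsigned char *base = malloc(size + MINALIGN);
	if (!base)
		return NULL;
	*(size_t *)base = size;          /* header: remember the request */
	return base + MINALIGN;
}

static size_t toy_ksize(const void *block)
{
	return *(const size_t *)((const unsigned char *)block - MINALIGN);
}

static void toy_kfree(void *block)
{
	free((unsigned char *)block - MINALIGN);
}

int main(void)
{
	char *p = toy_kmalloc(100);
	memcpy(p, "hello", 6);
	printf("ksize(p) = %zu\n", toy_ksize(p));   /* prints 100 */
	toy_kfree(p);
	return 0;
}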
diff --git a/mm/slub.c b/mm/slub.c
index 8c691fa1cf3..8f78e257703 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h"
19#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 21#include <linux/seq_file.h>
21#include <linux/kmemcheck.h> 22#include <linux/kmemcheck.h>
@@ -33,15 +34,17 @@
33 34
34#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
35 36
37#include "internal.h"
38
36/* 39/*
37 * Lock order: 40 * Lock order:
38 * 1. slub_lock (Global Semaphore) 41 * 1. slab_mutex (Global Mutex)
39 * 2. node->list_lock 42 * 2. node->list_lock
40 * 3. slab_lock(page) (Only on some arches and for debugging) 43 * 3. slab_lock(page) (Only on some arches and for debugging)
41 * 44 *
42 * slub_lock 45 * slab_mutex
43 * 46 *
44 * The role of the slub_lock is to protect the list of all the slabs 47 * The role of the slab_mutex is to protect the list of all the slabs
45 * and to synchronize major metadata changes to slab cache structures. 48 * and to synchronize major metadata changes to slab cache structures.
46 * 49 *
47 * The slab_lock is only used for debugging and on arches that do not 50 * The slab_lock is only used for debugging and on arches that do not
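The rewritten lock-order comment documents the only safe nesting: slab_mutex, then node->list_lock, then slab_lock(page). A trivial sketch of honoring such an order with two locks, where pthread mutexes merely stand in for the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slab_mutex_model = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t node_list_lock_model = PTHREAD_MUTEX_INITIALIZER;

/* Always take the coarse "all caches" lock before any per-node list lock,
 * never the reverse, so the two can't deadlock against each other. */
static void walk_all_slabs(void)
{
	pthread_mutex_lock(&slab_mutex_model);      /* level 1 */
	pthread_mutex_lock(&node_list_lock_model);  /* level 2 */
	/* ... iterate one node's partial list ... */
	pthread_mutex_unlock(&node_list_lock_model);
	pthread_mutex_unlock(&slab_mutex_model);
}

int main(void)
{
	walk_all_slabs();
	puts("lock order respected");
	return 0;
}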
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache);
182static struct notifier_block slab_notifier; 185static struct notifier_block slab_notifier;
183#endif 186#endif
184 187
185static enum {
186 DOWN, /* No slab functionality available */
187 PARTIAL, /* Kmem_cache_node works */
188 UP, /* Everything works but does not show up in sysfs */
189 SYSFS /* Sysfs up */
190} slab_state = DOWN;
191
192/* A list of all slab caches on the system */
193static DECLARE_RWSEM(slub_lock);
194static LIST_HEAD(slab_caches);
195
196/* 188/*
197 * Tracking user of a slab. 189 * Tracking user of a slab.
198 */ 190 */
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
237 * Core slab cache functions 229 * Core slab cache functions
238 *******************************************************************/ 230 *******************************************************************/
239 231
240int slab_is_available(void)
241{
242 return slab_state >= UP;
243}
244
245static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 232static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
246{ 233{
247 return s->node[node]; 234 return s->node[node];
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
311 * and whatever may come after it. 298 * and whatever may come after it.
312 */ 299 */
313 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 300 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
314 return s->objsize; 301 return s->object_size;
315 302
316#endif 303#endif
317 /* 304 /*
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
609 if (p > addr + 16) 596 if (p > addr + 16)
610 print_section("Bytes b4 ", p - 16, 16); 597 print_section("Bytes b4 ", p - 16, 16);
611 598
612 print_section("Object ", p, min_t(unsigned long, s->objsize, 599 print_section("Object ", p, min_t(unsigned long, s->object_size,
613 PAGE_SIZE)); 600 PAGE_SIZE));
614 if (s->flags & SLAB_RED_ZONE) 601 if (s->flags & SLAB_RED_ZONE)
615 print_section("Redzone ", p + s->objsize, 602 print_section("Redzone ", p + s->object_size,
616 s->inuse - s->objsize); 603 s->inuse - s->object_size);
617 604
618 if (s->offset) 605 if (s->offset)
619 off = s->offset + sizeof(void *); 606 off = s->offset + sizeof(void *);
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
655 u8 *p = object; 642 u8 *p = object;
656 643
657 if (s->flags & __OBJECT_POISON) { 644 if (s->flags & __OBJECT_POISON) {
658 memset(p, POISON_FREE, s->objsize - 1); 645 memset(p, POISON_FREE, s->object_size - 1);
659 p[s->objsize - 1] = POISON_END; 646 p[s->object_size - 1] = POISON_END;
660 } 647 }
661 648
662 if (s->flags & SLAB_RED_ZONE) 649 if (s->flags & SLAB_RED_ZONE)
663 memset(p + s->objsize, val, s->inuse - s->objsize); 650 memset(p + s->object_size, val, s->inuse - s->object_size);
664} 651}
665 652
666static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 653static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
705 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 692 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
706 * 0xa5 (POISON_END) 693 * 0xa5 (POISON_END)
707 * 694 *
708 * object + s->objsize 695 * object + s->object_size
709 * Padding to reach word boundary. This is also used for Redzoning. 696 * Padding to reach word boundary. This is also used for Redzoning.
710 * Padding is extended by another word if Redzoning is enabled and 697 * Padding is extended by another word if Redzoning is enabled and
711 * objsize == inuse. 698 * object_size == inuse.
712 * 699 *
713 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 700 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
714 * 0xcc (RED_ACTIVE) for objects in use. 701 * 0xcc (RED_ACTIVE) for objects in use.
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
727 * object + s->size 714 * object + s->size
728 * Nothing is used beyond s->size. 715 * Nothing is used beyond s->size.
729 * 716 *
730 * If slabcaches are merged then the objsize and inuse boundaries are mostly 717 * If slabcaches are merged then the object_size and inuse boundaries are mostly
731 * ignored. And therefore no slab options that rely on these boundaries 718 * ignored. And therefore no slab options that rely on these boundaries
732 * may be used with merged slabcaches. 719 * may be used with merged slabcaches.
733 */ 720 */
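The layout comment above (and the calculate_sizes() hunk further down) spells out one concrete rule: the bytes between object_size and inuse serve as the red zone, and if word-rounding added no padding an extra word is appended so a red zone still exists. A small sketch of just that rule, assuming 8-byte words; everything beyond the documented rule (free pointer, tracking data) is deliberately left out:

#include <stdio.h>

#define WORD sizeof(void *)
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static void layout(unsigned long object_size, int red_zone)
{
	unsigned long size = ALIGN_UP(object_size, WORD);

	if (red_zone && size == object_size)
		size += WORD;    /* make room for a red zone word */

	printf("object_size=%lu inuse=%lu redzone bytes=%lu\n",
	       object_size, size, size - object_size);
}

int main(void)
{
	layout(100, 1);   /* alignment padding doubles as the red zone */
	layout(96, 1);    /* already word aligned: extra word appended */
	return 0;
}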
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
787 void *object, u8 val) 774 void *object, u8 val)
788{ 775{
789 u8 *p = object; 776 u8 *p = object;
790 u8 *endobject = object + s->objsize; 777 u8 *endobject = object + s->object_size;
791 778
792 if (s->flags & SLAB_RED_ZONE) { 779 if (s->flags & SLAB_RED_ZONE) {
793 if (!check_bytes_and_report(s, page, object, "Redzone", 780 if (!check_bytes_and_report(s, page, object, "Redzone",
794 endobject, val, s->inuse - s->objsize)) 781 endobject, val, s->inuse - s->object_size))
795 return 0; 782 return 0;
796 } else { 783 } else {
797 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 784 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
798 check_bytes_and_report(s, page, p, "Alignment padding", 785 check_bytes_and_report(s, page, p, "Alignment padding",
799 endobject, POISON_INUSE, s->inuse - s->objsize); 786 endobject, POISON_INUSE, s->inuse - s->object_size);
800 } 787 }
801 } 788 }
802 789
803 if (s->flags & SLAB_POISON) { 790 if (s->flags & SLAB_POISON) {
804 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 791 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
805 (!check_bytes_and_report(s, page, p, "Poison", p, 792 (!check_bytes_and_report(s, page, p, "Poison", p,
806 POISON_FREE, s->objsize - 1) || 793 POISON_FREE, s->object_size - 1) ||
807 !check_bytes_and_report(s, page, p, "Poison", 794 !check_bytes_and_report(s, page, p, "Poison",
808 p + s->objsize - 1, POISON_END, 1))) 795 p + s->object_size - 1, POISON_END, 1)))
809 return 0; 796 return 0;
810 /* 797 /*
811 * check_pad_bytes cleans up on its own. 798 * check_pad_bytes cleans up on its own.
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
926 page->freelist); 913 page->freelist);
927 914
928 if (!alloc) 915 if (!alloc)
929 print_section("Object ", (void *)object, s->objsize); 916 print_section("Object ", (void *)object, s->object_size);
930 917
931 dump_stack(); 918 dump_stack();
932 } 919 }
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
942 lockdep_trace_alloc(flags); 929 lockdep_trace_alloc(flags);
943 might_sleep_if(flags & __GFP_WAIT); 930 might_sleep_if(flags & __GFP_WAIT);
944 931
945 return should_failslab(s->objsize, flags, s->flags); 932 return should_failslab(s->object_size, flags, s->flags);
946} 933}
947 934
948static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 935static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
949{ 936{
950 flags &= gfp_allowed_mask; 937 flags &= gfp_allowed_mask;
951 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 938 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
952 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 939 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
953} 940}
954 941
955static inline void slab_free_hook(struct kmem_cache *s, void *x) 942static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
966 unsigned long flags; 953 unsigned long flags;
967 954
968 local_irq_save(flags); 955 local_irq_save(flags);
969 kmemcheck_slab_free(s, x, s->objsize); 956 kmemcheck_slab_free(s, x, s->object_size);
970 debug_check_no_locks_freed(x, s->objsize); 957 debug_check_no_locks_freed(x, s->object_size);
971 local_irq_restore(flags); 958 local_irq_restore(flags);
972 } 959 }
973#endif 960#endif
974 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 961 if (!(s->flags & SLAB_DEBUG_OBJECTS))
975 debug_check_no_obj_freed(x, s->objsize); 962 debug_check_no_obj_freed(x, s->object_size);
976} 963}
977 964
978/* 965/*
@@ -1207,7 +1194,7 @@ out:
1207 1194
1208__setup("slub_debug", setup_slub_debug); 1195__setup("slub_debug", setup_slub_debug);
1209 1196
1210static unsigned long kmem_cache_flags(unsigned long objsize, 1197static unsigned long kmem_cache_flags(unsigned long object_size,
1211 unsigned long flags, const char *name, 1198 unsigned long flags, const char *name,
1212 void (*ctor)(void *)) 1199 void (*ctor)(void *))
1213{ 1200{
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1237static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1224static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1238 struct page *page) {} 1225 struct page *page) {}
1239static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1226static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1240static inline unsigned long kmem_cache_flags(unsigned long objsize, 1227static inline unsigned long kmem_cache_flags(unsigned long object_size,
1241 unsigned long flags, const char *name, 1228 unsigned long flags, const char *name,
1242 void (*ctor)(void *)) 1229 void (*ctor)(void *))
1243{ 1230{
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1314 stat(s, ORDER_FALLBACK); 1301 stat(s, ORDER_FALLBACK);
1315 } 1302 }
1316 1303
1317 if (flags & __GFP_WAIT) 1304 if (kmemcheck_enabled && page
1318 local_irq_disable();
1319
1320 if (!page)
1321 return NULL;
1322
1323 if (kmemcheck_enabled
1324 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1325 int pages = 1 << oo_order(oo); 1306 int pages = 1 << oo_order(oo);
1326 1307
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 kmemcheck_mark_unallocated_pages(page, pages); 1317 kmemcheck_mark_unallocated_pages(page, pages);
1337 } 1318 }
1338 1319
1320 if (flags & __GFP_WAIT)
1321 local_irq_disable();
1322 if (!page)
1323 return NULL;
1324
1339 page->objects = oo_objects(oo); 1325 page->objects = oo_objects(oo);
1340 mod_zone_page_state(page_zone(page), 1326 mod_zone_page_state(page_zone(page),
1341 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1370,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1370 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1371 page->slab = s; 1357 page->slab = s;
1372 __SetPageSlab(page); 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1373 1361
1374 start = page_address(page); 1362 start = page_address(page);
1375 1363
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1413 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1414 -pages); 1402 -pages);
1415 1403
1404 __ClearPageSlabPfmemalloc(page);
1416 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1417 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1418 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
1490} 1479}
1491 1480
1492/* 1481/*
1493 * Lock slab, remove from the partial list and put the object into the 1482 * Remove slab from the partial list, freeze it and
1494 * per cpu freelist. 1483 * return the pointer to the freelist.
1495 * 1484 *
1496 * Returns a list of objects or NULL if it fails. 1485 * Returns a list of objects or NULL if it fails.
1497 * 1486 *
1498 * Must hold list_lock. 1487 * Must hold list_lock since we modify the partial list.
1499 */ 1488 */
1500static inline void *acquire_slab(struct kmem_cache *s, 1489static inline void *acquire_slab(struct kmem_cache *s,
1501 struct kmem_cache_node *n, struct page *page, 1490 struct kmem_cache_node *n, struct page *page,
@@ -1510,26 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s,
1510 * The old freelist is the list of objects for the 1499 * The old freelist is the list of objects for the
1511 * per cpu allocation list. 1500 * per cpu allocation list.
1512 */ 1501 */
1513 do { 1502 freelist = page->freelist;
1514 freelist = page->freelist; 1503 counters = page->counters;
1515 counters = page->counters; 1504 new.counters = counters;
1516 new.counters = counters; 1505 if (mode) {
1517 if (mode) { 1506 new.inuse = page->objects;
1518 new.inuse = page->objects; 1507 new.freelist = NULL;
1519 new.freelist = NULL; 1508 } else {
1520 } else { 1509 new.freelist = freelist;
1521 new.freelist = freelist; 1510 }
1522 }
1523 1511
1524 VM_BUG_ON(new.frozen); 1512 VM_BUG_ON(new.frozen);
1525 new.frozen = 1; 1513 new.frozen = 1;
1526 1514
1527 } while (!__cmpxchg_double_slab(s, page, 1515 if (!__cmpxchg_double_slab(s, page,
1528 freelist, counters, 1516 freelist, counters,
1529 new.freelist, new.counters, 1517 new.freelist, new.counters,
1530 "lock and freeze")); 1518 "acquire_slab"))
1519 return NULL;
1531 1520
1532 remove_partial(n, page); 1521 remove_partial(n, page);
1522 WARN_ON(!freelist);
1533 return freelist; 1523 return freelist;
1534} 1524}
1535 1525
@@ -1563,7 +1553,6 @@ static void *get_partial_node(struct kmem_cache *s,
1563 1553
1564 if (!object) { 1554 if (!object) {
1565 c->page = page; 1555 c->page = page;
1566 c->node = page_to_nid(page);
1567 stat(s, ALLOC_FROM_PARTIAL); 1556 stat(s, ALLOC_FROM_PARTIAL);
1568 object = t; 1557 object = t;
1569 available = page->objects - page->inuse; 1558 available = page->objects - page->inuse;
@@ -1617,7 +1606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1617 1606
1618 do { 1607 do {
1619 cpuset_mems_cookie = get_mems_allowed(); 1608 cpuset_mems_cookie = get_mems_allowed();
1620 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 zonelist = node_zonelist(slab_node(), flags);
1621 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1622 struct kmem_cache_node *n; 1611 struct kmem_cache_node *n;
1623 1612
@@ -1731,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1731/* 1720/*
1732 * Remove the cpu slab 1721 * Remove the cpu slab
1733 */ 1722 */
1734static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1723static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
1735{ 1724{
1736 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1725 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1737 struct page *page = c->page;
1738 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1726 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1739 int lock = 0; 1727 int lock = 0;
1740 enum slab_modes l = M_NONE, m = M_NONE; 1728 enum slab_modes l = M_NONE, m = M_NONE;
1741 void *freelist;
1742 void *nextfree; 1729 void *nextfree;
1743 int tail = DEACTIVATE_TO_HEAD; 1730 int tail = DEACTIVATE_TO_HEAD;
1744 struct page new; 1731 struct page new;
@@ -1749,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1749 tail = DEACTIVATE_TO_TAIL; 1736 tail = DEACTIVATE_TO_TAIL;
1750 } 1737 }
1751 1738
1752 c->tid = next_tid(c->tid);
1753 c->page = NULL;
1754 freelist = c->freelist;
1755 c->freelist = NULL;
1756
1757 /* 1739 /*
1758 * Stage one: Free all available per cpu objects back 1740 * Stage one: Free all available per cpu objects back
1759 * to the page freelist while it is still frozen. Leave the 1741 * to the page freelist while it is still frozen. Leave the
@@ -1879,21 +1861,31 @@ redo:
1879 } 1861 }
1880} 1862}
1881 1863
1882/* Unfreeze all the cpu partial slabs */ 1864/*
1865 * Unfreeze all the cpu partial slabs.
1866 *
1867 * This function must be called with interrupt disabled.
1868 */
1883static void unfreeze_partials(struct kmem_cache *s) 1869static void unfreeze_partials(struct kmem_cache *s)
1884{ 1870{
1885 struct kmem_cache_node *n = NULL; 1871 struct kmem_cache_node *n = NULL, *n2 = NULL;
1886 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1872 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1887 struct page *page, *discard_page = NULL; 1873 struct page *page, *discard_page = NULL;
1888 1874
1889 while ((page = c->partial)) { 1875 while ((page = c->partial)) {
1890 enum slab_modes { M_PARTIAL, M_FREE };
1891 enum slab_modes l, m;
1892 struct page new; 1876 struct page new;
1893 struct page old; 1877 struct page old;
1894 1878
1895 c->partial = page->next; 1879 c->partial = page->next;
1896 l = M_FREE; 1880
1881 n2 = get_node(s, page_to_nid(page));
1882 if (n != n2) {
1883 if (n)
1884 spin_unlock(&n->list_lock);
1885
1886 n = n2;
1887 spin_lock(&n->list_lock);
1888 }
1897 1889
1898 do { 1890 do {
1899 1891
@@ -1906,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s)
1906 1898
1907 new.frozen = 0; 1899 new.frozen = 0;
1908 1900
1909 if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1901 } while (!__cmpxchg_double_slab(s, page,
1910 m = M_FREE;
1911 else {
1912 struct kmem_cache_node *n2 = get_node(s,
1913 page_to_nid(page));
1914
1915 m = M_PARTIAL;
1916 if (n != n2) {
1917 if (n)
1918 spin_unlock(&n->list_lock);
1919
1920 n = n2;
1921 spin_lock(&n->list_lock);
1922 }
1923 }
1924
1925 if (l != m) {
1926 if (l == M_PARTIAL) {
1927 remove_partial(n, page);
1928 stat(s, FREE_REMOVE_PARTIAL);
1929 } else {
1930 add_partial(n, page,
1931 DEACTIVATE_TO_TAIL);
1932 stat(s, FREE_ADD_PARTIAL);
1933 }
1934
1935 l = m;
1936 }
1937
1938 } while (!cmpxchg_double_slab(s, page,
1939 old.freelist, old.counters, 1902 old.freelist, old.counters,
1940 new.freelist, new.counters, 1903 new.freelist, new.counters,
1941 "unfreezing slab")); 1904 "unfreezing slab"));
1942 1905
1943 if (m == M_FREE) { 1906 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1944 page->next = discard_page; 1907 page->next = discard_page;
1945 discard_page = page; 1908 discard_page = page;
1909 } else {
1910 add_partial(n, page, DEACTIVATE_TO_TAIL);
1911 stat(s, FREE_ADD_PARTIAL);
1946 } 1912 }
1947 } 1913 }
1948 1914
@@ -2011,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2011static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2012{ 1978{
2013 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
2014 deactivate_slab(s, c); 1980 deactivate_slab(s, c->page, c->freelist);
1981
1982 c->tid = next_tid(c->tid);
1983 c->page = NULL;
1984 c->freelist = NULL;
2015} 1985}
2016 1986
2017/* 1987/*
@@ -2055,10 +2025,10 @@ static void flush_all(struct kmem_cache *s)
2055 * Check if the objects in a per cpu structure fit numa 2025 * Check if the objects in a per cpu structure fit numa
2056 * locality expectations. 2026 * locality expectations.
2057 */ 2027 */
2058static inline int node_match(struct kmem_cache_cpu *c, int node) 2028static inline int node_match(struct page *page, int node)
2059{ 2029{
2060#ifdef CONFIG_NUMA 2030#ifdef CONFIG_NUMA
2061 if (node != NUMA_NO_NODE && c->node != node) 2031 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2062 return 0; 2032 return 0;
2063#endif 2033#endif
2064 return 1; 2034 return 1;
@@ -2101,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2101 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2071 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2102 nid, gfpflags); 2072 nid, gfpflags);
2103 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2073 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
2104 "default order: %d, min order: %d\n", s->name, s->objsize, 2074 "default order: %d, min order: %d\n", s->name, s->object_size,
2105 s->size, oo_order(s->oo), oo_order(s->min)); 2075 s->size, oo_order(s->oo), oo_order(s->min));
2106 2076
2107 if (oo_order(s->min) > get_order(s->objsize)) 2077 if (oo_order(s->min) > get_order(s->object_size))
2108 printk(KERN_WARNING " %s debugging increased min order, use " 2078 printk(KERN_WARNING " %s debugging increased min order, use "
2109 "slub_debug=O to disable.\n", s->name); 2079 "slub_debug=O to disable.\n", s->name);
2110 2080
@@ -2130,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2130static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2100static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2131 int node, struct kmem_cache_cpu **pc) 2101 int node, struct kmem_cache_cpu **pc)
2132{ 2102{
2133 void *object; 2103 void *freelist;
2134 struct kmem_cache_cpu *c; 2104 struct kmem_cache_cpu *c = *pc;
2135 struct page *page = new_slab(s, flags, node); 2105 struct page *page;
2136 2106
2107 freelist = get_partial(s, flags, node, c);
2108
2109 if (freelist)
2110 return freelist;
2111
2112 page = new_slab(s, flags, node);
2137 if (page) { 2113 if (page) {
2138 c = __this_cpu_ptr(s->cpu_slab); 2114 c = __this_cpu_ptr(s->cpu_slab);
2139 if (c->page) 2115 if (c->page)
@@ -2143,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2143 * No other reference to the page yet so we can 2119 * No other reference to the page yet so we can
2144 * muck around with it freely without cmpxchg 2120 * muck around with it freely without cmpxchg
2145 */ 2121 */
2146 object = page->freelist; 2122 freelist = page->freelist;
2147 page->freelist = NULL; 2123 page->freelist = NULL;
2148 2124
2149 stat(s, ALLOC_SLAB); 2125 stat(s, ALLOC_SLAB);
2150 c->node = page_to_nid(page);
2151 c->page = page; 2126 c->page = page;
2152 *pc = c; 2127 *pc = c;
2153 } else 2128 } else
2154 object = NULL; 2129 freelist = NULL;
2155 2130
2156 return object; 2131 return freelist;
2132}
2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2157} 2140}
2158 2141
2159/* 2142/*
@@ -2163,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2163 * The page is still frozen if the return value is not NULL. 2146 * The page is still frozen if the return value is not NULL.
2164 * 2147 *
2165 * If this function returns NULL then the page has been unfrozen. 2148 * If this function returns NULL then the page has been unfrozen.
2149 *
2150 * This function must be called with interrupt disabled.
2166 */ 2151 */
2167static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2152static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2168{ 2153{
@@ -2173,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2173 do { 2158 do {
2174 freelist = page->freelist; 2159 freelist = page->freelist;
2175 counters = page->counters; 2160 counters = page->counters;
2161
2176 new.counters = counters; 2162 new.counters = counters;
2177 VM_BUG_ON(!new.frozen); 2163 VM_BUG_ON(!new.frozen);
2178 2164
2179 new.inuse = page->objects; 2165 new.inuse = page->objects;
2180 new.frozen = freelist != NULL; 2166 new.frozen = freelist != NULL;
2181 2167
2182 } while (!cmpxchg_double_slab(s, page, 2168 } while (!__cmpxchg_double_slab(s, page,
2183 freelist, counters, 2169 freelist, counters,
2184 NULL, new.counters, 2170 NULL, new.counters,
2185 "get_freelist")); 2171 "get_freelist"));
@@ -2206,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2206static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2192static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2207 unsigned long addr, struct kmem_cache_cpu *c) 2193 unsigned long addr, struct kmem_cache_cpu *c)
2208{ 2194{
2209 void **object; 2195 void *freelist;
2196 struct page *page;
2210 unsigned long flags; 2197 unsigned long flags;
2211 2198
2212 local_irq_save(flags); 2199 local_irq_save(flags);
@@ -2219,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2219 c = this_cpu_ptr(s->cpu_slab); 2206 c = this_cpu_ptr(s->cpu_slab);
2220#endif 2207#endif
2221 2208
2222 if (!c->page) 2209 page = c->page;
2210 if (!page)
2223 goto new_slab; 2211 goto new_slab;
2224redo: 2212redo:
2225 if (unlikely(!node_match(c, node))) { 2213
2214 if (unlikely(!node_match(page, node))) {
2226 stat(s, ALLOC_NODE_MISMATCH); 2215 stat(s, ALLOC_NODE_MISMATCH);
2227 deactivate_slab(s, c); 2216 deactivate_slab(s, page, c->freelist);
2217 c->page = NULL;
2218 c->freelist = NULL;
2219 goto new_slab;
2220 }
2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2228 goto new_slab; 2231 goto new_slab;
2229 } 2232 }
2230 2233
2231 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2232 object = c->freelist; 2235 freelist = c->freelist;
2233 if (object) 2236 if (freelist)
2234 goto load_freelist; 2237 goto load_freelist;
2235 2238
2236 stat(s, ALLOC_SLOWPATH); 2239 stat(s, ALLOC_SLOWPATH);
2237 2240
2238 object = get_freelist(s, c->page); 2241 freelist = get_freelist(s, page);
2239 2242
2240 if (!object) { 2243 if (!freelist) {
2241 c->page = NULL; 2244 c->page = NULL;
2242 stat(s, DEACTIVATE_BYPASS); 2245 stat(s, DEACTIVATE_BYPASS);
2243 goto new_slab; 2246 goto new_slab;
@@ -2246,50 +2249,50 @@ redo:
2246 stat(s, ALLOC_REFILL); 2249 stat(s, ALLOC_REFILL);
2247 2250
2248load_freelist: 2251load_freelist:
2249 c->freelist = get_freepointer(s, object); 2252 /*
2253 * freelist is pointing to the list of objects to be used.
2254 * page is pointing to the page from which the objects are obtained.
2255 * That page must be frozen for per cpu allocations to work.
2256 */
2257 VM_BUG_ON(!c->page->frozen);
2258 c->freelist = get_freepointer(s, freelist);
2250 c->tid = next_tid(c->tid); 2259 c->tid = next_tid(c->tid);
2251 local_irq_restore(flags); 2260 local_irq_restore(flags);
2252 return object; 2261 return freelist;
2253 2262
2254new_slab: 2263new_slab:
2255 2264
2256 if (c->partial) { 2265 if (c->partial) {
2257 c->page = c->partial; 2266 page = c->page = c->partial;
2258 c->partial = c->page->next; 2267 c->partial = page->next;
2259 c->node = page_to_nid(c->page);
2260 stat(s, CPU_PARTIAL_ALLOC); 2268 stat(s, CPU_PARTIAL_ALLOC);
2261 c->freelist = NULL; 2269 c->freelist = NULL;
2262 goto redo; 2270 goto redo;
2263 } 2271 }
2264 2272
2265 /* Then do expensive stuff like retrieving pages from the partial lists */ 2273 freelist = new_slab_objects(s, gfpflags, node, &c);
2266 object = get_partial(s, gfpflags, node, c);
2267
2268 if (unlikely(!object)) {
2269 2274
2270 object = new_slab_objects(s, gfpflags, node, &c); 2275 if (unlikely(!freelist)) {
2276 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2277 slab_out_of_memory(s, gfpflags, node);
2271 2278
2272 if (unlikely(!object)) { 2279 local_irq_restore(flags);
2273 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2280 return NULL;
2274 slab_out_of_memory(s, gfpflags, node);
2275
2276 local_irq_restore(flags);
2277 return NULL;
2278 }
2279 } 2281 }
2280 2282
2281 if (likely(!kmem_cache_debug(s))) 2283 page = c->page;
2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2282 goto load_freelist; 2285 goto load_freelist;
2283 2286
2284 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2285 if (!alloc_debug_processing(s, c->page, object, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2286 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2287 2290
2288 c->freelist = get_freepointer(s, object); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
2289 deactivate_slab(s, c); 2292 c->page = NULL;
2290 c->node = NUMA_NO_NODE; 2293 c->freelist = NULL;
2291 local_irq_restore(flags); 2294 local_irq_restore(flags);
2292 return object; 2295 return freelist;
2293} 2296}
2294 2297
2295/* 2298/*
@@ -2307,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
2307{ 2310{
2308 void **object; 2311 void **object;
2309 struct kmem_cache_cpu *c; 2312 struct kmem_cache_cpu *c;
2313 struct page *page;
2310 unsigned long tid; 2314 unsigned long tid;
2311 2315
2312 if (slab_pre_alloc_hook(s, gfpflags)) 2316 if (slab_pre_alloc_hook(s, gfpflags))
@@ -2332,8 +2336,8 @@ redo:
2332 barrier(); 2336 barrier();
2333 2337
2334 object = c->freelist; 2338 object = c->freelist;
2335 if (unlikely(!object || !node_match(c, node))) 2339 page = c->page;
2336 2340 if (unlikely(!object || !node_match(page, node)))
2337 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2338 2342
2339 else { 2343 else {
@@ -2364,7 +2368,7 @@ redo:
2364 } 2368 }
2365 2369
2366 if (unlikely(gfpflags & __GFP_ZERO) && object) 2370 if (unlikely(gfpflags & __GFP_ZERO) && object)
2367 memset(object, 0, s->objsize); 2371 memset(object, 0, s->object_size);
2368 2372
2369 slab_post_alloc_hook(s, gfpflags, object); 2373 slab_post_alloc_hook(s, gfpflags, object);
2370 2374
@@ -2375,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2375{ 2379{
2376 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2380 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2377 2381
2378 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2382 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2379 2383
2380 return ret; 2384 return ret;
2381} 2385}
@@ -2405,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2405 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2409 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2406 2410
2407 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2411 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2408 s->objsize, s->size, gfpflags, node); 2412 s->object_size, s->size, gfpflags, node);
2409 2413
2410 return ret; 2414 return ret;
2411} 2415}
@@ -2900,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
2900static int calculate_sizes(struct kmem_cache *s, int forced_order) 2904static int calculate_sizes(struct kmem_cache *s, int forced_order)
2901{ 2905{
2902 unsigned long flags = s->flags; 2906 unsigned long flags = s->flags;
2903 unsigned long size = s->objsize; 2907 unsigned long size = s->object_size;
2904 unsigned long align = s->align; 2908 unsigned long align = s->align;
2905 int order; 2909 int order;
2906 2910
@@ -2929,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2929 * end of the object and the free pointer. If not then add an 2933 * end of the object and the free pointer. If not then add an
2930 * additional word to have some bytes to store Redzone information. 2934 * additional word to have some bytes to store Redzone information.
2931 */ 2935 */
2932 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2936 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
2933 size += sizeof(void *); 2937 size += sizeof(void *);
2934#endif 2938#endif
2935 2939
@@ -2977,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2977 * user specified and the dynamic determination of cache line size 2981 * user specified and the dynamic determination of cache line size
2978 * on bootup. 2982 * on bootup.
2979 */ 2983 */
2980 align = calculate_alignment(flags, align, s->objsize); 2984 align = calculate_alignment(flags, align, s->object_size);
2981 s->align = align; 2985 s->align = align;
2982 2986
2983 /* 2987 /*
@@ -3025,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3025 memset(s, 0, kmem_size); 3029 memset(s, 0, kmem_size);
3026 s->name = name; 3030 s->name = name;
3027 s->ctor = ctor; 3031 s->ctor = ctor;
3028 s->objsize = size; 3032 s->object_size = size;
3029 s->align = align; 3033 s->align = align;
3030 s->flags = kmem_cache_flags(size, flags, name, ctor); 3034 s->flags = kmem_cache_flags(size, flags, name, ctor);
3031 s->reserved = 0; 3035 s->reserved = 0;
@@ -3040,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3040 * Disable debugging flags that store metadata if the min slab 3044 * Disable debugging flags that store metadata if the min slab
3041 * order increased. 3045 * order increased.
3042 */ 3046 */
3043 if (get_order(s->size) > get_order(s->objsize)) { 3047 if (get_order(s->size) > get_order(s->object_size)) {
3044 s->flags &= ~DEBUG_METADATA_FLAGS; 3048 s->flags &= ~DEBUG_METADATA_FLAGS;
3045 s->offset = 0; 3049 s->offset = 0;
3046 if (!calculate_sizes(s, -1)) 3050 if (!calculate_sizes(s, -1))
@@ -3114,7 +3118,7 @@ error:
3114 */ 3118 */
3115unsigned int kmem_cache_size(struct kmem_cache *s) 3119unsigned int kmem_cache_size(struct kmem_cache *s)
3116{ 3120{
3117 return s->objsize; 3121 return s->object_size;
3118} 3122}
3119EXPORT_SYMBOL(kmem_cache_size); 3123EXPORT_SYMBOL(kmem_cache_size);
3120 3124
@@ -3192,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3192 */ 3196 */
3193void kmem_cache_destroy(struct kmem_cache *s) 3197void kmem_cache_destroy(struct kmem_cache *s)
3194{ 3198{
3195 down_write(&slub_lock); 3199 mutex_lock(&slab_mutex);
3196 s->refcount--; 3200 s->refcount--;
3197 if (!s->refcount) { 3201 if (!s->refcount) {
3198 list_del(&s->list); 3202 list_del(&s->list);
3199 up_write(&slub_lock); 3203 mutex_unlock(&slab_mutex);
3200 if (kmem_cache_close(s)) { 3204 if (kmem_cache_close(s)) {
3201 printk(KERN_ERR "SLUB %s: %s called for cache that " 3205 printk(KERN_ERR "SLUB %s: %s called for cache that "
3202 "still has objects.\n", s->name, __func__); 3206 "still has objects.\n", s->name, __func__);
@@ -3206,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
3206 rcu_barrier(); 3210 rcu_barrier();
3207 sysfs_slab_remove(s); 3211 sysfs_slab_remove(s);
3208 } else 3212 } else
3209 up_write(&slub_lock); 3213 mutex_unlock(&slab_mutex);
3210} 3214}
3211EXPORT_SYMBOL(kmem_cache_destroy); 3215EXPORT_SYMBOL(kmem_cache_destroy);
3212 3216
@@ -3268,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3268 3272
3269 /* 3273 /*
3270 * This function is called with IRQs disabled during early-boot on 3274 * This function is called with IRQs disabled during early-boot on
3271 * single CPU so there's no need to take slub_lock here. 3275 * single CPU so there's no need to take slab_mutex here.
3272 */ 3276 */
3273 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3277 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3274 flags, NULL)) 3278 flags, NULL))
@@ -3553,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg)
3553{ 3557{
3554 struct kmem_cache *s; 3558 struct kmem_cache *s;
3555 3559
3556 down_read(&slub_lock); 3560 mutex_lock(&slab_mutex);
3557 list_for_each_entry(s, &slab_caches, list) 3561 list_for_each_entry(s, &slab_caches, list)
3558 kmem_cache_shrink(s); 3562 kmem_cache_shrink(s);
3559 up_read(&slub_lock); 3563 mutex_unlock(&slab_mutex);
3560 3564
3561 return 0; 3565 return 0;
3562} 3566}
@@ -3577,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg)
3577 if (offline_node < 0) 3581 if (offline_node < 0)
3578 return; 3582 return;
3579 3583
3580 down_read(&slub_lock); 3584 mutex_lock(&slab_mutex);
3581 list_for_each_entry(s, &slab_caches, list) { 3585 list_for_each_entry(s, &slab_caches, list) {
3582 n = get_node(s, offline_node); 3586 n = get_node(s, offline_node);
3583 if (n) { 3587 if (n) {
@@ -3593,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg)
3593 kmem_cache_free(kmem_cache_node, n); 3597 kmem_cache_free(kmem_cache_node, n);
3594 } 3598 }
3595 } 3599 }
3596 up_read(&slub_lock); 3600 mutex_unlock(&slab_mutex);
3597} 3601}
3598 3602
3599static int slab_mem_going_online_callback(void *arg) 3603static int slab_mem_going_online_callback(void *arg)
@@ -3616,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg)
3616 * allocate a kmem_cache_node structure in order to bring the node 3620 * allocate a kmem_cache_node structure in order to bring the node
3617 * online. 3621 * online.
3618 */ 3622 */
3619 down_read(&slub_lock); 3623 mutex_lock(&slab_mutex);
3620 list_for_each_entry(s, &slab_caches, list) { 3624 list_for_each_entry(s, &slab_caches, list) {
3621 /* 3625 /*
3622 * XXX: kmem_cache_alloc_node will fallback to other nodes 3626 * XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3632,7 +3636,7 @@ static int slab_mem_going_online_callback(void *arg)
3632 s->node[nid] = n; 3636 s->node[nid] = n;
3633 } 3637 }
3634out: 3638out:
3635 up_read(&slub_lock); 3639 mutex_unlock(&slab_mutex);
3636 return ret; 3640 return ret;
3637} 3641}
3638 3642
@@ -3843,11 +3847,11 @@ void __init kmem_cache_init(void)
3843 3847
3844 if (s && s->size) { 3848 if (s && s->size) {
3845 char *name = kasprintf(GFP_NOWAIT, 3849 char *name = kasprintf(GFP_NOWAIT,
3846 "dma-kmalloc-%d", s->objsize); 3850 "dma-kmalloc-%d", s->object_size);
3847 3851
3848 BUG_ON(!name); 3852 BUG_ON(!name);
3849 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3853 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3850 s->objsize, SLAB_CACHE_DMA); 3854 s->object_size, SLAB_CACHE_DMA);
3851 } 3855 }
3852 } 3856 }
3853#endif 3857#endif
@@ -3924,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size,
3924 return NULL; 3928 return NULL;
3925} 3929}
3926 3930
3927struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3931struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3928 size_t align, unsigned long flags, void (*ctor)(void *)) 3932 size_t align, unsigned long flags, void (*ctor)(void *))
3929{ 3933{
3930 struct kmem_cache *s; 3934 struct kmem_cache *s;
3931 char *n; 3935 char *n;
3932 3936
3933 if (WARN_ON(!name))
3934 return NULL;
3935
3936 down_write(&slub_lock);
3937 s = find_mergeable(size, align, flags, name, ctor); 3937 s = find_mergeable(size, align, flags, name, ctor);
3938 if (s) { 3938 if (s) {
3939 s->refcount++; 3939 s->refcount++;
@@ -3941,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3941 * Adjust the object sizes so that we clear 3941 * Adjust the object sizes so that we clear
3942 * the complete object on kzalloc. 3942 * the complete object on kzalloc.
3943 */ 3943 */
3944 s->objsize = max(s->objsize, (int)size); 3944 s->object_size = max(s->object_size, (int)size);
3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3946 3946
3947 if (sysfs_slab_alias(s, name)) { 3947 if (sysfs_slab_alias(s, name)) {
3948 s->refcount--; 3948 s->refcount--;
3949 goto err; 3949 return NULL;
3950 } 3950 }
3951 up_write(&slub_lock);
3952 return s; 3951 return s;
3953 } 3952 }
3954 3953
3955 n = kstrdup(name, GFP_KERNEL); 3954 n = kstrdup(name, GFP_KERNEL);
3956 if (!n) 3955 if (!n)
3957 goto err; 3956 return NULL;
3958 3957
3959 s = kmalloc(kmem_size, GFP_KERNEL); 3958 s = kmalloc(kmem_size, GFP_KERNEL);
3960 if (s) { 3959 if (s) {
3961 if (kmem_cache_open(s, n, 3960 if (kmem_cache_open(s, n,
3962 size, align, flags, ctor)) { 3961 size, align, flags, ctor)) {
3962 int r;
3963
3963 list_add(&s->list, &slab_caches); 3964 list_add(&s->list, &slab_caches);
3964 up_write(&slub_lock); 3965 mutex_unlock(&slab_mutex);
3965 if (sysfs_slab_add(s)) { 3966 r = sysfs_slab_add(s);
3966 down_write(&slub_lock); 3967 mutex_lock(&slab_mutex);
3967 list_del(&s->list); 3968
3968 kfree(n); 3969 if (!r)
3969 kfree(s); 3970 return s;
3970 goto err; 3971
3971 } 3972 list_del(&s->list);
3972 return s; 3973 kmem_cache_close(s);
3973 } 3974 }
3974 kfree(s); 3975 kfree(s);
3975 } 3976 }
3976 kfree(n); 3977 kfree(n);
3977err: 3978 return NULL;
3978 up_write(&slub_lock);
3979
3980 if (flags & SLAB_PANIC)
3981 panic("Cannot create slabcache %s\n", name);
3982 else
3983 s = NULL;
3984 return s;
3985} 3979}
3986EXPORT_SYMBOL(kmem_cache_create);
3987 3980
3988#ifdef CONFIG_SMP 3981#ifdef CONFIG_SMP
3989/* 3982/*
@@ -4002,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
4002 case CPU_UP_CANCELED_FROZEN: 3995 case CPU_UP_CANCELED_FROZEN:
4003 case CPU_DEAD: 3996 case CPU_DEAD:
4004 case CPU_DEAD_FROZEN: 3997 case CPU_DEAD_FROZEN:
4005 down_read(&slub_lock); 3998 mutex_lock(&slab_mutex);
4006 list_for_each_entry(s, &slab_caches, list) { 3999 list_for_each_entry(s, &slab_caches, list) {
4007 local_irq_save(flags); 4000 local_irq_save(flags);
4008 __flush_cpu_slab(s, cpu); 4001 __flush_cpu_slab(s, cpu);
4009 local_irq_restore(flags); 4002 local_irq_restore(flags);
4010 } 4003 }
4011 up_read(&slub_lock); 4004 mutex_unlock(&slab_mutex);
4012 break; 4005 break;
4013 default: 4006 default:
4014 break; 4007 break;
@@ -4500,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4500 4493
4501 for_each_possible_cpu(cpu) { 4494 for_each_possible_cpu(cpu) {
4502 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4495 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4503 int node = ACCESS_ONCE(c->node); 4496 int node;
4504 struct page *page; 4497 struct page *page;
4505 4498
4506 if (node < 0)
4507 continue;
4508 page = ACCESS_ONCE(c->page); 4499 page = ACCESS_ONCE(c->page);
4509 if (page) { 4500 if (!page)
4510 if (flags & SO_TOTAL) 4501 continue;
4511 x = page->objects;
4512 else if (flags & SO_OBJECTS)
4513 x = page->inuse;
4514 else
4515 x = 1;
4516 4502
4517 total += x; 4503 node = page_to_nid(page);
4518 nodes[node] += x; 4504 if (flags & SO_TOTAL)
4519 } 4505 x = page->objects;
4520 page = c->partial; 4506 else if (flags & SO_OBJECTS)
4507 x = page->inuse;
4508 else
4509 x = 1;
4521 4510
4511 total += x;
4512 nodes[node] += x;
4513
4514 page = ACCESS_ONCE(c->partial);
4522 if (page) { 4515 if (page) {
4523 x = page->pobjects; 4516 x = page->pobjects;
4524 total += x; 4517 total += x;
4525 nodes[node] += x; 4518 nodes[node] += x;
4526 } 4519 }
4520
4527 per_cpu[node]++; 4521 per_cpu[node]++;
4528 } 4522 }
4529 } 4523 }
@@ -4623,7 +4617,7 @@ SLAB_ATTR_RO(align);
4623 4617
4624static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4618static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4625{ 4619{
4626 return sprintf(buf, "%d\n", s->objsize); 4620 return sprintf(buf, "%d\n", s->object_size);
4627} 4621}
4628SLAB_ATTR_RO(object_size); 4622SLAB_ATTR_RO(object_size);
4629 4623
@@ -5286,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5286 const char *name; 5280 const char *name;
5287 int unmergeable; 5281 int unmergeable;
5288 5282
5289 if (slab_state < SYSFS) 5283 if (slab_state < FULL)
5290 /* Defer until later */ 5284 /* Defer until later */
5291 return 0; 5285 return 0;
5292 5286
@@ -5331,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5331 5325
5332static void sysfs_slab_remove(struct kmem_cache *s) 5326static void sysfs_slab_remove(struct kmem_cache *s)
5333{ 5327{
5334 if (slab_state < SYSFS) 5328 if (slab_state < FULL)
5335 /* 5329 /*
5336 * Sysfs has not been setup yet so no need to remove the 5330 * Sysfs has not been setup yet so no need to remove the
5337 * cache from sysfs. 5331 * cache from sysfs.
@@ -5359,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5359{ 5353{
5360 struct saved_alias *al; 5354 struct saved_alias *al;
5361 5355
5362 if (slab_state == SYSFS) { 5356 if (slab_state == FULL) {
5363 /* 5357 /*
5364 * If we have a leftover link then remove it. 5358 * If we have a leftover link then remove it.
5365 */ 5359 */
@@ -5383,16 +5377,16 @@ static int __init slab_sysfs_init(void)
5383 struct kmem_cache *s; 5377 struct kmem_cache *s;
5384 int err; 5378 int err;
5385 5379
5386 down_write(&slub_lock); 5380 mutex_lock(&slab_mutex);
5387 5381
5388 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5382 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5389 if (!slab_kset) { 5383 if (!slab_kset) {
5390 up_write(&slub_lock); 5384 mutex_unlock(&slab_mutex);
5391 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5385 printk(KERN_ERR "Cannot register slab subsystem.\n");
5392 return -ENOSYS; 5386 return -ENOSYS;
5393 } 5387 }
5394 5388
5395 slab_state = SYSFS; 5389 slab_state = FULL;
5396 5390
5397 list_for_each_entry(s, &slab_caches, list) { 5391 list_for_each_entry(s, &slab_caches, list) {
5398 err = sysfs_slab_add(s); 5392 err = sysfs_slab_add(s);
@@ -5408,11 +5402,11 @@ static int __init slab_sysfs_init(void)
5408 err = sysfs_slab_alias(al->s, al->name); 5402 err = sysfs_slab_alias(al->s, al->name);
5409 if (err) 5403 if (err)
5410 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5404 printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5411 " %s to sysfs\n", s->name); 5405 " %s to sysfs\n", al->name);
5412 kfree(al); 5406 kfree(al);
5413 } 5407 }
5414 5408
5415 up_write(&slub_lock); 5409 mutex_unlock(&slab_mutex);
5416 resiliency_test(); 5410 resiliency_test();
5417 return 0; 5411 return 0;
5418} 5412}
@@ -5427,7 +5421,7 @@ __initcall(slab_sysfs_init);
5427static void print_slabinfo_header(struct seq_file *m) 5421static void print_slabinfo_header(struct seq_file *m)
5428{ 5422{
5429 seq_puts(m, "slabinfo - version: 2.1\n"); 5423 seq_puts(m, "slabinfo - version: 2.1\n");
5430 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 5424 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5431 "<objperslab> <pagesperslab>"); 5425 "<objperslab> <pagesperslab>");
5432 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 5426 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5433 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 5427 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
@@ -5438,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
5438{ 5432{
5439 loff_t n = *pos; 5433 loff_t n = *pos;
5440 5434
5441 down_read(&slub_lock); 5435 mutex_lock(&slab_mutex);
5442 if (!n) 5436 if (!n)
5443 print_slabinfo_header(m); 5437 print_slabinfo_header(m);
5444 5438
@@ -5452,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5452 5446
5453static void s_stop(struct seq_file *m, void *p) 5447static void s_stop(struct seq_file *m, void *p)
5454{ 5448{
5455 up_read(&slub_lock); 5449 mutex_unlock(&slab_mutex);
5456} 5450}
5457 5451
5458static int s_show(struct seq_file *m, void *p) 5452static int s_show(struct seq_file *m, void *p)
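The print_slabinfo_header() hunk above documents the /proc/slabinfo 2.1 column layout that SLUB now emits (with <object_size> replacing <objsize>). As an illustration only, here is a small self-contained userspace C sketch that parses one such line; the sample line and its values are invented, not taken from the diff.

/* Parse one data line of the "slabinfo - version: 2.1" format whose header
 * the SLUB /proc interface above prints.  Field names follow that header;
 * the sample line is made up. */
#include <stdio.h>

int main(void)
{
        const char *line =
                "kmalloc-64  1280  1344  64  64  1 : tunables 0 0 0 "
                ": slabdata 21 21 0";
        char name[64];
        unsigned long active_objs, num_objs, object_size, objperslab;
        unsigned long pagesperslab, active_slabs, num_slabs, sharedavail;

        if (sscanf(line,
                   "%63s %lu %lu %lu %lu %lu : tunables %*u %*u %*u "
                   ": slabdata %lu %lu %lu",
                   name, &active_objs, &num_objs, &object_size,
                   &objperslab, &pagesperslab,
                   &active_slabs, &num_slabs, &sharedavail) != 9) {
                fprintf(stderr, "unexpected slabinfo line format\n");
                return 1;
        }

        printf("%s: %lu/%lu objects in use, %lu bytes each, %lu objs/slab\n",
               name, active_objs, num_objs, object_size, objperslab);
        return 0;
}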
diff --git a/mm/sparse.c b/mm/sparse.c
index c7bb952400c..fac95f2888f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
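__section_nr() above recovers a section number as root_nr * SECTIONS_PER_ROOT plus the offset of the mem_section within its root (and now BUGs if no root matched). A minimal userspace sketch of that arithmetic, assuming an example SECTIONS_PER_ROOT of 256; the real value is config dependent.

#include <assert.h>
#include <stdio.h>

#define SECTIONS_PER_ROOT       256UL   /* assumed example value */

/* SECTION_NR_TO_ROOT(): which root a section number belongs to. */
static unsigned long section_nr_to_root(unsigned long section_nr)
{
        return section_nr / SECTIONS_PER_ROOT;
}

int main(void)
{
        unsigned long section_nr = 1234;
        unsigned long root_nr = section_nr_to_root(section_nr);
        unsigned long offset = section_nr % SECTIONS_PER_ROOT;

        /* __section_nr() recombines the two pieces the same way. */
        assert(root_nr * SECTIONS_PER_ROOT + offset == section_nr);
        printf("section %lu -> root %lu, offset %lu\n",
               section_nr, root_nr, offset);
        return 0;
}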
@@ -493,6 +481,9 @@ void __init sparse_init(void)
493 struct page **map_map; 481 struct page **map_map;
494#endif 482#endif
495 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
496 /* 487 /*
497 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
498 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec6707..77825883298 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
236} 236}
237EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
238 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
 248 * requested. If nr_segs is 0 or negative, returns 0. If no pages 250 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
274 * Must be at least nr_segs long.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
239static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
241 void *arg) 293 void *arg)
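The get_kernel_pages()/get_kernel_page() helpers added above pin kernel pages described by an array of struct kvec, insisting that each segment is exactly one page and reporting how many leading segments were handled. A hedged userspace sketch of just that contract follows; struct kvec and PAGE_SIZE are modelled locally and no real pinning happens here.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL        /* assumed page size for the sketch */

struct kvec {
        void *iov_base;
        size_t iov_len;
};

/* Count how many leading segments are exactly page sized, mirroring the
 * early return on the first WARN_ON(kiov[seg].iov_len != PAGE_SIZE). */
static int count_pinnable_segments(const struct kvec *kiov, int nr_segs)
{
        int seg;

        for (seg = 0; seg < nr_segs; seg++)
                if (kiov[seg].iov_len != PAGE_SIZE)
                        return seg;
        return seg;
}

int main(void)
{
        static char buf[3 * 4096];
        struct kvec kiov[3] = {
                { buf,                 PAGE_SIZE },
                { buf + PAGE_SIZE,     PAGE_SIZE },
                { buf + 2 * PAGE_SIZE, PAGE_SIZE / 2 },  /* not page sized */
        };

        printf("pinnable segments: %d of 3\n",
               count_pinnable_segments(kiov, 3));        /* prints 2 */
        return 0;
}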
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d..0cb36fb1f61 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_no_writeback, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
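The swapin_readahead() context above computes a page_cluster-sized, aligned window around the faulting offset, and the new code wraps those reads in a block plug. A small userspace sketch of the window arithmetic, with page_cluster = 3 chosen only as an example:

#include <stdio.h>

int main(void)
{
        unsigned int page_cluster = 3;          /* 2^3 = 8-page cluster */
        unsigned long offset = 21;              /* faulting swap offset */
        unsigned long mask = (1UL << page_cluster) - 1;
        unsigned long start_offset = offset & ~mask;
        unsigned long end_offset = offset | mask;

        if (!start_offset)      /* first page is the swap header */
                start_offset++;

        /* prints: read offsets 16..23 around 21 */
        printf("read offsets %lu..%lu around %lu\n",
               start_offset, end_offset, offset);
        return 0;
}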
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71373d03fce..14e254c768f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h> 34#include <linux/frontswap.h>
35#include <linux/swapfile.h> 35#include <linux/swapfile.h>
36#include <linux/export.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
548 549
549 /* free if no reference */ 550 /* free if no reference */
550 if (!usage) { 551 if (!usage) {
551 struct gendisk *disk = p->bdev->bd_disk;
552 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset; 553 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
559 nr_swap_pages++; 559 nr_swap_pages++;
560 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 561 frontswap_invalidate_page(p->type, offset);
562 if ((p->flags & SWP_BLKDEV) && 562 if (p->flags & SWP_BLKDEV) {
563 disk->fops->swap_slot_free_notify) 563 struct gendisk *disk = p->bdev->bd_disk;
564 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
565 } 568 }
566 569
567 return usage; 570 return usage;
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
832 835
833 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
834 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
835 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
836 mem_cgroup_cancel_charge_swapin(memcg);
837 ret = 0; 839 ret = 0;
838 goto out; 840 goto out;
839 } 841 }
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1328 list_del(&se->list); 1330 list_del(&se->list);
1329 kfree(se); 1331 kfree(se);
1330 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1331} 1341}
1332 1342
1333/* 1343/*
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1336 * 1346 *
1337 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1338 */ 1348 */
1339static int 1349int
1340add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1341 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1342{ 1352{
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1409 */ 1419 */
1410static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1411{ 1421{
1412 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1413 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1414 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1415 unsigned blkbits;
1416 sector_t probe_block;
1417 sector_t last_block;
1418 sector_t lowest_block = -1;
1419 sector_t highest_block = 0;
1420 int nr_extents = 0;
1421 int ret; 1425 int ret;
1422 1426
1423 inode = sis->swap_file->f_mapping->host;
1424 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1425 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1426 *span = sis->pages; 1429 *span = sis->pages;
1427 goto out; 1430 return ret;
1428 } 1431 }
1429 1432
1430 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1431 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1432 1435 if (!ret) {
1433 /* 1436 sis->flags |= SWP_FILE;
1434 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1435 * to be very smart. 1438 *span = sis->pages;
1436 */
1437 probe_block = 0;
1438 page_no = 0;
1439 last_block = i_size_read(inode) >> blkbits;
1440 while ((probe_block + blocks_per_page) <= last_block &&
1441 page_no < sis->max) {
1442 unsigned block_in_page;
1443 sector_t first_block;
1444
1445 first_block = bmap(inode, probe_block);
1446 if (first_block == 0)
1447 goto bad_bmap;
1448
1449 /*
1450 * It must be PAGE_SIZE aligned on-disk
1451 */
1452 if (first_block & (blocks_per_page - 1)) {
1453 probe_block++;
1454 goto reprobe;
1455 }
1456
1457 for (block_in_page = 1; block_in_page < blocks_per_page;
1458 block_in_page++) {
1459 sector_t block;
1460
1461 block = bmap(inode, probe_block + block_in_page);
1462 if (block == 0)
1463 goto bad_bmap;
1464 if (block != first_block + block_in_page) {
1465 /* Discontiguity */
1466 probe_block++;
1467 goto reprobe;
1468 }
1469 }
1470
1471 first_block >>= (PAGE_SHIFT - blkbits);
1472 if (page_no) { /* exclude the header page */
1473 if (first_block < lowest_block)
1474 lowest_block = first_block;
1475 if (first_block > highest_block)
1476 highest_block = first_block;
1477 } 1439 }
1440 return ret;
1441 }
1478 1442
1479 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1480 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1481 */
1482 ret = add_swap_extent(sis, page_no, 1, first_block);
1483 if (ret < 0)
1484 goto out;
1485 nr_extents += ret;
1486 page_no++;
1487 probe_block += blocks_per_page;
1488reprobe:
1489 continue;
1490 }
1491 ret = nr_extents;
1492 *span = 1 + highest_block - lowest_block;
1493 if (page_no == 0)
1494 page_no = 1; /* force Empty message */
1495 sis->max = page_no;
1496 sis->pages = page_no - 1;
1497 sis->highest_bit = page_no - 1;
1498out:
1499 return ret;
1500bad_bmap:
1501 printk(KERN_ERR "swapon: swapfile has holes\n");
1502 ret = -EINVAL;
1503 goto out;
1504} 1444}
1505 1445
1506static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
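The removed setup_swap_extents() body above walked the swap file with bmap(), accepting a page only if its blocks were physically contiguous and the run started PAGE_SIZE-aligned on disk; that work now sits behind generic_swapfile_activate() or the filesystem's swap_activate hook. A userspace sketch of the per-page check, using an invented block map and 4 blocks per page:

#include <stdio.h>

#define BLOCKS_PER_PAGE 4       /* e.g. 4096-byte page, 1024-byte blocks */

/* Fake bmap(): logical block -> physical block (0 = hole).  Invented data. */
static unsigned long fake_bmap(unsigned long logical)
{
        static const unsigned long map[] = {
                100, 101, 102, 103,     /* contiguous, aligned: good page */
                204, 205, 206, 208,     /* aligned start, discontiguous: rejected */
        };
        return logical < sizeof(map) / sizeof(map[0]) ? map[logical] : 0;
}

static int page_is_mappable(unsigned long probe_block,
                            unsigned long *first_block)
{
        unsigned long b0 = fake_bmap(probe_block);
        unsigned long i;

        if (b0 == 0)                            /* hole */
                return 0;
        if (b0 & (BLOCKS_PER_PAGE - 1))         /* not PAGE_SIZE aligned on disk */
                return 0;
        for (i = 1; i < BLOCKS_PER_PAGE; i++)
                if (fake_bmap(probe_block + i) != b0 + i)
                        return 0;               /* discontiguity */
        *first_block = b0;
        return 1;
}

int main(void)
{
        unsigned long first;

        printf("probe 0: %s\n", page_is_mappable(0, &first) ? "extent" : "skip");
        printf("probe 4: %s\n", page_is_mappable(4, &first) ? "extent" : "skip");
        return 0;
}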
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2285 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2286} 2226}
2287 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2288/* 2253/*
2289 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2290 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
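page_swap_info() and __page_file_index() above treat page_private() as a packed swap entry and extract its type and offset. The sketch below illustrates the idea with an assumed 5-bit-type packing; the kernel's real encoding is arch-specific and not taken from this diff.

#include <stdio.h>

#define SWP_TYPE_BITS 5         /* assumed split, for illustration only */

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
        swp_entry_t e = { (type << (sizeof(long) * 8 - SWP_TYPE_BITS)) | offset };
        return e;
}

static unsigned long swp_type(swp_entry_t e)
{
        return e.val >> (sizeof(long) * 8 - SWP_TYPE_BITS);
}

static unsigned long swp_offset(swp_entry_t e)
{
        return e.val & ((1UL << (sizeof(long) * 8 - SWP_TYPE_BITS)) - 1);
}

int main(void)
{
        swp_entry_t e = swp_entry(2, 12345);    /* swap area 2, offset 12345 */

        /* __page_file_index() returns the offset part, the page's index
         * within the swap file. */
        printf("type %lu, page index in swap file %lu\n",
               swp_type(e), swp_offset(e));
        return 0;
}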
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2aad49981b5..2bb90b1d241 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what caller wants since
910 * get_order(0) returns funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
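The new WARN_ON(size == 0) in vb_alloc() exists because a get_order()-style computation misbehaves for size 0: the size - 1 step underflows and yields an absurdly large order. A userspace sketch that mimics the classic generic helper (PAGE_SHIFT 12 assumed; this is not the kernel source) shows the effect:

#include <stdio.h>

#define PAGE_SHIFT 12

static int order_of(unsigned long size)
{
        int order = -1;

        size = (size - 1) >> (PAGE_SHIFT - 1);
        do {
                size >>= 1;
                order++;
        } while (size);
        return order;
}

int main(void)
{
        printf("order_of(4096) = %d\n", order_of(4096));  /* 0 */
        printf("order_of(8192) = %d\n", order_of(8192));  /* 1 */
        printf("order_of(0)    = %d\n", order_of(0));     /* huge: underflow */
        return 0;
}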
@@ -1280,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock);
1280struct vm_struct *vmlist; 1288struct vm_struct *vmlist;
1281 1289
1282static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller) 1291 unsigned long flags, const void *caller)
1284{ 1292{
1285 vm->flags = flags; 1293 vm->flags = flags;
1286 vm->addr = (void *)va->va_start; 1294 vm->addr = (void *)va->va_start;
@@ -1306,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
1306} 1314}
1307 1315
1308static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1309 unsigned long flags, void *caller) 1317 unsigned long flags, const void *caller)
1310{ 1318{
1311 setup_vmalloc_vm(vm, va, flags, caller); 1319 setup_vmalloc_vm(vm, va, flags, caller);
1312 insert_vmalloc_vmlist(vm); 1320 insert_vmalloc_vmlist(vm);
@@ -1314,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1314 1322
1315static struct vm_struct *__get_vm_area_node(unsigned long size, 1323static struct vm_struct *__get_vm_area_node(unsigned long size,
1316 unsigned long align, unsigned long flags, unsigned long start, 1324 unsigned long align, unsigned long flags, unsigned long start,
1317 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1325 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1318{ 1326{
1319 struct vmap_area *va; 1327 struct vmap_area *va;
1320 struct vm_struct *area; 1328 struct vm_struct *area;
@@ -1375,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
1375 1383
1376struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1384struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1378 void *caller) 1386 const void *caller)
1379{ 1387{
1380 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1381 caller); 1389 caller);
@@ -1397,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1397} 1405}
1398 1406
1399struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1400 void *caller) 1408 const void *caller)
1401{ 1409{
1402 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1403 -1, GFP_KERNEL, caller); 1411 -1, GFP_KERNEL, caller);
1404} 1412}
1405 1413
1406static struct vm_struct *find_vm_area(const void *addr) 1414/**
1415 * find_vm_area - find a continuous kernel virtual area
1416 * @addr: base address
1417 *
1418 * Search for the kernel VM area starting at @addr, and return it.
1419 * It is up to the caller to do all required locking to keep the returned
1420 * pointer valid.
1421 */
1422struct vm_struct *find_vm_area(const void *addr)
1407{ 1423{
1408 struct vmap_area *va; 1424 struct vmap_area *va;
1409 1425
@@ -1568,9 +1584,9 @@ EXPORT_SYMBOL(vmap);
1568 1584
1569static void *__vmalloc_node(unsigned long size, unsigned long align, 1585static void *__vmalloc_node(unsigned long size, unsigned long align,
1570 gfp_t gfp_mask, pgprot_t prot, 1586 gfp_t gfp_mask, pgprot_t prot,
1571 int node, void *caller); 1587 int node, const void *caller);
1572static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1588static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1573 pgprot_t prot, int node, void *caller) 1589 pgprot_t prot, int node, const void *caller)
1574{ 1590{
1575 const int order = 0; 1591 const int order = 0;
1576 struct page **pages; 1592 struct page **pages;
@@ -1643,7 +1659,7 @@ fail:
1643 */ 1659 */
1644void *__vmalloc_node_range(unsigned long size, unsigned long align, 1660void *__vmalloc_node_range(unsigned long size, unsigned long align,
1645 unsigned long start, unsigned long end, gfp_t gfp_mask, 1661 unsigned long start, unsigned long end, gfp_t gfp_mask,
1646 pgprot_t prot, int node, void *caller) 1662 pgprot_t prot, int node, const void *caller)
1647{ 1663{
1648 struct vm_struct *area; 1664 struct vm_struct *area;
1649 void *addr; 1665 void *addr;
@@ -1699,7 +1715,7 @@ fail:
1699 */ 1715 */
1700static void *__vmalloc_node(unsigned long size, unsigned long align, 1716static void *__vmalloc_node(unsigned long size, unsigned long align,
1701 gfp_t gfp_mask, pgprot_t prot, 1717 gfp_t gfp_mask, pgprot_t prot,
1702 int node, void *caller) 1718 int node, const void *caller)
1703{ 1719{
1704 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1720 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1705 gfp_mask, prot, node, caller); 1721 gfp_mask, prot, node, caller);
@@ -1975,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1975 * IOREMAP area is treated as memory hole and no copy is done. 1991 * IOREMAP area is treated as memory hole and no copy is done.
1976 * 1992 *
1977 * If [addr...addr+count) doesn't includes any intersects with alive 1993 * If [addr...addr+count) doesn't includes any intersects with alive
1978 * vm_struct area, returns 0. 1994 * vm_struct area, returns 0. @buf should be kernel's buffer.
1979 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1980 * the caller should guarantee KM_USER0 is not used.
1981 * 1995 *
1982 * Note: In usual ops, vread() is never necessary because the caller 1996 * Note: In usual ops, vread() is never necessary because the caller
1983 * should know vmalloc() area is valid and can use memcpy(). 1997 * should know vmalloc() area is valid and can use memcpy().
@@ -2051,9 +2065,7 @@ finished:
2051 * IOREMAP area is treated as memory hole and no copy is done. 2065 * IOREMAP area is treated as memory hole and no copy is done.
2052 * 2066 *
2053 * If [addr...addr+count) doesn't includes any intersects with alive 2067 * If [addr...addr+count) doesn't includes any intersects with alive
2054 * vm_struct area, returns 0. 2068 * vm_struct area, returns 0. @buf should be kernel's buffer.
2055 * @buf should be kernel's buffer. Because this function uses KM_USER0,
2056 * the caller should guarantee KM_USER0 is not used.
2057 * 2069 *
2058 * Note: In usual ops, vwrite() is never necessary because the caller 2070 * Note: In usual ops, vwrite() is never necessary because the caller
2059 * should know vmalloc() area is valid and can use memcpy(). 2071 * should know vmalloc() area is valid and can use memcpy().
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 661576324c7..8d01243d956 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
135 135
136#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
138{ 138{
139 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
687 687
688 cond_resched(); 688 cond_resched();
689 689
690 mem_cgroup_uncharge_start();
690 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
691 enum page_references references; 692 enum page_references references;
692 struct address_space *mapping; 693 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
721 722
722 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
723 nr_writeback++; 724 /*
724 unlock_page(page); 725 * memcg doesn't have any dirty pages throttling so we
725 goto keep; 726 * could easily OOM just because too many pages are in
727 * writeback and there is nothing else to reclaim.
728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
 737 * Worryingly, ext4, gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
740 */
741 if (global_reclaim(sc) ||
742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
743 /*
744 * This is slightly racy - end_page_writeback()
745 * might have just cleared PageReclaim, then
 746 * setting PageReclaim here ends up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
757 }
758 wait_on_page_writeback(page);
726 } 759 }
727 760
728 references = page_check_references(page, sc); 761 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
921 954
922 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
923 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
924 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
925 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
926 return nr_reclaimed; 960 return nr_reclaimed;
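The PageWriteback handling added above waits for writeback only in memcg reclaim, and only when the page already carries PageReclaim and the caller may do IO; every other case just sets PG_reclaim, counts the page and keeps it locked on the list. A tiny userspace model of that decision, with the page flags reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

enum wb_action { WB_KEEP_AND_MARK_RECLAIM, WB_WAIT_FOR_WRITEBACK };

/* Same condition as the hunk above: global reclaim, a first encounter of
 * the page, or a caller without __GFP_IO never waits. */
static enum wb_action writeback_action(bool global_reclaim, bool page_reclaim,
                                       bool gfp_allows_io)
{
        if (global_reclaim || !page_reclaim || !gfp_allows_io)
                return WB_KEEP_AND_MARK_RECLAIM;
        return WB_WAIT_FOR_WRITEBACK;
}

int main(void)
{
        /* memcg reclaim, second encounter of the page, __GFP_IO allowed. */
        printf("memcg, PageReclaim, __GFP_IO -> %s\n",
               writeback_action(false, true, true) == WB_WAIT_FOR_WRITEBACK ?
               "wait" : "keep");

        /* global reclaim never waits here, it only sets PG_reclaim. */
        printf("global reclaim              -> %s\n",
               writeback_action(true, true, true) == WB_WAIT_FOR_WRITEBACK ?
               "wait" : "keep");
        return 0;
}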
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
1567 * by looking at the fraction of the pages scanned we did rotate back 1601 * by looking at the fraction of the pages scanned we did rotate back
1568 * onto the active list instead of evict. 1602 * onto the active list instead of evict.
1569 * 1603 *
1570 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1604 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1605 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1571 */ 1606 */
1572static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1607static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1573 unsigned long *nr) 1608 unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
2111 return 0; 2146 return 0;
2112} 2147}
2113 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
 2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
2114unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2115 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2116{ 2228{
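pfmemalloc_watermark_ok() above sums the min watermarks and free pages of the zones up to ZONE_NORMAL and throttles direct reclaim once free pages fall to half of that reserve or below. A self-contained userspace sketch of the test, with made-up zone sizes:

#include <stdbool.h>
#include <stdio.h>

struct zone_sample {
        const char *name;
        unsigned long min_wmark_pages;
        unsigned long nr_free_pages;
};

static bool pfmemalloc_watermark_ok(const struct zone_sample *zones, int nr)
{
        unsigned long pfmemalloc_reserve = 0, free_pages = 0;
        int i;

        for (i = 0; i < nr; i++) {
                pfmemalloc_reserve += zones[i].min_wmark_pages;
                free_pages += zones[i].nr_free_pages;
        }
        return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
        struct zone_sample node[] = {   /* zones up to ZONE_NORMAL, invented */
                { "DMA",    128,  20 },
                { "Normal", 4096, 1800 },
        };

        printf("watermark ok: %s\n",
               pfmemalloc_watermark_ok(node, 2) ?
               "yes (no throttling)" : "no (throttle direct reclaim)");
        return 0;
}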
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2130 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2131 }; 2243 };
2132 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2133 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2134 sc.may_writepage, 2255 sc.may_writepage,
2135 gfp_mask); 2256 gfp_mask);
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2141 return nr_reclaimed; 2262 return nr_reclaimed;
2142} 2263}
2143 2264
2144#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2145 2266
2146unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2147 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2274 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2275} 2396}
2276 2397
2277/* is kswapd sleeping prematurely? */ 2398/*
2278static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2279 int classzone_idx) 2405 int classzone_idx)
2280{ 2406{
2281 int i; 2407 int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2284 2410
2285 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2286 if (remaining) 2412 if (remaining)
2287 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
 2418 * processes get throttled, kswapd wakes, a large process exits, thereby
 2419 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2288 2428
2289 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2290 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2317 * must be balanced 2457 * must be balanced
2318 */ 2458 */
2319 if (order) 2459 if (order)
2320 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2321 else 2461 else
2322 return !all_zones_ok; 2462 return all_zones_ok;
2323} 2463}
2324 2464
2325/* 2465/*
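With the return sense fixed above, prepare_kswapd_sleep() lets kswapd sleep for order-0 work only when every zone met its watermark, and for higher orders once pgdat_balanced() sees at least a quarter of the node's pages in balanced zones (balanced_pages >= present_pages >> 2). A userspace sketch of that final decision, with illustrative numbers:

#include <stdbool.h>
#include <stdio.h>

static bool pgdat_balanced(unsigned long balanced_pages,
                           unsigned long present_pages)
{
        return balanced_pages >= (present_pages >> 2);
}

static bool kswapd_may_sleep(int order, bool all_zones_ok,
                             unsigned long balanced_pages,
                             unsigned long present_pages)
{
        if (order)
                return pgdat_balanced(balanced_pages, present_pages);
        return all_zones_ok;
}

int main(void)
{
        /* order-2 reclaim, 300k of 1M pages in balanced zones: 30% >= 25%. */
        printf("order 2: %s\n",
               kswapd_may_sleep(2, false, 300000, 1000000) ?
               "sleep" : "keep reclaiming");

        /* order-0 reclaim with one unbalanced zone left. */
        printf("order 0: %s\n",
               kswapd_may_sleep(0, false, 300000, 1000000) ?
               "sleep" : "keep reclaiming");
        return 0;
}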
@@ -2537,7 +2677,7 @@ loop_again:
2537 * consider it to be no longer congested. It's 2677 * consider it to be no longer congested. It's
2538 * possible there are dirty pages backed by 2678 * possible there are dirty pages backed by
2539 * congested BDIs but as pressure is relieved, 2679 * congested BDIs but as pressure is relieved,
2540 * spectulatively avoid congestion waits 2680 * speculatively avoid congestion waits
2541 */ 2681 */
2542 zone_clear_flag(zone, ZONE_CONGESTED); 2682 zone_clear_flag(zone, ZONE_CONGESTED);
2543 if (i <= *classzone_idx) 2683 if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
2545 } 2685 }
2546 2686
2547 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
 2691 * to be throttled on pfmemalloc_wait as they should now be
 2692 * able to safely make forward progress. Wake them
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2548 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2549 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2550 /* 2700 /*
@@ -2646,7 +2796,7 @@ out:
2646 } 2796 }
2647 2797
2648 /* 2798 /*
2649 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2650 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2651 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2652 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2666 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2667 2817
2668 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2669 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2670 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2671 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2672 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2676 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
2677 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
2678 */ 2828 */
2679 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2680 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2681 2831
2682 /* 2832 /*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2688 * them before going back to sleep. 2838 * them before going back to sleep.
2689 */ 2839 */
2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2691 schedule(); 2841
2842 if (!kthread_should_stop())
2843 schedule();
2844
2692 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2845 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2693 } else { 2846 } else {
2694 if (remaining) 2847 if (remaining)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776a..df7a6748231 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
745 TEXTS_FOR_ZONES("pgsteal_direct") 745 TEXTS_FOR_ZONES("pgsteal_direct")
746 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
747 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
748 749
749#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
750 "zone_reclaim_failed", 751 "zone_reclaim_failed",