author | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 04:22:15 -0400
committer | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 04:22:59 -0400
commit | 07f9479a40cc778bc1462ada11f95b01360ae4ff (patch)
tree | 0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /mm
parent | 9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff)
parent | cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff)
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be
applied for files that didn't exist on the old branch.
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 25
-rw-r--r-- | mm/backing-dev.c | 18
-rw-r--r-- | mm/bootmem.c | 8
-rw-r--r-- | mm/compaction.c | 65
-rw-r--r-- | mm/filemap.c | 211
-rw-r--r-- | mm/huge_memory.c | 69
-rw-r--r-- | mm/hugetlb.c | 16
-rw-r--r-- | mm/hwpoison-inject.c | 2
-rw-r--r-- | mm/internal.h | 2
-rw-r--r-- | mm/kmemleak.c | 6
-rw-r--r-- | mm/ksm.c | 25
-rw-r--r-- | mm/memblock.c | 241
-rw-r--r-- | mm/memcontrol.c | 669
-rw-r--r-- | mm/memory-failure.c | 16
-rw-r--r-- | mm/memory.c | 106
-rw-r--r-- | mm/memory_hotplug.c | 4
-rw-r--r-- | mm/mempolicy.c | 3
-rw-r--r-- | mm/migrate.c | 58
-rw-r--r-- | mm/mlock.c | 17
-rw-r--r-- | mm/mmap.c | 15
-rw-r--r-- | mm/mremap.c | 11
-rw-r--r-- | mm/nobootmem.c | 10
-rw-r--r-- | mm/nommu.c | 58
-rw-r--r-- | mm/oom_kill.c | 89
-rw-r--r-- | mm/page-writeback.c | 25
-rw-r--r-- | mm/page_alloc.c | 95
-rw-r--r-- | mm/page_cgroup.c | 140
-rw-r--r-- | mm/page_io.c | 2
-rw-r--r-- | mm/pagewalk.c | 24
-rw-r--r-- | mm/percpu.c | 13
-rw-r--r-- | mm/readahead.c | 18
-rw-r--r-- | mm/rmap.c | 85
-rw-r--r-- | mm/shmem.c | 11
-rw-r--r-- | mm/slab.c | 61
-rw-r--r-- | mm/slob.c | 6
-rw-r--r-- | mm/slub.c | 376
-rw-r--r-- | mm/sparse.c | 2
-rw-r--r-- | mm/swap.c | 189
-rw-r--r-- | mm/swap_state.c | 5
-rw-r--r-- | mm/swapfile.c | 411
-rw-r--r-- | mm/truncate.c | 22
-rw-r--r-- | mm/util.c | 2
-rw-r--r-- | mm/vmalloc.c | 158
-rw-r--r-- | mm/vmscan.c | 66
-rw-r--r-- | mm/vmstat.c | 27
45 files changed, 2123 insertions, 1359 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f0..8b1a477162dc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@ | |||
1 | config DEBUG_PAGEALLOC | 1 | config DEBUG_PAGEALLOC |
2 | bool "Debug page memory allocations" | 2 | bool "Debug page memory allocations" |
3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC | 3 | depends on DEBUG_KERNEL |
4 | depends on !HIBERNATION || !PPC && !SPARC | 4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC |
5 | depends on !KMEMCHECK | 5 | depends on !KMEMCHECK |
6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
6 | ---help--- | 7 | ---help--- |
7 | Unmap pages from the kernel linear mapping after free_pages(). | 8 | Unmap pages from the kernel linear mapping after free_pages(). |
8 | This results in a large slowdown, but helps to find certain types | 9 | This results in a large slowdown, but helps to find certain types |
9 | of memory corruption. | 10 | of memory corruption. |
10 | 11 | ||
12 | For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, | ||
13 | fill the pages with poison patterns after free_pages() and verify | ||
14 | the patterns before alloc_pages(). Additionally, | ||
15 | this option cannot be enabled in combination with hibernation as | ||
16 | that would result in incorrect warnings of memory corruption after | ||
17 | a resume because free pages are not saved to the suspend image. | ||
18 | |||
11 | config WANT_PAGE_DEBUG_FLAGS | 19 | config WANT_PAGE_DEBUG_FLAGS |
12 | bool | 20 | bool |
13 | 21 | ||
14 | config PAGE_POISONING | 22 | config PAGE_POISONING |
15 | bool "Debug page memory allocations" | 23 | bool |
16 | depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
17 | depends on !HIBERNATION | ||
18 | select DEBUG_PAGEALLOC | ||
19 | select WANT_PAGE_DEBUG_FLAGS | 24 | select WANT_PAGE_DEBUG_FLAGS |
20 | ---help--- | ||
21 | Fill the pages with poison patterns after free_pages() and verify | ||
22 | the patterns before alloc_pages(). This results in a large slowdown, | ||
23 | but helps to find certain types of memory corruption. | ||
24 | |||
25 | This option cannot be enabled in combination with hibernation as | ||
26 | that would result in incorrect warnings of memory corruption after | ||
27 | a resume because free pages are not saved to the suspend image. | ||
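The interleaved side-by-side rendering above is hard to follow, so here is how the affected mm/Kconfig.debug entries read after this hunk, reassembled from the new-side column (an excerpt for readability only; indentation is approximate):

```kconfig
config DEBUG_PAGEALLOC
	bool "Debug page memory allocations"
	depends on DEBUG_KERNEL
	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
	depends on !KMEMCHECK
	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
	---help---
	  Unmap pages from the kernel linear mapping after free_pages().
	  This results in a large slowdown, but helps to find certain types
	  of memory corruption.

	  For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
	  fill the pages with poison patterns after free_pages() and verify
	  the patterns before alloc_pages(). Additionally,
	  this option cannot be enabled in combination with hibernation as
	  that would result in incorrect warnings of memory corruption after
	  a resume because free pages are not saved to the suspend image.

config WANT_PAGE_DEBUG_FLAGS
	bool

config PAGE_POISONING
	bool
	select WANT_PAGE_DEBUG_FLAGS
```

In short, PAGE_POISONING stops being a user-visible option and is instead selected automatically by DEBUG_PAGEALLOC on architectures without ARCH_SUPPORTS_DEBUG_PAGEALLOC.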
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..befc87531e4f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@ | |||
14 | 14 | ||
15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); |
16 | 16 | ||
17 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | ||
18 | { | ||
19 | } | ||
20 | EXPORT_SYMBOL(default_unplug_io_fn); | ||
21 | |||
22 | struct backing_dev_info default_backing_dev_info = { | 17 | struct backing_dev_info default_backing_dev_info = { |
23 | .name = "default", | 18 | .name = "default", |
24 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | 19 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
25 | .state = 0, | 20 | .state = 0, |
26 | .capabilities = BDI_CAP_MAP_COPY, | 21 | .capabilities = BDI_CAP_MAP_COPY, |
27 | .unplug_io_fn = default_unplug_io_fn, | ||
28 | }; | 22 | }; |
29 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | 23 | EXPORT_SYMBOL_GPL(default_backing_dev_info); |
30 | 24 | ||
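For reference, the default_backing_dev_info initializer as it reads after this hunk, with the unplug_io_fn callback and its default implementation gone (reassembled from the new-side column above, excerpt only):

```c
struct backing_dev_info default_backing_dev_info = {
	.name		= "default",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
```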
@@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
73 | struct inode *inode; | 67 | struct inode *inode; |
74 | 68 | ||
75 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; | 69 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; |
76 | spin_lock(&inode_lock); | 70 | spin_lock(&inode_wb_list_lock); |
77 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
78 | nr_dirty++; | 72 | nr_dirty++; |
79 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
80 | nr_io++; | 74 | nr_io++; |
81 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
82 | nr_more_io++; | 76 | nr_more_io++; |
83 | spin_unlock(&inode_lock); | 77 | spin_unlock(&inode_wb_list_lock); |
84 | 78 | ||
85 | global_dirty_limits(&background_thresh, &dirty_thresh); | 79 | global_dirty_limits(&background_thresh, &dirty_thresh); |
86 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) | |||
604 | spin_lock(&sb_lock); | 598 | spin_lock(&sb_lock); |
605 | list_for_each_entry(sb, &super_blocks, s_list) { | 599 | list_for_each_entry(sb, &super_blocks, s_list) { |
606 | if (sb->s_bdi == bdi) | 600 | if (sb->s_bdi == bdi) |
607 | sb->s_bdi = NULL; | 601 | sb->s_bdi = &default_backing_dev_info; |
608 | } | 602 | } |
609 | spin_unlock(&sb_lock); | 603 | spin_unlock(&sb_lock); |
610 | } | 604 | } |
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
682 | if (bdi_has_dirty_io(bdi)) { | 676 | if (bdi_has_dirty_io(bdi)) { |
683 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 677 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
684 | 678 | ||
685 | spin_lock(&inode_lock); | 679 | spin_lock(&inode_wb_list_lock); |
686 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 680 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
687 | list_splice(&bdi->wb.b_io, &dst->b_io); | 681 | list_splice(&bdi->wb.b_io, &dst->b_io); |
688 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 682 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
689 | spin_unlock(&inode_lock); | 683 | spin_unlock(&inode_wb_list_lock); |
690 | } | 684 | } |
691 | 685 | ||
692 | bdi_unregister(bdi); | 686 | bdi_unregister(bdi); |
@@ -793,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait); | |||
793 | * jiffies for either a BDI to exit congestion of the given @sync queue | 787 | * jiffies for either a BDI to exit congestion of the given @sync queue |
794 | * or a write to complete. | 788 | * or a write to complete. |
795 | * | 789 | * |
796 | * In the absense of zone congestion, cond_resched() is called to yield | 790 | * In the absence of zone congestion, cond_resched() is called to yield |
797 | * the processor if necessary but otherwise does not sleep. | 791 | * the processor if necessary but otherwise does not sleep. |
798 | * | 792 | * |
799 | * The return value is 0 if the sleep is for the full timeout. Otherwise, | 793 | * The return value is 0 if the sleep is for the full timeout. Otherwise, |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 07aeb89e396e..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -34,14 +34,6 @@ unsigned long max_low_pfn; | |||
34 | unsigned long min_low_pfn; | 34 | unsigned long min_low_pfn; |
35 | unsigned long max_pfn; | 35 | unsigned long max_pfn; |
36 | 36 | ||
37 | #ifdef CONFIG_CRASH_DUMP | ||
38 | /* | ||
39 | * If we have booted due to a crash, max_pfn will be a very low value. We need | ||
40 | * to know the amount of memory that the previous kernel used. | ||
41 | */ | ||
42 | unsigned long saved_max_pfn; | ||
43 | #endif | ||
44 | |||
45 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; | 37 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
46 | 38 | ||
47 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | 39 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); |
diff --git a/mm/compaction.c b/mm/compaction.c
index 8be430b812de..021a2960ef9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,8 +42,6 @@ struct compact_control { | |||
42 | unsigned int order; /* order a direct compactor needs */ | 42 | unsigned int order; /* order a direct compactor needs */ |
43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
44 | struct zone *zone; | 44 | struct zone *zone; |
45 | |||
46 | int compact_mode; | ||
47 | }; | 45 | }; |
48 | 46 | ||
49 | static unsigned long release_freepages(struct list_head *freelist) | 47 | static unsigned long release_freepages(struct list_head *freelist) |
@@ -155,7 +153,6 @@ static void isolate_freepages(struct zone *zone, | |||
155 | * pages on cc->migratepages. We stop searching if the migrate | 153 | * pages on cc->migratepages. We stop searching if the migrate |
156 | * and free page scanners meet or enough free pages are isolated. | 154 | * and free page scanners meet or enough free pages are isolated. |
157 | */ | 155 | */ |
158 | spin_lock_irqsave(&zone->lock, flags); | ||
159 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | 156 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; |
160 | pfn -= pageblock_nr_pages) { | 157 | pfn -= pageblock_nr_pages) { |
161 | unsigned long isolated; | 158 | unsigned long isolated; |
@@ -178,9 +175,19 @@ static void isolate_freepages(struct zone *zone, | |||
178 | if (!suitable_migration_target(page)) | 175 | if (!suitable_migration_target(page)) |
179 | continue; | 176 | continue; |
180 | 177 | ||
181 | /* Found a block suitable for isolating free pages from */ | 178 | /* |
182 | isolated = isolate_freepages_block(zone, pfn, freelist); | 179 | * Found a block suitable for isolating free pages from. Now |
183 | nr_freepages += isolated; | 180 | * we disabled interrupts, double check things are ok and |
181 | * isolate the pages. This is to minimise the time IRQs | ||
182 | * are disabled | ||
183 | */ | ||
184 | isolated = 0; | ||
185 | spin_lock_irqsave(&zone->lock, flags); | ||
186 | if (suitable_migration_target(page)) { | ||
187 | isolated = isolate_freepages_block(zone, pfn, freelist); | ||
188 | nr_freepages += isolated; | ||
189 | } | ||
190 | spin_unlock_irqrestore(&zone->lock, flags); | ||
184 | 191 | ||
185 | /* | 192 | /* |
186 | * Record the highest PFN we isolated pages from. When next | 193 | * Record the highest PFN we isolated pages from. When next |
@@ -190,7 +197,6 @@ static void isolate_freepages(struct zone *zone, | |||
190 | if (isolated) | 197 | if (isolated) |
191 | high_pfn = max(high_pfn, pfn); | 198 | high_pfn = max(high_pfn, pfn); |
192 | } | 199 | } |
193 | spin_unlock_irqrestore(&zone->lock, flags); | ||
194 | 200 | ||
195 | /* split_free_page does not map the pages */ | 201 | /* split_free_page does not map the pages */ |
196 | list_for_each_entry(page, freelist, lru) { | 202 | list_for_each_entry(page, freelist, lru) { |
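Pieced together from the new-side column of the hunks above, the pageblock scan in isolate_freepages() now takes zone->lock only around the actual isolation and re-checks the block under the lock (a reassembled excerpt from inside the scan loop, not a standalone function):

```c
		if (!suitable_migration_target(page))
			continue;

		/*
		 * Found a block suitable for isolating free pages from. Now
		 * we disabled interrupts, double check things are ok and
		 * isolate the pages. This is to minimise the time IRQs
		 * are disabled
		 */
		isolated = 0;
		spin_lock_irqsave(&zone->lock, flags);
		if (suitable_migration_target(page)) {
			isolated = isolate_freepages_block(zone, pfn, freelist);
			nr_freepages += isolated;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
```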
@@ -271,9 +277,27 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
271 | } | 277 | } |
272 | 278 | ||
273 | /* Time to isolate some pages for migration */ | 279 | /* Time to isolate some pages for migration */ |
280 | cond_resched(); | ||
274 | spin_lock_irq(&zone->lru_lock); | 281 | spin_lock_irq(&zone->lru_lock); |
275 | for (; low_pfn < end_pfn; low_pfn++) { | 282 | for (; low_pfn < end_pfn; low_pfn++) { |
276 | struct page *page; | 283 | struct page *page; |
284 | bool locked = true; | ||
285 | |||
286 | /* give a chance to irqs before checking need_resched() */ | ||
287 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | ||
288 | spin_unlock_irq(&zone->lru_lock); | ||
289 | locked = false; | ||
290 | } | ||
291 | if (need_resched() || spin_is_contended(&zone->lru_lock)) { | ||
292 | if (locked) | ||
293 | spin_unlock_irq(&zone->lru_lock); | ||
294 | cond_resched(); | ||
295 | spin_lock_irq(&zone->lru_lock); | ||
296 | if (fatal_signal_pending(current)) | ||
297 | break; | ||
298 | } else if (!locked) | ||
299 | spin_lock_irq(&zone->lru_lock); | ||
300 | |||
277 | if (!pfn_valid_within(low_pfn)) | 301 | if (!pfn_valid_within(low_pfn)) |
278 | continue; | 302 | continue; |
279 | nr_scanned++; | 303 | nr_scanned++; |
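Similarly, the top of the migration scan loop in isolate_migratepages() now drops zone->lru_lock periodically so IRQs and the scheduler get a chance to run; reassembled from the added lines above (excerpt only):

```c
	/* Time to isolate some pages for migration */
	cond_resched();
	spin_lock_irq(&zone->lru_lock);
	for (; low_pfn < end_pfn; low_pfn++) {
		struct page *page;
		bool locked = true;

		/* give a chance to irqs before checking need_resched() */
		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
			spin_unlock_irq(&zone->lru_lock);
			locked = false;
		}
		if (need_resched() || spin_is_contended(&zone->lru_lock)) {
			if (locked)
				spin_unlock_irq(&zone->lru_lock);
			cond_resched();
			spin_lock_irq(&zone->lru_lock);
			if (fatal_signal_pending(current))
				break;
		} else if (!locked)
			spin_lock_irq(&zone->lru_lock);

		if (!pfn_valid_within(low_pfn))
			continue;
		nr_scanned++;
		/* ... remainder of the scan loop ... */
	}
```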
@@ -397,10 +421,7 @@ static int compact_finished(struct zone *zone, | |||
397 | return COMPACT_COMPLETE; | 421 | return COMPACT_COMPLETE; |
398 | 422 | ||
399 | /* Compaction run is not finished if the watermark is not met */ | 423 | /* Compaction run is not finished if the watermark is not met */ |
400 | if (cc->compact_mode != COMPACT_MODE_KSWAPD) | 424 | watermark = low_wmark_pages(zone); |
401 | watermark = low_wmark_pages(zone); | ||
402 | else | ||
403 | watermark = high_wmark_pages(zone); | ||
404 | watermark += (1 << cc->order); | 425 | watermark += (1 << cc->order); |
405 | 426 | ||
406 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 427 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) |
@@ -413,15 +434,6 @@ static int compact_finished(struct zone *zone, | |||
413 | if (cc->order == -1) | 434 | if (cc->order == -1) |
414 | return COMPACT_CONTINUE; | 435 | return COMPACT_CONTINUE; |
415 | 436 | ||
416 | /* | ||
417 | * Generating only one page of the right order is not enough | ||
418 | * for kswapd, we must continue until we're above the high | ||
419 | * watermark as a pool for high order GFP_ATOMIC allocations | ||
420 | * too. | ||
421 | */ | ||
422 | if (cc->compact_mode == COMPACT_MODE_KSWAPD) | ||
423 | return COMPACT_CONTINUE; | ||
424 | |||
425 | /* Direct compactor: Is a suitable page free? */ | 437 | /* Direct compactor: Is a suitable page free? */ |
426 | for (order = cc->order; order < MAX_ORDER; order++) { | 438 | for (order = cc->order; order < MAX_ORDER; order++) { |
427 | /* Job done if page is free of the right migratetype */ | 439 | /* Job done if page is free of the right migratetype */ |
@@ -508,12 +520,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
508 | 520 | ||
509 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 521 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
510 | unsigned long nr_migrate, nr_remaining; | 522 | unsigned long nr_migrate, nr_remaining; |
523 | int err; | ||
511 | 524 | ||
512 | if (!isolate_migratepages(zone, cc)) | 525 | if (!isolate_migratepages(zone, cc)) |
513 | continue; | 526 | continue; |
514 | 527 | ||
515 | nr_migrate = cc->nr_migratepages; | 528 | nr_migrate = cc->nr_migratepages; |
516 | migrate_pages(&cc->migratepages, compaction_alloc, | 529 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
517 | (unsigned long)cc, false, | 530 | (unsigned long)cc, false, |
518 | cc->sync); | 531 | cc->sync); |
519 | update_nr_listpages(cc); | 532 | update_nr_listpages(cc); |
@@ -527,7 +540,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
527 | nr_remaining); | 540 | nr_remaining); |
528 | 541 | ||
529 | /* Release LRU pages not migrated */ | 542 | /* Release LRU pages not migrated */ |
530 | if (!list_empty(&cc->migratepages)) { | 543 | if (err) { |
531 | putback_lru_pages(&cc->migratepages); | 544 | putback_lru_pages(&cc->migratepages); |
532 | cc->nr_migratepages = 0; | 545 | cc->nr_migratepages = 0; |
533 | } | 546 | } |
@@ -543,8 +556,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
543 | 556 | ||
544 | unsigned long compact_zone_order(struct zone *zone, | 557 | unsigned long compact_zone_order(struct zone *zone, |
545 | int order, gfp_t gfp_mask, | 558 | int order, gfp_t gfp_mask, |
546 | bool sync, | 559 | bool sync) |
547 | int compact_mode) | ||
548 | { | 560 | { |
549 | struct compact_control cc = { | 561 | struct compact_control cc = { |
550 | .nr_freepages = 0, | 562 | .nr_freepages = 0, |
@@ -553,7 +565,6 @@ unsigned long compact_zone_order(struct zone *zone, | |||
553 | .migratetype = allocflags_to_migratetype(gfp_mask), | 565 | .migratetype = allocflags_to_migratetype(gfp_mask), |
554 | .zone = zone, | 566 | .zone = zone, |
555 | .sync = sync, | 567 | .sync = sync, |
556 | .compact_mode = compact_mode, | ||
557 | }; | 568 | }; |
558 | INIT_LIST_HEAD(&cc.freepages); | 569 | INIT_LIST_HEAD(&cc.freepages); |
559 | INIT_LIST_HEAD(&cc.migratepages); | 570 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -599,8 +610,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
599 | nodemask) { | 610 | nodemask) { |
600 | int status; | 611 | int status; |
601 | 612 | ||
602 | status = compact_zone_order(zone, order, gfp_mask, sync, | 613 | status = compact_zone_order(zone, order, gfp_mask, sync); |
603 | COMPACT_MODE_DIRECT_RECLAIM); | ||
604 | rc = max(status, rc); | 614 | rc = max(status, rc); |
605 | 615 | ||
606 | /* If a normal allocation would succeed, stop compacting */ | 616 | /* If a normal allocation would succeed, stop compacting */ |
@@ -631,7 +641,6 @@ static int compact_node(int nid) | |||
631 | .nr_freepages = 0, | 641 | .nr_freepages = 0, |
632 | .nr_migratepages = 0, | 642 | .nr_migratepages = 0, |
633 | .order = -1, | 643 | .order = -1, |
634 | .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, | ||
635 | }; | 644 | }; |
636 | 645 | ||
637 | zone = &pgdat->node_zones[zoneid]; | 646 | zone = &pgdat->node_zones[zoneid]; |
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@ | |||
80 | * ->i_mutex | 80 | * ->i_mutex |
81 | * ->i_alloc_sem (various) | 81 | * ->i_alloc_sem (various) |
82 | * | 82 | * |
83 | * ->inode_lock | 83 | * inode_wb_list_lock |
84 | * ->sb_lock (fs/fs-writeback.c) | 84 | * sb_lock (fs/fs-writeback.c) |
85 | * ->mapping->tree_lock (__sync_single_inode) | 85 | * ->mapping->tree_lock (__sync_single_inode) |
86 | * | 86 | * |
87 | * ->i_mmap_lock | 87 | * ->i_mmap_lock |
@@ -98,8 +98,10 @@ | |||
98 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 98 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
99 | * ->private_lock (page_remove_rmap->set_page_dirty) | 99 | * ->private_lock (page_remove_rmap->set_page_dirty) |
100 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 100 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
101 | * ->inode_lock (page_remove_rmap->set_page_dirty) | 101 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) |
102 | * ->inode_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
103 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | ||
104 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | ||
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 105 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 106 | * |
105 | * (code doesn't rely on that order, so you could switch it around) | 107 | * (code doesn't rely on that order, so you could switch it around) |
@@ -108,11 +110,11 @@ | |||
108 | */ | 110 | */ |
109 | 111 | ||
110 | /* | 112 | /* |
111 | * Remove a page from the page cache and free it. Caller has to make | 113 | * Delete a page from the page cache and free it. Caller has to make |
112 | * sure the page is locked and that nobody else uses it - or that usage | 114 | * sure the page is locked and that nobody else uses it - or that usage |
113 | * is safe. The caller must hold the mapping's tree_lock. | 115 | * is safe. The caller must hold the mapping's tree_lock. |
114 | */ | 116 | */ |
115 | void __remove_from_page_cache(struct page *page) | 117 | void __delete_from_page_cache(struct page *page) |
116 | { | 118 | { |
117 | struct address_space *mapping = page->mapping; | 119 | struct address_space *mapping = page->mapping; |
118 | 120 | ||
@@ -137,7 +139,15 @@ void __remove_from_page_cache(struct page *page) | |||
137 | } | 139 | } |
138 | } | 140 | } |
139 | 141 | ||
140 | void remove_from_page_cache(struct page *page) | 142 | /** |
143 | * delete_from_page_cache - delete page from page cache | ||
144 | * @page: the page which the kernel is trying to remove from page cache | ||
145 | * | ||
146 | * This must be called only on pages that have been verified to be in the page | ||
147 | * cache and locked. It will never put the page into the free list, the caller | ||
148 | * has a reference on the page. | ||
149 | */ | ||
150 | void delete_from_page_cache(struct page *page) | ||
141 | { | 151 | { |
142 | struct address_space *mapping = page->mapping; | 152 | struct address_space *mapping = page->mapping; |
143 | void (*freepage)(struct page *); | 153 | void (*freepage)(struct page *); |
@@ -146,54 +156,25 @@ void remove_from_page_cache(struct page *page) | |||
146 | 156 | ||
147 | freepage = mapping->a_ops->freepage; | 157 | freepage = mapping->a_ops->freepage; |
148 | spin_lock_irq(&mapping->tree_lock); | 158 | spin_lock_irq(&mapping->tree_lock); |
149 | __remove_from_page_cache(page); | 159 | __delete_from_page_cache(page); |
150 | spin_unlock_irq(&mapping->tree_lock); | 160 | spin_unlock_irq(&mapping->tree_lock); |
151 | mem_cgroup_uncharge_cache_page(page); | 161 | mem_cgroup_uncharge_cache_page(page); |
152 | 162 | ||
153 | if (freepage) | 163 | if (freepage) |
154 | freepage(page); | 164 | freepage(page); |
165 | page_cache_release(page); | ||
155 | } | 166 | } |
156 | EXPORT_SYMBOL(remove_from_page_cache); | 167 | EXPORT_SYMBOL(delete_from_page_cache); |
157 | 168 | ||
158 | static int sync_page(void *word) | 169 | static int sleep_on_page(void *word) |
159 | { | 170 | { |
160 | struct address_space *mapping; | ||
161 | struct page *page; | ||
162 | |||
163 | page = container_of((unsigned long *)word, struct page, flags); | ||
164 | |||
165 | /* | ||
166 | * page_mapping() is being called without PG_locked held. | ||
167 | * Some knowledge of the state and use of the page is used to | ||
168 | * reduce the requirements down to a memory barrier. | ||
169 | * The danger here is of a stale page_mapping() return value | ||
170 | * indicating a struct address_space different from the one it's | ||
171 | * associated with when it is associated with one. | ||
172 | * After smp_mb(), it's either the correct page_mapping() for | ||
173 | * the page, or an old page_mapping() and the page's own | ||
174 | * page_mapping() has gone NULL. | ||
175 | * The ->sync_page() address_space operation must tolerate | ||
176 | * page_mapping() going NULL. By an amazing coincidence, | ||
177 | * this comes about because none of the users of the page | ||
178 | * in the ->sync_page() methods make essential use of the | ||
179 | * page_mapping(), merely passing the page down to the backing | ||
180 | * device's unplug functions when it's non-NULL, which in turn | ||
181 | * ignore it for all cases but swap, where only page_private(page) is | ||
182 | * of interest. When page_mapping() does go NULL, the entire | ||
183 | * call stack gracefully ignores the page and returns. | ||
184 | * -- wli | ||
185 | */ | ||
186 | smp_mb(); | ||
187 | mapping = page_mapping(page); | ||
188 | if (mapping && mapping->a_ops && mapping->a_ops->sync_page) | ||
189 | mapping->a_ops->sync_page(page); | ||
190 | io_schedule(); | 171 | io_schedule(); |
191 | return 0; | 172 | return 0; |
192 | } | 173 | } |
193 | 174 | ||
194 | static int sync_page_killable(void *word) | 175 | static int sleep_on_page_killable(void *word) |
195 | { | 176 | { |
196 | sync_page(word); | 177 | sleep_on_page(word); |
197 | return fatal_signal_pending(current) ? -EINTR : 0; | 178 | return fatal_signal_pending(current) ? -EINTR : 0; |
198 | } | 179 | } |
199 | 180 | ||
@@ -387,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
387 | EXPORT_SYMBOL(filemap_write_and_wait_range); | 368 | EXPORT_SYMBOL(filemap_write_and_wait_range); |
388 | 369 | ||
389 | /** | 370 | /** |
371 | * replace_page_cache_page - replace a pagecache page with a new one | ||
372 | * @old: page to be replaced | ||
373 | * @new: page to replace with | ||
374 | * @gfp_mask: allocation mode | ||
375 | * | ||
376 | * This function replaces a page in the pagecache with a new one. On | ||
377 | * success it acquires the pagecache reference for the new page and | ||
378 | * drops it for the old page. Both the old and new pages must be | ||
379 | * locked. This function does not add the new page to the LRU, the | ||
380 | * caller must do that. | ||
381 | * | ||
382 | * The remove + add is atomic. The only way this function can fail is | ||
383 | * memory allocation failure. | ||
384 | */ | ||
385 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | ||
386 | { | ||
387 | int error; | ||
388 | struct mem_cgroup *memcg = NULL; | ||
389 | |||
390 | VM_BUG_ON(!PageLocked(old)); | ||
391 | VM_BUG_ON(!PageLocked(new)); | ||
392 | VM_BUG_ON(new->mapping); | ||
393 | |||
394 | /* | ||
395 | * This is not page migration, but prepare_migration and | ||
396 | * end_migration does enough work for charge replacement. | ||
397 | * | ||
398 | * In the longer term we probably want a specialized function | ||
399 | * for moving the charge from old to new in a more efficient | ||
400 | * manner. | ||
401 | */ | ||
402 | error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); | ||
403 | if (error) | ||
404 | return error; | ||
405 | |||
406 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | ||
407 | if (!error) { | ||
408 | struct address_space *mapping = old->mapping; | ||
409 | void (*freepage)(struct page *); | ||
410 | |||
411 | pgoff_t offset = old->index; | ||
412 | freepage = mapping->a_ops->freepage; | ||
413 | |||
414 | page_cache_get(new); | ||
415 | new->mapping = mapping; | ||
416 | new->index = offset; | ||
417 | |||
418 | spin_lock_irq(&mapping->tree_lock); | ||
419 | __delete_from_page_cache(old); | ||
420 | error = radix_tree_insert(&mapping->page_tree, offset, new); | ||
421 | BUG_ON(error); | ||
422 | mapping->nrpages++; | ||
423 | __inc_zone_page_state(new, NR_FILE_PAGES); | ||
424 | if (PageSwapBacked(new)) | ||
425 | __inc_zone_page_state(new, NR_SHMEM); | ||
426 | spin_unlock_irq(&mapping->tree_lock); | ||
427 | radix_tree_preload_end(); | ||
428 | if (freepage) | ||
429 | freepage(old); | ||
430 | page_cache_release(old); | ||
431 | mem_cgroup_end_migration(memcg, old, new, true); | ||
432 | } else { | ||
433 | mem_cgroup_end_migration(memcg, old, new, false); | ||
434 | } | ||
435 | |||
436 | return error; | ||
437 | } | ||
438 | EXPORT_SYMBOL_GPL(replace_page_cache_page); | ||
439 | |||
440 | /** | ||
390 | * add_to_page_cache_locked - add a locked page to the pagecache | 441 | * add_to_page_cache_locked - add a locked page to the pagecache |
391 | * @page: page to add | 442 | * @page: page to add |
392 | * @mapping: the page's address_space | 443 | * @mapping: the page's address_space |
@@ -479,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
479 | EXPORT_SYMBOL(__page_cache_alloc); | 530 | EXPORT_SYMBOL(__page_cache_alloc); |
480 | #endif | 531 | #endif |
481 | 532 | ||
482 | static int __sleep_on_page_lock(void *word) | ||
483 | { | ||
484 | io_schedule(); | ||
485 | return 0; | ||
486 | } | ||
487 | |||
488 | /* | 533 | /* |
489 | * In order to wait for pages to become available there must be | 534 | * In order to wait for pages to become available there must be |
490 | * waitqueues associated with pages. By using a hash table of | 535 | * waitqueues associated with pages. By using a hash table of |
@@ -512,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) | |||
512 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); | 557 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); |
513 | 558 | ||
514 | if (test_bit(bit_nr, &page->flags)) | 559 | if (test_bit(bit_nr, &page->flags)) |
515 | __wait_on_bit(page_waitqueue(page), &wait, sync_page, | 560 | __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, |
516 | TASK_UNINTERRUPTIBLE); | 561 | TASK_UNINTERRUPTIBLE); |
517 | } | 562 | } |
518 | EXPORT_SYMBOL(wait_on_page_bit); | 563 | EXPORT_SYMBOL(wait_on_page_bit); |
@@ -576,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback); | |||
576 | /** | 621 | /** |
577 | * __lock_page - get a lock on the page, assuming we need to sleep to get it | 622 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
578 | * @page: the page to lock | 623 | * @page: the page to lock |
579 | * | ||
580 | * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some | ||
581 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However | ||
582 | * chances are that on the second loop, the block layer's plug list is empty, | ||
583 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. | ||
584 | */ | 624 | */ |
585 | void __lock_page(struct page *page) | 625 | void __lock_page(struct page *page) |
586 | { | 626 | { |
587 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | 627 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); |
588 | 628 | ||
589 | __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, | 629 | __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, |
590 | TASK_UNINTERRUPTIBLE); | 630 | TASK_UNINTERRUPTIBLE); |
591 | } | 631 | } |
592 | EXPORT_SYMBOL(__lock_page); | 632 | EXPORT_SYMBOL(__lock_page); |
@@ -596,24 +636,10 @@ int __lock_page_killable(struct page *page) | |||
596 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | 636 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); |
597 | 637 | ||
598 | return __wait_on_bit_lock(page_waitqueue(page), &wait, | 638 | return __wait_on_bit_lock(page_waitqueue(page), &wait, |
599 | sync_page_killable, TASK_KILLABLE); | 639 | sleep_on_page_killable, TASK_KILLABLE); |
600 | } | 640 | } |
601 | EXPORT_SYMBOL_GPL(__lock_page_killable); | 641 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
602 | 642 | ||
603 | /** | ||
604 | * __lock_page_nosync - get a lock on the page, without calling sync_page() | ||
605 | * @page: the page to lock | ||
606 | * | ||
607 | * Variant of lock_page that does not require the caller to hold a reference | ||
608 | * on the page's mapping. | ||
609 | */ | ||
610 | void __lock_page_nosync(struct page *page) | ||
611 | { | ||
612 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | ||
613 | __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, | ||
614 | TASK_UNINTERRUPTIBLE); | ||
615 | } | ||
616 | |||
617 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | 643 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
618 | unsigned int flags) | 644 | unsigned int flags) |
619 | { | 645 | { |
@@ -621,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | |||
621 | __lock_page(page); | 647 | __lock_page(page); |
622 | return 1; | 648 | return 1; |
623 | } else { | 649 | } else { |
624 | up_read(&mm->mmap_sem); | 650 | if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { |
625 | wait_on_page_locked(page); | 651 | up_read(&mm->mmap_sem); |
652 | wait_on_page_locked(page); | ||
653 | } | ||
626 | return 0; | 654 | return 0; |
627 | } | 655 | } |
628 | } | 656 | } |
@@ -782,9 +810,13 @@ repeat: | |||
782 | page = radix_tree_deref_slot((void **)pages[i]); | 810 | page = radix_tree_deref_slot((void **)pages[i]); |
783 | if (unlikely(!page)) | 811 | if (unlikely(!page)) |
784 | continue; | 812 | continue; |
813 | |||
814 | /* | ||
815 | * This can only trigger when the entry at index 0 moves out | ||
816 | * of or back to the root: none yet gotten, safe to restart. | ||
817 | */ | ||
785 | if (radix_tree_deref_retry(page)) { | 818 | if (radix_tree_deref_retry(page)) { |
786 | if (ret) | 819 | WARN_ON(start | i); |
787 | start = pages[ret-1]->index; | ||
788 | goto restart; | 820 | goto restart; |
789 | } | 821 | } |
790 | 822 | ||
@@ -800,6 +832,13 @@ repeat: | |||
800 | pages[ret] = page; | 832 | pages[ret] = page; |
801 | ret++; | 833 | ret++; |
802 | } | 834 | } |
835 | |||
836 | /* | ||
837 | * If all entries were removed before we could secure them, | ||
838 | * try again, because callers stop trying once 0 is returned. | ||
839 | */ | ||
840 | if (unlikely(!ret && nr_found)) | ||
841 | goto restart; | ||
803 | rcu_read_unlock(); | 842 | rcu_read_unlock(); |
804 | return ret; | 843 | return ret; |
805 | } | 844 | } |
@@ -834,6 +873,11 @@ repeat: | |||
834 | page = radix_tree_deref_slot((void **)pages[i]); | 873 | page = radix_tree_deref_slot((void **)pages[i]); |
835 | if (unlikely(!page)) | 874 | if (unlikely(!page)) |
836 | continue; | 875 | continue; |
876 | |||
877 | /* | ||
878 | * This can only trigger when the entry at index 0 moves out | ||
879 | * of or back to the root: none yet gotten, safe to restart. | ||
880 | */ | ||
837 | if (radix_tree_deref_retry(page)) | 881 | if (radix_tree_deref_retry(page)) |
838 | goto restart; | 882 | goto restart; |
839 | 883 | ||
@@ -894,6 +938,11 @@ repeat: | |||
894 | page = radix_tree_deref_slot((void **)pages[i]); | 938 | page = radix_tree_deref_slot((void **)pages[i]); |
895 | if (unlikely(!page)) | 939 | if (unlikely(!page)) |
896 | continue; | 940 | continue; |
941 | |||
942 | /* | ||
943 | * This can only trigger when the entry at index 0 moves out | ||
944 | * of or back to the root: none yet gotten, safe to restart. | ||
945 | */ | ||
897 | if (radix_tree_deref_retry(page)) | 946 | if (radix_tree_deref_retry(page)) |
898 | goto restart; | 947 | goto restart; |
899 | 948 | ||
@@ -909,6 +958,13 @@ repeat: | |||
909 | pages[ret] = page; | 958 | pages[ret] = page; |
910 | ret++; | 959 | ret++; |
911 | } | 960 | } |
961 | |||
962 | /* | ||
963 | * If all entries were removed before we could secure them, | ||
964 | * try again, because callers stop trying once 0 is returned. | ||
965 | */ | ||
966 | if (unlikely(!ret && nr_found)) | ||
967 | goto restart; | ||
912 | rcu_read_unlock(); | 968 | rcu_read_unlock(); |
913 | 969 | ||
914 | if (ret) | 970 | if (ret) |
@@ -1298,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1298 | unsigned long seg = 0; | 1354 | unsigned long seg = 0; |
1299 | size_t count; | 1355 | size_t count; |
1300 | loff_t *ppos = &iocb->ki_pos; | 1356 | loff_t *ppos = &iocb->ki_pos; |
1357 | struct blk_plug plug; | ||
1301 | 1358 | ||
1302 | count = 0; | 1359 | count = 0; |
1303 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); | 1360 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1304 | if (retval) | 1361 | if (retval) |
1305 | return retval; | 1362 | return retval; |
1306 | 1363 | ||
1364 | blk_start_plug(&plug); | ||
1365 | |||
1307 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1366 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1308 | if (filp->f_flags & O_DIRECT) { | 1367 | if (filp->f_flags & O_DIRECT) { |
1309 | loff_t size; | 1368 | loff_t size; |
@@ -1376,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1376 | break; | 1435 | break; |
1377 | } | 1436 | } |
1378 | out: | 1437 | out: |
1438 | blk_finish_plug(&plug); | ||
1379 | return retval; | 1439 | return retval; |
1380 | } | 1440 | } |
1381 | EXPORT_SYMBOL(generic_file_aio_read); | 1441 | EXPORT_SYMBOL(generic_file_aio_read); |
@@ -2487,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2487 | { | 2547 | { |
2488 | struct file *file = iocb->ki_filp; | 2548 | struct file *file = iocb->ki_filp; |
2489 | struct inode *inode = file->f_mapping->host; | 2549 | struct inode *inode = file->f_mapping->host; |
2550 | struct blk_plug plug; | ||
2490 | ssize_t ret; | 2551 | ssize_t ret; |
2491 | 2552 | ||
2492 | BUG_ON(iocb->ki_pos != pos); | 2553 | BUG_ON(iocb->ki_pos != pos); |
2493 | 2554 | ||
2494 | mutex_lock(&inode->i_mutex); | 2555 | mutex_lock(&inode->i_mutex); |
2556 | blk_start_plug(&plug); | ||
2495 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 2557 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
2496 | mutex_unlock(&inode->i_mutex); | 2558 | mutex_unlock(&inode->i_mutex); |
2497 | 2559 | ||
@@ -2502,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2502 | if (err < 0 && ret > 0) | 2564 | if (err < 0 && ret > 0) |
2503 | ret = err; | 2565 | ret = err; |
2504 | } | 2566 | } |
2567 | blk_finish_plug(&plug); | ||
2505 | return ret; | 2568 | return ret; |
2506 | } | 2569 | } |
2507 | EXPORT_SYMBOL(generic_file_aio_write); | 2570 | EXPORT_SYMBOL(generic_file_aio_write); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 113e35c47502..470dcda10add 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj, | |||
244 | struct kobj_attribute *attr, char *buf, | 244 | struct kobj_attribute *attr, char *buf, |
245 | enum transparent_hugepage_flag flag) | 245 | enum transparent_hugepage_flag flag) |
246 | { | 246 | { |
247 | if (test_bit(flag, &transparent_hugepage_flags)) | 247 | return sprintf(buf, "%d\n", |
248 | return sprintf(buf, "[yes] no\n"); | 248 | !!test_bit(flag, &transparent_hugepage_flags)); |
249 | else | ||
250 | return sprintf(buf, "yes [no]\n"); | ||
251 | } | 249 | } |
250 | |||
252 | static ssize_t single_flag_store(struct kobject *kobj, | 251 | static ssize_t single_flag_store(struct kobject *kobj, |
253 | struct kobj_attribute *attr, | 252 | struct kobj_attribute *attr, |
254 | const char *buf, size_t count, | 253 | const char *buf, size_t count, |
255 | enum transparent_hugepage_flag flag) | 254 | enum transparent_hugepage_flag flag) |
256 | { | 255 | { |
257 | if (!memcmp("yes", buf, | 256 | unsigned long value; |
258 | min(sizeof("yes")-1, count))) { | 257 | int ret; |
258 | |||
259 | ret = kstrtoul(buf, 10, &value); | ||
260 | if (ret < 0) | ||
261 | return ret; | ||
262 | if (value > 1) | ||
263 | return -EINVAL; | ||
264 | |||
265 | if (value) | ||
259 | set_bit(flag, &transparent_hugepage_flags); | 266 | set_bit(flag, &transparent_hugepage_flags); |
260 | } else if (!memcmp("no", buf, | 267 | else |
261 | min(sizeof("no")-1, count))) { | ||
262 | clear_bit(flag, &transparent_hugepage_flags); | 268 | clear_bit(flag, &transparent_hugepage_flags); |
263 | } else | ||
264 | return -EINVAL; | ||
265 | 269 | ||
266 | return count; | 270 | return count; |
267 | } | 271 | } |
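Put back together from the new-side column, the sysfs single-flag helpers now report 0/1 and parse the written value with kstrtoul() instead of matching the strings "yes"/"no" (a reassembled excerpt):

```c
static ssize_t single_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}
```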
@@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
643 | return ret; | 647 | return ret; |
644 | } | 648 | } |
645 | 649 | ||
646 | static inline gfp_t alloc_hugepage_gfpmask(int defrag) | 650 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) |
647 | { | 651 | { |
648 | return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); | 652 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
649 | } | 653 | } |
650 | 654 | ||
651 | static inline struct page *alloc_hugepage_vma(int defrag, | 655 | static inline struct page *alloc_hugepage_vma(int defrag, |
652 | struct vm_area_struct *vma, | 656 | struct vm_area_struct *vma, |
653 | unsigned long haddr, int nd) | 657 | unsigned long haddr, int nd, |
658 | gfp_t extra_gfp) | ||
654 | { | 659 | { |
655 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), | 660 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), |
656 | HPAGE_PMD_ORDER, vma, haddr, nd); | 661 | HPAGE_PMD_ORDER, vma, haddr, nd); |
657 | } | 662 | } |
658 | 663 | ||
659 | #ifndef CONFIG_NUMA | 664 | #ifndef CONFIG_NUMA |
660 | static inline struct page *alloc_hugepage(int defrag) | 665 | static inline struct page *alloc_hugepage(int defrag) |
661 | { | 666 | { |
662 | return alloc_pages(alloc_hugepage_gfpmask(defrag), | 667 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), |
663 | HPAGE_PMD_ORDER); | 668 | HPAGE_PMD_ORDER); |
664 | } | 669 | } |
665 | #endif | 670 | #endif |
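The hugepage allocation helpers gain an extra_gfp argument, which lets callers such as collapse_huge_page() later in this diff pass __GFP_OTHER_NODE; reassembled from the new-side column of the hunk above:

```c
static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}

static inline struct page *alloc_hugepage_vma(int defrag,
					      struct vm_area_struct *vma,
					      unsigned long haddr, int nd,
					      gfp_t extra_gfp)
{
	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
			       HPAGE_PMD_ORDER, vma, haddr, nd);
}

#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
			   HPAGE_PMD_ORDER);
}
#endif
```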
@@ -678,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
678 | if (unlikely(khugepaged_enter(vma))) | 683 | if (unlikely(khugepaged_enter(vma))) |
679 | return VM_FAULT_OOM; | 684 | return VM_FAULT_OOM; |
680 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 685 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
681 | vma, haddr, numa_node_id()); | 686 | vma, haddr, numa_node_id(), 0); |
682 | if (unlikely(!page)) | 687 | if (unlikely(!page)) { |
688 | count_vm_event(THP_FAULT_FALLBACK); | ||
683 | goto out; | 689 | goto out; |
690 | } | ||
691 | count_vm_event(THP_FAULT_ALLOC); | ||
684 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | 692 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { |
685 | put_page(page); | 693 | put_page(page); |
686 | goto out; | 694 | goto out; |
@@ -799,7 +807,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
799 | } | 807 | } |
800 | 808 | ||
801 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 809 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
802 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, | 810 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | |
811 | __GFP_OTHER_NODE, | ||
803 | vma, address, page_to_nid(page)); | 812 | vma, address, page_to_nid(page)); |
804 | if (unlikely(!pages[i] || | 813 | if (unlikely(!pages[i] || |
805 | mem_cgroup_newpage_charge(pages[i], mm, | 814 | mem_cgroup_newpage_charge(pages[i], mm, |
@@ -902,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
902 | if (transparent_hugepage_enabled(vma) && | 911 | if (transparent_hugepage_enabled(vma) && |
903 | !transparent_hugepage_debug_cow()) | 912 | !transparent_hugepage_debug_cow()) |
904 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 913 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
905 | vma, haddr, numa_node_id()); | 914 | vma, haddr, numa_node_id(), 0); |
906 | else | 915 | else |
907 | new_page = NULL; | 916 | new_page = NULL; |
908 | 917 | ||
909 | if (unlikely(!new_page)) { | 918 | if (unlikely(!new_page)) { |
919 | count_vm_event(THP_FAULT_FALLBACK); | ||
910 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 920 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
911 | pmd, orig_pmd, page, haddr); | 921 | pmd, orig_pmd, page, haddr); |
912 | put_page(page); | 922 | put_page(page); |
913 | goto out; | 923 | goto out; |
914 | } | 924 | } |
925 | count_vm_event(THP_FAULT_ALLOC); | ||
915 | 926 | ||
916 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 927 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
917 | put_page(new_page); | 928 | put_page(new_page); |
@@ -1388,6 +1399,7 @@ int split_huge_page(struct page *page) | |||
1388 | 1399 | ||
1389 | BUG_ON(!PageSwapBacked(page)); | 1400 | BUG_ON(!PageSwapBacked(page)); |
1390 | __split_huge_page(page, anon_vma); | 1401 | __split_huge_page(page, anon_vma); |
1402 | count_vm_event(THP_SPLIT); | ||
1391 | 1403 | ||
1392 | BUG_ON(PageCompound(page)); | 1404 | BUG_ON(PageCompound(page)); |
1393 | out_unlock: | 1405 | out_unlock: |
@@ -1779,12 +1791,14 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1779 | * scalability. | 1791 | * scalability. |
1780 | */ | 1792 | */ |
1781 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 1793 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
1782 | node); | 1794 | node, __GFP_OTHER_NODE); |
1783 | if (unlikely(!new_page)) { | 1795 | if (unlikely(!new_page)) { |
1784 | up_read(&mm->mmap_sem); | 1796 | up_read(&mm->mmap_sem); |
1797 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
1785 | *hpage = ERR_PTR(-ENOMEM); | 1798 | *hpage = ERR_PTR(-ENOMEM); |
1786 | return; | 1799 | return; |
1787 | } | 1800 | } |
1801 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
1788 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1802 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
1789 | up_read(&mm->mmap_sem); | 1803 | up_read(&mm->mmap_sem); |
1790 | put_page(new_page); | 1804 | put_page(new_page); |
@@ -2149,8 +2163,11 @@ static void khugepaged_do_scan(struct page **hpage) | |||
2149 | #ifndef CONFIG_NUMA | 2163 | #ifndef CONFIG_NUMA |
2150 | if (!*hpage) { | 2164 | if (!*hpage) { |
2151 | *hpage = alloc_hugepage(khugepaged_defrag()); | 2165 | *hpage = alloc_hugepage(khugepaged_defrag()); |
2152 | if (unlikely(!*hpage)) | 2166 | if (unlikely(!*hpage)) { |
2167 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2153 | break; | 2168 | break; |
2169 | } | ||
2170 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2154 | } | 2171 | } |
2155 | #else | 2172 | #else |
2156 | if (IS_ERR(*hpage)) | 2173 | if (IS_ERR(*hpage)) |
@@ -2190,8 +2207,11 @@ static struct page *khugepaged_alloc_hugepage(void) | |||
2190 | 2207 | ||
2191 | do { | 2208 | do { |
2192 | hpage = alloc_hugepage(khugepaged_defrag()); | 2209 | hpage = alloc_hugepage(khugepaged_defrag()); |
2193 | if (!hpage) | 2210 | if (!hpage) { |
2211 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2194 | khugepaged_alloc_sleep(); | 2212 | khugepaged_alloc_sleep(); |
2213 | } else | ||
2214 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2195 | } while (unlikely(!hpage) && | 2215 | } while (unlikely(!hpage) && |
2196 | likely(khugepaged_enabled())); | 2216 | likely(khugepaged_enabled())); |
2197 | return hpage; | 2217 | return hpage; |
@@ -2208,8 +2228,11 @@ static void khugepaged_loop(void) | |||
2208 | while (likely(khugepaged_enabled())) { | 2228 | while (likely(khugepaged_enabled())) { |
2209 | #ifndef CONFIG_NUMA | 2229 | #ifndef CONFIG_NUMA |
2210 | hpage = khugepaged_alloc_hugepage(); | 2230 | hpage = khugepaged_alloc_hugepage(); |
2211 | if (unlikely(!hpage)) | 2231 | if (unlikely(!hpage)) { |
2232 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2212 | break; | 2233 | break; |
2234 | } | ||
2235 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2213 | #else | 2236 | #else |
2214 | if (IS_ERR(hpage)) { | 2237 | if (IS_ERR(hpage)) { |
2215 | khugepaged_alloc_sleep(); | 2238 | khugepaged_alloc_sleep(); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 838fe25f704c..bbb4a5bbb958 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t) | |||
146 | if (rg->from > t) | 146 | if (rg->from > t) |
147 | return chg; | 147 | return chg; |
148 | 148 | ||
149 | /* We overlap with this area, if it extends futher than | 149 | /* We overlap with this area, if it extends further than |
150 | * us then we must extend ourselves. Account for its | 150 | * us then we must extend ourselves. Account for its |
151 | * existing reservation. */ | 151 | * existing reservation. */ |
152 | if (rg->to > t) { | 152 | if (rg->to > t) { |
@@ -842,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) | |||
842 | } | 842 | } |
843 | 843 | ||
844 | /* | 844 | /* |
845 | * Increase the hugetlb pool such that it can accomodate a reservation | 845 | * Increase the hugetlb pool such that it can accommodate a reservation |
846 | * of size 'delta'. | 846 | * of size 'delta'. |
847 | */ | 847 | */ |
848 | static int gather_surplus_pages(struct hstate *h, int delta) | 848 | static int gather_surplus_pages(struct hstate *h, int delta) |
@@ -890,7 +890,7 @@ retry: | |||
890 | 890 | ||
891 | /* | 891 | /* |
892 | * The surplus_list now contains _at_least_ the number of extra pages | 892 | * The surplus_list now contains _at_least_ the number of extra pages |
893 | * needed to accomodate the reservation. Add the appropriate number | 893 | * needed to accommodate the reservation. Add the appropriate number |
894 | * of pages to the hugetlb pool and free the extras back to the buddy | 894 | * of pages to the hugetlb pool and free the extras back to the buddy |
895 | * allocator. Commit the entire reservation here to prevent another | 895 | * allocator. Commit the entire reservation here to prevent another |
896 | * process from stealing the pages as they are added to the pool but | 896 | * process from stealing the pages as they are added to the pool but |
@@ -1872,8 +1872,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
1872 | unsigned long tmp; | 1872 | unsigned long tmp; |
1873 | int ret; | 1873 | int ret; |
1874 | 1874 | ||
1875 | if (!write) | 1875 | tmp = h->max_huge_pages; |
1876 | tmp = h->max_huge_pages; | ||
1877 | 1876 | ||
1878 | if (write && h->order >= MAX_ORDER) | 1877 | if (write && h->order >= MAX_ORDER) |
1879 | return -EINVAL; | 1878 | return -EINVAL; |
@@ -1938,8 +1937,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1938 | unsigned long tmp; | 1937 | unsigned long tmp; |
1939 | int ret; | 1938 | int ret; |
1940 | 1939 | ||
1941 | if (!write) | 1940 | tmp = h->nr_overcommit_huge_pages; |
1942 | tmp = h->nr_overcommit_huge_pages; | ||
1943 | 1941 | ||
1944 | if (write && h->order >= MAX_ORDER) | 1942 | if (write && h->order >= MAX_ORDER) |
1945 | return -EINVAL; | 1943 | return -EINVAL; |
@@ -2045,7 +2043,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
2045 | * This new VMA should share its siblings reservation map if present. | 2043 | * This new VMA should share its siblings reservation map if present. |
2046 | * The VMA will only ever have a valid reservation map pointer where | 2044 | * The VMA will only ever have a valid reservation map pointer where |
2047 | * it is being copied for another still existing VMA. As that VMA | 2045 | * it is being copied for another still existing VMA. As that VMA |
2048 | * has a reference to the reservation map it cannot dissappear until | 2046 | * has a reference to the reservation map it cannot disappear until |
2049 | * after this open call completes. It is therefore safe to take a | 2047 | * after this open call completes. It is therefore safe to take a |
2050 | * new reference here without additional locking. | 2048 | * new reference here without additional locking. |
2051 | */ | 2049 | */ |
@@ -2492,7 +2490,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2492 | /* | 2490 | /* |
2493 | * Currently, we are forced to kill the process in the event the | 2491 | * Currently, we are forced to kill the process in the event the |
2494 | * original mapper has unmapped pages from the child due to a failed | 2492 | * original mapper has unmapped pages from the child due to a failed |
2495 | * COW. Warn that such a situation has occured as it may not be obvious | 2493 | * COW. Warn that such a situation has occurred as it may not be obvious |
2496 | */ | 2494 | */ |
2497 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | 2495 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { |
2498 | printk(KERN_WARNING | 2496 | printk(KERN_WARNING |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6b..c7fc7fd00e32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@ | |||
1 | /* Inject a hwpoison memory failure on a arbitary pfn */ | 1 | /* Inject a hwpoison memory failure on a arbitrary pfn */ |
2 | #include <linux/module.h> | 2 | #include <linux/module.h> |
3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
diff --git a/mm/internal.h b/mm/internal.h
index 3438dd43a062..9d0ced8e505e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -162,7 +162,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset) | |||
162 | } | 162 | } |
163 | 163 | ||
164 | /* | 164 | /* |
165 | * Iterator over all subpages withing the maximally aligned gigantic | 165 | * Iterator over all subpages within the maximally aligned gigantic |
166 | * page 'base'. Handle any discontiguity in the mem_map. | 166 | * page 'base'. Handle any discontiguity in the mem_map. |
167 | */ | 167 | */ |
168 | static inline struct page *mem_map_next(struct page *iter, | 168 | static inline struct page *mem_map_next(struct page *iter, |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84225f3b7190..c1d5867543e4 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -265,7 +265,7 @@ static void kmemleak_disable(void); | |||
265 | } while (0) | 265 | } while (0) |
266 | 266 | ||
267 | /* | 267 | /* |
268 | * Macro invoked when a serious kmemleak condition occured and cannot be | 268 | * Macro invoked when a serious kmemleak condition occurred and cannot be |
269 | * recovered from. Kmemleak will be disabled and further allocation/freeing | 269 | * recovered from. Kmemleak will be disabled and further allocation/freeing |
270 | * tracing no longer available. | 270 | * tracing no longer available. |
271 | */ | 271 | */ |
@@ -1006,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object) | |||
1006 | 1006 | ||
1007 | /* | 1007 | /* |
1008 | * Memory scanning is a long process and it needs to be interruptable. This | 1008 | * Memory scanning is a long process and it needs to be interruptable. This |
1009 | * function checks whether such interrupt condition occured. | 1009 | * function checks whether such interrupt condition occurred. |
1010 | */ | 1010 | */ |
1011 | static int scan_should_stop(void) | 1011 | static int scan_should_stop(void) |
1012 | { | 1012 | { |
@@ -1733,7 +1733,7 @@ static int __init kmemleak_late_init(void) | |||
1733 | 1733 | ||
1734 | if (atomic_read(&kmemleak_error)) { | 1734 | if (atomic_read(&kmemleak_error)) { |
1735 | /* | 1735 | /* |
1736 | * Some error occured and kmemleak was disabled. There is a | 1736 | * Some error occurred and kmemleak was disabled. There is a |
1737 | * small chance that kmemleak_disable() was called immediately | 1737 | * small chance that kmemleak_disable() was called immediately |
1738 | * after setting kmemleak_initialized and we may end up with | 1738 | * after setting kmemleak_initialized and we may end up with |
1739 | * two clean-up threads but serialized by scan_mutex. | 1739 | * two clean-up threads but serialized by scan_mutex. |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -301,20 +301,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
301 | return rmap_item->address & STABLE_FLAG; | 301 | return rmap_item->address & STABLE_FLAG; |
302 | } | 302 | } |
303 | 303 | ||
304 | static void hold_anon_vma(struct rmap_item *rmap_item, | ||
305 | struct anon_vma *anon_vma) | ||
306 | { | ||
307 | rmap_item->anon_vma = anon_vma; | ||
308 | get_anon_vma(anon_vma); | ||
309 | } | ||
310 | |||
311 | static void ksm_drop_anon_vma(struct rmap_item *rmap_item) | ||
312 | { | ||
313 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
314 | |||
315 | drop_anon_vma(anon_vma); | ||
316 | } | ||
317 | |||
318 | /* | 304 | /* |
319 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 305 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
320 | * page tables after it has passed through ksm_exit() - which, if necessary, | 306 | * page tables after it has passed through ksm_exit() - which, if necessary, |
@@ -397,7 +383,7 @@ static void break_cow(struct rmap_item *rmap_item) | |||
397 | * It is not an accident that whenever we want to break COW | 383 | * It is not an accident that whenever we want to break COW |
398 | * to undo, we also need to drop a reference to the anon_vma. | 384 | * to undo, we also need to drop a reference to the anon_vma. |
399 | */ | 385 | */ |
400 | ksm_drop_anon_vma(rmap_item); | 386 | put_anon_vma(rmap_item->anon_vma); |
401 | 387 | ||
402 | down_read(&mm->mmap_sem); | 388 | down_read(&mm->mmap_sem); |
403 | if (ksm_test_exit(mm)) | 389 | if (ksm_test_exit(mm)) |
@@ -466,7 +452,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
466 | ksm_pages_sharing--; | 452 | ksm_pages_sharing--; |
467 | else | 453 | else |
468 | ksm_pages_shared--; | 454 | ksm_pages_shared--; |
469 | ksm_drop_anon_vma(rmap_item); | 455 | put_anon_vma(rmap_item->anon_vma); |
470 | rmap_item->address &= PAGE_MASK; | 456 | rmap_item->address &= PAGE_MASK; |
471 | cond_resched(); | 457 | cond_resched(); |
472 | } | 458 | } |
@@ -554,7 +540,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
554 | else | 540 | else |
555 | ksm_pages_shared--; | 541 | ksm_pages_shared--; |
556 | 542 | ||
557 | ksm_drop_anon_vma(rmap_item); | 543 | put_anon_vma(rmap_item->anon_vma); |
558 | rmap_item->address &= PAGE_MASK; | 544 | rmap_item->address &= PAGE_MASK; |
559 | 545 | ||
560 | } else if (rmap_item->address & UNSTABLE_FLAG) { | 546 | } else if (rmap_item->address & UNSTABLE_FLAG) { |
@@ -734,7 +720,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
734 | swapped = PageSwapCache(page); | 720 | swapped = PageSwapCache(page); |
735 | flush_cache_page(vma, addr, page_to_pfn(page)); | 721 | flush_cache_page(vma, addr, page_to_pfn(page)); |
736 | /* | 722 | /* |
737 | * Ok this is tricky, when get_user_pages_fast() run it doesnt | 723 | * Ok this is tricky, when get_user_pages_fast() run it doesn't |
738 | * take any lock, therefore the check that we are going to make | 724 | * take any lock, therefore the check that we are going to make |
739 | * with the pagecount against the mapcount is racey and | 725 | * with the pagecount against the mapcount is racey and |
740 | * O_DIRECT can happen right after the check. | 726 | * O_DIRECT can happen right after the check. |
@@ -949,7 +935,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, | |||
949 | goto out; | 935 | goto out; |
950 | 936 | ||
951 | /* Must get reference to anon_vma while still holding mmap_sem */ | 937 | /* Must get reference to anon_vma while still holding mmap_sem */ |
952 | hold_anon_vma(rmap_item, vma->anon_vma); | 938 | rmap_item->anon_vma = vma->anon_vma; |
939 | get_anon_vma(vma->anon_vma); | ||
953 | out: | 940 | out: |
954 | up_read(&mm->mmap_sem); | 941 | up_read(&mm->mmap_sem); |
955 | return err; | 942 | return err; |
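[Editor's note] The ksm hunks drop the hold_anon_vma()/ksm_drop_anon_vma() wrappers and take or drop the anon_vma reference directly with get_anon_vma()/put_anon_vma(). The sketch below is a generic userspace model of that get/put refcount pattern only; struct refobj, obj_get() and obj_put() are illustrative names, not kernel API. The point it shows: a holder pins the object while it still needs it, and the last put frees it, which is why break_cow() and both tree-removal paths must each drop exactly one reference.

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for a refcounted object such as an anon_vma. */
struct refobj {
	int refcount;
	const char *name;
};

static struct refobj *obj_alloc(const char *name)
{
	struct refobj *o = malloc(sizeof(*o));
	o->refcount = 1;          /* creator's reference */
	o->name = name;
	return o;
}

static void obj_get(struct refobj *o)
{
	o->refcount++;            /* pin: object stays valid for this holder */
}

static void obj_put(struct refobj *o)
{
	if (--o->refcount == 0) { /* last reference frees the object */
		printf("freeing %s\n", o->name);
		free(o);
	}
}

int main(void)
{
	struct refobj *av = obj_alloc("anon_vma");

	obj_get(av);              /* e.g. rmap_item->anon_vma = av; get_anon_vma(av) */
	obj_put(av);              /* e.g. put_anon_vma(rmap_item->anon_vma) on removal */
	obj_put(av);              /* creator drops its reference */
	return 0;
}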
diff --git a/mm/memblock.c b/mm/memblock.c index 4618fda975a0..a0562d1a6ad4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p | |||
58 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); | 58 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); |
59 | } | 59 | } |
60 | 60 | ||
61 | static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, | ||
62 | phys_addr_t base2, phys_addr_t size2) | ||
63 | { | ||
64 | if (base2 == base1 + size1) | ||
65 | return 1; | ||
66 | else if (base1 == base2 + size2) | ||
67 | return -1; | ||
68 | |||
69 | return 0; | ||
70 | } | ||
71 | |||
72 | static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, | ||
73 | unsigned long r1, unsigned long r2) | ||
74 | { | ||
75 | phys_addr_t base1 = type->regions[r1].base; | ||
76 | phys_addr_t size1 = type->regions[r1].size; | ||
77 | phys_addr_t base2 = type->regions[r2].base; | ||
78 | phys_addr_t size2 = type->regions[r2].size; | ||
79 | |||
80 | return memblock_addrs_adjacent(base1, size1, base2, size2); | ||
81 | } | ||
82 | |||
83 | long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) | 61 | long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) |
84 | { | 62 | { |
85 | unsigned long i; | 63 | unsigned long i; |
@@ -206,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
206 | type->regions[i].size = type->regions[i + 1].size; | 184 | type->regions[i].size = type->regions[i + 1].size; |
207 | } | 185 | } |
208 | type->cnt--; | 186 | type->cnt--; |
209 | } | ||
210 | 187 | ||
211 | /* Assumption: base addr of region 1 < base addr of region 2 */ | 188 | /* Special case for empty arrays */ |
212 | static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, | 189 | if (type->cnt == 0) { |
213 | unsigned long r1, unsigned long r2) | 190 | type->cnt = 1; |
214 | { | 191 | type->regions[0].base = 0; |
215 | type->regions[r1].size += type->regions[r2].size; | 192 | type->regions[0].size = 0; |
216 | memblock_remove_region(type, r2); | 193 | } |
217 | } | 194 | } |
218 | 195 | ||
219 | /* Defined below but needed now */ | 196 | /* Defined below but needed now */ |
@@ -276,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
276 | return 0; | 253 | return 0; |
277 | 254 | ||
278 | /* Add the new reserved region now. Should not fail ! */ | 255 | /* Add the new reserved region now. Should not fail ! */ |
279 | BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); | 256 | BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); |
280 | 257 | ||
281 | /* If the array wasn't our static init one, then free it. We only do | 258 | /* If the array wasn't our static init one, then free it. We only do |
282 | * that before SLAB is available as later on, we don't know whether | 259 | * that before SLAB is available as later on, we don't know whether |
@@ -296,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1 | |||
296 | return 1; | 273 | return 1; |
297 | } | 274 | } |
298 | 275 | ||
299 | static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) | 276 | static long __init_memblock memblock_add_region(struct memblock_type *type, |
277 | phys_addr_t base, phys_addr_t size) | ||
300 | { | 278 | { |
301 | unsigned long coalesced = 0; | 279 | phys_addr_t end = base + size; |
302 | long adjacent, i; | 280 | int i, slot = -1; |
303 | |||
304 | if ((type->cnt == 1) && (type->regions[0].size == 0)) { | ||
305 | type->regions[0].base = base; | ||
306 | type->regions[0].size = size; | ||
307 | return 0; | ||
308 | } | ||
309 | 281 | ||
310 | /* First try and coalesce this MEMBLOCK with another. */ | 282 | /* First try and coalesce this MEMBLOCK with others */ |
311 | for (i = 0; i < type->cnt; i++) { | 283 | for (i = 0; i < type->cnt; i++) { |
312 | phys_addr_t rgnbase = type->regions[i].base; | 284 | struct memblock_region *rgn = &type->regions[i]; |
313 | phys_addr_t rgnsize = type->regions[i].size; | 285 | phys_addr_t rend = rgn->base + rgn->size; |
286 | |||
287 | /* Exit if there's no possible hits */ | ||
288 | if (rgn->base > end || rgn->size == 0) | ||
289 | break; | ||
314 | 290 | ||
315 | if ((rgnbase == base) && (rgnsize == size)) | 291 | /* Check if we are fully enclosed within an existing |
316 | /* Already have this region, so we're done */ | 292 | * block |
293 | */ | ||
294 | if (rgn->base <= base && rend >= end) | ||
317 | return 0; | 295 | return 0; |
318 | 296 | ||
319 | adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); | 297 | /* Check if we overlap or are adjacent with the bottom |
320 | /* Check if arch allows coalescing */ | 298 | * of a block. |
321 | if (adjacent != 0 && type == &memblock.memory && | 299 | */ |
322 | !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) | 300 | if (base < rgn->base && end >= rgn->base) { |
323 | break; | 301 | /* If we can't coalesce, create a new block */ |
324 | if (adjacent > 0) { | 302 | if (!memblock_memory_can_coalesce(base, size, |
325 | type->regions[i].base -= size; | 303 | rgn->base, |
326 | type->regions[i].size += size; | 304 | rgn->size)) { |
327 | coalesced++; | 305 | /* Overlap & can't coalesce are mutually |
328 | break; | 306 | * exclusive, if you do that, be prepared |
329 | } else if (adjacent < 0) { | 307 | * for trouble |
330 | type->regions[i].size += size; | 308 | */ |
331 | coalesced++; | 309 | WARN_ON(end != rgn->base); |
332 | break; | 310 | goto new_block; |
311 | } | ||
312 | /* We extend the bottom of the block down to our | ||
313 | * base | ||
314 | */ | ||
315 | rgn->base = base; | ||
316 | rgn->size = rend - base; | ||
317 | |||
318 | /* Return if we have nothing else to allocate | ||
319 | * (fully coalesced) | ||
320 | */ | ||
321 | if (rend >= end) | ||
322 | return 0; | ||
323 | |||
324 | /* We continue processing from the end of the | ||
325 | * coalesced block. | ||
326 | */ | ||
327 | base = rend; | ||
328 | size = end - base; | ||
329 | } | ||
330 | |||
331 | /* Now check if we overlap or are adjacent with the | ||
332 | * top of a block | ||
333 | */ | ||
334 | if (base <= rend && end >= rend) { | ||
335 | /* If we can't coalesce, create a new block */ | ||
336 | if (!memblock_memory_can_coalesce(rgn->base, | ||
337 | rgn->size, | ||
338 | base, size)) { | ||
339 | /* Overlap & can't coalesce are mutually | ||
340 | * exclusive, if you do that, be prepared | ||
341 | * for trouble | ||
342 | */ | ||
343 | WARN_ON(rend != base); | ||
344 | goto new_block; | ||
345 | } | ||
346 | /* We adjust our base down to enclose the | ||
347 | * original block and destroy it. It will be | ||
348 | * part of our new allocation. Since we've | ||
349 | * freed an entry, we know we won't fail | ||
350 | * to allocate one later, so we won't risk | ||
351 | * losing the original block allocation. | ||
352 | */ | ||
353 | size += (base - rgn->base); | ||
354 | base = rgn->base; | ||
355 | memblock_remove_region(type, i--); | ||
333 | } | 356 | } |
334 | } | 357 | } |
335 | 358 | ||
336 | /* If we plugged a hole, we may want to also coalesce with the | 359 | /* If the array is empty, special case, replace the fake |
337 | * next region | 360 | * filler region and return |
338 | */ | 361 | */ |
339 | if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && | 362 | if ((type->cnt == 1) && (type->regions[0].size == 0)) { |
340 | ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, | 363 | type->regions[0].base = base; |
341 | type->regions[i].size, | 364 | type->regions[0].size = size; |
342 | type->regions[i+1].base, | 365 | return 0; |
343 | type->regions[i+1].size)))) { | ||
344 | memblock_coalesce_regions(type, i, i+1); | ||
345 | coalesced++; | ||
346 | } | 366 | } |
347 | 367 | ||
348 | if (coalesced) | 368 | new_block: |
349 | return coalesced; | ||
350 | |||
351 | /* If we are out of space, we fail. It's too late to resize the array | 369 | /* If we are out of space, we fail. It's too late to resize the array |
352 | * but then this shouldn't have happened in the first place. | 370 | * but then this shouldn't have happened in the first place. |
353 | */ | 371 | */ |
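[Editor's note] The rewritten memblock_add_region() above walks the sorted region array once, extending a region downward when the new range overlaps its bottom, swallowing regions whose top it overlaps, and only then falling through to insert a fresh entry. The following userspace sketch reproduces the same merge idea on a plain sorted array; struct region, add_range() and MAX_REGIONS are invented names, and the arch coalescing hook plus the array-doubling error handling are deliberately left out.

#include <stdio.h>

typedef unsigned long phys_t;

struct region { phys_t base, size; };

#define MAX_REGIONS 32

static struct region regs[MAX_REGIONS];
static int nregs;

/* Insert [base, base+size) keeping the array sorted and coalesced. */
static int add_range(phys_t base, phys_t size)
{
	phys_t end = base + size;
	int i, j;

	/* Grow the new range to swallow every region it touches. */
	for (i = 0; i < nregs; i++) {
		phys_t rbase = regs[i].base, rend = rbase + regs[i].size;

		if (rend < base || rbase > end)
			continue;               /* disjoint, not even adjacent */
		if (rbase < base)
			base = rbase;           /* extend down */
		if (rend > end)
			end = rend;             /* extend up */
		/* drop the swallowed region */
		for (j = i; j < nregs - 1; j++)
			regs[j] = regs[j + 1];
		nregs--;
		i--;
	}

	if (nregs == MAX_REGIONS)
		return -1;                      /* out of slots */

	/* Insert at the sorted position. */
	for (i = 0; i < nregs && regs[i].base < base; i++)
		;
	for (j = nregs; j > i; j--)
		regs[j] = regs[j - 1];
	regs[i].base = base;
	regs[i].size = end - base;
	nregs++;
	return 0;
}

int main(void)
{
	add_range(0x1000, 0x1000);      /* [0x1000, 0x2000) */
	add_range(0x4000, 0x1000);      /* [0x4000, 0x5000) */
	add_range(0x1800, 0x3000);      /* bridges and coalesces both */

	for (int i = 0; i < nregs; i++)
		printf("region %d: [%#lx, %#lx)\n", i,
		       regs[i].base, regs[i].base + regs[i].size);
	return 0;
}

Running this prints a single region [0x1000, 0x5000): the third call overlaps both existing regions, so they are merged rather than duplicated, which is the behaviour the new kernel loop is after.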
@@ -362,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys | |||
362 | } else { | 380 | } else { |
363 | type->regions[i+1].base = base; | 381 | type->regions[i+1].base = base; |
364 | type->regions[i+1].size = size; | 382 | type->regions[i+1].size = size; |
383 | slot = i + 1; | ||
365 | break; | 384 | break; |
366 | } | 385 | } |
367 | } | 386 | } |
368 | |||
369 | if (base < type->regions[0].base) { | 387 | if (base < type->regions[0].base) { |
370 | type->regions[0].base = base; | 388 | type->regions[0].base = base; |
371 | type->regions[0].size = size; | 389 | type->regions[0].size = size; |
390 | slot = 0; | ||
372 | } | 391 | } |
373 | type->cnt++; | 392 | type->cnt++; |
374 | 393 | ||
@@ -376,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys | |||
376 | * our allocation and return an error | 395 | * our allocation and return an error |
377 | */ | 396 | */ |
378 | if (type->cnt == type->max && memblock_double_array(type)) { | 397 | if (type->cnt == type->max && memblock_double_array(type)) { |
379 | type->cnt--; | 398 | BUG_ON(slot < 0); |
399 | memblock_remove_region(type, slot); | ||
380 | return -1; | 400 | return -1; |
381 | } | 401 | } |
382 | 402 | ||
@@ -389,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | |||
389 | 409 | ||
390 | } | 410 | } |
391 | 411 | ||
392 | static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) | 412 | static long __init_memblock __memblock_remove(struct memblock_type *type, |
413 | phys_addr_t base, phys_addr_t size) | ||
393 | { | 414 | { |
394 | phys_addr_t rgnbegin, rgnend; | ||
395 | phys_addr_t end = base + size; | 415 | phys_addr_t end = base + size; |
396 | int i; | 416 | int i; |
397 | 417 | ||
398 | rgnbegin = rgnend = 0; /* supress gcc warnings */ | 418 | /* Walk through the array for collisions */ |
399 | 419 | for (i = 0; i < type->cnt; i++) { | |
400 | /* Find the region where (base, size) belongs to */ | 420 | struct memblock_region *rgn = &type->regions[i]; |
401 | for (i=0; i < type->cnt; i++) { | 421 | phys_addr_t rend = rgn->base + rgn->size; |
402 | rgnbegin = type->regions[i].base; | ||
403 | rgnend = rgnbegin + type->regions[i].size; | ||
404 | 422 | ||
405 | if ((rgnbegin <= base) && (end <= rgnend)) | 423 | /* Nothing more to do, exit */ |
424 | if (rgn->base > end || rgn->size == 0) | ||
406 | break; | 425 | break; |
407 | } | ||
408 | 426 | ||
409 | /* Didn't find the region */ | 427 | /* If we fully enclose the block, drop it */ |
410 | if (i == type->cnt) | 428 | if (base <= rgn->base && end >= rend) { |
411 | return -1; | 429 | memblock_remove_region(type, i--); |
430 | continue; | ||
431 | } | ||
412 | 432 | ||
413 | /* Check to see if we are removing entire region */ | 433 | /* If we are fully enclosed within a block |
414 | if ((rgnbegin == base) && (rgnend == end)) { | 434 | * then we need to split it and we are done |
415 | memblock_remove_region(type, i); | 435 | */ |
416 | return 0; | 436 | if (base > rgn->base && end < rend) { |
417 | } | 437 | rgn->size = base - rgn->base; |
438 | if (!memblock_add_region(type, end, rend - end)) | ||
439 | return 0; | ||
440 | /* Failure to split is bad, we at least | ||
441 | * restore the block before erroring | ||
442 | */ | ||
443 | rgn->size = rend - rgn->base; | ||
444 | WARN_ON(1); | ||
445 | return -1; | ||
446 | } | ||
418 | 447 | ||
419 | /* Check to see if region is matching at the front */ | 448 | /* Check if we need to trim the bottom of a block */ |
420 | if (rgnbegin == base) { | 449 | if (rgn->base < end && rend > end) { |
421 | type->regions[i].base = end; | 450 | rgn->size -= end - rgn->base; |
422 | type->regions[i].size -= size; | 451 | rgn->base = end; |
423 | return 0; | 452 | break; |
424 | } | 453 | } |
425 | 454 | ||
426 | /* Check to see if the region is matching at the end */ | 455 | /* And check if we need to trim the top of a block */ |
427 | if (rgnend == end) { | 456 | if (base < rend) |
428 | type->regions[i].size -= size; | 457 | rgn->size -= rend - base; |
429 | return 0; | ||
430 | } | ||
431 | 458 | ||
432 | /* | 459 | } |
433 | * We need to split the entry - adjust the current one to the | 460 | return 0; |
434 | * beginging of the hole and add the region after hole. | ||
435 | */ | ||
436 | type->regions[i].size = base - type->regions[i].base; | ||
437 | return memblock_add_region(type, end, rgnend - end); | ||
438 | } | 461 | } |
439 | 462 | ||
440 | long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) | 463 | long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) |
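[Editor's note] The new __memblock_remove() above handles four cases per region in one pass: drop regions the removed range fully encloses, split a region that fully encloses the range, or trim the bottom or top of a partially covered region. A hedged userspace sketch of the same idea follows, using the same made-up region-array layout as the earlier add sketch; remove_range(), insert_at() and delete_at() are illustrative helpers, not kernel functions.

#include <stdio.h>

typedef unsigned long phys_t;

struct region { phys_t base, size; };

#define MAX_REGIONS 32

static struct region regs[MAX_REGIONS];
static int nregs;

static void delete_at(int i)
{
	for (; i < nregs - 1; i++)
		regs[i] = regs[i + 1];
	nregs--;
}

static int insert_at(int i, phys_t base, phys_t size)
{
	if (nregs == MAX_REGIONS)
		return -1;
	for (int j = nregs; j > i; j--)
		regs[j] = regs[j - 1];
	regs[i].base = base;
	regs[i].size = size;
	nregs++;
	return 0;
}

/* Carve [base, base+size) out of the sorted region array. */
static int remove_range(phys_t base, phys_t size)
{
	phys_t end = base + size;

	for (int i = 0; i < nregs; i++) {
		phys_t rbase = regs[i].base, rend = rbase + regs[i].size;

		if (rbase >= end)               /* sorted: nothing further overlaps */
			break;
		if (rend <= base)
			continue;                   /* entirely below the range */
		if (base <= rbase && end >= rend) {
			delete_at(i--);             /* fully enclosed: drop it */
		} else if (base > rbase && end < rend) {
			regs[i].size = base - rbase;              /* keep the bottom */
			return insert_at(i + 1, end, rend - end); /* and the top */
		} else if (end < rend) {
			regs[i].size = rend - end;  /* trim the bottom of the region */
			regs[i].base = end;
		} else {
			regs[i].size = base - rbase; /* trim the top of the region */
		}
	}
	return 0;
}

int main(void)
{
	nregs = 1;
	regs[0].base = 0x1000;
	regs[0].size = 0x4000;                  /* [0x1000, 0x5000) */

	remove_range(0x2000, 0x1000);           /* punch a hole */
	for (int i = 0; i < nregs; i++)
		printf("region %d: [%#lx, %#lx)\n", i,
		       regs[i].base, regs[i].base + regs[i].size);
	return 0;
}

The example punches a hole in the middle of one region and ends up with [0x1000, 0x2000) and [0x3000, 0x5000), i.e. the split case.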
@@ -467,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph | |||
467 | 490 | ||
468 | found = memblock_find_base(size, align, 0, max_addr); | 491 | found = memblock_find_base(size, align, 0, max_addr); |
469 | if (found != MEMBLOCK_ERROR && | 492 | if (found != MEMBLOCK_ERROR && |
470 | memblock_add_region(&memblock.reserved, found, size) >= 0) | 493 | !memblock_add_region(&memblock.reserved, found, size)) |
471 | return found; | 494 | return found; |
472 | 495 | ||
473 | return 0; | 496 | return 0; |
@@ -548,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, | |||
548 | if (this_nid == nid) { | 571 | if (this_nid == nid) { |
549 | phys_addr_t ret = memblock_find_region(start, this_end, size, align); | 572 | phys_addr_t ret = memblock_find_region(start, this_end, size, align); |
550 | if (ret != MEMBLOCK_ERROR && | 573 | if (ret != MEMBLOCK_ERROR && |
551 | memblock_add_region(&memblock.reserved, ret, size) >= 0) | 574 | !memblock_add_region(&memblock.reserved, ret, size)) |
552 | return ret; | 575 | return ret; |
553 | } | 576 | } |
554 | start = this_end; | 577 | start = this_end; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da53a252b259..010f9166fa6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0; | |||
73 | #define do_swap_account (0) | 73 | #define do_swap_account (0) |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | /* | ||
77 | * Per memcg event counter is incremented at every pagein/pageout. This counter | ||
78 | * is used for trigger some periodic events. This is straightforward and better | ||
79 | * than using jiffies etc. to handle periodic memcg event. | ||
80 | * | ||
81 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
82 | */ | ||
83 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
84 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
85 | 76 | ||
86 | /* | 77 | /* |
87 | * Statistics for memory cgroup. | 78 | * Statistics for memory cgroup. |
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index { | |||
93 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 84 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
94 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 85 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
95 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 86 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
96 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | ||
97 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | ||
98 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 87 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
99 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | 88 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
100 | /* incremented at every pagein/pageout */ | ||
101 | MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, | ||
102 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | 89 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ |
103 | |||
104 | MEM_CGROUP_STAT_NSTATS, | 90 | MEM_CGROUP_STAT_NSTATS, |
105 | }; | 91 | }; |
106 | 92 | ||
93 | enum mem_cgroup_events_index { | ||
94 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
95 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
96 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ | ||
97 | MEM_CGROUP_EVENTS_NSTATS, | ||
98 | }; | ||
99 | /* | ||
100 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | ||
101 | * it will be incremated by the number of pages. This counter is used for | ||
102 | * for trigger some periodic events. This is straightforward and better | ||
103 | * than using jiffies etc. to handle periodic memcg event. | ||
104 | */ | ||
105 | enum mem_cgroup_events_target { | ||
106 | MEM_CGROUP_TARGET_THRESH, | ||
107 | MEM_CGROUP_TARGET_SOFTLIMIT, | ||
108 | MEM_CGROUP_NTARGETS, | ||
109 | }; | ||
110 | #define THRESHOLDS_EVENTS_TARGET (128) | ||
111 | #define SOFTLIMIT_EVENTS_TARGET (1024) | ||
112 | |||
107 | struct mem_cgroup_stat_cpu { | 113 | struct mem_cgroup_stat_cpu { |
108 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 114 | long count[MEM_CGROUP_STAT_NSTATS]; |
115 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | ||
116 | unsigned long targets[MEM_CGROUP_NTARGETS]; | ||
109 | }; | 117 | }; |
110 | 118 | ||
111 | /* | 119 | /* |
@@ -218,12 +226,6 @@ struct mem_cgroup { | |||
218 | * per zone LRU lists. | 226 | * per zone LRU lists. |
219 | */ | 227 | */ |
220 | struct mem_cgroup_lru_info info; | 228 | struct mem_cgroup_lru_info info; |
221 | |||
222 | /* | ||
223 | protect against reclaim related member. | ||
224 | */ | ||
225 | spinlock_t reclaim_param_lock; | ||
226 | |||
227 | /* | 229 | /* |
228 | * While reclaiming in a hierarchy, we cache the last child we | 230 | * While reclaiming in a hierarchy, we cache the last child we |
229 | * reclaimed from. | 231 | * reclaimed from. |
@@ -327,13 +329,6 @@ enum charge_type { | |||
327 | NR_CHARGE_TYPE, | 329 | NR_CHARGE_TYPE, |
328 | }; | 330 | }; |
329 | 331 | ||
330 | /* only for here (for easy reading.) */ | ||
331 | #define PCGF_CACHE (1UL << PCG_CACHE) | ||
332 | #define PCGF_USED (1UL << PCG_USED) | ||
333 | #define PCGF_LOCK (1UL << PCG_LOCK) | ||
334 | /* Not used, but added here for completeness */ | ||
335 | #define PCGF_ACCT (1UL << PCG_ACCT) | ||
336 | |||
337 | /* for encoding cft->private value on file */ | 332 | /* for encoding cft->private value on file */ |
338 | #define _MEM (0) | 333 | #define _MEM (0) |
339 | #define _MEMSWAP (1) | 334 | #define _MEMSWAP (1) |
@@ -371,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | |||
371 | } | 366 | } |
372 | 367 | ||
373 | static struct mem_cgroup_per_zone * | 368 | static struct mem_cgroup_per_zone * |
374 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 369 | page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) |
375 | { | 370 | { |
376 | struct mem_cgroup *mem = pc->mem_cgroup; | 371 | int nid = page_to_nid(page); |
377 | int nid = page_cgroup_nid(pc); | 372 | int zid = page_zonenum(page); |
378 | int zid = page_cgroup_zid(pc); | ||
379 | |||
380 | if (!mem) | ||
381 | return NULL; | ||
382 | 373 | ||
383 | return mem_cgroup_zoneinfo(mem, nid, zid); | 374 | return mem_cgroup_zoneinfo(mem, nid, zid); |
384 | } | 375 | } |
@@ -504,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | |||
504 | } | 495 | } |
505 | } | 496 | } |
506 | 497 | ||
507 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
508 | { | ||
509 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
510 | } | ||
511 | |||
512 | static struct mem_cgroup_per_zone * | 498 | static struct mem_cgroup_per_zone * |
513 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 499 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) |
514 | { | 500 | { |
@@ -565,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
565 | * common workload, threashold and synchonization as vmstat[] should be | 551 | * common workload, threashold and synchonization as vmstat[] should be |
566 | * implemented. | 552 | * implemented. |
567 | */ | 553 | */ |
568 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | 554 | static long mem_cgroup_read_stat(struct mem_cgroup *mem, |
569 | enum mem_cgroup_stat_index idx) | 555 | enum mem_cgroup_stat_index idx) |
570 | { | 556 | { |
557 | long val = 0; | ||
571 | int cpu; | 558 | int cpu; |
572 | s64 val = 0; | ||
573 | 559 | ||
574 | get_online_cpus(); | 560 | get_online_cpus(); |
575 | for_each_online_cpu(cpu) | 561 | for_each_online_cpu(cpu) |
@@ -583,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | |||
583 | return val; | 569 | return val; |
584 | } | 570 | } |
585 | 571 | ||
586 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | 572 | static long mem_cgroup_local_usage(struct mem_cgroup *mem) |
587 | { | 573 | { |
588 | s64 ret; | 574 | long ret; |
589 | 575 | ||
590 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | 576 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
591 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | 577 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
@@ -599,6 +585,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | |||
599 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 585 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
600 | } | 586 | } |
601 | 587 | ||
588 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, | ||
589 | enum mem_cgroup_events_index idx) | ||
590 | { | ||
591 | unsigned long val = 0; | ||
592 | int cpu; | ||
593 | |||
594 | for_each_online_cpu(cpu) | ||
595 | val += per_cpu(mem->stat->events[idx], cpu); | ||
596 | #ifdef CONFIG_HOTPLUG_CPU | ||
597 | spin_lock(&mem->pcp_counter_lock); | ||
598 | val += mem->nocpu_base.events[idx]; | ||
599 | spin_unlock(&mem->pcp_counter_lock); | ||
600 | #endif | ||
601 | return val; | ||
602 | } | ||
603 | |||
602 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 604 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
603 | bool file, int nr_pages) | 605 | bool file, int nr_pages) |
604 | { | 606 | { |
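[Editor's note] The new mem_cgroup_read_events() above sums a per-CPU event counter over the online CPUs and, with CPU hotplug enabled, folds in a base value that accumulated the counts of CPUs that went offline. A userspace model of that reader follows; NCPUS, counters[], offline_base, cpu_offline() and read_total() are illustrative, and the kernel's pcp_counter_lock is dropped because the model is single-threaded.

#include <stdio.h>
#include <stdbool.h>

#define NCPUS 4

static unsigned long counters[NCPUS];   /* per-CPU event counts */
static bool online[NCPUS] = { true, true, true, true };
static unsigned long offline_base;      /* counts drained from offlined CPUs */

static void count_event(int cpu)
{
	counters[cpu]++;
}

/* Mimics mem_cgroup_drain_pcp_counter(): fold a dying CPU into the base. */
static void cpu_offline(int cpu)
{
	offline_base += counters[cpu];
	counters[cpu] = 0;
	online[cpu] = false;
}

/* Mimics mem_cgroup_read_events(): online CPUs plus the hotplug base. */
static unsigned long read_total(void)
{
	unsigned long total = offline_base;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (online[cpu])
			total += counters[cpu];
	return total;
}

int main(void)
{
	count_event(0);
	count_event(1);
	count_event(1);
	cpu_offline(1);
	count_event(2);
	printf("total events: %lu\n", read_total());  /* 4: nothing is lost */
	return 0;
}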
@@ -611,13 +613,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
611 | 613 | ||
612 | /* pagein of a big page is an event. So, ignore page size */ | 614 | /* pagein of a big page is an event. So, ignore page size */ |
613 | if (nr_pages > 0) | 615 | if (nr_pages > 0) |
614 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); | 616 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
615 | else { | 617 | else { |
616 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); | 618 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); |
617 | nr_pages = -nr_pages; /* for event */ | 619 | nr_pages = -nr_pages; /* for event */ |
618 | } | 620 | } |
619 | 621 | ||
620 | __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); | 622 | __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); |
621 | 623 | ||
622 | preempt_enable(); | 624 | preempt_enable(); |
623 | } | 625 | } |
@@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
637 | return total; | 639 | return total; |
638 | } | 640 | } |
639 | 641 | ||
640 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | 642 | static bool __memcg_event_check(struct mem_cgroup *mem, int target) |
641 | { | 643 | { |
642 | s64 val; | 644 | unsigned long val, next; |
645 | |||
646 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | ||
647 | next = this_cpu_read(mem->stat->targets[target]); | ||
648 | /* from time_after() in jiffies.h */ | ||
649 | return ((long)next - (long)val < 0); | ||
650 | } | ||
651 | |||
652 | static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | ||
653 | { | ||
654 | unsigned long val, next; | ||
643 | 655 | ||
644 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | 656 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
645 | 657 | ||
646 | return !(val & ((1 << event_mask_shift) - 1)); | 658 | switch (target) { |
659 | case MEM_CGROUP_TARGET_THRESH: | ||
660 | next = val + THRESHOLDS_EVENTS_TARGET; | ||
661 | break; | ||
662 | case MEM_CGROUP_TARGET_SOFTLIMIT: | ||
663 | next = val + SOFTLIMIT_EVENTS_TARGET; | ||
664 | break; | ||
665 | default: | ||
666 | return; | ||
667 | } | ||
668 | |||
669 | this_cpu_write(mem->stat->targets[target], next); | ||
647 | } | 670 | } |
648 | 671 | ||
649 | /* | 672 | /* |
@@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | |||
653 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | 676 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) |
654 | { | 677 | { |
655 | /* threshold event is triggered in finer grain than soft limit */ | 678 | /* threshold event is triggered in finer grain than soft limit */ |
656 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | 679 | if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { |
657 | mem_cgroup_threshold(mem); | 680 | mem_cgroup_threshold(mem); |
658 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | 681 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); |
682 | if (unlikely(__memcg_event_check(mem, | ||
683 | MEM_CGROUP_TARGET_SOFTLIMIT))){ | ||
659 | mem_cgroup_update_tree(mem, page); | 684 | mem_cgroup_update_tree(mem, page); |
685 | __mem_cgroup_target_update(mem, | ||
686 | MEM_CGROUP_TARGET_SOFTLIMIT); | ||
687 | } | ||
660 | } | 688 | } |
661 | } | 689 | } |
662 | 690 | ||
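[Editor's note] The event machinery above keeps a running per-CPU event count plus a per-target "next" value; a threshold or soft-limit action fires once the count passes its target, and the target is then advanced by 128 or 1024 events. The comparison is written time_after()-style, (long)next - (long)val < 0, so it still works if the unsigned counter wraps. The sketch below shows the same pattern in userspace; passed(), account_pages() and the two target constants mirror the idea, not the kernel API.

#include <stdio.h>

#define THRESH_TARGET    128UL
#define SOFTLIMIT_TARGET 1024UL

static unsigned long events;            /* monotonically increasing, may wrap */
static unsigned long next_thresh  = THRESH_TARGET;
static unsigned long next_softlim = SOFTLIMIT_TARGET;

/* time_after()-style comparison: true once 'val' has passed 'next'. */
static int passed(unsigned long next, unsigned long val)
{
	return (long)next - (long)val < 0;
}

static void account_pages(unsigned long nr)
{
	events += nr;                       /* with THP this jumps by many pages */

	if (passed(next_thresh, events)) {
		printf("threshold event at %lu\n", events);
		next_thresh = events + THRESH_TARGET;

		/* soft limit is checked in coarser grain, nested like the kernel */
		if (passed(next_softlim, events)) {
			printf("soft-limit event at %lu\n", events);
			next_softlim = events + SOFTLIMIT_TARGET;
		}
	}
}

int main(void)
{
	for (int i = 0; i < 3000; i++)
		account_pages(1);
	return 0;
}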
@@ -815,7 +843,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | |||
815 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 843 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
816 | * removed from global LRU. | 844 | * removed from global LRU. |
817 | */ | 845 | */ |
818 | mz = page_cgroup_zoneinfo(pc); | 846 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
819 | /* huge page split is done under lru_lock. so, we have no races. */ | 847 | /* huge page split is done under lru_lock. so, we have no races. */ |
820 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 848 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); |
821 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 849 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
@@ -829,6 +857,32 @@ void mem_cgroup_del_lru(struct page *page) | |||
829 | mem_cgroup_del_lru_list(page, page_lru(page)); | 857 | mem_cgroup_del_lru_list(page, page_lru(page)); |
830 | } | 858 | } |
831 | 859 | ||
860 | /* | ||
861 | * Writeback is about to end against a page which has been marked for immediate | ||
862 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | ||
863 | * inactive list. | ||
864 | */ | ||
865 | void mem_cgroup_rotate_reclaimable_page(struct page *page) | ||
866 | { | ||
867 | struct mem_cgroup_per_zone *mz; | ||
868 | struct page_cgroup *pc; | ||
869 | enum lru_list lru = page_lru(page); | ||
870 | |||
871 | if (mem_cgroup_disabled()) | ||
872 | return; | ||
873 | |||
874 | pc = lookup_page_cgroup(page); | ||
875 | /* unused or root page is not rotated. */ | ||
876 | if (!PageCgroupUsed(pc)) | ||
877 | return; | ||
878 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
879 | smp_rmb(); | ||
880 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
881 | return; | ||
882 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
883 | list_move_tail(&pc->lru, &mz->lists[lru]); | ||
884 | } | ||
885 | |||
832 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | 886 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) |
833 | { | 887 | { |
834 | struct mem_cgroup_per_zone *mz; | 888 | struct mem_cgroup_per_zone *mz; |
@@ -845,7 +899,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
845 | smp_rmb(); | 899 | smp_rmb(); |
846 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 900 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
847 | return; | 901 | return; |
848 | mz = page_cgroup_zoneinfo(pc); | 902 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
849 | list_move(&pc->lru, &mz->lists[lru]); | 903 | list_move(&pc->lru, &mz->lists[lru]); |
850 | } | 904 | } |
851 | 905 | ||
@@ -862,7 +916,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
862 | return; | 916 | return; |
863 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 917 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
864 | smp_rmb(); | 918 | smp_rmb(); |
865 | mz = page_cgroup_zoneinfo(pc); | 919 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
866 | /* huge page split is done under lru_lock. so, we have no races. */ | 920 | /* huge page split is done under lru_lock. so, we have no races. */ |
867 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 921 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); |
868 | SetPageCgroupAcctLRU(pc); | 922 | SetPageCgroupAcctLRU(pc); |
@@ -872,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
872 | } | 926 | } |
873 | 927 | ||
874 | /* | 928 | /* |
875 | * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to | 929 | * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed |
876 | * lru because the page may.be reused after it's fully uncharged (because of | 930 | * while it's linked to lru because the page may be reused after it's fully |
877 | * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge | 931 | * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. |
878 | * it again. This function is only used to charge SwapCache. It's done under | 932 | * It's done under lock_page and expected that zone->lru_lock is never held. |
879 | * lock_page and expected that zone->lru_lock is never held. | ||
880 | */ | 933 | */ |
881 | static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) | 934 | static void mem_cgroup_lru_del_before_commit(struct page *page) |
882 | { | 935 | { |
883 | unsigned long flags; | 936 | unsigned long flags; |
884 | struct zone *zone = page_zone(page); | 937 | struct zone *zone = page_zone(page); |
885 | struct page_cgroup *pc = lookup_page_cgroup(page); | 938 | struct page_cgroup *pc = lookup_page_cgroup(page); |
886 | 939 | ||
940 | /* | ||
941 | * Doing this check without taking ->lru_lock seems wrong but this | ||
942 | * is safe. Because if page_cgroup's USED bit is unset, the page | ||
943 | * will not be added to any memcg's LRU. If page_cgroup's USED bit is | ||
944 | * set, the commit after this will fail, anyway. | ||
945 | * All this charge/uncharge is done under some mutual exclusion. | ||
946 | * So, we don't need to take care of changes in USED bit. | ||
947 | */ | ||
948 | if (likely(!PageLRU(page))) | ||
949 | return; | ||
950 | |||
887 | spin_lock_irqsave(&zone->lru_lock, flags); | 951 | spin_lock_irqsave(&zone->lru_lock, flags); |
888 | /* | 952 | /* |
889 | * Forget old LRU when this page_cgroup is *not* used. This Used bit | 953 | * Forget old LRU when this page_cgroup is *not* used. This Used bit |
@@ -894,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) | |||
894 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 958 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
895 | } | 959 | } |
896 | 960 | ||
897 | static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | 961 | static void mem_cgroup_lru_add_after_commit(struct page *page) |
898 | { | 962 | { |
899 | unsigned long flags; | 963 | unsigned long flags; |
900 | struct zone *zone = page_zone(page); | 964 | struct zone *zone = page_zone(page); |
901 | struct page_cgroup *pc = lookup_page_cgroup(page); | 965 | struct page_cgroup *pc = lookup_page_cgroup(page); |
902 | 966 | ||
967 | /* taking care of that the page is added to LRU while we commit it */ | ||
968 | if (likely(!PageLRU(page))) | ||
969 | return; | ||
903 | spin_lock_irqsave(&zone->lru_lock, flags); | 970 | spin_lock_irqsave(&zone->lru_lock, flags); |
904 | /* link when the page is linked to LRU but page_cgroup isn't */ | 971 | /* link when the page is linked to LRU but page_cgroup isn't */ |
905 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) | 972 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
@@ -1032,10 +1099,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1032 | return NULL; | 1099 | return NULL; |
1033 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1100 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
1034 | smp_rmb(); | 1101 | smp_rmb(); |
1035 | mz = page_cgroup_zoneinfo(pc); | 1102 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
1036 | if (!mz) | ||
1037 | return NULL; | ||
1038 | |||
1039 | return &mz->reclaim_stat; | 1103 | return &mz->reclaim_stat; |
1040 | } | 1104 | } |
1041 | 1105 | ||
@@ -1067,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1067 | if (scan >= nr_to_scan) | 1131 | if (scan >= nr_to_scan) |
1068 | break; | 1132 | break; |
1069 | 1133 | ||
1070 | page = pc->page; | ||
1071 | if (unlikely(!PageCgroupUsed(pc))) | 1134 | if (unlikely(!PageCgroupUsed(pc))) |
1072 | continue; | 1135 | continue; |
1136 | |||
1137 | page = lookup_cgroup_page(pc); | ||
1138 | |||
1073 | if (unlikely(!PageLRU(page))) | 1139 | if (unlikely(!PageLRU(page))) |
1074 | continue; | 1140 | continue; |
1075 | 1141 | ||
@@ -1101,49 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1101 | #define mem_cgroup_from_res_counter(counter, member) \ | 1167 | #define mem_cgroup_from_res_counter(counter, member) \ |
1102 | container_of(counter, struct mem_cgroup, member) | 1168 | container_of(counter, struct mem_cgroup, member) |
1103 | 1169 | ||
1104 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) | ||
1105 | { | ||
1106 | if (do_swap_account) { | ||
1107 | if (res_counter_check_under_limit(&mem->res) && | ||
1108 | res_counter_check_under_limit(&mem->memsw)) | ||
1109 | return true; | ||
1110 | } else | ||
1111 | if (res_counter_check_under_limit(&mem->res)) | ||
1112 | return true; | ||
1113 | return false; | ||
1114 | } | ||
1115 | |||
1116 | /** | 1170 | /** |
1117 | * mem_cgroup_check_margin - check if the memory cgroup allows charging | 1171 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1118 | * @mem: memory cgroup to check | 1172 | * @mem: the memory cgroup |
1119 | * @bytes: the number of bytes the caller intends to charge | ||
1120 | * | 1173 | * |
1121 | * Returns a boolean value on whether @mem can be charged @bytes or | 1174 | * Returns the maximum amount of memory @mem can be charged with, in |
1122 | * whether this would exceed the limit. | 1175 | * pages. |
1123 | */ | 1176 | */ |
1124 | static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) | 1177 | static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) |
1125 | { | 1178 | { |
1126 | if (!res_counter_check_margin(&mem->res, bytes)) | 1179 | unsigned long long margin; |
1127 | return false; | 1180 | |
1128 | if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) | 1181 | margin = res_counter_margin(&mem->res); |
1129 | return false; | 1182 | if (do_swap_account) |
1130 | return true; | 1183 | margin = min(margin, res_counter_margin(&mem->memsw)); |
1184 | return margin >> PAGE_SHIFT; | ||
1131 | } | 1185 | } |
1132 | 1186 | ||
1133 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1187 | static unsigned int get_swappiness(struct mem_cgroup *memcg) |
1134 | { | 1188 | { |
1135 | struct cgroup *cgrp = memcg->css.cgroup; | 1189 | struct cgroup *cgrp = memcg->css.cgroup; |
1136 | unsigned int swappiness; | ||
1137 | 1190 | ||
1138 | /* root ? */ | 1191 | /* root ? */ |
1139 | if (cgrp->parent == NULL) | 1192 | if (cgrp->parent == NULL) |
1140 | return vm_swappiness; | 1193 | return vm_swappiness; |
1141 | 1194 | ||
1142 | spin_lock(&memcg->reclaim_param_lock); | 1195 | return memcg->swappiness; |
1143 | swappiness = memcg->swappiness; | ||
1144 | spin_unlock(&memcg->reclaim_param_lock); | ||
1145 | |||
1146 | return swappiness; | ||
1147 | } | 1196 | } |
1148 | 1197 | ||
1149 | static void mem_cgroup_start_move(struct mem_cgroup *mem) | 1198 | static void mem_cgroup_start_move(struct mem_cgroup *mem) |
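[Editor's note] mem_cgroup_margin() above replaces the old boolean under-limit check: it reports how many pages can still be charged, taking the smaller of the memory and memory+swap margins, and the charge path then retries only when that margin covers the pages it wants. A small userspace illustration of the computation follows; the counter struct and the numbers are made up.

#include <stdio.h>

#define PAGE_SHIFT 12

struct counter {
	unsigned long long usage;
	unsigned long long limit;
};

static unsigned long long counter_margin(const struct counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

/* Chargeable space in pages: the tighter of the two counters wins. */
static unsigned long margin_pages(const struct counter *res,
				  const struct counter *memsw,
				  int do_swap_account)
{
	unsigned long long margin = counter_margin(res);

	if (do_swap_account) {
		unsigned long long m = counter_margin(memsw);
		if (m < margin)
			margin = m;
	}
	return (unsigned long)(margin >> PAGE_SHIFT);
}

int main(void)
{
	struct counter res   = { .usage = 100 << PAGE_SHIFT, .limit = 164 << PAGE_SHIFT };
	struct counter memsw = { .usage = 120 << PAGE_SHIFT, .limit = 150 << PAGE_SHIFT };

	/* 64 pages left on res, 30 on memsw -> 30 pages chargeable */
	printf("margin: %lu pages\n", margin_pages(&res, &memsw, 1));
	return 0;
}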
@@ -1359,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1359 | 1408 | ||
1360 | rcu_read_unlock(); | 1409 | rcu_read_unlock(); |
1361 | /* Updates scanning parameter */ | 1410 | /* Updates scanning parameter */ |
1362 | spin_lock(&root_mem->reclaim_param_lock); | ||
1363 | if (!css) { | 1411 | if (!css) { |
1364 | /* this means start scan from ID:1 */ | 1412 | /* this means start scan from ID:1 */ |
1365 | root_mem->last_scanned_child = 0; | 1413 | root_mem->last_scanned_child = 0; |
1366 | } else | 1414 | } else |
1367 | root_mem->last_scanned_child = found; | 1415 | root_mem->last_scanned_child = found; |
1368 | spin_unlock(&root_mem->reclaim_param_lock); | ||
1369 | } | 1416 | } |
1370 | 1417 | ||
1371 | return ret; | 1418 | return ret; |
@@ -1394,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1394 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | 1441 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; |
1395 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | 1442 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
1396 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | 1443 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
1397 | unsigned long excess = mem_cgroup_get_excess(root_mem); | 1444 | unsigned long excess; |
1445 | |||
1446 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | ||
1398 | 1447 | ||
1399 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1448 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1400 | if (root_mem->memsw_is_minimum) | 1449 | if (root_mem->memsw_is_minimum) |
@@ -1417,7 +1466,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1417 | break; | 1466 | break; |
1418 | } | 1467 | } |
1419 | /* | 1468 | /* |
1420 | * We want to do more targetted reclaim. | 1469 | * We want to do more targeted reclaim. |
1421 | * excess >> 2 is not to excessive so as to | 1470 | * excess >> 2 is not to excessive so as to |
1422 | * reclaim too much, nor too less that we keep | 1471 | * reclaim too much, nor too less that we keep |
1423 | * coming back to reclaim from this cgroup | 1472 | * coming back to reclaim from this cgroup |
@@ -1451,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1451 | return ret; | 1500 | return ret; |
1452 | total += ret; | 1501 | total += ret; |
1453 | if (check_soft) { | 1502 | if (check_soft) { |
1454 | if (res_counter_check_under_soft_limit(&root_mem->res)) | 1503 | if (!res_counter_soft_limit_excess(&root_mem->res)) |
1455 | return total; | 1504 | return total; |
1456 | } else if (mem_cgroup_check_under_limit(root_mem)) | 1505 | } else if (mem_cgroup_margin(root_mem)) |
1457 | return 1 + total; | 1506 | return 1 + total; |
1458 | } | 1507 | } |
1459 | return total; | 1508 | return total; |
@@ -1661,17 +1710,17 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat); | |||
1661 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1710 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
1662 | * TODO: maybe necessary to use big numbers in big irons. | 1711 | * TODO: maybe necessary to use big numbers in big irons. |
1663 | */ | 1712 | */ |
1664 | #define CHARGE_SIZE (32 * PAGE_SIZE) | 1713 | #define CHARGE_BATCH 32U |
1665 | struct memcg_stock_pcp { | 1714 | struct memcg_stock_pcp { |
1666 | struct mem_cgroup *cached; /* this never be root cgroup */ | 1715 | struct mem_cgroup *cached; /* this never be root cgroup */ |
1667 | int charge; | 1716 | unsigned int nr_pages; |
1668 | struct work_struct work; | 1717 | struct work_struct work; |
1669 | }; | 1718 | }; |
1670 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 1719 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1671 | static atomic_t memcg_drain_count; | 1720 | static atomic_t memcg_drain_count; |
1672 | 1721 | ||
1673 | /* | 1722 | /* |
1674 | * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed | 1723 | * Try to consume stocked charge on this cpu. If success, one page is consumed |
1675 | * from local stock and true is returned. If the stock is 0 or charges from a | 1724 | * from local stock and true is returned. If the stock is 0 or charges from a |
1676 | * cgroup which is not current target, returns false. This stock will be | 1725 | * cgroup which is not current target, returns false. This stock will be |
1677 | * refilled. | 1726 | * refilled. |
@@ -1682,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem) | |||
1682 | bool ret = true; | 1731 | bool ret = true; |
1683 | 1732 | ||
1684 | stock = &get_cpu_var(memcg_stock); | 1733 | stock = &get_cpu_var(memcg_stock); |
1685 | if (mem == stock->cached && stock->charge) | 1734 | if (mem == stock->cached && stock->nr_pages) |
1686 | stock->charge -= PAGE_SIZE; | 1735 | stock->nr_pages--; |
1687 | else /* need to call res_counter_charge */ | 1736 | else /* need to call res_counter_charge */ |
1688 | ret = false; | 1737 | ret = false; |
1689 | put_cpu_var(memcg_stock); | 1738 | put_cpu_var(memcg_stock); |
@@ -1697,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock) | |||
1697 | { | 1746 | { |
1698 | struct mem_cgroup *old = stock->cached; | 1747 | struct mem_cgroup *old = stock->cached; |
1699 | 1748 | ||
1700 | if (stock->charge) { | 1749 | if (stock->nr_pages) { |
1701 | res_counter_uncharge(&old->res, stock->charge); | 1750 | unsigned long bytes = stock->nr_pages * PAGE_SIZE; |
1751 | |||
1752 | res_counter_uncharge(&old->res, bytes); | ||
1702 | if (do_swap_account) | 1753 | if (do_swap_account) |
1703 | res_counter_uncharge(&old->memsw, stock->charge); | 1754 | res_counter_uncharge(&old->memsw, bytes); |
1755 | stock->nr_pages = 0; | ||
1704 | } | 1756 | } |
1705 | stock->cached = NULL; | 1757 | stock->cached = NULL; |
1706 | stock->charge = 0; | ||
1707 | } | 1758 | } |
1708 | 1759 | ||
1709 | /* | 1760 | /* |
@@ -1720,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy) | |||
1720 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 1771 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
1721 | * This will be consumed by consume_stock() function, later. | 1772 | * This will be consumed by consume_stock() function, later. |
1722 | */ | 1773 | */ |
1723 | static void refill_stock(struct mem_cgroup *mem, int val) | 1774 | static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) |
1724 | { | 1775 | { |
1725 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | 1776 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); |
1726 | 1777 | ||
@@ -1728,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val) | |||
1728 | drain_stock(stock); | 1779 | drain_stock(stock); |
1729 | stock->cached = mem; | 1780 | stock->cached = mem; |
1730 | } | 1781 | } |
1731 | stock->charge += val; | 1782 | stock->nr_pages += nr_pages; |
1732 | put_cpu_var(memcg_stock); | 1783 | put_cpu_var(memcg_stock); |
1733 | } | 1784 | } |
1734 | 1785 | ||
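[Editor's note] The stock code above now thinks in pages: the charge path charges a batch of CHARGE_BATCH (32) pages against the res_counter, uses what it needs, and parks the surplus in a per-CPU stock so later single-page charges avoid the shared counter; consume_stock() takes one page from the stock, refill_stock() adds leftover pages, drain_stock() returns them. Below is a single-threaded userspace model of that batching; charge_counter(), charge_one_page() and friends are stand-ins with no per-CPU or locking machinery.

#include <stdio.h>
#include <stdbool.h>

#define CHARGE_BATCH 32U

static unsigned long counter_charged;   /* stands in for the res_counter */
static unsigned int  stock_nr_pages;    /* locally cached, pre-charged pages */

static void charge_counter(unsigned int nr_pages)
{
	counter_charged += nr_pages;        /* the expensive, shared operation */
}

static void uncharge_counter(unsigned int nr_pages)
{
	counter_charged -= nr_pages;
}

/* Serve a one-page charge from the local stock if possible. */
static bool consume_stock(void)
{
	if (stock_nr_pages) {
		stock_nr_pages--;
		return true;
	}
	return false;
}

static void refill_stock(unsigned int nr_pages)
{
	stock_nr_pages += nr_pages;
}

static void drain_stock(void)
{
	uncharge_counter(stock_nr_pages);   /* give unused pages back */
	stock_nr_pages = 0;
}

static void charge_one_page(void)
{
	if (consume_stock())
		return;                         /* fast path, no shared counter */
	charge_counter(CHARGE_BATCH);       /* batch charge ... */
	refill_stock(CHARGE_BATCH - 1);     /* ... keep the surplus locally */
}

int main(void)
{
	for (int i = 0; i < 40; i++)
		charge_one_page();
	printf("charged=%lu stocked=%u\n", counter_charged, stock_nr_pages);
	drain_stock();
	printf("after drain: charged=%lu\n", counter_charged);
	return 0;
}

After 40 one-page charges the shared counter has only been touched twice (64 pages charged, 24 stocked); draining the stock brings the counter back to the 40 pages actually in use.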
@@ -1780,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) | |||
1780 | 1831 | ||
1781 | spin_lock(&mem->pcp_counter_lock); | 1832 | spin_lock(&mem->pcp_counter_lock); |
1782 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 1833 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { |
1783 | s64 x = per_cpu(mem->stat->count[i], cpu); | 1834 | long x = per_cpu(mem->stat->count[i], cpu); |
1784 | 1835 | ||
1785 | per_cpu(mem->stat->count[i], cpu) = 0; | 1836 | per_cpu(mem->stat->count[i], cpu) = 0; |
1786 | mem->nocpu_base.count[i] += x; | 1837 | mem->nocpu_base.count[i] += x; |
1787 | } | 1838 | } |
1839 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | ||
1840 | unsigned long x = per_cpu(mem->stat->events[i], cpu); | ||
1841 | |||
1842 | per_cpu(mem->stat->events[i], cpu) = 0; | ||
1843 | mem->nocpu_base.events[i] += x; | ||
1844 | } | ||
1788 | /* need to clear ON_MOVE value, works as a kind of lock. */ | 1845 | /* need to clear ON_MOVE value, works as a kind of lock. */ |
1789 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | 1846 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; |
1790 | spin_unlock(&mem->pcp_counter_lock); | 1847 | spin_unlock(&mem->pcp_counter_lock); |
@@ -1834,9 +1891,10 @@ enum { | |||
1834 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | 1891 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ |
1835 | }; | 1892 | }; |
1836 | 1893 | ||
1837 | static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | 1894 | static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, |
1838 | int csize, bool oom_check) | 1895 | unsigned int nr_pages, bool oom_check) |
1839 | { | 1896 | { |
1897 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
1840 | struct mem_cgroup *mem_over_limit; | 1898 | struct mem_cgroup *mem_over_limit; |
1841 | struct res_counter *fail_res; | 1899 | struct res_counter *fail_res; |
1842 | unsigned long flags = 0; | 1900 | unsigned long flags = 0; |
@@ -1857,14 +1915,13 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1857 | } else | 1915 | } else |
1858 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 1916 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
1859 | /* | 1917 | /* |
1860 | * csize can be either a huge page (HPAGE_SIZE), a batch of | 1918 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch |
1861 | * regular pages (CHARGE_SIZE), or a single regular page | 1919 | * of regular pages (CHARGE_BATCH), or a single regular page (1). |
1862 | * (PAGE_SIZE). | ||
1863 | * | 1920 | * |
1864 | * Never reclaim on behalf of optional batching, retry with a | 1921 | * Never reclaim on behalf of optional batching, retry with a |
1865 | * single page instead. | 1922 | * single page instead. |
1866 | */ | 1923 | */ |
1867 | if (csize == CHARGE_SIZE) | 1924 | if (nr_pages == CHARGE_BATCH) |
1868 | return CHARGE_RETRY; | 1925 | return CHARGE_RETRY; |
1869 | 1926 | ||
1870 | if (!(gfp_mask & __GFP_WAIT)) | 1927 | if (!(gfp_mask & __GFP_WAIT)) |
@@ -1872,7 +1929,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1872 | 1929 | ||
1873 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 1930 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1874 | gfp_mask, flags); | 1931 | gfp_mask, flags); |
1875 | if (mem_cgroup_check_margin(mem_over_limit, csize)) | 1932 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
1876 | return CHARGE_RETRY; | 1933 | return CHARGE_RETRY; |
1877 | /* | 1934 | /* |
1878 | * Even though the limit is exceeded at this point, reclaim | 1935 | * Even though the limit is exceeded at this point, reclaim |
@@ -1883,7 +1940,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1883 | * unlikely to succeed so close to the limit, and we fall back | 1940 | * unlikely to succeed so close to the limit, and we fall back |
1884 | * to regular pages anyway in case of failure. | 1941 | * to regular pages anyway in case of failure. |
1885 | */ | 1942 | */ |
1886 | if (csize == PAGE_SIZE && ret) | 1943 | if (nr_pages == 1 && ret) |
1887 | return CHARGE_RETRY; | 1944 | return CHARGE_RETRY; |
1888 | 1945 | ||
1889 | /* | 1946 | /* |
@@ -1909,13 +1966,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1909 | */ | 1966 | */ |
1910 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1967 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1911 | gfp_t gfp_mask, | 1968 | gfp_t gfp_mask, |
1912 | struct mem_cgroup **memcg, bool oom, | 1969 | unsigned int nr_pages, |
1913 | int page_size) | 1970 | struct mem_cgroup **memcg, |
1971 | bool oom) | ||
1914 | { | 1972 | { |
1973 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | ||
1915 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1974 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1916 | struct mem_cgroup *mem = NULL; | 1975 | struct mem_cgroup *mem = NULL; |
1917 | int ret; | 1976 | int ret; |
1918 | int csize = max(CHARGE_SIZE, (unsigned long) page_size); | ||
1919 | 1977 | ||
1920 | /* | 1978 | /* |
1921 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage | 1979 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
@@ -1940,7 +1998,7 @@ again: | |||
1940 | VM_BUG_ON(css_is_removed(&mem->css)); | 1998 | VM_BUG_ON(css_is_removed(&mem->css)); |
1941 | if (mem_cgroup_is_root(mem)) | 1999 | if (mem_cgroup_is_root(mem)) |
1942 | goto done; | 2000 | goto done; |
1943 | if (page_size == PAGE_SIZE && consume_stock(mem)) | 2001 | if (nr_pages == 1 && consume_stock(mem)) |
1944 | goto done; | 2002 | goto done; |
1945 | css_get(&mem->css); | 2003 | css_get(&mem->css); |
1946 | } else { | 2004 | } else { |
@@ -1963,7 +2021,7 @@ again: | |||
1963 | rcu_read_unlock(); | 2021 | rcu_read_unlock(); |
1964 | goto done; | 2022 | goto done; |
1965 | } | 2023 | } |
1966 | if (page_size == PAGE_SIZE && consume_stock(mem)) { | 2024 | if (nr_pages == 1 && consume_stock(mem)) { |
1967 | /* | 2025 | /* |
1968 | * It seems dagerous to access memcg without css_get(). | 2026 | * It seems dagerous to access memcg without css_get(). |
1969 | * But considering how consume_stok works, it's not | 2027 | * But considering how consume_stok works, it's not |
@@ -1998,13 +2056,12 @@ again: | |||
1998 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2056 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1999 | } | 2057 | } |
2000 | 2058 | ||
2001 | ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); | 2059 | ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); |
2002 | |||
2003 | switch (ret) { | 2060 | switch (ret) { |
2004 | case CHARGE_OK: | 2061 | case CHARGE_OK: |
2005 | break; | 2062 | break; |
2006 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2063 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2007 | csize = page_size; | 2064 | batch = nr_pages; |
2008 | css_put(&mem->css); | 2065 | css_put(&mem->css); |
2009 | mem = NULL; | 2066 | mem = NULL; |
2010 | goto again; | 2067 | goto again; |
@@ -2025,8 +2082,8 @@ again: | |||
2025 | } | 2082 | } |
2026 | } while (ret != CHARGE_OK); | 2083 | } while (ret != CHARGE_OK); |
2027 | 2084 | ||
2028 | if (csize > page_size) | 2085 | if (batch > nr_pages) |
2029 | refill_stock(mem, csize - page_size); | 2086 | refill_stock(mem, batch - nr_pages); |
2030 | css_put(&mem->css); | 2087 | css_put(&mem->css); |
2031 | done: | 2088 | done: |
2032 | *memcg = mem; | 2089 | *memcg = mem; |
@@ -2045,21 +2102,17 @@ bypass: | |||
2045 | * gotten by try_charge(). | 2102 | * gotten by try_charge(). |
2046 | */ | 2103 | */ |
2047 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | 2104 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
2048 | unsigned long count) | 2105 | unsigned int nr_pages) |
2049 | { | 2106 | { |
2050 | if (!mem_cgroup_is_root(mem)) { | 2107 | if (!mem_cgroup_is_root(mem)) { |
2051 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | 2108 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2109 | |||
2110 | res_counter_uncharge(&mem->res, bytes); | ||
2052 | if (do_swap_account) | 2111 | if (do_swap_account) |
2053 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); | 2112 | res_counter_uncharge(&mem->memsw, bytes); |
2054 | } | 2113 | } |
2055 | } | 2114 | } |
2056 | 2115 | ||
2057 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, | ||
2058 | int page_size) | ||
2059 | { | ||
2060 | __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); | ||
2061 | } | ||
2062 | |||
2063 | /* | 2116 | /* |
2064 | * A helper function to get mem_cgroup from ID. must be called under | 2117 | * A helper function to get mem_cgroup from ID. must be called under |
2065 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2118 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
@@ -2108,20 +2161,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2108 | } | 2161 | } |
2109 | 2162 | ||
2110 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2163 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, |
2164 | struct page *page, | ||
2165 | unsigned int nr_pages, | ||
2111 | struct page_cgroup *pc, | 2166 | struct page_cgroup *pc, |
2112 | enum charge_type ctype, | 2167 | enum charge_type ctype) |
2113 | int page_size) | ||
2114 | { | 2168 | { |
2115 | int nr_pages = page_size >> PAGE_SHIFT; | ||
2116 | |||
2117 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
2118 | if (!mem) | ||
2119 | return; | ||
2120 | |||
2121 | lock_page_cgroup(pc); | 2169 | lock_page_cgroup(pc); |
2122 | if (unlikely(PageCgroupUsed(pc))) { | 2170 | if (unlikely(PageCgroupUsed(pc))) { |
2123 | unlock_page_cgroup(pc); | 2171 | unlock_page_cgroup(pc); |
2124 | mem_cgroup_cancel_charge(mem, page_size); | 2172 | __mem_cgroup_cancel_charge(mem, nr_pages); |
2125 | return; | 2173 | return; |
2126 | } | 2174 | } |
2127 | /* | 2175 | /* |
@@ -2158,7 +2206,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2158 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2206 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
2159 | * if they exceeds softlimit. | 2207 | * if they exceeds softlimit. |
2160 | */ | 2208 | */ |
2161 | memcg_check_events(mem, pc->page); | 2209 | memcg_check_events(mem, page); |
2162 | } | 2210 | } |
2163 | 2211 | ||
2164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2212 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -2195,7 +2243,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2195 | * We hold lru_lock, then, reduce counter directly. | 2243 | * We hold lru_lock, then, reduce counter directly. |
2196 | */ | 2244 | */ |
2197 | lru = page_lru(head); | 2245 | lru = page_lru(head); |
2198 | mz = page_cgroup_zoneinfo(head_pc); | 2246 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); |
2199 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 2247 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
2200 | } | 2248 | } |
2201 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 2249 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
@@ -2204,7 +2252,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2204 | #endif | 2252 | #endif |
2205 | 2253 | ||
2206 | /** | 2254 | /** |
2207 | * __mem_cgroup_move_account - move account of the page | 2255 | * mem_cgroup_move_account - move account of the page |
2256 | * @page: the page | ||
2257 | * @nr_pages: number of regular pages (>1 for huge pages) | ||
2208 | * @pc: page_cgroup of the page. | 2258 | * @pc: page_cgroup of the page. |
2209 | * @from: mem_cgroup which the page is moved from. | 2259 | * @from: mem_cgroup which the page is moved from. |
2210 | * @to: mem_cgroup which the page is moved to. @from != @to. | 2260 | * @to: mem_cgroup which the page is moved to. @from != @to. |
@@ -2212,25 +2262,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2212 | * | 2262 | * |
2213 | * The caller must confirm following. | 2263 | * The caller must confirm following. |
2214 | * - page is not on LRU (isolate_page() is useful.) | 2264 | * - page is not on LRU (isolate_page() is useful.) |
2215 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 2265 | * - compound_lock is held when nr_pages > 1 |
2216 | * | 2266 | * |
2217 | * This function doesn't do "charge" nor css_get to new cgroup. It should be | 2267 | * This function doesn't do "charge" nor css_get to new cgroup. It should be |
2218 | * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is | 2268 | * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is |
2219 | * true, this function does "uncharge" from old cgroup, but it doesn't if | 2269 | * true, this function does "uncharge" from old cgroup, but it doesn't if |
2220 | * @uncharge is false, so a caller should do "uncharge". | 2270 | * @uncharge is false, so a caller should do "uncharge". |
2221 | */ | 2271 | */ |
2222 | 2272 | static int mem_cgroup_move_account(struct page *page, | |
2223 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 2273 | unsigned int nr_pages, |
2224 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, | 2274 | struct page_cgroup *pc, |
2225 | int charge_size) | 2275 | struct mem_cgroup *from, |
2276 | struct mem_cgroup *to, | ||
2277 | bool uncharge) | ||
2226 | { | 2278 | { |
2227 | int nr_pages = charge_size >> PAGE_SHIFT; | 2279 | unsigned long flags; |
2280 | int ret; | ||
2228 | 2281 | ||
2229 | VM_BUG_ON(from == to); | 2282 | VM_BUG_ON(from == to); |
2230 | VM_BUG_ON(PageLRU(pc->page)); | 2283 | VM_BUG_ON(PageLRU(page)); |
2231 | VM_BUG_ON(!page_is_cgroup_locked(pc)); | 2284 | /* |
2232 | VM_BUG_ON(!PageCgroupUsed(pc)); | 2285 | * The page is isolated from LRU. So, collapse function |
2233 | VM_BUG_ON(pc->mem_cgroup != from); | 2286 | * will not handle this page. But page splitting can happen. |
2287 | * Do this check under compound_page_lock(). The caller should | ||
2288 | * hold it. | ||
2289 | */ | ||
2290 | ret = -EBUSY; | ||
2291 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
2292 | goto out; | ||
2293 | |||
2294 | lock_page_cgroup(pc); | ||
2295 | |||
2296 | ret = -EINVAL; | ||
2297 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | ||
2298 | goto unlock; | ||
2299 | |||
2300 | move_lock_page_cgroup(pc, &flags); | ||
2234 | 2301 | ||
2235 | if (PageCgroupFileMapped(pc)) { | 2302 | if (PageCgroupFileMapped(pc)) { |
2236 | /* Update mapped_file data for mem_cgroup */ | 2303 | /* Update mapped_file data for mem_cgroup */ |
@@ -2242,7 +2309,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2242 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2309 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); |
2243 | if (uncharge) | 2310 | if (uncharge) |
2244 | /* This is not "cancel", but cancel_charge does all we need. */ | 2311 | /* This is not "cancel", but cancel_charge does all we need. */ |
2245 | mem_cgroup_cancel_charge(from, charge_size); | 2312 | __mem_cgroup_cancel_charge(from, nr_pages); |
2246 | 2313 | ||
2247 | /* caller should have done css_get */ | 2314 | /* caller should have done css_get */ |
2248 | pc->mem_cgroup = to; | 2315 | pc->mem_cgroup = to; |
@@ -2251,43 +2318,19 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2251 | * We charges against "to" which may not have any tasks. Then, "to" | 2318 | * We charges against "to" which may not have any tasks. Then, "to" |
2252 | * can be under rmdir(). But in current implementation, caller of | 2319 | * can be under rmdir(). But in current implementation, caller of |
2253 | * this function is just force_empty() and move charge, so it's | 2320 | * this function is just force_empty() and move charge, so it's |
2254 | * garanteed that "to" is never removed. So, we don't check rmdir | 2321 | * guaranteed that "to" is never removed. So, we don't check rmdir |
2255 | * status here. | 2322 | * status here. |
2256 | */ | 2323 | */ |
2257 | } | 2324 | move_unlock_page_cgroup(pc, &flags); |
2258 | 2325 | ret = 0; | |
2259 | /* | 2326 | unlock: |
2260 | * check whether the @pc is valid for moving account and call | ||
2261 | * __mem_cgroup_move_account() | ||
2262 | */ | ||
2263 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
2264 | struct mem_cgroup *from, struct mem_cgroup *to, | ||
2265 | bool uncharge, int charge_size) | ||
2266 | { | ||
2267 | int ret = -EINVAL; | ||
2268 | unsigned long flags; | ||
2269 | /* | ||
2270 | * The page is isolated from LRU. So, collapse function | ||
2271 | * will not handle this page. But page splitting can happen. | ||
2272 | * Do this check under compound_page_lock(). The caller should | ||
2273 | * hold it. | ||
2274 | */ | ||
2275 | if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) | ||
2276 | return -EBUSY; | ||
2277 | |||
2278 | lock_page_cgroup(pc); | ||
2279 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
2280 | move_lock_page_cgroup(pc, &flags); | ||
2281 | __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); | ||
2282 | move_unlock_page_cgroup(pc, &flags); | ||
2283 | ret = 0; | ||
2284 | } | ||
2285 | unlock_page_cgroup(pc); | 2327 | unlock_page_cgroup(pc); |
2286 | /* | 2328 | /* |
2287 | * check events | 2329 | * check events |
2288 | */ | 2330 | */ |
2289 | memcg_check_events(to, pc->page); | 2331 | memcg_check_events(to, page); |
2290 | memcg_check_events(from, pc->page); | 2332 | memcg_check_events(from, page); |
2333 | out: | ||
2291 | return ret; | 2334 | return ret; |
2292 | } | 2335 | } |
2293 | 2336 | ||
@@ -2295,16 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
2295 | * move charges to its parent. | 2338 | * move charges to its parent. |
2296 | */ | 2339 | */ |
2297 | 2340 | ||
2298 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | 2341 | static int mem_cgroup_move_parent(struct page *page, |
2342 | struct page_cgroup *pc, | ||
2299 | struct mem_cgroup *child, | 2343 | struct mem_cgroup *child, |
2300 | gfp_t gfp_mask) | 2344 | gfp_t gfp_mask) |
2301 | { | 2345 | { |
2302 | struct page *page = pc->page; | ||
2303 | struct cgroup *cg = child->css.cgroup; | 2346 | struct cgroup *cg = child->css.cgroup; |
2304 | struct cgroup *pcg = cg->parent; | 2347 | struct cgroup *pcg = cg->parent; |
2305 | struct mem_cgroup *parent; | 2348 | struct mem_cgroup *parent; |
2306 | int page_size = PAGE_SIZE; | 2349 | unsigned int nr_pages; |
2307 | unsigned long flags; | 2350 | unsigned long uninitialized_var(flags); |
2308 | int ret; | 2351 | int ret; |
2309 | 2352 | ||
2310 | /* Is ROOT ? */ | 2353 | /* Is ROOT ? */ |
@@ -2317,23 +2360,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
2317 | if (isolate_lru_page(page)) | 2360 | if (isolate_lru_page(page)) |
2318 | goto put; | 2361 | goto put; |
2319 | 2362 | ||
2320 | if (PageTransHuge(page)) | 2363 | nr_pages = hpage_nr_pages(page); |
2321 | page_size = HPAGE_SIZE; | ||
2322 | 2364 | ||
2323 | parent = mem_cgroup_from_cont(pcg); | 2365 | parent = mem_cgroup_from_cont(pcg); |
2324 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, | 2366 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2325 | &parent, false, page_size); | ||
2326 | if (ret || !parent) | 2367 | if (ret || !parent) |
2327 | goto put_back; | 2368 | goto put_back; |
2328 | 2369 | ||
2329 | if (page_size > PAGE_SIZE) | 2370 | if (nr_pages > 1) |
2330 | flags = compound_lock_irqsave(page); | 2371 | flags = compound_lock_irqsave(page); |
2331 | 2372 | ||
2332 | ret = mem_cgroup_move_account(pc, child, parent, true, page_size); | 2373 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); |
2333 | if (ret) | 2374 | if (ret) |
2334 | mem_cgroup_cancel_charge(parent, page_size); | 2375 | __mem_cgroup_cancel_charge(parent, nr_pages); |
2335 | 2376 | ||
2336 | if (page_size > PAGE_SIZE) | 2377 | if (nr_pages > 1) |
2337 | compound_unlock_irqrestore(page, flags); | 2378 | compound_unlock_irqrestore(page, flags); |
2338 | put_back: | 2379 | put_back: |
2339 | putback_lru_page(page); | 2380 | putback_lru_page(page); |
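Note: mem_cgroup_move_parent() now derives the page count with hpage_nr_pages() instead of carrying a byte-sized page_size, and only takes the compound lock when nr_pages > 1. A tiny sketch of the order-to-page-count relation, with a made-up page model.

#include <stdio.h>

/* Assumed model: a head page records its compound order; 0 means a normal page. */
struct page_model { unsigned int compound_order; };

static unsigned int hpage_nr_pages_model(const struct page_model *p)
{
    return 1U << p->compound_order;   /* 2^order base pages back the compound page */
}

int main(void)
{
    struct page_model base = { 0 };
    struct page_model huge = { 9 };   /* e.g. a 2MB page built from 4KB base pages */

    printf("base page spans %u page(s)\n", hpage_nr_pages_model(&base));
    printf("huge page spans %u page(s)\n", hpage_nr_pages_model(&huge));
    return 0;
}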
@@ -2353,13 +2394,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2353 | gfp_t gfp_mask, enum charge_type ctype) | 2394 | gfp_t gfp_mask, enum charge_type ctype) |
2354 | { | 2395 | { |
2355 | struct mem_cgroup *mem = NULL; | 2396 | struct mem_cgroup *mem = NULL; |
2356 | int page_size = PAGE_SIZE; | 2397 | unsigned int nr_pages = 1; |
2357 | struct page_cgroup *pc; | 2398 | struct page_cgroup *pc; |
2358 | bool oom = true; | 2399 | bool oom = true; |
2359 | int ret; | 2400 | int ret; |
2360 | 2401 | ||
2361 | if (PageTransHuge(page)) { | 2402 | if (PageTransHuge(page)) { |
2362 | page_size <<= compound_order(page); | 2403 | nr_pages <<= compound_order(page); |
2363 | VM_BUG_ON(!PageTransHuge(page)); | 2404 | VM_BUG_ON(!PageTransHuge(page)); |
2364 | /* | 2405 | /* |
2365 | * Never OOM-kill a process for a huge page. The | 2406 | * Never OOM-kill a process for a huge page. The |
@@ -2369,16 +2410,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2369 | } | 2410 | } |
2370 | 2411 | ||
2371 | pc = lookup_page_cgroup(page); | 2412 | pc = lookup_page_cgroup(page); |
2372 | /* can happen at boot */ | 2413 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ |
2373 | if (unlikely(!pc)) | ||
2374 | return 0; | ||
2375 | prefetchw(pc); | ||
2376 | 2414 | ||
2377 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); | 2415 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); |
2378 | if (ret || !mem) | 2416 | if (ret || !mem) |
2379 | return ret; | 2417 | return ret; |
2380 | 2418 | ||
2381 | __mem_cgroup_commit_charge(mem, pc, ctype, page_size); | 2419 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); |
2382 | return 0; | 2420 | return 0; |
2383 | } | 2421 | } |
2384 | 2422 | ||
@@ -2406,9 +2444,26 @@ static void | |||
2406 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2444 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2407 | enum charge_type ctype); | 2445 | enum charge_type ctype); |
2408 | 2446 | ||
2447 | static void | ||
2448 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | ||
2449 | enum charge_type ctype) | ||
2450 | { | ||
2451 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2452 | /* | ||
2453 | * In some case, SwapCache, FUSE(splice_buf->radixtree), the page | ||
2454 | * is already on LRU. It means the page may on some other page_cgroup's | ||
2455 | * LRU. Take care of it. | ||
2456 | */ | ||
2457 | mem_cgroup_lru_del_before_commit(page); | ||
2458 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | ||
2459 | mem_cgroup_lru_add_after_commit(page); | ||
2460 | return; | ||
2461 | } | ||
2462 | |||
2409 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2463 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2410 | gfp_t gfp_mask) | 2464 | gfp_t gfp_mask) |
2411 | { | 2465 | { |
2466 | struct mem_cgroup *mem = NULL; | ||
2412 | int ret; | 2467 | int ret; |
2413 | 2468 | ||
2414 | if (mem_cgroup_disabled()) | 2469 | if (mem_cgroup_disabled()) |
@@ -2443,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2443 | if (unlikely(!mm)) | 2498 | if (unlikely(!mm)) |
2444 | mm = &init_mm; | 2499 | mm = &init_mm; |
2445 | 2500 | ||
2446 | if (page_is_file_cache(page)) | 2501 | if (page_is_file_cache(page)) { |
2447 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2502 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); |
2448 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 2503 | if (ret || !mem) |
2504 | return ret; | ||
2449 | 2505 | ||
2506 | /* | ||
2507 | * FUSE reuses pages without going through the final | ||
2508 | * put that would remove them from the LRU list, make | ||
2509 | * sure that they get relinked properly. | ||
2510 | */ | ||
2511 | __mem_cgroup_commit_charge_lrucare(page, mem, | ||
2512 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
2513 | return ret; | ||
2514 | } | ||
2450 | /* shmem */ | 2515 | /* shmem */ |
2451 | if (PageSwapCache(page)) { | 2516 | if (PageSwapCache(page)) { |
2452 | struct mem_cgroup *mem = NULL; | ||
2453 | |||
2454 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2517 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
2455 | if (!ret) | 2518 | if (!ret) |
2456 | __mem_cgroup_commit_charge_swapin(page, mem, | 2519 | __mem_cgroup_commit_charge_swapin(page, mem, |
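Note: the new __mem_cgroup_commit_charge_lrucare() wraps the commit in an LRU del/re-add so pages that are already on some LRU (swapcache, FUSE splice buffers) end up linked under the right memcg. A rough list-based model of that del/commit/add pattern; nothing here is the kernel API.

#include <stdio.h>

struct node { struct node *prev, *next; int charged_to; };

static void lru_del(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    n->prev = n->next = n;
}

static void lru_add(struct node *head, struct node *n)
{
    n->next = head->next;
    n->prev = head;
    head->next->prev = n;
    head->next = n;
}

/* Take the page off whatever list it is on, commit, then relink it. */
static void commit_charge_lrucare(struct node *target_lru, struct node *page,
                                  int memcg)
{
    lru_del(page);               /* ..._lru_del_before_commit() analogue */
    page->charged_to = memcg;    /* the commit itself */
    lru_add(target_lru, page);   /* ..._lru_add_after_commit() analogue */
}

int main(void)
{
    struct node old_lru = { &old_lru, &old_lru, 0 };
    struct node new_lru = { &new_lru, &new_lru, 0 };
    struct node page = { &page, &page, 0 };

    lru_add(&old_lru, &page);
    commit_charge_lrucare(&new_lru, &page, 7);
    printf("page charged to %d, on new LRU: %d\n",
           page.charged_to, new_lru.next == &page);
    return 0;
}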
@@ -2475,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2475 | struct mem_cgroup *mem; | 2538 | struct mem_cgroup *mem; |
2476 | int ret; | 2539 | int ret; |
2477 | 2540 | ||
2541 | *ptr = NULL; | ||
2542 | |||
2478 | if (mem_cgroup_disabled()) | 2543 | if (mem_cgroup_disabled()) |
2479 | return 0; | 2544 | return 0; |
2480 | 2545 | ||
@@ -2492,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2492 | if (!mem) | 2557 | if (!mem) |
2493 | goto charge_cur_mm; | 2558 | goto charge_cur_mm; |
2494 | *ptr = mem; | 2559 | *ptr = mem; |
2495 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); | 2560 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); |
2496 | css_put(&mem->css); | 2561 | css_put(&mem->css); |
2497 | return ret; | 2562 | return ret; |
2498 | charge_cur_mm: | 2563 | charge_cur_mm: |
2499 | if (unlikely(!mm)) | 2564 | if (unlikely(!mm)) |
2500 | mm = &init_mm; | 2565 | mm = &init_mm; |
2501 | return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); | 2566 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); |
2502 | } | 2567 | } |
2503 | 2568 | ||
2504 | static void | 2569 | static void |
2505 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2570 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2506 | enum charge_type ctype) | 2571 | enum charge_type ctype) |
2507 | { | 2572 | { |
2508 | struct page_cgroup *pc; | ||
2509 | |||
2510 | if (mem_cgroup_disabled()) | 2573 | if (mem_cgroup_disabled()) |
2511 | return; | 2574 | return; |
2512 | if (!ptr) | 2575 | if (!ptr) |
2513 | return; | 2576 | return; |
2514 | cgroup_exclude_rmdir(&ptr->css); | 2577 | cgroup_exclude_rmdir(&ptr->css); |
2515 | pc = lookup_page_cgroup(page); | 2578 | |
2516 | mem_cgroup_lru_del_before_commit_swapcache(page); | 2579 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); |
2517 | __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); | ||
2518 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
2519 | /* | 2580 | /* |
2520 | * Now swap is on-memory. This means this page may be | 2581 | * Now swap is on-memory. This means this page may be |
2521 | * counted both as mem and swap....double count. | 2582 | * counted both as mem and swap....double count. |
@@ -2563,15 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
2563 | return; | 2624 | return; |
2564 | if (!mem) | 2625 | if (!mem) |
2565 | return; | 2626 | return; |
2566 | mem_cgroup_cancel_charge(mem, PAGE_SIZE); | 2627 | __mem_cgroup_cancel_charge(mem, 1); |
2567 | } | 2628 | } |
2568 | 2629 | ||
2569 | static void | 2630 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, |
2570 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | 2631 | unsigned int nr_pages, |
2571 | int page_size) | 2632 | const enum charge_type ctype) |
2572 | { | 2633 | { |
2573 | struct memcg_batch_info *batch = NULL; | 2634 | struct memcg_batch_info *batch = NULL; |
2574 | bool uncharge_memsw = true; | 2635 | bool uncharge_memsw = true; |
2636 | |||
2575 | /* If swapout, usage of swap doesn't decrease */ | 2637 | /* If swapout, usage of swap doesn't decrease */ |
2576 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2638 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2577 | uncharge_memsw = false; | 2639 | uncharge_memsw = false; |
@@ -2586,7 +2648,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2586 | batch->memcg = mem; | 2648 | batch->memcg = mem; |
2587 | /* | 2649 | /* |
2588 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | 2650 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. |
2589 | * In those cases, all pages freed continously can be expected to be in | 2651 | * In those cases, all pages freed continuously can be expected to be in |
2590 | * the same cgroup and we have chance to coalesce uncharges. | 2652 | * the same cgroup and we have chance to coalesce uncharges. |
2591 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | 2653 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) |
2592 | * because we want to do uncharge as soon as possible. | 2654 | * because we want to do uncharge as soon as possible. |
@@ -2595,7 +2657,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2595 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2657 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
2596 | goto direct_uncharge; | 2658 | goto direct_uncharge; |
2597 | 2659 | ||
2598 | if (page_size != PAGE_SIZE) | 2660 | if (nr_pages > 1) |
2599 | goto direct_uncharge; | 2661 | goto direct_uncharge; |
2600 | 2662 | ||
2601 | /* | 2663 | /* |
@@ -2606,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2606 | if (batch->memcg != mem) | 2668 | if (batch->memcg != mem) |
2607 | goto direct_uncharge; | 2669 | goto direct_uncharge; |
2608 | /* remember freed charge and uncharge it later */ | 2670 | /* remember freed charge and uncharge it later */ |
2609 | batch->bytes += PAGE_SIZE; | 2671 | batch->nr_pages++; |
2610 | if (uncharge_memsw) | 2672 | if (uncharge_memsw) |
2611 | batch->memsw_bytes += PAGE_SIZE; | 2673 | batch->memsw_nr_pages++; |
2612 | return; | 2674 | return; |
2613 | direct_uncharge: | 2675 | direct_uncharge: |
2614 | res_counter_uncharge(&mem->res, page_size); | 2676 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); |
2615 | if (uncharge_memsw) | 2677 | if (uncharge_memsw) |
2616 | res_counter_uncharge(&mem->memsw, page_size); | 2678 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); |
2617 | if (unlikely(batch->memcg != mem)) | 2679 | if (unlikely(batch->memcg != mem)) |
2618 | memcg_oom_recover(mem); | 2680 | memcg_oom_recover(mem); |
2619 | return; | 2681 | return; |
@@ -2625,10 +2687,9 @@ direct_uncharge: | |||
2625 | static struct mem_cgroup * | 2687 | static struct mem_cgroup * |
2626 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2688 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
2627 | { | 2689 | { |
2628 | int count; | ||
2629 | struct page_cgroup *pc; | ||
2630 | struct mem_cgroup *mem = NULL; | 2690 | struct mem_cgroup *mem = NULL; |
2631 | int page_size = PAGE_SIZE; | 2691 | unsigned int nr_pages = 1; |
2692 | struct page_cgroup *pc; | ||
2632 | 2693 | ||
2633 | if (mem_cgroup_disabled()) | 2694 | if (mem_cgroup_disabled()) |
2634 | return NULL; | 2695 | return NULL; |
@@ -2637,11 +2698,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2637 | return NULL; | 2698 | return NULL; |
2638 | 2699 | ||
2639 | if (PageTransHuge(page)) { | 2700 | if (PageTransHuge(page)) { |
2640 | page_size <<= compound_order(page); | 2701 | nr_pages <<= compound_order(page); |
2641 | VM_BUG_ON(!PageTransHuge(page)); | 2702 | VM_BUG_ON(!PageTransHuge(page)); |
2642 | } | 2703 | } |
2643 | |||
2644 | count = page_size >> PAGE_SHIFT; | ||
2645 | /* | 2704 | /* |
2646 | * Check if our page_cgroup is valid | 2705 | * Check if our page_cgroup is valid |
2647 | */ | 2706 | */ |
@@ -2674,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2674 | break; | 2733 | break; |
2675 | } | 2734 | } |
2676 | 2735 | ||
2677 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); | 2736 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); |
2678 | 2737 | ||
2679 | ClearPageCgroupUsed(pc); | 2738 | ClearPageCgroupUsed(pc); |
2680 | /* | 2739 | /* |
@@ -2695,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2695 | mem_cgroup_get(mem); | 2754 | mem_cgroup_get(mem); |
2696 | } | 2755 | } |
2697 | if (!mem_cgroup_is_root(mem)) | 2756 | if (!mem_cgroup_is_root(mem)) |
2698 | __do_uncharge(mem, ctype, page_size); | 2757 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); |
2699 | 2758 | ||
2700 | return mem; | 2759 | return mem; |
2701 | 2760 | ||
@@ -2735,8 +2794,8 @@ void mem_cgroup_uncharge_start(void) | |||
2735 | /* We can do nest. */ | 2794 | /* We can do nest. */ |
2736 | if (current->memcg_batch.do_batch == 1) { | 2795 | if (current->memcg_batch.do_batch == 1) { |
2737 | current->memcg_batch.memcg = NULL; | 2796 | current->memcg_batch.memcg = NULL; |
2738 | current->memcg_batch.bytes = 0; | 2797 | current->memcg_batch.nr_pages = 0; |
2739 | current->memcg_batch.memsw_bytes = 0; | 2798 | current->memcg_batch.memsw_nr_pages = 0; |
2740 | } | 2799 | } |
2741 | } | 2800 | } |
2742 | 2801 | ||
@@ -2757,10 +2816,12 @@ void mem_cgroup_uncharge_end(void) | |||
2757 | * This "batch->memcg" is valid without any css_get/put etc... | 2816 | * This "batch->memcg" is valid without any css_get/put etc... |
2758 | * bacause we hide charges behind us. | 2817 | * bacause we hide charges behind us. |
2759 | */ | 2818 | */ |
2760 | if (batch->bytes) | 2819 | if (batch->nr_pages) |
2761 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2820 | res_counter_uncharge(&batch->memcg->res, |
2762 | if (batch->memsw_bytes) | 2821 | batch->nr_pages * PAGE_SIZE); |
2763 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2822 | if (batch->memsw_nr_pages) |
2823 | res_counter_uncharge(&batch->memcg->memsw, | ||
2824 | batch->memsw_nr_pages * PAGE_SIZE); | ||
2764 | memcg_oom_recover(batch->memcg); | 2825 | memcg_oom_recover(batch->memcg); |
2765 | /* forget this pointer (for sanity check) */ | 2826 | /* forget this pointer (for sanity check) */ |
2766 | batch->memcg = NULL; | 2827 | batch->memcg = NULL; |
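Note: with the batch structure now counting pages (nr_pages/memsw_nr_pages) rather than bytes, single-page uncharges are only tallied during a batch and converted to bytes once at mem_cgroup_uncharge_end(); compound pages still take the direct route. A simplified userspace model of that flow, with invented names.

#include <stdio.h>

#define PAGE_SIZE_MODEL 4096UL

struct batch_model   { int do_batch; unsigned long nr_pages; };
struct counter_model { unsigned long bytes; };

/* During a batch, order-0 uncharges are only counted; everything else hits
 * the counter directly, and the tally is flushed in bytes at the end. */
static void uncharge_one(struct counter_model *res, struct batch_model *b,
                         unsigned int nr_pages)
{
    if (b->do_batch && nr_pages == 1) {
        b->nr_pages++;
        return;
    }
    res->bytes -= nr_pages * PAGE_SIZE_MODEL;   /* direct_uncharge analogue */
}

static void uncharge_end(struct counter_model *res, struct batch_model *b)
{
    if (b->nr_pages)
        res->bytes -= b->nr_pages * PAGE_SIZE_MODEL;
    b->nr_pages = 0;
    b->do_batch = 0;
}

int main(void)
{
    struct counter_model res = { 1024 * PAGE_SIZE_MODEL };
    struct batch_model batch = { 1, 0 };

    uncharge_one(&res, &batch, 1);     /* batched */
    uncharge_one(&res, &batch, 1);     /* batched */
    uncharge_one(&res, &batch, 512);   /* huge page: direct */
    uncharge_end(&res, &batch);        /* flush the two batched pages */

    printf("charged pages remaining: %lu\n", res.bytes / PAGE_SIZE_MODEL); /* 510 */
    return 0;
}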
@@ -2883,13 +2944,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2883 | * page belongs to. | 2944 | * page belongs to. |
2884 | */ | 2945 | */ |
2885 | int mem_cgroup_prepare_migration(struct page *page, | 2946 | int mem_cgroup_prepare_migration(struct page *page, |
2886 | struct page *newpage, struct mem_cgroup **ptr) | 2947 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) |
2887 | { | 2948 | { |
2888 | struct page_cgroup *pc; | ||
2889 | struct mem_cgroup *mem = NULL; | 2949 | struct mem_cgroup *mem = NULL; |
2950 | struct page_cgroup *pc; | ||
2890 | enum charge_type ctype; | 2951 | enum charge_type ctype; |
2891 | int ret = 0; | 2952 | int ret = 0; |
2892 | 2953 | ||
2954 | *ptr = NULL; | ||
2955 | |||
2893 | VM_BUG_ON(PageTransHuge(page)); | 2956 | VM_BUG_ON(PageTransHuge(page)); |
2894 | if (mem_cgroup_disabled()) | 2957 | if (mem_cgroup_disabled()) |
2895 | return 0; | 2958 | return 0; |
@@ -2940,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2940 | return 0; | 3003 | return 0; |
2941 | 3004 | ||
2942 | *ptr = mem; | 3005 | *ptr = mem; |
2943 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); | 3006 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); |
2944 | css_put(&mem->css);/* drop extra refcnt */ | 3007 | css_put(&mem->css);/* drop extra refcnt */ |
2945 | if (ret || *ptr == NULL) { | 3008 | if (ret || *ptr == NULL) { |
2946 | if (PageAnon(page)) { | 3009 | if (PageAnon(page)) { |
@@ -2967,7 +3030,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2967 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3030 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
2968 | else | 3031 | else |
2969 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3032 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
2970 | __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); | 3033 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); |
2971 | return ret; | 3034 | return ret; |
2972 | } | 3035 | } |
2973 | 3036 | ||
@@ -3032,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3032 | struct mm_struct *mm, | 3095 | struct mm_struct *mm, |
3033 | gfp_t gfp_mask) | 3096 | gfp_t gfp_mask) |
3034 | { | 3097 | { |
3035 | struct mem_cgroup *mem = NULL; | 3098 | struct mem_cgroup *mem; |
3036 | int ret; | 3099 | int ret; |
3037 | 3100 | ||
3038 | if (mem_cgroup_disabled()) | 3101 | if (mem_cgroup_disabled()) |
@@ -3045,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3045 | return ret; | 3108 | return ret; |
3046 | } | 3109 | } |
3047 | 3110 | ||
3111 | #ifdef CONFIG_DEBUG_VM | ||
3112 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | ||
3113 | { | ||
3114 | struct page_cgroup *pc; | ||
3115 | |||
3116 | pc = lookup_page_cgroup(page); | ||
3117 | if (likely(pc) && PageCgroupUsed(pc)) | ||
3118 | return pc; | ||
3119 | return NULL; | ||
3120 | } | ||
3121 | |||
3122 | bool mem_cgroup_bad_page_check(struct page *page) | ||
3123 | { | ||
3124 | if (mem_cgroup_disabled()) | ||
3125 | return false; | ||
3126 | |||
3127 | return lookup_page_cgroup_used(page) != NULL; | ||
3128 | } | ||
3129 | |||
3130 | void mem_cgroup_print_bad_page(struct page *page) | ||
3131 | { | ||
3132 | struct page_cgroup *pc; | ||
3133 | |||
3134 | pc = lookup_page_cgroup_used(page); | ||
3135 | if (pc) { | ||
3136 | int ret = -1; | ||
3137 | char *path; | ||
3138 | |||
3139 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", | ||
3140 | pc, pc->flags, pc->mem_cgroup); | ||
3141 | |||
3142 | path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3143 | if (path) { | ||
3144 | rcu_read_lock(); | ||
3145 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, | ||
3146 | path, PATH_MAX); | ||
3147 | rcu_read_unlock(); | ||
3148 | } | ||
3149 | |||
3150 | printk(KERN_CONT "(%s)\n", | ||
3151 | (ret < 0) ? "cannot get the path" : path); | ||
3152 | kfree(path); | ||
3153 | } | ||
3154 | } | ||
3155 | #endif | ||
3156 | |||
3048 | static DEFINE_MUTEX(set_limit_mutex); | 3157 | static DEFINE_MUTEX(set_limit_mutex); |
3049 | 3158 | ||
3050 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3159 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
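Note: the new CONFIG_DEBUG_VM helpers report pages that still carry a used page_cgroup when they reach the page allocator, printing the owning cgroup's path when it can be resolved. A loose userspace approximation of that reporting pattern, purely illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pc_model { unsigned long flags; const char *cgroup_path; };

/* Print the stale descriptor, falling back when the path cannot be built. */
static void print_bad_page(const struct pc_model *pc)
{
    char *path = malloc(64);
    int ret = -1;

    printf("pc:%p pc->flags:%lx", (const void *)pc, pc->flags);
    if (path && pc->cgroup_path && strlen(pc->cgroup_path) < 64) {
        strcpy(path, pc->cgroup_path);
        ret = 0;
    }
    printf(" (%s)\n", ret < 0 ? "cannot get the path" : path);
    free(path);
}

int main(void)
{
    struct pc_model leaked = { 0x8, "/test/group" };
    print_bad_page(&leaked);
    return 0;
}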
@@ -3288,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3288 | loop += 256; | 3397 | loop += 256; |
3289 | busy = NULL; | 3398 | busy = NULL; |
3290 | while (loop--) { | 3399 | while (loop--) { |
3400 | struct page *page; | ||
3401 | |||
3291 | ret = 0; | 3402 | ret = 0; |
3292 | spin_lock_irqsave(&zone->lru_lock, flags); | 3403 | spin_lock_irqsave(&zone->lru_lock, flags); |
3293 | if (list_empty(list)) { | 3404 | if (list_empty(list)) { |
@@ -3303,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3303 | } | 3414 | } |
3304 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3415 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3305 | 3416 | ||
3306 | ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); | 3417 | page = lookup_cgroup_page(pc); |
3418 | |||
3419 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); | ||
3307 | if (ret == -ENOMEM) | 3420 | if (ret == -ENOMEM) |
3308 | break; | 3421 | break; |
3309 | 3422 | ||
@@ -3451,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3451 | } | 3564 | } |
3452 | 3565 | ||
3453 | 3566 | ||
3454 | static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | 3567 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, |
3455 | enum mem_cgroup_stat_index idx) | 3568 | enum mem_cgroup_stat_index idx) |
3456 | { | 3569 | { |
3457 | struct mem_cgroup *iter; | 3570 | struct mem_cgroup *iter; |
3458 | s64 val = 0; | 3571 | long val = 0; |
3459 | 3572 | ||
3460 | /* each per cpu's value can be minus.Then, use s64 */ | 3573 | /* Per-cpu values can be negative, use a signed accumulator */ |
3461 | for_each_mem_cgroup_tree(iter, mem) | 3574 | for_each_mem_cgroup_tree(iter, mem) |
3462 | val += mem_cgroup_read_stat(iter, idx); | 3575 | val += mem_cgroup_read_stat(iter, idx); |
3463 | 3576 | ||
@@ -3477,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | |||
3477 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3590 | return res_counter_read_u64(&mem->memsw, RES_USAGE); |
3478 | } | 3591 | } |
3479 | 3592 | ||
3480 | val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); | 3593 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); |
3481 | val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); | 3594 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); |
3482 | 3595 | ||
3483 | if (swap) | 3596 | if (swap) |
3484 | val += mem_cgroup_get_recursive_idx_stat(mem, | 3597 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
3485 | MEM_CGROUP_STAT_SWAPOUT); | ||
3486 | 3598 | ||
3487 | return val << PAGE_SHIFT; | 3599 | return val << PAGE_SHIFT; |
3488 | } | 3600 | } |
@@ -3702,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
3702 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3814 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
3703 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); | 3815 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
3704 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3816 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
3705 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3817 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); |
3706 | s->stat[MCS_PGPGIN] += val; | 3818 | s->stat[MCS_PGPGIN] += val; |
3707 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3819 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); |
3708 | s->stat[MCS_PGPGOUT] += val; | 3820 | s->stat[MCS_PGPGOUT] += val; |
3709 | if (do_swap_account) { | 3821 | if (do_swap_account) { |
3710 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3822 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
@@ -3828,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
3828 | return -EINVAL; | 3940 | return -EINVAL; |
3829 | } | 3941 | } |
3830 | 3942 | ||
3831 | spin_lock(&memcg->reclaim_param_lock); | ||
3832 | memcg->swappiness = val; | 3943 | memcg->swappiness = val; |
3833 | spin_unlock(&memcg->reclaim_param_lock); | ||
3834 | 3944 | ||
3835 | cgroup_unlock(); | 3945 | cgroup_unlock(); |
3836 | 3946 | ||
@@ -4486,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4486 | res_counter_init(&mem->memsw, NULL); | 4596 | res_counter_init(&mem->memsw, NULL); |
4487 | } | 4597 | } |
4488 | mem->last_scanned_child = 0; | 4598 | mem->last_scanned_child = 0; |
4489 | spin_lock_init(&mem->reclaim_param_lock); | ||
4490 | INIT_LIST_HEAD(&mem->oom_notify); | 4599 | INIT_LIST_HEAD(&mem->oom_notify); |
4491 | 4600 | ||
4492 | if (parent) | 4601 | if (parent) |
@@ -4574,8 +4683,7 @@ one_by_one: | |||
4574 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4683 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
4575 | cond_resched(); | 4684 | cond_resched(); |
4576 | } | 4685 | } |
4577 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 4686 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); |
4578 | PAGE_SIZE); | ||
4579 | if (ret || !mem) | 4687 | if (ret || !mem) |
4580 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4688 | /* mem_cgroup_clear_mc() will do uncharge later */ |
4581 | return -ENOMEM; | 4689 | return -ENOMEM; |
@@ -4737,7 +4845,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
4737 | pte_t *pte; | 4845 | pte_t *pte; |
4738 | spinlock_t *ptl; | 4846 | spinlock_t *ptl; |
4739 | 4847 | ||
4740 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 4848 | split_huge_page_pmd(walk->mm, pmd); |
4849 | |||
4741 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4850 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4742 | for (; addr != end; pte++, addr += PAGE_SIZE) | 4851 | for (; addr != end; pte++, addr += PAGE_SIZE) |
4743 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 4852 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) |
@@ -4899,8 +5008,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
4899 | pte_t *pte; | 5008 | pte_t *pte; |
4900 | spinlock_t *ptl; | 5009 | spinlock_t *ptl; |
4901 | 5010 | ||
5011 | split_huge_page_pmd(walk->mm, pmd); | ||
4902 | retry: | 5012 | retry: |
4903 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
4904 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5013 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4905 | for (; addr != end; addr += PAGE_SIZE) { | 5014 | for (; addr != end; addr += PAGE_SIZE) { |
4906 | pte_t ptent = *(pte++); | 5015 | pte_t ptent = *(pte++); |
@@ -4920,8 +5029,8 @@ retry: | |||
4920 | if (isolate_lru_page(page)) | 5029 | if (isolate_lru_page(page)) |
4921 | goto put; | 5030 | goto put; |
4922 | pc = lookup_page_cgroup(page); | 5031 | pc = lookup_page_cgroup(page); |
4923 | if (!mem_cgroup_move_account(pc, | 5032 | if (!mem_cgroup_move_account(page, 1, pc, |
4924 | mc.from, mc.to, false, PAGE_SIZE)) { | 5033 | mc.from, mc.to, false)) { |
4925 | mc.precharge--; | 5034 | mc.precharge--; |
4926 | /* we uncharge from mc.from later. */ | 5035 | /* we uncharge from mc.from later. */ |
4927 | mc.moved_charge++; | 5036 | mc.moved_charge++; |
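Note: both charge walkers now call split_huge_page_pmd() up front instead of asserting that no huge pmd can appear, so the per-pte loops always operate on base entries. A toy model of that split-then-walk order, with invented structures.

#include <stdio.h>

#define PTES_PER_PMD 512

struct pmd_model { int trans_huge; int pte[PTES_PER_PMD]; };

/* Break a huge entry into per-page entries before the walk touches them. */
static void split_huge_pmd_model(struct pmd_model *pmd)
{
    if (!pmd->trans_huge)
        return;
    for (int i = 0; i < PTES_PER_PMD; i++)
        pmd->pte[i] = 1;          /* each base page now has its own entry */
    pmd->trans_huge = 0;
}

static int count_target_ptes(struct pmd_model *pmd)
{
    int count = 0;

    split_huge_pmd_model(pmd);    /* replaces the old VM_BUG_ON() */
    for (int i = 0; i < PTES_PER_PMD; i++)
        count += pmd->pte[i];
    return count;
}

int main(void)
{
    struct pmd_model pmd = { 1, { 0 } };

    printf("precharge count: %d\n", count_target_ptes(&pmd));  /* 512 */
    return 0;
}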
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 99ccb4472623..2b9a5eef39e0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -208,7 +208,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
208 | * Don't use force here, it's convenient if the signal | 208 | * Don't use force here, it's convenient if the signal |
209 | * can be temporarily blocked. | 209 | * can be temporarily blocked. |
210 | * This could cause a loop when the user sets SIGBUS | 210 | * This could cause a loop when the user sets SIGBUS |
211 | * to SIG_IGN, but hopefully noone will do that? | 211 | * to SIG_IGN, but hopefully no one will do that? |
212 | */ | 212 | */ |
213 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | 213 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ |
214 | if (ret < 0) | 214 | if (ret < 0) |
@@ -634,7 +634,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) | |||
634 | * when the page is reread or dropped. If an | 634 | * when the page is reread or dropped. If an |
635 | * application assumes it will always get error on | 635 | * application assumes it will always get error on |
636 | * fsync, but does other operations on the fd before | 636 | * fsync, but does other operations on the fd before |
637 | * and the page is dropped inbetween then the error | 637 | * and the page is dropped between then the error |
638 | * will not be properly reported. | 638 | * will not be properly reported. |
639 | * | 639 | * |
640 | * This can already happen even without hwpoisoned | 640 | * This can already happen even without hwpoisoned |
@@ -728,7 +728,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
728 | * The table matches them in order and calls the right handler. | 728 | * The table matches them in order and calls the right handler. |
729 | * | 729 | * |
730 | * This is quite tricky because we can access page at any time | 730 | * This is quite tricky because we can access page at any time |
731 | * in its live cycle, so all accesses have to be extremly careful. | 731 | * in its live cycle, so all accesses have to be extremely careful. |
732 | * | 732 | * |
733 | * This is not complete. More states could be added. | 733 | * This is not complete. More states could be added. |
734 | * For any missing state don't attempt recovery. | 734 | * For any missing state don't attempt recovery. |
@@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
945 | collect_procs(ppage, &tokill); | 945 | collect_procs(ppage, &tokill); |
946 | 946 | ||
947 | if (hpage != ppage) | 947 | if (hpage != ppage) |
948 | lock_page_nosync(ppage); | 948 | lock_page(ppage); |
949 | 949 | ||
950 | ret = try_to_unmap(ppage, ttu); | 950 | ret = try_to_unmap(ppage, ttu); |
951 | if (ret != SWAP_SUCCESS) | 951 | if (ret != SWAP_SUCCESS) |
@@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1038 | * Check "just unpoisoned", "filter hit", and | 1038 | * Check "just unpoisoned", "filter hit", and |
1039 | * "race with other subpage." | 1039 | * "race with other subpage." |
1040 | */ | 1040 | */ |
1041 | lock_page_nosync(hpage); | 1041 | lock_page(hpage); |
1042 | if (!PageHWPoison(hpage) | 1042 | if (!PageHWPoison(hpage) |
1043 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1043 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1044 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1044 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
@@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1088 | * It's very difficult to mess with pages currently under IO | 1088 | * It's very difficult to mess with pages currently under IO |
1089 | * and in many cases impossible, so we just avoid it here. | 1089 | * and in many cases impossible, so we just avoid it here. |
1090 | */ | 1090 | */ |
1091 | lock_page_nosync(hpage); | 1091 | lock_page(hpage); |
1092 | 1092 | ||
1093 | /* | 1093 | /* |
1094 | * unpoison always clear PG_hwpoison inside page lock | 1094 | * unpoison always clear PG_hwpoison inside page lock |
@@ -1130,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1130 | 1130 | ||
1131 | /* | 1131 | /* |
1132 | * Now take care of user space mappings. | 1132 | * Now take care of user space mappings. |
1133 | * Abort on fail: __remove_from_page_cache() assumes unmapped page. | 1133 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. |
1134 | */ | 1134 | */ |
1135 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { | 1135 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
1136 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | 1136 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); |
@@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn) | |||
1231 | return 0; | 1231 | return 0; |
1232 | } | 1232 | } |
1233 | 1233 | ||
1234 | lock_page_nosync(page); | 1234 | lock_page(page); |
1235 | /* | 1235 | /* |
1236 | * This test is racy because PG_hwpoison is set outside of page lock. | 1236 | * This test is racy because PG_hwpoison is set outside of page lock. |
1237 | * That's acceptable because that won't trigger kernel panic. Instead, | 1237 | * That's acceptable because that won't trigger kernel panic. Instead, |
diff --git a/mm/memory.c b/mm/memory.c index e48945ab362b..ce22a250926f 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1410,6 +1410,13 @@ no_page_table: | |||
1410 | return page; | 1410 | return page; |
1411 | } | 1411 | } |
1412 | 1412 | ||
1413 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | ||
1414 | { | ||
1415 | return (vma->vm_flags & VM_GROWSDOWN) && | ||
1416 | (vma->vm_start == addr) && | ||
1417 | !vma_stack_continue(vma->vm_prev, addr); | ||
1418 | } | ||
1419 | |||
1413 | /** | 1420 | /** |
1414 | * __get_user_pages() - pin user pages in memory | 1421 | * __get_user_pages() - pin user pages in memory |
1415 | * @tsk: task_struct of target task | 1422 | * @tsk: task_struct of target task |
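Note: the new stack_guard_page() helper marks the lowest page of a grows-down VMA (unless the previous VMA continues the stack) so __get_user_pages() can skip it when the caller does not want the page itself. A toy predicate along those lines, without the adjacent-VMA check.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE_MODEL 4096UL

/* Toy VMA: a range plus a grows-down flag. The kernel version additionally
 * checks that the previous VMA does not continue the stack. */
struct vma_model { unsigned long start, end; bool grows_down; };

static bool stack_guard_page_model(const struct vma_model *vma,
                                   unsigned long addr)
{
    return vma->grows_down && vma->start == addr;
}

int main(void)
{
    struct vma_model stack = { 0x7f0000000000UL, 0x7f0000020000UL, true };

    printf("%d\n", stack_guard_page_model(&stack, stack.start));                   /* 1 */
    printf("%d\n", stack_guard_page_model(&stack, stack.start + PAGE_SIZE_MODEL)); /* 0 */
    return 0;
}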
@@ -1486,9 +1493,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1486 | struct vm_area_struct *vma; | 1493 | struct vm_area_struct *vma; |
1487 | 1494 | ||
1488 | vma = find_extend_vma(mm, start); | 1495 | vma = find_extend_vma(mm, start); |
1489 | if (!vma && in_gate_area(tsk, start)) { | 1496 | if (!vma && in_gate_area(mm, start)) { |
1490 | unsigned long pg = start & PAGE_MASK; | 1497 | unsigned long pg = start & PAGE_MASK; |
1491 | struct vm_area_struct *gate_vma = get_gate_vma(tsk); | ||
1492 | pgd_t *pgd; | 1498 | pgd_t *pgd; |
1493 | pud_t *pud; | 1499 | pud_t *pud; |
1494 | pmd_t *pmd; | 1500 | pmd_t *pmd; |
@@ -1513,10 +1519,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1513 | pte_unmap(pte); | 1519 | pte_unmap(pte); |
1514 | return i ? : -EFAULT; | 1520 | return i ? : -EFAULT; |
1515 | } | 1521 | } |
1522 | vma = get_gate_vma(mm); | ||
1516 | if (pages) { | 1523 | if (pages) { |
1517 | struct page *page; | 1524 | struct page *page; |
1518 | 1525 | ||
1519 | page = vm_normal_page(gate_vma, start, *pte); | 1526 | page = vm_normal_page(vma, start, *pte); |
1520 | if (!page) { | 1527 | if (!page) { |
1521 | if (!(gup_flags & FOLL_DUMP) && | 1528 | if (!(gup_flags & FOLL_DUMP) && |
1522 | is_zero_pfn(pte_pfn(*pte))) | 1529 | is_zero_pfn(pte_pfn(*pte))) |
@@ -1530,12 +1537,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1530 | get_page(page); | 1537 | get_page(page); |
1531 | } | 1538 | } |
1532 | pte_unmap(pte); | 1539 | pte_unmap(pte); |
1533 | if (vmas) | 1540 | goto next_page; |
1534 | vmas[i] = gate_vma; | ||
1535 | i++; | ||
1536 | start += PAGE_SIZE; | ||
1537 | nr_pages--; | ||
1538 | continue; | ||
1539 | } | 1541 | } |
1540 | 1542 | ||
1541 | if (!vma || | 1543 | if (!vma || |
@@ -1549,6 +1551,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1549 | continue; | 1551 | continue; |
1550 | } | 1552 | } |
1551 | 1553 | ||
1554 | /* | ||
1555 | * If we don't actually want the page itself, | ||
1556 | * and it's the stack guard page, just skip it. | ||
1557 | */ | ||
1558 | if (!pages && stack_guard_page(vma, start)) | ||
1559 | goto next_page; | ||
1560 | |||
1552 | do { | 1561 | do { |
1553 | struct page *page; | 1562 | struct page *page; |
1554 | unsigned int foll_flags = gup_flags; | 1563 | unsigned int foll_flags = gup_flags; |
@@ -1569,6 +1578,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1569 | fault_flags |= FAULT_FLAG_WRITE; | 1578 | fault_flags |= FAULT_FLAG_WRITE; |
1570 | if (nonblocking) | 1579 | if (nonblocking) |
1571 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | 1580 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; |
1581 | if (foll_flags & FOLL_NOWAIT) | ||
1582 | fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); | ||
1572 | 1583 | ||
1573 | ret = handle_mm_fault(mm, vma, start, | 1584 | ret = handle_mm_fault(mm, vma, start, |
1574 | fault_flags); | 1585 | fault_flags); |
@@ -1589,13 +1600,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1589 | return i ? i : -EFAULT; | 1600 | return i ? i : -EFAULT; |
1590 | BUG(); | 1601 | BUG(); |
1591 | } | 1602 | } |
1592 | if (ret & VM_FAULT_MAJOR) | 1603 | |
1593 | tsk->maj_flt++; | 1604 | if (tsk) { |
1594 | else | 1605 | if (ret & VM_FAULT_MAJOR) |
1595 | tsk->min_flt++; | 1606 | tsk->maj_flt++; |
1607 | else | ||
1608 | tsk->min_flt++; | ||
1609 | } | ||
1596 | 1610 | ||
1597 | if (ret & VM_FAULT_RETRY) { | 1611 | if (ret & VM_FAULT_RETRY) { |
1598 | *nonblocking = 0; | 1612 | if (nonblocking) |
1613 | *nonblocking = 0; | ||
1599 | return i; | 1614 | return i; |
1600 | } | 1615 | } |
1601 | 1616 | ||
@@ -1625,6 +1640,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1625 | flush_anon_page(vma, page, start); | 1640 | flush_anon_page(vma, page, start); |
1626 | flush_dcache_page(page); | 1641 | flush_dcache_page(page); |
1627 | } | 1642 | } |
1643 | next_page: | ||
1628 | if (vmas) | 1644 | if (vmas) |
1629 | vmas[i] = vma; | 1645 | vmas[i] = vma; |
1630 | i++; | 1646 | i++; |
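Note: fault accounting in __get_user_pages() is now guarded so both tsk and nonblocking may be NULL: major/minor counts are only bumped when a task is supplied, and the retry flag is only cleared when the caller passed one. A compact sketch of that optional-argument handling.

#include <stdio.h>

struct task_stats { unsigned long maj_flt, min_flt; };

/* Both the stats target and the retry flag are optional; guard each one. */
static void account_fault(struct task_stats *tsk, int *nonblocking,
                          int major, int retried)
{
    if (tsk) {
        if (major)
            tsk->maj_flt++;
        else
            tsk->min_flt++;
    }
    if (retried && nonblocking)
        *nonblocking = 0;
}

int main(void)
{
    struct task_stats t = { 0, 0 };

    account_fault(&t, NULL, 1, 1);   /* no retry flag to clear; still counted */
    account_fault(NULL, NULL, 0, 0); /* fully anonymous fault: nothing recorded */
    printf("maj=%lu min=%lu\n", t.maj_flt, t.min_flt);
    return 0;
}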
@@ -1638,7 +1654,8 @@ EXPORT_SYMBOL(__get_user_pages); | |||
1638 | 1654 | ||
1639 | /** | 1655 | /** |
1640 | * get_user_pages() - pin user pages in memory | 1656 | * get_user_pages() - pin user pages in memory |
1641 | * @tsk: task_struct of target task | 1657 | * @tsk: the task_struct to use for page fault accounting, or |
1658 | * NULL if faults are not to be recorded. | ||
1642 | * @mm: mm_struct of target mm | 1659 | * @mm: mm_struct of target mm |
1643 | * @start: starting user address | 1660 | * @start: starting user address |
1644 | * @nr_pages: number of pages from start to pin | 1661 | * @nr_pages: number of pages from start to pin |
@@ -2764,7 +2781,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2764 | swp_entry_t entry; | 2781 | swp_entry_t entry; |
2765 | pte_t pte; | 2782 | pte_t pte; |
2766 | int locked; | 2783 | int locked; |
2767 | struct mem_cgroup *ptr = NULL; | 2784 | struct mem_cgroup *ptr; |
2768 | int exclusive = 0; | 2785 | int exclusive = 0; |
2769 | int ret = 0; | 2786 | int ret = 0; |
2770 | 2787 | ||
@@ -3496,7 +3513,7 @@ static int __init gate_vma_init(void) | |||
3496 | __initcall(gate_vma_init); | 3513 | __initcall(gate_vma_init); |
3497 | #endif | 3514 | #endif |
3498 | 3515 | ||
3499 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | 3516 | struct vm_area_struct *get_gate_vma(struct mm_struct *mm) |
3500 | { | 3517 | { |
3501 | #ifdef AT_SYSINFO_EHDR | 3518 | #ifdef AT_SYSINFO_EHDR |
3502 | return &gate_vma; | 3519 | return &gate_vma; |
@@ -3505,7 +3522,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | |||
3505 | #endif | 3522 | #endif |
3506 | } | 3523 | } |
3507 | 3524 | ||
3508 | int in_gate_area_no_task(unsigned long addr) | 3525 | int in_gate_area_no_mm(unsigned long addr) |
3509 | { | 3526 | { |
3510 | #ifdef AT_SYSINFO_EHDR | 3527 | #ifdef AT_SYSINFO_EHDR |
3511 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) | 3528 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) |
@@ -3646,20 +3663,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
3646 | #endif | 3663 | #endif |
3647 | 3664 | ||
3648 | /* | 3665 | /* |
3649 | * Access another process' address space. | 3666 | * Access another process' address space as given in mm. If non-NULL, use the |
3650 | * Source/target buffer must be kernel space, | 3667 | * given task for page fault accounting. |
3651 | * Do not walk the page table directly, use get_user_pages | ||
3652 | */ | 3668 | */ |
3653 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | 3669 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
3670 | unsigned long addr, void *buf, int len, int write) | ||
3654 | { | 3671 | { |
3655 | struct mm_struct *mm; | ||
3656 | struct vm_area_struct *vma; | 3672 | struct vm_area_struct *vma; |
3657 | void *old_buf = buf; | 3673 | void *old_buf = buf; |
3658 | 3674 | ||
3659 | mm = get_task_mm(tsk); | ||
3660 | if (!mm) | ||
3661 | return 0; | ||
3662 | |||
3663 | down_read(&mm->mmap_sem); | 3675 | down_read(&mm->mmap_sem); |
3664 | /* ignore errors, just check how much was successfully transferred */ | 3676 | /* ignore errors, just check how much was successfully transferred */ |
3665 | while (len) { | 3677 | while (len) { |
@@ -3676,7 +3688,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
3676 | */ | 3688 | */ |
3677 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3689 | #ifdef CONFIG_HAVE_IOREMAP_PROT |
3678 | vma = find_vma(mm, addr); | 3690 | vma = find_vma(mm, addr); |
3679 | if (!vma) | 3691 | if (!vma || vma->vm_start > addr) |
3680 | break; | 3692 | break; |
3681 | if (vma->vm_ops && vma->vm_ops->access) | 3693 | if (vma->vm_ops && vma->vm_ops->access) |
3682 | ret = vma->vm_ops->access(vma, addr, buf, | 3694 | ret = vma->vm_ops->access(vma, addr, buf, |
@@ -3708,11 +3720,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
3708 | addr += bytes; | 3720 | addr += bytes; |
3709 | } | 3721 | } |
3710 | up_read(&mm->mmap_sem); | 3722 | up_read(&mm->mmap_sem); |
3711 | mmput(mm); | ||
3712 | 3723 | ||
3713 | return buf - old_buf; | 3724 | return buf - old_buf; |
3714 | } | 3725 | } |
3715 | 3726 | ||
3727 | /** | ||
3728 | * access_remote_vm - access another process' address space | ||
3729 | * @mm: the mm_struct of the target address space | ||
3730 | * @addr: start address to access | ||
3731 | * @buf: source or destination buffer | ||
3732 | * @len: number of bytes to transfer | ||
3733 | * @write: whether the access is a write | ||
3734 | * | ||
3735 | * The caller must hold a reference on @mm. | ||
3736 | */ | ||
3737 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, | ||
3738 | void *buf, int len, int write) | ||
3739 | { | ||
3740 | return __access_remote_vm(NULL, mm, addr, buf, len, write); | ||
3741 | } | ||
3742 | |||
3743 | /* | ||
3744 | * Access another process' address space. | ||
3745 | * Source/target buffer must be kernel space, | ||
3746 | * Do not walk the page table directly, use get_user_pages | ||
3747 | */ | ||
3748 | int access_process_vm(struct task_struct *tsk, unsigned long addr, | ||
3749 | void *buf, int len, int write) | ||
3750 | { | ||
3751 | struct mm_struct *mm; | ||
3752 | int ret; | ||
3753 | |||
3754 | mm = get_task_mm(tsk); | ||
3755 | if (!mm) | ||
3756 | return 0; | ||
3757 | |||
3758 | ret = __access_remote_vm(tsk, mm, addr, buf, len, write); | ||
3759 | mmput(mm); | ||
3760 | |||
3761 | return ret; | ||
3762 | } | ||
3763 | |||
3716 | /* | 3764 | /* |
3717 | * Print the name of a VMA. | 3765 | * Print the name of a VMA. |
3718 | */ | 3766 | */ |
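Note: access_process_vm() becomes a thin wrapper here: the page-walking work moves into __access_remote_vm(), which operates on an mm directly, and the wrapper only looks up and pins the task's mm around the call (access_remote_vm() exposes the mm-based variant). A self-contained model of that wrapper split, with invented types.

#include <stdio.h>

struct mm_model { int refs; char data[64]; };

/* The worker operates on the address-space object directly. */
static int access_vm_worker(struct mm_model *mm, unsigned long addr,
                            char *buf, int len)
{
    int i;

    for (i = 0; i < len && addr + i < sizeof(mm->data); i++)
        buf[i] = mm->data[addr + i];
    return i;   /* bytes actually copied */
}

static struct mm_model *get_task_mm_model(struct mm_model *mm)
{
    if (mm)
        mm->refs++;   /* get_task_mm() analogue */
    return mm;
}

/* The wrapper only pins the mm, delegates, and drops the reference. */
static int access_process_vm_model(struct mm_model *task_mm, unsigned long addr,
                                   char *buf, int len)
{
    struct mm_model *mm = get_task_mm_model(task_mm);
    int ret;

    if (!mm)
        return 0;
    ret = access_vm_worker(mm, addr, buf, len);
    mm->refs--;       /* mmput() analogue */
    return ret;
}

int main(void)
{
    struct mm_model mm = { 1, "remote bytes" };
    char buf[8] = { 0 };

    printf("copied %d bytes: %.7s\n",
           access_process_vm_model(&mm, 0, buf, 7), buf);
    return 0;
}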
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 321fc7455df7..9ca1d604f7cd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -375,7 +375,7 @@ void online_page(struct page *page) | |||
375 | #endif | 375 | #endif |
376 | 376 | ||
377 | #ifdef CONFIG_FLATMEM | 377 | #ifdef CONFIG_FLATMEM |
378 | max_mapnr = max(page_to_pfn(page), max_mapnr); | 378 | max_mapnr = max(pfn, max_mapnr); |
379 | #endif | 379 | #endif |
380 | 380 | ||
381 | ClearPageReserved(page); | 381 | ClearPageReserved(page); |
@@ -724,7 +724,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
724 | pfn); | 724 | pfn); |
725 | dump_page(page); | 725 | dump_page(page); |
726 | #endif | 726 | #endif |
727 | /* Becasue we don't have big zone->lock. we should | 727 | /* Because we don't have big zone->lock. we should |
728 | check this again here. */ | 728 | check this again here. */ |
729 | if (page_count(page)) { | 729 | if (page_count(page)) { |
730 | not_managed++; | 730 | not_managed++; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78062ab641ff..959a8b8c7350 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1979,8 +1979,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
1979 | case MPOL_INTERLEAVE: | 1979 | case MPOL_INTERLEAVE: |
1980 | return nodes_equal(a->v.nodes, b->v.nodes); | 1980 | return nodes_equal(a->v.nodes, b->v.nodes); |
1981 | case MPOL_PREFERRED: | 1981 | case MPOL_PREFERRED: |
1982 | return a->v.preferred_node == b->v.preferred_node && | 1982 | return a->v.preferred_node == b->v.preferred_node; |
1983 | a->flags == b->flags; | ||
1984 | default: | 1983 | default: |
1985 | BUG(); | 1984 | BUG(); |
1986 | return 0; | 1985 | return 0; |
diff --git a/mm/migrate.c b/mm/migrate.c index 352de555626c..34132f8e9109 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -375,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
375 | * redo the accounting that clear_page_dirty_for_io undid, | 375 | * redo the accounting that clear_page_dirty_for_io undid, |
376 | * but we can't use set_page_dirty because that function | 376 | * but we can't use set_page_dirty because that function |
377 | * is actually a signal that all of the page has become dirty. | 377 | * is actually a signal that all of the page has become dirty. |
378 | * Wheras only part of our page may be dirty. | 378 | * Whereas only part of our page may be dirty. |
379 | */ | 379 | */ |
380 | __set_page_dirty_nobuffers(newpage); | 380 | __set_page_dirty_nobuffers(newpage); |
381 | } | 381 | } |
@@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
564 | * == 0 - success | 564 | * == 0 - success |
565 | */ | 565 | */ |
566 | static int move_to_new_page(struct page *newpage, struct page *page, | 566 | static int move_to_new_page(struct page *newpage, struct page *page, |
567 | int remap_swapcache) | 567 | int remap_swapcache, bool sync) |
568 | { | 568 | { |
569 | struct address_space *mapping; | 569 | struct address_space *mapping; |
570 | int rc; | 570 | int rc; |
@@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
586 | mapping = page_mapping(page); | 586 | mapping = page_mapping(page); |
587 | if (!mapping) | 587 | if (!mapping) |
588 | rc = migrate_page(mapping, newpage, page); | 588 | rc = migrate_page(mapping, newpage, page); |
589 | else if (mapping->a_ops->migratepage) | 589 | else { |
590 | /* | 590 | /* |
591 | * Most pages have a mapping and most filesystems | 591 | * Do not writeback pages if !sync and migratepage is |
592 | * should provide a migration function. Anonymous | 592 | * not pointing to migrate_page() which is nonblocking |
593 | * pages are part of swap space which also has its | 593 | * (swapcache/tmpfs uses migratepage = migrate_page). |
594 | * own migration function. This is the most common | ||
595 | * path for page migration. | ||
596 | */ | 594 | */ |
597 | rc = mapping->a_ops->migratepage(mapping, | 595 | if (PageDirty(page) && !sync && |
598 | newpage, page); | 596 | mapping->a_ops->migratepage != migrate_page) |
599 | else | 597 | rc = -EBUSY; |
600 | rc = fallback_migrate_page(mapping, newpage, page); | 598 | else if (mapping->a_ops->migratepage) |
599 | /* | ||
600 | * Most pages have a mapping and most filesystems | ||
601 | * should provide a migration function. Anonymous | ||
602 | * pages are part of swap space which also has its | ||
603 | * own migration function. This is the most common | ||
604 | * path for page migration. | ||
605 | */ | ||
606 | rc = mapping->a_ops->migratepage(mapping, | ||
607 | newpage, page); | ||
608 | else | ||
609 | rc = fallback_migrate_page(mapping, newpage, page); | ||
610 | } | ||
601 | 611 | ||
602 | if (rc) { | 612 | if (rc) { |
603 | newpage->mapping = NULL; | 613 | newpage->mapping = NULL; |
@@ -623,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
623 | struct page *newpage = get_new_page(page, private, &result); | 633 | struct page *newpage = get_new_page(page, private, &result); |
624 | int remap_swapcache = 1; | 634 | int remap_swapcache = 1; |
625 | int charge = 0; | 635 | int charge = 0; |
626 | struct mem_cgroup *mem = NULL; | 636 | struct mem_cgroup *mem; |
627 | struct anon_vma *anon_vma = NULL; | 637 | struct anon_vma *anon_vma = NULL; |
628 | 638 | ||
629 | if (!newpage) | 639 | if (!newpage) |
@@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
641 | rc = -EAGAIN; | 651 | rc = -EAGAIN; |
642 | 652 | ||
643 | if (!trylock_page(page)) { | 653 | if (!trylock_page(page)) { |
644 | if (!force) | 654 | if (!force || !sync) |
645 | goto move_newpage; | 655 | goto move_newpage; |
646 | 656 | ||
647 | /* | 657 | /* |
@@ -678,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
678 | } | 688 | } |
679 | 689 | ||
680 | /* charge against new page */ | 690 | /* charge against new page */ |
681 | charge = mem_cgroup_prepare_migration(page, newpage, &mem); | 691 | charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); |
682 | if (charge == -ENOMEM) { | 692 | if (charge == -ENOMEM) { |
683 | rc = -ENOMEM; | 693 | rc = -ENOMEM; |
684 | goto unlock; | 694 | goto unlock; |
@@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
686 | BUG_ON(charge); | 696 | BUG_ON(charge); |
687 | 697 | ||
688 | if (PageWriteback(page)) { | 698 | if (PageWriteback(page)) { |
689 | if (!force || !sync) | 699 | /* |
700 | * For !sync, there is no point retrying as the retry loop | ||
701 | * is expected to be too short for PageWriteback to be cleared | ||
702 | */ | ||
703 | if (!sync) { | ||
704 | rc = -EBUSY; | ||
705 | goto uncharge; | ||
706 | } | ||
707 | if (!force) | ||
690 | goto uncharge; | 708 | goto uncharge; |
691 | wait_on_page_writeback(page); | 709 | wait_on_page_writeback(page); |
692 | } | 710 | } |
@@ -757,14 +775,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
757 | 775 | ||
758 | skip_unmap: | 776 | skip_unmap: |
759 | if (!page_mapped(page)) | 777 | if (!page_mapped(page)) |
760 | rc = move_to_new_page(newpage, page, remap_swapcache); | 778 | rc = move_to_new_page(newpage, page, remap_swapcache, sync); |
761 | 779 | ||
762 | if (rc && remap_swapcache) | 780 | if (rc && remap_swapcache) |
763 | remove_migration_ptes(page, page); | 781 | remove_migration_ptes(page, page); |
764 | 782 | ||
765 | /* Drop an anon_vma reference if we took one */ | 783 | /* Drop an anon_vma reference if we took one */ |
766 | if (anon_vma) | 784 | if (anon_vma) |
767 | drop_anon_vma(anon_vma); | 785 | put_anon_vma(anon_vma); |
768 | 786 | ||
769 | uncharge: | 787 | uncharge: |
770 | if (!charge) | 788 | if (!charge) |
@@ -850,13 +868,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
850 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 868 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
851 | 869 | ||
852 | if (!page_mapped(hpage)) | 870 | if (!page_mapped(hpage)) |
853 | rc = move_to_new_page(new_hpage, hpage, 1); | 871 | rc = move_to_new_page(new_hpage, hpage, 1, sync); |
854 | 872 | ||
855 | if (rc) | 873 | if (rc) |
856 | remove_migration_ptes(hpage, hpage); | 874 | remove_migration_ptes(hpage, hpage); |
857 | 875 | ||
858 | if (anon_vma) | 876 | if (anon_vma) |
859 | drop_anon_vma(anon_vma); | 877 | put_anon_vma(anon_vma); |
860 | out: | 878 | out: |
861 | unlock_page(hpage); | 879 | unlock_page(hpage); |
862 | 880 | ||
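
The mm/migrate.c hunks above thread a sync flag down into move_to_new_page() so that asynchronous migration (as used by memory compaction) never blocks: a dirty page whose ->migratepage method would have to write it back is failed with -EBUSY, and pages under writeback are not waited for unless the caller asked for synchronous migration. A minimal user-space sketch of that decision, using made-up types rather than the kernel's:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the new !sync policy: names and fields are made up,
 * not the kernel's struct page or address_space_operations. */
struct fake_page {
    bool dirty;
    bool under_writeback;
    bool nonblocking_migratepage;   /* e.g. swapcache/tmpfs: migrate_page() */
};

static int try_migrate(const struct fake_page *p, bool sync)
{
    if (p->under_writeback && !sync)
        return -EBUSY;              /* don't wait for writeback in async mode */
    if (p->dirty && !sync && !p->nonblocking_migratepage)
        return -EBUSY;              /* migrating it would mean blocking writeout */
    return 0;                       /* safe to migrate */
}

int main(void)
{
    struct fake_page dirty_file = { .dirty = true };

    printf("async: %d\n", try_migrate(&dirty_file, false));   /* -EBUSY */
    printf("sync:  %d\n", try_migrate(&dirty_file, true));    /* 0      */
    return 0;
}

The point of the split is that compaction can skip pages it cannot move cheaply instead of stalling the allocator on filesystem writeout.
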
diff --git a/mm/mlock.c b/mm/mlock.c index c3924c7f00be..6b55e3efe0df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page) | |||
135 | } | 135 | } |
136 | } | 136 | } |
137 | 137 | ||
138 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | ||
139 | { | ||
140 | return (vma->vm_flags & VM_GROWSDOWN) && | ||
141 | (vma->vm_start == addr) && | ||
142 | !vma_stack_continue(vma->vm_prev, addr); | ||
143 | } | ||
144 | |||
145 | /** | 138 | /** |
146 | * __mlock_vma_pages_range() - mlock a range of pages in the vma. | 139 | * __mlock_vma_pages_range() - mlock a range of pages in the vma. |
147 | * @vma: target vma | 140 | * @vma: target vma |
@@ -188,12 +181,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
188 | if (vma->vm_flags & VM_LOCKED) | 181 | if (vma->vm_flags & VM_LOCKED) |
189 | gup_flags |= FOLL_MLOCK; | 182 | gup_flags |= FOLL_MLOCK; |
190 | 183 | ||
191 | /* We don't try to access the guard page of a stack vma */ | ||
192 | if (stack_guard_page(vma, start)) { | ||
193 | addr += PAGE_SIZE; | ||
194 | nr_pages--; | ||
195 | } | ||
196 | |||
197 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, | 184 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
198 | NULL, NULL, nonblocking); | 185 | NULL, NULL, nonblocking); |
199 | } | 186 | } |
@@ -237,7 +224,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
237 | 224 | ||
238 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 225 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || |
239 | is_vm_hugetlb_page(vma) || | 226 | is_vm_hugetlb_page(vma) || |
240 | vma == get_gate_vma(current))) { | 227 | vma == get_gate_vma(current->mm))) { |
241 | 228 | ||
242 | __mlock_vma_pages_range(vma, start, end, NULL); | 229 | __mlock_vma_pages_range(vma, start, end, NULL); |
243 | 230 | ||
@@ -332,7 +319,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
332 | int lock = newflags & VM_LOCKED; | 319 | int lock = newflags & VM_LOCKED; |
333 | 320 | ||
334 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || | 321 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
335 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) | 322 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) |
336 | goto out; /* don't set VM_LOCKED, don't count */ | 323 | goto out; /* don't set VM_LOCKED, don't count */ |
337 | 324 | ||
338 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 325 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -259,7 +259,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||

259 | * randomize_va_space to 2, which will still cause mm->start_brk | 259 | * randomize_va_space to 2, which will still cause mm->start_brk |
260 | * to be arbitrarily shifted | 260 | * to be arbitrarily shifted |
261 | */ | 261 | */ |
262 | if (mm->start_brk > PAGE_ALIGN(mm->end_data)) | 262 | if (current->brk_randomized) |
263 | min_brk = mm->start_brk; | 263 | min_brk = mm->start_brk; |
264 | else | 264 | else |
265 | min_brk = mm->end_data; | 265 | min_brk = mm->end_data; |
@@ -1814,11 +1814,14 @@ static int expand_downwards(struct vm_area_struct *vma, | |||
1814 | size = vma->vm_end - address; | 1814 | size = vma->vm_end - address; |
1815 | grow = (vma->vm_start - address) >> PAGE_SHIFT; | 1815 | grow = (vma->vm_start - address) >> PAGE_SHIFT; |
1816 | 1816 | ||
1817 | error = acct_stack_growth(vma, size, grow); | 1817 | error = -ENOMEM; |
1818 | if (!error) { | 1818 | if (grow <= vma->vm_pgoff) { |
1819 | vma->vm_start = address; | 1819 | error = acct_stack_growth(vma, size, grow); |
1820 | vma->vm_pgoff -= grow; | 1820 | if (!error) { |
1821 | perf_event_mmap(vma); | 1821 | vma->vm_start = address; |
1822 | vma->vm_pgoff -= grow; | ||
1823 | perf_event_mmap(vma); | ||
1824 | } | ||
1822 | } | 1825 | } |
1823 | } | 1826 | } |
1824 | vma_unlock_anon_vma(vma); | 1827 | vma_unlock_anon_vma(vma); |
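
Two mm/mmap.c changes are visible above: brk() consults a per-task brk_randomized flag instead of inferring randomization from start_brk versus end_data, and expand_downwards() now refuses stack growth that would underflow vma->vm_pgoff, returning -ENOMEM. A standalone sketch of just that guard, with illustrative names:

#include <errno.h>
#include <stdio.h>

/* Model of the expand_downwards() guard: growing a stack VMA downwards by
 * 'grow' pages must not take its page offset below zero. Illustrative only. */
static int check_stack_growth(unsigned long vm_pgoff, unsigned long grow)
{
    if (grow > vm_pgoff)
        return -ENOMEM;     /* vm_pgoff -= grow would wrap around */
    return 0;               /* safe: account the growth, then vm_pgoff -= grow */
}

int main(void)
{
    printf("%d\n", check_stack_growth(16, 4));  /* 0       */
    printf("%d\n", check_stack_growth(2, 4));   /* -ENOMEM */
    return 0;
}
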
diff --git a/mm/mremap.c b/mm/mremap.c index 1de98d492ddc..a7c1f9f9b941 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -277,9 +277,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
277 | if (old_len > vma->vm_end - addr) | 277 | if (old_len > vma->vm_end - addr) |
278 | goto Efault; | 278 | goto Efault; |
279 | 279 | ||
280 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | 280 | /* Need to be careful about a growing mapping */ |
281 | if (new_len > old_len) | 281 | if (new_len > old_len) { |
282 | unsigned long pgoff; | ||
283 | |||
284 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) | ||
282 | goto Efault; | 285 | goto Efault; |
286 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; | ||
287 | pgoff += vma->vm_pgoff; | ||
288 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) | ||
289 | goto Einval; | ||
283 | } | 290 | } |
284 | 291 | ||
285 | if (vma->vm_flags & VM_LOCKED) { | 292 | if (vma->vm_flags & VM_LOCKED) { |
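
vma_to_resize() now rejects VM_DONTEXPAND/VM_PFNMAP mappings only when the caller actually asks to grow them, and additionally rejects a growing mremap() whose new end would wrap the page offset. The wraparound test in isolation, as a hedged user-space sketch:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* Model of the mremap() pgoff wraparound check for growing mappings.
 * Parameter names are illustrative, not the kernel's vm_area_struct. */
static bool grow_would_wrap(unsigned long vm_pgoff, unsigned long addr,
                            unsigned long vm_start, unsigned long new_len)
{
    unsigned long pgoff = ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;

    /* overflow iff adding the new length in pages wraps past ULONG_MAX */
    return pgoff + (new_len >> PAGE_SHIFT) < pgoff;
}

int main(void)
{
    printf("%d\n", grow_would_wrap(0, 0x1000, 0x1000, 1UL << 20));        /* 0 */
    printf("%d\n", grow_would_wrap(~0UL - 1, 0x1000, 0x1000, 1UL << 20)); /* 1 */
    return 0;
}
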
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e2bdb07079ce..9109049f0bbc 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -32,14 +32,6 @@ unsigned long max_low_pfn; | |||
32 | unsigned long min_low_pfn; | 32 | unsigned long min_low_pfn; |
33 | unsigned long max_pfn; | 33 | unsigned long max_pfn; |
34 | 34 | ||
35 | #ifdef CONFIG_CRASH_DUMP | ||
36 | /* | ||
37 | * If we have booted due to a crash, max_pfn will be a very low value. We need | ||
38 | * to know the amount of memory that the previous kernel used. | ||
39 | */ | ||
40 | unsigned long saved_max_pfn; | ||
41 | #endif | ||
42 | |||
43 | static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | 35 | static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, |
44 | u64 goal, u64 limit) | 36 | u64 goal, u64 limit) |
45 | { | 37 | { |
@@ -158,7 +150,7 @@ unsigned long __init free_all_bootmem(void) | |||
158 | { | 150 | { |
159 | /* | 151 | /* |
160 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 152 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
161 | * because in some case like Node0 doesnt have RAM installed | 153 | * because in some case like Node0 doesn't have RAM installed |
162 | * low ram will be on Node1 | 154 | * low ram will be on Node1 |
163 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] | 155 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] |
164 | * will be used instead of only Node0 related | 156 | * will be used instead of only Node0 related |
diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3db..c4c542c736a9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
1842 | } | 1842 | } |
1843 | EXPORT_SYMBOL(remap_vmalloc_range); | 1843 | EXPORT_SYMBOL(remap_vmalloc_range); |
1844 | 1844 | ||
1845 | void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | ||
1846 | { | ||
1847 | } | ||
1848 | |||
1849 | unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, | 1845 | unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, |
1850 | unsigned long len, unsigned long pgoff, unsigned long flags) | 1846 | unsigned long len, unsigned long pgoff, unsigned long flags) |
1851 | { | 1847 | { |
@@ -1963,7 +1959,7 @@ error: | |||
1963 | return -ENOMEM; | 1959 | return -ENOMEM; |
1964 | } | 1960 | } |
1965 | 1961 | ||
1966 | int in_gate_area_no_task(unsigned long addr) | 1962 | int in_gate_area_no_mm(unsigned long addr) |
1967 | { | 1963 | { |
1968 | return 0; | 1964 | return 0; |
1969 | } | 1965 | } |
@@ -1975,21 +1971,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1975 | } | 1971 | } |
1976 | EXPORT_SYMBOL(filemap_fault); | 1972 | EXPORT_SYMBOL(filemap_fault); |
1977 | 1973 | ||
1978 | /* | 1974 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1979 | * Access another process' address space. | 1975 | unsigned long addr, void *buf, int len, int write) |
1980 | * - source/target buffer must be kernel space | ||
1981 | */ | ||
1982 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
1983 | { | 1976 | { |
1984 | struct vm_area_struct *vma; | 1977 | struct vm_area_struct *vma; |
1985 | struct mm_struct *mm; | ||
1986 | |||
1987 | if (addr + len < addr) | ||
1988 | return 0; | ||
1989 | |||
1990 | mm = get_task_mm(tsk); | ||
1991 | if (!mm) | ||
1992 | return 0; | ||
1993 | 1978 | ||
1994 | down_read(&mm->mmap_sem); | 1979 | down_read(&mm->mmap_sem); |
1995 | 1980 | ||
@@ -2014,6 +1999,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
2014 | } | 1999 | } |
2015 | 2000 | ||
2016 | up_read(&mm->mmap_sem); | 2001 | up_read(&mm->mmap_sem); |
2002 | |||
2003 | return len; | ||
2004 | } | ||
2005 | |||
2006 | /** | ||
2007 | * @access_remote_vm - access another process' address space | ||
2008 | * @mm: the mm_struct of the target address space | ||
2009 | * @addr: start address to access | ||
2010 | * @buf: source or destination buffer | ||
2011 | * @len: number of bytes to transfer | ||
2012 | * @write: whether the access is a write | ||
2013 | * | ||
2014 | * The caller must hold a reference on @mm. | ||
2015 | */ | ||
2016 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, | ||
2017 | void *buf, int len, int write) | ||
2018 | { | ||
2019 | return __access_remote_vm(NULL, mm, addr, buf, len, write); | ||
2020 | } | ||
2021 | |||
2022 | /* | ||
2023 | * Access another process' address space. | ||
2024 | * - source/target buffer must be kernel space | ||
2025 | */ | ||
2026 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
2027 | { | ||
2028 | struct mm_struct *mm; | ||
2029 | |||
2030 | if (addr + len < addr) | ||
2031 | return 0; | ||
2032 | |||
2033 | mm = get_task_mm(tsk); | ||
2034 | if (!mm) | ||
2035 | return 0; | ||
2036 | |||
2037 | len = __access_remote_vm(tsk, mm, addr, buf, len, write); | ||
2038 | |||
2017 | mmput(mm); | 2039 | mmput(mm); |
2018 | return len; | 2040 | return len; |
2019 | } | 2041 | } |
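
The mm/nommu.c hunk mirrors the corresponding mm/memory.c refactor: the body of access_process_vm() moves into __access_remote_vm(), and the new access_remote_vm() exposes it to callers that already hold an mm reference and have no task_struct at hand. The shape of that split, modelled with stand-in types rather than the kernel's:

#include <stdio.h>
#include <string.h>

/* Illustrative model of the access_process_vm()/access_remote_vm() split:
 * one core helper that works on an address-space handle plus thin wrappers.
 * The structs below are stand-ins, not kernel types; locking is elided. */
struct mm   { char mem[64]; };
struct task { struct mm *mm; };

static int __access_remote(struct mm *mm, unsigned long addr,
                           void *buf, int len, int write)
{
    if (write)
        memcpy(mm->mem + addr, buf, len);
    else
        memcpy(buf, mm->mem + addr, len);
    return len;
}

/* New-style entry point: caller already holds a reference on mm. */
static int access_remote(struct mm *mm, unsigned long addr,
                         void *buf, int len, int write)
{
    return __access_remote(mm, addr, buf, len, write);
}

/* Old-style entry point: resolve the task's mm, then reuse the core. */
static int access_process(struct task *tsk, unsigned long addr,
                          void *buf, int len, int write)
{
    return tsk->mm ? __access_remote(tsk->mm, addr, buf, len, write) : 0;
}

int main(void)
{
    struct mm m = { "hello" };
    struct task t = { &m };
    char out[8] = "";

    access_remote(&m, 0, out, 6, 0);
    printf("%s %d\n", out, access_process(&t, 0, out, 6, 0));
    return 0;
}
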
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7c..83fb72c108b7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/mempolicy.h> | 32 | #include <linux/mempolicy.h> |
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/ptrace.h> | ||
34 | 35 | ||
35 | int sysctl_panic_on_oom; | 36 | int sysctl_panic_on_oom; |
36 | int sysctl_oom_kill_allocating_task; | 37 | int sysctl_oom_kill_allocating_task; |
@@ -83,24 +84,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
83 | #endif /* CONFIG_NUMA */ | 84 | #endif /* CONFIG_NUMA */ |
84 | 85 | ||
85 | /* | 86 | /* |
86 | * If this is a system OOM (not a memcg OOM) and the task selected to be | ||
87 | * killed is not already running at high (RT) priorities, speed up the | ||
88 | * recovery by boosting the dying task to the lowest FIFO priority. | ||
89 | * That helps with the recovery and avoids interfering with RT tasks. | ||
90 | */ | ||
91 | static void boost_dying_task_prio(struct task_struct *p, | ||
92 | struct mem_cgroup *mem) | ||
93 | { | ||
94 | struct sched_param param = { .sched_priority = 1 }; | ||
95 | |||
96 | if (mem) | ||
97 | return; | ||
98 | |||
99 | if (!rt_task(p)) | ||
100 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * The process p may have detached its own ->mm while exiting or through | 87 | * The process p may have detached its own ->mm while exiting or through |
105 | * use_mm(), but one or more of its subthreads may still have a valid | 88 | * use_mm(), but one or more of its subthreads may still have a valid |
106 | * pointer. Return p, or any of its subthreads with a valid ->mm, with | 89 | * pointer. Return p, or any of its subthreads with a valid ->mm, with |
@@ -292,13 +275,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
292 | unsigned long totalpages, struct mem_cgroup *mem, | 275 | unsigned long totalpages, struct mem_cgroup *mem, |
293 | const nodemask_t *nodemask) | 276 | const nodemask_t *nodemask) |
294 | { | 277 | { |
295 | struct task_struct *p; | 278 | struct task_struct *g, *p; |
296 | struct task_struct *chosen = NULL; | 279 | struct task_struct *chosen = NULL; |
297 | *ppoints = 0; | 280 | *ppoints = 0; |
298 | 281 | ||
299 | for_each_process(p) { | 282 | do_each_thread(g, p) { |
300 | unsigned int points; | 283 | unsigned int points; |
301 | 284 | ||
285 | if (!p->mm) | ||
286 | continue; | ||
302 | if (oom_unkillable_task(p, mem, nodemask)) | 287 | if (oom_unkillable_task(p, mem, nodemask)) |
303 | continue; | 288 | continue; |
304 | 289 | ||
@@ -314,22 +299,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
314 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 299 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
315 | return ERR_PTR(-1UL); | 300 | return ERR_PTR(-1UL); |
316 | 301 | ||
317 | /* | 302 | if (p->flags & PF_EXITING) { |
318 | * This is in the process of releasing memory so wait for it | 303 | /* |
319 | * to finish before killing some other task by mistake. | 304 | * If p is the current task and is in the process of |
320 | * | 305 | * releasing memory, we allow the "kill" to set |
321 | * However, if p is the current task, we allow the 'kill' to | 306 | * TIF_MEMDIE, which will allow it to gain access to |
322 | * go ahead if it is exiting: this will simply set TIF_MEMDIE, | 307 | * memory reserves. Otherwise, it may stall forever. |
323 | * which will allow it to gain access to memory reserves in | 308 | * |
324 | * the process of exiting and releasing its resources. | 309 | * The loop isn't broken here, however, in case other |
325 | * Otherwise we could get an easy OOM deadlock. | 310 | * threads are found to have already been oom killed. |
326 | */ | 311 | */ |
327 | if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { | 312 | if (p == current) { |
328 | if (p != current) | 313 | chosen = p; |
329 | return ERR_PTR(-1UL); | 314 | *ppoints = 1000; |
330 | 315 | } else { | |
331 | chosen = p; | 316 | /* |
332 | *ppoints = 1000; | 317 | * If this task is not being ptraced on exit, |
318 | * then wait for it to finish before killing | ||
319 | * some other task unnecessarily. | ||
320 | */ | ||
321 | if (!(task_ptrace(p->group_leader) & | ||
322 | PT_TRACE_EXIT)) | ||
323 | return ERR_PTR(-1UL); | ||
324 | } | ||
333 | } | 325 | } |
334 | 326 | ||
335 | points = oom_badness(p, mem, nodemask, totalpages); | 327 | points = oom_badness(p, mem, nodemask, totalpages); |
@@ -337,7 +329,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
337 | chosen = p; | 329 | chosen = p; |
338 | *ppoints = points; | 330 | *ppoints = points; |
339 | } | 331 | } |
340 | } | 332 | } while_each_thread(g, p); |
341 | 333 | ||
342 | return chosen; | 334 | return chosen; |
343 | } | 335 | } |
@@ -396,7 +388,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
396 | task_unlock(current); | 388 | task_unlock(current); |
397 | dump_stack(); | 389 | dump_stack(); |
398 | mem_cgroup_print_oom_info(mem, p); | 390 | mem_cgroup_print_oom_info(mem, p); |
399 | show_mem(); | 391 | show_mem(SHOW_MEM_FILTER_NODES); |
400 | if (sysctl_oom_dump_tasks) | 392 | if (sysctl_oom_dump_tasks) |
401 | dump_tasks(mem, nodemask); | 393 | dump_tasks(mem, nodemask); |
402 | } | 394 | } |
@@ -442,13 +434,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
442 | set_tsk_thread_flag(p, TIF_MEMDIE); | 434 | set_tsk_thread_flag(p, TIF_MEMDIE); |
443 | force_sig(SIGKILL, p); | 435 | force_sig(SIGKILL, p); |
444 | 436 | ||
445 | /* | ||
446 | * We give our sacrificial lamb high priority and access to | ||
447 | * all the memory it needs. That way it should be able to | ||
448 | * exit() and clear out its resources quickly... | ||
449 | */ | ||
450 | boost_dying_task_prio(p, mem); | ||
451 | |||
452 | return 0; | 437 | return 0; |
453 | } | 438 | } |
454 | #undef K | 439 | #undef K |
@@ -472,7 +457,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
472 | */ | 457 | */ |
473 | if (p->flags & PF_EXITING) { | 458 | if (p->flags & PF_EXITING) { |
474 | set_tsk_thread_flag(p, TIF_MEMDIE); | 459 | set_tsk_thread_flag(p, TIF_MEMDIE); |
475 | boost_dying_task_prio(p, mem); | ||
476 | return 0; | 460 | return 0; |
477 | } | 461 | } |
478 | 462 | ||
@@ -491,6 +475,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
491 | list_for_each_entry(child, &t->children, sibling) { | 475 | list_for_each_entry(child, &t->children, sibling) { |
492 | unsigned int child_points; | 476 | unsigned int child_points; |
493 | 477 | ||
478 | if (child->mm == p->mm) | ||
479 | continue; | ||
494 | /* | 480 | /* |
495 | * oom_badness() returns 0 if the thread is unkillable | 481 | * oom_badness() returns 0 if the thread is unkillable |
496 | */ | 482 | */ |
@@ -537,6 +523,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
537 | unsigned int points = 0; | 523 | unsigned int points = 0; |
538 | struct task_struct *p; | 524 | struct task_struct *p; |
539 | 525 | ||
526 | /* | ||
527 | * If current has a pending SIGKILL, then automatically select it. The | ||
528 | * goal is to allow it to allocate so that it may quickly exit and free | ||
529 | * its memory. | ||
530 | */ | ||
531 | if (fatal_signal_pending(current)) { | ||
532 | set_thread_flag(TIF_MEMDIE); | ||
533 | return; | ||
534 | } | ||
535 | |||
540 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 536 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); |
541 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; | 537 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; |
542 | read_lock(&tasklist_lock); | 538 | read_lock(&tasklist_lock); |
@@ -689,7 +685,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
689 | */ | 685 | */ |
690 | if (fatal_signal_pending(current)) { | 686 | if (fatal_signal_pending(current)) { |
691 | set_thread_flag(TIF_MEMDIE); | 687 | set_thread_flag(TIF_MEMDIE); |
692 | boost_dying_task_prio(current, NULL); | ||
693 | return; | 688 | return; |
694 | } | 689 | } |
695 | 690 | ||
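
Summarizing the mm/oom_kill.c hunks: the SCHED_FIFO boost for the chosen victim is dropped, victim selection walks every thread but skips those without an mm, an exiting current is preferred while another exiting task defers the kill unless it is being ptraced on exit, children sharing the parent's mm are skipped, and a memcg OOM short-circuits when current already has a fatal signal pending. The new selection rules reduced to control flow, with illustrative fields rather than the kernel's oom_badness():

#include <stdbool.h>
#include <stdio.h>

struct cand {
    const char   *name;
    bool          has_mm;
    bool          unkillable;
    bool          exiting;
    bool          is_current;
    bool          ptraced_on_exit;
    unsigned int  points;
};

static const struct cand *pick_victim(const struct cand *c, int n)
{
    const struct cand *chosen = NULL;
    unsigned int best = 0;

    for (int i = 0; i < n; i++, c++) {
        if (!c->has_mm || c->unkillable)
            continue;
        if (c->exiting) {
            if (c->is_current) {
                chosen = c;          /* let it exit and free its memory */
                best = 1000;
            } else if (!c->ptraced_on_exit) {
                return NULL;         /* wait for the exiting task instead */
            }
        }
        if (c->points > best) {
            best = c->points;
            chosen = c;
        }
    }
    return chosen;
}

int main(void)
{
    struct cand tasks[] = {
        { "kthread", false, false, false, false, false, 0   },
        { "editor",  true,  false, false, false, false, 40  },
        { "hog",     true,  false, false, false, false, 900 },
    };
    const struct cand *v = pick_victim(tasks, 3);

    printf("victim: %s\n", v ? v->name : "none");
    return 0;
}
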
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2cb01f6ec5d0..31f698862420 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -927,7 +927,7 @@ retry: | |||
927 | break; | 927 | break; |
928 | } | 928 | } |
929 | 929 | ||
930 | done_index = page->index + 1; | 930 | done_index = page->index; |
931 | 931 | ||
932 | lock_page(page); | 932 | lock_page(page); |
933 | 933 | ||
@@ -977,6 +977,7 @@ continue_unlock: | |||
977 | * not be suitable for data integrity | 977 | * not be suitable for data integrity |
978 | * writeout). | 978 | * writeout). |
979 | */ | 979 | */ |
980 | done_index = page->index + 1; | ||
980 | done = 1; | 981 | done = 1; |
981 | break; | 982 | break; |
982 | } | 983 | } |
@@ -1039,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc, | |||
1039 | int generic_writepages(struct address_space *mapping, | 1040 | int generic_writepages(struct address_space *mapping, |
1040 | struct writeback_control *wbc) | 1041 | struct writeback_control *wbc) |
1041 | { | 1042 | { |
1043 | struct blk_plug plug; | ||
1044 | int ret; | ||
1045 | |||
1042 | /* deal with chardevs and other special file */ | 1046 | /* deal with chardevs and other special file */ |
1043 | if (!mapping->a_ops->writepage) | 1047 | if (!mapping->a_ops->writepage) |
1044 | return 0; | 1048 | return 0; |
1045 | 1049 | ||
1046 | return write_cache_pages(mapping, wbc, __writepage, mapping); | 1050 | blk_start_plug(&plug); |
1051 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | ||
1052 | blk_finish_plug(&plug); | ||
1053 | return ret; | ||
1047 | } | 1054 | } |
1048 | 1055 | ||
1049 | EXPORT_SYMBOL(generic_writepages); | 1056 | EXPORT_SYMBOL(generic_writepages); |
@@ -1211,6 +1218,17 @@ int set_page_dirty(struct page *page) | |||
1211 | 1218 | ||
1212 | if (likely(mapping)) { | 1219 | if (likely(mapping)) { |
1213 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; | 1220 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; |
1221 | /* | ||
1222 | * readahead/lru_deactivate_page could remain | ||
1223 | * PG_readahead/PG_reclaim due to race with end_page_writeback | ||
1224 | * About readahead, if the page is written, the flags would be | ||
1225 | * reset. So no problem. | ||
1226 | * About lru_deactivate_page, if the page is redirty, the flag | ||
1227 | * will be reset. So no problem. but if the page is used by readahead | ||
1228 | * it will confuse readahead and make it restart the size rampup | ||
1229 | * process. But it's a trivial problem. | ||
1230 | */ | ||
1231 | ClearPageReclaim(page); | ||
1214 | #ifdef CONFIG_BLOCK | 1232 | #ifdef CONFIG_BLOCK |
1215 | if (!spd) | 1233 | if (!spd) |
1216 | spd = __set_page_dirty_buffers; | 1234 | spd = __set_page_dirty_buffers; |
@@ -1239,7 +1257,7 @@ int set_page_dirty_lock(struct page *page) | |||
1239 | { | 1257 | { |
1240 | int ret; | 1258 | int ret; |
1241 | 1259 | ||
1242 | lock_page_nosync(page); | 1260 | lock_page(page); |
1243 | ret = set_page_dirty(page); | 1261 | ret = set_page_dirty(page); |
1244 | unlock_page(page); | 1262 | unlock_page(page); |
1245 | return ret; | 1263 | return ret; |
@@ -1266,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page) | |||
1266 | 1284 | ||
1267 | BUG_ON(!PageLocked(page)); | 1285 | BUG_ON(!PageLocked(page)); |
1268 | 1286 | ||
1269 | ClearPageReclaim(page); | ||
1270 | if (mapping && mapping_cap_account_dirty(mapping)) { | 1287 | if (mapping && mapping_cap_account_dirty(mapping)) { |
1271 | /* | 1288 | /* |
1272 | * Yes, Virginia, this is indeed insane. | 1289 | * Yes, Virginia, this is indeed insane. |
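
The mm/page-writeback.c changes adapt to the reworked block-layer plugging: generic_writepages() brackets write_cache_pages() with blk_start_plug()/blk_finish_plug() so queued requests are dispatched in one batch, done_index only advances past pages that were actually handled, and set_page_dirty() now clears PG_reclaim instead of clear_page_dirty_for_io(). A user-space analogue of the plugging pattern, with made-up helper names:

#include <stdio.h>

/* Analogue of blk_start_plug()/blk_finish_plug(): collect I/O submitted
 * inside the plugged region and issue it as one batch when the plug is
 * finished. Illustrative only; not the block-layer API. */
struct plug {
    int queued[16];
    int n;
};

static void start_plug(struct plug *p) { p->n = 0; }

static void submit(struct plug *p, int blocknr)
{
    if (p->n < 16)
        p->queued[p->n++] = blocknr;
}

static void finish_plug(struct plug *p)
{
    printf("dispatching %d requests:", p->n);
    for (int i = 0; i < p->n; i++)
        printf(" %d", p->queued[i]);
    printf("\n");
}

int main(void)
{
    struct plug plug;

    start_plug(&plug);
    for (int block = 0; block < 5; block++)   /* stands in for write_cache_pages() */
        submit(&plug, block);
    finish_plug(&plug);                       /* one batched dispatch */
    return 0;
}

Batching at this level replaces the old per-request unplugging and lets the block layer merge adjacent requests before dispatch.
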
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48c9737ad49a..df9fc3385fb2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/compaction.h> | 53 | #include <linux/compaction.h> |
54 | #include <trace/events/kmem.h> | 54 | #include <trace/events/kmem.h> |
55 | #include <linux/ftrace_event.h> | 55 | #include <linux/ftrace_event.h> |
56 | #include <linux/memcontrol.h> | ||
56 | 57 | ||
57 | #include <asm/tlbflush.h> | 58 | #include <asm/tlbflush.h> |
58 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
@@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page) | |||
565 | if (unlikely(page_mapcount(page) | | 566 | if (unlikely(page_mapcount(page) | |
566 | (page->mapping != NULL) | | 567 | (page->mapping != NULL) | |
567 | (atomic_read(&page->_count) != 0) | | 568 | (atomic_read(&page->_count) != 0) | |
568 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 569 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | |
570 | (mem_cgroup_bad_page_check(page)))) { | ||
569 | bad_page(page); | 571 | bad_page(page); |
570 | return 1; | 572 | return 1; |
571 | } | 573 | } |
@@ -614,6 +616,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
614 | list = &pcp->lists[migratetype]; | 616 | list = &pcp->lists[migratetype]; |
615 | } while (list_empty(list)); | 617 | } while (list_empty(list)); |
616 | 618 | ||
619 | /* This is the only non-empty list. Free them all. */ | ||
620 | if (batch_free == MIGRATE_PCPTYPES) | ||
621 | batch_free = to_free; | ||
622 | |||
617 | do { | 623 | do { |
618 | page = list_entry(list->prev, struct page, lru); | 624 | page = list_entry(list->prev, struct page, lru); |
619 | /* must delete as __free_one_page list manipulates */ | 625 | /* must delete as __free_one_page list manipulates */ |
@@ -750,7 +756,8 @@ static inline int check_new_page(struct page *page) | |||
750 | if (unlikely(page_mapcount(page) | | 756 | if (unlikely(page_mapcount(page) | |
751 | (page->mapping != NULL) | | 757 | (page->mapping != NULL) | |
752 | (atomic_read(&page->_count) != 0) | | 758 | (atomic_read(&page->_count) != 0) | |
753 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 759 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | |
760 | (mem_cgroup_bad_page_check(page)))) { | ||
754 | bad_page(page); | 761 | bad_page(page); |
755 | return 1; | 762 | return 1; |
756 | } | 763 | } |
@@ -863,9 +870,8 @@ static int move_freepages(struct zone *zone, | |||
863 | } | 870 | } |
864 | 871 | ||
865 | order = page_order(page); | 872 | order = page_order(page); |
866 | list_del(&page->lru); | 873 | list_move(&page->lru, |
867 | list_add(&page->lru, | 874 | &zone->free_area[order].free_list[migratetype]); |
868 | &zone->free_area[order].free_list[migratetype]); | ||
869 | page += 1 << order; | 875 | page += 1 << order; |
870 | pages_moved += 1 << order; | 876 | pages_moved += 1 << order; |
871 | } | 877 | } |
@@ -936,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
936 | * If breaking a large block of pages, move all free | 942 | * If breaking a large block of pages, move all free |
937 | * pages to the preferred allocation list. If falling | 943 | * pages to the preferred allocation list. If falling |
938 | * back for a reclaimable kernel allocation, be more | 944 | * back for a reclaimable kernel allocation, be more |
939 | * agressive about taking ownership of free pages | 945 | * aggressive about taking ownership of free pages |
940 | */ | 946 | */ |
941 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 947 | if (unlikely(current_order >= (pageblock_order >> 1)) || |
942 | start_migratetype == MIGRATE_RECLAIMABLE || | 948 | start_migratetype == MIGRATE_RECLAIMABLE || |
@@ -1333,7 +1339,7 @@ again: | |||
1333 | } | 1339 | } |
1334 | 1340 | ||
1335 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1341 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1336 | zone_statistics(preferred_zone, zone); | 1342 | zone_statistics(preferred_zone, zone, gfp_flags); |
1337 | local_irq_restore(flags); | 1343 | local_irq_restore(flags); |
1338 | 1344 | ||
1339 | VM_BUG_ON(bad_range(zone, page)); | 1345 | VM_BUG_ON(bad_range(zone, page)); |
@@ -1714,6 +1720,20 @@ try_next_zone: | |||
1714 | return page; | 1720 | return page; |
1715 | } | 1721 | } |
1716 | 1722 | ||
1723 | /* | ||
1724 | * Large machines with many possible nodes should not always dump per-node | ||
1725 | * meminfo in irq context. | ||
1726 | */ | ||
1727 | static inline bool should_suppress_show_mem(void) | ||
1728 | { | ||
1729 | bool ret = false; | ||
1730 | |||
1731 | #if NODES_SHIFT > 8 | ||
1732 | ret = in_interrupt(); | ||
1733 | #endif | ||
1734 | return ret; | ||
1735 | } | ||
1736 | |||
1717 | static inline int | 1737 | static inline int |
1718 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1738 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1719 | unsigned long pages_reclaimed) | 1739 | unsigned long pages_reclaimed) |
@@ -2085,7 +2105,7 @@ rebalance: | |||
2085 | sync_migration); | 2105 | sync_migration); |
2086 | if (page) | 2106 | if (page) |
2087 | goto got_pg; | 2107 | goto got_pg; |
2088 | sync_migration = true; | 2108 | sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); |
2089 | 2109 | ||
2090 | /* Try direct reclaim and then allocating */ | 2110 | /* Try direct reclaim and then allocating */ |
2091 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2111 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
@@ -2157,11 +2177,25 @@ rebalance: | |||
2157 | 2177 | ||
2158 | nopage: | 2178 | nopage: |
2159 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 2179 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
2160 | printk(KERN_WARNING "%s: page allocation failure." | 2180 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
2161 | " order:%d, mode:0x%x\n", | 2181 | |
2182 | /* | ||
2183 | * This documents exceptions given to allocations in certain | ||
2184 | * contexts that are allowed to allocate outside current's set | ||
2185 | * of allowed nodes. | ||
2186 | */ | ||
2187 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
2188 | if (test_thread_flag(TIF_MEMDIE) || | ||
2189 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | ||
2190 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
2191 | if (in_interrupt() || !wait) | ||
2192 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
2193 | |||
2194 | pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n", | ||
2162 | current->comm, order, gfp_mask); | 2195 | current->comm, order, gfp_mask); |
2163 | dump_stack(); | 2196 | dump_stack(); |
2164 | show_mem(); | 2197 | if (!should_suppress_show_mem()) |
2198 | show_mem(filter); | ||
2165 | } | 2199 | } |
2166 | return page; | 2200 | return page; |
2167 | got_pg: | 2201 | got_pg: |
@@ -2411,19 +2445,42 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2411 | } | 2445 | } |
2412 | #endif | 2446 | #endif |
2413 | 2447 | ||
2448 | /* | ||
2449 | * Determine whether the zone's node should be displayed or not, depending on | ||
2450 | * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). | ||
2451 | */ | ||
2452 | static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) | ||
2453 | { | ||
2454 | bool ret = false; | ||
2455 | |||
2456 | if (!(flags & SHOW_MEM_FILTER_NODES)) | ||
2457 | goto out; | ||
2458 | |||
2459 | get_mems_allowed(); | ||
2460 | ret = !node_isset(zone->zone_pgdat->node_id, | ||
2461 | cpuset_current_mems_allowed); | ||
2462 | put_mems_allowed(); | ||
2463 | out: | ||
2464 | return ret; | ||
2465 | } | ||
2466 | |||
2414 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2467 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
2415 | 2468 | ||
2416 | /* | 2469 | /* |
2417 | * Show free area list (used inside shift_scroll-lock stuff) | 2470 | * Show free area list (used inside shift_scroll-lock stuff) |
2418 | * We also calculate the percentage fragmentation. We do this by counting the | 2471 | * We also calculate the percentage fragmentation. We do this by counting the |
2419 | * memory on each free list with the exception of the first item on the list. | 2472 | * memory on each free list with the exception of the first item on the list. |
2473 | * Suppresses nodes that are not allowed by current's cpuset if | ||
2474 | * SHOW_MEM_FILTER_NODES is passed. | ||
2420 | */ | 2475 | */ |
2421 | void show_free_areas(void) | 2476 | void __show_free_areas(unsigned int filter) |
2422 | { | 2477 | { |
2423 | int cpu; | 2478 | int cpu; |
2424 | struct zone *zone; | 2479 | struct zone *zone; |
2425 | 2480 | ||
2426 | for_each_populated_zone(zone) { | 2481 | for_each_populated_zone(zone) { |
2482 | if (skip_free_areas_zone(filter, zone)) | ||
2483 | continue; | ||
2427 | show_node(zone); | 2484 | show_node(zone); |
2428 | printk("%s per-cpu:\n", zone->name); | 2485 | printk("%s per-cpu:\n", zone->name); |
2429 | 2486 | ||
@@ -2465,6 +2522,8 @@ void show_free_areas(void) | |||
2465 | for_each_populated_zone(zone) { | 2522 | for_each_populated_zone(zone) { |
2466 | int i; | 2523 | int i; |
2467 | 2524 | ||
2525 | if (skip_free_areas_zone(filter, zone)) | ||
2526 | continue; | ||
2468 | show_node(zone); | 2527 | show_node(zone); |
2469 | printk("%s" | 2528 | printk("%s" |
2470 | " free:%lukB" | 2529 | " free:%lukB" |
@@ -2532,6 +2591,8 @@ void show_free_areas(void) | |||
2532 | for_each_populated_zone(zone) { | 2591 | for_each_populated_zone(zone) { |
2533 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 2592 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
2534 | 2593 | ||
2594 | if (skip_free_areas_zone(filter, zone)) | ||
2595 | continue; | ||
2535 | show_node(zone); | 2596 | show_node(zone); |
2536 | printk("%s: ", zone->name); | 2597 | printk("%s: ", zone->name); |
2537 | 2598 | ||
@@ -2551,6 +2612,11 @@ void show_free_areas(void) | |||
2551 | show_swap_cache_info(); | 2612 | show_swap_cache_info(); |
2552 | } | 2613 | } |
2553 | 2614 | ||
2615 | void show_free_areas(void) | ||
2616 | { | ||
2617 | __show_free_areas(0); | ||
2618 | } | ||
2619 | |||
2554 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | 2620 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) |
2555 | { | 2621 | { |
2556 | zoneref->zone = zone; | 2622 | zoneref->zone = zone; |
@@ -3110,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3110 | * Called with zonelists_mutex held always | 3176 | * Called with zonelists_mutex held always |
3111 | * unless system_state == SYSTEM_BOOTING. | 3177 | * unless system_state == SYSTEM_BOOTING. |
3112 | */ | 3178 | */ |
3113 | void build_all_zonelists(void *data) | 3179 | void __ref build_all_zonelists(void *data) |
3114 | { | 3180 | { |
3115 | set_zonelist_order(); | 3181 | set_zonelist_order(); |
3116 | 3182 | ||
@@ -3860,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void) | |||
3860 | 3926 | ||
3861 | /* | 3927 | /* |
3862 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE | 3928 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE |
3863 | * because it is sized independant of architecture. Unlike the other zones, | 3929 | * because it is sized independent of architecture. Unlike the other zones, |
3864 | * the starting point for ZONE_MOVABLE is not fixed. It may be different | 3930 | * the starting point for ZONE_MOVABLE is not fixed. It may be different |
3865 | * in each node depending on the size of each node and how evenly kernelcore | 3931 | * in each node depending on the size of each node and how evenly kernelcore |
3866 | * is distributed. This helper function adjusts the zone ranges | 3932 | * is distributed. This helper function adjusts the zone ranges |
@@ -5621,4 +5687,5 @@ void dump_page(struct page *page) | |||
5621 | page, atomic_read(&page->_count), page_mapcount(page), | 5687 | page, atomic_read(&page->_count), page_mapcount(page), |
5622 | page->mapping, page->index); | 5688 | page->mapping, page->index); |
5623 | dump_page_flags(page->flags); | 5689 | dump_page_flags(page->flags); |
5690 | mem_cgroup_print_bad_page(page); | ||
5624 | } | 5691 | } |
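
In mm/page_alloc.c, free_pcppages_bulk() drains the last non-empty pcp list in one pass, the free/prep sanity checks also consult mem_cgroup_bad_page_check(), allocation-failure and OOM paths call show_mem(SHOW_MEM_FILTER_NODES) so per-node output can be restricted to the current cpuset (or suppressed in irq context on very large machines), and repeated compaction falls back to async mode for __GFP_NO_KSWAPD allocations. The node-filtering decision, modelled with a plain bitmask instead of the cpuset API:

#include <stdbool.h>
#include <stdio.h>

#define SHOW_MEM_FILTER_NODES (1u << 0)

/* Stand-in for skip_free_areas_zone(): a zone's node is skipped when node
 * filtering is requested and the node is not in the caller's allowed set. */
static bool skip_zone(unsigned int filter, int node, unsigned long allowed_mask)
{
    if (!(filter & SHOW_MEM_FILTER_NODES))
        return false;
    return !(allowed_mask & (1ul << node));
}

int main(void)
{
    unsigned long allowed = 1ul << 0;   /* only node 0 allowed */

    printf("node0 filtered:   %d\n", skip_zone(SHOW_MEM_FILTER_NODES, 0, allowed));
    printf("node1 filtered:   %d\n", skip_zone(SHOW_MEM_FILTER_NODES, 1, allowed));
    printf("node1 unfiltered: %d\n", skip_zone(0, 1, allowed));
    return 0;
}
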
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5bffada7cde1..99055010cece 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -11,12 +11,11 @@ | |||
11 | #include <linux/swapops.h> | 11 | #include <linux/swapops.h> |
12 | #include <linux/kmemleak.h> | 12 | #include <linux/kmemleak.h> |
13 | 13 | ||
14 | static void __meminit | 14 | static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) |
15 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | ||
16 | { | 15 | { |
17 | pc->flags = 0; | 16 | pc->flags = 0; |
17 | set_page_cgroup_array_id(pc, id); | ||
18 | pc->mem_cgroup = NULL; | 18 | pc->mem_cgroup = NULL; |
19 | pc->page = pfn_to_page(pfn); | ||
20 | INIT_LIST_HEAD(&pc->lru); | 19 | INIT_LIST_HEAD(&pc->lru); |
21 | } | 20 | } |
22 | static unsigned long total_usage; | 21 | static unsigned long total_usage; |
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
43 | return base + offset; | 42 | return base + offset; |
44 | } | 43 | } |
45 | 44 | ||
45 | struct page *lookup_cgroup_page(struct page_cgroup *pc) | ||
46 | { | ||
47 | unsigned long pfn; | ||
48 | struct page *page; | ||
49 | pg_data_t *pgdat; | ||
50 | |||
51 | pgdat = NODE_DATA(page_cgroup_array_id(pc)); | ||
52 | pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; | ||
53 | page = pfn_to_page(pfn); | ||
54 | VM_BUG_ON(pc != lookup_page_cgroup(page)); | ||
55 | return page; | ||
56 | } | ||
57 | |||
46 | static int __init alloc_node_page_cgroup(int nid) | 58 | static int __init alloc_node_page_cgroup(int nid) |
47 | { | 59 | { |
48 | struct page_cgroup *base, *pc; | 60 | struct page_cgroup *base, *pc; |
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) | |||
63 | return -ENOMEM; | 75 | return -ENOMEM; |
64 | for (index = 0; index < nr_pages; index++) { | 76 | for (index = 0; index < nr_pages; index++) { |
65 | pc = base + index; | 77 | pc = base + index; |
66 | __init_page_cgroup(pc, start_pfn + index); | 78 | init_page_cgroup(pc, nid); |
67 | } | 79 | } |
68 | NODE_DATA(nid)->node_page_cgroup = base; | 80 | NODE_DATA(nid)->node_page_cgroup = base; |
69 | total_usage += table_size; | 81 | total_usage += table_size; |
@@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
105 | return section->page_cgroup + pfn; | 117 | return section->page_cgroup + pfn; |
106 | } | 118 | } |
107 | 119 | ||
108 | /* __alloc_bootmem...() is protected by !slab_available() */ | 120 | struct page *lookup_cgroup_page(struct page_cgroup *pc) |
121 | { | ||
122 | struct mem_section *section; | ||
123 | struct page *page; | ||
124 | unsigned long nr; | ||
125 | |||
126 | nr = page_cgroup_array_id(pc); | ||
127 | section = __nr_to_section(nr); | ||
128 | page = pfn_to_page(pc - section->page_cgroup); | ||
129 | VM_BUG_ON(pc != lookup_page_cgroup(page)); | ||
130 | return page; | ||
131 | } | ||
132 | |||
133 | static void *__init_refok alloc_page_cgroup(size_t size, int nid) | ||
134 | { | ||
135 | void *addr = NULL; | ||
136 | |||
137 | addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); | ||
138 | if (addr) | ||
139 | return addr; | ||
140 | |||
141 | if (node_state(nid, N_HIGH_MEMORY)) | ||
142 | addr = vmalloc_node(size, nid); | ||
143 | else | ||
144 | addr = vmalloc(size); | ||
145 | |||
146 | return addr; | ||
147 | } | ||
148 | |||
149 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
150 | static void free_page_cgroup(void *addr) | ||
151 | { | ||
152 | if (is_vmalloc_addr(addr)) { | ||
153 | vfree(addr); | ||
154 | } else { | ||
155 | struct page *page = virt_to_page(addr); | ||
156 | size_t table_size = | ||
157 | sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
158 | |||
159 | BUG_ON(PageReserved(page)); | ||
160 | free_pages_exact(addr, table_size); | ||
161 | } | ||
162 | } | ||
163 | #endif | ||
164 | |||
109 | static int __init_refok init_section_page_cgroup(unsigned long pfn) | 165 | static int __init_refok init_section_page_cgroup(unsigned long pfn) |
110 | { | 166 | { |
111 | struct mem_section *section = __pfn_to_section(pfn); | ||
112 | struct page_cgroup *base, *pc; | 167 | struct page_cgroup *base, *pc; |
168 | struct mem_section *section; | ||
113 | unsigned long table_size; | 169 | unsigned long table_size; |
170 | unsigned long nr; | ||
114 | int nid, index; | 171 | int nid, index; |
115 | 172 | ||
116 | if (!section->page_cgroup) { | 173 | nr = pfn_to_section_nr(pfn); |
117 | nid = page_to_nid(pfn_to_page(pfn)); | 174 | section = __nr_to_section(nr); |
118 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 175 | |
119 | VM_BUG_ON(!slab_is_available()); | 176 | if (section->page_cgroup) |
120 | if (node_state(nid, N_HIGH_MEMORY)) { | 177 | return 0; |
121 | base = kmalloc_node(table_size, | 178 | |
122 | GFP_KERNEL | __GFP_NOWARN, nid); | 179 | nid = page_to_nid(pfn_to_page(pfn)); |
123 | if (!base) | 180 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
124 | base = vmalloc_node(table_size, nid); | 181 | base = alloc_page_cgroup(table_size, nid); |
125 | } else { | 182 | |
126 | base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); | 183 | /* |
127 | if (!base) | 184 | * The value stored in section->page_cgroup is (base - pfn) |
128 | base = vmalloc(table_size); | 185 | * and it does not point to the memory block allocated above, |
129 | } | 186 | * causing kmemleak false positives. |
130 | /* | 187 | */ |
131 | * The value stored in section->page_cgroup is (base - pfn) | 188 | kmemleak_not_leak(base); |
132 | * and it does not point to the memory block allocated above, | ||
133 | * causing kmemleak false positives. | ||
134 | */ | ||
135 | kmemleak_not_leak(base); | ||
136 | } else { | ||
137 | /* | ||
138 | * We don't have to allocate page_cgroup again, but | ||
139 | * address of memmap may be changed. So, we have to initialize | ||
140 | * again. | ||
141 | */ | ||
142 | base = section->page_cgroup + pfn; | ||
143 | table_size = 0; | ||
144 | /* check address of memmap is changed or not. */ | ||
145 | if (base->page == pfn_to_page(pfn)) | ||
146 | return 0; | ||
147 | } | ||
148 | 189 | ||
149 | if (!base) { | 190 | if (!base) { |
150 | printk(KERN_ERR "page cgroup allocation failure\n"); | 191 | printk(KERN_ERR "page cgroup allocation failure\n"); |
@@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
153 | 194 | ||
154 | for (index = 0; index < PAGES_PER_SECTION; index++) { | 195 | for (index = 0; index < PAGES_PER_SECTION; index++) { |
155 | pc = base + index; | 196 | pc = base + index; |
156 | __init_page_cgroup(pc, pfn + index); | 197 | init_page_cgroup(pc, nr); |
157 | } | 198 | } |
158 | 199 | ||
159 | section->page_cgroup = base - pfn; | 200 | section->page_cgroup = base - pfn; |
@@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn) | |||
170 | if (!ms || !ms->page_cgroup) | 211 | if (!ms || !ms->page_cgroup) |
171 | return; | 212 | return; |
172 | base = ms->page_cgroup + pfn; | 213 | base = ms->page_cgroup + pfn; |
173 | if (is_vmalloc_addr(base)) { | 214 | free_page_cgroup(base); |
174 | vfree(base); | 215 | ms->page_cgroup = NULL; |
175 | ms->page_cgroup = NULL; | ||
176 | } else { | ||
177 | struct page *page = virt_to_page(base); | ||
178 | if (!PageReserved(page)) { /* Is bootmem ? */ | ||
179 | kfree(base); | ||
180 | ms->page_cgroup = NULL; | ||
181 | } | ||
182 | } | ||
183 | } | 216 | } |
184 | 217 | ||
185 | int __meminit online_page_cgroup(unsigned long start_pfn, | 218 | int __meminit online_page_cgroup(unsigned long start_pfn, |
@@ -243,12 +276,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
243 | break; | 276 | break; |
244 | } | 277 | } |
245 | 278 | ||
246 | if (ret) | 279 | return notifier_from_errno(ret); |
247 | ret = notifier_from_errno(ret); | ||
248 | else | ||
249 | ret = NOTIFY_OK; | ||
250 | |||
251 | return ret; | ||
252 | } | 280 | } |
253 | 281 | ||
254 | #endif | 282 | #endif |
@@ -349,7 +377,7 @@ not_enough_page: | |||
349 | * @new: new id | 377 | * @new: new id |
350 | * | 378 | * |
351 | * Returns old id at success, 0 at failure. | 379 | * Returns old id at success, 0 at failure. |
352 | * (There is no mem_cgroup useing 0 as its id) | 380 | * (There is no mem_cgroup using 0 as its id) |
353 | */ | 381 | */ |
354 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | 382 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, |
355 | unsigned short old, unsigned short new) | 383 | unsigned short old, unsigned short new) |
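
mm/page_cgroup.c stops keeping a back-pointer to the struct page in every page_cgroup: init_page_cgroup() records an array id (the node id for flatmem, the section number for sparsemem) and the new lookup_cgroup_page() recovers the page from the descriptor's position in that array; allocation is unified in alloc_page_cgroup(), which tries alloc_pages_exact() and falls back to vmalloc. The id-plus-offset idea in miniature, with an illustrative encoding rather than the kernel's:

#include <stdio.h>

/* Miniature of the page_cgroup <-> page mapping: instead of storing a pointer
 * back to the object, store the id of the array the descriptor lives in and
 * recover the index by pointer arithmetic. Names are illustrative. */
#define PER_ARRAY 4

struct desc { unsigned int array_id; };

static struct desc arrays[2][PER_ARRAY];
static unsigned long array_base[2] = { 100, 200 };   /* e.g. node_start_pfn */

static unsigned long desc_to_pfn(const struct desc *d)
{
    unsigned int id = d->array_id;

    return (unsigned long)(d - arrays[id]) + array_base[id];
}

int main(void)
{
    for (unsigned int id = 0; id < 2; id++)
        for (int i = 0; i < PER_ARRAY; i++)
            arrays[id][i].array_id = id;

    printf("%lu %lu\n", desc_to_pfn(&arrays[0][2]), desc_to_pfn(&arrays[1][3]));
    return 0;
}

Dropping the per-descriptor page pointer shrinks struct page_cgroup by one pointer per page of memory, which is the motivation for the extra arithmetic.
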
diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf469..dc76b4d0611e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
106 | goto out; | 106 | goto out; |
107 | } | 107 | } |
108 | if (wbc->sync_mode == WB_SYNC_ALL) | 108 | if (wbc->sync_mode == WB_SYNC_ALL) |
109 | rw |= REQ_SYNC | REQ_UNPLUG; | 109 | rw |= REQ_SYNC; |
110 | count_vm_event(PSWPOUT); | 110 | count_vm_event(PSWPOUT); |
111 | set_page_writeback(page); | 111 | set_page_writeback(page); |
112 | unlock_page(page); | 112 | unlock_page(page); |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7cfa6ae02303..c3450d533611 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -33,19 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
33 | 33 | ||
34 | pmd = pmd_offset(pud, addr); | 34 | pmd = pmd_offset(pud, addr); |
35 | do { | 35 | do { |
36 | again: | ||
36 | next = pmd_addr_end(addr, end); | 37 | next = pmd_addr_end(addr, end); |
37 | split_huge_page_pmd(walk->mm, pmd); | 38 | if (pmd_none(*pmd)) { |
38 | if (pmd_none_or_clear_bad(pmd)) { | ||
39 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
40 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
41 | if (err) | 41 | if (err) |
42 | break; | 42 | break; |
43 | continue; | 43 | continue; |
44 | } | 44 | } |
45 | /* | ||
46 | * This implies that each ->pmd_entry() handler | ||
47 | * needs to know about pmd_trans_huge() pmds | ||
48 | */ | ||
45 | if (walk->pmd_entry) | 49 | if (walk->pmd_entry) |
46 | err = walk->pmd_entry(pmd, addr, next, walk); | 50 | err = walk->pmd_entry(pmd, addr, next, walk); |
47 | if (!err && walk->pte_entry) | 51 | if (err) |
48 | err = walk_pte_range(pmd, addr, next, walk); | 52 | break; |
53 | |||
54 | /* | ||
55 | * Check this here so we only break down trans_huge | ||
56 | * pages when we _need_ to | ||
57 | */ | ||
58 | if (!walk->pte_entry) | ||
59 | continue; | ||
60 | |||
61 | split_huge_page_pmd(walk->mm, pmd); | ||
62 | if (pmd_none_or_clear_bad(pmd)) | ||
63 | goto again; | ||
64 | err = walk_pte_range(pmd, addr, next, walk); | ||
49 | if (err) | 65 | if (err) |
50 | break; | 66 | break; |
51 | } while (pmd++, addr = next, addr != end); | 67 | } while (pmd++, addr = next, addr != end); |
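
walk_pmd_range() above no longer splits transparent huge pages up front: ->pmd_entry runs first and must cope with pmd_trans_huge() entries, and split_huge_page_pmd() is only called when a ->pte_entry callback actually needs to descend to the pte level (re-checking the pmd via the again label afterwards). The control flow, stripped of the mm details and using a hypothetical callback struct:

#include <stdbool.h>
#include <stdio.h>

/* Control-flow model of the reworked walk_pmd_range(): only split a huge pmd
 * when a pte-level callback actually needs to look inside it. Illustrative. */
struct walker {
    int (*pmd_entry)(int pmd);
    int (*pte_entry)(int pte);
};

static bool pmd_is_huge(int pmd) { return pmd < 0; }
static int  split_huge(int pmd)  { return -pmd; }    /* pretend-split */

static int walk_one_pmd(int pmd, const struct walker *w)
{
    int err = 0;

    if (w->pmd_entry)
        err = w->pmd_entry(pmd);    /* must handle huge pmds itself */
    if (err || !w->pte_entry)
        return err;                 /* no pte callback: no need to split */

    if (pmd_is_huge(pmd))
        pmd = split_huge(pmd);      /* split only when descending */
    return w->pte_entry(pmd);
}

static int count_pmd(int pmd) { (void)pmd; return 0; }
static int count_pte(int pte) { printf("pte level saw %d\n", pte); return 0; }

int main(void)
{
    struct walker only_pmd = { count_pmd, NULL };
    struct walker with_pte = { count_pmd, count_pte };

    walk_one_pmd(-42, &only_pmd);   /* huge pmd left intact */
    walk_one_pmd(-42, &with_pte);   /* split, then pte callback */
    return 0;
}
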
diff --git a/mm/percpu.c b/mm/percpu.c index 3f930018aa60..a160db39b810 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -342,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
342 | * @chunk: chunk of interest | 342 | * @chunk: chunk of interest |
343 | * | 343 | * |
344 | * Determine whether area map of @chunk needs to be extended to | 344 | * Determine whether area map of @chunk needs to be extended to |
345 | * accomodate a new allocation. | 345 | * accommodate a new allocation. |
346 | * | 346 | * |
347 | * CONTEXT: | 347 | * CONTEXT: |
348 | * pcpu_lock. | 348 | * pcpu_lock. |
@@ -431,7 +431,7 @@ out_unlock: | |||
431 | * depending on @head, is reduced by @tail bytes and @tail byte block | 431 | * depending on @head, is reduced by @tail bytes and @tail byte block |
432 | * is inserted after the target block. | 432 | * is inserted after the target block. |
433 | * | 433 | * |
434 | * @chunk->map must have enough free slots to accomodate the split. | 434 | * @chunk->map must have enough free slots to accommodate the split. |
435 | * | 435 | * |
436 | * CONTEXT: | 436 | * CONTEXT: |
437 | * pcpu_lock. | 437 | * pcpu_lock. |
@@ -1008,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) | |||
1008 | } | 1008 | } |
1009 | 1009 | ||
1010 | if (in_first_chunk) { | 1010 | if (in_first_chunk) { |
1011 | if ((unsigned long)addr < VMALLOC_START || | 1011 | if (!is_vmalloc_addr(addr)) |
1012 | (unsigned long)addr >= VMALLOC_END) | ||
1013 | return __pa(addr); | 1012 | return __pa(addr); |
1014 | else | 1013 | else |
1015 | return page_to_phys(vmalloc_to_page(addr)); | 1014 | return page_to_phys(vmalloc_to_page(addr)); |
@@ -1436,7 +1435,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( | |||
1436 | /* | 1435 | /* |
1437 | * Determine min_unit_size, alloc_size and max_upa such that | 1436 | * Determine min_unit_size, alloc_size and max_upa such that |
1438 | * alloc_size is multiple of atom_size and is the smallest | 1437 | * alloc_size is multiple of atom_size and is the smallest |
1439 | * which can accomodate 4k aligned segments which are equal to | 1438 | * which can accommodate 4k aligned segments which are equal to |
1440 | * or larger than min_unit_size. | 1439 | * or larger than min_unit_size. |
1441 | */ | 1440 | */ |
1442 | min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); | 1441 | min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); |
@@ -1551,7 +1550,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( | |||
1551 | * @atom_size: allocation atom size | 1550 | * @atom_size: allocation atom size |
1552 | * @cpu_distance_fn: callback to determine distance between cpus, optional | 1551 | * @cpu_distance_fn: callback to determine distance between cpus, optional |
1553 | * @alloc_fn: function to allocate percpu page | 1552 | * @alloc_fn: function to allocate percpu page |
1554 | * @free_fn: funtion to free percpu page | 1553 | * @free_fn: function to free percpu page |
1555 | * | 1554 | * |
1556 | * This is a helper to ease setting up embedded first percpu chunk and | 1555 | * This is a helper to ease setting up embedded first percpu chunk and |
1557 | * can be called where pcpu_setup_first_chunk() is expected. | 1556 | * can be called where pcpu_setup_first_chunk() is expected. |
@@ -1679,7 +1678,7 @@ out_free: | |||
1679 | * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages | 1678 | * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages |
1680 | * @reserved_size: the size of reserved percpu area in bytes | 1679 | * @reserved_size: the size of reserved percpu area in bytes |
1681 | * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE | 1680 | * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE |
1682 | * @free_fn: funtion to free percpu page, always called with PAGE_SIZE | 1681 | * @free_fn: function to free percpu page, always called with PAGE_SIZE |
1683 | * @populate_pte_fn: function to populate pte | 1682 | * @populate_pte_fn: function to populate pte |
1684 | * | 1683 | * |
1685 | * This is a helper to ease setting up page-remapped first percpu | 1684 | * This is a helper to ease setting up page-remapped first percpu |
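
Besides the spelling fixes, per_cpu_ptr_to_phys() replaces the open-coded VMALLOC_START/VMALLOC_END comparison with is_vmalloc_addr(). The equivalence the change relies on, sketched with made-up bounds:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the cleanup in per_cpu_ptr_to_phys(): replace an open-coded range
 * check with a helper. The bounds below are invented for illustration. */
#define VMALLOC_START 0xf0000000UL
#define VMALLOC_END   0xff000000UL

static bool is_vmalloc_addr_model(unsigned long addr)
{
    return addr >= VMALLOC_START && addr < VMALLOC_END;
}

int main(void)
{
    unsigned long linear = 0xc0100000UL, vmapped = 0xf0100000UL;

    /* old: addr < VMALLOC_START || addr >= VMALLOC_END  -> linear mapping */
    /* new: !is_vmalloc_addr(addr)                        -> linear mapping */
    printf("%d %d\n", !is_vmalloc_addr_model(linear), !is_vmalloc_addr_model(vmapped));
    return 0;
}
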
diff --git a/mm/readahead.c b/mm/readahead.c index 77506a291a2d..2c0cc489e288 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages); | |||
109 | static int read_pages(struct address_space *mapping, struct file *filp, | 109 | static int read_pages(struct address_space *mapping, struct file *filp, |
110 | struct list_head *pages, unsigned nr_pages) | 110 | struct list_head *pages, unsigned nr_pages) |
111 | { | 111 | { |
112 | struct blk_plug plug; | ||
112 | unsigned page_idx; | 113 | unsigned page_idx; |
113 | int ret; | 114 | int ret; |
114 | 115 | ||
116 | blk_start_plug(&plug); | ||
117 | |||
115 | if (mapping->a_ops->readpages) { | 118 | if (mapping->a_ops->readpages) { |
116 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | 119 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); |
117 | /* Clean up the remaining pages */ | 120 | /* Clean up the remaining pages */ |
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
129 | page_cache_release(page); | 132 | page_cache_release(page); |
130 | } | 133 | } |
131 | ret = 0; | 134 | ret = 0; |
135 | |||
132 | out: | 136 | out: |
137 | blk_finish_plug(&plug); | ||
138 | |||
133 | return ret; | 139 | return ret; |
134 | } | 140 | } |
135 | 141 | ||
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping, | |||
554 | 560 | ||
555 | /* do read-ahead */ | 561 | /* do read-ahead */ |
556 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 562 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
557 | |||
558 | #ifdef CONFIG_BLOCK | ||
559 | /* | ||
560 | * Normally the current page is !uptodate and lock_page() will be | ||
561 | * immediately called to implicitly unplug the device. However this | ||
562 | * is not always true for RAID conifgurations, where data arrives | ||
563 | * not strictly in their submission order. In this case we need to | ||
564 | * explicitly kick off the IO. | ||
565 | */ | ||
566 | if (PageUptodate(page)) | ||
567 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
568 | #endif | ||
569 | } | 563 | } |
570 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 564 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -31,11 +31,12 @@ | |||
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | ||
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 36 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 37 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 38 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 39 | * within inode_wb_list_lock in __sync_single_inode) |
39 | * | 40 | * |
40 | * (code doesn't rely on that order so it could be switched around) | 41 | * (code doesn't rely on that order so it could be switched around) |
41 | * ->tasklist_lock | 42 | * ->tasklist_lock |
@@ -67,11 +68,24 @@ static struct kmem_cache *anon_vma_chain_cachep; | |||
67 | 68 | ||
68 | static inline struct anon_vma *anon_vma_alloc(void) | 69 | static inline struct anon_vma *anon_vma_alloc(void) |
69 | { | 70 | { |
70 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 71 | struct anon_vma *anon_vma; |
72 | |||
73 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | ||
74 | if (anon_vma) { | ||
75 | atomic_set(&anon_vma->refcount, 1); | ||
76 | /* | ||
77 | * Initialise the anon_vma root to point to itself. If called | ||
78 | * from fork, the root will be reset to the parent's anon_vma. | ||
79 | */ | ||
80 | anon_vma->root = anon_vma; | ||
81 | } | ||
82 | |||
83 | return anon_vma; | ||
71 | } | 84 | } |
72 | 85 | ||
73 | void anon_vma_free(struct anon_vma *anon_vma) | 86 | static inline void anon_vma_free(struct anon_vma *anon_vma) |
74 | { | 87 | { |
88 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | ||
75 | kmem_cache_free(anon_vma_cachep, anon_vma); | 89 | kmem_cache_free(anon_vma_cachep, anon_vma); |
76 | } | 90 | } |
77 | 91 | ||
@@ -133,11 +147,6 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
133 | if (unlikely(!anon_vma)) | 147 | if (unlikely(!anon_vma)) |
134 | goto out_enomem_free_avc; | 148 | goto out_enomem_free_avc; |
135 | allocated = anon_vma; | 149 | allocated = anon_vma; |
136 | /* | ||
137 | * This VMA had no anon_vma yet. This anon_vma is | ||
138 | * the root of any anon_vma tree that might form. | ||
139 | */ | ||
140 | anon_vma->root = anon_vma; | ||
141 | } | 150 | } |
142 | 151 | ||
143 | anon_vma_lock(anon_vma); | 152 | anon_vma_lock(anon_vma); |
@@ -156,7 +165,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
156 | anon_vma_unlock(anon_vma); | 165 | anon_vma_unlock(anon_vma); |
157 | 166 | ||
158 | if (unlikely(allocated)) | 167 | if (unlikely(allocated)) |
159 | anon_vma_free(allocated); | 168 | put_anon_vma(allocated); |
160 | if (unlikely(avc)) | 169 | if (unlikely(avc)) |
161 | anon_vma_chain_free(avc); | 170 | anon_vma_chain_free(avc); |
162 | } | 171 | } |
@@ -241,9 +250,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
241 | */ | 250 | */ |
242 | anon_vma->root = pvma->anon_vma->root; | 251 | anon_vma->root = pvma->anon_vma->root; |
243 | /* | 252 | /* |
244 | * With KSM refcounts, an anon_vma can stay around longer than the | 253 | * With refcounts, an anon_vma can stay around longer than the |
245 | * process it belongs to. The root anon_vma needs to be pinned | 254 | * process it belongs to. The root anon_vma needs to be pinned until |
246 | * until this anon_vma is freed, because the lock lives in the root. | 255 | * this anon_vma is freed, because the lock lives in the root. |
247 | */ | 256 | */ |
248 | get_anon_vma(anon_vma->root); | 257 | get_anon_vma(anon_vma->root); |
249 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 258 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
@@ -253,7 +262,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
253 | return 0; | 262 | return 0; |
254 | 263 | ||
255 | out_error_free_anon_vma: | 264 | out_error_free_anon_vma: |
256 | anon_vma_free(anon_vma); | 265 | put_anon_vma(anon_vma); |
257 | out_error: | 266 | out_error: |
258 | unlink_anon_vmas(vma); | 267 | unlink_anon_vmas(vma); |
259 | return -ENOMEM; | 268 | return -ENOMEM; |
@@ -272,15 +281,11 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) | |||
272 | list_del(&anon_vma_chain->same_anon_vma); | 281 | list_del(&anon_vma_chain->same_anon_vma); |
273 | 282 | ||
274 | /* We must garbage collect the anon_vma if it's empty */ | 283 | /* We must garbage collect the anon_vma if it's empty */ |
275 | empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); | 284 | empty = list_empty(&anon_vma->head); |
276 | anon_vma_unlock(anon_vma); | 285 | anon_vma_unlock(anon_vma); |
277 | 286 | ||
278 | if (empty) { | 287 | if (empty) |
279 | /* We no longer need the root anon_vma */ | 288 | put_anon_vma(anon_vma); |
280 | if (anon_vma->root != anon_vma) | ||
281 | drop_anon_vma(anon_vma->root); | ||
282 | anon_vma_free(anon_vma); | ||
283 | } | ||
284 | } | 289 | } |
285 | 290 | ||
286 | void unlink_anon_vmas(struct vm_area_struct *vma) | 291 | void unlink_anon_vmas(struct vm_area_struct *vma) |
@@ -303,7 +308,7 @@ static void anon_vma_ctor(void *data) | |||
303 | struct anon_vma *anon_vma = data; | 308 | struct anon_vma *anon_vma = data; |
304 | 309 | ||
305 | spin_lock_init(&anon_vma->lock); | 310 | spin_lock_init(&anon_vma->lock); |
306 | anonvma_external_refcount_init(anon_vma); | 311 | atomic_set(&anon_vma->refcount, 0); |
307 | INIT_LIST_HEAD(&anon_vma->head); | 312 | INIT_LIST_HEAD(&anon_vma->head); |
308 | } | 313 | } |
309 | 314 | ||
@@ -1486,41 +1491,15 @@ int try_to_munlock(struct page *page) | |||
1486 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1491 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1487 | } | 1492 | } |
1488 | 1493 | ||
1489 | #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) | 1494 | void __put_anon_vma(struct anon_vma *anon_vma) |
1490 | /* | ||
1491 | * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root | ||
1492 | * if necessary. Be careful to do all the tests under the lock. Once | ||
1493 | * we know we are the last user, nobody else can get a reference and we | ||
1494 | * can do the freeing without the lock. | ||
1495 | */ | ||
1496 | void drop_anon_vma(struct anon_vma *anon_vma) | ||
1497 | { | 1495 | { |
1498 | BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); | 1496 | struct anon_vma *root = anon_vma->root; |
1499 | if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { | ||
1500 | struct anon_vma *root = anon_vma->root; | ||
1501 | int empty = list_empty(&anon_vma->head); | ||
1502 | int last_root_user = 0; | ||
1503 | int root_empty = 0; | ||
1504 | 1497 | ||
1505 | /* | 1498 | if (root != anon_vma && atomic_dec_and_test(&root->refcount)) |
1506 | * The refcount on a non-root anon_vma got dropped. Drop | 1499 | anon_vma_free(root); |
1507 | * the refcount on the root and check if we need to free it. | ||
1508 | */ | ||
1509 | if (empty && anon_vma != root) { | ||
1510 | BUG_ON(atomic_read(&root->external_refcount) <= 0); | ||
1511 | last_root_user = atomic_dec_and_test(&root->external_refcount); | ||
1512 | root_empty = list_empty(&root->head); | ||
1513 | } | ||
1514 | anon_vma_unlock(anon_vma); | ||
1515 | 1500 | ||
1516 | if (empty) { | 1501 | anon_vma_free(anon_vma); |
1517 | anon_vma_free(anon_vma); | ||
1518 | if (root_empty && last_root_user) | ||
1519 | anon_vma_free(root); | ||
1520 | } | ||
1521 | } | ||
1522 | } | 1502 | } |
1523 | #endif | ||
1524 | 1503 | ||
1525 | #ifdef CONFIG_MIGRATION | 1504 | #ifdef CONFIG_MIGRATION |
1526 | /* | 1505 | /* |
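With anon_vma->external_refcount replaced by a single refcount, the lifetime rules collapse into a plain get/put pair; __put_anon_vma() above is the slow path. Roughly, the helpers this patch leans on (declared in include/linux/rmap.h) look like this sketch:

    static inline void get_anon_vma(struct anon_vma *anon_vma)
    {
            atomic_inc(&anon_vma->refcount);
    }

    static inline void put_anon_vma(struct anon_vma *anon_vma)
    {
            if (atomic_dec_and_test(&anon_vma->refcount))
                    __put_anon_vma(anon_vma);  /* frees the anon_vma, and its root if unused */
    }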
diff --git a/mm/shmem.c b/mm/shmem.c index 048a95a5244d..8fa27e4e582a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops; | |||
224 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 224 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { |
225 | .ra_pages = 0, /* No readahead */ | 225 | .ra_pages = 0, /* No readahead */ |
226 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 226 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
227 | .unplug_io_fn = default_unplug_io_fn, | ||
228 | }; | 227 | }; |
229 | 228 | ||
230 | static LIST_HEAD(shmem_swaplist); | 229 | static LIST_HEAD(shmem_swaplist); |
@@ -422,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
422 | * a waste to allocate index if we cannot allocate data. | 421 | * a waste to allocate index if we cannot allocate data. |
423 | */ | 422 | */ |
424 | if (sbinfo->max_blocks) { | 423 | if (sbinfo->max_blocks) { |
425 | if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) | 424 | if (percpu_counter_compare(&sbinfo->used_blocks, |
425 | sbinfo->max_blocks - 1) >= 0) | ||
426 | return ERR_PTR(-ENOSPC); | 426 | return ERR_PTR(-ENOSPC); |
427 | percpu_counter_inc(&sbinfo->used_blocks); | 427 | percpu_counter_inc(&sbinfo->used_blocks); |
428 | spin_lock(&inode->i_lock); | 428 | spin_lock(&inode->i_lock); |
@@ -1081,7 +1081,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1081 | shmem_recalc_inode(inode); | 1081 | shmem_recalc_inode(inode); |
1082 | 1082 | ||
1083 | if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 1083 | if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1084 | remove_from_page_cache(page); | 1084 | delete_from_page_cache(page); |
1085 | shmem_swp_set(info, entry, swap.val); | 1085 | shmem_swp_set(info, entry, swap.val); |
1086 | shmem_swp_unmap(entry); | 1086 | shmem_swp_unmap(entry); |
1087 | if (list_empty(&info->swaplist)) | 1087 | if (list_empty(&info->swaplist)) |
@@ -1091,7 +1091,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1091 | spin_unlock(&info->lock); | 1091 | spin_unlock(&info->lock); |
1092 | swap_shmem_alloc(swap); | 1092 | swap_shmem_alloc(swap); |
1093 | BUG_ON(page_mapped(page)); | 1093 | BUG_ON(page_mapped(page)); |
1094 | page_cache_release(page); /* pagecache ref */ | ||
1095 | swap_writepage(page, wbc); | 1094 | swap_writepage(page, wbc); |
1096 | if (inode) { | 1095 | if (inode) { |
1097 | mutex_lock(&shmem_swaplist_mutex); | 1096 | mutex_lock(&shmem_swaplist_mutex); |
@@ -1399,7 +1398,8 @@ repeat: | |||
1399 | shmem_swp_unmap(entry); | 1398 | shmem_swp_unmap(entry); |
1400 | sbinfo = SHMEM_SB(inode->i_sb); | 1399 | sbinfo = SHMEM_SB(inode->i_sb); |
1401 | if (sbinfo->max_blocks) { | 1400 | if (sbinfo->max_blocks) { |
1402 | if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || | 1401 | if (percpu_counter_compare(&sbinfo->used_blocks, |
1402 | sbinfo->max_blocks) >= 0 || | ||
1403 | shmem_acct_block(info->flags)) { | 1403 | shmem_acct_block(info->flags)) { |
1404 | spin_unlock(&info->lock); | 1404 | spin_unlock(&info->lock); |
1405 | error = -ENOSPC; | 1405 | error = -ENOSPC; |
@@ -2794,5 +2794,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2794 | fput(vma->vm_file); | 2794 | fput(vma->vm_file); |
2795 | vma->vm_file = file; | 2795 | vma->vm_file = file; |
2796 | vma->vm_ops = &shmem_vm_ops; | 2796 | vma->vm_ops = &shmem_vm_ops; |
2797 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
2797 | return 0; | 2798 | return 0; |
2798 | } | 2799 | } |
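The shmem hunks tighten the full-filesystem check: percpu_counter_compare() returns a negative, zero or positive result like a three-way compare, and the test now trips at max_blocks - 1, before the counter is bumped. A hedged sketch of the intended effect, not the kernel code itself:

    if (percpu_counter_compare(&sbinfo->used_blocks,
                               sbinfo->max_blocks - 1) >= 0)
            return ERR_PTR(-ENOSPC);              /* already at the limit, refuse */
    percpu_counter_inc(&sbinfo->used_blocks);     /* increment can no longer overshoot max_blocks */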
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t; | |||
191 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) | 191 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) |
192 | 192 | ||
193 | /* | 193 | /* |
194 | * struct slab | ||
195 | * | ||
196 | * Manages the objs in a slab. Placed either at the beginning of mem allocated | ||
197 | * for a slab, or allocated from an general cache. | ||
198 | * Slabs are chained into three list: fully used, partial, fully free slabs. | ||
199 | */ | ||
200 | struct slab { | ||
201 | struct list_head list; | ||
202 | unsigned long colouroff; | ||
203 | void *s_mem; /* including colour offset */ | ||
204 | unsigned int inuse; /* num of objs active in slab */ | ||
205 | kmem_bufctl_t free; | ||
206 | unsigned short nodeid; | ||
207 | }; | ||
208 | |||
209 | /* | ||
210 | * struct slab_rcu | 194 | * struct slab_rcu |
211 | * | 195 | * |
212 | * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to | 196 | * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to |
@@ -219,8 +203,6 @@ struct slab { | |||
219 | * | 203 | * |
220 | * rcu_read_lock before reading the address, then rcu_read_unlock after | 204 | * rcu_read_lock before reading the address, then rcu_read_unlock after |
221 | * taking the spinlock within the structure expected at that address. | 205 | * taking the spinlock within the structure expected at that address. |
222 | * | ||
223 | * We assume struct slab_rcu can overlay struct slab when destroying. | ||
224 | */ | 206 | */ |
225 | struct slab_rcu { | 207 | struct slab_rcu { |
226 | struct rcu_head head; | 208 | struct rcu_head head; |
@@ -229,6 +211,27 @@ struct slab_rcu { | |||
229 | }; | 211 | }; |
230 | 212 | ||
231 | /* | 213 | /* |
214 | * struct slab | ||
215 | * | ||
216 | * Manages the objs in a slab. Placed either at the beginning of mem allocated | ||
217 | * for a slab, or allocated from a general cache. | ||
218 | * Slabs are chained into three lists: fully used, partial, fully free slabs. | ||
219 | */ | ||
220 | struct slab { | ||
221 | union { | ||
222 | struct { | ||
223 | struct list_head list; | ||
224 | unsigned long colouroff; | ||
225 | void *s_mem; /* including colour offset */ | ||
226 | unsigned int inuse; /* num of objs active in slab */ | ||
227 | kmem_bufctl_t free; | ||
228 | unsigned short nodeid; | ||
229 | }; | ||
230 | struct slab_rcu __slab_cover_slab_rcu; | ||
231 | }; | ||
232 | }; | ||
233 | |||
234 | /* | ||
232 | * struct array_cache | 235 | * struct array_cache |
233 | * | 236 | * |
234 | * Purpose: | 237 | * Purpose: |
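Folding struct slab_rcu into struct slab as an anonymous union member turns the old "we assume slab_rcu can overlay struct slab" comment into something the compiler enforces, since the union is sized to hold both views. A small illustrative accessor (hypothetical, not part of the patch) shows how the overlay is now spelled out in the type:

    static inline struct slab_rcu *slab_rcu_view(struct slab *slabp)
    {
            /* shares storage with list/colouroff/s_mem and friends */
            return &slabp->__slab_cover_slab_rcu;
    }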
@@ -875,7 +878,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
875 | nc = kmalloc_node(memsize, gfp, node); | 878 | nc = kmalloc_node(memsize, gfp, node); |
876 | /* | 879 | /* |
877 | * The array_cache structures contain pointers to free object. | 880 | * The array_cache structures contain pointers to free object. |
878 | * However, when such objects are allocated or transfered to another | 881 | * However, when such objects are allocated or transferred to another |
879 | * cache the pointers are not cleared and they could be counted as | 882 | * cache the pointers are not cleared and they could be counted as |
880 | * valid references during a kmemleak scan. Therefore, kmemleak must | 883 | * valid references during a kmemleak scan. Therefore, kmemleak must |
881 | * not scan such objects. | 884 | * not scan such objects. |
@@ -1387,7 +1390,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self, | |||
1387 | break; | 1390 | break; |
1388 | } | 1391 | } |
1389 | out: | 1392 | out: |
1390 | return ret ? notifier_from_errno(ret) : NOTIFY_OK; | 1393 | return notifier_from_errno(ret); |
1391 | } | 1394 | } |
1392 | #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ | 1395 | #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ |
1393 | 1396 | ||
@@ -2147,8 +2150,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2147 | * | 2150 | * |
2148 | * @name must be valid until the cache is destroyed. This implies that | 2151 | * @name must be valid until the cache is destroyed. This implies that |
2149 | * the module calling this has to destroy the cache before getting unloaded. | 2152 | * the module calling this has to destroy the cache before getting unloaded. |
2150 | * Note that kmem_cache_name() is not guaranteed to return the same pointer, | ||
2151 | * therefore applications must manage it themselves. | ||
2152 | * | 2153 | * |
2153 | * The flags are | 2154 | * The flags are |
2154 | * | 2155 | * |
@@ -2288,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2288 | if (ralign < align) { | 2289 | if (ralign < align) { |
2289 | ralign = align; | 2290 | ralign = align; |
2290 | } | 2291 | } |
2291 | /* disable debug if not aligning with REDZONE_ALIGN */ | 2292 | /* disable debug if necessary */ |
2292 | if (ralign & (__alignof__(unsigned long long) - 1)) | 2293 | if (ralign > __alignof__(unsigned long long)) |
2293 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2294 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2294 | /* | 2295 | /* |
2295 | * 4) Store it. | 2296 | * 4) Store it. |
@@ -2315,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2315 | */ | 2316 | */ |
2316 | if (flags & SLAB_RED_ZONE) { | 2317 | if (flags & SLAB_RED_ZONE) { |
2317 | /* add space for red zone words */ | 2318 | /* add space for red zone words */ |
2318 | cachep->obj_offset += align; | 2319 | cachep->obj_offset += sizeof(unsigned long long); |
2319 | size += align + sizeof(unsigned long long); | 2320 | size += 2 * sizeof(unsigned long long); |
2320 | } | 2321 | } |
2321 | if (flags & SLAB_STORE_USER) { | 2322 | if (flags & SLAB_STORE_USER) { |
2322 | /* user store requires one word storage behind the end of | 2323 | /* user store requires one word storage behind the end of |
@@ -2605,7 +2606,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2605 | * | 2606 | * |
2606 | * The cache must be empty before calling this function. | 2607 | * The cache must be empty before calling this function. |
2607 | * | 2608 | * |
2608 | * The caller must guarantee that noone will allocate memory from the cache | 2609 | * The caller must guarantee that no one will allocate memory from the cache |
2609 | * during the kmem_cache_destroy(). | 2610 | * during the kmem_cache_destroy(). |
2610 | */ | 2611 | */ |
2611 | void kmem_cache_destroy(struct kmem_cache *cachep) | 2612 | void kmem_cache_destroy(struct kmem_cache *cachep) |
@@ -3840,12 +3841,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep) | |||
3840 | } | 3841 | } |
3841 | EXPORT_SYMBOL(kmem_cache_size); | 3842 | EXPORT_SYMBOL(kmem_cache_size); |
3842 | 3843 | ||
3843 | const char *kmem_cache_name(struct kmem_cache *cachep) | ||
3844 | { | ||
3845 | return cachep->name; | ||
3846 | } | ||
3847 | EXPORT_SYMBOL_GPL(kmem_cache_name); | ||
3848 | |||
3849 | /* | 3844 | /* |
3850 | * This initializes kmem_list3 or resizes various caches for all nodes. | 3845 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3851 | */ | 3846 | */ |
@@ -666,12 +666,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c) | |||
666 | } | 666 | } |
667 | EXPORT_SYMBOL(kmem_cache_size); | 667 | EXPORT_SYMBOL(kmem_cache_size); |
668 | 668 | ||
669 | const char *kmem_cache_name(struct kmem_cache *c) | ||
670 | { | ||
671 | return c->name; | ||
672 | } | ||
673 | EXPORT_SYMBOL(kmem_cache_name); | ||
674 | |||
675 | int kmem_cache_shrink(struct kmem_cache *d) | 669 | int kmem_cache_shrink(struct kmem_cache *d) |
676 | { | 670 | { |
677 | return 0; | 671 | return 0; |
@@ -64,7 +64,7 @@ | |||
64 | * we must stay away from it for a while since we may cause a bouncing | 64 | * we must stay away from it for a while since we may cause a bouncing |
65 | * cacheline if we try to acquire the lock. So go onto the next slab. | 65 | * cacheline if we try to acquire the lock. So go onto the next slab. |
66 | * If all pages are busy then we may allocate a new slab instead of reusing | 66 | * If all pages are busy then we may allocate a new slab instead of reusing |
67 | * a partial slab. A new slab has noone operating on it and thus there is | 67 | * a partial slab. A new slab has no one operating on it and thus there is |
68 | * no danger of cacheline contention. | 68 | * no danger of cacheline contention. |
69 | * | 69 | * |
70 | * Interrupts are disabled during allocation and deallocation in order to | 70 | * Interrupts are disabled during allocation and deallocation in order to |
@@ -217,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
217 | 217 | ||
218 | #endif | 218 | #endif |
219 | 219 | ||
220 | static inline void stat(struct kmem_cache *s, enum stat_item si) | 220 | static inline void stat(const struct kmem_cache *s, enum stat_item si) |
221 | { | 221 | { |
222 | #ifdef CONFIG_SLUB_STATS | 222 | #ifdef CONFIG_SLUB_STATS |
223 | __this_cpu_inc(s->cpu_slab->stat[si]); | 223 | __this_cpu_inc(s->cpu_slab->stat[si]); |
@@ -281,11 +281,40 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | |||
281 | return (p - addr) / s->size; | 281 | return (p - addr) / s->size; |
282 | } | 282 | } |
283 | 283 | ||
284 | static inline size_t slab_ksize(const struct kmem_cache *s) | ||
285 | { | ||
286 | #ifdef CONFIG_SLUB_DEBUG | ||
287 | /* | ||
288 | * Debugging requires use of the padding between object | ||
289 | * and whatever may come after it. | ||
290 | */ | ||
291 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | ||
292 | return s->objsize; | ||
293 | |||
294 | #endif | ||
295 | /* | ||
296 | * If we have the need to store the freelist pointer | ||
297 | * back there or track user information then we can | ||
298 | * only use the space before that information. | ||
299 | */ | ||
300 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | ||
301 | return s->inuse; | ||
302 | /* | ||
303 | * Else we can use all the padding etc for the allocation | ||
304 | */ | ||
305 | return s->size; | ||
306 | } | ||
307 | |||
308 | static inline int order_objects(int order, unsigned long size, int reserved) | ||
309 | { | ||
310 | return ((PAGE_SIZE << order) - reserved) / size; | ||
311 | } | ||
312 | |||
284 | static inline struct kmem_cache_order_objects oo_make(int order, | 313 | static inline struct kmem_cache_order_objects oo_make(int order, |
285 | unsigned long size) | 314 | unsigned long size, int reserved) |
286 | { | 315 | { |
287 | struct kmem_cache_order_objects x = { | 316 | struct kmem_cache_order_objects x = { |
288 | (order << OO_SHIFT) + (PAGE_SIZE << order) / size | 317 | (order << OO_SHIFT) + order_objects(order, size, reserved) |
289 | }; | 318 | }; |
290 | 319 | ||
291 | return x; | 320 | return x; |
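To see what the new reserved argument buys, take illustrative numbers: a 64-byte object in an order-0 (4096-byte) slab with sizeof(struct rcu_head) == 16 reserved at the end gives order_objects(0, 64, 16) = (4096 - 16) / 64 = 63 objects, against 4096 / 64 = 64 when nothing is reserved; oo_make() then packs that count and the order into a single word.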
@@ -617,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
617 | return 1; | 646 | return 1; |
618 | 647 | ||
619 | start = page_address(page); | 648 | start = page_address(page); |
620 | length = (PAGE_SIZE << compound_order(page)); | 649 | length = (PAGE_SIZE << compound_order(page)) - s->reserved; |
621 | end = start + length; | 650 | end = start + length; |
622 | remainder = length % s->size; | 651 | remainder = length % s->size; |
623 | if (!remainder) | 652 | if (!remainder) |
@@ -698,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
698 | return 0; | 727 | return 0; |
699 | } | 728 | } |
700 | 729 | ||
701 | maxobj = (PAGE_SIZE << compound_order(page)) / s->size; | 730 | maxobj = order_objects(compound_order(page), s->size, s->reserved); |
702 | if (page->objects > maxobj) { | 731 | if (page->objects > maxobj) { |
703 | slab_err(s, page, "objects %u > max %u", | 732 | slab_err(s, page, "objects %u > max %u", |
704 | s->name, page->objects, maxobj); | 733 | s->name, page->objects, maxobj); |
@@ -748,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
748 | nr++; | 777 | nr++; |
749 | } | 778 | } |
750 | 779 | ||
751 | max_objects = (PAGE_SIZE << compound_order(page)) / s->size; | 780 | max_objects = order_objects(compound_order(page), s->size, s->reserved); |
752 | if (max_objects > MAX_OBJS_PER_PAGE) | 781 | if (max_objects > MAX_OBJS_PER_PAGE) |
753 | max_objects = MAX_OBJS_PER_PAGE; | 782 | max_objects = MAX_OBJS_PER_PAGE; |
754 | 783 | ||
@@ -800,21 +829,31 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | |||
800 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) | 829 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) |
801 | { | 830 | { |
802 | flags &= gfp_allowed_mask; | 831 | flags &= gfp_allowed_mask; |
803 | kmemcheck_slab_alloc(s, flags, object, s->objsize); | 832 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
804 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); | 833 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); |
805 | } | 834 | } |
806 | 835 | ||
807 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 836 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
808 | { | 837 | { |
809 | kmemleak_free_recursive(x, s->flags); | 838 | kmemleak_free_recursive(x, s->flags); |
810 | } | ||
811 | 839 | ||
812 | static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) | 840 | /* |
813 | { | 841 | * Trouble is that we may no longer disable interrupts in the fast path |
814 | kmemcheck_slab_free(s, object, s->objsize); | 842 | * So in order to make the debug calls that expect irqs to be |
815 | debug_check_no_locks_freed(object, s->objsize); | 843 | * disabled we need to disable interrupts temporarily. |
844 | */ | ||
845 | #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) | ||
846 | { | ||
847 | unsigned long flags; | ||
848 | |||
849 | local_irq_save(flags); | ||
850 | kmemcheck_slab_free(s, x, s->objsize); | ||
851 | debug_check_no_locks_freed(x, s->objsize); | ||
852 | local_irq_restore(flags); | ||
853 | } | ||
854 | #endif | ||
816 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 855 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
817 | debug_check_no_obj_freed(object, s->objsize); | 856 | debug_check_no_obj_freed(x, s->objsize); |
818 | } | 857 | } |
819 | 858 | ||
820 | /* | 859 | /* |
@@ -1101,9 +1140,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |||
1101 | 1140 | ||
1102 | static inline void slab_free_hook(struct kmem_cache *s, void *x) {} | 1141 | static inline void slab_free_hook(struct kmem_cache *s, void *x) {} |
1103 | 1142 | ||
1104 | static inline void slab_free_hook_irq(struct kmem_cache *s, | ||
1105 | void *object) {} | ||
1106 | |||
1107 | #endif /* CONFIG_SLUB_DEBUG */ | 1143 | #endif /* CONFIG_SLUB_DEBUG */ |
1108 | 1144 | ||
1109 | /* | 1145 | /* |
@@ -1249,21 +1285,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1249 | __free_pages(page, order); | 1285 | __free_pages(page, order); |
1250 | } | 1286 | } |
1251 | 1287 | ||
1288 | #define need_reserve_slab_rcu \ | ||
1289 | (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) | ||
1290 | |||
1252 | static void rcu_free_slab(struct rcu_head *h) | 1291 | static void rcu_free_slab(struct rcu_head *h) |
1253 | { | 1292 | { |
1254 | struct page *page; | 1293 | struct page *page; |
1255 | 1294 | ||
1256 | page = container_of((struct list_head *)h, struct page, lru); | 1295 | if (need_reserve_slab_rcu) |
1296 | page = virt_to_head_page(h); | ||
1297 | else | ||
1298 | page = container_of((struct list_head *)h, struct page, lru); | ||
1299 | |||
1257 | __free_slab(page->slab, page); | 1300 | __free_slab(page->slab, page); |
1258 | } | 1301 | } |
1259 | 1302 | ||
1260 | static void free_slab(struct kmem_cache *s, struct page *page) | 1303 | static void free_slab(struct kmem_cache *s, struct page *page) |
1261 | { | 1304 | { |
1262 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { | 1305 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { |
1263 | /* | 1306 | struct rcu_head *head; |
1264 | * RCU free overloads the RCU head over the LRU | 1307 | |
1265 | */ | 1308 | if (need_reserve_slab_rcu) { |
1266 | struct rcu_head *head = (void *)&page->lru; | 1309 | int order = compound_order(page); |
1310 | int offset = (PAGE_SIZE << order) - s->reserved; | ||
1311 | |||
1312 | VM_BUG_ON(s->reserved != sizeof(*head)); | ||
1313 | head = page_address(page) + offset; | ||
1314 | } else { | ||
1315 | /* | ||
1316 | * RCU free overloads the RCU head over the LRU | ||
1317 | */ | ||
1318 | head = (void *)&page->lru; | ||
1319 | } | ||
1267 | 1320 | ||
1268 | call_rcu(head, rcu_free_slab); | 1321 | call_rcu(head, rcu_free_slab); |
1269 | } else | 1322 | } else |
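When page->lru is too small to hold a struct rcu_head (need_reserve_slab_rcu), the head is carved out of the reserved bytes at the very end of the slab page, which is also why rcu_free_slab() can recover the page with virt_to_head_page(). Roughly, the resulting layout is:

    page_address(page)                    page_address(page) + (PAGE_SIZE << order)
    |                                                                     |
    | object | object | ... | object | padding | struct rcu_head (reserved)|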
@@ -1487,6 +1540,78 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1487 | } | 1540 | } |
1488 | } | 1541 | } |
1489 | 1542 | ||
1543 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1544 | #ifdef CONFIG_PREEMPT | ||
1545 | /* | ||
1546 | * Calculate the next globally unique transaction for disambiguation | ||
1547 | * during cmpxchg. The transactions start with the cpu number and are then | ||
1548 | * incremented by CONFIG_NR_CPUS. | ||
1549 | */ | ||
1550 | #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) | ||
1551 | #else | ||
1552 | /* | ||
1553 | * No preemption supported therefore also no need to check for | ||
1554 | * different cpus. | ||
1555 | */ | ||
1556 | #define TID_STEP 1 | ||
1557 | #endif | ||
1558 | |||
1559 | static inline unsigned long next_tid(unsigned long tid) | ||
1560 | { | ||
1561 | return tid + TID_STEP; | ||
1562 | } | ||
1563 | |||
1564 | static inline unsigned int tid_to_cpu(unsigned long tid) | ||
1565 | { | ||
1566 | return tid % TID_STEP; | ||
1567 | } | ||
1568 | |||
1569 | static inline unsigned long tid_to_event(unsigned long tid) | ||
1570 | { | ||
1571 | return tid / TID_STEP; | ||
1572 | } | ||
1573 | |||
1574 | static inline unsigned int init_tid(int cpu) | ||
1575 | { | ||
1576 | return cpu; | ||
1577 | } | ||
1578 | |||
1579 | static inline void note_cmpxchg_failure(const char *n, | ||
1580 | const struct kmem_cache *s, unsigned long tid) | ||
1581 | { | ||
1582 | #ifdef SLUB_DEBUG_CMPXCHG | ||
1583 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); | ||
1584 | |||
1585 | printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); | ||
1586 | |||
1587 | #ifdef CONFIG_PREEMPT | ||
1588 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) | ||
1589 | printk("due to cpu change %d -> %d\n", | ||
1590 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); | ||
1591 | else | ||
1592 | #endif | ||
1593 | if (tid_to_event(tid) != tid_to_event(actual_tid)) | ||
1594 | printk("due to cpu running other code. Event %ld->%ld\n", | ||
1595 | tid_to_event(tid), tid_to_event(actual_tid)); | ||
1596 | else | ||
1597 | printk("for unknown reason: actual=%lx was=%lx target=%lx\n", | ||
1598 | actual_tid, tid, next_tid(tid)); | ||
1599 | #endif | ||
1600 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | ||
1601 | } | ||
1602 | |||
1603 | #endif | ||
1604 | |||
1605 | void init_kmem_cache_cpus(struct kmem_cache *s) | ||
1606 | { | ||
1607 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1608 | int cpu; | ||
1609 | |||
1610 | for_each_possible_cpu(cpu) | ||
1611 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); | ||
1612 | #endif | ||
1613 | |||
1614 | } | ||
1490 | /* | 1615 | /* |
1491 | * Remove the cpu slab | 1616 | * Remove the cpu slab |
1492 | */ | 1617 | */ |
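A quick worked example of the transaction ids (illustrative numbers): with CONFIG_NR_CPUS rounded up so that TID_STEP == 256, cpu 3 issues tids 3, 259, 515, and so on; tid % 256 recovers the cpu that generated the tid and tid / 256 counts how many operations that cpu has completed, which is exactly the split note_cmpxchg_failure() prints when a cmpxchg has to be redone.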
@@ -1518,6 +1643,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1518 | page->inuse--; | 1643 | page->inuse--; |
1519 | } | 1644 | } |
1520 | c->page = NULL; | 1645 | c->page = NULL; |
1646 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1647 | c->tid = next_tid(c->tid); | ||
1648 | #endif | ||
1521 | unfreeze_slab(s, page, tail); | 1649 | unfreeze_slab(s, page, tail); |
1522 | } | 1650 | } |
1523 | 1651 | ||
@@ -1652,6 +1780,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1652 | { | 1780 | { |
1653 | void **object; | 1781 | void **object; |
1654 | struct page *new; | 1782 | struct page *new; |
1783 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1784 | unsigned long flags; | ||
1785 | |||
1786 | local_irq_save(flags); | ||
1787 | #ifdef CONFIG_PREEMPT | ||
1788 | /* | ||
1789 | * We may have been preempted and rescheduled on a different | ||
1790 | * cpu before disabling interrupts. Need to reload cpu area | ||
1791 | * pointer. | ||
1792 | */ | ||
1793 | c = this_cpu_ptr(s->cpu_slab); | ||
1794 | #endif | ||
1795 | #endif | ||
1655 | 1796 | ||
1656 | /* We handle __GFP_ZERO in the caller */ | 1797 | /* We handle __GFP_ZERO in the caller */ |
1657 | gfpflags &= ~__GFP_ZERO; | 1798 | gfpflags &= ~__GFP_ZERO; |
@@ -1678,6 +1819,10 @@ load_freelist: | |||
1678 | c->node = page_to_nid(c->page); | 1819 | c->node = page_to_nid(c->page); |
1679 | unlock_out: | 1820 | unlock_out: |
1680 | slab_unlock(c->page); | 1821 | slab_unlock(c->page); |
1822 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1823 | c->tid = next_tid(c->tid); | ||
1824 | local_irq_restore(flags); | ||
1825 | #endif | ||
1681 | stat(s, ALLOC_SLOWPATH); | 1826 | stat(s, ALLOC_SLOWPATH); |
1682 | return object; | 1827 | return object; |
1683 | 1828 | ||
@@ -1713,6 +1858,9 @@ new_slab: | |||
1713 | } | 1858 | } |
1714 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 1859 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1715 | slab_out_of_memory(s, gfpflags, node); | 1860 | slab_out_of_memory(s, gfpflags, node); |
1861 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1862 | local_irq_restore(flags); | ||
1863 | #endif | ||
1716 | return NULL; | 1864 | return NULL; |
1717 | debug: | 1865 | debug: |
1718 | if (!alloc_debug_processing(s, c->page, object, addr)) | 1866 | if (!alloc_debug_processing(s, c->page, object, addr)) |
@@ -1739,23 +1887,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1739 | { | 1887 | { |
1740 | void **object; | 1888 | void **object; |
1741 | struct kmem_cache_cpu *c; | 1889 | struct kmem_cache_cpu *c; |
1890 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1891 | unsigned long tid; | ||
1892 | #else | ||
1742 | unsigned long flags; | 1893 | unsigned long flags; |
1894 | #endif | ||
1743 | 1895 | ||
1744 | if (slab_pre_alloc_hook(s, gfpflags)) | 1896 | if (slab_pre_alloc_hook(s, gfpflags)) |
1745 | return NULL; | 1897 | return NULL; |
1746 | 1898 | ||
1899 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
1747 | local_irq_save(flags); | 1900 | local_irq_save(flags); |
1901 | #else | ||
1902 | redo: | ||
1903 | #endif | ||
1904 | |||
1905 | /* | ||
1906 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is | ||
1907 | * enabled. We may switch back and forth between cpus while | ||
1908 | * reading from one cpu area. That does not matter as long | ||
1909 | * as we end up on the original cpu again when doing the cmpxchg. | ||
1910 | */ | ||
1748 | c = __this_cpu_ptr(s->cpu_slab); | 1911 | c = __this_cpu_ptr(s->cpu_slab); |
1912 | |||
1913 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1914 | /* | ||
1915 | * The transaction ids are globally unique per cpu and per operation on | ||
1916 | * a per cpu queue. Thus they guarantee that the cmpxchg_double | ||
1917 | * occurs on the right processor and that there was no operation on the | ||
1918 | * linked list in between. | ||
1919 | */ | ||
1920 | tid = c->tid; | ||
1921 | barrier(); | ||
1922 | #endif | ||
1923 | |||
1749 | object = c->freelist; | 1924 | object = c->freelist; |
1750 | if (unlikely(!object || !node_match(c, node))) | 1925 | if (unlikely(!object || !node_match(c, node))) |
1751 | 1926 | ||
1752 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1927 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1753 | 1928 | ||
1754 | else { | 1929 | else { |
1930 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1931 | /* | ||
1932 | * The cmpxchg will only match if there was no additional | ||
1933 | * operation and if we are on the right processor. | ||
1934 | * | ||
1935 | * The cmpxchg does the following atomically (without lock semantics!) | ||
1936 | * 1. Relocate first pointer to the current per cpu area. | ||
1937 | * 2. Verify that tid and freelist have not been changed | ||
1938 | * 3. If they were not changed replace tid and freelist | ||
1939 | * | ||
1940 | * Since this is without lock semantics the protection is only against | ||
1941 | * code executing on this cpu *not* from access by other cpus. | ||
1942 | */ | ||
1943 | if (unlikely(!this_cpu_cmpxchg_double( | ||
1944 | s->cpu_slab->freelist, s->cpu_slab->tid, | ||
1945 | object, tid, | ||
1946 | get_freepointer(s, object), next_tid(tid)))) { | ||
1947 | |||
1948 | note_cmpxchg_failure("slab_alloc", s, tid); | ||
1949 | goto redo; | ||
1950 | } | ||
1951 | #else | ||
1755 | c->freelist = get_freepointer(s, object); | 1952 | c->freelist = get_freepointer(s, object); |
1953 | #endif | ||
1756 | stat(s, ALLOC_FASTPATH); | 1954 | stat(s, ALLOC_FASTPATH); |
1757 | } | 1955 | } |
1956 | |||
1957 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
1758 | local_irq_restore(flags); | 1958 | local_irq_restore(flags); |
1959 | #endif | ||
1759 | 1960 | ||
1760 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 1961 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1761 | memset(object, 0, s->objsize); | 1962 | memset(object, 0, s->objsize); |
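Stripped of the #ifdefs, the new allocation fastpath is a snapshot-and-retry loop: read tid and freelist, then swap both words at once and start over if anything moved underneath. A condensed sketch, omitting the !object/node_match slow-path branch for brevity:

    do {
            c = __this_cpu_ptr(s->cpu_slab);
            tid = c->tid;                   /* snapshot the transaction id */
            barrier();
            object = c->freelist;           /* snapshot the freelist head  */
    } while (!this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid,
                                      object, tid,
                                      get_freepointer(s, object), next_tid(tid)));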
@@ -1833,9 +2034,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
1833 | { | 2034 | { |
1834 | void *prior; | 2035 | void *prior; |
1835 | void **object = (void *)x; | 2036 | void **object = (void *)x; |
2037 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2038 | unsigned long flags; | ||
1836 | 2039 | ||
1837 | stat(s, FREE_SLOWPATH); | 2040 | local_irq_save(flags); |
2041 | #endif | ||
1838 | slab_lock(page); | 2042 | slab_lock(page); |
2043 | stat(s, FREE_SLOWPATH); | ||
1839 | 2044 | ||
1840 | if (kmem_cache_debug(s)) | 2045 | if (kmem_cache_debug(s)) |
1841 | goto debug; | 2046 | goto debug; |
@@ -1865,6 +2070,9 @@ checks_ok: | |||
1865 | 2070 | ||
1866 | out_unlock: | 2071 | out_unlock: |
1867 | slab_unlock(page); | 2072 | slab_unlock(page); |
2073 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2074 | local_irq_restore(flags); | ||
2075 | #endif | ||
1868 | return; | 2076 | return; |
1869 | 2077 | ||
1870 | slab_empty: | 2078 | slab_empty: |
@@ -1876,6 +2084,9 @@ slab_empty: | |||
1876 | stat(s, FREE_REMOVE_PARTIAL); | 2084 | stat(s, FREE_REMOVE_PARTIAL); |
1877 | } | 2085 | } |
1878 | slab_unlock(page); | 2086 | slab_unlock(page); |
2087 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2088 | local_irq_restore(flags); | ||
2089 | #endif | ||
1879 | stat(s, FREE_SLAB); | 2090 | stat(s, FREE_SLAB); |
1880 | discard_slab(s, page); | 2091 | discard_slab(s, page); |
1881 | return; | 2092 | return; |
@@ -1902,23 +2113,56 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1902 | { | 2113 | { |
1903 | void **object = (void *)x; | 2114 | void **object = (void *)x; |
1904 | struct kmem_cache_cpu *c; | 2115 | struct kmem_cache_cpu *c; |
2116 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2117 | unsigned long tid; | ||
2118 | #else | ||
1905 | unsigned long flags; | 2119 | unsigned long flags; |
2120 | #endif | ||
1906 | 2121 | ||
1907 | slab_free_hook(s, x); | 2122 | slab_free_hook(s, x); |
1908 | 2123 | ||
2124 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
1909 | local_irq_save(flags); | 2125 | local_irq_save(flags); |
2126 | |||
2127 | #else | ||
2128 | redo: | ||
2129 | #endif | ||
2130 | |||
2131 | /* | ||
2132 | * Determine the current cpu's per cpu slab. | ||
2133 | * The cpu may change afterward. However that does not matter since | ||
2134 | * data is retrieved via this pointer. If we are on the same cpu | ||
2135 | * during the cmpxchg then the free will succeed. | ||
2136 | */ | ||
1910 | c = __this_cpu_ptr(s->cpu_slab); | 2137 | c = __this_cpu_ptr(s->cpu_slab); |
1911 | 2138 | ||
1912 | slab_free_hook_irq(s, x); | 2139 | #ifdef CONFIG_CMPXCHG_LOCAL |
2140 | tid = c->tid; | ||
2141 | barrier(); | ||
2142 | #endif | ||
1913 | 2143 | ||
1914 | if (likely(page == c->page && c->node != NUMA_NO_NODE)) { | 2144 | if (likely(page == c->page && c->node != NUMA_NO_NODE)) { |
1915 | set_freepointer(s, object, c->freelist); | 2145 | set_freepointer(s, object, c->freelist); |
2146 | |||
2147 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2148 | if (unlikely(!this_cpu_cmpxchg_double( | ||
2149 | s->cpu_slab->freelist, s->cpu_slab->tid, | ||
2150 | c->freelist, tid, | ||
2151 | object, next_tid(tid)))) { | ||
2152 | |||
2153 | note_cmpxchg_failure("slab_free", s, tid); | ||
2154 | goto redo; | ||
2155 | } | ||
2156 | #else | ||
1916 | c->freelist = object; | 2157 | c->freelist = object; |
2158 | #endif | ||
1917 | stat(s, FREE_FASTPATH); | 2159 | stat(s, FREE_FASTPATH); |
1918 | } else | 2160 | } else |
1919 | __slab_free(s, page, x, addr); | 2161 | __slab_free(s, page, x, addr); |
1920 | 2162 | ||
2163 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
1921 | local_irq_restore(flags); | 2164 | local_irq_restore(flags); |
2165 | #endif | ||
1922 | } | 2166 | } |
1923 | 2167 | ||
1924 | void kmem_cache_free(struct kmem_cache *s, void *x) | 2168 | void kmem_cache_free(struct kmem_cache *s, void *x) |
@@ -1988,13 +2232,13 @@ static int slub_nomerge; | |||
1988 | * the smallest order which will fit the object. | 2232 | * the smallest order which will fit the object. |
1989 | */ | 2233 | */ |
1990 | static inline int slab_order(int size, int min_objects, | 2234 | static inline int slab_order(int size, int min_objects, |
1991 | int max_order, int fract_leftover) | 2235 | int max_order, int fract_leftover, int reserved) |
1992 | { | 2236 | { |
1993 | int order; | 2237 | int order; |
1994 | int rem; | 2238 | int rem; |
1995 | int min_order = slub_min_order; | 2239 | int min_order = slub_min_order; |
1996 | 2240 | ||
1997 | if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) | 2241 | if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) |
1998 | return get_order(size * MAX_OBJS_PER_PAGE) - 1; | 2242 | return get_order(size * MAX_OBJS_PER_PAGE) - 1; |
1999 | 2243 | ||
2000 | for (order = max(min_order, | 2244 | for (order = max(min_order, |
@@ -2003,10 +2247,10 @@ static inline int slab_order(int size, int min_objects, | |||
2003 | 2247 | ||
2004 | unsigned long slab_size = PAGE_SIZE << order; | 2248 | unsigned long slab_size = PAGE_SIZE << order; |
2005 | 2249 | ||
2006 | if (slab_size < min_objects * size) | 2250 | if (slab_size < min_objects * size + reserved) |
2007 | continue; | 2251 | continue; |
2008 | 2252 | ||
2009 | rem = slab_size % size; | 2253 | rem = (slab_size - reserved) % size; |
2010 | 2254 | ||
2011 | if (rem <= slab_size / fract_leftover) | 2255 | if (rem <= slab_size / fract_leftover) |
2012 | break; | 2256 | break; |
@@ -2016,7 +2260,7 @@ static inline int slab_order(int size, int min_objects, | |||
2016 | return order; | 2260 | return order; |
2017 | } | 2261 | } |
2018 | 2262 | ||
2019 | static inline int calculate_order(int size) | 2263 | static inline int calculate_order(int size, int reserved) |
2020 | { | 2264 | { |
2021 | int order; | 2265 | int order; |
2022 | int min_objects; | 2266 | int min_objects; |
@@ -2034,14 +2278,14 @@ static inline int calculate_order(int size) | |||
2034 | min_objects = slub_min_objects; | 2278 | min_objects = slub_min_objects; |
2035 | if (!min_objects) | 2279 | if (!min_objects) |
2036 | min_objects = 4 * (fls(nr_cpu_ids) + 1); | 2280 | min_objects = 4 * (fls(nr_cpu_ids) + 1); |
2037 | max_objects = (PAGE_SIZE << slub_max_order)/size; | 2281 | max_objects = order_objects(slub_max_order, size, reserved); |
2038 | min_objects = min(min_objects, max_objects); | 2282 | min_objects = min(min_objects, max_objects); |
2039 | 2283 | ||
2040 | while (min_objects > 1) { | 2284 | while (min_objects > 1) { |
2041 | fraction = 16; | 2285 | fraction = 16; |
2042 | while (fraction >= 4) { | 2286 | while (fraction >= 4) { |
2043 | order = slab_order(size, min_objects, | 2287 | order = slab_order(size, min_objects, |
2044 | slub_max_order, fraction); | 2288 | slub_max_order, fraction, reserved); |
2045 | if (order <= slub_max_order) | 2289 | if (order <= slub_max_order) |
2046 | return order; | 2290 | return order; |
2047 | fraction /= 2; | 2291 | fraction /= 2; |
@@ -2053,14 +2297,14 @@ static inline int calculate_order(int size) | |||
2053 | * We were unable to place multiple objects in a slab. Now | 2297 | * We were unable to place multiple objects in a slab. Now |
2054 | * lets see if we can place a single object there. | 2298 | * lets see if we can place a single object there. |
2055 | */ | 2299 | */ |
2056 | order = slab_order(size, 1, slub_max_order, 1); | 2300 | order = slab_order(size, 1, slub_max_order, 1, reserved); |
2057 | if (order <= slub_max_order) | 2301 | if (order <= slub_max_order) |
2058 | return order; | 2302 | return order; |
2059 | 2303 | ||
2060 | /* | 2304 | /* |
2061 | * Doh this slab cannot be placed using slub_max_order. | 2305 | * Doh this slab cannot be placed using slub_max_order. |
2062 | */ | 2306 | */ |
2063 | order = slab_order(size, 1, MAX_ORDER, 1); | 2307 | order = slab_order(size, 1, MAX_ORDER, 1, reserved); |
2064 | if (order < MAX_ORDER) | 2308 | if (order < MAX_ORDER) |
2065 | return order; | 2309 | return order; |
2066 | return -ENOSYS; | 2310 | return -ENOSYS; |
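With the reserved bytes folded in, the order search shifts only slightly. Illustrative numbers: size = 512, reserved = 16, min_objects = 8, fract_leftover = 16 and the default slub_min_order of 0. Order 0 is rejected because 4096 < 8 * 512 + 16; at order 1 the leftover is (8192 - 16) % 512 = 496, which is within the allowed 8192 / 16 = 512, so slab_order() settles on order 1.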
@@ -2110,9 +2354,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) | |||
2110 | BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < | 2354 | BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < |
2111 | SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); | 2355 | SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); |
2112 | 2356 | ||
2357 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2358 | /* | ||
2359 | * Must align to double word boundary for the double cmpxchg instructions | ||
2360 | * to work. | ||
2361 | */ | ||
2362 | s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); | ||
2363 | #else | ||
2364 | /* Regular alignment is sufficient */ | ||
2113 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | 2365 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); |
2366 | #endif | ||
2367 | |||
2368 | if (!s->cpu_slab) | ||
2369 | return 0; | ||
2114 | 2370 | ||
2115 | return s->cpu_slab != NULL; | 2371 | init_kmem_cache_cpus(s); |
2372 | |||
2373 | return 1; | ||
2116 | } | 2374 | } |
2117 | 2375 | ||
2118 | static struct kmem_cache *kmem_cache_node; | 2376 | static struct kmem_cache *kmem_cache_node; |
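The stricter alignment exists because this_cpu_cmpxchg_double() (cmpxchg16b on x86-64) atomically updates two adjacent pointer-sized words, so freelist and tid must be neighbours on a double-word boundary. A sketch of the assumed layout, using an illustrative struct rather than the real kmem_cache_cpu definition:

    struct kmem_cache_cpu_sketch {
            void **freelist;        /* word 0: first free object of the cpu slab */
            unsigned long tid;      /* word 1: per-cpu transaction id            */
            /* the real struct carries further members (page, node, statistics) */
    } __aligned(2 * sizeof(void *));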
@@ -2311,7 +2569,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2311 | if (forced_order >= 0) | 2569 | if (forced_order >= 0) |
2312 | order = forced_order; | 2570 | order = forced_order; |
2313 | else | 2571 | else |
2314 | order = calculate_order(size); | 2572 | order = calculate_order(size, s->reserved); |
2315 | 2573 | ||
2316 | if (order < 0) | 2574 | if (order < 0) |
2317 | return 0; | 2575 | return 0; |
@@ -2329,8 +2587,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2329 | /* | 2587 | /* |
2330 | * Determine the number of objects per slab | 2588 | * Determine the number of objects per slab |
2331 | */ | 2589 | */ |
2332 | s->oo = oo_make(order, size); | 2590 | s->oo = oo_make(order, size, s->reserved); |
2333 | s->min = oo_make(get_order(size), size); | 2591 | s->min = oo_make(get_order(size), size, s->reserved); |
2334 | if (oo_objects(s->oo) > oo_objects(s->max)) | 2592 | if (oo_objects(s->oo) > oo_objects(s->max)) |
2335 | s->max = s->oo; | 2593 | s->max = s->oo; |
2336 | 2594 | ||
@@ -2349,6 +2607,10 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2349 | s->objsize = size; | 2607 | s->objsize = size; |
2350 | s->align = align; | 2608 | s->align = align; |
2351 | s->flags = kmem_cache_flags(size, flags, name, ctor); | 2609 | s->flags = kmem_cache_flags(size, flags, name, ctor); |
2610 | s->reserved = 0; | ||
2611 | |||
2612 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) | ||
2613 | s->reserved = sizeof(struct rcu_head); | ||
2352 | 2614 | ||
2353 | if (!calculate_sizes(s, -1)) | 2615 | if (!calculate_sizes(s, -1)) |
2354 | goto error; | 2616 | goto error; |
@@ -2399,12 +2661,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s) | |||
2399 | } | 2661 | } |
2400 | EXPORT_SYMBOL(kmem_cache_size); | 2662 | EXPORT_SYMBOL(kmem_cache_size); |
2401 | 2663 | ||
2402 | const char *kmem_cache_name(struct kmem_cache *s) | ||
2403 | { | ||
2404 | return s->name; | ||
2405 | } | ||
2406 | EXPORT_SYMBOL(kmem_cache_name); | ||
2407 | |||
2408 | static void list_slab_objects(struct kmem_cache *s, struct page *page, | 2664 | static void list_slab_objects(struct kmem_cache *s, struct page *page, |
2409 | const char *text) | 2665 | const char *text) |
2410 | { | 2666 | { |
@@ -2696,7 +2952,6 @@ EXPORT_SYMBOL(__kmalloc_node); | |||
2696 | size_t ksize(const void *object) | 2952 | size_t ksize(const void *object) |
2697 | { | 2953 | { |
2698 | struct page *page; | 2954 | struct page *page; |
2699 | struct kmem_cache *s; | ||
2700 | 2955 | ||
2701 | if (unlikely(object == ZERO_SIZE_PTR)) | 2956 | if (unlikely(object == ZERO_SIZE_PTR)) |
2702 | return 0; | 2957 | return 0; |
@@ -2707,28 +2962,8 @@ size_t ksize(const void *object) | |||
2707 | WARN_ON(!PageCompound(page)); | 2962 | WARN_ON(!PageCompound(page)); |
2708 | return PAGE_SIZE << compound_order(page); | 2963 | return PAGE_SIZE << compound_order(page); |
2709 | } | 2964 | } |
2710 | s = page->slab; | ||
2711 | |||
2712 | #ifdef CONFIG_SLUB_DEBUG | ||
2713 | /* | ||
2714 | * Debugging requires use of the padding between object | ||
2715 | * and whatever may come after it. | ||
2716 | */ | ||
2717 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | ||
2718 | return s->objsize; | ||
2719 | 2965 | ||
2720 | #endif | 2966 | return slab_ksize(page->slab); |
2721 | /* | ||
2722 | * If we have the need to store the freelist pointer | ||
2723 | * back there or track user information then we can | ||
2724 | * only use the space before that information. | ||
2725 | */ | ||
2726 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | ||
2727 | return s->inuse; | ||
2728 | /* | ||
2729 | * Else we can use all the padding etc for the allocation | ||
2730 | */ | ||
2731 | return s->size; | ||
2732 | } | 2967 | } |
2733 | EXPORT_SYMBOL(ksize); | 2968 | EXPORT_SYMBOL(ksize); |
2734 | 2969 | ||
@@ -3312,7 +3547,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) | |||
3312 | 3547 | ||
3313 | ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); | 3548 | ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); |
3314 | 3549 | ||
3315 | /* Honor the call site pointer we recieved. */ | 3550 | /* Honor the call site pointer we received. */ |
3316 | trace_kmalloc(caller, ret, size, s->size, gfpflags); | 3551 | trace_kmalloc(caller, ret, size, s->size, gfpflags); |
3317 | 3552 | ||
3318 | return ret; | 3553 | return ret; |
@@ -3342,7 +3577,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
3342 | 3577 | ||
3343 | ret = slab_alloc(s, gfpflags, node, caller); | 3578 | ret = slab_alloc(s, gfpflags, node, caller); |
3344 | 3579 | ||
3345 | /* Honor the call site pointer we recieved. */ | 3580 | /* Honor the call site pointer we received. */ |
3346 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); | 3581 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); |
3347 | 3582 | ||
3348 | return ret; | 3583 | return ret; |
@@ -4017,6 +4252,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) | |||
4017 | } | 4252 | } |
4018 | SLAB_ATTR_RO(destroy_by_rcu); | 4253 | SLAB_ATTR_RO(destroy_by_rcu); |
4019 | 4254 | ||
4255 | static ssize_t reserved_show(struct kmem_cache *s, char *buf) | ||
4256 | { | ||
4257 | return sprintf(buf, "%d\n", s->reserved); | ||
4258 | } | ||
4259 | SLAB_ATTR_RO(reserved); | ||
4260 | |||
4020 | #ifdef CONFIG_SLUB_DEBUG | 4261 | #ifdef CONFIG_SLUB_DEBUG |
4021 | static ssize_t slabs_show(struct kmem_cache *s, char *buf) | 4262 | static ssize_t slabs_show(struct kmem_cache *s, char *buf) |
4022 | { | 4263 | { |
@@ -4303,6 +4544,7 @@ static struct attribute *slab_attrs[] = { | |||
4303 | &reclaim_account_attr.attr, | 4544 | &reclaim_account_attr.attr, |
4304 | &destroy_by_rcu_attr.attr, | 4545 | &destroy_by_rcu_attr.attr, |
4305 | &shrink_attr.attr, | 4546 | &shrink_attr.attr, |
4547 | &reserved_attr.attr, | ||
4306 | #ifdef CONFIG_SLUB_DEBUG | 4548 | #ifdef CONFIG_SLUB_DEBUG |
4307 | &total_objects_attr.attr, | 4549 | &total_objects_attr.attr, |
4308 | &slabs_attr.attr, | 4550 | &slabs_attr.attr, |
diff --git a/mm/sparse.c b/mm/sparse.c index 93250207c5cf..aa64b12831a2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -500,7 +500,7 @@ void __init sparse_init(void) | |||
500 | * so alloc 2M (with 2M align) and 24 bytes in turn will | 500 | * so alloc 2M (with 2M align) and 24 bytes in turn will |
501 | * make next 2M slip to one more 2M later. | 501 | * make next 2M slip to one more 2M later. |
502 | * then in big system, the memory will have a lot of holes... | 502 | * then in big system, the memory will have a lot of holes... |
503 | * here try to allocate 2M pages continously. | 503 | * here try to allocate 2M pages continuously. |
504 | * | 504 | * |
505 | * powerpc need to call sparse_init_one_section right after each | 505 | * powerpc need to call sparse_init_one_section right after each |
506 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. | 506 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. |
@@ -39,6 +39,7 @@ int page_cluster; | |||
39 | 39 | ||
40 | static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); | 40 | static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); |
41 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 41 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
42 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | ||
42 | 43 | ||
43 | /* | 44 | /* |
44 | * This path almost never happens for VM activity - pages are normally | 45 | * This path almost never happens for VM activity - pages are normally |
@@ -178,15 +179,13 @@ void put_pages_list(struct list_head *pages) | |||
178 | } | 179 | } |
179 | EXPORT_SYMBOL(put_pages_list); | 180 | EXPORT_SYMBOL(put_pages_list); |
180 | 181 | ||
181 | /* | 182 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
182 | * pagevec_move_tail() must be called with IRQ disabled. | 183 | void (*move_fn)(struct page *page, void *arg), |
183 | * Otherwise this may cause nasty races. | 184 | void *arg) |
184 | */ | ||
185 | static void pagevec_move_tail(struct pagevec *pvec) | ||
186 | { | 185 | { |
187 | int i; | 186 | int i; |
188 | int pgmoved = 0; | ||
189 | struct zone *zone = NULL; | 187 | struct zone *zone = NULL; |
188 | unsigned long flags = 0; | ||
190 | 189 | ||
191 | for (i = 0; i < pagevec_count(pvec); i++) { | 190 | for (i = 0; i < pagevec_count(pvec); i++) { |
192 | struct page *page = pvec->pages[i]; | 191 | struct page *page = pvec->pages[i]; |
@@ -194,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
194 | 193 | ||
195 | if (pagezone != zone) { | 194 | if (pagezone != zone) { |
196 | if (zone) | 195 | if (zone) |
197 | spin_unlock(&zone->lru_lock); | 196 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
198 | zone = pagezone; | 197 | zone = pagezone; |
199 | spin_lock(&zone->lru_lock); | 198 | spin_lock_irqsave(&zone->lru_lock, flags); |
200 | } | ||
201 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
202 | int lru = page_lru_base_type(page); | ||
203 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
204 | pgmoved++; | ||
205 | } | 199 | } |
200 | |||
201 | (*move_fn)(page, arg); | ||
206 | } | 202 | } |
207 | if (zone) | 203 | if (zone) |
208 | spin_unlock(&zone->lru_lock); | 204 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
209 | __count_vm_events(PGROTATED, pgmoved); | ||
210 | release_pages(pvec->pages, pvec->nr, pvec->cold); | 205 | release_pages(pvec->pages, pvec->nr, pvec->cold); |
211 | pagevec_reinit(pvec); | 206 | pagevec_reinit(pvec); |
212 | } | 207 | } |
213 | 208 | ||
209 | static void pagevec_move_tail_fn(struct page *page, void *arg) | ||
210 | { | ||
211 | int *pgmoved = arg; | ||
212 | struct zone *zone = page_zone(page); | ||
213 | |||
214 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
215 | enum lru_list lru = page_lru_base_type(page); | ||
216 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
217 | mem_cgroup_rotate_reclaimable_page(page); | ||
218 | (*pgmoved)++; | ||
219 | } | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * pagevec_move_tail() must be called with IRQ disabled. | ||
224 | * Otherwise this may cause nasty races. | ||
225 | */ | ||
226 | static void pagevec_move_tail(struct pagevec *pvec) | ||
227 | { | ||
228 | int pgmoved = 0; | ||
229 | |||
230 | pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); | ||
231 | __count_vm_events(PGROTATED, pgmoved); | ||
232 | } | ||
233 | |||
214 | /* | 234 | /* |
215 | * Writeback is about to end against a page which has been marked for immediate | 235 | * Writeback is about to end against a page which has been marked for immediate |
216 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 236 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
217 | * inactive list. | 237 | * inactive list. |
218 | */ | 238 | */ |
219 | void rotate_reclaimable_page(struct page *page) | 239 | void rotate_reclaimable_page(struct page *page) |
220 | { | 240 | { |
221 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && | 241 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && |
222 | !PageUnevictable(page) && PageLRU(page)) { | 242 | !PageUnevictable(page) && PageLRU(page)) { |
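The hunk above replaces the open-coded pagevec_move_tail() loop with a generic pagevec_lru_move_fn() that batches the per-zone lru_lock and hands each page to a callback. The following is a minimal userspace sketch of that callback-batching pattern only; the pagevec, zone and lock types here are simplified stand-ins, not the kernel's.

#include <stdio.h>
#include <pthread.h>

#define PVEC_SIZE 14                            /* same spirit as PAGEVEC_SIZE */

struct zone { pthread_mutex_t lru_lock; int id; };
struct page { struct zone *zone; int deferred; };
struct pagevec { int nr; struct page *pages[PVEC_SIZE]; };

/* Generic driver: take each page's zone lock only when the zone changes,
 * then hand the page to move_fn -- mirroring pagevec_lru_move_fn(). */
static void pagevec_lru_move_fn(struct pagevec *pvec,
                                void (*move_fn)(struct page *, void *),
                                void *arg)
{
        struct zone *zone = NULL;
        int i;

        for (i = 0; i < pvec->nr; i++) {
                struct page *page = pvec->pages[i];

                if (page->zone != zone) {
                        if (zone)
                                pthread_mutex_unlock(&zone->lru_lock);
                        zone = page->zone;
                        pthread_mutex_lock(&zone->lru_lock);
                }
                move_fn(page, arg);
        }
        if (zone)
                pthread_mutex_unlock(&zone->lru_lock);
        pvec->nr = 0;                           /* pagevec_reinit() equivalent */
}

/* One concrete callback, standing in for pagevec_move_tail_fn(). */
static void count_deferred(struct page *page, void *arg)
{
        int *moved = arg;

        if (page->deferred)
                (*moved)++;
}

int main(void)
{
        struct zone z0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct zone z1 = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct page p[3] = { { &z0, 1 }, { &z0, 0 }, { &z1, 1 } };
        struct pagevec pvec = { 3, { &p[0], &p[1], &p[2] } };
        int moved = 0;

        pagevec_lru_move_fn(&pvec, count_deferred, &moved);
        printf("moved %d of 3 pages\n", moved);
        return 0;
}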
@@ -347,6 +367,71 @@ void add_page_to_unevictable_list(struct page *page) | |||
347 | } | 367 | } |
348 | 368 | ||
349 | /* | 369 | /* |
370 | * If the page can not be invalidated, it is moved to the | ||
371 | * inactive list to speed up its reclaim. It is moved to the | ||
372 | * head of the list, rather than the tail, to give the flusher | ||
373 | * threads some time to write it out, as this is much more | ||
374 | * effective than the single-page writeout from reclaim. | ||
375 | * | ||
376 | * If the page is not mapped but is dirty or under writeback, it can | ||
377 | * be reclaimed ASAP once writeback completes, by setting PG_reclaim. | ||
378 | * | ||
379 | * 1. active, mapped page -> none | ||
380 | * 2. active, dirty/writeback page -> inactive, head, PG_reclaim | ||
381 | * 3. inactive, mapped page -> none | ||
382 | * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim | ||
383 | * 5. inactive, clean -> inactive, tail | ||
384 | * 6. Others -> none | ||
385 | * | ||
386 | * In case 4, the page is moved to the head of the inactive list because | ||
387 | * the VM expects the flusher threads to write it out, which is much more | ||
388 | * effective than the single-page writeout from reclaim. | ||
389 | */ | ||
390 | static void lru_deactivate_fn(struct page *page, void *arg) | ||
391 | { | ||
392 | int lru, file; | ||
393 | bool active; | ||
394 | struct zone *zone = page_zone(page); | ||
395 | |||
396 | if (!PageLRU(page)) | ||
397 | return; | ||
398 | |||
399 | /* Some processes are using the page */ | ||
400 | if (page_mapped(page)) | ||
401 | return; | ||
402 | |||
403 | active = PageActive(page); | ||
404 | |||
405 | file = page_is_file_cache(page); | ||
406 | lru = page_lru_base_type(page); | ||
407 | del_page_from_lru_list(zone, page, lru + active); | ||
408 | ClearPageActive(page); | ||
409 | ClearPageReferenced(page); | ||
410 | add_page_to_lru_list(zone, page, lru); | ||
411 | |||
412 | if (PageWriteback(page) || PageDirty(page)) { | ||
413 | /* | ||
414 | * PG_reclaim could be raced with end_page_writeback | ||
415 | * It can make readahead confusing. But race window | ||
416 | * is _really_ small and it's non-critical problem. | ||
417 | */ | ||
418 | SetPageReclaim(page); | ||
419 | } else { | ||
420 | /* | ||
421 | * The page's writeback has already finished while it sat in the | ||
422 | * pagevec, so move the page to the tail of the inactive list. | ||
423 | */ | ||
424 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
425 | mem_cgroup_rotate_reclaimable_page(page); | ||
426 | __count_vm_event(PGROTATED); | ||
427 | } | ||
428 | |||
429 | if (active) | ||
430 | __count_vm_event(PGDEACTIVATE); | ||
431 | update_page_reclaim_stat(zone, page, file, 0); | ||
432 | } | ||
433 | |||
434 | /* | ||
350 | * Drain pages out of the cpu's pagevecs. | 435 | * Drain pages out of the cpu's pagevecs. |
351 | * Either "cpu" is the current CPU, and preemption has already been | 436 | * Either "cpu" is the current CPU, and preemption has already been |
352 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | 437 | * disabled; or "cpu" is being hot-unplugged, and is already dead. |
@@ -372,6 +457,29 @@ static void drain_cpu_pagevecs(int cpu) | |||
372 | pagevec_move_tail(pvec); | 457 | pagevec_move_tail(pvec); |
373 | local_irq_restore(flags); | 458 | local_irq_restore(flags); |
374 | } | 459 | } |
460 | |||
461 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | ||
462 | if (pagevec_count(pvec)) | ||
463 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | ||
464 | } | ||
465 | |||
466 | /** | ||
467 | * deactivate_page - forcefully deactivate a page | ||
468 | * @page: page to deactivate | ||
469 | * | ||
470 | * This function hints the VM that @page is a good reclaim candidate, | ||
471 | * for example if its invalidation fails due to the page being dirty | ||
472 | * or under writeback. | ||
473 | */ | ||
474 | void deactivate_page(struct page *page) | ||
475 | { | ||
476 | if (likely(get_page_unless_zero(page))) { | ||
477 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | ||
478 | |||
479 | if (!pagevec_add(pvec, page)) | ||
480 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | ||
481 | put_cpu_var(lru_deactivate_pvecs); | ||
482 | } | ||
375 | } | 483 | } |
376 | 484 | ||
377 | void lru_add_drain(void) | 485 | void lru_add_drain(void) |
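The comment block and lru_deactivate_fn() above boil down to a small decision table for pages whose invalidation failed. Below is a minimal userspace sketch of just that table; the enum and helper names are hypothetical, and the sketch models only the decision, not the LRU lists, PG_reclaim handling or statistics.

#include <stdio.h>
#include <stdbool.h>

enum action { LEAVE_ALONE, INACTIVE_HEAD_RECLAIM, INACTIVE_TAIL };

/* Decision table from the lru_deactivate_fn() comment: mapped pages are
 * left alone; dirty/writeback pages go to the head of the inactive list
 * with PG_reclaim set; clean pages go to the tail. */
static enum action deactivate_decision(bool on_lru, bool mapped,
                                       bool dirty_or_writeback)
{
        if (!on_lru || mapped)
                return LEAVE_ALONE;             /* cases 1, 3, 6 */
        if (dirty_or_writeback)
                return INACTIVE_HEAD_RECLAIM;   /* cases 2, 4 */
        return INACTIVE_TAIL;                   /* case 5 */
}

int main(void)
{
        static const char *names[] = {
                "leave alone", "inactive head + PG_reclaim", "inactive tail"
        };
        struct { bool lru, mapped, dirty; } cases[] = {
                { true,  true,  false },        /* mapped page               */
                { true,  false, true  },        /* dirty or under writeback  */
                { true,  false, false },        /* unmapped, clean           */
                { false, false, false },        /* not on the LRU at all     */
        };
        for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                printf("lru=%d mapped=%d dirty=%d -> %s\n",
                       cases[i].lru, cases[i].mapped, cases[i].dirty,
                       names[deactivate_decision(cases[i].lru,
                                                 cases[i].mapped,
                                                 cases[i].dirty)]);
        return 0;
}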
@@ -516,44 +624,33 @@ void lru_add_page_tail(struct zone* zone, | |||
516 | } | 624 | } |
517 | } | 625 | } |
518 | 626 | ||
627 | static void ____pagevec_lru_add_fn(struct page *page, void *arg) | ||
628 | { | ||
629 | enum lru_list lru = (enum lru_list)arg; | ||
630 | struct zone *zone = page_zone(page); | ||
631 | int file = is_file_lru(lru); | ||
632 | int active = is_active_lru(lru); | ||
633 | |||
634 | VM_BUG_ON(PageActive(page)); | ||
635 | VM_BUG_ON(PageUnevictable(page)); | ||
636 | VM_BUG_ON(PageLRU(page)); | ||
637 | |||
638 | SetPageLRU(page); | ||
639 | if (active) | ||
640 | SetPageActive(page); | ||
641 | update_page_reclaim_stat(zone, page, file, active); | ||
642 | add_page_to_lru_list(zone, page, lru); | ||
643 | } | ||
644 | |||
519 | /* | 645 | /* |
520 | * Add the passed pages to the LRU, then drop the caller's refcount | 646 | * Add the passed pages to the LRU, then drop the caller's refcount |
521 | * on them. Reinitialises the caller's pagevec. | 647 | * on them. Reinitialises the caller's pagevec. |
522 | */ | 648 | */ |
523 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | 649 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) |
524 | { | 650 | { |
525 | int i; | ||
526 | struct zone *zone = NULL; | ||
527 | |||
528 | VM_BUG_ON(is_unevictable_lru(lru)); | 651 | VM_BUG_ON(is_unevictable_lru(lru)); |
529 | 652 | ||
530 | for (i = 0; i < pagevec_count(pvec); i++) { | 653 | pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); |
531 | struct page *page = pvec->pages[i]; | ||
532 | struct zone *pagezone = page_zone(page); | ||
533 | int file; | ||
534 | int active; | ||
535 | |||
536 | if (pagezone != zone) { | ||
537 | if (zone) | ||
538 | spin_unlock_irq(&zone->lru_lock); | ||
539 | zone = pagezone; | ||
540 | spin_lock_irq(&zone->lru_lock); | ||
541 | } | ||
542 | VM_BUG_ON(PageActive(page)); | ||
543 | VM_BUG_ON(PageUnevictable(page)); | ||
544 | VM_BUG_ON(PageLRU(page)); | ||
545 | SetPageLRU(page); | ||
546 | active = is_active_lru(lru); | ||
547 | file = is_file_lru(lru); | ||
548 | if (active) | ||
549 | SetPageActive(page); | ||
550 | update_page_reclaim_stat(zone, page, file, active); | ||
551 | add_page_to_lru_list(zone, page, lru); | ||
552 | } | ||
553 | if (zone) | ||
554 | spin_unlock_irq(&zone->lru_lock); | ||
555 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
556 | pagevec_reinit(pvec); | ||
557 | } | 654 | } |
558 | 655 | ||
559 | EXPORT_SYMBOL(____pagevec_lru_add); | 656 | EXPORT_SYMBOL(____pagevec_lru_add); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 5c8cfabbc9bc..46680461785b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -24,12 +24,10 @@ | |||
24 | 24 | ||
25 | /* | 25 | /* |
26 | * swapper_space is a fiction, retained to simplify the path through | 26 | * swapper_space is a fiction, retained to simplify the path through |
27 | * vmscan's shrink_page_list, to make sync_page look nicer, and to allow | 27 | * vmscan's shrink_page_list. |
28 | * future use of radix_tree tags in the swap cache. | ||
29 | */ | 28 | */ |
30 | static const struct address_space_operations swap_aops = { | 29 | static const struct address_space_operations swap_aops = { |
31 | .writepage = swap_writepage, | 30 | .writepage = swap_writepage, |
32 | .sync_page = block_sync_page, | ||
33 | .set_page_dirty = __set_page_dirty_nobuffers, | 31 | .set_page_dirty = __set_page_dirty_nobuffers, |
34 | .migratepage = migrate_page, | 32 | .migratepage = migrate_page, |
35 | }; | 33 | }; |
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = { | |||
37 | static struct backing_dev_info swap_backing_dev_info = { | 35 | static struct backing_dev_info swap_backing_dev_info = { |
38 | .name = "swap", | 36 | .name = "swap", |
39 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
40 | .unplug_io_fn = swap_unplug_io_fn, | ||
41 | }; | 38 | }; |
42 | 39 | ||
43 | struct address_space swapper_space = { | 40 | struct address_space swapper_space = { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 0341c5700e34..8c6b3ce38f09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | |||
95 | } | 95 | } |
96 | 96 | ||
97 | /* | 97 | /* |
98 | * We need this because the bdev->unplug_fn can sleep and we cannot | ||
99 | * hold swap_lock while calling the unplug_fn. And swap_lock | ||
100 | * cannot be turned into a mutex. | ||
101 | */ | ||
102 | static DECLARE_RWSEM(swap_unplug_sem); | ||
103 | |||
104 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | ||
105 | { | ||
106 | swp_entry_t entry; | ||
107 | |||
108 | down_read(&swap_unplug_sem); | ||
109 | entry.val = page_private(page); | ||
110 | if (PageSwapCache(page)) { | ||
111 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; | ||
112 | struct backing_dev_info *bdi; | ||
113 | |||
114 | /* | ||
115 | * If the page is removed from swapcache from under us (with a | ||
116 | * racy try_to_unuse/swapoff) we need an additional reference | ||
117 | * count to avoid reading garbage from page_private(page) above. | ||
118 | * If the WARN_ON triggers during a swapoff it maybe the race | ||
119 | * condition and it's harmless. However if it triggers without | ||
120 | * swapoff it signals a problem. | ||
121 | */ | ||
122 | WARN_ON(page_count(page) <= 1); | ||
123 | |||
124 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; | ||
125 | blk_run_backing_dev(bdi, page); | ||
126 | } | ||
127 | up_read(&swap_unplug_sem); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * swapon tell device that all the old swap contents can be discarded, | 98 | * swapon tell device that all the old swap contents can be discarded, |
132 | * to allow the swap device to optimize its wear-levelling. | 99 | * to allow the swap device to optimize its wear-levelling. |
133 | */ | 100 | */ |
@@ -212,8 +179,8 @@ static int wait_for_discard(void *word) | |||
212 | #define SWAPFILE_CLUSTER 256 | 179 | #define SWAPFILE_CLUSTER 256 |
213 | #define LATENCY_LIMIT 256 | 180 | #define LATENCY_LIMIT 256 |
214 | 181 | ||
215 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 182 | static unsigned long scan_swap_map(struct swap_info_struct *si, |
216 | unsigned char usage) | 183 | unsigned char usage) |
217 | { | 184 | { |
218 | unsigned long offset; | 185 | unsigned long offset; |
219 | unsigned long scan_base; | 186 | unsigned long scan_base; |
@@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free) | |||
880 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 847 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
881 | unsigned long addr, swp_entry_t entry, struct page *page) | 848 | unsigned long addr, swp_entry_t entry, struct page *page) |
882 | { | 849 | { |
883 | struct mem_cgroup *ptr = NULL; | 850 | struct mem_cgroup *ptr; |
884 | spinlock_t *ptl; | 851 | spinlock_t *ptl; |
885 | pte_t *pte; | 852 | pte_t *pte; |
886 | int ret = 1; | 853 | int ret = 1; |
@@ -1550,6 +1517,36 @@ bad_bmap: | |||
1550 | goto out; | 1517 | goto out; |
1551 | } | 1518 | } |
1552 | 1519 | ||
1520 | static void enable_swap_info(struct swap_info_struct *p, int prio, | ||
1521 | unsigned char *swap_map) | ||
1522 | { | ||
1523 | int i, prev; | ||
1524 | |||
1525 | spin_lock(&swap_lock); | ||
1526 | if (prio >= 0) | ||
1527 | p->prio = prio; | ||
1528 | else | ||
1529 | p->prio = --least_priority; | ||
1530 | p->swap_map = swap_map; | ||
1531 | p->flags |= SWP_WRITEOK; | ||
1532 | nr_swap_pages += p->pages; | ||
1533 | total_swap_pages += p->pages; | ||
1534 | |||
1535 | /* insert swap space into swap_list: */ | ||
1536 | prev = -1; | ||
1537 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | ||
1538 | if (p->prio >= swap_info[i]->prio) | ||
1539 | break; | ||
1540 | prev = i; | ||
1541 | } | ||
1542 | p->next = i; | ||
1543 | if (prev < 0) | ||
1544 | swap_list.head = swap_list.next = p->type; | ||
1545 | else | ||
1546 | swap_info[prev]->next = p->type; | ||
1547 | spin_unlock(&swap_lock); | ||
1548 | } | ||
1549 | |||
1553 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1550 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1554 | { | 1551 | { |
1555 | struct swap_info_struct *p = NULL; | 1552 | struct swap_info_struct *p = NULL; |
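enable_swap_info() above re-inserts a swap area into the singly linked swap_list, which is kept sorted by descending priority. A small standalone sketch of that insertion loop follows; the array-plus-index layout mirrors swap_info[]/swap_list but is otherwise a simplified stand-in, with no locking.

#include <stdio.h>

#define MAX_SWAPFILES 8

struct swap_info { int prio; int next; };       /* next: index or -1 */
struct swap_list_t { int head; int next; };

static struct swap_info swap_info[MAX_SWAPFILES];
static struct swap_list_t swap_list = { -1, -1 };

/* Insert entry 'type' so the list stays sorted by descending priority,
 * exactly like the loop in enable_swap_info(). */
static void insert_swap_entry(int type)
{
        int i, prev = -1;

        for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
                if (swap_info[type].prio >= swap_info[i].prio)
                        break;
                prev = i;
        }
        swap_info[type].next = i;
        if (prev < 0)
                swap_list.head = swap_list.next = type;
        else
                swap_info[prev].next = type;
}

int main(void)
{
        int prios[] = { 5, -1, 10 };    /* e.g. two prioritised areas, one default */
        int i;

        for (i = 0; i < 3; i++) {
                swap_info[i].prio = prios[i];
                insert_swap_entry(i);
        }
        for (i = swap_list.head; i >= 0; i = swap_info[i].next)
                printf("swap area %d, prio %d\n", i, swap_info[i].prio);
        return 0;
}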
@@ -1621,32 +1618,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1621 | current->flags &= ~PF_OOM_ORIGIN; | 1618 | current->flags &= ~PF_OOM_ORIGIN; |
1622 | 1619 | ||
1623 | if (err) { | 1620 | if (err) { |
1621 | /* | ||
1622 | * reading p->prio and p->swap_map outside the lock is | ||
1623 | * safe here because only sys_swapon and sys_swapoff | ||
1624 | * change them, and there can be no other sys_swapon or | ||
1625 | * sys_swapoff for this swap_info_struct at this point. | ||
1626 | */ | ||
1624 | /* re-insert swap space back into swap_list */ | 1627 | /* re-insert swap space back into swap_list */ |
1625 | spin_lock(&swap_lock); | 1628 | enable_swap_info(p, p->prio, p->swap_map); |
1626 | if (p->prio < 0) | ||
1627 | p->prio = --least_priority; | ||
1628 | prev = -1; | ||
1629 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | ||
1630 | if (p->prio >= swap_info[i]->prio) | ||
1631 | break; | ||
1632 | prev = i; | ||
1633 | } | ||
1634 | p->next = i; | ||
1635 | if (prev < 0) | ||
1636 | swap_list.head = swap_list.next = type; | ||
1637 | else | ||
1638 | swap_info[prev]->next = type; | ||
1639 | nr_swap_pages += p->pages; | ||
1640 | total_swap_pages += p->pages; | ||
1641 | p->flags |= SWP_WRITEOK; | ||
1642 | spin_unlock(&swap_lock); | ||
1643 | goto out_dput; | 1629 | goto out_dput; |
1644 | } | 1630 | } |
1645 | 1631 | ||
1646 | /* wait for any unplug function to finish */ | ||
1647 | down_write(&swap_unplug_sem); | ||
1648 | up_write(&swap_unplug_sem); | ||
1649 | |||
1650 | destroy_swap_extents(p); | 1632 | destroy_swap_extents(p); |
1651 | if (p->flags & SWP_CONTINUED) | 1633 | if (p->flags & SWP_CONTINUED) |
1652 | free_swap_count_continuations(p); | 1634 | free_swap_count_continuations(p); |
@@ -1844,49 +1826,24 @@ static int __init max_swapfiles_check(void) | |||
1844 | late_initcall(max_swapfiles_check); | 1826 | late_initcall(max_swapfiles_check); |
1845 | #endif | 1827 | #endif |
1846 | 1828 | ||
1847 | /* | 1829 | static struct swap_info_struct *alloc_swap_info(void) |
1848 | * Written 01/25/92 by Simmule Turner, heavily changed by Linus. | ||
1849 | * | ||
1850 | * The swapon system call | ||
1851 | */ | ||
1852 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | ||
1853 | { | 1830 | { |
1854 | struct swap_info_struct *p; | 1831 | struct swap_info_struct *p; |
1855 | char *name = NULL; | ||
1856 | struct block_device *bdev = NULL; | ||
1857 | struct file *swap_file = NULL; | ||
1858 | struct address_space *mapping; | ||
1859 | unsigned int type; | 1832 | unsigned int type; |
1860 | int i, prev; | ||
1861 | int error; | ||
1862 | union swap_header *swap_header; | ||
1863 | unsigned int nr_good_pages; | ||
1864 | int nr_extents = 0; | ||
1865 | sector_t span; | ||
1866 | unsigned long maxpages; | ||
1867 | unsigned long swapfilepages; | ||
1868 | unsigned char *swap_map = NULL; | ||
1869 | struct page *page = NULL; | ||
1870 | struct inode *inode = NULL; | ||
1871 | int did_down = 0; | ||
1872 | |||
1873 | if (!capable(CAP_SYS_ADMIN)) | ||
1874 | return -EPERM; | ||
1875 | 1833 | ||
1876 | p = kzalloc(sizeof(*p), GFP_KERNEL); | 1834 | p = kzalloc(sizeof(*p), GFP_KERNEL); |
1877 | if (!p) | 1835 | if (!p) |
1878 | return -ENOMEM; | 1836 | return ERR_PTR(-ENOMEM); |
1879 | 1837 | ||
1880 | spin_lock(&swap_lock); | 1838 | spin_lock(&swap_lock); |
1881 | for (type = 0; type < nr_swapfiles; type++) { | 1839 | for (type = 0; type < nr_swapfiles; type++) { |
1882 | if (!(swap_info[type]->flags & SWP_USED)) | 1840 | if (!(swap_info[type]->flags & SWP_USED)) |
1883 | break; | 1841 | break; |
1884 | } | 1842 | } |
1885 | error = -EPERM; | ||
1886 | if (type >= MAX_SWAPFILES) { | 1843 | if (type >= MAX_SWAPFILES) { |
1887 | spin_unlock(&swap_lock); | 1844 | spin_unlock(&swap_lock); |
1888 | kfree(p); | 1845 | kfree(p); |
1889 | goto out; | 1846 | return ERR_PTR(-EPERM); |
1890 | } | 1847 | } |
1891 | if (type >= nr_swapfiles) { | 1848 | if (type >= nr_swapfiles) { |
1892 | p->type = type; | 1849 | p->type = type; |
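alloc_swap_info() above drops the error variable and goto labels in favour of returning an error-encoded pointer, which the new sys_swapon() checks with IS_ERR()/PTR_ERR(). The sketch below re-implements that idiom in userspace purely for illustration; it is not the kernel's <linux/err.h>, and the failure condition is a made-up stand-in for "no free slot".

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Userspace re-implementation of the kernel's error-pointer helpers:
 * small negative errno values live at the very top of the address space,
 * so one pointer return carries either an object or an error code. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct swap_info { int type; };

static struct swap_info *alloc_swap_info(int free_slots)
{
        struct swap_info *p;

        if (!free_slots)
                return ERR_PTR(-EPERM);         /* like type >= MAX_SWAPFILES */
        p = calloc(1, sizeof(*p));
        if (!p)
                return ERR_PTR(-ENOMEM);
        return p;
}

int main(void)
{
        struct swap_info *p = alloc_swap_info(0);

        if (IS_ERR(p)) {
                printf("alloc_swap_info failed: %ld\n", PTR_ERR(p));
        } else {
                printf("allocated swap_info %p\n", (void *)p);
                free(p);
        }
        return 0;
}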
@@ -1911,81 +1868,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1911 | p->next = -1; | 1868 | p->next = -1; |
1912 | spin_unlock(&swap_lock); | 1869 | spin_unlock(&swap_lock); |
1913 | 1870 | ||
1914 | name = getname(specialfile); | 1871 | return p; |
1915 | error = PTR_ERR(name); | 1872 | } |
1916 | if (IS_ERR(name)) { | ||
1917 | name = NULL; | ||
1918 | goto bad_swap_2; | ||
1919 | } | ||
1920 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); | ||
1921 | error = PTR_ERR(swap_file); | ||
1922 | if (IS_ERR(swap_file)) { | ||
1923 | swap_file = NULL; | ||
1924 | goto bad_swap_2; | ||
1925 | } | ||
1926 | |||
1927 | p->swap_file = swap_file; | ||
1928 | mapping = swap_file->f_mapping; | ||
1929 | inode = mapping->host; | ||
1930 | |||
1931 | error = -EBUSY; | ||
1932 | for (i = 0; i < nr_swapfiles; i++) { | ||
1933 | struct swap_info_struct *q = swap_info[i]; | ||
1934 | 1873 | ||
1935 | if (i == type || !q->swap_file) | 1874 | static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) |
1936 | continue; | 1875 | { |
1937 | if (mapping == q->swap_file->f_mapping) | 1876 | int error; |
1938 | goto bad_swap; | ||
1939 | } | ||
1940 | 1877 | ||
1941 | error = -EINVAL; | ||
1942 | if (S_ISBLK(inode->i_mode)) { | 1878 | if (S_ISBLK(inode->i_mode)) { |
1943 | bdev = bdgrab(I_BDEV(inode)); | 1879 | p->bdev = bdgrab(I_BDEV(inode)); |
1944 | error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, | 1880 | error = blkdev_get(p->bdev, |
1881 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, | ||
1945 | sys_swapon); | 1882 | sys_swapon); |
1946 | if (error < 0) { | 1883 | if (error < 0) { |
1947 | bdev = NULL; | 1884 | p->bdev = NULL; |
1948 | error = -EINVAL; | 1885 | return -EINVAL; |
1949 | goto bad_swap; | ||
1950 | } | 1886 | } |
1951 | p->old_block_size = block_size(bdev); | 1887 | p->old_block_size = block_size(p->bdev); |
1952 | error = set_blocksize(bdev, PAGE_SIZE); | 1888 | error = set_blocksize(p->bdev, PAGE_SIZE); |
1953 | if (error < 0) | 1889 | if (error < 0) |
1954 | goto bad_swap; | 1890 | return error; |
1955 | p->bdev = bdev; | ||
1956 | p->flags |= SWP_BLKDEV; | 1891 | p->flags |= SWP_BLKDEV; |
1957 | } else if (S_ISREG(inode->i_mode)) { | 1892 | } else if (S_ISREG(inode->i_mode)) { |
1958 | p->bdev = inode->i_sb->s_bdev; | 1893 | p->bdev = inode->i_sb->s_bdev; |
1959 | mutex_lock(&inode->i_mutex); | 1894 | mutex_lock(&inode->i_mutex); |
1960 | did_down = 1; | 1895 | if (IS_SWAPFILE(inode)) |
1961 | if (IS_SWAPFILE(inode)) { | 1896 | return -EBUSY; |
1962 | error = -EBUSY; | 1897 | } else |
1963 | goto bad_swap; | 1898 | return -EINVAL; |
1964 | } | ||
1965 | } else { | ||
1966 | goto bad_swap; | ||
1967 | } | ||
1968 | 1899 | ||
1969 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; | 1900 | return 0; |
1901 | } | ||
1970 | 1902 | ||
1971 | /* | 1903 | static unsigned long read_swap_header(struct swap_info_struct *p, |
1972 | * Read the swap header. | 1904 | union swap_header *swap_header, |
1973 | */ | 1905 | struct inode *inode) |
1974 | if (!mapping->a_ops->readpage) { | 1906 | { |
1975 | error = -EINVAL; | 1907 | int i; |
1976 | goto bad_swap; | 1908 | unsigned long maxpages; |
1977 | } | 1909 | unsigned long swapfilepages; |
1978 | page = read_mapping_page(mapping, 0, swap_file); | ||
1979 | if (IS_ERR(page)) { | ||
1980 | error = PTR_ERR(page); | ||
1981 | goto bad_swap; | ||
1982 | } | ||
1983 | swap_header = kmap(page); | ||
1984 | 1910 | ||
1985 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { | 1911 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1986 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 1912 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
1987 | error = -EINVAL; | 1913 | return 0; |
1988 | goto bad_swap; | ||
1989 | } | 1914 | } |
1990 | 1915 | ||
1991 | /* swap partition endianess hack... */ | 1916 | /* swap partition endianess hack... */ |
@@ -2001,8 +1926,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2001 | printk(KERN_WARNING | 1926 | printk(KERN_WARNING |
2002 | "Unable to handle swap header version %d\n", | 1927 | "Unable to handle swap header version %d\n", |
2003 | swap_header->info.version); | 1928 | swap_header->info.version); |
2004 | error = -EINVAL; | 1929 | return 0; |
2005 | goto bad_swap; | ||
2006 | } | 1930 | } |
2007 | 1931 | ||
2008 | p->lowest_bit = 1; | 1932 | p->lowest_bit = 1; |
@@ -2033,61 +1957,155 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2033 | } | 1957 | } |
2034 | p->highest_bit = maxpages - 1; | 1958 | p->highest_bit = maxpages - 1; |
2035 | 1959 | ||
2036 | error = -EINVAL; | ||
2037 | if (!maxpages) | 1960 | if (!maxpages) |
2038 | goto bad_swap; | 1961 | return 0; |
1962 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; | ||
2039 | if (swapfilepages && maxpages > swapfilepages) { | 1963 | if (swapfilepages && maxpages > swapfilepages) { |
2040 | printk(KERN_WARNING | 1964 | printk(KERN_WARNING |
2041 | "Swap area shorter than signature indicates\n"); | 1965 | "Swap area shorter than signature indicates\n"); |
2042 | goto bad_swap; | 1966 | return 0; |
2043 | } | 1967 | } |
2044 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1968 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
2045 | goto bad_swap; | 1969 | return 0; |
2046 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1970 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
2047 | goto bad_swap; | 1971 | return 0; |
2048 | 1972 | ||
2049 | /* OK, set up the swap map and apply the bad block list */ | 1973 | return maxpages; |
2050 | swap_map = vmalloc(maxpages); | 1974 | } |
2051 | if (!swap_map) { | 1975 | |
2052 | error = -ENOMEM; | 1976 | static int setup_swap_map_and_extents(struct swap_info_struct *p, |
2053 | goto bad_swap; | 1977 | union swap_header *swap_header, |
2054 | } | 1978 | unsigned char *swap_map, |
1979 | unsigned long maxpages, | ||
1980 | sector_t *span) | ||
1981 | { | ||
1982 | int i; | ||
1983 | unsigned int nr_good_pages; | ||
1984 | int nr_extents; | ||
2055 | 1985 | ||
2056 | memset(swap_map, 0, maxpages); | ||
2057 | nr_good_pages = maxpages - 1; /* omit header page */ | 1986 | nr_good_pages = maxpages - 1; /* omit header page */ |
2058 | 1987 | ||
2059 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1988 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
2060 | unsigned int page_nr = swap_header->info.badpages[i]; | 1989 | unsigned int page_nr = swap_header->info.badpages[i]; |
2061 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { | 1990 | if (page_nr == 0 || page_nr > swap_header->info.last_page) |
2062 | error = -EINVAL; | 1991 | return -EINVAL; |
2063 | goto bad_swap; | ||
2064 | } | ||
2065 | if (page_nr < maxpages) { | 1992 | if (page_nr < maxpages) { |
2066 | swap_map[page_nr] = SWAP_MAP_BAD; | 1993 | swap_map[page_nr] = SWAP_MAP_BAD; |
2067 | nr_good_pages--; | 1994 | nr_good_pages--; |
2068 | } | 1995 | } |
2069 | } | 1996 | } |
2070 | 1997 | ||
2071 | error = swap_cgroup_swapon(type, maxpages); | ||
2072 | if (error) | ||
2073 | goto bad_swap; | ||
2074 | |||
2075 | if (nr_good_pages) { | 1998 | if (nr_good_pages) { |
2076 | swap_map[0] = SWAP_MAP_BAD; | 1999 | swap_map[0] = SWAP_MAP_BAD; |
2077 | p->max = maxpages; | 2000 | p->max = maxpages; |
2078 | p->pages = nr_good_pages; | 2001 | p->pages = nr_good_pages; |
2079 | nr_extents = setup_swap_extents(p, &span); | 2002 | nr_extents = setup_swap_extents(p, span); |
2080 | if (nr_extents < 0) { | 2003 | if (nr_extents < 0) |
2081 | error = nr_extents; | 2004 | return nr_extents; |
2082 | goto bad_swap; | ||
2083 | } | ||
2084 | nr_good_pages = p->pages; | 2005 | nr_good_pages = p->pages; |
2085 | } | 2006 | } |
2086 | if (!nr_good_pages) { | 2007 | if (!nr_good_pages) { |
2087 | printk(KERN_WARNING "Empty swap-file\n"); | 2008 | printk(KERN_WARNING "Empty swap-file\n"); |
2009 | return -EINVAL; | ||
2010 | } | ||
2011 | |||
2012 | return nr_extents; | ||
2013 | } | ||
2014 | |||
2015 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | ||
2016 | { | ||
2017 | struct swap_info_struct *p; | ||
2018 | char *name; | ||
2019 | struct file *swap_file = NULL; | ||
2020 | struct address_space *mapping; | ||
2021 | int i; | ||
2022 | int prio; | ||
2023 | int error; | ||
2024 | union swap_header *swap_header; | ||
2025 | int nr_extents; | ||
2026 | sector_t span; | ||
2027 | unsigned long maxpages; | ||
2028 | unsigned char *swap_map = NULL; | ||
2029 | struct page *page = NULL; | ||
2030 | struct inode *inode = NULL; | ||
2031 | |||
2032 | if (!capable(CAP_SYS_ADMIN)) | ||
2033 | return -EPERM; | ||
2034 | |||
2035 | p = alloc_swap_info(); | ||
2036 | if (IS_ERR(p)) | ||
2037 | return PTR_ERR(p); | ||
2038 | |||
2039 | name = getname(specialfile); | ||
2040 | if (IS_ERR(name)) { | ||
2041 | error = PTR_ERR(name); | ||
2042 | name = NULL; | ||
2043 | goto bad_swap; | ||
2044 | } | ||
2045 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); | ||
2046 | if (IS_ERR(swap_file)) { | ||
2047 | error = PTR_ERR(swap_file); | ||
2048 | swap_file = NULL; | ||
2049 | goto bad_swap; | ||
2050 | } | ||
2051 | |||
2052 | p->swap_file = swap_file; | ||
2053 | mapping = swap_file->f_mapping; | ||
2054 | |||
2055 | for (i = 0; i < nr_swapfiles; i++) { | ||
2056 | struct swap_info_struct *q = swap_info[i]; | ||
2057 | |||
2058 | if (q == p || !q->swap_file) | ||
2059 | continue; | ||
2060 | if (mapping == q->swap_file->f_mapping) { | ||
2061 | error = -EBUSY; | ||
2062 | goto bad_swap; | ||
2063 | } | ||
2064 | } | ||
2065 | |||
2066 | inode = mapping->host; | ||
2067 | /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex) */ | ||
2068 | error = claim_swapfile(p, inode); | ||
2069 | if (unlikely(error)) | ||
2070 | goto bad_swap; | ||
2071 | |||
2072 | /* | ||
2073 | * Read the swap header. | ||
2074 | */ | ||
2075 | if (!mapping->a_ops->readpage) { | ||
2088 | error = -EINVAL; | 2076 | error = -EINVAL; |
2089 | goto bad_swap; | 2077 | goto bad_swap; |
2090 | } | 2078 | } |
2079 | page = read_mapping_page(mapping, 0, swap_file); | ||
2080 | if (IS_ERR(page)) { | ||
2081 | error = PTR_ERR(page); | ||
2082 | goto bad_swap; | ||
2083 | } | ||
2084 | swap_header = kmap(page); | ||
2085 | |||
2086 | maxpages = read_swap_header(p, swap_header, inode); | ||
2087 | if (unlikely(!maxpages)) { | ||
2088 | error = -EINVAL; | ||
2089 | goto bad_swap; | ||
2090 | } | ||
2091 | |||
2092 | /* OK, set up the swap map and apply the bad block list */ | ||
2093 | swap_map = vzalloc(maxpages); | ||
2094 | if (!swap_map) { | ||
2095 | error = -ENOMEM; | ||
2096 | goto bad_swap; | ||
2097 | } | ||
2098 | |||
2099 | error = swap_cgroup_swapon(p->type, maxpages); | ||
2100 | if (error) | ||
2101 | goto bad_swap; | ||
2102 | |||
2103 | nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, | ||
2104 | maxpages, &span); | ||
2105 | if (unlikely(nr_extents < 0)) { | ||
2106 | error = nr_extents; | ||
2107 | goto bad_swap; | ||
2108 | } | ||
2091 | 2109 | ||
2092 | if (p->bdev) { | 2110 | if (p->bdev) { |
2093 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2111 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
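read_swap_header() above concentrates the signature, version and size checks and signals failure by returning 0 instead of a page count. A rough userspace sketch of that shape follows; the header layout is reduced to the few fields these checks use, the bad-page limit is an illustrative constant, and the real function additionally handles byte order and the swp_entry offset limit.

#include <stdio.h>
#include <string.h>

#define MAX_SWAP_BADPAGES 512           /* illustrative limit only */

struct swap_header {
        char magic[10];                 /* "SWAPSPACE2" */
        unsigned int version;
        unsigned long last_page;
        unsigned int nr_badpages;
};

/* Return the usable number of pages, or 0 if the header is unusable --
 * the same "0 means invalid" convention read_swap_header() adopts. */
static unsigned long read_swap_header(const struct swap_header *hdr,
                                      unsigned long file_pages)
{
        unsigned long maxpages;

        if (memcmp("SWAPSPACE2", hdr->magic, 10)) {
                fprintf(stderr, "Unable to find swap-space signature\n");
                return 0;
        }
        if (hdr->version != 1) {
                fprintf(stderr, "Unable to handle swap header version %u\n",
                        hdr->version);
                return 0;
        }
        maxpages = hdr->last_page + 1;
        if (!maxpages)
                return 0;
        if (file_pages && maxpages > file_pages) {
                fprintf(stderr, "Swap area shorter than signature indicates\n");
                return 0;
        }
        if (hdr->nr_badpages > MAX_SWAP_BADPAGES)
                return 0;
        return maxpages;
}

int main(void)
{
        struct swap_header hdr = { "SWAPSPACE2", 1, 1023, 0 };

        printf("maxpages = %lu\n", read_swap_header(&hdr, 1024));
        return 0;
}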
@@ -2099,58 +2117,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2099 | } | 2117 | } |
2100 | 2118 | ||
2101 | mutex_lock(&swapon_mutex); | 2119 | mutex_lock(&swapon_mutex); |
2102 | spin_lock(&swap_lock); | 2120 | prio = -1; |
2103 | if (swap_flags & SWAP_FLAG_PREFER) | 2121 | if (swap_flags & SWAP_FLAG_PREFER) |
2104 | p->prio = | 2122 | prio = |
2105 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2123 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2106 | else | 2124 | enable_swap_info(p, prio, swap_map); |
2107 | p->prio = --least_priority; | ||
2108 | p->swap_map = swap_map; | ||
2109 | p->flags |= SWP_WRITEOK; | ||
2110 | nr_swap_pages += nr_good_pages; | ||
2111 | total_swap_pages += nr_good_pages; | ||
2112 | 2125 | ||
2113 | printk(KERN_INFO "Adding %uk swap on %s. " | 2126 | printk(KERN_INFO "Adding %uk swap on %s. " |
2114 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2127 | "Priority:%d extents:%d across:%lluk %s%s\n", |
2115 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, | 2128 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
2116 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2129 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2117 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2130 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2118 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2131 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); |
2119 | 2132 | ||
2120 | /* insert swap space into swap_list: */ | ||
2121 | prev = -1; | ||
2122 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | ||
2123 | if (p->prio >= swap_info[i]->prio) | ||
2124 | break; | ||
2125 | prev = i; | ||
2126 | } | ||
2127 | p->next = i; | ||
2128 | if (prev < 0) | ||
2129 | swap_list.head = swap_list.next = type; | ||
2130 | else | ||
2131 | swap_info[prev]->next = type; | ||
2132 | spin_unlock(&swap_lock); | ||
2133 | mutex_unlock(&swapon_mutex); | 2133 | mutex_unlock(&swapon_mutex); |
2134 | atomic_inc(&proc_poll_event); | 2134 | atomic_inc(&proc_poll_event); |
2135 | wake_up_interruptible(&proc_poll_wait); | 2135 | wake_up_interruptible(&proc_poll_wait); |
2136 | 2136 | ||
2137 | if (S_ISREG(inode->i_mode)) | ||
2138 | inode->i_flags |= S_SWAPFILE; | ||
2137 | error = 0; | 2139 | error = 0; |
2138 | goto out; | 2140 | goto out; |
2139 | bad_swap: | 2141 | bad_swap: |
2140 | if (bdev) { | 2142 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { |
2141 | set_blocksize(bdev, p->old_block_size); | 2143 | set_blocksize(p->bdev, p->old_block_size); |
2142 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2144 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2143 | } | 2145 | } |
2144 | destroy_swap_extents(p); | 2146 | destroy_swap_extents(p); |
2145 | swap_cgroup_swapoff(type); | 2147 | swap_cgroup_swapoff(p->type); |
2146 | bad_swap_2: | ||
2147 | spin_lock(&swap_lock); | 2148 | spin_lock(&swap_lock); |
2148 | p->swap_file = NULL; | 2149 | p->swap_file = NULL; |
2149 | p->flags = 0; | 2150 | p->flags = 0; |
2150 | spin_unlock(&swap_lock); | 2151 | spin_unlock(&swap_lock); |
2151 | vfree(swap_map); | 2152 | vfree(swap_map); |
2152 | if (swap_file) | 2153 | if (swap_file) { |
2154 | if (inode && S_ISREG(inode->i_mode)) { | ||
2155 | mutex_unlock(&inode->i_mutex); | ||
2156 | inode = NULL; | ||
2157 | } | ||
2153 | filp_close(swap_file, NULL); | 2158 | filp_close(swap_file, NULL); |
2159 | } | ||
2154 | out: | 2160 | out: |
2155 | if (page && !IS_ERR(page)) { | 2161 | if (page && !IS_ERR(page)) { |
2156 | kunmap(page); | 2162 | kunmap(page); |
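The tail of the hunk above decodes an optional priority from swap_flags and passes it (or -1) to enable_swap_info(), which maps -1 to --least_priority. A short sketch of just that decode is below; the SWAP_FLAG_* values are reproduced here for illustration only and should be checked against <linux/swap.h> rather than taken from this sketch.

#include <stdio.h>

/* Flag layout as defined in <linux/swap.h> (copied here for illustration). */
#define SWAP_FLAG_PREFER        0x8000  /* set if swap priority is specified */
#define SWAP_FLAG_PRIO_MASK     0x7fff
#define SWAP_FLAG_PRIO_SHIFT    0

/* Mirror the new sys_swapon() logic: -1 means "no explicit priority",
 * which enable_swap_info() turns into --least_priority. */
static int decode_swap_prio(int swap_flags)
{
        if (swap_flags & SWAP_FLAG_PREFER)
                return (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
        return -1;
}

int main(void)
{
        printf("swapon -p 10 -> prio %d\n",
               decode_swap_prio(SWAP_FLAG_PREFER | 10));
        printf("plain swapon -> prio %d\n", decode_swap_prio(0));
        return 0;
}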
@@ -2158,11 +2164,8 @@ out: | |||
2158 | } | 2164 | } |
2159 | if (name) | 2165 | if (name) |
2160 | putname(name); | 2166 | putname(name); |
2161 | if (did_down) { | 2167 | if (inode && S_ISREG(inode->i_mode)) |
2162 | if (!error) | ||
2163 | inode->i_flags |= S_SWAPFILE; | ||
2164 | mutex_unlock(&inode->i_mutex); | 2168 | mutex_unlock(&inode->i_mutex); |
2165 | } | ||
2166 | return error; | 2169 | return error; |
2167 | } | 2170 | } |
2168 | 2171 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index d64296be00d3..a95667529135 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -106,9 +106,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
106 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 106 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
107 | 107 | ||
108 | clear_page_mlock(page); | 108 | clear_page_mlock(page); |
109 | remove_from_page_cache(page); | ||
110 | ClearPageMappedToDisk(page); | 109 | ClearPageMappedToDisk(page); |
111 | page_cache_release(page); /* pagecache ref */ | 110 | delete_from_page_cache(page); |
112 | return 0; | 111 | return 0; |
113 | } | 112 | } |
114 | 113 | ||
@@ -322,11 +321,12 @@ EXPORT_SYMBOL(truncate_inode_pages); | |||
322 | * pagetables. | 321 | * pagetables. |
323 | */ | 322 | */ |
324 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | 323 | unsigned long invalidate_mapping_pages(struct address_space *mapping, |
325 | pgoff_t start, pgoff_t end) | 324 | pgoff_t start, pgoff_t end) |
326 | { | 325 | { |
327 | struct pagevec pvec; | 326 | struct pagevec pvec; |
328 | pgoff_t next = start; | 327 | pgoff_t next = start; |
329 | unsigned long ret = 0; | 328 | unsigned long ret; |
329 | unsigned long count = 0; | ||
330 | int i; | 330 | int i; |
331 | 331 | ||
332 | pagevec_init(&pvec, 0); | 332 | pagevec_init(&pvec, 0); |
@@ -353,9 +353,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
353 | if (lock_failed) | 353 | if (lock_failed) |
354 | continue; | 354 | continue; |
355 | 355 | ||
356 | ret += invalidate_inode_page(page); | 356 | ret = invalidate_inode_page(page); |
357 | |||
358 | unlock_page(page); | 357 | unlock_page(page); |
358 | /* | ||
359 | * A failed invalidation is a hint that the page is no longer | ||
360 | * of interest, so try to speed up its reclaim. | ||
361 | */ | ||
362 | if (!ret) | ||
363 | deactivate_page(page); | ||
364 | count += ret; | ||
359 | if (next > end) | 365 | if (next > end) |
360 | break; | 366 | break; |
361 | } | 367 | } |
@@ -363,7 +369,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
363 | mem_cgroup_uncharge_end(); | 369 | mem_cgroup_uncharge_end(); |
364 | cond_resched(); | 370 | cond_resched(); |
365 | } | 371 | } |
366 | return ret; | 372 | return count; |
367 | } | 373 | } |
368 | EXPORT_SYMBOL(invalidate_mapping_pages); | 374 | EXPORT_SYMBOL(invalidate_mapping_pages); |
369 | 375 | ||
@@ -389,7 +395,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
389 | 395 | ||
390 | clear_page_mlock(page); | 396 | clear_page_mlock(page); |
391 | BUG_ON(page_has_private(page)); | 397 | BUG_ON(page_has_private(page)); |
392 | __remove_from_page_cache(page); | 398 | __delete_from_page_cache(page); |
393 | spin_unlock_irq(&mapping->tree_lock); | 399 | spin_unlock_irq(&mapping->tree_lock); |
394 | mem_cgroup_uncharge_cache_page(page); | 400 | mem_cgroup_uncharge_cache_page(page); |
395 | 401 | ||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -227,7 +227,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
227 | /* | 227 | /* |
228 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall | 228 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall |
229 | * back to the regular GUP. | 229 | * back to the regular GUP. |
230 | * If the architecture not support this fucntion, simply return with no | 230 | * If the architecture not support this function, simply return with no |
231 | * page pinned | 231 | * page pinned |
232 | */ | 232 | */ |
233 | int __attribute__((weak)) __get_user_pages_fast(unsigned long start, | 233 | int __attribute__((weak)) __get_user_pages_fast(unsigned long start, |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f9b166732e70..5d6030235d7a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -261,8 +261,15 @@ struct vmap_area { | |||
261 | }; | 261 | }; |
262 | 262 | ||
263 | static DEFINE_SPINLOCK(vmap_area_lock); | 263 | static DEFINE_SPINLOCK(vmap_area_lock); |
264 | static struct rb_root vmap_area_root = RB_ROOT; | ||
265 | static LIST_HEAD(vmap_area_list); | 264 | static LIST_HEAD(vmap_area_list); |
265 | static struct rb_root vmap_area_root = RB_ROOT; | ||
266 | |||
267 | /* The vmap cache globals are protected by vmap_area_lock */ | ||
268 | static struct rb_node *free_vmap_cache; | ||
269 | static unsigned long cached_hole_size; | ||
270 | static unsigned long cached_vstart; | ||
271 | static unsigned long cached_align; | ||
272 | |||
266 | static unsigned long vmap_area_pcpu_hole; | 273 | static unsigned long vmap_area_pcpu_hole; |
267 | 274 | ||
268 | static struct vmap_area *__find_vmap_area(unsigned long addr) | 275 | static struct vmap_area *__find_vmap_area(unsigned long addr) |
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, | |||
331 | struct rb_node *n; | 338 | struct rb_node *n; |
332 | unsigned long addr; | 339 | unsigned long addr; |
333 | int purged = 0; | 340 | int purged = 0; |
341 | struct vmap_area *first; | ||
334 | 342 | ||
335 | BUG_ON(!size); | 343 | BUG_ON(!size); |
336 | BUG_ON(size & ~PAGE_MASK); | 344 | BUG_ON(size & ~PAGE_MASK); |
345 | BUG_ON(!is_power_of_2(align)); | ||
337 | 346 | ||
338 | va = kmalloc_node(sizeof(struct vmap_area), | 347 | va = kmalloc_node(sizeof(struct vmap_area), |
339 | gfp_mask & GFP_RECLAIM_MASK, node); | 348 | gfp_mask & GFP_RECLAIM_MASK, node); |
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, | |||
341 | return ERR_PTR(-ENOMEM); | 350 | return ERR_PTR(-ENOMEM); |
342 | 351 | ||
343 | retry: | 352 | retry: |
344 | addr = ALIGN(vstart, align); | ||
345 | |||
346 | spin_lock(&vmap_area_lock); | 353 | spin_lock(&vmap_area_lock); |
347 | if (addr + size - 1 < addr) | 354 | /* |
348 | goto overflow; | 355 | * Invalidate cache if we have more permissive parameters. |
356 | * cached_hole_size notes the largest hole noticed _below_ | ||
357 | * the vmap_area cached in free_vmap_cache: if size fits | ||
358 | * into that hole, we want to scan from vstart to reuse | ||
359 | * the hole instead of allocating above free_vmap_cache. | ||
360 | * Note that __free_vmap_area may update free_vmap_cache | ||
361 | * without updating cached_hole_size or cached_align. | ||
362 | */ | ||
363 | if (!free_vmap_cache || | ||
364 | size < cached_hole_size || | ||
365 | vstart < cached_vstart || | ||
366 | align < cached_align) { | ||
367 | nocache: | ||
368 | cached_hole_size = 0; | ||
369 | free_vmap_cache = NULL; | ||
370 | } | ||
371 | /* record if we encounter less permissive parameters */ | ||
372 | cached_vstart = vstart; | ||
373 | cached_align = align; | ||
374 | |||
375 | /* find starting point for our search */ | ||
376 | if (free_vmap_cache) { | ||
377 | first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); | ||
378 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
379 | if (addr < vstart) | ||
380 | goto nocache; | ||
381 | if (addr + size - 1 < addr) | ||
382 | goto overflow; | ||
383 | |||
384 | } else { | ||
385 | addr = ALIGN(vstart, align); | ||
386 | if (addr + size - 1 < addr) | ||
387 | goto overflow; | ||
349 | 388 | ||
350 | /* XXX: could have a last_hole cache */ | 389 | n = vmap_area_root.rb_node; |
351 | n = vmap_area_root.rb_node; | 390 | first = NULL; |
352 | if (n) { | ||
353 | struct vmap_area *first = NULL; | ||
354 | 391 | ||
355 | do { | 392 | while (n) { |
356 | struct vmap_area *tmp; | 393 | struct vmap_area *tmp; |
357 | tmp = rb_entry(n, struct vmap_area, rb_node); | 394 | tmp = rb_entry(n, struct vmap_area, rb_node); |
358 | if (tmp->va_end >= addr) { | 395 | if (tmp->va_end >= addr) { |
359 | if (!first && tmp->va_start < addr + size) | ||
360 | first = tmp; | ||
361 | n = n->rb_left; | ||
362 | } else { | ||
363 | first = tmp; | 396 | first = tmp; |
397 | if (tmp->va_start <= addr) | ||
398 | break; | ||
399 | n = n->rb_left; | ||
400 | } else | ||
364 | n = n->rb_right; | 401 | n = n->rb_right; |
365 | } | 402 | } |
366 | } while (n); | ||
367 | 403 | ||
368 | if (!first) | 404 | if (!first) |
369 | goto found; | 405 | goto found; |
370 | |||
371 | if (first->va_end < addr) { | ||
372 | n = rb_next(&first->rb_node); | ||
373 | if (n) | ||
374 | first = rb_entry(n, struct vmap_area, rb_node); | ||
375 | else | ||
376 | goto found; | ||
377 | } | ||
378 | |||
379 | while (addr + size > first->va_start && addr + size <= vend) { | ||
380 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
381 | if (addr + size - 1 < addr) | ||
382 | goto overflow; | ||
383 | |||
384 | n = rb_next(&first->rb_node); | ||
385 | if (n) | ||
386 | first = rb_entry(n, struct vmap_area, rb_node); | ||
387 | else | ||
388 | goto found; | ||
389 | } | ||
390 | } | 406 | } |
391 | found: | 407 | |
392 | if (addr + size > vend) { | 408 | /* from the starting point, walk areas until a suitable hole is found */ |
393 | overflow: | 409 | while (addr + size >= first->va_start && addr + size <= vend) { |
394 | spin_unlock(&vmap_area_lock); | 410 | if (addr + cached_hole_size < first->va_start) |
395 | if (!purged) { | 411 | cached_hole_size = first->va_start - addr; |
396 | purge_vmap_area_lazy(); | 412 | addr = ALIGN(first->va_end + PAGE_SIZE, align); |
397 | purged = 1; | 413 | if (addr + size - 1 < addr) |
398 | goto retry; | 414 | goto overflow; |
399 | } | 415 | |
400 | if (printk_ratelimit()) | 416 | n = rb_next(&first->rb_node); |
401 | printk(KERN_WARNING | 417 | if (n) |
402 | "vmap allocation for size %lu failed: " | 418 | first = rb_entry(n, struct vmap_area, rb_node); |
403 | "use vmalloc=<size> to increase size.\n", size); | 419 | else |
404 | kfree(va); | 420 | goto found; |
405 | return ERR_PTR(-EBUSY); | ||
406 | } | 421 | } |
407 | 422 | ||
408 | BUG_ON(addr & (align-1)); | 423 | found: |
424 | if (addr + size > vend) | ||
425 | goto overflow; | ||
409 | 426 | ||
410 | va->va_start = addr; | 427 | va->va_start = addr; |
411 | va->va_end = addr + size; | 428 | va->va_end = addr + size; |
412 | va->flags = 0; | 429 | va->flags = 0; |
413 | __insert_vmap_area(va); | 430 | __insert_vmap_area(va); |
431 | free_vmap_cache = &va->rb_node; | ||
414 | spin_unlock(&vmap_area_lock); | 432 | spin_unlock(&vmap_area_lock); |
415 | 433 | ||
434 | BUG_ON(va->va_start & (align-1)); | ||
435 | BUG_ON(va->va_start < vstart); | ||
436 | BUG_ON(va->va_end > vend); | ||
437 | |||
416 | return va; | 438 | return va; |
439 | |||
440 | overflow: | ||
441 | spin_unlock(&vmap_area_lock); | ||
442 | if (!purged) { | ||
443 | purge_vmap_area_lazy(); | ||
444 | purged = 1; | ||
445 | goto retry; | ||
446 | } | ||
447 | if (printk_ratelimit()) | ||
448 | printk(KERN_WARNING | ||
449 | "vmap allocation for size %lu failed: " | ||
450 | "use vmalloc=<size> to increase size.\n", size); | ||
451 | kfree(va); | ||
452 | return ERR_PTR(-EBUSY); | ||
417 | } | 453 | } |
418 | 454 | ||
419 | static void rcu_free_va(struct rcu_head *head) | 455 | static void rcu_free_va(struct rcu_head *head) |
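The rewritten alloc_vmap_area() above caches the most recently allocated node in free_vmap_cache so the next search can resume where the last one ended, falling back to a full scan whenever the new request is more permissive (smaller vstart or align, or a size that fits a hole already seen below the cache). Below is a heavily simplified userspace sketch of that resume-from-cache idea over a sorted array of busy ranges; it ignores the rbtree, alignment, lazy purging and locking entirely.

#include <stdio.h>

struct range { unsigned long start, end; };     /* busy [start, end) */

static struct range busy[16];
static int nr_busy;
static int cache_idx = -1;                      /* like free_vmap_cache */
static unsigned long cached_vstart, cached_hole;

/* First-fit search for 'size' bytes in [vstart, vend). */
static unsigned long alloc_range(unsigned long size, unsigned long vstart,
                                 unsigned long vend)
{
        unsigned long addr;
        int i;

        if (nr_busy == (int)(sizeof(busy) / sizeof(busy[0])))
                return 0;

        /* Invalidate the cache when this request is more permissive. */
        if (cache_idx < 0 || size < cached_hole || vstart < cached_vstart) {
                cache_idx = -1;
                cached_hole = 0;
        }
        cached_vstart = vstart;

        if (cache_idx >= 0) {
                i = cache_idx + 1;
                addr = busy[cache_idx].end;
                if (addr < vstart)
                        addr = vstart;
        } else {
                i = 0;
                addr = vstart;
        }

        /* Walk busy ranges until a hole big enough for 'size' appears. */
        for (; i < nr_busy && addr + size > busy[i].start; i++) {
                if (addr + cached_hole < busy[i].start)
                        cached_hole = busy[i].start - addr;     /* biggest skipped hole */
                addr = busy[i].end;
        }
        if (addr + size > vend)
                return 0;                                       /* no room */

        /* Record the allocation (array kept sorted; i is the slot). */
        for (int j = nr_busy; j > i; j--)
                busy[j] = busy[j - 1];
        busy[i].start = addr;
        busy[i].end = addr + size;
        nr_busy++;
        cache_idx = i;
        return addr;
}

int main(void)
{
        printf("a = %#lx\n", alloc_range(0x1000, 0x10000, 0x100000));
        printf("b = %#lx\n", alloc_range(0x2000, 0x10000, 0x100000));
        printf("c = %#lx\n", alloc_range(0x1000, 0x10000, 0x100000));
        return 0;
}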
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head) | |||
426 | static void __free_vmap_area(struct vmap_area *va) | 462 | static void __free_vmap_area(struct vmap_area *va) |
427 | { | 463 | { |
428 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | 464 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); |
465 | |||
466 | if (free_vmap_cache) { | ||
467 | if (va->va_end < cached_vstart) { | ||
468 | free_vmap_cache = NULL; | ||
469 | } else { | ||
470 | struct vmap_area *cache; | ||
471 | cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); | ||
472 | if (va->va_start <= cache->va_start) { | ||
473 | free_vmap_cache = rb_prev(&va->rb_node); | ||
474 | /* | ||
475 | * We don't try to update cached_hole_size or | ||
476 | * cached_align, but it won't go very wrong. | ||
477 | */ | ||
478 | } | ||
479 | } | ||
480 | } | ||
429 | rb_erase(&va->rb_node, &vmap_area_root); | 481 | rb_erase(&va->rb_node, &vmap_area_root); |
430 | RB_CLEAR_NODE(&va->rb_node); | 482 | RB_CLEAR_NODE(&va->rb_node); |
431 | list_del_rcu(&va->list); | 483 | list_del_rcu(&va->list); |
@@ -1951,8 +2003,6 @@ finished: | |||
1951 | * should know vmalloc() area is valid and can use memcpy(). | 2003 | * should know vmalloc() area is valid and can use memcpy(). |
1952 | * This is for routines which have to access vmalloc area without | 2004 | * This is for routines which have to access vmalloc area without |
1953 | * any informaion, as /dev/kmem. | 2005 | * any informaion, as /dev/kmem. |
1954 | * | ||
1955 | * The caller should guarantee KM_USER1 is not used. | ||
1956 | */ | 2006 | */ |
1957 | 2007 | ||
1958 | long vwrite(char *buf, char *addr, unsigned long count) | 2008 | long vwrite(char *buf, char *addr, unsigned long count) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 6771ea70bfe7..f6b435c80079 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
42 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
43 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
44 | #include <linux/oom.h> | ||
44 | 45 | ||
45 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
46 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
@@ -358,7 +359,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, | |||
358 | static void handle_write_error(struct address_space *mapping, | 359 | static void handle_write_error(struct address_space *mapping, |
359 | struct page *page, int error) | 360 | struct page *page, int error) |
360 | { | 361 | { |
361 | lock_page_nosync(page); | 362 | lock_page(page); |
362 | if (page_mapping(page) == mapping) | 363 | if (page_mapping(page) == mapping) |
363 | mapping_set_error(mapping, error); | 364 | mapping_set_error(mapping, error); |
364 | unlock_page(page); | 365 | unlock_page(page); |
@@ -514,7 +515,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
514 | 515 | ||
515 | freepage = mapping->a_ops->freepage; | 516 | freepage = mapping->a_ops->freepage; |
516 | 517 | ||
517 | __remove_from_page_cache(page); | 518 | __delete_from_page_cache(page); |
518 | spin_unlock_irq(&mapping->tree_lock); | 519 | spin_unlock_irq(&mapping->tree_lock); |
519 | mem_cgroup_uncharge_cache_page(page); | 520 | mem_cgroup_uncharge_cache_page(page); |
520 | 521 | ||
@@ -1065,7 +1066,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1065 | * surrounding the tag page. Only take those pages of | 1066 | * surrounding the tag page. Only take those pages of |
1066 | * the same active state as that tag page. We may safely | 1067 | * the same active state as that tag page. We may safely |
1067 | * round the target page pfn down to the requested order | 1068 | * round the target page pfn down to the requested order |
1068 | * as the mem_map is guarenteed valid out to MAX_ORDER, | 1069 | * as the mem_map is guaranteed valid out to MAX_ORDER, |
1069 | * where that page is in a different zone we will detect | 1070 | * where that page is in a different zone we will detect |
1070 | * it from its zone id and abort this block scan. | 1071 | * it from its zone id and abort this block scan. |
1071 | */ | 1072 | */ |
@@ -1988,17 +1989,12 @@ static bool zone_reclaimable(struct zone *zone) | |||
1988 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | 1989 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; |
1989 | } | 1990 | } |
1990 | 1991 | ||
1991 | /* | 1992 | /* All zones in zonelist are unreclaimable? */ |
1992 | * As hibernation is going on, kswapd is freezed so that it can't mark | ||
1993 | * the zone into all_unreclaimable. It can't handle OOM during hibernation. | ||
1994 | * So let's check zone's unreclaimable in direct reclaim as well as kswapd. | ||
1995 | */ | ||
1996 | static bool all_unreclaimable(struct zonelist *zonelist, | 1993 | static bool all_unreclaimable(struct zonelist *zonelist, |
1997 | struct scan_control *sc) | 1994 | struct scan_control *sc) |
1998 | { | 1995 | { |
1999 | struct zoneref *z; | 1996 | struct zoneref *z; |
2000 | struct zone *zone; | 1997 | struct zone *zone; |
2001 | bool all_unreclaimable = true; | ||
2002 | 1998 | ||
2003 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1999 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2004 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2000 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2006,13 +2002,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, | |||
2006 | continue; | 2002 | continue; |
2007 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2003 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2008 | continue; | 2004 | continue; |
2009 | if (zone_reclaimable(zone)) { | 2005 | if (!zone->all_unreclaimable) |
2010 | all_unreclaimable = false; | 2006 | return false; |
2011 | break; | ||
2012 | } | ||
2013 | } | 2007 | } |
2014 | 2008 | ||
2015 | return all_unreclaimable; | 2009 | return true; |
2016 | } | 2010 | } |
2017 | 2011 | ||
2018 | /* | 2012 | /* |
@@ -2108,6 +2102,14 @@ out: | |||
2108 | if (sc->nr_reclaimed) | 2102 | if (sc->nr_reclaimed) |
2109 | return sc->nr_reclaimed; | 2103 | return sc->nr_reclaimed; |
2110 | 2104 | ||
2105 | /* | ||
2106 | * As hibernation is going on, kswapd is freezed so that it can't mark | ||
2107 | * the zone into all_unreclaimable. Thus bypassing all_unreclaimable | ||
2108 | * check. | ||
2109 | */ | ||
2110 | if (oom_killer_disabled) | ||
2111 | return 0; | ||
2112 | |||
2111 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2113 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
2112 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) | 2114 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) |
2113 | return 1; | 2115 | return 1; |
@@ -2224,7 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2224 | * o a 16M DMA zone that is balanced will not balance a zone on any | 2226 | * o a 16M DMA zone that is balanced will not balance a zone on any |
2225 | * reasonable sized machine | 2227 | * reasonable sized machine |
2226 | * o On all other machines, the top zone must be at least a reasonable | 2228 | * o On all other machines, the top zone must be at least a reasonable |
2227 | * precentage of the middle zones. For example, on 32-bit x86, highmem | 2229 | * percentage of the middle zones. For example, on 32-bit x86, highmem |
2228 | * would need to be at least 256M for it to be balance a whole node. | 2230 | * would need to be at least 256M for it to be balance a whole node. |
2229 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | 2231 | * Similarly, on x86-64 the Normal zone would need to be at least 1G |
2230 | * to balance a node on its own. These seemed like reasonable ratios. | 2232 | * to balance a node on its own. These seemed like reasonable ratios. |
@@ -2397,9 +2399,9 @@ loop_again: | |||
2397 | * cause too much scanning of the lower zones. | 2399 | * cause too much scanning of the lower zones. |
2398 | */ | 2400 | */ |
2399 | for (i = 0; i <= end_zone; i++) { | 2401 | for (i = 0; i <= end_zone; i++) { |
2400 | int compaction; | ||
2401 | struct zone *zone = pgdat->node_zones + i; | 2402 | struct zone *zone = pgdat->node_zones + i; |
2402 | int nr_slab; | 2403 | int nr_slab; |
2404 | unsigned long balance_gap; | ||
2403 | 2405 | ||
2404 | if (!populated_zone(zone)) | 2406 | if (!populated_zone(zone)) |
2405 | continue; | 2407 | continue; |
@@ -2416,11 +2418,20 @@ loop_again: | |||
2416 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); | 2418 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); |
2417 | 2419 | ||
2418 | /* | 2420 | /* |
2419 | * We put equal pressure on every zone, unless one | 2421 | * We put equal pressure on every zone, unless |
2420 | * zone has way too many pages free already. | 2422 | * one zone has way too many pages free |
2423 | * already. The "too many pages" is defined | ||
2424 | * as the high wmark plus a "gap" where the | ||
2425 | * gap is either the low watermark or 1% | ||
2426 | * of the zone, whichever is smaller. | ||
2421 | */ | 2427 | */ |
2428 | balance_gap = min(low_wmark_pages(zone), | ||
2429 | (zone->present_pages + | ||
2430 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2431 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2422 | if (!zone_watermark_ok_safe(zone, order, | 2432 | if (!zone_watermark_ok_safe(zone, order, |
2423 | 8*high_wmark_pages(zone), end_zone, 0)) | 2433 | high_wmark_pages(zone) + balance_gap, |
2434 | end_zone, 0)) | ||
2424 | shrink_zone(priority, zone, &sc); | 2435 | shrink_zone(priority, zone, &sc); |
2425 | reclaim_state->reclaimed_slab = 0; | 2436 | reclaim_state->reclaimed_slab = 0; |
2426 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 2437 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
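The hunk above makes kswapd skip shrink_zone() when a zone already sits comfortably above its high watermark, where the headroom is min(low watermark, roughly 1% of the zone) via KSWAPD_ZONE_BALANCE_GAP_RATIO. The standalone calculation below reproduces just that gap; the zone numbers are made up, and the final free-page comparison is only a crude stand-in for zone_watermark_ok_safe().

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100       /* i.e. roughly 1% of the zone */

struct zone_sample {
        unsigned long present_pages;
        unsigned long low_wmark;
        unsigned long high_wmark;
        unsigned long free_pages;
};

/* balance_gap as computed in balance_pgdat(): the smaller of the low
 * watermark and present_pages/100, the latter rounded up. */
static unsigned long balance_gap(const struct zone_sample *z)
{
        unsigned long pct = (z->present_pages +
                             KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                            KSWAPD_ZONE_BALANCE_GAP_RATIO;

        return z->low_wmark < pct ? z->low_wmark : pct;
}

int main(void)
{
        /* Hypothetical 4GB zone: all numbers are illustrative only. */
        struct zone_sample z = {
                .present_pages = 1048576,
                .low_wmark = 2048,
                .high_wmark = 3072,
                .free_pages = 16384,
        };
        unsigned long gap = balance_gap(&z);

        printf("balance_gap = %lu pages\n", gap);
        printf("skip shrink_zone: %s\n",
               z.free_pages >= z.high_wmark + gap ? "yes" : "no");
        return 0;
}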
@@ -2428,24 +2439,9 @@ loop_again: | |||
2428 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2439 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2429 | total_scanned += sc.nr_scanned; | 2440 | total_scanned += sc.nr_scanned; |
2430 | 2441 | ||
2431 | compaction = 0; | ||
2432 | if (order && | ||
2433 | zone_watermark_ok(zone, 0, | ||
2434 | high_wmark_pages(zone), | ||
2435 | end_zone, 0) && | ||
2436 | !zone_watermark_ok(zone, order, | ||
2437 | high_wmark_pages(zone), | ||
2438 | end_zone, 0)) { | ||
2439 | compact_zone_order(zone, | ||
2440 | order, | ||
2441 | sc.gfp_mask, false, | ||
2442 | COMPACT_MODE_KSWAPD); | ||
2443 | compaction = 1; | ||
2444 | } | ||
2445 | |||
2446 | if (zone->all_unreclaimable) | 2442 | if (zone->all_unreclaimable) |
2447 | continue; | 2443 | continue; |
2448 | if (!compaction && nr_slab == 0 && | 2444 | if (nr_slab == 0 && |
2449 | !zone_reclaimable(zone)) | 2445 | !zone_reclaimable(zone)) |
2450 | zone->all_unreclaimable = 1; | 2446 | zone->all_unreclaimable = 1; |
2451 | /* | 2447 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 0c3b5048773e..897ea9e88238 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone, | |||
321 | /* | 321 | /* |
322 | * The fetching of the stat_threshold is racy. We may apply | 322 | * The fetching of the stat_threshold is racy. We may apply |
323 | * a counter threshold to the wrong the cpu if we get | 323 | * a counter threshold to the wrong the cpu if we get |
324 | * rescheduled while executing here. However, the following | 324 | * rescheduled while executing here. However, the next |
325 | * will apply the threshold again and therefore bring the | 325 | * counter update will apply the threshold again and |
326 | * counter under the threshold. | 326 | * therefore bring the counter under the threshold again. |
327 | * | ||
328 | * Most of the time the thresholds are the same anyway | ||
329 | * for all cpus in a zone. | ||
327 | */ | 330 | */ |
328 | t = this_cpu_read(pcp->stat_threshold); | 331 | t = this_cpu_read(pcp->stat_threshold); |
329 | 332 | ||
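The reworded comment above describes how per-cpu vmstat deltas are only folded into the zone-wide counter once they overflow a per-cpu threshold, which is why an occasionally stale threshold read is self-correcting. The single-threaded sketch below models that fold-on-threshold scheme with an array indexed by a fake cpu id; there is no real percpu or cmpxchg machinery here.

#include <stdio.h>

#define NR_CPUS 4
#define STAT_THRESHOLD 32               /* per-cpu slack before folding */

static long global_count;               /* the zone-wide counter */
static int cpu_delta[NR_CPUS];          /* per-cpu differentials */

/* Add 'delta' on behalf of 'cpu'; fold into the global counter only when
 * the per-cpu differential overflows the threshold, like mod_state(). */
static void mod_counter(int cpu, int delta)
{
        int d = cpu_delta[cpu] + delta;

        if (d > STAT_THRESHOLD || d < -STAT_THRESHOLD) {
                global_count += d;
                cpu_delta[cpu] = 0;
        } else {
                cpu_delta[cpu] = d;
        }
}

int main(void)
{
        /* 1000 single-page updates spread over the cpus. */
        for (int i = 0; i < 1000; i++)
                mod_counter(i % NR_CPUS, 1);

        long folded = global_count;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                folded += cpu_delta[cpu];

        printf("global=%ld, global+deltas=%ld (exact)\n", global_count, folded);
        return 0;
}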
@@ -500,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu) | |||
500 | * z = the zone from which the allocation occurred. | 503 | * z = the zone from which the allocation occurred. |
501 | * | 504 | * |
502 | * Must be called with interrupts disabled. | 505 | * Must be called with interrupts disabled. |
506 | * | ||
507 | * When __GFP_OTHER_NODE is set assume the node of the preferred | ||
508 | * zone is the local node. This is useful for daemons that allocate | ||
509 | * memory on behalf of other processes. | ||
503 | */ | 510 | */ |
504 | void zone_statistics(struct zone *preferred_zone, struct zone *z) | 511 | void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) |
505 | { | 512 | { |
506 | if (z->zone_pgdat == preferred_zone->zone_pgdat) { | 513 | if (z->zone_pgdat == preferred_zone->zone_pgdat) { |
507 | __inc_zone_state(z, NUMA_HIT); | 514 | __inc_zone_state(z, NUMA_HIT); |
@@ -509,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) | |||
509 | __inc_zone_state(z, NUMA_MISS); | 516 | __inc_zone_state(z, NUMA_MISS); |
510 | __inc_zone_state(preferred_zone, NUMA_FOREIGN); | 517 | __inc_zone_state(preferred_zone, NUMA_FOREIGN); |
511 | } | 518 | } |
512 | if (z->node == numa_node_id()) | 519 | if (z->node == ((flags & __GFP_OTHER_NODE) ? |
520 | preferred_zone->node : numa_node_id())) | ||
513 | __inc_zone_state(z, NUMA_LOCAL); | 521 | __inc_zone_state(z, NUMA_LOCAL); |
514 | else | 522 | else |
515 | __inc_zone_state(z, NUMA_OTHER); | 523 | __inc_zone_state(z, NUMA_OTHER); |
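zone_statistics() above now takes the gfp flags so that, when __GFP_OTHER_NODE is set, NUMA_LOCAL/NUMA_OTHER are judged against the preferred zone's node instead of the calling task's node. The tiny sketch below isolates that node-selection rule; the flag bit and node ids are stand-ins, not the real gfp encoding.

#include <stdio.h>
#include <stdbool.h>

#define GFP_OTHER_NODE (1u << 0)        /* stand-in bit, not the real __GFP_OTHER_NODE */

/* Pick the node that "local" should be measured against, as the new
 * zone_statistics() does: the preferred zone's node when allocating on
 * behalf of another process, otherwise the running CPU's node. */
static int accounting_node(int preferred_node, int current_node,
                           unsigned int flags)
{
        return (flags & GFP_OTHER_NODE) ? preferred_node : current_node;
}

int main(void)
{
        int zone_node = 1, preferred_node = 1, current_node = 0;
        bool local;

        local = (zone_node == accounting_node(preferred_node, current_node, 0));
        printf("plain alloc:      NUMA_%s\n", local ? "LOCAL" : "OTHER");

        local = (zone_node == accounting_node(preferred_node, current_node,
                                              GFP_OTHER_NODE));
        printf("__GFP_OTHER_NODE: NUMA_%s\n", local ? "LOCAL" : "OTHER");
        return 0;
}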
@@ -940,7 +948,16 @@ static const char * const vmstat_text[] = { | |||
940 | "unevictable_pgs_cleared", | 948 | "unevictable_pgs_cleared", |
941 | "unevictable_pgs_stranded", | 949 | "unevictable_pgs_stranded", |
942 | "unevictable_pgs_mlockfreed", | 950 | "unevictable_pgs_mlockfreed", |
951 | |||
952 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
953 | "thp_fault_alloc", | ||
954 | "thp_fault_fallback", | ||
955 | "thp_collapse_alloc", | ||
956 | "thp_collapse_alloc_failed", | ||
957 | "thp_split", | ||
943 | #endif | 958 | #endif |
959 | |||
960 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | ||
944 | }; | 961 | }; |
945 | 962 | ||
946 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 963 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |