Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       76
-rw-r--r--  mm/huge_memory.c       5
-rw-r--r--  mm/hugetlb.c           8
-rw-r--r--  mm/ksm.c               6
-rw-r--r--  mm/memcontrol.c      222
-rw-r--r--  mm/memory-failure.c   25
-rw-r--r--  mm/memory.c           33
-rw-r--r--  mm/memory_hotplug.c   10
-rw-r--r--  mm/migrate.c           2
-rw-r--r--  mm/mmap.c             12
-rw-r--r--  mm/nommu.c             9
-rw-r--r--  mm/page_cgroup.c      71
-rw-r--r--  mm/rmap.c            111
-rw-r--r--  mm/shmem.c            74
-rw-r--r--  mm/slab.c              9
-rw-r--r--  mm/slub.c             12
-rw-r--r--  mm/swapfile.c          2
-rw-r--r--  mm/thrash.c          105
-rw-r--r--  mm/truncate.c         29
-rw-r--r--  mm/vmscan.c          106
20 files changed, 634 insertions, 293 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9..6cc604bd564 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
+	/*
+	 * Initialise the free scanner. The starting point is where we last
+	 * scanned from (or the end of the zone if starting). The low point
+	 * is the end of the pageblock the migration scanner is using.
+	 */
 	pfn = cc->free_pfn;
 	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-	high_pfn = low_pfn;
+
+	/*
+	 * Take care that if the migration scanner is at the end of the zone
+	 * that the free scanner does not accidentally move to the next zone
+	 * in the next isolation cycle.
+	 */
+	high_pfn = min(low_pfn, pfn);
 
 	/*
 	 * Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
 	return isolated > (inactive + active) / 2;
 }
 
+/* possible outcome of isolate_migratepages */
+typedef enum {
+	ISOLATE_ABORT,		/* Abort compaction now */
+	ISOLATE_NONE,		/* No pages isolated, continue scanning */
+	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
+} isolate_migrate_t;
+
 /*
  * Isolate all pages that can be migrated from the block pointed to by
  * the migrate scanner within compact_control.
  */
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
 	unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	/* Do not cross the free scanner or scan within a memory hole */
 	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
 		cc->migrate_pfn = end_pfn;
-		return 0;
+		return ISOLATE_NONE;
 	}
 
 	/*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	 * delay for some time until fewer pages are isolated
 	 */
 	while (unlikely(too_many_isolated(zone))) {
+		/* async migration should just abort */
+		if (!cc->sync)
+			return ISOLATE_ABORT;
+
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		if (fatal_signal_pending(current))
-			return 0;
+			return ISOLATE_ABORT;
 	}
 
 	/* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
-	return cc->nr_migratepages;
+	return ISOLATE_SUCCESS;
 }
 
 /*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
 	if (cc->free_pfn <= cc->migrate_pfn)
 		return COMPACT_COMPLETE;
 
-	/* Compaction run is not finished if the watermark is not met */
-	watermark = low_wmark_pages(zone);
-	watermark += (1 << cc->order);
-
-	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
-		return COMPACT_CONTINUE;
-
 	/*
 	 * order == -1 is expected when compacting via
 	 * /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
+	/* Compaction run is not finished if the watermark is not met */
+	watermark = low_wmark_pages(zone);
+	watermark += (1 << cc->order);
+
+	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+		return COMPACT_CONTINUE;
+
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		/* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	unsigned long watermark;
 
 	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
 	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
 	 * This is because during migration, copies of pages need to be
 	 * allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 		return COMPACT_SKIPPED;
 
 	/*
-	 * order == -1 is expected when compacting via
-	 * /proc/sys/vm/compact_memory
-	 */
-	if (order == -1)
-		return COMPACT_CONTINUE;
-
-	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
 	 *
-	 * index of -1 implies allocations might succeed dependingon watermarks
+	 * index of -1000 implies allocations might succeed depending on
+	 * watermarks
 	 * index towards 0 implies failure is due to lack of memory
 	 * index towards 1000 implies failure is due to fragmentation
 	 *
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
 		return COMPACT_SKIPPED;
 
-	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+			0, 0))
 		return COMPACT_PARTIAL;
 
 	return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		unsigned long nr_migrate, nr_remaining;
 		int err;
 
-		if (!isolate_migratepages(zone, cc))
+		switch (isolate_migratepages(zone, cc)) {
+		case ISOLATE_ABORT:
+			ret = COMPACT_PARTIAL;
+			goto out;
+		case ISOLATE_NONE:
 			continue;
+		case ISOLATE_SUCCESS:
+			;
+		}
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	}
 
+out:
 	/* Release free pages and check accounting */
 	cc->nr_freepages -= release_freepages(&cc->freepages);
 	VM_BUG_ON(cc->nr_freepages != 0);
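The tri-state isolate_migrate_t above replaces a boolean-style return so that compact_zone() can tell "nothing isolated, keep scanning" apart from "abort the whole run". A minimal standalone sketch of that dispatch pattern, in plain C with illustrative names (not part of the patch):

#include <stdio.h>

/* Tri-state outcome, mirroring isolate_migrate_t above. */
enum isolate_result { ABORT, NONE, SUCCESS };

/* Hypothetical scanner: produces all three outcomes over its lifetime. */
static enum isolate_result scan_step(int step)
{
	if (step == 3)
		return ABORT;	/* e.g. contention detected */
	return (step % 2) ? SUCCESS : NONE;
}

int main(void)
{
	for (int step = 0; step < 5; step++) {
		switch (scan_step(step)) {
		case ABORT:
			printf("step %d: abort early\n", step);
			return 1;	/* analogous to the "goto out" above */
		case NONE:
			continue;	/* nothing isolated, keep scanning */
		case SUCCESS:
			printf("step %d: migrate pages\n", step);
		}
	}
	return 0;
}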
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3c..81532f297fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void)
 	while (likely(khugepaged_enabled())) {
 #ifndef CONFIG_NUMA
 		hpage = khugepaged_alloc_hugepage();
-		if (unlikely(!hpage)) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		if (unlikely(!hpage))
 			break;
-		}
-		count_vm_event(THP_COLLAPSE_ALLOC);
 #else
 		if (IS_ERR(hpage)) {
 			khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6402458fee3..bfcf153bc82 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
 		WARN_ON(page_count(page) != 1);
 		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
+		/*
+		 * If we had gigantic hugepages allocated at boot time, we need
+		 * to restore the 'stolen' pages to totalram_pages in order to
+		 * fix confusing memory reports from free(1) and another
+		 * side-effects, like CommitLimit going negative.
+		 */
+		if (h->order > (MAX_ORDER - 1))
+			totalram_pages += 1 << h->order;
 	}
 }
 
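As a worked example of the accounting restored above (figures assumed for illustration): with 4 KiB base pages and MAX_ORDER = 11, a boot-allocated 1 GiB gigantic page has order 18, so the branch returns 1 << 18 = 262144 pages (1 GiB) to totalram_pages for each such page.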
diff --git a/mm/ksm.c b/mm/ksm.c
index d708b3ef226..9a68b0cf0a1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
 		ksm_scan.mm_slot = slot;
 		spin_unlock(&ksm_mmlist_lock);
+		/*
+		 * Although we tested list_empty() above, a racing __ksm_exit
+		 * of the last mm on the list may have removed it since then.
+		 */
+		if (slot == &ksm_mm_head)
+			return NULL;
 next_mm:
 	ksm_scan.address = 0;
 	ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3a..e013b8e57d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t scan_nodes;
-	unsigned long next_scan_node_update;
+	atomic_t numainfo_events;
+	atomic_t numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
 				MEM_CGROUP_TARGET_SOFTLIMIT);
 		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
+		}
+#endif
 	}
 }
 
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
 
-	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory and
+			 * charges will not give any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
 	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
 	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
 	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
 	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging. It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * getcpu()/putcpu().
+	 */
+	curcpu = raw_smp_processor_id();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
 	}
 	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 	/* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
 	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
 	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "numa_stat",
 		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
 	},
 #endif
 };
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
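The reworked drain path above pairs mutex_trylock() (only one system-wide drain at a time) with a per-stock FLUSHING_CACHED_CHARGE bit so that a CPU's work item is never queued twice before it runs. A rough userspace analogue of that per-object guard, using C11 atomics in place of the kernel's bitops (illustrative only, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

struct stock {
	atomic_flag flushing;	/* plays the role of FLUSHING_CACHED_CHARGE */
};

/* Queue a drain only if one is not already pending for this stock. */
static void maybe_queue_drain(struct stock *s, int cpu)
{
	if (!atomic_flag_test_and_set(&s->flushing))
		printf("cpu %d: drain scheduled\n", cpu);
	else
		printf("cpu %d: drain already pending, skip\n", cpu);
}

/* The worker clears the flag when done, re-arming the guard. */
static void drain_done(struct stock *s)
{
	atomic_flag_clear(&s->flushing);
}

int main(void)
{
	struct stock s = { .flushing = ATOMIC_FLAG_INIT };
	maybe_queue_drain(&s, 0);
	maybe_queue_drain(&s, 0);	/* skipped: still pending */
	drain_done(&s);
	maybe_queue_drain(&s, 0);	/* queued again */
	return 0;
}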
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928..740c4f52059 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -390,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
-		goto out;
+		return;
+
+	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
 
@@ -407,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			add_to_kill(tsk, page, vma, to_kill, tkc);
 		}
 	}
-	page_unlock_anon_vma(av);
-out:
 	read_unlock(&tasklist_lock);
+	page_unlock_anon_vma(av);
 }
 
 /*
@@ -423,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct prio_tree_iter iter;
 	struct address_space *mapping = page->mapping;
 
-	/*
-	 * A note on the locking order between the two locks.
-	 * We don't rely on this particular order.
-	 * If you have some other code that needs a different order
-	 * feel free to switch them around. Or add a reverse link
-	 * from mm_struct to task_struct, then this could be all
-	 * done without taking tasklist_lock and looping over all tasks.
-	 */
-
-	read_lock(&tasklist_lock);
 	mutex_lock(&mapping->i_mmap_mutex);
+	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
@@ -453,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			add_to_kill(tsk, page, vma, to_kill, tkc);
 		}
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
 	read_unlock(&tasklist_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 }
 
 /*
@@ -1468,7 +1460,8 @@ int soft_offline_page(struct page *page, int flags)
 	put_page(page);
 	if (!ret) {
 		LIST_HEAD(pagelist);
-
+		inc_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 								0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e0..9b8a01d941c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return 0;
+		batch = tlb->active;
 	}
 	VM_BUG_ON(batch->nr > batch->max);
 
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
+	pte_t *start_pte;
 	pte_t *pte;
 
 again:
 	init_rss_vec(rss);
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = start_pte;
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
@@ -1196,7 +1199,7 @@ again:
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap_unlock(start_pte, ptl);
 
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
@@ -1296,7 +1299,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
@@ -2796,30 +2799,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
-	struct address_space *mapping = inode->i_mapping;
-
-	/*
-	 * If the underlying filesystem is not going to provide
-	 * a way to truncate a range of blocks (punch a hole) -
-	 * we should return failure right now.
-	 */
-	if (!inode->i_op->truncate_range)
-		return -ENOSYS;
-
-	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	truncate_inode_pages_range(mapping, offset, end);
-	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	inode->i_op->truncate_range(inode, offset, end);
-	up_write(&inode->i_alloc_sem);
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
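The start_pte change in zap_pte_range() above matters because the loop can now break out early (force_flush): pte no longer necessarily points one past the last entry, so unlocking via pte - 1 would use the wrong address. Remembering the start pointer is the general fix; a shape-only sketch with no kernel APIs (names and values are illustrative):

#include <stdio.h>

int entries[8];

void walk(int from, int to)
{
	int *start = &entries[from];	/* remember where the walk began */
	int *p = start;

	for (int i = from; i < to; i++, p++) {
		if (*p < 0)
			break;		/* early exit: p != &entries[to] */
	}
	/* release with the saved start, not p - 1 */
	printf("unmap from index %td, not %td\n",
	       start - entries, (p - 1) - entries);
}

int main(void)
{
	entries[3] = -1;	/* force an early break */
	walk(0, 8);
	return 0;
}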
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32..c46887b5a11 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
 
+	/*
+	 * The node we allocated has no zone fallback lists. For avoiding
+	 * to access not-initialized zonelist, build here.
+	 */
+	mutex_lock(&zonelists_mutex);
+	build_all_zonelists(NULL);
+	mutex_unlock(&zonelists_mutex);
+
 	return pgdat;
 }
 
@@ -515,7 +523,7 @@ int mem_online_node(int nid)
 
 	lock_memory_hotplug();
 	pgdat = hotadd_new_pgdat(nid, 0);
-	if (pgdat) {
+	if (!pgdat) {
 		ret = -ENOMEM;
 		goto out;
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983..666e4e67741 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	if (PageSwapBacked(page)) {
+	if (!PageSwapCache(page) && PageSwapBacked(page)) {
 		__dec_zone_page_state(page, NR_SHMEM);
 		__inc_zone_page_state(newpage, NR_SHMEM);
 	}
diff --git a/mm/mmap.c b/mm/mmap.c
index bbdc9af5e11..d49736ff8a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
 	if (anon_vma)
 		return anon_vma;
 try_prev:
-	/*
-	 * It is potentially slow to have to call find_vma_prev here.
-	 * But it's only on the first write fault on the vma, not
-	 * every time, and we could devise a way to avoid it later
-	 * (e.g. stash info in next's anon_vma_node when assigning
-	 * an anon_vma, or when trying vma_merge). Another time.
-	 */
-	BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+	near = vma->vm_prev;
 	if (!near)
 		goto none;
 
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 		return -EINVAL;
 
 	/* Find the first overlapping VMA */
-	vma = find_vma_prev(mm, start, &prev);
+	vma = find_vma(mm, start);
 	if (!vma)
 		return 0;
+	prev = vma->vm_prev;
 	/* we have start < vma->vm_end  */
 
 	/* if it doesn't overlap, we have nothing.. */
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a..9edc897a397 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1813,10 +1813,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	return NULL;
 }
 
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
-		unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
-	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+	if (addr != (pfn << PAGE_SHIFT))
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 	return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1b..53bffc6c293 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
 	struct page_cgroup *base, *pc;
 	struct mem_section *section;
 	unsigned long table_size;
 	unsigned long nr;
-	int nid, index;
+	int index;
 
 	nr = pfn_to_section_nr(pfn);
 	section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 	if (section->page_cgroup)
 		return 0;
 
-	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	base = alloc_page_cgroup(table_size, nid);
 
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 		pc = base + index;
 		init_page_cgroup(pc, nr);
 	}
-
+	/*
+	 * The passed "pfn" may not be aligned to SECTION. For the calculation
+	 * we need to apply a mask.
+	 */
+	pfn &= PAGE_SECTION_MASK;
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
 	return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
+	if (nid == -1) {
+		/*
+		 * In this case, "nid" already exists and contains valid memory.
+		 * "start_pfn" passed to us is a pfn which is an arg for
+		 * online__pages(), and start_pfn should exist.
+		 */
+		nid = pfn_to_nid(start_pfn);
+		VM_BUG_ON(!node_state(nid, N_ONLINE));
+	}
+
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;
-		fail = init_section_page_cgroup(pfn);
+		fail = init_section_page_cgroup(pfn, nid);
 	}
 	if (!fail)
 		return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
-	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
-	}
-	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-		panic("Out of memory");
-	} else {
-		hotplug_memory_notifier(page_cgroup_callback, 0);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = node_start_pfn(nid);
+		end_pfn = node_end_pfn(nid);
+		/*
+		 * start_pfn and end_pfn may not be aligned to SECTION and the
+		 * page->flags of out of node pages are not initialized. So we
+		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+		 */
+		for (pfn = start_pfn;
+		     pfn < end_pfn;
+		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes's pfns can be overlapping.
+			 * We know some arch can have a nodes layout such as
+			 * -------------pfn-------------->
+			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			if (init_section_page_cgroup(pfn, nid))
+				goto oom;
+		}
 	}
+	hotplug_memory_notifier(page_cgroup_callback, 0);
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-	" want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+	       "don't want memory cgroups\n");
+	return;
+oom:
+	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+	panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
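The new per-node loop in page_cgroup_init() steps with ALIGN(pfn + 1, PAGES_PER_SECTION), which jumps to the next section boundary even when the node's start_pfn is unaligned. A tiny demonstration of that stepping (values assumed for illustration; PAGES_PER_SECTION is typically a power of two such as 32768):

#include <stdio.h>

/* Round x up to the next multiple of the power-of-two a (kernel ALIGN()). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long section = 32768;			/* assumed PAGES_PER_SECTION */
	unsigned long start = 40000, end = 140000;	/* unaligned node range */

	/* Visits 40000, 65536, 98304, 131072: one pfn per section. */
	for (unsigned long pfn = start; pfn < end;
	     pfn = ALIGN(pfn + 1, section))
		printf("visit section containing pfn %lu\n", pfn);
	return 0;
}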
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88d..23295f65ae4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -38,9 +38,8 @@
  *               in arch-dependent flush_dcache_mmap_lock,
  *               within inode_wb_list_lock in __sync_single_inode)
  *
- * (code doesn't rely on that order so it could be switched around)
+ * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
- *     anon_vma->mutex      (memory_failure, collect_procs_anon)
  *       pte map lock
  */
 
@@ -112,9 +111,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
 {
-	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
 }
 
 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		struct mm_struct *mm = vma->vm_mm;
 		struct anon_vma *allocated;
 
-		avc = anon_vma_chain_alloc();
+		avc = anon_vma_chain_alloc(GFP_KERNEL);
 		if (!avc)
 			goto out_enomem;
 
@@ -200,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 	return -ENOMEM;
 }
 
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+	struct anon_vma *new_root = anon_vma->root;
+	if (new_root != root) {
+		if (WARN_ON_ONCE(root))
+			mutex_unlock(&root->mutex);
+		root = new_root;
+		mutex_lock(&root->mutex);
+	}
+	return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+	if (root)
+		mutex_unlock(&root->mutex);
+}
+
 static void anon_vma_chain_link(struct vm_area_struct *vma,
 				struct anon_vma_chain *avc,
 				struct anon_vma *anon_vma)
@@ -208,13 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-	anon_vma_lock(anon_vma);
 	/*
 	 * It's critical to add new vmas to the tail of the anon_vma,
 	 * see comment in huge_memory.c:__split_huge_page().
 	 */
 	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-	anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -224,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
 	struct anon_vma_chain *avc, *pavc;
+	struct anon_vma *root = NULL;
 
 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
-		avc = anon_vma_chain_alloc();
-		if (!avc)
-			goto enomem_failure;
-		anon_vma_chain_link(dst, avc, pavc->anon_vma);
+		struct anon_vma *anon_vma;
+
+		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+		if (unlikely(!avc)) {
+			unlock_anon_vma_root(root);
+			root = NULL;
+			avc = anon_vma_chain_alloc(GFP_KERNEL);
+			if (!avc)
+				goto enomem_failure;
+		}
+		anon_vma = pavc->anon_vma;
+		root = lock_anon_vma_root(root, anon_vma);
+		anon_vma_chain_link(dst, avc, anon_vma);
 	}
+	unlock_anon_vma_root(root);
 	return 0;
 
  enomem_failure:
@@ -263,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	anon_vma = anon_vma_alloc();
 	if (!anon_vma)
 		goto out_error;
-	avc = anon_vma_chain_alloc();
+	avc = anon_vma_chain_alloc(GFP_KERNEL);
 	if (!avc)
 		goto out_error_free_anon_vma;
 
@@ -280,7 +314,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
+	anon_vma_lock(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
+	anon_vma_unlock(anon_vma);
 
 	return 0;
 
@@ -291,36 +327,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	return -ENOMEM;
 }
 
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
-	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
-	int empty;
-
-	/* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
-	if (!anon_vma)
-		return;
-
-	anon_vma_lock(anon_vma);
-	list_del(&anon_vma_chain->same_anon_vma);
-
-	/* We must garbage collect the anon_vma if it's empty */
-	empty = list_empty(&anon_vma->head);
-	anon_vma_unlock(anon_vma);
-
-	if (empty)
-		put_anon_vma(anon_vma);
-}
-
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
 	struct anon_vma_chain *avc, *next;
+	struct anon_vma *root = NULL;
 
 	/*
 	 * Unlink each anon_vma chained to the VMA. This list is ordered
 	 * from newest to oldest, ensuring the root anon_vma gets freed last.
 	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
-		anon_vma_unlink(avc);
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		root = lock_anon_vma_root(root, anon_vma);
+		list_del(&avc->same_anon_vma);
+
+		/*
+		 * Leave empty anon_vmas on the list - we'll need
+		 * to free them outside the lock.
+		 */
+		if (list_empty(&anon_vma->head))
+			continue;
+
+		list_del(&avc->same_vma);
+		anon_vma_chain_free(avc);
+	}
+	unlock_anon_vma_root(root);
+
+	/*
+	 * Iterate the list once more, it now only contains empty and unlinked
+	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+	 * needing to acquire the anon_vma->root->mutex.
+	 */
+	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		put_anon_vma(anon_vma);
+
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
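lock_anon_vma_root() above turns what used to be one lock/unlock per chain entry into a single hold per run of entries sharing a root, re-locking only when the root changes. A minimal sketch of the same batching idea over a plain array, with pthread mutexes standing in for anon_vma->root->mutex (illustrative, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct item {
	pthread_mutex_t *root;	/* stands in for anon_vma->root->mutex */
};

/* Re-lock only when the root actually changes. */
static pthread_mutex_t *lock_root(pthread_mutex_t *held, pthread_mutex_t *want)
{
	if (want != held) {
		if (held)
			pthread_mutex_unlock(held);
		pthread_mutex_lock(want);
		held = want;
	}
	return held;
}

static void process_all(struct item *items, size_t n)
{
	pthread_mutex_t *held = NULL;
	for (size_t i = 0; i < n; i++) {
		held = lock_root(held, items[i].root);
		/* ... work on items[i] under its root lock ... */
	}
	if (held)
		pthread_mutex_unlock(held);	/* unlock_anon_vma_root analogue */
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	struct item items[] = { {&a}, {&a}, {&b}, {&b}, {&a} };
	process_all(items, 5);	/* three lock switches instead of five pairs */
	return 0;
}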
diff --git a/mm/shmem.c b/mm/shmem.c
index d221a1cfd7b..fcedf5464eb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -539,7 +539,7 @@ static void shmem_free_pages(struct list_head *next)
 	} while (next);
 }
 
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -562,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 	spinlock_t *punch_lock;
 	unsigned long upper_limit;
 
+	truncate_inode_pages_range(inode->i_mapping, start, end);
+
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
@@ -738,16 +740,8 @@ done2:
 		 * lowered next_index. Also, though shmem_getpage checks
 		 * i_size before adding to cache, no recheck after: so fix the
 		 * narrow window there too.
-		 *
-		 * Recalling truncate_inode_pages_range and unmap_mapping_range
-		 * every time for punch_hole (which never got a chance to clear
-		 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
-		 * yet hardly ever necessary: try to optimize them out later.
 		 */
 		truncate_inode_pages_range(inode->i_mapping, start, end);
-		if (punch_hole)
-			unmap_mapping_range(inode->i_mapping, start,
-							end - start, 1);
 	}
 
 	spin_lock(&info->lock);
@@ -766,22 +760,23 @@ done2:
 		shmem_free_pages(pages_to_free.next);
 	}
 }
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	loff_t newsize = attr->ia_size;
 	int error;
 
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 
-	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
-	    && newsize != inode->i_size) {
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		loff_t oldsize = inode->i_size;
+		loff_t newsize = attr->ia_size;
 		struct page *page = NULL;
 
-		if (newsize < inode->i_size) {
+		if (newsize < oldsize) {
 			/*
 			 * If truncating down to a partial page, then
 			 * if that page is already allocated, hold it
@@ -810,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 				spin_unlock(&info->lock);
 			}
 		}
-
-		/* XXX(truncate): truncate_setsize should be called last */
-		truncate_setsize(inode, newsize);
+		if (newsize != oldsize) {
+			i_size_write(inode, newsize);
+			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		}
+		if (newsize < oldsize) {
+			loff_t holebegin = round_up(newsize, PAGE_SIZE);
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+			shmem_truncate_range(inode, newsize, (loff_t)-1);
+			/* unmap again to remove racily COWed private pages */
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+		}
 		if (page)
 			page_cache_release(page);
-		shmem_truncate_range(inode, newsize, (loff_t)-1);
 	}
 
 	setattr_copy(inode, attr);
@@ -832,7 +834,6 @@ static void shmem_evict_inode(struct inode *inode)
832 struct shmem_xattr *xattr, *nxattr; 834 struct shmem_xattr *xattr, *nxattr;
833 835
834 if (inode->i_mapping->a_ops == &shmem_aops) { 836 if (inode->i_mapping->a_ops == &shmem_aops) {
835 truncate_inode_pages(inode->i_mapping, 0);
836 shmem_unacct_size(info->flags, inode->i_size); 837 shmem_unacct_size(info->flags, inode->i_size);
837 inode->i_size = 0; 838 inode->i_size = 0;
838 shmem_truncate_range(inode, 0, (loff_t)-1); 839 shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -2706,7 +2707,7 @@ static const struct file_operations shmem_file_operations = {
2706}; 2707};
2707 2708
2708static const struct inode_operations shmem_inode_operations = { 2709static const struct inode_operations shmem_inode_operations = {
2709 .setattr = shmem_notify_change, 2710 .setattr = shmem_setattr,
2710 .truncate_range = shmem_truncate_range, 2711 .truncate_range = shmem_truncate_range,
2711#ifdef CONFIG_TMPFS_XATTR 2712#ifdef CONFIG_TMPFS_XATTR
2712 .setxattr = shmem_setxattr, 2713 .setxattr = shmem_setxattr,
@@ -2739,7 +2740,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2739 .removexattr = shmem_removexattr, 2740 .removexattr = shmem_removexattr,
2740#endif 2741#endif
2741#ifdef CONFIG_TMPFS_POSIX_ACL 2742#ifdef CONFIG_TMPFS_POSIX_ACL
2742 .setattr = shmem_notify_change, 2743 .setattr = shmem_setattr,
2743 .check_acl = generic_check_acl, 2744 .check_acl = generic_check_acl,
2744#endif 2745#endif
2745}; 2746};
@@ -2752,7 +2753,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2752 .removexattr = shmem_removexattr, 2753 .removexattr = shmem_removexattr,
2753#endif 2754#endif
2754#ifdef CONFIG_TMPFS_POSIX_ACL 2755#ifdef CONFIG_TMPFS_POSIX_ACL
2755 .setattr = shmem_notify_change, 2756 .setattr = shmem_setattr,
2756 .check_acl = generic_check_acl, 2757 .check_acl = generic_check_acl,
2757#endif 2758#endif
2758}; 2759};
@@ -2908,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2908 return 0; 2909 return 0;
2909} 2910}
2910 2911
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2913{
2914 truncate_inode_pages_range(inode->i_mapping, start, end);
2915}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917
2911#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2912/** 2919/**
2913 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -3028,3 +3035,26 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 vma->vm_flags |= VM_CAN_NONLINEAR; 3035 vma->vm_flags |= VM_CAN_NONLINEAR;
3029 return 0; 3036 return 0;
3030} 3037}
3038
3039/**
3040 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3041 * @mapping: the page's address_space
3042 * @index: the page index
3043 * @gfp: the page allocator flags to use if allocating
3044 *
3045 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3046 * with any new page allocations done using the specified allocation flags.
3047 * But read_cache_page_gfp() uses the ->readpage() method: which does not
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 *
3051 * Provide a stub for those callers to start using now, then later
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp)
3057{
3058 return read_cache_page_gfp(mapping, index, gfp);
3059}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
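
For context, a minimal sketch of how a driver caller might use this new export. shmem_read_mapping_page_gfp() and page_cache_release() are the interfaces shown above; pin_object_pages(), its parameters, and the unwind logic are illustrative only, not taken from any in-tree driver:

    #include <linux/err.h>
    #include <linux/pagemap.h>
    #include <linux/shmem_fs.h>

    /* Pin every page backing a tmpfs file, one reference per page. */
    static int pin_object_pages(struct file *filp, struct page **pages,
                                pgoff_t num_pages)
    {
            struct address_space *mapping = filp->f_mapping;
            gfp_t gfp = mapping_gfp_mask(mapping);
            pgoff_t i;

            for (i = 0; i < num_pages; i++) {
                    struct page *page;

                    /* currently a read_cache_page_gfp() stub, see above */
                    page = shmem_read_mapping_page_gfp(mapping, i, gfp);
                    if (IS_ERR(page)) {
                            while (i--)
                                    page_cache_release(pages[i]);
                            return PTR_ERR(page);
                    }
                    pages[i] = page;    /* returned with a reference held */
            }
            return 0;
    }
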
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8a..d96e223de77 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ free_done:
3604 * Release an obj back to its cache. If the obj has a constructed state, it must 3604 * Release an obj back to its cache. If the obj has a constructed state, it must
3605 * be in this state _before_ it is released. Called with disabled ints. 3605 * be in this state _before_ it is released. Called with disabled ints.
3606 */ 3606 */
3607static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3607static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3608 void *caller)
3608{ 3609{
3609 struct array_cache *ac = cpu_cache_get(cachep); 3610 struct array_cache *ac = cpu_cache_get(cachep);
3610 3611
3611 check_irq_off(); 3612 check_irq_off();
3612 kmemleak_free_recursive(objp, cachep->flags); 3613 kmemleak_free_recursive(objp, cachep->flags);
3613 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3614 objp = cache_free_debugcheck(cachep, objp, caller);
3614 3615
3615 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3616 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3616 3617
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3801 debug_check_no_locks_freed(objp, obj_size(cachep)); 3802 debug_check_no_locks_freed(objp, obj_size(cachep));
3802 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3803 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3803 debug_check_no_obj_freed(objp, obj_size(cachep)); 3804 debug_check_no_obj_freed(objp, obj_size(cachep));
3804 __cache_free(cachep, objp); 3805 __cache_free(cachep, objp, __builtin_return_address(0));
3805 local_irq_restore(flags); 3806 local_irq_restore(flags);
3806 3807
3807 trace_kmem_cache_free(_RET_IP_, objp); 3808 trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
3831 c = virt_to_cache(objp); 3832 c = virt_to_cache(objp);
3832 debug_check_no_locks_freed(objp, obj_size(c)); 3833 debug_check_no_locks_freed(objp, obj_size(c));
3833 debug_check_no_obj_freed(objp, obj_size(c)); 3834 debug_check_no_obj_freed(objp, obj_size(c));
3834 __cache_free(c, (void *)objp); 3835 __cache_free(c, (void *)objp, __builtin_return_address(0));
3835 local_irq_restore(flags); 3836 local_irq_restore(flags);
3836} 3837}
3837EXPORT_SYMBOL(kfree); 3838EXPORT_SYMBOL(kfree);
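
The point of threading caller through __cache_free() is that kmem_cache_free() and kfree() each capture their own return address at the API boundary, so the debug checks blame the real call site instead of the internal helper. A stand-alone user-space sketch of the same pattern (record_free(), my_free() and friends are illustrative stand-ins, not kernel interfaces):

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-in for cache_free_debugcheck(): log who freed what */
    static void record_free(void *obj, void *caller)
    {
            printf("free of %p called from %p\n", obj, caller);
    }

    static inline void cache_free_internal(void *obj, void *caller)
    {
            record_free(obj, caller);   /* use the address we were handed */
            free(obj);
    }

    void my_free(void *obj)
    {
            /* capture the caller once, at the exported entry point */
            cache_free_internal(obj, __builtin_return_address(0));
    }

    int main(void)
    {
            my_free(malloc(32));
            return 0;
    }
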
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b..35f351f2619 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2322 2322
2323#ifdef CONFIG_CMPXCHG_LOCAL
2324 /* 2323 /*
2325 * Must align to double word boundary for the double cmpxchg instructions 2324 * Must align to double word boundary for the double cmpxchg
2326 * to work. 2325 * instructions to work; see __pcpu_double_call_return_bool().
2327 */ 2326 */
2328 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); 2327 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2329#else 2328 2 * sizeof(void *));
2330 /* Regular alignment is sufficient */
2331 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2332#endif
2333 2329
2334 if (!s->cpu_slab) 2330 if (!s->cpu_slab)
2335 return 0; 2331 return 0;
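
With the #ifdef gone, the per-cpu slab state is always allocated at 2 * sizeof(void *) alignment, which the double-width cmpxchg used by the lockless fastpath requires. A user-space sketch of that alignment contract (struct slab_cpu and cas_double() are illustrative; build with gcc -mcx16, possibly linking -latomic):

    #include <stdalign.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* mirrors the freelist/tid pair loosely; the alignment is the point */
    struct slab_cpu {
            void *freelist;
            unsigned long tid;
    } __attribute__((aligned(2 * sizeof(void *))));

    static bool cas_double(struct slab_cpu *s, struct slab_cpu old,
                           struct slab_cpu new)
    {
            /* compiles to cmpxchg16b on x86-64 when the operand is aligned */
            return __atomic_compare_exchange(s, &old, &new, false,
                                             __ATOMIC_SEQ_CST,
                                             __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
            struct slab_cpu cur = { NULL, 0 };
            struct slab_cpu old = { NULL, 0 };
            struct slab_cpu new = { (void *)0x1000, 1 };

            printf("alignment %zu, cas %s\n", alignof(struct slab_cpu),
                   cas_double(&cur, old, new) ? "ok" : "failed");
            return 0;
    }
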
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7..ff8dc1a18cb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd..fabf2d0f516 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
24 29
25static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg;
27static unsigned int global_faults; 33static unsigned int global_faults;
34static unsigned int last_aging;
35
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
38{
39 struct mem_cgroup *memcg;
40
41 memcg = try_get_mem_cgroup_from_mm(mm);
42 if (memcg)
43 css_put(mem_cgroup_css(memcg));
44
45 return memcg;
46}
47#else
48static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
49{
50 return NULL;
51}
52#endif
28 53
29void grab_swap_token(struct mm_struct *mm) 54void grab_swap_token(struct mm_struct *mm)
30{ 55{
31 int current_interval; 56 int current_interval;
57 unsigned int old_prio = mm->token_priority;
32 58
33 global_faults++; 59 global_faults++;
34 60
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
38 return; 64 return;
39 65
40 /* First come first served */ 66 /* First come first served */
41 if (swap_token_mm == NULL) { 67 if (!swap_token_mm)
42 mm->token_priority = mm->token_priority + 2; 68 goto replace_token;
43 swap_token_mm = mm; 69
44 goto out; 70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults;
45 } 73 }
46 74
47 if (mm != swap_token_mm) { 75 if (mm == swap_token_mm) {
48 if (current_interval < mm->last_interval)
49 mm->token_priority++;
50 else {
51 if (likely(mm->token_priority > 0))
52 mm->token_priority--;
53 }
54 /* Check if we deserve the token */
55 if (mm->token_priority > swap_token_mm->token_priority) {
56 mm->token_priority += 2;
57 swap_token_mm = mm;
58 }
59 } else {
60 /* Token holder came in again! */
61 mm->token_priority += 2; 76 mm->token_priority += 2;
77 goto update_priority;
78 }
79
80 if (current_interval < mm->last_interval)
81 mm->token_priority++;
82 else {
83 if (likely(mm->token_priority > 0))
84 mm->token_priority--;
62 } 85 }
63 86
87 /* Check if we deserve the token */
88 if (mm->token_priority > swap_token_mm->token_priority)
89 goto replace_token;
90
91update_priority:
92 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
93
64out: 94out:
65 mm->faultstamp = global_faults; 95 mm->faultstamp = global_faults;
66 mm->last_interval = current_interval; 96 mm->last_interval = current_interval;
67 spin_unlock(&swap_token_lock); 97 spin_unlock(&swap_token_lock);
98 return;
99
100replace_token:
101 mm->token_priority += 2;
102 trace_replace_swap_token(swap_token_mm, mm);
103 swap_token_mm = mm;
104 swap_token_memcg = swap_token_memcg_from_mm(mm);
105 last_aging = global_faults;
106 goto out;
68} 107}
69 108
70/* Called on process exit. */ 109/* Called on process exit. */
71void __put_swap_token(struct mm_struct *mm) 110void __put_swap_token(struct mm_struct *mm)
72{ 111{
73 spin_lock(&swap_token_lock); 112 spin_lock(&swap_token_lock);
74 if (likely(mm == swap_token_mm)) 113 if (likely(mm == swap_token_mm)) {
114 trace_put_swap_token(swap_token_mm);
75 swap_token_mm = NULL; 115 swap_token_mm = NULL;
116 swap_token_memcg = NULL;
117 }
76 spin_unlock(&swap_token_lock); 118 spin_unlock(&swap_token_lock);
77} 119}
120
121static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
122{
123 if (!a)
124 return true;
125 if (!b)
126 return true;
127 if (a == b)
128 return true;
129 return false;
130}
131
132void disable_swap_token(struct mem_cgroup *memcg)
133{
134 /* memcg reclaim doesn't disable an unrelated mm's token. */
135 if (match_memcg(memcg, swap_token_memcg)) {
136 spin_lock(&swap_token_lock);
137 if (match_memcg(memcg, swap_token_memcg)) {
138 trace_disable_swap_token(swap_token_mm);
139 swap_token_mm = NULL;
140 swap_token_memcg = NULL;
141 }
142 spin_unlock(&swap_token_lock);
143 }
144}
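
The aging added above halves the holder's priority once per TOKEN_AGING_INTERVAL global faults, so a stale holder is eventually outbid. A toy user-space model of just that decay (the starting priorities and the fixed contender are made-up numbers, and real contenders also gain priority as in grab_swap_token() above):

    #include <stdio.h>

    #define TOKEN_AGING_INTERVAL 0xFF

    int main(void)
    {
            unsigned int holder_prio = 16, contender_prio = 5;
            unsigned int global_faults = 0, last_aging = 0;

            /* each loop pass stands in for one global page fault */
            while (contender_prio <= holder_prio) {
                    global_faults++;
                    if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
                            holder_prio /= 2;   /* the aging step */
                            last_aging = global_faults;
                    }
            }
            printf("token would move after %u faults (holder aged to %u)\n",
                   global_faults, holder_prio);
            return 0;
    }
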
diff --git a/mm/truncate.c b/mm/truncate.c
index 3a29a618021..e13f22efaad 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
304 * @lstart: offset from which to truncate 304 * @lstart: offset from which to truncate
305 * 305 *
306 * Called under (and serialised by) inode->i_mutex. 306 * Called under (and serialised by) inode->i_mutex.
307 *
308 * Note: When this function returns, there can be a page in the process of
309 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
310 * mapping->nrpages can be non-zero when this function returns even after
311 * truncation of the whole mapping.
307 */ 312 */
308void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 313void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
309{ 314{
@@ -603,3 +608,27 @@ int vmtruncate(struct inode *inode, loff_t offset)
603 return 0; 608 return 0;
604} 609}
605EXPORT_SYMBOL(vmtruncate); 610EXPORT_SYMBOL(vmtruncate);
611
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
613{
614 struct address_space *mapping = inode->i_mapping;
615
616 /*
617 * If the underlying filesystem is not going to provide
618 * a way to truncate a range of blocks (punch a hole) -
619 * we should return failure right now.
620 */
621 if (!inode->i_op->truncate_range)
622 return -ENOSYS;
623
624 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem);
626 unmap_mapping_range(mapping, offset, (end - offset), 1);
627 inode->i_op->truncate_range(inode, offset, end);
628 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex);
632
633 return 0;
634}
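
vmtruncate_range() is the kernel half of madvise(MADV_REMOVE): unmap the range, punch the hole via ->truncate_range() (shmem_truncate_range() on tmpfs), then unmap again to catch racily COWed private pages. A small user-space demonstration against tmpfs (the /dev/shm path is just a convenient tmpfs mount; error handling is minimal):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            const size_t len = 4 * 4096;
            int fd = open("/dev/shm/punch-demo",
                          O_RDWR | O_CREAT | O_TRUNC, 0600);

            if (fd < 0 || ftruncate(fd, len) < 0)
                    return 1;

            char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
            if (map == MAP_FAILED)
                    return 1;

            memset(map, 'x', len);          /* allocate all four pages */

            /* punch out the middle two pages; they read back as zeroes */
            if (madvise(map + 4096, 2 * 4096, MADV_REMOVE) < 0)
                    perror("MADV_REMOVE");
            printf("page 1 now starts with %d (0 == hole)\n", map[4096]);

            munmap(map, len);
            close(fd);
            unlink("/dev/shm/punch-demo");
            return 0;
    }
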
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9c..5ed24b94c5e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1124 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1125 scan++; 1125 scan++;
1126 } else { 1126 } else {
1127 /* the page is freed already. */ 1127 /*
1128 if (!page_count(cursor_page)) 1128 * Check if the page is freed already.
1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1129 continue; 1141 continue;
1130 break; 1142 break;
1131 } 1143 }
@@ -1983,14 +1995,13 @@ restart:
1983 * If a zone is deemed to be full of pinned pages then just give it a light 1995 * If a zone is deemed to be full of pinned pages then just give it a light
1984 * scan then give up on it. 1996 * scan then give up on it.
1985 */ 1997 */
1986static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1998static void shrink_zones(int priority, struct zonelist *zonelist,
1987 struct scan_control *sc) 1999 struct scan_control *sc)
1988{ 2000{
1989 struct zoneref *z; 2001 struct zoneref *z;
1990 struct zone *zone; 2002 struct zone *zone;
1991 unsigned long nr_soft_reclaimed; 2003 unsigned long nr_soft_reclaimed;
1992 unsigned long nr_soft_scanned; 2004 unsigned long nr_soft_scanned;
1993 unsigned long total_scanned = 0;
1994 2005
1995 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2006 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1996 gfp_zone(sc->gfp_mask), sc->nodemask) { 2007 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2005,19 +2016,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
2005 continue; 2016 continue;
2006 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2019 /*
2020 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and
2022 * scanned pages. This works for global memory pressure
2023 * and balancing, not for a memcg's limit.
2024 */
2025 nr_soft_scanned = 0;
2026 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2027 sc->order, sc->gfp_mask,
2028 &nr_soft_scanned);
2029 sc->nr_reclaimed += nr_soft_reclaimed;
2030 sc->nr_scanned += nr_soft_scanned;
2031 /* need some check to avoid more shrink_zone() calls */
2008 } 2032 }
2009 2033
2010 nr_soft_scanned = 0;
2011 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2012 sc->order, sc->gfp_mask,
2013 &nr_soft_scanned);
2014 sc->nr_reclaimed += nr_soft_reclaimed;
2015 total_scanned += nr_soft_scanned;
2016
2017 shrink_zone(priority, zone, sc); 2034 shrink_zone(priority, zone, sc);
2018 } 2035 }
2019
2020 return total_scanned;
2021} 2036}
2022 2037
2023static bool zone_reclaimable(struct zone *zone) 2038static bool zone_reclaimable(struct zone *zone)
@@ -2081,8 +2096,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2081 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2096 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2082 sc->nr_scanned = 0; 2097 sc->nr_scanned = 0;
2083 if (!priority) 2098 if (!priority)
2084 disable_swap_token(); 2099 disable_swap_token(sc->mem_cgroup);
2085 total_scanned += shrink_zones(priority, zonelist, sc); 2100 shrink_zones(priority, zonelist, sc);
2086 /* 2101 /*
2087 * Don't shrink slabs when reclaiming memory from 2102 * Don't shrink slabs when reclaiming memory from
2088 * over limit cgroups 2103 * over limit cgroups
@@ -2311,7 +2326,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2311 return true; 2326 return true;
2312 2327
2313 /* Check the watermark levels */ 2328 /* Check the watermark levels */
2314 for (i = 0; i < pgdat->nr_zones; i++) { 2329 for (i = 0; i <= classzone_idx; i++) {
2315 struct zone *zone = pgdat->node_zones + i; 2330 struct zone *zone = pgdat->node_zones + i;
2316 2331
2317 if (!populated_zone(zone)) 2332 if (!populated_zone(zone))
@@ -2329,7 +2344,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2329 } 2344 }
2330 2345
2331 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 2346 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2332 classzone_idx, 0)) 2347 i, 0))
2333 all_zones_ok = false; 2348 all_zones_ok = false;
2334 else 2349 else
2335 balanced += zone->present_pages; 2350 balanced += zone->present_pages;
@@ -2407,7 +2422,7 @@ loop_again:
2407 2422
2408 /* The swap token gets in the way of swapout... */ 2423 /* The swap token gets in the way of swapout... */
2409 if (!priority) 2424 if (!priority)
2410 disable_swap_token(); 2425 disable_swap_token(NULL);
2411 2426
2412 all_zones_ok = 1; 2427 all_zones_ok = 1;
2413 balanced = 0; 2428 balanced = 0;
@@ -2436,7 +2451,6 @@ loop_again:
2436 if (!zone_watermark_ok_safe(zone, order, 2451 if (!zone_watermark_ok_safe(zone, order,
2437 high_wmark_pages(zone), 0, 0)) { 2452 high_wmark_pages(zone), 0, 0)) {
2438 end_zone = i; 2453 end_zone = i;
2439 *classzone_idx = i;
2440 break; 2454 break;
2441 } 2455 }
2442 } 2456 }
@@ -2495,18 +2509,18 @@ loop_again:
2495 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2509 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2496 if (!zone_watermark_ok_safe(zone, order, 2510 if (!zone_watermark_ok_safe(zone, order,
2497 high_wmark_pages(zone) + balance_gap, 2511 high_wmark_pages(zone) + balance_gap,
2498 end_zone, 0)) 2512 end_zone, 0)) {
2499 shrink_zone(priority, zone, &sc); 2513 shrink_zone(priority, zone, &sc);
2500 reclaim_state->reclaimed_slab = 0;
2501 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2502 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2503 total_scanned += sc.nr_scanned;
2504 2514
2505 if (zone->all_unreclaimable) 2515 reclaim_state->reclaimed_slab = 0;
2506 continue; 2516 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2507 if (nr_slab == 0 && 2517 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2508 !zone_reclaimable(zone)) 2518 total_scanned += sc.nr_scanned;
2509 zone->all_unreclaimable = 1; 2519
2520 if (nr_slab == 0 && !zone_reclaimable(zone))
2521 zone->all_unreclaimable = 1;
2522 }
2523
2510 /* 2524 /*
2511 * If we've done a decent amount of scanning and 2525 * If we've done a decent amount of scanning and
2512 * the reclaim ratio is low, start doing writepage 2526 * the reclaim ratio is low, start doing writepage
@@ -2516,6 +2530,12 @@ loop_again:
2516 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2530 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2517 sc.may_writepage = 1; 2531 sc.may_writepage = 1;
2518 2532
2533 if (zone->all_unreclaimable) {
2534 if (end_zone && end_zone == i)
2535 end_zone--;
2536 continue;
2537 }
2538
2519 if (!zone_watermark_ok_safe(zone, order, 2539 if (!zone_watermark_ok_safe(zone, order,
2520 high_wmark_pages(zone), end_zone, 0)) { 2540 high_wmark_pages(zone), end_zone, 0)) {
2521 all_zones_ok = 0; 2541 all_zones_ok = 0;
@@ -2694,8 +2714,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2694 */ 2714 */
2695static int kswapd(void *p) 2715static int kswapd(void *p)
2696{ 2716{
2697 unsigned long order; 2717 unsigned long order, new_order;
2698 int classzone_idx; 2718 int classzone_idx, new_classzone_idx;
2699 pg_data_t *pgdat = (pg_data_t*)p; 2719 pg_data_t *pgdat = (pg_data_t*)p;
2700 struct task_struct *tsk = current; 2720 struct task_struct *tsk = current;
2701 2721
@@ -2725,17 +2745,23 @@ static int kswapd(void *p)
2725 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2745 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2726 set_freezable(); 2746 set_freezable();
2727 2747
2728 order = 0; 2748 order = new_order = 0;
2729 classzone_idx = MAX_NR_ZONES - 1; 2749 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2730 for ( ; ; ) { 2750 for ( ; ; ) {
2731 unsigned long new_order;
2732 int new_classzone_idx;
2733 int ret; 2751 int ret;
2734 2752
2735 new_order = pgdat->kswapd_max_order; 2753 /*
2736 new_classzone_idx = pgdat->classzone_idx; 2754 * If the last balance_pgdat was unsuccessful it's unlikely a
2737 pgdat->kswapd_max_order = 0; 2755 * new request of a similar or harder type will succeed soon
2738 pgdat->classzone_idx = MAX_NR_ZONES - 1; 2756 * so consider going to sleep on the basis we reclaimed at
2757 */
2758 if (classzone_idx >= new_classzone_idx && order == new_order) {
2759 new_order = pgdat->kswapd_max_order;
2760 new_classzone_idx = pgdat->classzone_idx;
2761 pgdat->kswapd_max_order = 0;
2762 pgdat->classzone_idx = pgdat->nr_zones - 1;
2763 }
2764
2739 if (order < new_order || classzone_idx > new_classzone_idx) { 2765 if (order < new_order || classzone_idx > new_classzone_idx) {
2740 /* 2766 /*
2741 * Don't sleep if someone wants a larger 'order' 2767 * Don't sleep if someone wants a larger 'order'
@@ -2748,7 +2774,7 @@ static int kswapd(void *p)
2748 order = pgdat->kswapd_max_order; 2774 order = pgdat->kswapd_max_order;
2749 classzone_idx = pgdat->classzone_idx; 2775 classzone_idx = pgdat->classzone_idx;
2750 pgdat->kswapd_max_order = 0; 2776 pgdat->kswapd_max_order = 0;
2751 pgdat->classzone_idx = MAX_NR_ZONES - 1; 2777 pgdat->classzone_idx = pgdat->nr_zones - 1;
2752 } 2778 }
2753 2779
2754 ret = try_to_freeze(); 2780 ret = try_to_freeze();