author     Linus Torvalds <torvalds@linux-foundation.org>    2013-07-03 20:12:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-07-03 20:12:13 -0400
commit     7f0ef0267e20d62d45d527911a993b1e998f4968
tree       de51abc7da5903f59d83e23937f22420164c9477
parent     862f0012549110d6f2586bf54b52ed4540cbff3a
parent     9307c29524502c21f0e8a6d96d850b2f5bc0bd9a
Merge branch 'akpm' (updates from Andrew Morton)
Merge first patch-bomb from Andrew Morton:

 - various misc bits
 - I've been patchmonkeying ocfs2 for a while, as Joel and Mark have been
   distracted. There has been quite a bit of activity.
 - About half the MM queue
 - Some backlight bits
 - Various lib/ updates
 - checkpatch updates
 - zillions more little rtc patches
 - ptrace
 - signals
 - exec
 - procfs
 - rapidio
 - nbd
 - aoe
 - pps
 - memstick
 - tools/testing/selftests updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (445 commits)
  tools/testing/selftests: don't assume the x bit is set on scripts
  selftests: add .gitignore for kcmp
  selftests: fix clean target in kcmp Makefile
  selftests: add .gitignore for vm
  selftests: add hugetlbfstest
  self-test: fix make clean
  selftests: exit 1 on failure
  kernel/resource.c: remove the unneeded assignment in function __find_resource
  aio: fix wrong comment in aio_complete()
  drivers/w1/slaves/w1_ds2408.c: add magic sequence to disable P0 test mode
  drivers/memstick/host/r592.c: convert to module_pci_driver
  drivers/memstick/host/jmb38x_ms: convert to module_pci_driver
  pps-gpio: add device-tree binding and support
  drivers/pps/clients/pps-gpio.c: convert to module_platform_driver
  drivers/pps/clients/pps-gpio.c: convert to devm_* helpers
  drivers/parport/share.c: use kzalloc
  Documentation/accounting/getdelays.c: avoid strncpy in accounting tool
  aoe: update internal version number to v83
  aoe: update copyright date
  aoe: perform I/O completions in parallel
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  12
-rw-r--r--  mm/backing-dev.c     |   5
-rw-r--r--  mm/bootmem.c         |  39
-rw-r--r--  mm/huge_memory.c     |   2
-rw-r--r--  mm/hugetlb.c         |   4
-rw-r--r--  mm/memcontrol.c      |  97
-rw-r--r--  mm/memory-failure.c  |  22
-rw-r--r--  mm/memory.c          |  13
-rw-r--r--  mm/memory_hotplug.c  |  48
-rw-r--r--  mm/mm_init.c         |  47
-rw-r--r--  mm/mmap.c            |   2
-rw-r--r--  mm/mremap.c          |   2
-rw-r--r--  mm/nobootmem.c       |  35
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/page_alloc.c      | 294
-rw-r--r--  mm/page_io.c         |  50
-rw-r--r--  mm/rmap.c            |   7
-rw-r--r--  mm/sparse.c          |   3
-rw-r--r--  mm/swap.c            | 106
-rw-r--r--  mm/swapfile.c        |  55
-rw-r--r--  mm/vmalloc.c         | 103
-rw-r--r--  mm/vmscan.c          | 585
22 files changed, 1021 insertions(+), 516 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index f5e698e30d4a..7e28ecfa8aa4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,15 @@ config FRONTSWAP
477 and swap data is stored as normal on the matching swap device. 477 and swap data is stored as normal on the matching swap device.
478 478
479 If unsure, say Y to enable frontswap. 479 If unsure, say Y to enable frontswap.
480
481config MEM_SOFT_DIRTY
482 bool "Track memory changes"
483 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
484 select PROC_PAGE_MONITOR
485 help
486	  This option enables memory changes tracking by introducing a
487	  soft-dirty bit on pte-s. This bit is set when someone writes
488	  into a page just as the regular dirty bit is, but unlike the
489	  latter it can be cleared by hand.
490
491 See Documentation/vm/soft-dirty.txt for more details.
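
The MEM_SOFT_DIRTY help text above is terse, so here is a hedged userspace sketch of how the feature is normally consumed, assuming the interface described in Documentation/vm/soft-dirty.txt (writing "4" to /proc/PID/clear_refs clears the bits; bit 55 of each /proc/PID/pagemap entry reports them). It is illustrative only and not part of this patch.

/* Hedged sketch: watch one page for writes via the soft-dirty interface. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int soft_dirty(const void *addr)
{
	uint64_t ent = 0;
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd >= 0) {
		/* one 64-bit entry per virtual page; bit 55 = soft-dirty */
		if (pread(fd, &ent, sizeof(ent),
			  ((uintptr_t)addr / psz) * sizeof(ent)) != sizeof(ent))
			ent = 0;
		close(fd);
	}
	return (ent >> 55) & 1;
}

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = aligned_alloc(psz, psz);
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	buf[0] = 0;			/* fault the page in first */
	write(fd, "4", 1);		/* "4" clears all soft-dirty bits */
	close(fd);

	printf("before write: %d\n", soft_dirty(buf));
	buf[0] = 1;			/* dirty the page again */
	printf("after  write: %d\n", soft_dirty(buf));
	return 0;
}
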
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
515int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, 515int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
516 unsigned int cap) 516 unsigned int cap)
517{ 517{
518 char tmp[32];
519 int err; 518 int err;
520 519
521 bdi->name = name; 520 bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
524 if (err) 523 if (err)
525 return err; 524 return err;
526 525
527 sprintf(tmp, "%.28s%s", name, "-%d"); 526 err = bdi_register(bdi, NULL, "%.28s-%ld", name,
528 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); 527 atomic_long_inc_return(&bdi_seq));
529 if (err) { 528 if (err) {
530 bdi_destroy(bdi); 529 bdi_destroy(bdi);
531 return err; 530 return err;
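
The backing-dev.c hunk drops the on-stack tmp buffer and instead hands the format string and its arguments straight to bdi_register(), which renders the name once at its final destination. Below is a hedged userspace sketch of the same forwarding pattern; register_named() and register_with_fmt() are hypothetical stand-ins, not kernel APIs.

#include <stdarg.h>
#include <stdio.h>

/* Render the name exactly once, at the point that owns the buffer. */
static void register_with_fmt(const char *fmt, va_list args)
{
	char name[32];

	vsnprintf(name, sizeof(name), fmt, args);
	printf("registered: %s\n", name);
}

static void register_named(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	register_with_fmt(fmt, args);	/* forward fmt + args, no temp copy */
	va_end(args);
}

int main(void)
{
	/* mirrors bdi_register(bdi, NULL, "%.28s-%ld", name, seq) */
	register_named("%.28s-%ld", "example", 42L);
	return 0;
}
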
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
241 return count; 241 return count;
242} 242}
243 243
244static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 244static int reset_managed_pages_done __initdata;
245
246static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
245{ 247{
246 struct zone *z; 248 struct zone *z;
247 249
248 /* 250 if (reset_managed_pages_done)
249 * In free_area_init_core(), highmem zone's managed_pages is set to 251 return;
250 * present_pages, and bootmem allocator doesn't allocate from highmem 252
251 * zones. So there's no need to recalculate managed_pages because all
252 * highmem pages will be managed by the buddy system. Here highmem
253 * zone also includes highmem movable zone.
254 */
255 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 253 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
256 if (!is_highmem(z)) 254 z->managed_pages = 0;
257 z->managed_pages = 0;
258} 255}
259 256
260/** 257void __init reset_all_zones_managed_pages(void)
261 * free_all_bootmem_node - release a node's free pages to the buddy allocator
262 * @pgdat: node to be released
263 *
264 * Returns the number of pages actually released.
265 */
266unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
267{ 258{
268 register_page_bootmem_info_node(pgdat); 259 struct pglist_data *pgdat;
269 reset_node_lowmem_managed_pages(pgdat); 260
270 return free_all_bootmem_core(pgdat->bdata); 261 for_each_online_pgdat(pgdat)
262 reset_node_managed_pages(pgdat);
263 reset_managed_pages_done = 1;
271} 264}
272 265
273/** 266/**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
279{ 272{
280 unsigned long total_pages = 0; 273 unsigned long total_pages = 0;
281 bootmem_data_t *bdata; 274 bootmem_data_t *bdata;
282 struct pglist_data *pgdat;
283 275
284 for_each_online_pgdat(pgdat) 276 reset_all_zones_managed_pages();
285 reset_node_lowmem_managed_pages(pgdat);
286 277
287 list_for_each_entry(bdata, &bdata_list, list) 278 list_for_each_entry(bdata, &bdata_list, list)
288 total_pages += free_all_bootmem_core(bdata); 279 total_pages += free_all_bootmem_core(bdata);
289 280
281 totalram_pages += total_pages;
282
290 return total_pages; 283 return total_pages;
291} 284}
292 285
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..d8b3b850150c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1429 if (ret == 1) { 1429 if (ret == 1) {
1430 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1430 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1431 VM_BUG_ON(!pmd_none(*new_pmd)); 1431 VM_BUG_ON(!pmd_none(*new_pmd));
1432 set_pmd_at(mm, new_addr, new_pmd, pmd); 1432 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1433 spin_unlock(&mm->page_table_lock); 1433 spin_unlock(&mm->page_table_lock);
1434 } 1434 }
1435out: 1435out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
319 319
320 hstate = hstate_vma(vma); 320 hstate = hstate_vma(vma);
321 321
322 return 1UL << (hstate->order + PAGE_SHIFT); 322 return 1UL << huge_page_shift(hstate);
323} 323}
324EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 324EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
325 325
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
1263 * side-effects, like CommitLimit going negative. 1263 * side-effects, like CommitLimit going negative.
1264 */ 1264 */
1265 if (h->order > (MAX_ORDER - 1)) 1265 if (h->order > (MAX_ORDER - 1))
1266 totalram_pages += 1 << h->order; 1266 adjust_managed_page_count(page, 1 << h->order);
1267 } 1267 }
1268} 1268}
1269 1269
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..2e851f453814 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,6 +1148,58 @@ skip_node:
1148 return NULL; 1148 return NULL;
1149} 1149}
1150 1150
1151static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1152{
1153 /*
1154 * When a group in the hierarchy below root is destroyed, the
1155 * hierarchy iterator can no longer be trusted since it might
1156 * have pointed to the destroyed group. Invalidate it.
1157 */
1158 atomic_inc(&root->dead_count);
1159}
1160
1161static struct mem_cgroup *
1162mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1163 struct mem_cgroup *root,
1164 int *sequence)
1165{
1166 struct mem_cgroup *position = NULL;
1167 /*
1168 * A cgroup destruction happens in two stages: offlining and
1169 * release. They are separated by a RCU grace period.
1170 *
1171 * If the iterator is valid, we may still race with an
1172 * offlining. The RCU lock ensures the object won't be
1173 * released, tryget will fail if we lost the race.
1174 */
1175 *sequence = atomic_read(&root->dead_count);
1176 if (iter->last_dead_count == *sequence) {
1177 smp_rmb();
1178 position = iter->last_visited;
1179 if (position && !css_tryget(&position->css))
1180 position = NULL;
1181 }
1182 return position;
1183}
1184
1185static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1186 struct mem_cgroup *last_visited,
1187 struct mem_cgroup *new_position,
1188 int sequence)
1189{
1190 if (last_visited)
1191 css_put(&last_visited->css);
1192 /*
1193 * We store the sequence count from the time @last_visited was
1194 * loaded successfully instead of rereading it here so that we
1195 * don't lose destruction events in between. We could have
1196 * raced with the destruction of @new_position after all.
1197 */
1198 iter->last_visited = new_position;
1199 smp_wmb();
1200 iter->last_dead_count = sequence;
1201}
1202
1151/** 1203/**
1152 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1204 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1153 * @root: hierarchy root 1205 * @root: hierarchy root
@@ -1171,7 +1223,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1171{ 1223{
1172 struct mem_cgroup *memcg = NULL; 1224 struct mem_cgroup *memcg = NULL;
1173 struct mem_cgroup *last_visited = NULL; 1225 struct mem_cgroup *last_visited = NULL;
1174 unsigned long uninitialized_var(dead_count);
1175 1226
1176 if (mem_cgroup_disabled()) 1227 if (mem_cgroup_disabled())
1177 return NULL; 1228 return NULL;
@@ -1191,6 +1242,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1191 rcu_read_lock(); 1242 rcu_read_lock();
1192 while (!memcg) { 1243 while (!memcg) {
1193 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1244 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1245 int uninitialized_var(seq);
1194 1246
1195 if (reclaim) { 1247 if (reclaim) {
1196 int nid = zone_to_nid(reclaim->zone); 1248 int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1256,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1204 goto out_unlock; 1256 goto out_unlock;
1205 } 1257 }
1206 1258
1207 /* 1259 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1208 * If the dead_count mismatches, a destruction
1209 * has happened or is happening concurrently.
1210 * If the dead_count matches, a destruction
1211 * might still happen concurrently, but since
1212 * we checked under RCU, that destruction
1213 * won't free the object until we release the
1214 * RCU reader lock. Thus, the dead_count
1215 * check verifies the pointer is still valid,
1216 * css_tryget() verifies the cgroup pointed to
1217 * is alive.
1218 */
1219 dead_count = atomic_read(&root->dead_count);
1220 if (dead_count == iter->last_dead_count) {
1221 smp_rmb();
1222 last_visited = iter->last_visited;
1223 if (last_visited &&
1224 !css_tryget(&last_visited->css))
1225 last_visited = NULL;
1226 }
1227 } 1260 }
1228 1261
1229 memcg = __mem_cgroup_iter_next(root, last_visited); 1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1230 1263
1231 if (reclaim) { 1264 if (reclaim) {
1232 if (last_visited) 1265 mem_cgroup_iter_update(iter, last_visited, memcg, seq);
1233 css_put(&last_visited->css);
1234
1235 iter->last_visited = memcg;
1236 smp_wmb();
1237 iter->last_dead_count = dead_count;
1238 1266
1239 if (!memcg) 1267 if (!memcg)
1240 iter->generation++; 1268 iter->generation++;
@@ -1448,11 +1476,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1448 return ret; 1476 return ret;
1449} 1477}
1450 1478
1451int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1479bool task_in_mem_cgroup(struct task_struct *task,
1480 const struct mem_cgroup *memcg)
1452{ 1481{
1453 int ret;
1454 struct mem_cgroup *curr = NULL; 1482 struct mem_cgroup *curr = NULL;
1455 struct task_struct *p; 1483 struct task_struct *p;
1484 bool ret;
1456 1485
1457 p = find_lock_task_mm(task); 1486 p = find_lock_task_mm(task);
1458 if (p) { 1487 if (p) {
@@ -1464,14 +1493,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1464 * killer still needs to detect if they have already been oom 1493 * killer still needs to detect if they have already been oom
1465 * killed to prevent needlessly killing additional tasks. 1494 * killed to prevent needlessly killing additional tasks.
1466 */ 1495 */
1467 task_lock(task); 1496 rcu_read_lock();
1468 curr = mem_cgroup_from_task(task); 1497 curr = mem_cgroup_from_task(task);
1469 if (curr) 1498 if (curr)
1470 css_get(&curr->css); 1499 css_get(&curr->css);
1471 task_unlock(task); 1500 rcu_read_unlock();
1472 } 1501 }
1473 if (!curr) 1502 if (!curr)
1474 return 0; 1503 return false;
1475 /* 1504 /*
1476 * We should check use_hierarchy of "memcg" not "curr". Because checking 1505 * We should check use_hierarchy of "memcg" not "curr". Because checking
1477 * use_hierarchy of "curr" here make this function true if hierarchy is 1506 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -6317,14 +6346,14 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6317 struct mem_cgroup *parent = memcg; 6346 struct mem_cgroup *parent = memcg;
6318 6347
6319 while ((parent = parent_mem_cgroup(parent))) 6348 while ((parent = parent_mem_cgroup(parent)))
6320 atomic_inc(&parent->dead_count); 6349 mem_cgroup_iter_invalidate(parent);
6321 6350
6322 /* 6351 /*
6323 * if the root memcg is not hierarchical we have to check it 6352 * if the root memcg is not hierarchical we have to check it
6324 * explicitely. 6353 * explicitely.
6325 */ 6354 */
6326 if (!root_mem_cgroup->use_hierarchy) 6355 if (!root_mem_cgroup->use_hierarchy)
6327 atomic_inc(&root_mem_cgroup->dead_count); 6356 mem_cgroup_iter_invalidate(root_mem_cgroup);
6328} 6357}
6329 6358
6330static void mem_cgroup_css_offline(struct cgroup *cont) 6359static void mem_cgroup_css_offline(struct cgroup *cont)
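
The mem_cgroup_iter_load()/mem_cgroup_iter_update() helpers factored out above cache a pointer together with the dead_count it was stored under: the updater publishes the pointer before the count (smp_wmb), and a reader only trusts the cached pointer if the current count still matches (smp_rmb before the dereference). Below is a hedged userspace analogue of that publish-and-validate pattern, using C11 atomics; all names are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>

struct cache {
	void *last_visited;		/* cached position */
	atomic_int last_dead_count;	/* generation it was stored under */
};

static atomic_int dead_count;		/* bumped on every "destruction" */

static void *cache_load(struct cache *c, int *seq)
{
	*seq = atomic_load(&dead_count);
	/* acquire pairs with the release in cache_update() */
	if (atomic_load_explicit(&c->last_dead_count,
				 memory_order_acquire) == *seq)
		return c->last_visited;	/* still trusted */
	return NULL;			/* stale: restart from the root */
}

static void cache_update(struct cache *c, void *new_pos, int seq)
{
	c->last_visited = new_pos;
	/* release: the pointer is visible before the count validating it */
	atomic_store_explicit(&c->last_dead_count, seq, memory_order_release);
}

int main(void)
{
	struct cache c = { 0 };
	int seq, dummy;

	cache_update(&c, &dummy, atomic_load(&dead_count));
	printf("valid: %p\n", cache_load(&c, &seq));

	atomic_fetch_add(&dead_count, 1);	/* like dead_count++ on destroy */
	printf("stale: %p\n", cache_load(&c, &seq));
	return 0;
}
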
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1410 1410
1411 /* 1411 /*
1412 * Isolate the page, so that it doesn't get reallocated if it 1412 * Isolate the page, so that it doesn't get reallocated if it
1413 * was free. 1413 * was free. This flag should be kept set until the source page
1414 * is freed and PG_hwpoison on it is set.
1414 */ 1415 */
1415 set_migratetype_isolate(p, true); 1416 set_migratetype_isolate(p, true);
1416 /* 1417 /*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1433 /* Not a free page */ 1434 /* Not a free page */
1434 ret = 1; 1435 ret = 1;
1435 } 1436 }
1436 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1437 unlock_memory_hotplug(); 1437 unlock_memory_hotplug();
1438 return ret; 1438 return ret;
1439} 1439}
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1494 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1495 &num_poisoned_pages); 1495 &num_poisoned_pages);
1496 } 1496 }
1497 /* keep elevated page count for bad page */
1498 return ret; 1497 return ret;
1499} 1498}
1500 1499
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
1559 atomic_long_inc(&num_poisoned_pages); 1558 atomic_long_inc(&num_poisoned_pages);
1560 } 1559 }
1561 } 1560 }
1562 /* keep elevated page count for bad page */ 1561 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1563 return ret; 1562 return ret;
1564} 1563}
1565 1564
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
1625 if (ret > 0) 1624 if (ret > 0)
1626 ret = -EIO; 1625 ret = -EIO;
1627 } else { 1626 } else {
1627 /*
1628 * After page migration succeeds, the source page can
1629 * be trapped in pagevec and actual freeing is delayed.
1630 * Freeing code works differently based on PG_hwpoison,
1631 * so there's a race. We need to make sure that the
1632 * source page should be freed back to buddy before
1633 * setting PG_hwpoison.
1634 */
1635 if (!is_free_buddy_page(page))
1636 lru_add_drain_all();
1637 if (!is_free_buddy_page(page))
1638 drain_all_pages();
1628 SetPageHWPoison(page); 1639 SetPageHWPoison(page);
1640 if (!is_free_buddy_page(page))
1641 pr_info("soft offline: %#lx: page leaked\n",
1642 pfn);
1629 atomic_long_inc(&num_poisoned_pages); 1643 atomic_long_inc(&num_poisoned_pages);
1630 } 1644 }
1631 } else { 1645 } else {
diff --git a/mm/memory.c b/mm/memory.c
index 95d0cce63583..b68812d682b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
82EXPORT_SYMBOL(mem_map); 82EXPORT_SYMBOL(mem_map);
83#endif 83#endif
84 84
85unsigned long num_physpages;
86/* 85/*
87 * A number of key systems in x86 including ioremap() rely on the assumption 86 * A number of key systems in x86 including ioremap() rely on the assumption
88 * that high_memory defines the upper bound on direct map memory, then end 87 * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
92 */ 91 */
93void * high_memory; 92void * high_memory;
94 93
95EXPORT_SYMBOL(num_physpages);
96EXPORT_SYMBOL(high_memory); 94EXPORT_SYMBOL(high_memory);
97 95
98/* 96/*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1101 spinlock_t *ptl; 1099 spinlock_t *ptl;
1102 pte_t *start_pte; 1100 pte_t *start_pte;
1103 pte_t *pte; 1101 pte_t *pte;
1102 unsigned long range_start = addr;
1104 1103
1105again: 1104again:
1106 init_rss_vec(rss); 1105 init_rss_vec(rss);
@@ -1206,12 +1205,14 @@ again:
1206 force_flush = 0; 1205 force_flush = 0;
1207 1206
1208#ifdef HAVE_GENERIC_MMU_GATHER 1207#ifdef HAVE_GENERIC_MMU_GATHER
1209 tlb->start = addr; 1208 tlb->start = range_start;
1210 tlb->end = end; 1209 tlb->end = addr;
1211#endif 1210#endif
1212 tlb_flush_mmu(tlb); 1211 tlb_flush_mmu(tlb);
1213 if (addr != end) 1212 if (addr != end) {
1213 range_start = addr;
1214 goto again; 1214 goto again;
1215 }
1215 } 1216 }
1216 1217
1217 return addr; 1218 return addr;
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
2904 details->first_index, details->last_index) { 2905 details->first_index, details->last_index) {
2905 2906
2906 vba = vma->vm_pgoff; 2907 vba = vma->vm_pgoff;
2907 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2908 vea = vba + vma_pages(vma) - 1;
2908 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2909 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2909 zba = details->first_index; 2910 zba = details->first_index;
2910 if (zba < vba) 2911 if (zba < vba)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 081b4d654ed6..f5ba127b2051 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
75 res->end = start + size - 1; 75 res->end = start + size - 1;
76 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 76 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
77 if (request_resource(&iomem_resource, res) < 0) { 77 if (request_resource(&iomem_resource, res) < 0) {
78 printk("System RAM resource %pR cannot be added\n", res); 78 pr_debug("System RAM resource %pR cannot be added\n", res);
79 kfree(res); 79 kfree(res);
80 res = NULL; 80 res = NULL;
81 } 81 }
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page,
101 atomic_inc(&page->_count); 101 atomic_inc(&page->_count);
102} 102}
103 103
104/* reference to __meminit __free_pages_bootmem is valid 104void put_page_bootmem(struct page *page)
105 * so use __ref to tell modpost not to generate a warning */
106void __ref put_page_bootmem(struct page *page)
107{ 105{
108 unsigned long type; 106 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
110 107
111 type = (unsigned long) page->lru.next; 108 type = (unsigned long) page->lru.next;
112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 109 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
116 ClearPagePrivate(page); 113 ClearPagePrivate(page);
117 set_page_private(page, 0); 114 set_page_private(page, 0);
118 INIT_LIST_HEAD(&page->lru); 115 INIT_LIST_HEAD(&page->lru);
119 116 free_reserved_page(page);
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
128 } 117 }
129
130} 118}
131 119
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 120#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
309 /* can't move pfns which are higher than @z2 */ 297 /* can't move pfns which are higher than @z2 */
310 if (end_pfn > zone_end_pfn(z2)) 298 if (end_pfn > zone_end_pfn(z2))
311 goto out_fail; 299 goto out_fail;
312 /* the move out part mast at the left most of @z2 */ 300 /* the move out part must be at the left most of @z2 */
313 if (start_pfn > z2->zone_start_pfn) 301 if (start_pfn > z2->zone_start_pfn)
314 goto out_fail; 302 goto out_fail;
315 /* must included/overlap */ 303 /* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
775 763
776void __online_page_set_limits(struct page *page) 764void __online_page_set_limits(struct page *page)
777{ 765{
778 unsigned long pfn = page_to_pfn(page);
779
780 if (pfn >= num_physpages)
781 num_physpages = pfn + 1;
782} 766}
783EXPORT_SYMBOL_GPL(__online_page_set_limits); 767EXPORT_SYMBOL_GPL(__online_page_set_limits);
784 768
785void __online_page_increment_counters(struct page *page) 769void __online_page_increment_counters(struct page *page)
786{ 770{
787 totalram_pages++; 771 adjust_managed_page_count(page, 1);
788
789#ifdef CONFIG_HIGHMEM
790 if (PageHighMem(page))
791 totalhigh_pages++;
792#endif
793} 772}
794EXPORT_SYMBOL_GPL(__online_page_increment_counters); 773EXPORT_SYMBOL_GPL(__online_page_increment_counters);
795 774
796void __online_page_free(struct page *page) 775void __online_page_free(struct page *page)
797{ 776{
798 ClearPageReserved(page); 777 __free_reserved_page(page);
799 init_page_count(page);
800 __free_page(page);
801} 778}
802EXPORT_SYMBOL_GPL(__online_page_free); 779EXPORT_SYMBOL_GPL(__online_page_free);
803 780
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
918 895
919int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 896int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
920{ 897{
898 unsigned long flags;
921 unsigned long onlined_pages = 0; 899 unsigned long onlined_pages = 0;
922 struct zone *zone; 900 struct zone *zone;
923 int need_zonelists_rebuild = 0; 901 int need_zonelists_rebuild = 0;
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
994 return ret; 972 return ret;
995 } 973 }
996 974
997 zone->managed_pages += onlined_pages;
998 zone->present_pages += onlined_pages; 975 zone->present_pages += onlined_pages;
976
977 pgdat_resize_lock(zone->zone_pgdat, &flags);
999 zone->zone_pgdat->node_present_pages += onlined_pages; 978 zone->zone_pgdat->node_present_pages += onlined_pages;
979 pgdat_resize_unlock(zone->zone_pgdat, &flags);
980
1000 if (onlined_pages) { 981 if (onlined_pages) {
1001 node_states_set_node(zone_to_nid(zone), &arg); 982 node_states_set_node(zone_to_nid(zone), &arg);
1002 if (need_zonelists_rebuild) 983 if (need_zonelists_rebuild)
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1487 unsigned long pfn, nr_pages, expire; 1468 unsigned long pfn, nr_pages, expire;
1488 long offlined_pages; 1469 long offlined_pages;
1489 int ret, drain, retry_max, node; 1470 int ret, drain, retry_max, node;
1471 unsigned long flags;
1490 struct zone *zone; 1472 struct zone *zone;
1491 struct memory_notify arg; 1473 struct memory_notify arg;
1492 1474
@@ -1578,10 +1560,12 @@ repeat:
1578 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1560 /* reset pagetype flags and makes migrate type to be MOVABLE */
1579 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1561 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1580 /* removal success */ 1562 /* removal success */
1581 zone->managed_pages -= offlined_pages; 1563 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1582 zone->present_pages -= offlined_pages; 1564 zone->present_pages -= offlined_pages;
1565
1566 pgdat_resize_lock(zone->zone_pgdat, &flags);
1583 zone->zone_pgdat->node_present_pages -= offlined_pages; 1567 zone->zone_pgdat->node_present_pages -= offlined_pages;
1584 totalram_pages -= offlined_pages; 1568 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1585 1569
1586 init_per_zone_wmark_min(); 1570 init_per_zone_wmark_min();
1587 1571
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/memory.h>
13#include <linux/notifier.h>
12#include "internal.h" 14#include "internal.h"
13 15
14#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
147struct kobject *mm_kobj; 149struct kobject *mm_kobj;
148EXPORT_SYMBOL_GPL(mm_kobj); 150EXPORT_SYMBOL_GPL(mm_kobj);
149 151
152#ifdef CONFIG_SMP
153s32 vm_committed_as_batch = 32;
154
155static void __meminit mm_compute_batch(void)
156{
157 u64 memsized_batch;
158 s32 nr = num_present_cpus();
159 s32 batch = max_t(s32, nr*2, 32);
160
161 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
162 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
163
164 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
165}
166
167static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
168 unsigned long action, void *arg)
169{
170 switch (action) {
171 case MEM_ONLINE:
172 case MEM_OFFLINE:
173 mm_compute_batch();
174 default:
175 break;
176 }
177 return NOTIFY_OK;
178}
179
180static struct notifier_block compute_batch_nb __meminitdata = {
181 .notifier_call = mm_compute_batch_notifier,
182 .priority = IPC_CALLBACK_PRI, /* use lowest priority */
183};
184
185static int __init mm_compute_batch_init(void)
186{
187 mm_compute_batch();
188 register_hotmemory_notifier(&compute_batch_nb);
189
190 return 0;
191}
192
193__initcall(mm_compute_batch_init);
194
195#endif
196
150static int __init mm_sysfs_init(void) 197static int __init mm_sysfs_init(void)
151{ 198{
152 mm_kobj = kobject_create_and_add("mm", kernel_kobj); 199 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
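
mm_compute_batch() above sizes vm_committed_as_batch at roughly 0.4% of the per-CPU share of memory (one 256th), floored at max(2 * num_present_cpus(), 32) and capped at INT32_MAX. A hedged standalone re-computation with made-up machine sizes, just to make the arithmetic concrete:

#include <stdint.h>
#include <stdio.h>

static int32_t compute_batch(uint64_t totalram_pages, int32_t nr_cpus)
{
	int32_t floor = nr_cpus * 2 > 32 ? nr_cpus * 2 : 32;
	uint64_t memsized = (totalram_pages / nr_cpus) / 256;

	if (memsized > INT32_MAX)
		memsized = INT32_MAX;
	return memsized > (uint64_t)floor ? (int32_t)memsized : floor;
}

int main(void)
{
	/* 16 GiB of 4 KiB pages = 4194304 pages, 8 CPUs -> batch 2048 */
	printf("%d\n", (int)compute_batch(4194304ULL, 8));
	/* 32 MiB box (8192 pages), 2 CPUs -> the floor of 32 wins */
	printf("%d\n", (int)compute_batch(8192ULL, 2));
	return 0;
}
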
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fad..8468ffd05bae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = vma_pages(vma);
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..3708655378e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
126 continue; 126 continue;
127 pte = ptep_get_and_clear(mm, old_addr, old_pte); 127 pte = ptep_get_and_clear(mm, old_addr, old_pte);
128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
129 set_pte_at(mm, new_addr, new_pte, pte); 129 set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
130 } 130 }
131 131
132 arch_leave_lazy_mmu_mode(); 132 arch_leave_lazy_mmu_mode();
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..61107cf55bb3 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 140static int reset_managed_pages_done __initdata;
141
142static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
141{ 143{
142 struct zone *z; 144 struct zone *z;
143 145
144 /* 146 if (reset_managed_pages_done)
145 * In free_area_init_core(), highmem zone's managed_pages is set to 147 return;
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 148 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z)) 149 z->managed_pages = 0;
153 z->managed_pages = 0; 150}
151
152void __init reset_all_zones_managed_pages(void)
153{
154 struct pglist_data *pgdat;
155
156 for_each_online_pgdat(pgdat)
157 reset_node_managed_pages(pgdat);
158 reset_managed_pages_done = 1;
154} 159}
155 160
156/** 161/**
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
160 */ 165 */
161unsigned long __init free_all_bootmem(void) 166unsigned long __init free_all_bootmem(void)
162{ 167{
163 struct pglist_data *pgdat; 168 unsigned long pages;
164 169
165 for_each_online_pgdat(pgdat) 170 reset_all_zones_managed_pages();
166 reset_node_lowmem_managed_pages(pgdat);
167 171
168 /* 172 /*
169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 173 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
170 * because in some case like Node0 doesn't have RAM installed 174 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 175 * low ram will be on Node1
172 */ 176 */
173 return free_low_memory_core_early(); 177 pages = free_low_memory_core_early();
178 totalram_pages += pages;
179
180 return pages;
174} 181}
175 182
176/** 183/**
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..e44e6e0a125c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
56void *high_memory; 56void *high_memory;
57struct page *mem_map; 57struct page *mem_map;
58unsigned long max_mapnr; 58unsigned long max_mapnr;
59unsigned long num_physpages;
60unsigned long highest_memmap_pfn; 59unsigned long highest_memmap_pfn;
61struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
62int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
85EXPORT_SYMBOL_GPL(vm_memory_committed); 84EXPORT_SYMBOL_GPL(vm_memory_committed);
86 85
87EXPORT_SYMBOL(mem_map); 86EXPORT_SYMBOL(mem_map);
88EXPORT_SYMBOL(num_physpages);
89 87
90/* list of mapped, potentially shareable regions */ 88/* list of mapped, potentially shareable regions */
91static struct kmem_cache *vm_region_jar; 89static struct kmem_cache *vm_region_jar;
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
282 280
283long vread(char *buf, char *addr, unsigned long count) 281long vread(char *buf, char *addr, unsigned long count)
284{ 282{
283 /* Don't allow overflow */
284 if ((unsigned long) buf + count < count)
285 count = -(unsigned long) buf;
286
285 memcpy(buf, addr, count); 287 memcpy(buf, addr, count);
286 return count; 288 return count;
287} 289}
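
The vread() hunk clamps count so that buf + count cannot wrap past the top of the address space; in unsigned arithmetic, -(unsigned long)buf is exactly the number of bytes left before the pointer would wrap. A hedged standalone sketch of that clamp:

#include <stdio.h>

static unsigned long clamp_count(unsigned long buf, unsigned long count)
{
	if (buf + count < count)	/* the addition wrapped around */
		count = -buf;		/* bytes left to the top: 2^N - buf */
	return count;
}

int main(void)
{
	unsigned long near_top = ~0UL - 100;	/* 100 bytes below the top */

	printf("%lu\n", clamp_count(4096UL, 128UL));	/* untouched: 128 */
	printf("%lu\n", clamp_count(near_top, 4096UL));	/* clamped to 101 */
	return 0;
}
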
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..327516b7aee9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
61#include <linux/hugetlb.h> 61#include <linux/hugetlb.h>
62#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
63 63
64#include <asm/sections.h>
64#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
65#include <asm/div64.h> 66#include <asm/div64.h>
66#include "internal.h" 67#include "internal.h"
67 68
69/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
70static DEFINE_MUTEX(pcp_batch_high_lock);
71
68#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 72#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
69DEFINE_PER_CPU(int, numa_node); 73DEFINE_PER_CPU(int, numa_node);
70EXPORT_PER_CPU_SYMBOL(numa_node); 74EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
100}; 104};
101EXPORT_SYMBOL(node_states); 105EXPORT_SYMBOL(node_states);
102 106
107/* Protect totalram_pages and zone->managed_pages */
108static DEFINE_SPINLOCK(managed_page_count_lock);
109
103unsigned long totalram_pages __read_mostly; 110unsigned long totalram_pages __read_mostly;
104unsigned long totalreserve_pages __read_mostly; 111unsigned long totalreserve_pages __read_mostly;
105/* 112/*
@@ -739,14 +746,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
739 local_irq_restore(flags); 746 local_irq_restore(flags);
740} 747}
741 748
742/* 749void __init __free_pages_bootmem(struct page *page, unsigned int order)
743 * Read access to zone->managed_pages is safe because it's unsigned long,
744 * but we still need to serialize writers. Currently all callers of
745 * __free_pages_bootmem() except put_page_bootmem() should only be used
746 * at boot time. So for shorter boot time, we shift the burden to
747 * put_page_bootmem() to serialize writers.
748 */
749void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
750{ 750{
751 unsigned int nr_pages = 1 << order; 751 unsigned int nr_pages = 1 << order;
752 unsigned int loop; 752 unsigned int loop;
@@ -781,11 +781,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
781 set_page_refcounted(page); 781 set_page_refcounted(page);
782 set_pageblock_migratetype(page, MIGRATE_CMA); 782 set_pageblock_migratetype(page, MIGRATE_CMA);
783 __free_pages(page, pageblock_order); 783 __free_pages(page, pageblock_order);
784 totalram_pages += pageblock_nr_pages; 784 adjust_managed_page_count(page, pageblock_nr_pages);
785#ifdef CONFIG_HIGHMEM
786 if (PageHighMem(page))
787 totalhigh_pages += pageblock_nr_pages;
788#endif
789} 785}
790#endif 786#endif
791 787
@@ -1179,10 +1175,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1179{ 1175{
1180 unsigned long flags; 1176 unsigned long flags;
1181 int to_drain; 1177 int to_drain;
1178 unsigned long batch;
1182 1179
1183 local_irq_save(flags); 1180 local_irq_save(flags);
1184 if (pcp->count >= pcp->batch) 1181 batch = ACCESS_ONCE(pcp->batch);
1185 to_drain = pcp->batch; 1182 if (pcp->count >= batch)
1183 to_drain = batch;
1186 else 1184 else
1187 to_drain = pcp->count; 1185 to_drain = pcp->count;
1188 if (to_drain > 0) { 1186 if (to_drain > 0) {
@@ -1350,8 +1348,9 @@ void free_hot_cold_page(struct page *page, int cold)
1350 list_add(&page->lru, &pcp->lists[migratetype]); 1348 list_add(&page->lru, &pcp->lists[migratetype]);
1351 pcp->count++; 1349 pcp->count++;
1352 if (pcp->count >= pcp->high) { 1350 if (pcp->count >= pcp->high) {
1353 free_pcppages_bulk(zone, pcp->batch, pcp); 1351 unsigned long batch = ACCESS_ONCE(pcp->batch);
1354 pcp->count -= pcp->batch; 1352 free_pcppages_bulk(zone, batch, pcp);
1353 pcp->count -= batch;
1355 } 1354 }
1356 1355
1357out: 1356out:
@@ -2839,7 +2838,7 @@ EXPORT_SYMBOL(free_pages_exact);
2839 * nr_free_zone_pages() counts the number of pages which are beyond the 2838 * nr_free_zone_pages() counts the number of pages which are beyond the
2840 * high watermark within all zones at or below a given zone index. For each 2839 * high watermark within all zones at or below a given zone index. For each
2841 * zone, the number of pages is calculated as: 2840 * zone, the number of pages is calculated as:
2842 * present_pages - high_pages 2841 * managed_pages - high_pages
2843 */ 2842 */
2844static unsigned long nr_free_zone_pages(int offset) 2843static unsigned long nr_free_zone_pages(int offset)
2845{ 2844{
@@ -2906,9 +2905,13 @@ EXPORT_SYMBOL(si_meminfo);
2906#ifdef CONFIG_NUMA 2905#ifdef CONFIG_NUMA
2907void si_meminfo_node(struct sysinfo *val, int nid) 2906void si_meminfo_node(struct sysinfo *val, int nid)
2908{ 2907{
2908 int zone_type; /* needs to be signed */
2909 unsigned long managed_pages = 0;
2909 pg_data_t *pgdat = NODE_DATA(nid); 2910 pg_data_t *pgdat = NODE_DATA(nid);
2910 2911
2911 val->totalram = pgdat->node_present_pages; 2912 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
2913 managed_pages += pgdat->node_zones[zone_type].managed_pages;
2914 val->totalram = managed_pages;
2912 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2915 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2913#ifdef CONFIG_HIGHMEM 2916#ifdef CONFIG_HIGHMEM
2914 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 2917 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3250,18 +3253,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3250 static DEFINE_MUTEX(zl_order_mutex); 3253 static DEFINE_MUTEX(zl_order_mutex);
3251 3254
3252 mutex_lock(&zl_order_mutex); 3255 mutex_lock(&zl_order_mutex);
3253 if (write) 3256 if (write) {
3254 strcpy(saved_string, (char*)table->data); 3257 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3258 ret = -EINVAL;
3259 goto out;
3260 }
3261 strcpy(saved_string, (char *)table->data);
3262 }
3255 ret = proc_dostring(table, write, buffer, length, ppos); 3263 ret = proc_dostring(table, write, buffer, length, ppos);
3256 if (ret) 3264 if (ret)
3257 goto out; 3265 goto out;
3258 if (write) { 3266 if (write) {
3259 int oldval = user_zonelist_order; 3267 int oldval = user_zonelist_order;
3260 if (__parse_numa_zonelist_order((char*)table->data)) { 3268
3269 ret = __parse_numa_zonelist_order((char *)table->data);
3270 if (ret) {
3261 /* 3271 /*
3262 * bogus value. restore saved string 3272 * bogus value. restore saved string
3263 */ 3273 */
3264 strncpy((char*)table->data, saved_string, 3274 strncpy((char *)table->data, saved_string,
3265 NUMA_ZONELIST_ORDER_LEN); 3275 NUMA_ZONELIST_ORDER_LEN);
3266 user_zonelist_order = oldval; 3276 user_zonelist_order = oldval;
3267 } else if (oldval != user_zonelist_order) { 3277 } else if (oldval != user_zonelist_order) {
@@ -3425,8 +3435,8 @@ static int default_zonelist_order(void)
3425 z = &NODE_DATA(nid)->node_zones[zone_type]; 3435 z = &NODE_DATA(nid)->node_zones[zone_type];
3426 if (populated_zone(z)) { 3436 if (populated_zone(z)) {
3427 if (zone_type < ZONE_NORMAL) 3437 if (zone_type < ZONE_NORMAL)
3428 low_kmem_size += z->present_pages; 3438 low_kmem_size += z->managed_pages;
3429 total_size += z->present_pages; 3439 total_size += z->managed_pages;
3430 } else if (zone_type == ZONE_NORMAL) { 3440 } else if (zone_type == ZONE_NORMAL) {
3431 /* 3441 /*
3432 * If any node has only lowmem, then node order 3442 * If any node has only lowmem, then node order
@@ -3705,12 +3715,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3705 mminit_verify_zonelist(); 3715 mminit_verify_zonelist();
3706 cpuset_init_current_mems_allowed(); 3716 cpuset_init_current_mems_allowed();
3707 } else { 3717 } else {
3708 /* we have to stop all cpus to guarantee there is no user
3709 of zonelist */
3710#ifdef CONFIG_MEMORY_HOTPLUG 3718#ifdef CONFIG_MEMORY_HOTPLUG
3711 if (zone) 3719 if (zone)
3712 setup_zone_pageset(zone); 3720 setup_zone_pageset(zone);
3713#endif 3721#endif
3722 /* we have to stop all cpus to guarantee there is no user
3723 of zonelist */
3714 stop_machine(__build_all_zonelists, pgdat, NULL); 3724 stop_machine(__build_all_zonelists, pgdat, NULL);
3715 /* cpuset refresh routine should be here */ 3725 /* cpuset refresh routine should be here */
3716 } 3726 }
@@ -4032,7 +4042,40 @@ static int __meminit zone_batchsize(struct zone *zone)
4032#endif 4042#endif
4033} 4043}
4034 4044
4035static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4045/*
4046 * pcp->high and pcp->batch values are related and dependent on one another:
4047 * ->batch must never be higher than ->high.
4048 * The following function updates them in a safe manner without read side
4049 * locking.
4050 *
4051 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4052 * those fields changing asynchronously (according to the above rule).
4053 *
4054 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4055 * outside of boot time (or some other assurance that no concurrent updaters
4056 * exist).
4057 */
4058static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4059 unsigned long batch)
4060{
4061 /* start with a fail safe value for batch */
4062 pcp->batch = 1;
4063 smp_wmb();
4064
4065 /* Update high, then batch, in order */
4066 pcp->high = high;
4067 smp_wmb();
4068
4069 pcp->batch = batch;
4070}
4071
4072/* a companion to pageset_set_high() */
4073static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4074{
4075 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4076}
4077
4078static void pageset_init(struct per_cpu_pageset *p)
4036{ 4079{
4037 struct per_cpu_pages *pcp; 4080 struct per_cpu_pages *pcp;
4038 int migratetype; 4081 int migratetype;
@@ -4041,45 +4084,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4041 4084
4042 pcp = &p->pcp; 4085 pcp = &p->pcp;
4043 pcp->count = 0; 4086 pcp->count = 0;
4044 pcp->high = 6 * batch;
4045 pcp->batch = max(1UL, 1 * batch);
4046 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4087 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4047 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4088 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4048} 4089}
4049 4090
4091static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4092{
4093 pageset_init(p);
4094 pageset_set_batch(p, batch);
4095}
4096
4050/* 4097/*
4051 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 4098 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4052 * to the value high for the pageset p. 4099 * to the value high for the pageset p.
4053 */ 4100 */
4054 4101static void pageset_set_high(struct per_cpu_pageset *p,
4055static void setup_pagelist_highmark(struct per_cpu_pageset *p,
4056 unsigned long high) 4102 unsigned long high)
4057{ 4103{
4058 struct per_cpu_pages *pcp; 4104 unsigned long batch = max(1UL, high / 4);
4105 if ((high / 4) > (PAGE_SHIFT * 8))
4106 batch = PAGE_SHIFT * 8;
4059 4107
4060 pcp = &p->pcp; 4108 pageset_update(&p->pcp, high, batch);
4061 pcp->high = high;
4062 pcp->batch = max(1UL, high/4);
4063 if ((high/4) > (PAGE_SHIFT * 8))
4064 pcp->batch = PAGE_SHIFT * 8;
4065} 4109}
4066 4110
4067static void __meminit setup_zone_pageset(struct zone *zone) 4111static void __meminit pageset_set_high_and_batch(struct zone *zone,
4112 struct per_cpu_pageset *pcp)
4068{ 4113{
4069 int cpu; 4114 if (percpu_pagelist_fraction)
4070 4115 pageset_set_high(pcp,
4071 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4116 (zone->managed_pages /
4117 percpu_pagelist_fraction));
4118 else
4119 pageset_set_batch(pcp, zone_batchsize(zone));
4120}
4072 4121
4073 for_each_possible_cpu(cpu) { 4122static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4074 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4123{
4124 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4075 4125
4076 setup_pageset(pcp, zone_batchsize(zone)); 4126 pageset_init(pcp);
4127 pageset_set_high_and_batch(zone, pcp);
4128}
4077 4129
4078 if (percpu_pagelist_fraction) 4130static void __meminit setup_zone_pageset(struct zone *zone)
4079 setup_pagelist_highmark(pcp, 4131{
4080 (zone->managed_pages / 4132 int cpu;
4081 percpu_pagelist_fraction)); 4133 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4082 } 4134 for_each_possible_cpu(cpu)
4135 zone_pageset_init(zone, cpu);
4083} 4136}
4084 4137
4085/* 4138/*
@@ -5150,35 +5203,101 @@ early_param("movablecore", cmdline_parse_movablecore);
5150 5203
5151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5204#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5152 5205
5153unsigned long free_reserved_area(unsigned long start, unsigned long end, 5206void adjust_managed_page_count(struct page *page, long count)
5154 int poison, char *s) 5207{
5208 spin_lock(&managed_page_count_lock);
5209 page_zone(page)->managed_pages += count;
5210 totalram_pages += count;
5211#ifdef CONFIG_HIGHMEM
5212 if (PageHighMem(page))
5213 totalhigh_pages += count;
5214#endif
5215 spin_unlock(&managed_page_count_lock);
5216}
5217EXPORT_SYMBOL(adjust_managed_page_count);
5218
5219unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5155{ 5220{
5156 unsigned long pages, pos; 5221 void *pos;
5222 unsigned long pages = 0;
5157 5223
5158 pos = start = PAGE_ALIGN(start); 5224 start = (void *)PAGE_ALIGN((unsigned long)start);
5159 end &= PAGE_MASK; 5225 end = (void *)((unsigned long)end & PAGE_MASK);
5160 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { 5226 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5161 if (poison) 5227 if ((unsigned int)poison <= 0xFF)
5162 memset((void *)pos, poison, PAGE_SIZE); 5228 memset(pos, poison, PAGE_SIZE);
5163 free_reserved_page(virt_to_page((void *)pos)); 5229 free_reserved_page(virt_to_page(pos));
5164 } 5230 }
5165 5231
5166 if (pages && s) 5232 if (pages && s)
5167 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", 5233 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5168 s, pages << (PAGE_SHIFT - 10), start, end); 5234 s, pages << (PAGE_SHIFT - 10), start, end);
5169 5235
5170 return pages; 5236 return pages;
5171} 5237}
5238EXPORT_SYMBOL(free_reserved_area);
5172 5239
5173#ifdef CONFIG_HIGHMEM 5240#ifdef CONFIG_HIGHMEM
5174void free_highmem_page(struct page *page) 5241void free_highmem_page(struct page *page)
5175{ 5242{
5176 __free_reserved_page(page); 5243 __free_reserved_page(page);
5177 totalram_pages++; 5244 totalram_pages++;
5245 page_zone(page)->managed_pages++;
5178 totalhigh_pages++; 5246 totalhigh_pages++;
5179} 5247}
5180#endif 5248#endif
5181 5249
5250
5251void __init mem_init_print_info(const char *str)
5252{
5253 unsigned long physpages, codesize, datasize, rosize, bss_size;
5254 unsigned long init_code_size, init_data_size;
5255
5256 physpages = get_num_physpages();
5257 codesize = _etext - _stext;
5258 datasize = _edata - _sdata;
5259 rosize = __end_rodata - __start_rodata;
5260 bss_size = __bss_stop - __bss_start;
5261 init_data_size = __init_end - __init_begin;
5262 init_code_size = _einittext - _sinittext;
5263
5264 /*
5265 * Detect special cases and adjust section sizes accordingly:
5266 * 1) .init.* may be embedded into .data sections
5267 * 2) .init.text.* may be out of [__init_begin, __init_end],
5268 * please refer to arch/tile/kernel/vmlinux.lds.S.
5269 * 3) .rodata.* may be embedded into .text or .data sections.
5270 */
5271#define adj_init_size(start, end, size, pos, adj) \
5272 if (start <= pos && pos < end && size > adj) \
5273 size -= adj;
5274
5275 adj_init_size(__init_begin, __init_end, init_data_size,
5276 _sinittext, init_code_size);
5277 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5278 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5279 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5280 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5281
5282#undef adj_init_size
5283
5284 printk("Memory: %luK/%luK available "
5285 "(%luK kernel code, %luK rwdata, %luK rodata, "
5286 "%luK init, %luK bss, %luK reserved"
5287#ifdef CONFIG_HIGHMEM
5288 ", %luK highmem"
5289#endif
5290 "%s%s)\n",
5291 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5292 codesize >> 10, datasize >> 10, rosize >> 10,
5293 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5294 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5295#ifdef CONFIG_HIGHMEM
5296 totalhigh_pages << (PAGE_SHIFT-10),
5297#endif
5298 str ? ", " : "", str ? str : "");
5299}
5300
5182/** 5301/**
5183 * set_dma_reserve - set the specified number of pages reserved in the first zone 5302 * set_dma_reserve - set the specified number of pages reserved in the first zone
5184 * @new_dma_reserve: The number of pages to mark reserved 5303 * @new_dma_reserve: The number of pages to mark reserved
@@ -5540,7 +5659,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5540 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5659 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5541 * can have before it gets flushed back to buddy allocator. 5660 * can have before it gets flushed back to buddy allocator.
5542 */ 5661 */
5543
5544int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5662int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5545 void __user *buffer, size_t *length, loff_t *ppos) 5663 void __user *buffer, size_t *length, loff_t *ppos)
5546{ 5664{
@@ -5551,14 +5669,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5551 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5669 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5552 if (!write || (ret < 0)) 5670 if (!write || (ret < 0))
5553 return ret; 5671 return ret;
5672
5673 mutex_lock(&pcp_batch_high_lock);
5554 for_each_populated_zone(zone) { 5674 for_each_populated_zone(zone) {
5555 for_each_possible_cpu(cpu) { 5675 unsigned long high;
5556 unsigned long high; 5676 high = zone->managed_pages / percpu_pagelist_fraction;
5557 high = zone->managed_pages / percpu_pagelist_fraction; 5677 for_each_possible_cpu(cpu)
5558 setup_pagelist_highmark( 5678 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5559 per_cpu_ptr(zone->pageset, cpu), high); 5679 high);
5560 }
5561 } 5680 }
5681 mutex_unlock(&pcp_batch_high_lock);
5562 return 0; 5682 return 0;
5563} 5683}
5564 5684
@@ -6047,32 +6167,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
6047#endif 6167#endif
6048 6168
6049#ifdef CONFIG_MEMORY_HOTPLUG 6169#ifdef CONFIG_MEMORY_HOTPLUG
6050static int __meminit __zone_pcp_update(void *data) 6170/*
6051{ 6171 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6052 struct zone *zone = data; 6172 * page high values need to be recalculated.
6053 int cpu; 6173 */
6054 unsigned long batch = zone_batchsize(zone), flags;
6055
6056 for_each_possible_cpu(cpu) {
6057 struct per_cpu_pageset *pset;
6058 struct per_cpu_pages *pcp;
6059
6060 pset = per_cpu_ptr(zone->pageset, cpu);
6061 pcp = &pset->pcp;
6062
6063 local_irq_save(flags);
6064 if (pcp->count > 0)
6065 free_pcppages_bulk(zone, pcp->count, pcp);
6066 drain_zonestat(zone, pset);
6067 setup_pageset(pset, batch);
6068 local_irq_restore(flags);
6069 }
6070 return 0;
6071}
6072
6073void __meminit zone_pcp_update(struct zone *zone) 6174void __meminit zone_pcp_update(struct zone *zone)
6074{ 6175{
6075 stop_machine(__zone_pcp_update, zone, NULL); 6176 unsigned cpu;
6177 mutex_lock(&pcp_batch_high_lock);
6178 for_each_possible_cpu(cpu)
6179 pageset_set_high_and_batch(zone,
6180 per_cpu_ptr(zone->pageset, cpu));
6181 mutex_unlock(&pcp_batch_high_lock);
6076} 6182}
6077#endif 6183#endif
6078 6184
@@ -6142,6 +6248,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6142 list_del(&page->lru); 6248 list_del(&page->lru);
6143 rmv_page_order(page); 6249 rmv_page_order(page);
6144 zone->free_area[order].nr_free--; 6250 zone->free_area[order].nr_free--;
6251#ifdef CONFIG_HIGHMEM
6252 if (PageHighMem(page))
6253 totalhigh_pages -= 1 << order;
6254#endif
6145 for (i = 0; i < (1 << order); i++) 6255 for (i = 0; i < (1 << order); i++)
6146 SetPageReserved((page+i)); 6256 SetPageReserved((page+i));
6147 pfn += (1 << order); 6257 pfn += (1 << order);
diff --git a/mm/page_io.c b/mm/page_io.c
index a8a3ef45fed7..ba05b64e5d8d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <linux/blkdev.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25 26
26static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err)
80 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
81 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
82 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_sector);
83 } else { 84 goto out;
84 SetPageUptodate(page);
85 } 85 }
86
87 SetPageUptodate(page);
88
89 /*
90 * There is no guarantee that the page is in swap cache - the software
91 * suspend code (at least) uses end_swap_bio_read() against a non-
92 * swapcache page. So we must check PG_swapcache before proceeding with
93 * this optimization.
94 */
95 if (likely(PageSwapCache(page))) {
96 struct swap_info_struct *sis;
97
98 sis = page_swap_info(page);
99 if (sis->flags & SWP_BLKDEV) {
100 /*
101 * The swap subsystem performs lazy swap slot freeing,
102 * expecting that the page will be swapped out again.
103 * So we can avoid an unnecessary write if the page
104 * isn't redirtied.
105 * This is good for real swap storage because we can
106 * reduce unnecessary I/O and enhance wear-leveling
107 * if an SSD is used as the swap device.
108 * But if an in-memory swap device (e.g. zram) is used,
109 * this causes a duplicated copy between uncompressed
110 * data in VM-owned memory and compressed data in
111 * zram-owned memory. So let's free zram-owned memory
112 * and make the VM-owned decompressed page *dirty*,
113 * so the page should be swapped out somewhere again if
114 * we again wish to reclaim it.
115 */
116 struct gendisk *disk = sis->bdev->bd_disk;
117 if (disk->fops->swap_slot_free_notify) {
118 swp_entry_t entry;
119 unsigned long offset;
120
121 entry.val = page_private(page);
122 offset = swp_offset(entry);
123
124 SetPageDirty(page);
125 disk->fops->swap_slot_free_notify(sis->bdev,
126 offset);
127 }
128 }
129 }
130
131out:
86 unlock_page(page); 132 unlock_page(page);
87 bio_put(bio); 133 bio_put(bio);
88} 134}
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..e22ceeb6e5ec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page,
1093 else 1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1095 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
1096 if (!mlocked_vma_newpage(vma, page)) 1096 if (!mlocked_vma_newpage(vma, page)) {
1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 SetPageActive(page);
1098 else 1098 lru_cache_add(page);
1099 } else
1099 add_page_to_unevictable_list(page); 1100 add_page_to_unevictable_list(page);
1100} 1101}
1101 1102
diff --git a/mm/sparse.c b/mm/sparse.c
index 1c91f0d3f6ab..3194ec414728 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -481,6 +481,9 @@ void __init sparse_init(void)
481 struct page **map_map; 481 struct page **map_map;
482#endif 482#endif
483 483
484 /* see include/linux/mmzone.h 'struct mem_section' definition */
485 BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
486
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ 487 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order(); 488 set_pageblock_order();
486 489
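The sparse.c hunk adds a compile-time guard: BUILD_BUG_ON() breaks the build when its condition is true, and is_power_of_2() from <linux/log2.h> folds to a constant for a constant argument, so a struct mem_section whose size drifts away from a power of two (which the sparse-index arithmetic referenced in mmzone.h relies on) is caught at build time. A tiny sketch of the same idiom applied to an invented structure:

#include <linux/bug.h>
#include <linux/log2.h>

struct example {                        /* hypothetical: 4 longs, a power-of-two size */
        unsigned long a, b, c, d;
};

static inline void example_sanity_check(void)
{
        /* Fails to compile if sizeof(struct example) is not a power of two */
        BUILD_BUG_ON(!is_power_of_2(sizeof(struct example)));
}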
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d6841..4a1d0d2c52fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,10 +34,13 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37#define CREATE_TRACE_POINTS
38#include <trace/events/pagemap.h>
39
37/* How many pages do we try to swap or page in/out together? */ 40/* How many pages do we try to swap or page in/out together? */
38int page_cluster; 41int page_cluster;
39 42
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
43 46
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
384 SetPageActive(page); 387 SetPageActive(page);
385 lru += LRU_ACTIVE; 388 lru += LRU_ACTIVE;
386 add_page_to_lru_list(page, lruvec, lru); 389 add_page_to_lru_list(page, lruvec, lru);
390 trace_mm_lru_activate(page, page_to_pfn(page));
387 391
388 __count_vm_event(PGACTIVATE); 392 __count_vm_event(PGACTIVATE);
389 update_page_reclaim_stat(lruvec, file, 1); 393 update_page_reclaim_stat(lruvec, file, 1);
@@ -428,6 +432,33 @@ void activate_page(struct page *page)
428} 432}
429#endif 433#endif
430 434
435static void __lru_cache_activate_page(struct page *page)
436{
437 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
438 int i;
439
440 /*
441 * Search backwards on the optimistic assumption that the page being
442 * activated has just been added to this pagevec. Note that only
443 * the local pagevec is examined as a !PageLRU page could be in the
444 * process of being released, reclaimed, migrated or on a remote
445 * pagevec that is currently being drained. Furthermore, marking
446 * a remote pagevec's page PageActive potentially hits a race where
447 * a page is marked PageActive just after it is added to the inactive
448 * list causing accounting errors and BUG_ON checks to trigger.
449 */
450 for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
451 struct page *pagevec_page = pvec->pages[i];
452
453 if (pagevec_page == page) {
454 SetPageActive(page);
455 break;
456 }
457 }
458
459 put_cpu_var(lru_add_pvec);
460}
461
431/* 462/*
432 * Mark a page as having seen activity. 463 * Mark a page as having seen activity.
433 * 464 *
@@ -438,8 +469,18 @@ void activate_page(struct page *page)
438void mark_page_accessed(struct page *page) 469void mark_page_accessed(struct page *page)
439{ 470{
440 if (!PageActive(page) && !PageUnevictable(page) && 471 if (!PageActive(page) && !PageUnevictable(page) &&
441 PageReferenced(page) && PageLRU(page)) { 472 PageReferenced(page)) {
442 activate_page(page); 473
474 /*
475 * If the page is on the LRU, queue it for activation via
476 * activate_page_pvecs. Otherwise, assume the page is on a
477 * pagevec, mark it active and it'll be moved to the active
478 * LRU on the next drain.
479 */
480 if (PageLRU(page))
481 activate_page(page);
482 else
483 __lru_cache_activate_page(page);
443 ClearPageReferenced(page); 484 ClearPageReferenced(page);
444 } else if (!PageReferenced(page)) { 485 } else if (!PageReferenced(page)) {
445 SetPageReferenced(page); 486 SetPageReferenced(page);
@@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page)
448EXPORT_SYMBOL(mark_page_accessed); 489EXPORT_SYMBOL(mark_page_accessed);
449 490
450/* 491/*
451 * Order of operations is important: flush the pagevec when it's already 492 * Queue the page for addition to the LRU via pagevec. The decision on whether
452 * full, not when adding the last page, to make sure that last page is 493 * to add the page to the [in]active [file|anon] list is deferred until the
453 * not added to the LRU directly when passed to this function. Because 494 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
454 * mark_page_accessed() (called after this when writing) only activates 495 * to have the page added to the active list using mark_page_accessed().
455 * pages that are on the LRU, linear writes in subpage chunks would see
456 * every PAGEVEC_SIZE page activated, which is unexpected.
457 */ 496 */
458void __lru_cache_add(struct page *page, enum lru_list lru) 497void __lru_cache_add(struct page *page)
459{ 498{
460 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 499 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
461 500
462 page_cache_get(page); 501 page_cache_get(page);
463 if (!pagevec_space(pvec)) 502 if (!pagevec_space(pvec))
464 __pagevec_lru_add(pvec, lru); 503 __pagevec_lru_add(pvec);
465 pagevec_add(pvec, page); 504 pagevec_add(pvec, page);
466 put_cpu_var(lru_add_pvecs); 505 put_cpu_var(lru_add_pvec);
467} 506}
468EXPORT_SYMBOL(__lru_cache_add); 507EXPORT_SYMBOL(__lru_cache_add);
469 508
470/** 509/**
471 * lru_cache_add_lru - add a page to a page list 510 * lru_cache_add - add a page to a page list
472 * @page: the page to be added to the LRU. 511 * @page: the page to be added to the LRU.
473 * @lru: the LRU list to which the page is added.
474 */ 512 */
475void lru_cache_add_lru(struct page *page, enum lru_list lru) 513void lru_cache_add(struct page *page)
476{ 514{
477 if (PageActive(page)) { 515 if (PageActive(page)) {
478 VM_BUG_ON(PageUnevictable(page)); 516 VM_BUG_ON(PageUnevictable(page));
479 ClearPageActive(page);
480 } else if (PageUnevictable(page)) { 517 } else if (PageUnevictable(page)) {
481 VM_BUG_ON(PageActive(page)); 518 VM_BUG_ON(PageActive(page));
482 ClearPageUnevictable(page);
483 } 519 }
484 520
485 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); 521 VM_BUG_ON(PageLRU(page));
486 __lru_cache_add(page, lru); 522 __lru_cache_add(page);
487} 523}
488 524
489/** 525/**
@@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
583 */ 619 */
584void lru_add_drain_cpu(int cpu) 620void lru_add_drain_cpu(int cpu)
585{ 621{
586 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 622 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
587 struct pagevec *pvec;
588 int lru;
589 623
590 for_each_lru(lru) { 624 if (pagevec_count(pvec))
591 pvec = &pvecs[lru - LRU_BASE]; 625 __pagevec_lru_add(pvec);
592 if (pagevec_count(pvec))
593 __pagevec_lru_add(pvec, lru);
594 }
595 626
596 pvec = &per_cpu(lru_rotate_pvecs, cpu); 627 pvec = &per_cpu(lru_rotate_pvecs, cpu);
597 if (pagevec_count(pvec)) { 628 if (pagevec_count(pvec)) {
@@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold)
708 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 739 del_page_from_lru_list(page, lruvec, page_off_lru(page));
709 } 740 }
710 741
742 /* Clear Active bit in case of parallel mark_page_accessed */
743 ClearPageActive(page);
744
711 list_add(&page->lru, &pages_to_free); 745 list_add(&page->lru, &pages_to_free);
712 } 746 }
713 if (zone) 747 if (zone)
@@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
795static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 829static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
796 void *arg) 830 void *arg)
797{ 831{
798 enum lru_list lru = (enum lru_list)arg; 832 int file = page_is_file_cache(page);
799 int file = is_file_lru(lru); 833 int active = PageActive(page);
800 int active = is_active_lru(lru); 834 enum lru_list lru = page_lru(page);
801 835
802 VM_BUG_ON(PageActive(page));
803 VM_BUG_ON(PageUnevictable(page)); 836 VM_BUG_ON(PageUnevictable(page));
804 VM_BUG_ON(PageLRU(page)); 837 VM_BUG_ON(PageLRU(page));
805 838
806 SetPageLRU(page); 839 SetPageLRU(page);
807 if (active)
808 SetPageActive(page);
809 add_page_to_lru_list(page, lruvec, lru); 840 add_page_to_lru_list(page, lruvec, lru);
810 update_page_reclaim_stat(lruvec, file, active); 841 update_page_reclaim_stat(lruvec, file, active);
842 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
811} 843}
812 844
813/* 845/*
814 * Add the passed pages to the LRU, then drop the caller's refcount 846 * Add the passed pages to the LRU, then drop the caller's refcount
815 * on them. Reinitialises the caller's pagevec. 847 * on them. Reinitialises the caller's pagevec.
816 */ 848 */
817void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 849void __pagevec_lru_add(struct pagevec *pvec)
818{ 850{
819 VM_BUG_ON(is_unevictable_lru(lru)); 851 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
820
821 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
822} 852}
823EXPORT_SYMBOL(__pagevec_lru_add); 853EXPORT_SYMBOL(__pagevec_lru_add);
824 854
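Taken together with the mm/rmap.c hunk earlier in this diff, the swap.c changes collapse the per-LRU lru_add_pvecs array into a single per-CPU pagevec and drop the lru argument from the add path: the destination list is derived from the page's own flags via page_lru() when the pagevec drains. A hedged sketch of the caller-side pattern this implies (the function name is invented):

#include <linux/mm.h>
#include <linux/swap.h>

/* Sketch only: queueing a freshly faulted anonymous page under the new API */
static void example_add_new_anon_page(struct page *page)
{
        SetPageActive(page);    /* activeness is now decided up front...       */
        lru_cache_add(page);    /* ...and the list is picked at drain time     */
}

Previously the same caller would have written lru_cache_add_lru(page, LRU_ACTIVE_ANON) and relied on __pagevec_lru_add_fn() to set PageActive on its behalf.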
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746af55b8455..36af6eeaa67e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 213 goto checks;
214 } 214 }
215 if (si->flags & SWP_DISCARDABLE) { 215 if (si->flags & SWP_PAGE_DISCARD) {
216 /* 216 /*
217 * Start range check on racing allocations, in case 217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on 218 * they overlap the cluster we eventually decide on
@@ -322,7 +322,7 @@ checks:
322 322
323 if (si->lowest_alloc) { 323 if (si->lowest_alloc) {
324 /* 324 /*
325 * Only set when SWP_DISCARDABLE, and there's a scan 325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed. 326 * for a free cluster in progress or just completed.
327 */ 327 */
328 if (found_free_cluster) { 328 if (found_free_cluster) {
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2016 return nr_extents; 2016 return nr_extents;
2017} 2017}
2018 2018
2019/*
2020 * Helper to sys_swapon determining if a given swap
2021 * backing device queue supports DISCARD operations.
2022 */
2023static bool swap_discardable(struct swap_info_struct *si)
2024{
2025 struct request_queue *q = bdev_get_queue(si->bdev);
2026
2027 if (!q || !blk_queue_discard(q))
2028 return false;
2029
2030 return true;
2031}
2032
2019SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2033SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2020{ 2034{
2021 struct swap_info_struct *p; 2035 struct swap_info_struct *p;
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2123 p->flags |= SWP_SOLIDSTATE; 2137 p->flags |= SWP_SOLIDSTATE;
2124 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2138 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2125 } 2139 }
2126 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2140
2127 p->flags |= SWP_DISCARDABLE; 2141 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2142 /*
2143 * When discard is enabled for swap with no particular
2144 * policy flagged, we set all swap discard flags here in
2145 * order to sustain backward compatibility with older
2146 * swapon(8) releases.
2147 */
2148 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2149 SWP_PAGE_DISCARD);
2150
2151 /*
2152 * By flagging sys_swapon, a sysadmin can tell us to
2153 * either do single-time area discards only, or to just
2154 * perform discards for released swap page-clusters.
2155 * Now it's time to adjust the p->flags accordingly.
2156 */
2157 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2158 p->flags &= ~SWP_PAGE_DISCARD;
2159 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2160 p->flags &= ~SWP_AREA_DISCARD;
2161
2162 /* issue a swapon-time discard if it's still required */
2163 if (p->flags & SWP_AREA_DISCARD) {
2164 int err = discard_swap(p);
2165 if (unlikely(err))
2166 printk(KERN_ERR
2167 "swapon: discard_swap(%p): %d\n",
2168 p, err);
2169 }
2170 }
2128 } 2171 }
2129 2172
2130 mutex_lock(&swapon_mutex); 2173 mutex_lock(&swapon_mutex);
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2135 enable_swap_info(p, prio, swap_map, frontswap_map); 2178 enable_swap_info(p, prio, swap_map, frontswap_map);
2136 2179
2137 printk(KERN_INFO "Adding %uk swap on %s. " 2180 printk(KERN_INFO "Adding %uk swap on %s. "
2138 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2181 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2139 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2182 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2140 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2183 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2141 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2184 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2142 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2185 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2186 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2187 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2143 (frontswap_map) ? "FS" : ""); 2188 (frontswap_map) ? "FS" : "");
2144 2189
2145 mutex_unlock(&swapon_mutex); 2190 mutex_unlock(&swapon_mutex);
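From user space, the effect of the swapfile.c changes above is that swapon(2) now understands two modifier bits on top of SWAP_FLAG_DISCARD. A hedged sketch of a caller asking for page-cluster discards only; the flag values are assumed to match include/uapi/linux/swap.h from this series (older libc headers may not define them), and the device path is a placeholder:

#include <stdio.h>
#include <sys/swap.h>

#ifndef SWAP_FLAG_DISCARD
#define SWAP_FLAG_DISCARD       0x10000 /* enable discard for swap */
#endif
#ifndef SWAP_FLAG_DISCARD_ONCE
#define SWAP_FLAG_DISCARD_ONCE  0x20000 /* discard the whole area at swapon time */
#endif
#ifndef SWAP_FLAG_DISCARD_PAGES
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard freed page-clusters at run time */
#endif

int main(void)
{
        /* Discard released page-clusters, skip the one-off area discard */
        if (swapon("/dev/example-swap", SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES))
                perror("swapon");
        return 0;
}

With neither modifier set, sys_swapon() keeps the old behaviour by enabling both SWP_AREA_DISCARD and SWP_PAGE_DISCARD, which is what the backward-compatibility comment in the hunk refers to.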
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb05..91a10472a39a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
292 va = rb_entry(n, struct vmap_area, rb_node); 292 va = rb_entry(n, struct vmap_area, rb_node);
293 if (addr < va->va_start) 293 if (addr < va->va_start)
294 n = n->rb_left; 294 n = n->rb_left;
295 else if (addr > va->va_start) 295 else if (addr >= va->va_end)
296 n = n->rb_right; 296 n = n->rb_right;
297 else 297 else
298 return va; 298 return va;
@@ -1322,13 +1322,6 @@ static void clear_vm_unlist(struct vm_struct *vm)
1322 vm->flags &= ~VM_UNLIST; 1322 vm->flags &= ~VM_UNLIST;
1323} 1323}
1324 1324
1325static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1326 unsigned long flags, const void *caller)
1327{
1328 setup_vmalloc_vm(vm, va, flags, caller);
1329 clear_vm_unlist(vm);
1330}
1331
1332static struct vm_struct *__get_vm_area_node(unsigned long size, 1325static struct vm_struct *__get_vm_area_node(unsigned long size,
1333 unsigned long align, unsigned long flags, unsigned long start, 1326 unsigned long align, unsigned long flags, unsigned long start,
1334 unsigned long end, int node, gfp_t gfp_mask, const void *caller) 1327 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
@@ -1337,16 +1330,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1337 struct vm_struct *area; 1330 struct vm_struct *area;
1338 1331
1339 BUG_ON(in_interrupt()); 1332 BUG_ON(in_interrupt());
1340 if (flags & VM_IOREMAP) { 1333 if (flags & VM_IOREMAP)
1341 int bit = fls(size); 1334 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
1342
1343 if (bit > IOREMAP_MAX_ORDER)
1344 bit = IOREMAP_MAX_ORDER;
1345 else if (bit < PAGE_SHIFT)
1346 bit = PAGE_SHIFT;
1347
1348 align = 1ul << bit;
1349 }
1350 1335
1351 size = PAGE_ALIGN(size); 1336 size = PAGE_ALIGN(size);
1352 if (unlikely(!size)) 1337 if (unlikely(!size))
@@ -1367,16 +1352,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1367 return NULL; 1352 return NULL;
1368 } 1353 }
1369 1354
1370 /* 1355 setup_vmalloc_vm(area, va, flags, caller);
1371 * When this function is called from __vmalloc_node_range,
1372 * we add VM_UNLIST flag to avoid accessing uninitialized
1373 * members of vm_struct such as pages and nr_pages fields.
1374 * They will be set later.
1375 */
1376 if (flags & VM_UNLIST)
1377 setup_vmalloc_vm(area, va, flags, caller);
1378 else
1379 insert_vmalloc_vm(area, va, flags, caller);
1380 1356
1381 return area; 1357 return area;
1382} 1358}
@@ -1476,10 +1452,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1476 if (!addr) 1452 if (!addr)
1477 return; 1453 return;
1478 1454
1479 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1455 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1480 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1456 addr))
1481 return; 1457 return;
1482 }
1483 1458
1484 area = remove_vm_area(addr); 1459 area = remove_vm_area(addr);
1485 if (unlikely(!area)) { 1460 if (unlikely(!area)) {
@@ -2148,42 +2123,43 @@ finished:
2148} 2123}
2149 2124
2150/** 2125/**
2151 * remap_vmalloc_range - map vmalloc pages to userspace 2126 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2152 * @vma: vma to cover (map full range of vma) 2127 * @vma: vma to cover
2153 * @addr: vmalloc memory 2128 * @uaddr: target user address to start at
2154 * @pgoff: number of pages into addr before first page to map 2129 * @kaddr: virtual address of vmalloc kernel memory
2130 * @size: size of map area
2155 * 2131 *
2156 * Returns: 0 for success, -Exxx on failure 2132 * Returns: 0 for success, -Exxx on failure
2157 * 2133 *
2158 * This function checks that addr is a valid vmalloc'ed area, and 2134 * This function checks that @kaddr is a valid vmalloc'ed area,
2159 * that it is big enough to cover the vma. Will return failure if 2135 * and that it is big enough to cover the range starting at
2160 * that criteria isn't met. 2136 * @uaddr in @vma. Will return failure if that criteria isn't
2137 * met.
2161 * 2138 *
2162 * Similar to remap_pfn_range() (see mm/memory.c) 2139 * Similar to remap_pfn_range() (see mm/memory.c)
2163 */ 2140 */
2164int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2141int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2165 unsigned long pgoff) 2142 void *kaddr, unsigned long size)
2166{ 2143{
2167 struct vm_struct *area; 2144 struct vm_struct *area;
2168 unsigned long uaddr = vma->vm_start;
2169 unsigned long usize = vma->vm_end - vma->vm_start;
2170 2145
2171 if ((PAGE_SIZE-1) & (unsigned long)addr) 2146 size = PAGE_ALIGN(size);
2147
2148 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2172 return -EINVAL; 2149 return -EINVAL;
2173 2150
2174 area = find_vm_area(addr); 2151 area = find_vm_area(kaddr);
2175 if (!area) 2152 if (!area)
2176 return -EINVAL; 2153 return -EINVAL;
2177 2154
2178 if (!(area->flags & VM_USERMAP)) 2155 if (!(area->flags & VM_USERMAP))
2179 return -EINVAL; 2156 return -EINVAL;
2180 2157
2181 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 2158 if (kaddr + size > area->addr + area->size)
2182 return -EINVAL; 2159 return -EINVAL;
2183 2160
2184 addr += pgoff << PAGE_SHIFT;
2185 do { 2161 do {
2186 struct page *page = vmalloc_to_page(addr); 2162 struct page *page = vmalloc_to_page(kaddr);
2187 int ret; 2163 int ret;
2188 2164
2189 ret = vm_insert_page(vma, uaddr, page); 2165 ret = vm_insert_page(vma, uaddr, page);
@@ -2191,14 +2167,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2191 return ret; 2167 return ret;
2192 2168
2193 uaddr += PAGE_SIZE; 2169 uaddr += PAGE_SIZE;
2194 addr += PAGE_SIZE; 2170 kaddr += PAGE_SIZE;
2195 usize -= PAGE_SIZE; 2171 size -= PAGE_SIZE;
2196 } while (usize > 0); 2172 } while (size > 0);
2197 2173
2198 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 2174 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2199 2175
2200 return 0; 2176 return 0;
2201} 2177}
2178EXPORT_SYMBOL(remap_vmalloc_range_partial);
2179
2180/**
2181 * remap_vmalloc_range - map vmalloc pages to userspace
2182 * @vma: vma to cover (map full range of vma)
2183 * @addr: vmalloc memory
2184 * @pgoff: number of pages into addr before first page to map
2185 *
2186 * Returns: 0 for success, -Exxx on failure
2187 *
2188 * This function checks that addr is a valid vmalloc'ed area, and
2189 * that it is big enough to cover the vma. Will return failure if
2190 * that criteria isn't met.
2191 *
2192 * Similar to remap_pfn_range() (see mm/memory.c)
2193 */
2194int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2195 unsigned long pgoff)
2196{
2197 return remap_vmalloc_range_partial(vma, vma->vm_start,
2198 addr + (pgoff << PAGE_SHIFT),
2199 vma->vm_end - vma->vm_start);
2200}
2202EXPORT_SYMBOL(remap_vmalloc_range); 2201EXPORT_SYMBOL(remap_vmalloc_range);
2203 2202
2204/* 2203/*
@@ -2512,8 +2511,8 @@ found:
2512 2511
2513 /* insert all vm's */ 2512 /* insert all vm's */
2514 for (area = 0; area < nr_vms; area++) 2513 for (area = 0; area < nr_vms; area++)
2515 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2514 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2516 pcpu_get_vm_areas); 2515 pcpu_get_vm_areas);
2517 2516
2518 kfree(vas); 2517 kfree(vas);
2519 return vms; 2518 return vms;
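The vmalloc.c hunk splits the user-mapping logic out into remap_vmalloc_range_partial(), which maps an arbitrary page-aligned window of a vmalloc area rather than assuming the whole vma. A hedged sketch of a driver .mmap handler using it (the device structure and names are invented; the buffer is assumed to come from vmalloc_user() so VM_USERMAP is set on the area):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

struct exdev {                          /* hypothetical device state */
        void *buf;                      /* from vmalloc_user() */
        size_t buf_size;
};

static int exdev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct exdev *dev = file->private_data;
        unsigned long len = vma->vm_end - vma->vm_start;
        unsigned long off = vma->vm_pgoff << PAGE_SHIFT;

        if (off >= dev->buf_size || len > dev->buf_size - off)
                return -EINVAL;

        /* Map just the requested window of the vmalloc area into the vma */
        return remap_vmalloc_range_partial(vma, vma->vm_start,
                                           dev->buf + off, len);
}

remap_vmalloc_range() itself becomes a thin wrapper over the partial variant, as the tail of the hunk shows.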
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..99b3ac7771ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
546void putback_lru_page(struct page *page) 546void putback_lru_page(struct page *page)
547{ 547{
548 int lru; 548 int lru;
549 int active = !!TestClearPageActive(page);
550 int was_unevictable = PageUnevictable(page); 549 int was_unevictable = PageUnevictable(page);
551 550
552 VM_BUG_ON(PageLRU(page)); 551 VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
561 * unevictable page on [in]active list. 560 * unevictable page on [in]active list.
562 * We know how to handle that. 561 * We know how to handle that.
563 */ 562 */
564 lru = active + page_lru_base_type(page); 563 lru = page_lru_base_type(page);
565 lru_cache_add_lru(page, lru); 564 lru_cache_add(page);
566 } else { 565 } else {
567 /* 566 /*
568 * Put unevictable pages directly on zone's unevictable 567 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
669 return PAGEREF_RECLAIM; 668 return PAGEREF_RECLAIM;
670} 669}
671 670
671/* Check if a page is dirty or under writeback */
672static void page_check_dirty_writeback(struct page *page,
673 bool *dirty, bool *writeback)
674{
675 struct address_space *mapping;
676
677 /*
678 * Anonymous pages are not handled by flushers and must be written
679 * from reclaim context. Do not stall reclaim based on them
680 */
681 if (!page_is_file_cache(page)) {
682 *dirty = false;
683 *writeback = false;
684 return;
685 }
686
687 /* By default assume that the page flags are accurate */
688 *dirty = PageDirty(page);
689 *writeback = PageWriteback(page);
690
691 /* Verify dirty/writeback state if the filesystem supports it */
692 if (!page_has_private(page))
693 return;
694
695 mapping = page_mapping(page);
696 if (mapping && mapping->a_ops->is_dirty_writeback)
697 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
698}
699
672/* 700/*
673 * shrink_page_list() returns the number of reclaimed pages 701 * shrink_page_list() returns the number of reclaimed pages
674 */ 702 */
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
677 struct scan_control *sc, 705 struct scan_control *sc,
678 enum ttu_flags ttu_flags, 706 enum ttu_flags ttu_flags,
679 unsigned long *ret_nr_dirty, 707 unsigned long *ret_nr_dirty,
708 unsigned long *ret_nr_unqueued_dirty,
709 unsigned long *ret_nr_congested,
680 unsigned long *ret_nr_writeback, 710 unsigned long *ret_nr_writeback,
711 unsigned long *ret_nr_immediate,
681 bool force_reclaim) 712 bool force_reclaim)
682{ 713{
683 LIST_HEAD(ret_pages); 714 LIST_HEAD(ret_pages);
684 LIST_HEAD(free_pages); 715 LIST_HEAD(free_pages);
685 int pgactivate = 0; 716 int pgactivate = 0;
717 unsigned long nr_unqueued_dirty = 0;
686 unsigned long nr_dirty = 0; 718 unsigned long nr_dirty = 0;
687 unsigned long nr_congested = 0; 719 unsigned long nr_congested = 0;
688 unsigned long nr_reclaimed = 0; 720 unsigned long nr_reclaimed = 0;
689 unsigned long nr_writeback = 0; 721 unsigned long nr_writeback = 0;
722 unsigned long nr_immediate = 0;
690 723
691 cond_resched(); 724 cond_resched();
692 725
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
696 struct page *page; 729 struct page *page;
697 int may_enter_fs; 730 int may_enter_fs;
698 enum page_references references = PAGEREF_RECLAIM_CLEAN; 731 enum page_references references = PAGEREF_RECLAIM_CLEAN;
732 bool dirty, writeback;
699 733
700 cond_resched(); 734 cond_resched();
701 735
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
723 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 757 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
724 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 758 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
725 759
760 /*
761 * The number of dirty pages determines if a zone is marked
762 * reclaim_congested which affects wait_iff_congested. kswapd
763 * will stall and start writing pages if the tail of the LRU
764 * is all dirty unqueued pages.
765 */
766 page_check_dirty_writeback(page, &dirty, &writeback);
767 if (dirty || writeback)
768 nr_dirty++;
769
770 if (dirty && !writeback)
771 nr_unqueued_dirty++;
772
773 /*
774 * Treat this page as congested if the underlying BDI is or if
775 * pages are cycling through the LRU so quickly that the
776 * pages marked for immediate reclaim are making it to the
777 * end of the LRU a second time.
778 */
779 mapping = page_mapping(page);
780 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
781 (writeback && PageReclaim(page)))
782 nr_congested++;
783
784 /*
785 * If a page at the tail of the LRU is under writeback, there
786 * are three cases to consider.
787 *
788 * 1) If reclaim is encountering an excessive number of pages
789 * under writeback and this page is both under writeback and
790 * PageReclaim then it indicates that pages are being queued
791 * for IO but are being recycled through the LRU before the
792 * IO can complete. Waiting on the page itself risks an
793 * indefinite stall if it is impossible to writeback the
794 * page due to IO error or disconnected storage so instead
795 * note that the LRU is being scanned too quickly and the
796 * caller can stall after page list has been processed.
797 *
798 * 2) Global reclaim encounters a page, memcg encounters a
799 * page that is not marked for immediate reclaim or
800 * the caller does not have __GFP_IO. In this case mark
801 * the page for immediate reclaim and continue scanning.
802 *
803 * __GFP_IO is checked because a loop driver thread might
804 * enter reclaim, and deadlock if it waits on a page for
805 * which it is needed to do the write (loop masks off
806 * __GFP_IO|__GFP_FS for this reason); but more thought
807 * would probably show more reasons.
808 *
809 * Don't require __GFP_FS, since we're not going into the
810 * FS, just waiting on its writeback completion. Worryingly,
811 * ext4 gfs2 and xfs allocate pages with
812 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
813 * may_enter_fs here is liable to OOM on them.
814 *
815 * 3) memcg encounters a page that is not already marked
816 * PageReclaim. memcg does not have any dirty pages
817 * throttling so we could easily OOM just because too many
818 * pages are in writeback and there is nothing else to
819 * reclaim. Wait for the writeback to complete.
820 */
726 if (PageWriteback(page)) { 821 if (PageWriteback(page)) {
727 /* 822 /* Case 1 above */
728 * memcg doesn't have any dirty pages throttling so we 823 if (current_is_kswapd() &&
729 * could easily OOM just because too many pages are in 824 PageReclaim(page) &&
730 * writeback and there is nothing else to reclaim. 825 zone_is_reclaim_writeback(zone)) {
731 * 826 nr_immediate++;
732 * Check __GFP_IO, certainly because a loop driver 827 goto keep_locked;
733 * thread might enter reclaim, and deadlock if it waits 828
734 * on a page for which it is needed to do the write 829 /* Case 2 above */
735 * (loop masks off __GFP_IO|__GFP_FS for this reason); 830 } else if (global_reclaim(sc) ||
736 * but more thought would probably show more reasons.
737 *
738 * Don't require __GFP_FS, since we're not going into
739 * the FS, just waiting on its writeback completion.
740 * Worryingly, ext4 gfs2 and xfs allocate pages with
741 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
742 * testing may_enter_fs here is liable to OOM on them.
743 */
744 if (global_reclaim(sc) ||
745 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 831 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
746 /* 832 /*
747 * This is slightly racy - end_page_writeback() 833 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
756 */ 842 */
757 SetPageReclaim(page); 843 SetPageReclaim(page);
758 nr_writeback++; 844 nr_writeback++;
845
759 goto keep_locked; 846 goto keep_locked;
847
848 /* Case 3 above */
849 } else {
850 wait_on_page_writeback(page);
760 } 851 }
761 wait_on_page_writeback(page);
762 } 852 }
763 853
764 if (!force_reclaim) 854 if (!force_reclaim)
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
784 if (!add_to_swap(page, page_list)) 874 if (!add_to_swap(page, page_list))
785 goto activate_locked; 875 goto activate_locked;
786 may_enter_fs = 1; 876 may_enter_fs = 1;
787 }
788 877
789 mapping = page_mapping(page); 878 /* Adding to swap updated mapping */
879 mapping = page_mapping(page);
880 }
790 881
791 /* 882 /*
792 * The page is mapped into the page tables of one or more 883 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 } 897 }
807 898
808 if (PageDirty(page)) { 899 if (PageDirty(page)) {
809 nr_dirty++;
810
811 /* 900 /*
812 * Only kswapd can writeback filesystem pages to 901 * Only kswapd can writeback filesystem pages to
813 * avoid risk of stack overflow but do not writeback 902 * avoid risk of stack overflow but only writeback
814 * unless under significant pressure. 903 * if many dirty pages have been encountered.
815 */ 904 */
816 if (page_is_file_cache(page) && 905 if (page_is_file_cache(page) &&
817 (!current_is_kswapd() || 906 (!current_is_kswapd() ||
818 sc->priority >= DEF_PRIORITY - 2)) { 907 !zone_is_reclaim_dirty(zone))) {
819 /* 908 /*
820 * Immediately reclaim when written back. 909 * Immediately reclaim when written back.
821 * Similar in principle to deactivate_page() 910 * Similar in principle to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
838 /* Page is dirty, try to write it out here */ 927 /* Page is dirty, try to write it out here */
839 switch (pageout(page, mapping, sc)) { 928 switch (pageout(page, mapping, sc)) {
840 case PAGE_KEEP: 929 case PAGE_KEEP:
841 nr_congested++;
842 goto keep_locked; 930 goto keep_locked;
843 case PAGE_ACTIVATE: 931 case PAGE_ACTIVATE:
844 goto activate_locked; 932 goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1034 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
947 } 1035 }
948 1036
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
958 free_hot_cold_page_list(&free_pages, 1); 1037 free_hot_cold_page_list(&free_pages, 1);
959 1038
960 list_splice(&ret_pages, page_list); 1039 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1040 count_vm_events(PGACTIVATE, pgactivate);
962 mem_cgroup_uncharge_end(); 1041 mem_cgroup_uncharge_end();
963 *ret_nr_dirty += nr_dirty; 1042 *ret_nr_dirty += nr_dirty;
1043 *ret_nr_congested += nr_congested;
1044 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
964 *ret_nr_writeback += nr_writeback; 1045 *ret_nr_writeback += nr_writeback;
1046 *ret_nr_immediate += nr_immediate;
965 return nr_reclaimed; 1047 return nr_reclaimed;
966} 1048}
967 1049
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
973 .priority = DEF_PRIORITY, 1055 .priority = DEF_PRIORITY,
974 .may_unmap = 1, 1056 .may_unmap = 1,
975 }; 1057 };
976 unsigned long ret, dummy1, dummy2; 1058 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
977 struct page *page, *next; 1059 struct page *page, *next;
978 LIST_HEAD(clean_pages); 1060 LIST_HEAD(clean_pages);
979 1061
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
985 } 1067 }
986 1068
987 ret = shrink_page_list(&clean_pages, zone, &sc, 1069 ret = shrink_page_list(&clean_pages, zone, &sc,
988 TTU_UNMAP|TTU_IGNORE_ACCESS, 1070 TTU_UNMAP|TTU_IGNORE_ACCESS,
989 &dummy1, &dummy2, true); 1071 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
990 list_splice(&clean_pages, page_list); 1072 list_splice(&clean_pages, page_list);
991 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1073 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
992 return ret; 1074 return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1281 unsigned long nr_reclaimed = 0; 1363 unsigned long nr_reclaimed = 0;
1282 unsigned long nr_taken; 1364 unsigned long nr_taken;
1283 unsigned long nr_dirty = 0; 1365 unsigned long nr_dirty = 0;
1366 unsigned long nr_congested = 0;
1367 unsigned long nr_unqueued_dirty = 0;
1284 unsigned long nr_writeback = 0; 1368 unsigned long nr_writeback = 0;
1369 unsigned long nr_immediate = 0;
1285 isolate_mode_t isolate_mode = 0; 1370 isolate_mode_t isolate_mode = 0;
1286 int file = is_file_lru(lru); 1371 int file = is_file_lru(lru);
1287 struct zone *zone = lruvec_zone(lruvec); 1372 struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1323 return 0; 1408 return 0;
1324 1409
1325 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1410 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1326 &nr_dirty, &nr_writeback, false); 1411 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1412 &nr_writeback, &nr_immediate,
1413 false);
1327 1414
1328 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1329 1416
@@ -1357,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1357 * same way balance_dirty_pages() manages. 1444 * same way balance_dirty_pages() manages.
1358 * 1445 *
1359 * This scales the number of dirty pages that must be under writeback 1446 * This scales the number of dirty pages that must be under writeback
1360 * before throttling depending on priority. It is a simple backoff 1447 * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff
1361 * function that has the most effect in the range DEF_PRIORITY to 1448 * function that has the most effect in the range DEF_PRIORITY to
1362 * DEF_PRIORITY-2 which is the priority reclaim is considered to be 1449 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
1363 * in trouble. 1450 * in trouble.
@@ -1368,9 +1455,53 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1368 * ... 1455 * ...
1369 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1456 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1370 * isolated page is PageWriteback 1457 * isolated page is PageWriteback
1458 *
1459 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1460 * of pages under writeback that are flagged for immediate reclaim and stall if any
1461 * are encountered in the nr_immediate check below.
1371 */ 1462 */
1372 if (nr_writeback && nr_writeback >= 1463 if (nr_writeback && nr_writeback >=
1373 (nr_taken >> (DEF_PRIORITY - sc->priority))) 1464 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1465 zone_set_flag(zone, ZONE_WRITEBACK);
1466
1467 /*
1468 * memcg will stall in page writeback so only consider forcibly
1469 * stalling for global reclaim
1470 */
1471 if (global_reclaim(sc)) {
1472 /*
1473 * Tag a zone as congested if all the dirty pages scanned were
1474 * backed by a congested BDI and wait_iff_congested will stall.
1475 */
1476 if (nr_dirty && nr_dirty == nr_congested)
1477 zone_set_flag(zone, ZONE_CONGESTED);
1478
1479 /*
1480 * If dirty pages are scanned that are not queued for IO, it
1481 * implies that flushers are not keeping up. In this case, flag
1482 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1483 * pages from reclaim context. It will forcibly stall in the
1484 * next check.
1485 */
1486 if (nr_unqueued_dirty == nr_taken)
1487 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1488
1489 /*
1490 * In addition, if kswapd scans pages marked for
1491 * immediate reclaim and under writeback (nr_immediate), it
1492 * implies that pages are cycling through the LRU faster than
1493 * they are written so also forcibly stall.
1494 */
1495 if (nr_unqueued_dirty == nr_taken || nr_immediate)
1496 congestion_wait(BLK_RW_ASYNC, HZ/10);
1497 }
1498
1499 /*
1500 * Stall direct reclaim for IO completions if underlying BDIs or zone
1501 * is congested. Allow kswapd to continue until it starts encountering
1502 * unqueued dirty pages or cycling through the LRU too quickly.
1503 */
1504 if (!sc->hibernation_mode && !current_is_kswapd())
1374 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1505 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1375 1506
1376 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1507 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
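To make the ZONE_WRITEBACK backoff above concrete, here is the threshold from the nr_writeback check worked through for a SWAP_CLUSTER_MAX batch (DEF_PRIORITY is 12; the helper name is invented purely for illustration):

/* Pages of a 32-page isolated batch that must be under writeback to flag the zone */
static unsigned long writeback_threshold(unsigned long nr_taken, int priority)
{
        return nr_taken >> (12 - priority);     /* DEF_PRIORITY - sc->priority */
}

/*
 * writeback_threshold(32, 12) == 32 -> every isolated page must be under writeback
 * writeback_threshold(32, 10) ==  8 -> 25% of the batch
 * writeback_threshold(32,  6) ==  0 -> any page under writeback is enough
 */

which matches the DEF_PRIORITY-6 note in the comment block just above the check.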
@@ -1822,17 +1953,25 @@ out:
1822static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 1953static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1823{ 1954{
1824 unsigned long nr[NR_LRU_LISTS]; 1955 unsigned long nr[NR_LRU_LISTS];
1956 unsigned long targets[NR_LRU_LISTS];
1825 unsigned long nr_to_scan; 1957 unsigned long nr_to_scan;
1826 enum lru_list lru; 1958 enum lru_list lru;
1827 unsigned long nr_reclaimed = 0; 1959 unsigned long nr_reclaimed = 0;
1828 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1960 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1829 struct blk_plug plug; 1961 struct blk_plug plug;
1962 bool scan_adjusted = false;
1830 1963
1831 get_scan_count(lruvec, sc, nr); 1964 get_scan_count(lruvec, sc, nr);
1832 1965
1966 /* Record the original scan target for proportional adjustments later */
1967 memcpy(targets, nr, sizeof(nr));
1968
1833 blk_start_plug(&plug); 1969 blk_start_plug(&plug);
1834 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1970 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1835 nr[LRU_INACTIVE_FILE]) { 1971 nr[LRU_INACTIVE_FILE]) {
1972 unsigned long nr_anon, nr_file, percentage;
1973 unsigned long nr_scanned;
1974
1836 for_each_evictable_lru(lru) { 1975 for_each_evictable_lru(lru) {
1837 if (nr[lru]) { 1976 if (nr[lru]) {
1838 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 1977 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1981,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1842 lruvec, sc); 1981 lruvec, sc);
1843 } 1982 }
1844 } 1983 }
1984
1985 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
1986 continue;
1987
1845 /* 1988 /*
1846 * On large memory systems, scan >> priority can become 1989 * For global direct reclaim, reclaim only the number of pages
1847 * really large. This is fine for the starting priority; 1990 * requested. Less care is taken to scan proportionally as it
1848 * we want to put equal scanning pressure on each zone. 1991 * is more important to minimise direct reclaim stall latency
1849 * However, if the VM has a harder time of freeing pages, 1992 * than it is to properly age the LRU lists.
1850 * with multiple processes reclaiming pages, the total
1851 * freeing target can get unreasonably large.
1852 */ 1993 */
1853 if (nr_reclaimed >= nr_to_reclaim && 1994 if (global_reclaim(sc) && !current_is_kswapd())
1854 sc->priority < DEF_PRIORITY)
1855 break; 1995 break;
1996
1997 /*
1998 * For kswapd and memcg, reclaim at least the number of pages
1999 * requested. Ensure that the anon and file LRUs shrink
2000 * proportionally what was requested by get_scan_count(). We
2001 * stop reclaiming one LRU and reduce the amount scanning
2002 * proportional to the original scan target.
2003 */
2004 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2005 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2006
2007 if (nr_file > nr_anon) {
2008 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2009 targets[LRU_ACTIVE_ANON] + 1;
2010 lru = LRU_BASE;
2011 percentage = nr_anon * 100 / scan_target;
2012 } else {
2013 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2014 targets[LRU_ACTIVE_FILE] + 1;
2015 lru = LRU_FILE;
2016 percentage = nr_file * 100 / scan_target;
2017 }
2018
2019 /* Stop scanning the smaller of the LRU */
2020 nr[lru] = 0;
2021 nr[lru + LRU_ACTIVE] = 0;
2022
2023 /*
2024 * Recalculate the other LRU scan count based on its original
2025 * scan target and the percentage scanning already complete
2026 */
2027 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2028 nr_scanned = targets[lru] - nr[lru];
2029 nr[lru] = targets[lru] * (100 - percentage) / 100;
2030 nr[lru] -= min(nr[lru], nr_scanned);
2031
2032 lru += LRU_ACTIVE;
2033 nr_scanned = targets[lru] - nr[lru];
2034 nr[lru] = targets[lru] * (100 - percentage) / 100;
2035 nr[lru] -= min(nr[lru], nr_scanned);
2036
2037 scan_adjusted = true;
1856 } 2038 }
1857 blk_finish_plug(&plug); 2039 blk_finish_plug(&plug);
1858 sc->nr_reclaimed += nr_reclaimed; 2040 sc->nr_reclaimed += nr_reclaimed;
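The proportional adjustment in shrink_lruvec() is easiest to follow with numbers. A simplified worked example (one anon and one file target, figures invented, active and inactive lists lumped together):

/*
 *   targets from get_scan_count(): anon = 100, file = 400
 *   remaining when nr_to_reclaim is met: anon = 40, file = 340
 *
 *   anon is the smaller LRU, so:
 *     percentage = 40 * 100 / (100 + 1)   = 39   (~39% of the anon target left)
 *     nr[anon]   = 0                             (stop scanning anon)
 *     nr_scanned = 400 - 340              = 60   (file pages already scanned)
 *     nr[file]   = 400 * (100 - 39) / 100 = 244  (file work matching anon's ~61% progress)
 *     nr[file]  -= min(244, 60), i.e.     = 184  (minus what file already did)
 *
 * File scanning continues for 184 more pages, so roughly 61% of both the anon
 * and file targets end up scanned, preserving the ratio get_scan_count()
 * asked for instead of stopping both lists dead once enough was reclaimed.
 */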
@@ -2222,17 +2404,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2222 WB_REASON_TRY_TO_FREE_PAGES); 2404 WB_REASON_TRY_TO_FREE_PAGES);
2223 sc->may_writepage = 1; 2405 sc->may_writepage = 1;
2224 } 2406 }
2225
2226 /* Take a nap, wait for some writeback to complete */
2227 if (!sc->hibernation_mode && sc->nr_scanned &&
2228 sc->priority < DEF_PRIORITY - 2) {
2229 struct zone *preferred_zone;
2230
2231 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2232 &cpuset_current_mems_allowed,
2233 &preferred_zone);
2234 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2235 }
2236 } while (--sc->priority >= 0); 2407 } while (--sc->priority >= 0);
2237 2408
2238out: 2409out:
@@ -2601,6 +2772,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2601} 2772}
2602 2773
2603/* 2774/*
2775 * kswapd shrinks the zone by the number of pages required to reach
2776 * the high watermark.
2777 *
2778 * Returns true if kswapd scanned at least the requested number of pages to
2779 * reclaim or if the lack of progress was due to pages under writeback.
2780 * This is used to determine if the scanning priority needs to be raised.
2781 */
2782static bool kswapd_shrink_zone(struct zone *zone,
2783 int classzone_idx,
2784 struct scan_control *sc,
2785 unsigned long lru_pages,
2786 unsigned long *nr_attempted)
2787{
2788 unsigned long nr_slab;
2789 int testorder = sc->order;
2790 unsigned long balance_gap;
2791 struct reclaim_state *reclaim_state = current->reclaim_state;
2792 struct shrink_control shrink = {
2793 .gfp_mask = sc->gfp_mask,
2794 };
2795 bool lowmem_pressure;
2796
2797 /* Reclaim above the high watermark. */
2798 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2799
2800 /*
2801 * Kswapd reclaims only single pages with compaction enabled. Trying
2802 * too hard to reclaim until contiguous free pages have become
2803 * available can hurt performance by evicting too much useful data
2804 * from memory. Do not reclaim more than needed for compaction.
2805 */
2806 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2807 compaction_suitable(zone, sc->order) !=
2808 COMPACT_SKIPPED)
2809 testorder = 0;
2810
2811 /*
2812 * We put equal pressure on every zone, unless one zone has way too
2813 * many pages free already. The "too many pages" is defined as the
2814 * high wmark plus a "gap" where the gap is either the low
2815 * watermark or 1% of the zone, whichever is smaller.
2816 */
2817 balance_gap = min(low_wmark_pages(zone),
2818 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2819 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2820
2821 /*
2822 * If there is no low memory pressure or the zone is balanced then no
2823 * reclaim is necessary
2824 */
2825 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2826 if (!lowmem_pressure && zone_balanced(zone, testorder,
2827 balance_gap, classzone_idx))
2828 return true;
2829
2830 shrink_zone(zone, sc);
2831
2832 reclaim_state->reclaimed_slab = 0;
2833 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2834 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2835
2836 /* Account for the number of pages attempted to reclaim */
2837 *nr_attempted += sc->nr_to_reclaim;
2838
2839 if (nr_slab == 0 && !zone_reclaimable(zone))
2840 zone->all_unreclaimable = 1;
2841
2842 zone_clear_flag(zone, ZONE_WRITEBACK);
2843
2844 /*
2845 * If a zone reaches its high watermark, consider it to be no longer
2846 * congested. It's possible there are dirty pages backed by congested
2847 * BDIs but as pressure is relieved, speculatively avoid congestion
2848 * waits.
2849 */
2850 if (!zone->all_unreclaimable &&
2851 zone_balanced(zone, testorder, 0, classzone_idx)) {
2852 zone_clear_flag(zone, ZONE_CONGESTED);
2853 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2854 }
2855
2856 return sc->nr_scanned >= sc->nr_to_reclaim;
2857}
2858
2859/*
2604 * For kswapd, balance_pgdat() will work across all this node's zones until 2860 * For kswapd, balance_pgdat() will work across all this node's zones until
2605 * they are all at high_wmark_pages(zone). 2861 * they are all at high_wmark_pages(zone).
2606 * 2862 *
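The balance_gap computed in kswapd_shrink_zone() above is just the smaller of the low watermark and roughly 1% of the zone. A worked example with invented zone sizes (KSWAPD_ZONE_BALANCE_GAP_RATIO is 100):

/*
 *   managed_pages = 262144 (1GB of 4K pages), low_wmark_pages(zone) = 1536
 *   1% of the zone: (262144 + 99) / 100 = 2622 pages
 *   balance_gap   = min(1536, 2622)     = 1536 pages
 *
 * kswapd only skips reclaiming the zone once it is balanced against the high
 * watermark plus this gap, so a little headroom is kept beyond the bare
 * watermark before pressure is taken off.
 */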
@@ -2624,35 +2880,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2624static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2880static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2625 int *classzone_idx) 2881 int *classzone_idx)
2626{ 2882{
2627 bool pgdat_is_balanced = false;
2628 int i; 2883 int i;
2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2884 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2631 unsigned long nr_soft_reclaimed; 2885 unsigned long nr_soft_reclaimed;
2632 unsigned long nr_soft_scanned; 2886 unsigned long nr_soft_scanned;
2633 struct scan_control sc = { 2887 struct scan_control sc = {
2634 .gfp_mask = GFP_KERNEL, 2888 .gfp_mask = GFP_KERNEL,
2889 .priority = DEF_PRIORITY,
2635 .may_unmap = 1, 2890 .may_unmap = 1,
2636 .may_swap = 1, 2891 .may_swap = 1,
2637 /* 2892 .may_writepage = !laptop_mode,
2638 * kswapd doesn't want to be bailed out while reclaim. because
2639 * we want to put equal scanning pressure on each zone.
2640 */
2641 .nr_to_reclaim = ULONG_MAX,
2642 .order = order, 2893 .order = order,
2643 .target_mem_cgroup = NULL, 2894 .target_mem_cgroup = NULL,
2644 }; 2895 };
2645 struct shrink_control shrink = {
2646 .gfp_mask = sc.gfp_mask,
2647 };
2648loop_again:
2649 sc.priority = DEF_PRIORITY;
2650 sc.nr_reclaimed = 0;
2651 sc.may_writepage = !laptop_mode;
2652 count_vm_event(PAGEOUTRUN); 2896 count_vm_event(PAGEOUTRUN);
2653 2897
2654 do { 2898 do {
2655 unsigned long lru_pages = 0; 2899 unsigned long lru_pages = 0;
2900 unsigned long nr_attempted = 0;
2901 bool raise_priority = true;
2902 bool pgdat_needs_compaction = (order > 0);
2903
2904 sc.nr_reclaimed = 0;
2656 2905
2657 /* 2906 /*
2658 * Scan in the highmem->dma direction for the highest 2907 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2938,46 @@ loop_again:
2689 end_zone = i; 2938 end_zone = i;
2690 break; 2939 break;
2691 } else { 2940 } else {
2692 /* If balanced, clear the congested flag */ 2941 /*
2942 * If balanced, clear the dirty and congested
2943 * flags
2944 */
2693 zone_clear_flag(zone, ZONE_CONGESTED); 2945 zone_clear_flag(zone, ZONE_CONGESTED);
2946 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2694 } 2947 }
2695 } 2948 }
2696 2949
2697 if (i < 0) { 2950 if (i < 0)
2698 pgdat_is_balanced = true;
2699 goto out; 2951 goto out;
2700 }
2701 2952
2702 for (i = 0; i <= end_zone; i++) { 2953 for (i = 0; i <= end_zone; i++) {
2703 struct zone *zone = pgdat->node_zones + i; 2954 struct zone *zone = pgdat->node_zones + i;
2704 2955
2956 if (!populated_zone(zone))
2957 continue;
2958
2705 lru_pages += zone_reclaimable_pages(zone); 2959 lru_pages += zone_reclaimable_pages(zone);
2960
2961 /*
2962 * If any zone is currently balanced then kswapd will
2963 * not call compaction as it is expected that the
2964 * necessary pages are already available.
2965 */
2966 if (pgdat_needs_compaction &&
2967 zone_watermark_ok(zone, order,
2968 low_wmark_pages(zone),
2969 *classzone_idx, 0))
2970 pgdat_needs_compaction = false;
2706 } 2971 }
2707 2972
2708 /* 2973 /*
2974 * If we're getting trouble reclaiming, start doing writepage
2975 * even in laptop mode.
2976 */
2977 if (sc.priority < DEF_PRIORITY - 2)
2978 sc.may_writepage = 1;
2979
2980 /*
2709 * Now scan the zone in the dma->highmem direction, stopping 2981 * Now scan the zone in the dma->highmem direction, stopping
2710 * at the last zone which needs scanning. 2982 * at the last zone which needs scanning.
2711 * 2983 *
@@ -2716,8 +2988,6 @@ loop_again:
2716 */ 2988 */
2717 for (i = 0; i <= end_zone; i++) { 2989 for (i = 0; i <= end_zone; i++) {
2718 struct zone *zone = pgdat->node_zones + i; 2990 struct zone *zone = pgdat->node_zones + i;
2719 int nr_slab, testorder;
2720 unsigned long balance_gap;
2721 2991
2722 if (!populated_zone(zone)) 2992 if (!populated_zone(zone))
2723 continue; 2993 continue;
@@ -2738,65 +3008,14 @@ loop_again:
2738 sc.nr_reclaimed += nr_soft_reclaimed; 3008 sc.nr_reclaimed += nr_soft_reclaimed;
2739 3009
2740 /* 3010 /*
2741 * We put equal pressure on every zone, unless 3011 * There should be no need to raise the scanning
2742 * one zone has way too many pages free 3012 * priority if enough pages are already being scanned
2743 * already. The "too many pages" is defined 3013 * that the high watermark would be met at 100%
2744 * as the high wmark plus a "gap" where the 3014 * efficiency.
2745 * gap is either the low watermark or 1%
2746 * of the zone, whichever is smaller.
2747 */ 3015 */
2748 balance_gap = min(low_wmark_pages(zone), 3016 if (kswapd_shrink_zone(zone, end_zone, &sc,
2749 (zone->managed_pages + 3017 lru_pages, &nr_attempted))
2750 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 3018 raise_priority = false;
2751 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2752 /*
2753 * Kswapd reclaims only single pages with compaction
2754 * enabled. Trying too hard to reclaim until contiguous
2755 * free pages have become available can hurt performance
2756 * by evicting too much useful data from memory.
2757 * Do not reclaim more than needed for compaction.
2758 */
2759 testorder = order;
2760 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2761 compaction_suitable(zone, order) !=
2762 COMPACT_SKIPPED)
2763 testorder = 0;
2764
2765 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2766 !zone_balanced(zone, testorder,
2767 balance_gap, end_zone)) {
2768 shrink_zone(zone, &sc);
2769
2770 reclaim_state->reclaimed_slab = 0;
2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2773
2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2775 zone->all_unreclaimable = 1;
2776 }
2777
2778 /*
2779 * If we're getting trouble reclaiming, start doing
2780 * writepage even in laptop mode.
2781 */
2782 if (sc.priority < DEF_PRIORITY - 2)
2783 sc.may_writepage = 1;
2784
2785 if (zone->all_unreclaimable) {
2786 if (end_zone && end_zone == i)
2787 end_zone--;
2788 continue;
2789 }
2790
2791 if (zone_balanced(zone, testorder, 0, end_zone))
2792 /*
2793 * If a zone reaches its high watermark,
2794 * consider it to be no longer congested. It's
2795 * possible there are dirty pages backed by
2796 * congested BDIs but as pressure is relieved,
2797 * speculatively avoid congestion waits
2798 */
2799 zone_clear_flag(zone, ZONE_CONGESTED);
2800 } 3019 }
2801 3020
2802 /* 3021 /*
@@ -2808,74 +3027,38 @@ loop_again:
2808 pfmemalloc_watermark_ok(pgdat)) 3027 pfmemalloc_watermark_ok(pgdat))
2809 wake_up(&pgdat->pfmemalloc_wait); 3028 wake_up(&pgdat->pfmemalloc_wait);
2810 3029
2811 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2812 pgdat_is_balanced = true;
2813 break; /* kswapd: all done */
2814 }
2815
2816 /* 3030 /*
2817 * We do this so kswapd doesn't build up large priorities for 3031 * Fragmentation may mean that the system cannot be rebalanced
2818 * example when it is freeing in parallel with allocators. It 3032 * for high-order allocations in all zones. If twice the
2819 * matches the direct reclaim path behaviour in terms of impact 3033 * allocation size has been reclaimed and the zones are still
2820 * on zone->*_priority. 3034 * not balanced then recheck the watermarks at order-0 to
3035 * prevent kswapd reclaiming excessively. Assume that a
3036 * process requested a high-order can direct reclaim/compact.
2821 */ 3037 */
2822 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 3038 if (order && sc.nr_reclaimed >= 2UL << order)
2823 break; 3039 order = sc.order = 0;
2824 } while (--sc.priority >= 0);
2825
2826out:
2827 if (!pgdat_is_balanced) {
2828 cond_resched();
2829 3040
2830 try_to_freeze(); 3041 /* Check if kswapd should be suspending */
3042 if (try_to_freeze() || kthread_should_stop())
3043 break;
2831 3044
2832 /* 3045 /*
2833 * Fragmentation may mean that the system cannot be 3046 * Compact if necessary and kswapd is reclaiming at least the
2834 * rebalanced for high-order allocations in all zones. 3047 * high watermark number of pages as requested
2835 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
2836 * it means the zones have been fully scanned and are still
2837 * not balanced. For high-order allocations, there is
2838 * little point trying all over again as kswapd may
2839 * infinite loop.
2840 *
2841 * Instead, recheck all watermarks at order-0 as they
2842 * are the most important. If watermarks are ok, kswapd will go
2843 * back to sleep. High-order users can still perform direct
2844 * reclaim if they wish.
2845 */ 3048 */
2846 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 3049 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
2847 order = sc.order = 0;
2848
2849 goto loop_again;
2850 }
2851
2852 /*
2853 * If kswapd was reclaiming at a higher order, it has the option of
2854 * sleeping without all zones being balanced. Before it does, it must
2855 * ensure that the watermarks for order-0 on *all* zones are met and
2856 * that the congestion flags are cleared. The congestion flag must
2857 * be cleared as kswapd is the only mechanism that clears the flag
2858 * and it is potentially going to sleep here.
2859 */
2860 if (order) {
2861 int zones_need_compaction = 1;
2862
2863 for (i = 0; i <= end_zone; i++) {
2864 struct zone *zone = pgdat->node_zones + i;
2865
2866 if (!populated_zone(zone))
2867 continue;
2868
2869 /* Check if the memory needs to be defragmented. */
2870 if (zone_watermark_ok(zone, order,
2871 low_wmark_pages(zone), *classzone_idx, 0))
2872 zones_need_compaction = 0;
2873 }
2874
2875 if (zones_need_compaction)
2876 compact_pgdat(pgdat, order); 3050 compact_pgdat(pgdat, order);
2877 }
2878 3051
3052 /*
3053 * Raise priority if scanning rate is too low or there was no
3054 * progress in reclaiming pages
3055 */
3056 if (raise_priority || !sc.nr_reclaimed)
3057 sc.priority--;
3058 } while (sc.priority >= 1 &&
3059 !pgdat_balanced(pgdat, order, *classzone_idx));
3060
3061out:
2879 /* 3062 /*
2880 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3063 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2881 * makes a decision on the order we were last reclaiming at. However, 3064 * makes a decision on the order we were last reclaiming at. However,