author     Linus Torvalds <torvalds@linux-foundation.org>  2013-07-03 20:12:13 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-07-03 20:12:13 -0400
commit     7f0ef0267e20d62d45d527911a993b1e998f4968 (patch)
tree       de51abc7da5903f59d83e23937f22420164c9477 /mm
parent     862f0012549110d6f2586bf54b52ed4540cbff3a (diff)
parent     9307c29524502c21f0e8a6d96d850b2f5bc0bd9a (diff)
Merge branch 'akpm' (updates from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
- various misc bits
- I've been patchmonkeying ocfs2 for a while, as Joel and Mark have been
  distracted. There has been quite a bit of activity.
- About half the MM queue
- Some backlight bits
- Various lib/ updates
- checkpatch updates
- zillions more little rtc patches
- ptrace
- signals
- exec
- procfs
- rapidio
- nbd
- aoe
- pps
- memstick
- tools/testing/selftests updates
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (445 commits)
tools/testing/selftests: don't assume the x bit is set on scripts
selftests: add .gitignore for kcmp
selftests: fix clean target in kcmp Makefile
selftests: add .gitignore for vm
selftests: add hugetlbfstest
self-test: fix make clean
selftests: exit 1 on failure
kernel/resource.c: remove the unneeded assignment in function __find_resource
aio: fix wrong comment in aio_complete()
drivers/w1/slaves/w1_ds2408.c: add magic sequence to disable P0 test mode
drivers/memstick/host/r592.c: convert to module_pci_driver
drivers/memstick/host/jmb38x_ms: convert to module_pci_driver
pps-gpio: add device-tree binding and support
drivers/pps/clients/pps-gpio.c: convert to module_platform_driver
drivers/pps/clients/pps-gpio.c: convert to devm_* helpers
drivers/parport/share.c: use kzalloc
Documentation/accounting/getdelays.c: avoid strncpy in accounting tool
aoe: update internal version number to v83
aoe: update copyright date
aoe: perform I/O completions in parallel
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  12
-rw-r--r--  mm/backing-dev.c     |   5
-rw-r--r--  mm/bootmem.c         |  39
-rw-r--r--  mm/huge_memory.c     |   2
-rw-r--r--  mm/hugetlb.c         |   4
-rw-r--r--  mm/memcontrol.c      |  97
-rw-r--r--  mm/memory-failure.c  |  22
-rw-r--r--  mm/memory.c          |  13
-rw-r--r--  mm/memory_hotplug.c  |  48
-rw-r--r--  mm/mm_init.c         |  47
-rw-r--r--  mm/mmap.c            |   2
-rw-r--r--  mm/mremap.c          |   2
-rw-r--r--  mm/nobootmem.c       |  35
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/page_alloc.c      | 294
-rw-r--r--  mm/page_io.c         |  50
-rw-r--r--  mm/rmap.c            |   7
-rw-r--r--  mm/sparse.c          |   3
-rw-r--r--  mm/swap.c            | 106
-rw-r--r--  mm/swapfile.c        |  55
-rw-r--r--  mm/vmalloc.c         | 103
-rw-r--r--  mm/vmscan.c          | 585
22 files changed, 1021 insertions, 516 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f5e698e30d4a..7e28ecfa8aa4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,15 @@ config FRONTSWAP
 	  and swap data is stored as normal on the matching swap device.
 
 	  If unsure, say Y to enable frontswap.
+
+config MEM_SOFT_DIRTY
+	bool "Track memory changes"
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	select PROC_PAGE_MONITOR
+	help
+	  This option enables memory changes tracking by introducing a
+	  soft-dirty bit on pte-s. This bit it set when someone writes
+	  into a page just as regular dirty bit, but unlike the latter
+	  it can be cleared by hands.
+
+	  See Documentation/vm/soft-dirty.txt for more details.
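
As background for the MEM_SOFT_DIRTY option added above (this note and sketch are not part of the commit): the soft-dirty bit is consumed from user space through procfs, per Documentation/vm/soft-dirty.txt. A minimal sketch, assuming the documented interface, is to write "4" to /proc/<pid>/clear_refs to clear the bits and then test bit 55 of the page's /proc/<pid>/pagemap entry after the workload has run:

#include <fcntl.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Returns 1 if the page containing addr is soft-dirty, 0 if not, -1 on error. */
static int page_soft_dirty(const void *addr)
{
	uint64_t entry;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = (off_t)((uintptr_t)addr / psize) * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
		close(fd);
		return -1;
	}
	close(fd);
	return (int)((entry >> 55) & 1);	/* bit 55 of a pagemap entry is soft-dirty */
}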
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
 			   unsigned int cap)
 {
-	char tmp[32];
 	int err;
 
 	bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
 	if (err)
 		return err;
 
-	sprintf(tmp, "%.28s%s", name, "-%d");
-	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+			   atomic_long_inc_return(&bdi_seq));
 	if (err) {
 		bdi_destroy(bdi);
 		return err;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
+
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+void __init reset_all_zones_managed_pages(void)
 {
-	register_page_bootmem_info_node(pgdat);
-	reset_node_lowmem_managed_pages(pgdat);
-	return free_all_bootmem_core(pgdat->bdata);
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
 {
 	unsigned long total_pages = 0;
 	bootmem_data_t *bdata;
-	struct pglist_data *pgdat;
 
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	list_for_each_entry(bdata, &bdata_list, list)
 		total_pages += free_all_bootmem_core(bdata);
 
+	totalram_pages += total_pages;
+
 	return total_pages;
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..d8b3b850150c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 	if (ret == 1) {
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd);
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 		spin_unlock(&mm->page_table_lock);
 	}
 out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
 
 	hstate = hstate_vma(vma);
 
-	return 1UL << (hstate->order + PAGE_SHIFT);
+	return 1UL << huge_page_shift(hstate);
 }
 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
 
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
 		 * side-effects, like CommitLimit going negative.
 		 */
 		if (h->order > (MAX_ORDER - 1))
-			totalram_pages += 1 << h->order;
+			adjust_managed_page_count(page, 1 << h->order);
 	}
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..2e851f453814 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,6 +1148,58 @@ skip_node:
 	return NULL;
 }
 
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+{
+	/*
+	 * When a group in the hierarchy below root is destroyed, the
+	 * hierarchy iterator can no longer be trusted since it might
+	 * have pointed to the destroyed group. Invalidate it.
+	 */
+	atomic_inc(&root->dead_count);
+}
+
+static struct mem_cgroup *
+mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+		     struct mem_cgroup *root,
+		     int *sequence)
+{
+	struct mem_cgroup *position = NULL;
+	/*
+	 * A cgroup destruction happens in two stages: offlining and
+	 * release. They are separated by a RCU grace period.
+	 *
+	 * If the iterator is valid, we may still race with an
+	 * offlining. The RCU lock ensures the object won't be
+	 * released, tryget will fail if we lost the race.
+	 */
+	*sequence = atomic_read(&root->dead_count);
+	if (iter->last_dead_count == *sequence) {
+		smp_rmb();
+		position = iter->last_visited;
+		if (position && !css_tryget(&position->css))
+			position = NULL;
+	}
+	return position;
+}
+
+static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+				   struct mem_cgroup *last_visited,
+				   struct mem_cgroup *new_position,
+				   int sequence)
+{
+	if (last_visited)
+		css_put(&last_visited->css);
+	/*
+	 * We store the sequence count from the time @last_visited was
+	 * loaded successfully instead of rereading it here so that we
+	 * don't lose destruction events in between. We could have
+	 * raced with the destruction of @new_position after all.
+	 */
+	iter->last_visited = new_position;
+	smp_wmb();
+	iter->last_dead_count = sequence;
+}
+
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1171,7 +1223,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
-	unsigned long uninitialized_var(dead_count);
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1191,6 +1242,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	rcu_read_lock();
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+		int uninitialized_var(seq);
 
 		if (reclaim) {
 			int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1256,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				goto out_unlock;
 			}
 
-			/*
-			 * If the dead_count mismatches, a destruction
-			 * has happened or is happening concurrently.
-			 * If the dead_count matches, a destruction
-			 * might still happen concurrently, but since
-			 * we checked under RCU, that destruction
-			 * won't free the object until we release the
-			 * RCU reader lock. Thus, the dead_count
-			 * check verifies the pointer is still valid,
-			 * css_tryget() verifies the cgroup pointed to
-			 * is alive.
-			 */
-			dead_count = atomic_read(&root->dead_count);
-			if (dead_count == iter->last_dead_count) {
-				smp_rmb();
-				last_visited = iter->last_visited;
-				if (last_visited &&
-				    !css_tryget(&last_visited->css))
-					last_visited = NULL;
-			}
+			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
 		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
-			if (last_visited)
-				css_put(&last_visited->css);
-
-			iter->last_visited = memcg;
-			smp_wmb();
-			iter->last_dead_count = dead_count;
+			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
 
 			if (!memcg)
 				iter->generation++;
@@ -1448,11 +1476,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 	return ret;
 }
 
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+			const struct mem_cgroup *memcg)
 {
-	int ret;
 	struct mem_cgroup *curr = NULL;
 	struct task_struct *p;
+	bool ret;
 
 	p = find_lock_task_mm(task);
 	if (p) {
@@ -1464,14 +1493,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 		 * killer still needs to detect if they have already been oom
 		 * killed to prevent needlessly killing additional tasks.
 		 */
-		task_lock(task);
+		rcu_read_lock();
 		curr = mem_cgroup_from_task(task);
 		if (curr)
 			css_get(&curr->css);
-		task_unlock(task);
+		rcu_read_unlock();
 	}
 	if (!curr)
-		return 0;
+		return false;
 	/*
 	 * We should check use_hierarchy of "memcg" not "curr". Because checking
 	 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -6317,14 +6346,14 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 	struct mem_cgroup *parent = memcg;
 
 	while ((parent = parent_mem_cgroup(parent)))
-		atomic_inc(&parent->dead_count);
+		mem_cgroup_iter_invalidate(parent);
 
 	/*
	 * if the root memcg is not hierarchical we have to check it
	 * explicitely.
	 */
 	if (!root_mem_cgroup->use_hierarchy)
-		atomic_inc(&root_mem_cgroup->dead_count);
+		mem_cgroup_iter_invalidate(root_mem_cgroup);
 }
 
 static void mem_cgroup_css_offline(struct cgroup *cont)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 
 	/*
 	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free.
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
 	 */
 	set_migratetype_isolate(p, true);
 	/*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		/* Not a free page */
 		ret = 1;
 	}
-	unset_migratetype_isolate(p, MIGRATE_MOVABLE);
 	unlock_memory_hotplug();
 	return ret;
 }
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		atomic_long_add(1 << compound_trans_order(hpage),
 				&num_poisoned_pages);
 	}
-	/* keep elevated page count for bad page */
 	return ret;
 }
 
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	}
-	/* keep elevated page count for bad page */
+	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 	return ret;
 }
 
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
 			if (ret > 0)
 				ret = -EIO;
 		} else {
+			/*
+			 * After page migration succeeds, the source page can
+			 * be trapped in pagevec and actual freeing is delayed.
+			 * Freeing code works differently based on PG_hwpoison,
+			 * so there's a race. We need to make sure that the
+			 * source page should be freed back to buddy before
+			 * setting PG_hwpoison.
+			 */
+			if (!is_free_buddy_page(page))
+				lru_add_drain_all();
+			if (!is_free_buddy_page(page))
+				drain_all_pages();
 			SetPageHWPoison(page);
+			if (!is_free_buddy_page(page))
+				pr_info("soft offline: %#lx: page leaked\n",
+					pfn);
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	} else {
diff --git a/mm/memory.c b/mm/memory.c
index 95d0cce63583..b68812d682b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
 EXPORT_SYMBOL(mem_map);
 #endif
 
-unsigned long num_physpages;
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
  * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
  */
 void * high_memory;
 
-EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 
 /*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	unsigned long range_start = addr;
 
 again:
 	init_rss_vec(rss);
@@ -1206,12 +1205,14 @@ again:
 		force_flush = 0;
 
 #ifdef HAVE_GENERIC_MMU_GATHER
-		tlb->start = addr;
-		tlb->end = end;
+		tlb->start = range_start;
+		tlb->end = addr;
 #endif
 		tlb_flush_mmu(tlb);
-		if (addr != end)
+		if (addr != end) {
+			range_start = addr;
 			goto again;
+		}
 	}
 
 	return addr;
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
 			details->first_index, details->last_index) {
 
 		vba = vma->vm_pgoff;
-		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+		vea = vba + vma_pages(vma) - 1;
 		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
 		zba = details->first_index;
 		if (zba < vba)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 081b4d654ed6..f5ba127b2051 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
 	res->end = start + size - 1;
 	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (request_resource(&iomem_resource, res) < 0) {
-		printk("System RAM resource %pR cannot be added\n", res);
+		pr_debug("System RAM resource %pR cannot be added\n", res);
 		kfree(res);
 		res = NULL;
 	}
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page,
 	atomic_inc(&page->_count);
 }
 
-/* reference to __meminit __free_pages_bootmem is valid
- * so use __ref to tell modpost not to generate a warning */
-void __ref put_page_bootmem(struct page *page)
+void put_page_bootmem(struct page *page)
 {
 	unsigned long type;
-	static DEFINE_MUTEX(ppb_lock);
 
 	type = (unsigned long) page->lru.next;
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		INIT_LIST_HEAD(&page->lru);
-
-		/*
-		 * Please refer to comment for __free_pages_bootmem()
-		 * for why we serialize here.
-		 */
-		mutex_lock(&ppb_lock);
-		__free_pages_bootmem(page, 0);
-		mutex_unlock(&ppb_lock);
-		totalram_pages++;
+		free_reserved_page(page);
 	}
-
 }
 
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 	/* can't move pfns which are higher than @z2 */
 	if (end_pfn > zone_end_pfn(z2))
 		goto out_fail;
-	/* the move out part mast at the left most of @z2 */
+	/* the move out part must be at the left most of @z2 */
 	if (start_pfn > z2->zone_start_pfn)
 		goto out_fail;
 	/* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
 void __online_page_set_limits(struct page *page)
 {
-	unsigned long pfn = page_to_pfn(page);
-
-	if (pfn >= num_physpages)
-		num_physpages = pfn + 1;
 }
 EXPORT_SYMBOL_GPL(__online_page_set_limits);
 
 void __online_page_increment_counters(struct page *page)
 {
-	totalram_pages++;
-
-#ifdef CONFIG_HIGHMEM
-	if (PageHighMem(page))
-		totalhigh_pages++;
-#endif
+	adjust_managed_page_count(page, 1);
 }
 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
-	ClearPageReserved(page);
-	init_page_count(page);
-	__free_page(page);
+	__free_reserved_page(page);
 }
 EXPORT_SYMBOL_GPL(__online_page_free);
 
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
+	unsigned long flags;
 	unsigned long onlined_pages = 0;
 	struct zone *zone;
 	int need_zonelists_rebuild = 0;
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 		return ret;
 	}
 
-	zone->managed_pages += onlined_pages;
 	zone->present_pages += onlined_pages;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
 	zone->zone_pgdat->node_present_pages += onlined_pages;
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
 	if (onlined_pages) {
 		node_states_set_node(zone_to_nid(zone), &arg);
 		if (need_zonelists_rebuild)
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	unsigned long pfn, nr_pages, expire;
 	long offlined_pages;
 	int ret, drain, retry_max, node;
+	unsigned long flags;
 	struct zone *zone;
 	struct memory_notify arg;
 
@@ -1578,10 +1560,12 @@ repeat:
 	/* reset pagetype flags and makes migrate type to be MOVABLE */
 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 	/* removal success */
-	zone->managed_pages -= offlined_pages;
+	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
 	zone->present_pages -= offlined_pages;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
 	zone->zone_pgdat->node_present_pages -= offlined_pages;
-	totalram_pages -= offlined_pages;
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 
 	init_per_zone_wmark_min();
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
 #include <linux/init.h>
 #include <linux/kobject.h>
 #include <linux/export.h>
+#include <linux/memory.h>
+#include <linux/notifier.h>
 #include "internal.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
 struct kobject *mm_kobj;
 EXPORT_SYMBOL_GPL(mm_kobj);
 
+#ifdef CONFIG_SMP
+s32 vm_committed_as_batch = 32;
+
+static void __meminit mm_compute_batch(void)
+{
+	u64 memsized_batch;
+	s32 nr = num_present_cpus();
+	s32 batch = max_t(s32, nr*2, 32);
+
+	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
+	memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+
+	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
+}
+
+static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
+					unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		mm_compute_batch();
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block compute_batch_nb __meminitdata = {
+	.notifier_call = mm_compute_batch_notifier,
+	.priority = IPC_CALLBACK_PRI, /* use lowest priority */
+};
+
+static int __init mm_compute_batch_init(void)
+{
+	mm_compute_batch();
+	register_hotmemory_notifier(&compute_batch_nb);
+
+	return 0;
+}
+
+__initcall(mm_compute_batch_init);
+
+#endif
+
 static int __init mm_sysfs_init(void)
 {
 	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
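
For a sense of scale of the batch formula added above (illustrative numbers only, not from the commit): on a machine with 16 GiB of RAM (4,194,304 pages of 4 KiB) and 8 present CPUs, memsized_batch = (4194304 / 8) / 256 = 2048, which exceeds the floor of max(2 * 8, 32) = 32, so vm_committed_as_batch becomes 2048. Very small machines stay at the 32-entry floor instead.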
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 	if (is_mergeable_vma(vma, file, vm_flags) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
-		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+		vm_pglen = vma_pages(vma);
 		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
 			return 1;
 	}
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..3708655378e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			continue;
 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
-		set_pte_at(mm, new_addr, new_pte, pte);
+		set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
 	}
 
 	arch_leave_lazy_mmu_mode();
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..61107cf55bb3 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
+}
+
+void __init reset_all_zones_managed_pages(void)
+{
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
  */
 unsigned long __init free_all_bootmem(void)
 {
-	struct pglist_data *pgdat;
+	unsigned long pages;
 
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	/*
	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
	 * because in some case like Node0 doesn't have RAM installed
	 * low ram will be on Node1
	 */
-	return free_low_memory_core_early();
+	pages = free_low_memory_core_early();
+	totalram_pages += pages;
+
+	return pages;
 }
 
 /**
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..e44e6e0a125c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
 void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
-unsigned long num_physpages;
 unsigned long highest_memmap_pfn;
 struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
 EXPORT_SYMBOL_GPL(vm_memory_committed);
 
 EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(num_physpages);
 
 /* list of mapped, potentially shareable regions */
 static struct kmem_cache *vm_region_jar;
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 long vread(char *buf, char *addr, unsigned long count)
 {
+	/* Don't allow overflow */
+	if ((unsigned long) buf + count < count)
+		count = -(unsigned long) buf;
+
 	memcpy(buf, addr, count);
 	return count;
 }
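
The overflow check added to vread() above guards against buf + count wrapping past the top of the address space. As an illustrative 32-bit example (values chosen here, not from the commit): with buf = 0xfffff000 and count = 0x2000, buf + count wraps to 0xfff, which is smaller than count, so count is clamped to -(unsigned long)buf = 0x1000, exactly the number of bytes that fit below the wrap point.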
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb624fccf..327516b7aee9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -61,10 +61,14 @@ | |||
61 | #include <linux/hugetlb.h> | 61 | #include <linux/hugetlb.h> |
62 | #include <linux/sched/rt.h> | 62 | #include <linux/sched/rt.h> |
63 | 63 | ||
64 | #include <asm/sections.h> | ||
64 | #include <asm/tlbflush.h> | 65 | #include <asm/tlbflush.h> |
65 | #include <asm/div64.h> | 66 | #include <asm/div64.h> |
66 | #include "internal.h" | 67 | #include "internal.h" |
67 | 68 | ||
69 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ | ||
70 | static DEFINE_MUTEX(pcp_batch_high_lock); | ||
71 | |||
68 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | 72 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
69 | DEFINE_PER_CPU(int, numa_node); | 73 | DEFINE_PER_CPU(int, numa_node); |
70 | EXPORT_PER_CPU_SYMBOL(numa_node); | 74 | EXPORT_PER_CPU_SYMBOL(numa_node); |
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
100 | }; | 104 | }; |
101 | EXPORT_SYMBOL(node_states); | 105 | EXPORT_SYMBOL(node_states); |
102 | 106 | ||
107 | /* Protect totalram_pages and zone->managed_pages */ | ||
108 | static DEFINE_SPINLOCK(managed_page_count_lock); | ||
109 | |||
103 | unsigned long totalram_pages __read_mostly; | 110 | unsigned long totalram_pages __read_mostly; |
104 | unsigned long totalreserve_pages __read_mostly; | 111 | unsigned long totalreserve_pages __read_mostly; |
105 | /* | 112 | /* |
@@ -739,14 +746,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
739 | local_irq_restore(flags); | 746 | local_irq_restore(flags); |
740 | } | 747 | } |
741 | 748 | ||
742 | /* | 749 | void __init __free_pages_bootmem(struct page *page, unsigned int order) |
743 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
744 | * but we still need to serialize writers. Currently all callers of | ||
745 | * __free_pages_bootmem() except put_page_bootmem() should only be used | ||
746 | * at boot time. So for shorter boot time, we shift the burden to | ||
747 | * put_page_bootmem() to serialize writers. | ||
748 | */ | ||
749 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | ||
750 | { | 750 | { |
751 | unsigned int nr_pages = 1 << order; | 751 | unsigned int nr_pages = 1 << order; |
752 | unsigned int loop; | 752 | unsigned int loop; |
@@ -781,11 +781,7 @@ void __init init_cma_reserved_pageblock(struct page *page) | |||
781 | set_page_refcounted(page); | 781 | set_page_refcounted(page); |
782 | set_pageblock_migratetype(page, MIGRATE_CMA); | 782 | set_pageblock_migratetype(page, MIGRATE_CMA); |
783 | __free_pages(page, pageblock_order); | 783 | __free_pages(page, pageblock_order); |
784 | totalram_pages += pageblock_nr_pages; | 784 | adjust_managed_page_count(page, pageblock_nr_pages); |
785 | #ifdef CONFIG_HIGHMEM | ||
786 | if (PageHighMem(page)) | ||
787 | totalhigh_pages += pageblock_nr_pages; | ||
788 | #endif | ||
789 | } | 785 | } |
790 | #endif | 786 | #endif |
791 | 787 | ||
@@ -1179,10 +1175,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1179 | { | 1175 | { |
1180 | unsigned long flags; | 1176 | unsigned long flags; |
1181 | int to_drain; | 1177 | int to_drain; |
1178 | unsigned long batch; | ||
1182 | 1179 | ||
1183 | local_irq_save(flags); | 1180 | local_irq_save(flags); |
1184 | if (pcp->count >= pcp->batch) | 1181 | batch = ACCESS_ONCE(pcp->batch); |
1185 | to_drain = pcp->batch; | 1182 | if (pcp->count >= batch) |
1183 | to_drain = batch; | ||
1186 | else | 1184 | else |
1187 | to_drain = pcp->count; | 1185 | to_drain = pcp->count; |
1188 | if (to_drain > 0) { | 1186 | if (to_drain > 0) { |
@@ -1350,8 +1348,9 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1350 | list_add(&page->lru, &pcp->lists[migratetype]); | 1348 | list_add(&page->lru, &pcp->lists[migratetype]); |
1351 | pcp->count++; | 1349 | pcp->count++; |
1352 | if (pcp->count >= pcp->high) { | 1350 | if (pcp->count >= pcp->high) { |
1353 | free_pcppages_bulk(zone, pcp->batch, pcp); | 1351 | unsigned long batch = ACCESS_ONCE(pcp->batch); |
1354 | pcp->count -= pcp->batch; | 1352 | free_pcppages_bulk(zone, batch, pcp); |
1353 | pcp->count -= batch; | ||
1355 | } | 1354 | } |
1356 | 1355 | ||
1357 | out: | 1356 | out: |
@@ -2839,7 +2838,7 @@ EXPORT_SYMBOL(free_pages_exact); | |||
2839 | * nr_free_zone_pages() counts the number of counts pages which are beyond the | 2838 | * nr_free_zone_pages() counts the number of counts pages which are beyond the |
2840 | * high watermark within all zones at or below a given zone index. For each | 2839 | * high watermark within all zones at or below a given zone index. For each |
2841 | * zone, the number of pages is calculated as: | 2840 | * zone, the number of pages is calculated as: |
2842 | * present_pages - high_pages | 2841 | * managed_pages - high_pages |
2843 | */ | 2842 | */ |
2844 | static unsigned long nr_free_zone_pages(int offset) | 2843 | static unsigned long nr_free_zone_pages(int offset) |
2845 | { | 2844 | { |
@@ -2906,9 +2905,13 @@ EXPORT_SYMBOL(si_meminfo); | |||
2906 | #ifdef CONFIG_NUMA | 2905 | #ifdef CONFIG_NUMA |
2907 | void si_meminfo_node(struct sysinfo *val, int nid) | 2906 | void si_meminfo_node(struct sysinfo *val, int nid) |
2908 | { | 2907 | { |
2908 | int zone_type; /* needs to be signed */ | ||
2909 | unsigned long managed_pages = 0; | ||
2909 | pg_data_t *pgdat = NODE_DATA(nid); | 2910 | pg_data_t *pgdat = NODE_DATA(nid); |
2910 | 2911 | ||
2911 | val->totalram = pgdat->node_present_pages; | 2912 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
2913 | managed_pages += pgdat->node_zones[zone_type].managed_pages; | ||
2914 | val->totalram = managed_pages; | ||
2912 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2915 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
2913 | #ifdef CONFIG_HIGHMEM | 2916 | #ifdef CONFIG_HIGHMEM |
2914 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; | 2917 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; |
@@ -3250,18 +3253,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
3250 | static DEFINE_MUTEX(zl_order_mutex); | 3253 | static DEFINE_MUTEX(zl_order_mutex); |
3251 | 3254 | ||
3252 | mutex_lock(&zl_order_mutex); | 3255 | mutex_lock(&zl_order_mutex); |
3253 | if (write) | 3256 | if (write) { |
3254 | strcpy(saved_string, (char*)table->data); | 3257 | if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { |
3258 | ret = -EINVAL; | ||
3259 | goto out; | ||
3260 | } | ||
3261 | strcpy(saved_string, (char *)table->data); | ||
3262 | } | ||
3255 | ret = proc_dostring(table, write, buffer, length, ppos); | 3263 | ret = proc_dostring(table, write, buffer, length, ppos); |
3256 | if (ret) | 3264 | if (ret) |
3257 | goto out; | 3265 | goto out; |
3258 | if (write) { | 3266 | if (write) { |
3259 | int oldval = user_zonelist_order; | 3267 | int oldval = user_zonelist_order; |
3260 | if (__parse_numa_zonelist_order((char*)table->data)) { | 3268 | |
3269 | ret = __parse_numa_zonelist_order((char *)table->data); | ||
3270 | if (ret) { | ||
3261 | /* | 3271 | /* |
3262 | * bogus value. restore saved string | 3272 | * bogus value. restore saved string |
3263 | */ | 3273 | */ |
3264 | strncpy((char*)table->data, saved_string, | 3274 | strncpy((char *)table->data, saved_string, |
3265 | NUMA_ZONELIST_ORDER_LEN); | 3275 | NUMA_ZONELIST_ORDER_LEN); |
3266 | user_zonelist_order = oldval; | 3276 | user_zonelist_order = oldval; |
3267 | } else if (oldval != user_zonelist_order) { | 3277 | } else if (oldval != user_zonelist_order) { |
@@ -3425,8 +3435,8 @@ static int default_zonelist_order(void) | |||
3425 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 3435 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
3426 | if (populated_zone(z)) { | 3436 | if (populated_zone(z)) { |
3427 | if (zone_type < ZONE_NORMAL) | 3437 | if (zone_type < ZONE_NORMAL) |
3428 | low_kmem_size += z->present_pages; | 3438 | low_kmem_size += z->managed_pages; |
3429 | total_size += z->present_pages; | 3439 | total_size += z->managed_pages; |
3430 | } else if (zone_type == ZONE_NORMAL) { | 3440 | } else if (zone_type == ZONE_NORMAL) { |
3431 | /* | 3441 | /* |
3432 | * If any node has only lowmem, then node order | 3442 | * If any node has only lowmem, then node order |
@@ -3705,12 +3715,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
3705 | mminit_verify_zonelist(); | 3715 | mminit_verify_zonelist(); |
3706 | cpuset_init_current_mems_allowed(); | 3716 | cpuset_init_current_mems_allowed(); |
3707 | } else { | 3717 | } else { |
3708 | /* we have to stop all cpus to guarantee there is no user | ||
3709 | of zonelist */ | ||
3710 | #ifdef CONFIG_MEMORY_HOTPLUG | 3718 | #ifdef CONFIG_MEMORY_HOTPLUG |
3711 | if (zone) | 3719 | if (zone) |
3712 | setup_zone_pageset(zone); | 3720 | setup_zone_pageset(zone); |
3713 | #endif | 3721 | #endif |
3722 | /* we have to stop all cpus to guarantee there is no user | ||
3723 | of zonelist */ | ||
3714 | stop_machine(__build_all_zonelists, pgdat, NULL); | 3724 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3715 | /* cpuset refresh routine should be here */ | 3725 | /* cpuset refresh routine should be here */ |
3716 | } | 3726 | } |
@@ -4032,7 +4042,40 @@ static int __meminit zone_batchsize(struct zone *zone) | |||
4032 | #endif | 4042 | #endif |
4033 | } | 4043 | } |
4034 | 4044 | ||
4035 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 4045 | /* |
4046 | * pcp->high and pcp->batch values are related and dependent on one another: | ||
4047 | * ->batch must never be higher then ->high. | ||
4048 | * The following function updates them in a safe manner without read side | ||
4049 | * locking. | ||
4050 | * | ||
4051 | * Any new users of pcp->batch and pcp->high should ensure they can cope with | ||
4052 | * those fields changing asynchronously (acording the the above rule). | ||
4053 | * | ||
4054 | * mutex_is_locked(&pcp_batch_high_lock) required when calling this function | ||
4055 | * outside of boot time (or some other assurance that no concurrent updaters | ||
4056 | * exist). | ||
4057 | */ | ||
4058 | static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, | ||
4059 | unsigned long batch) | ||
4060 | { | ||
4061 | /* start with a fail safe value for batch */ | ||
4062 | pcp->batch = 1; | ||
4063 | smp_wmb(); | ||
4064 | |||
4065 | /* Update high, then batch, in order */ | ||
4066 | pcp->high = high; | ||
4067 | smp_wmb(); | ||
4068 | |||
4069 | pcp->batch = batch; | ||
4070 | } | ||
4071 | |||
4072 | /* a companion to pageset_set_high() */ | ||
4073 | static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) | ||
4074 | { | ||
4075 | pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); | ||
4076 | } | ||
4077 | |||
4078 | static void pageset_init(struct per_cpu_pageset *p) | ||
4036 | { | 4079 | { |
4037 | struct per_cpu_pages *pcp; | 4080 | struct per_cpu_pages *pcp; |
4038 | int migratetype; | 4081 | int migratetype; |
@@ -4041,45 +4084,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
4041 | 4084 | ||
4042 | pcp = &p->pcp; | 4085 | pcp = &p->pcp; |
4043 | pcp->count = 0; | 4086 | pcp->count = 0; |
4044 | pcp->high = 6 * batch; | ||
4045 | pcp->batch = max(1UL, 1 * batch); | ||
4046 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) | 4087 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
4047 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | 4088 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
4048 | } | 4089 | } |
4049 | 4090 | ||
4091 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | ||
4092 | { | ||
4093 | pageset_init(p); | ||
4094 | pageset_set_batch(p, batch); | ||
4095 | } | ||
4096 | |||
4050 | /* | 4097 | /* |
4051 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | 4098 | * pageset_set_high() sets the high water mark for hot per_cpu_pagelist |
4052 | * to the value high for the pageset p. | 4099 | * to the value high for the pageset p. |
4053 | */ | 4100 | */ |
4054 | 4101 | static void pageset_set_high(struct per_cpu_pageset *p, | |
4055 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
4056 | unsigned long high) | 4102 | unsigned long high) |
4057 | { | 4103 | { |
4058 | struct per_cpu_pages *pcp; | 4104 | unsigned long batch = max(1UL, high / 4); |
4105 | if ((high / 4) > (PAGE_SHIFT * 8)) | ||
4106 | batch = PAGE_SHIFT * 8; | ||
4059 | 4107 | ||
4060 | pcp = &p->pcp; | 4108 | pageset_update(&p->pcp, high, batch); |
4061 | pcp->high = high; | ||
4062 | pcp->batch = max(1UL, high/4); | ||
4063 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
4064 | pcp->batch = PAGE_SHIFT * 8; | ||
4065 | } | 4109 | } |
4066 | 4110 | ||
4067 | static void __meminit setup_zone_pageset(struct zone *zone) | 4111 | static void __meminit pageset_set_high_and_batch(struct zone *zone, |
4112 | struct per_cpu_pageset *pcp) | ||
4068 | { | 4113 | { |
4069 | int cpu; | 4114 | if (percpu_pagelist_fraction) |
4070 | 4115 | pageset_set_high(pcp, | |
4071 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 4116 | (zone->managed_pages / |
4117 | percpu_pagelist_fraction)); | ||
4118 | else | ||
4119 | pageset_set_batch(pcp, zone_batchsize(zone)); | ||
4120 | } | ||
4072 | 4121 | ||
4073 | for_each_possible_cpu(cpu) { | 4122 | static void __meminit zone_pageset_init(struct zone *zone, int cpu) |
4074 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | 4123 | { |
4124 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
4075 | 4125 | ||
4076 | setup_pageset(pcp, zone_batchsize(zone)); | 4126 | pageset_init(pcp); |
4127 | pageset_set_high_and_batch(zone, pcp); | ||
4128 | } | ||
4077 | 4129 | ||
4078 | if (percpu_pagelist_fraction) | 4130 | static void __meminit setup_zone_pageset(struct zone *zone) |
4079 | setup_pagelist_highmark(pcp, | 4131 | { |
4080 | (zone->managed_pages / | 4132 | int cpu; |
4081 | percpu_pagelist_fraction)); | 4133 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
4082 | } | 4134 | for_each_possible_cpu(cpu) |
4135 | zone_pageset_init(zone, cpu); | ||
4083 | } | 4136 | } |
4084 | 4137 | ||
4085 | /* | 4138 | /* |
@@ -5150,35 +5203,101 @@ early_param("movablecore", cmdline_parse_movablecore); | |||
5150 | 5203 | ||
5151 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5204 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
5152 | 5205 | ||
5153 | unsigned long free_reserved_area(unsigned long start, unsigned long end, | 5206 | void adjust_managed_page_count(struct page *page, long count) |
5154 | int poison, char *s) | 5207 | { |
5208 | spin_lock(&managed_page_count_lock); | ||
5209 | page_zone(page)->managed_pages += count; | ||
5210 | totalram_pages += count; | ||
5211 | #ifdef CONFIG_HIGHMEM | ||
5212 | if (PageHighMem(page)) | ||
5213 | totalhigh_pages += count; | ||
5214 | #endif | ||
5215 | spin_unlock(&managed_page_count_lock); | ||
5216 | } | ||
5217 | EXPORT_SYMBOL(adjust_managed_page_count); | ||
5218 | |||
5219 | unsigned long free_reserved_area(void *start, void *end, int poison, char *s) | ||
5155 | { | 5220 | { |
5156 | unsigned long pages, pos; | 5221 | void *pos; |
5222 | unsigned long pages = 0; | ||
5157 | 5223 | ||
5158 | pos = start = PAGE_ALIGN(start); | 5224 | start = (void *)PAGE_ALIGN((unsigned long)start); |
5159 | end &= PAGE_MASK; | 5225 | end = (void *)((unsigned long)end & PAGE_MASK); |
5160 | for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { | 5226 | for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { |
5161 | if (poison) | 5227 | if ((unsigned int)poison <= 0xFF) |
5162 | memset((void *)pos, poison, PAGE_SIZE); | 5228 | memset(pos, poison, PAGE_SIZE); |
5163 | free_reserved_page(virt_to_page((void *)pos)); | 5229 | free_reserved_page(virt_to_page(pos)); |
5164 | } | 5230 | } |
5165 | 5231 | ||
5166 | if (pages && s) | 5232 | if (pages && s) |
5167 | pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", | 5233 | pr_info("Freeing %s memory: %ldK (%p - %p)\n", |
5168 | s, pages << (PAGE_SHIFT - 10), start, end); | 5234 | s, pages << (PAGE_SHIFT - 10), start, end); |
5169 | 5235 | ||
5170 | return pages; | 5236 | return pages; |
5171 | } | 5237 | } |
5238 | EXPORT_SYMBOL(free_reserved_area); | ||
5172 | 5239 | ||
5173 | #ifdef CONFIG_HIGHMEM | 5240 | #ifdef CONFIG_HIGHMEM |
5174 | void free_highmem_page(struct page *page) | 5241 | void free_highmem_page(struct page *page) |
5175 | { | 5242 | { |
5176 | __free_reserved_page(page); | 5243 | __free_reserved_page(page); |
5177 | totalram_pages++; | 5244 | totalram_pages++; |
5245 | page_zone(page)->managed_pages++; | ||
5178 | totalhigh_pages++; | 5246 | totalhigh_pages++; |
5179 | } | 5247 | } |
5180 | #endif | 5248 | #endif |
5181 | 5249 | ||
5250 | |||
5251 | void __init mem_init_print_info(const char *str) | ||
5252 | { | ||
5253 | unsigned long physpages, codesize, datasize, rosize, bss_size; | ||
5254 | unsigned long init_code_size, init_data_size; | ||
5255 | |||
5256 | physpages = get_num_physpages(); | ||
5257 | codesize = _etext - _stext; | ||
5258 | datasize = _edata - _sdata; | ||
5259 | rosize = __end_rodata - __start_rodata; | ||
5260 | bss_size = __bss_stop - __bss_start; | ||
5261 | init_data_size = __init_end - __init_begin; | ||
5262 | init_code_size = _einittext - _sinittext; | ||
5263 | |||
5264 | /* | ||
5265 | * Detect special cases and adjust section sizes accordingly: | ||
5266 | * 1) .init.* may be embedded into .data sections | ||
5267 | * 2) .init.text.* may be out of [__init_begin, __init_end], | ||
5268 | * please refer to arch/tile/kernel/vmlinux.lds.S. | ||
5269 | * 3) .rodata.* may be embedded into .text or .data sections. | ||
5270 | */ | ||
5271 | #define adj_init_size(start, end, size, pos, adj) \ | ||
5272 | if (start <= pos && pos < end && size > adj) \ | ||
5273 | size -= adj; | ||
5274 | |||
5275 | adj_init_size(__init_begin, __init_end, init_data_size, | ||
5276 | _sinittext, init_code_size); | ||
5277 | adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); | ||
5278 | adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); | ||
5279 | adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); | ||
5280 | adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); | ||
5281 | |||
5282 | #undef adj_init_size | ||
5283 | |||
5284 | printk("Memory: %luK/%luK available " | ||
5285 | "(%luK kernel code, %luK rwdata, %luK rodata, " | ||
5286 | "%luK init, %luK bss, %luK reserved" | ||
5287 | #ifdef CONFIG_HIGHMEM | ||
5288 | ", %luK highmem" | ||
5289 | #endif | ||
5290 | "%s%s)\n", | ||
5291 | nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), | ||
5292 | codesize >> 10, datasize >> 10, rosize >> 10, | ||
5293 | (init_data_size + init_code_size) >> 10, bss_size >> 10, | ||
5294 | (physpages - totalram_pages) << (PAGE_SHIFT-10), | ||
5295 | #ifdef CONFIG_HIGHMEM | ||
5296 | totalhigh_pages << (PAGE_SHIFT-10), | ||
5297 | #endif | ||
5298 | str ? ", " : "", str ? str : ""); | ||
5299 | } | ||
5300 | |||
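mem_init_print_info() centralises the per-architecture "Memory: ... available" banner; the optional string is appended to the line. A hedged sketch of the shape an architecture's mem_init() might take after this series (details vary per arch and this is not lifted from any of them):

```c
/* Sketch: generic shape of an arch mem_init() using the new helper. */
#include <linux/bootmem.h>
#include <linux/mm.h>

void __init mem_init(void)
{
	/* hand the remaining boot memory over to the buddy allocator */
	free_all_bootmem();

	/* print the unified banner; NULL means no arch-specific suffix */
	mem_init_print_info(NULL);
}
```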
5182 | /** | 5301 | /** |
5183 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 5302 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
5184 | * @new_dma_reserve: The number of pages to mark reserved | 5303 | * @new_dma_reserve: The number of pages to mark reserved |
@@ -5540,7 +5659,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
5540 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 5659 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist |
5541 | * can have before it gets flushed back to buddy allocator. | 5660 | * can have before it gets flushed back to buddy allocator. |
5542 | */ | 5661 | */ |
5543 | |||
5544 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 5662 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
5545 | void __user *buffer, size_t *length, loff_t *ppos) | 5663 | void __user *buffer, size_t *length, loff_t *ppos) |
5546 | { | 5664 | { |
@@ -5551,14 +5669,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
5551 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5669 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5552 | if (!write || (ret < 0)) | 5670 | if (!write || (ret < 0)) |
5553 | return ret; | 5671 | return ret; |
5672 | |||
5673 | mutex_lock(&pcp_batch_high_lock); | ||
5554 | for_each_populated_zone(zone) { | 5674 | for_each_populated_zone(zone) { |
5555 | for_each_possible_cpu(cpu) { | 5675 | unsigned long high; |
5556 | unsigned long high; | 5676 | high = zone->managed_pages / percpu_pagelist_fraction; |
5557 | high = zone->managed_pages / percpu_pagelist_fraction; | 5677 | for_each_possible_cpu(cpu) |
5558 | setup_pagelist_highmark( | 5678 | pageset_set_high(per_cpu_ptr(zone->pageset, cpu), |
5559 | per_cpu_ptr(zone->pageset, cpu), high); | 5679 | high); |
5560 | } | ||
5561 | } | 5680 | } |
5681 | mutex_unlock(&pcp_batch_high_lock); | ||
5562 | return 0; | 5682 | return 0; |
5563 | } | 5683 | } |
5564 | 5684 | ||
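The handler now computes one high mark per zone (managed_pages / percpu_pagelist_fraction) under pcp_batch_high_lock and applies it to every CPU's pageset. The arithmetic is simple; a throwaway userspace illustration with invented numbers:

```c
/* Userspace demo of the per-cpu pagelist high-mark arithmetic (invented numbers). */
#include <stdio.h>

int main(void)
{
	unsigned long managed_pages = 1UL << 20;	/* a 4 GiB zone of 4 KiB pages */
	unsigned long fraction = 8;			/* vm.percpu_pagelist_fraction */
	unsigned long high = managed_pages / fraction;

	/* 4 KiB pages: divide by 256 to get MiB */
	printf("pcp high mark = %lu pages (%lu MiB) per CPU\n", high, high >> 8);
	return 0;
}
```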
@@ -6047,32 +6167,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) | |||
6047 | #endif | 6167 | #endif |
6048 | 6168 | ||
6049 | #ifdef CONFIG_MEMORY_HOTPLUG | 6169 | #ifdef CONFIG_MEMORY_HOTPLUG |
6050 | static int __meminit __zone_pcp_update(void *data) | 6170 | /* |
6051 | { | 6171 | * The zone indicated has a new number of managed_pages; batch sizes and percpu |
6052 | struct zone *zone = data; | 6172 | * page high values need to be recalculated. |
6053 | int cpu; | 6173 | */ |
6054 | unsigned long batch = zone_batchsize(zone), flags; | ||
6055 | |||
6056 | for_each_possible_cpu(cpu) { | ||
6057 | struct per_cpu_pageset *pset; | ||
6058 | struct per_cpu_pages *pcp; | ||
6059 | |||
6060 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
6061 | pcp = &pset->pcp; | ||
6062 | |||
6063 | local_irq_save(flags); | ||
6064 | if (pcp->count > 0) | ||
6065 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
6066 | drain_zonestat(zone, pset); | ||
6067 | setup_pageset(pset, batch); | ||
6068 | local_irq_restore(flags); | ||
6069 | } | ||
6070 | return 0; | ||
6071 | } | ||
6072 | |||
6073 | void __meminit zone_pcp_update(struct zone *zone) | 6174 | void __meminit zone_pcp_update(struct zone *zone) |
6074 | { | 6175 | { |
6075 | stop_machine(__zone_pcp_update, zone, NULL); | 6176 | unsigned cpu; |
6177 | mutex_lock(&pcp_batch_high_lock); | ||
6178 | for_each_possible_cpu(cpu) | ||
6179 | pageset_set_high_and_batch(zone, | ||
6180 | per_cpu_ptr(zone->pageset, cpu)); | ||
6181 | mutex_unlock(&pcp_batch_high_lock); | ||
6076 | } | 6182 | } |
6077 | #endif | 6183 | #endif |
6078 | 6184 | ||
@@ -6142,6 +6248,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6142 | list_del(&page->lru); | 6248 | list_del(&page->lru); |
6143 | rmv_page_order(page); | 6249 | rmv_page_order(page); |
6144 | zone->free_area[order].nr_free--; | 6250 | zone->free_area[order].nr_free--; |
6251 | #ifdef CONFIG_HIGHMEM | ||
6252 | if (PageHighMem(page)) | ||
6253 | totalhigh_pages -= 1 << order; | ||
6254 | #endif | ||
6145 | for (i = 0; i < (1 << order); i++) | 6255 | for (i = 0; i < (1 << order); i++) |
6146 | SetPageReserved((page+i)); | 6256 | SetPageReserved((page+i)); |
6147 | pfn += (1 << order); | 6257 | pfn += (1 << order); |
diff --git a/mm/page_io.c b/mm/page_io.c index a8a3ef45fed7..ba05b64e5d8d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
23 | #include <linux/aio.h> | 23 | #include <linux/aio.h> |
24 | #include <linux/blkdev.h> | ||
24 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
25 | 26 | ||
26 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 27 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
80 | imajor(bio->bi_bdev->bd_inode), | 81 | imajor(bio->bi_bdev->bd_inode), |
81 | iminor(bio->bi_bdev->bd_inode), | 82 | iminor(bio->bi_bdev->bd_inode), |
82 | (unsigned long long)bio->bi_sector); | 83 | (unsigned long long)bio->bi_sector); |
83 | } else { | 84 | goto out; |
84 | SetPageUptodate(page); | ||
85 | } | 85 | } |
86 | |||
87 | SetPageUptodate(page); | ||
88 | |||
89 | /* | ||
90 | * There is no guarantee that the page is in swap cache - the software | ||
91 | * suspend code (at least) uses end_swap_bio_read() against a non- | ||
92 | * swapcache page. So we must check PG_swapcache before proceeding with | ||
93 | * this optimization. | ||
94 | */ | ||
95 | if (likely(PageSwapCache(page))) { | ||
96 | struct swap_info_struct *sis; | ||
97 | |||
98 | sis = page_swap_info(page); | ||
99 | if (sis->flags & SWP_BLKDEV) { | ||
100 | /* | ||
101 | * The swap subsystem performs lazy swap slot freeing, | ||
102 | * expecting that the page will be swapped out again. | ||
103 | * So we can avoid an unnecessary write if the page | ||
104 | * isn't redirtied. | ||
105 | * This is good for real swap storage because we can | ||
106 | * reduce unnecessary I/O and enhance wear-leveling | ||
107 | * if an SSD is used as the swap device. | ||
108 | * But if in-memory swap device (eg zram) is used, | ||
109 | * this causes a duplicated copy between uncompressed | ||
110 | * data in VM-owned memory and compressed data in | ||
111 | * zram-owned memory. So let's free zram-owned memory | ||
112 | * and make the VM-owned decompressed page *dirty*, | ||
113 | * so the page should be swapped out somewhere again if | ||
114 | * we again wish to reclaim it. | ||
115 | */ | ||
116 | struct gendisk *disk = sis->bdev->bd_disk; | ||
117 | if (disk->fops->swap_slot_free_notify) { | ||
118 | swp_entry_t entry; | ||
119 | unsigned long offset; | ||
120 | |||
121 | entry.val = page_private(page); | ||
122 | offset = swp_offset(entry); | ||
123 | |||
124 | SetPageDirty(page); | ||
125 | disk->fops->swap_slot_free_notify(sis->bdev, | ||
126 | offset); | ||
127 | } | ||
128 | } | ||
129 | } | ||
130 | |||
131 | out: | ||
86 | unlock_page(page); | 132 | unlock_page(page); |
87 | bio_put(bio); | 133 | bio_put(bio); |
88 | } | 134 | } |
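The early-free path above only triggers when the swap device is a block device whose driver provides the optional swap_slot_free_notify() hook in its block_device_operations. A hedged sketch of the driver side (the myram_* names and types are invented; zram's real implementation differs in detail):

```c
/* Sketch: a RAM-backed block driver dropping its copy of a freed swap slot. */
#include <linux/blkdev.h>
#include <linux/module.h>

struct myram_dev;					/* hypothetical driver state */
void myram_free_slot(struct myram_dev *dev, unsigned long offset);

static void myram_slot_free_notify(struct block_device *bdev,
				   unsigned long offset)
{
	struct myram_dev *dev = bdev->bd_disk->private_data;

	/* discard the compressed copy; the VM keeps the now-dirty page */
	myram_free_slot(dev, offset);
}

static const struct block_device_operations myram_ops = {
	.owner			= THIS_MODULE,
	.swap_slot_free_notify	= myram_slot_free_notify,
};
```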
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
1093 | else | 1093 | else |
1094 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1094 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1095 | __page_set_anon_rmap(page, vma, address, 1); | 1095 | __page_set_anon_rmap(page, vma, address, 1); |
1096 | if (!mlocked_vma_newpage(vma, page)) | 1096 | if (!mlocked_vma_newpage(vma, page)) { |
1097 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1097 | SetPageActive(page); |
1098 | else | 1098 | lru_cache_add(page); |
1099 | } else | ||
1099 | add_page_to_unevictable_list(page); | 1100 | add_page_to_unevictable_list(page); |
1100 | } | 1101 | } |
1101 | 1102 | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 1c91f0d3f6ab..3194ec414728 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -481,6 +481,9 @@ void __init sparse_init(void) | |||
481 | struct page **map_map; | 481 | struct page **map_map; |
482 | #endif | 482 | #endif |
483 | 483 | ||
484 | /* see include/linux/mmzone.h 'struct mem_section' definition */ | ||
485 | BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); | ||
486 | |||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | 487 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ |
485 | set_pageblock_order(); | 488 | set_pageblock_order(); |
486 | 489 | ||
@@ -34,10 +34,13 @@ | |||
34 | 34 | ||
35 | #include "internal.h" | 35 | #include "internal.h" |
36 | 36 | ||
37 | #define CREATE_TRACE_POINTS | ||
38 | #include <trace/events/pagemap.h> | ||
39 | |||
37 | /* How many pages do we try to swap or page in/out together? */ | 40 | /* How many pages do we try to swap or page in/out together? */ |
38 | int page_cluster; | 41 | int page_cluster; |
39 | 42 | ||
40 | static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); | 43 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
41 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 44 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
42 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | 45 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); |
43 | 46 | ||
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, | |||
384 | SetPageActive(page); | 387 | SetPageActive(page); |
385 | lru += LRU_ACTIVE; | 388 | lru += LRU_ACTIVE; |
386 | add_page_to_lru_list(page, lruvec, lru); | 389 | add_page_to_lru_list(page, lruvec, lru); |
390 | trace_mm_lru_activate(page, page_to_pfn(page)); | ||
387 | 391 | ||
388 | __count_vm_event(PGACTIVATE); | 392 | __count_vm_event(PGACTIVATE); |
389 | update_page_reclaim_stat(lruvec, file, 1); | 393 | update_page_reclaim_stat(lruvec, file, 1); |
@@ -428,6 +432,33 @@ void activate_page(struct page *page) | |||
428 | } | 432 | } |
429 | #endif | 433 | #endif |
430 | 434 | ||
435 | static void __lru_cache_activate_page(struct page *page) | ||
436 | { | ||
437 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | ||
438 | int i; | ||
439 | |||
440 | /* | ||
441 | * Search backwards on the optimistic assumption that the page being | ||
442 | * activated has just been added to this pagevec. Note that only | ||
443 | * the local pagevec is examined as a !PageLRU page could be in the | ||
444 | * process of being released, reclaimed, migrated or on a remote | ||
445 | * pagevec that is currently being drained. Furthermore, marking | ||
446 | * a remote pagevec's page PageActive potentially hits a race where | ||
447 | * a page is marked PageActive just after it is added to the inactive | ||
448 | * list causing accounting errors and BUG_ON checks to trigger. | ||
449 | */ | ||
450 | for (i = pagevec_count(pvec) - 1; i >= 0; i--) { | ||
451 | struct page *pagevec_page = pvec->pages[i]; | ||
452 | |||
453 | if (pagevec_page == page) { | ||
454 | SetPageActive(page); | ||
455 | break; | ||
456 | } | ||
457 | } | ||
458 | |||
459 | put_cpu_var(lru_add_pvec); | ||
460 | } | ||
461 | |||
431 | /* | 462 | /* |
432 | * Mark a page as having seen activity. | 463 | * Mark a page as having seen activity. |
433 | * | 464 | * |
@@ -438,8 +469,18 @@ void activate_page(struct page *page) | |||
438 | void mark_page_accessed(struct page *page) | 469 | void mark_page_accessed(struct page *page) |
439 | { | 470 | { |
440 | if (!PageActive(page) && !PageUnevictable(page) && | 471 | if (!PageActive(page) && !PageUnevictable(page) && |
441 | PageReferenced(page) && PageLRU(page)) { | 472 | PageReferenced(page)) { |
442 | activate_page(page); | 473 | |
474 | /* | ||
475 | * If the page is on the LRU, queue it for activation via | ||
476 | * activate_page_pvecs. Otherwise, assume the page is on a | ||
477 | * pagevec, mark it active and it'll be moved to the active | ||
478 | * LRU on the next drain. | ||
479 | */ | ||
480 | if (PageLRU(page)) | ||
481 | activate_page(page); | ||
482 | else | ||
483 | __lru_cache_activate_page(page); | ||
443 | ClearPageReferenced(page); | 484 | ClearPageReferenced(page); |
444 | } else if (!PageReferenced(page)) { | 485 | } else if (!PageReferenced(page)) { |
445 | SetPageReferenced(page); | 486 | SetPageReferenced(page); |
@@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page) | |||
448 | EXPORT_SYMBOL(mark_page_accessed); | 489 | EXPORT_SYMBOL(mark_page_accessed); |
449 | 490 | ||
450 | /* | 491 | /* |
451 | * Order of operations is important: flush the pagevec when it's already | 492 | * Queue the page for addition to the LRU via pagevec. The decision on whether |
452 | * full, not when adding the last page, to make sure that last page is | 493 | * to add the page to the [in]active [file|anon] list is deferred until the |
453 | * not added to the LRU directly when passed to this function. Because | 494 | * pagevec is drained. This gives a chance for the caller of __lru_cache_add() |
454 | * mark_page_accessed() (called after this when writing) only activates | 495 | to have the page added to the active list using mark_page_accessed(). |
455 | * pages that are on the LRU, linear writes in subpage chunks would see | ||
456 | * every PAGEVEC_SIZE page activated, which is unexpected. | ||
457 | */ | 496 | */ |
458 | void __lru_cache_add(struct page *page, enum lru_list lru) | 497 | void __lru_cache_add(struct page *page) |
459 | { | 498 | { |
460 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; | 499 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
461 | 500 | ||
462 | page_cache_get(page); | 501 | page_cache_get(page); |
463 | if (!pagevec_space(pvec)) | 502 | if (!pagevec_space(pvec)) |
464 | __pagevec_lru_add(pvec, lru); | 503 | __pagevec_lru_add(pvec); |
465 | pagevec_add(pvec, page); | 504 | pagevec_add(pvec, page); |
466 | put_cpu_var(lru_add_pvecs); | 505 | put_cpu_var(lru_add_pvec); |
467 | } | 506 | } |
468 | EXPORT_SYMBOL(__lru_cache_add); | 507 | EXPORT_SYMBOL(__lru_cache_add); |
469 | 508 | ||
470 | /** | 509 | /** |
471 | * lru_cache_add_lru - add a page to a page list | 510 | * lru_cache_add - add a page to a page list |
472 | * @page: the page to be added to the LRU. | 511 | * @page: the page to be added to the LRU. |
473 | * @lru: the LRU list to which the page is added. | ||
474 | */ | 512 | */ |
475 | void lru_cache_add_lru(struct page *page, enum lru_list lru) | 513 | void lru_cache_add(struct page *page) |
476 | { | 514 | { |
477 | if (PageActive(page)) { | 515 | if (PageActive(page)) { |
478 | VM_BUG_ON(PageUnevictable(page)); | 516 | VM_BUG_ON(PageUnevictable(page)); |
479 | ClearPageActive(page); | ||
480 | } else if (PageUnevictable(page)) { | 517 | } else if (PageUnevictable(page)) { |
481 | VM_BUG_ON(PageActive(page)); | 518 | VM_BUG_ON(PageActive(page)); |
482 | ClearPageUnevictable(page); | ||
483 | } | 519 | } |
484 | 520 | ||
485 | VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); | 521 | VM_BUG_ON(PageLRU(page)); |
486 | __lru_cache_add(page, lru); | 522 | __lru_cache_add(page); |
487 | } | 523 | } |
488 | 524 | ||
489 | /** | 525 | /** |
@@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | |||
583 | */ | 619 | */ |
584 | void lru_add_drain_cpu(int cpu) | 620 | void lru_add_drain_cpu(int cpu) |
585 | { | 621 | { |
586 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | 622 | struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); |
587 | struct pagevec *pvec; | ||
588 | int lru; | ||
589 | 623 | ||
590 | for_each_lru(lru) { | 624 | if (pagevec_count(pvec)) |
591 | pvec = &pvecs[lru - LRU_BASE]; | 625 | __pagevec_lru_add(pvec); |
592 | if (pagevec_count(pvec)) | ||
593 | __pagevec_lru_add(pvec, lru); | ||
594 | } | ||
595 | 626 | ||
596 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | 627 | pvec = &per_cpu(lru_rotate_pvecs, cpu); |
597 | if (pagevec_count(pvec)) { | 628 | if (pagevec_count(pvec)) { |
@@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold) | |||
708 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 739 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
709 | } | 740 | } |
710 | 741 | ||
742 | /* Clear Active bit in case of parallel mark_page_accessed */ | ||
743 | ClearPageActive(page); | ||
744 | |||
711 | list_add(&page->lru, &pages_to_free); | 745 | list_add(&page->lru, &pages_to_free); |
712 | } | 746 | } |
713 | if (zone) | 747 | if (zone) |
@@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
795 | static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, | 829 | static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, |
796 | void *arg) | 830 | void *arg) |
797 | { | 831 | { |
798 | enum lru_list lru = (enum lru_list)arg; | 832 | int file = page_is_file_cache(page); |
799 | int file = is_file_lru(lru); | 833 | int active = PageActive(page); |
800 | int active = is_active_lru(lru); | 834 | enum lru_list lru = page_lru(page); |
801 | 835 | ||
802 | VM_BUG_ON(PageActive(page)); | ||
803 | VM_BUG_ON(PageUnevictable(page)); | 836 | VM_BUG_ON(PageUnevictable(page)); |
804 | VM_BUG_ON(PageLRU(page)); | 837 | VM_BUG_ON(PageLRU(page)); |
805 | 838 | ||
806 | SetPageLRU(page); | 839 | SetPageLRU(page); |
807 | if (active) | ||
808 | SetPageActive(page); | ||
809 | add_page_to_lru_list(page, lruvec, lru); | 840 | add_page_to_lru_list(page, lruvec, lru); |
810 | update_page_reclaim_stat(lruvec, file, active); | 841 | update_page_reclaim_stat(lruvec, file, active); |
842 | trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); | ||
811 | } | 843 | } |
812 | 844 | ||
813 | /* | 845 | /* |
814 | * Add the passed pages to the LRU, then drop the caller's refcount | 846 | * Add the passed pages to the LRU, then drop the caller's refcount |
815 | * on them. Reinitialises the caller's pagevec. | 847 | * on them. Reinitialises the caller's pagevec. |
816 | */ | 848 | */ |
817 | void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | 849 | void __pagevec_lru_add(struct pagevec *pvec) |
818 | { | 850 | { |
819 | VM_BUG_ON(is_unevictable_lru(lru)); | 851 | pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); |
820 | |||
821 | pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); | ||
822 | } | 852 | } |
823 | EXPORT_SYMBOL(__pagevec_lru_add); | 853 | EXPORT_SYMBOL(__pagevec_lru_add); |
824 | 854 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 746af55b8455..36af6eeaa67e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
212 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 212 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
213 | goto checks; | 213 | goto checks; |
214 | } | 214 | } |
215 | if (si->flags & SWP_DISCARDABLE) { | 215 | if (si->flags & SWP_PAGE_DISCARD) { |
216 | /* | 216 | /* |
217 | * Start range check on racing allocations, in case | 217 | * Start range check on racing allocations, in case |
218 | * they overlap the cluster we eventually decide on | 218 | * they overlap the cluster we eventually decide on |
@@ -322,7 +322,7 @@ checks: | |||
322 | 322 | ||
323 | if (si->lowest_alloc) { | 323 | if (si->lowest_alloc) { |
324 | /* | 324 | /* |
325 | * Only set when SWP_DISCARDABLE, and there's a scan | 325 | * Only set when SWP_PAGE_DISCARD, and there's a scan |
326 | * for a free cluster in progress or just completed. | 326 | * for a free cluster in progress or just completed. |
327 | */ | 327 | */ |
328 | if (found_free_cluster) { | 328 | if (found_free_cluster) { |
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2016 | return nr_extents; | 2016 | return nr_extents; |
2017 | } | 2017 | } |
2018 | 2018 | ||
2019 | /* | ||
2020 | * Helper to sys_swapon determining if a given swap | ||
2021 | * backing device queue supports DISCARD operations. | ||
2022 | */ | ||
2023 | static bool swap_discardable(struct swap_info_struct *si) | ||
2024 | { | ||
2025 | struct request_queue *q = bdev_get_queue(si->bdev); | ||
2026 | |||
2027 | if (!q || !blk_queue_discard(q)) | ||
2028 | return false; | ||
2029 | |||
2030 | return true; | ||
2031 | } | ||
2032 | |||
2019 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 2033 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
2020 | { | 2034 | { |
2021 | struct swap_info_struct *p; | 2035 | struct swap_info_struct *p; |
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2123 | p->flags |= SWP_SOLIDSTATE; | 2137 | p->flags |= SWP_SOLIDSTATE; |
2124 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | 2138 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); |
2125 | } | 2139 | } |
2126 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) | 2140 | |
2127 | p->flags |= SWP_DISCARDABLE; | 2141 | if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
2142 | /* | ||
2143 | * When discard is enabled for swap with no particular | ||
2144 | * policy flagged, we set all swap discard flags here in | ||
2145 | * order to sustain backward compatibility with older | ||
2146 | * swapon(8) releases. | ||
2147 | */ | ||
2148 | p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | | ||
2149 | SWP_PAGE_DISCARD); | ||
2150 | |||
2151 | /* | ||
2152 | * By flagging sys_swapon, a sysadmin can tell us to | ||
2153 | * either do single-time area discards only, or to just | ||
2154 | * perform discards for released swap page-clusters. | ||
2155 | * Now it's time to adjust the p->flags accordingly. | ||
2156 | */ | ||
2157 | if (swap_flags & SWAP_FLAG_DISCARD_ONCE) | ||
2158 | p->flags &= ~SWP_PAGE_DISCARD; | ||
2159 | else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) | ||
2160 | p->flags &= ~SWP_AREA_DISCARD; | ||
2161 | |||
2162 | /* issue a swapon-time discard if it's still required */ | ||
2163 | if (p->flags & SWP_AREA_DISCARD) { | ||
2164 | int err = discard_swap(p); | ||
2165 | if (unlikely(err)) | ||
2166 | printk(KERN_ERR | ||
2167 | "swapon: discard_swap(%p): %d\n", | ||
2168 | p, err); | ||
2169 | } | ||
2170 | } | ||
2128 | } | 2171 | } |
2129 | 2172 | ||
2130 | mutex_lock(&swapon_mutex); | 2173 | mutex_lock(&swapon_mutex); |
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2135 | enable_swap_info(p, prio, swap_map, frontswap_map); | 2178 | enable_swap_info(p, prio, swap_map, frontswap_map); |
2136 | 2179 | ||
2137 | printk(KERN_INFO "Adding %uk swap on %s. " | 2180 | printk(KERN_INFO "Adding %uk swap on %s. " |
2138 | "Priority:%d extents:%d across:%lluk %s%s%s\n", | 2181 | "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", |
2139 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, | 2182 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
2140 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2183 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2141 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2184 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2142 | (p->flags & SWP_DISCARDABLE) ? "D" : "", | 2185 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
2186 | (p->flags & SWP_AREA_DISCARD) ? "s" : "", | ||
2187 | (p->flags & SWP_PAGE_DISCARD) ? "c" : "", | ||
2143 | (frontswap_map) ? "FS" : ""); | 2188 | (frontswap_map) ? "FS" : ""); |
2144 | 2189 | ||
2145 | mutex_unlock(&swapon_mutex); | 2190 | mutex_unlock(&swapon_mutex); |
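From userspace the new behaviour is selected through swapon(2) flags: SWAP_FLAG_DISCARD alone keeps the old "discard everything" semantics, while the new _ONCE/_PAGES modifiers restrict it to a single swapon-time discard or to per-cluster discards. A minimal, hedged example (the fallback flag values mirror this kernel's include/linux/swap.h, /dev/sdb2 is a placeholder, and util-linux's swapon --discard=pages is the usual front end):

```c
/* Userspace sketch: enable only page-cluster discards on a swap partition. */
#include <stdio.h>
#include <sys/swap.h>

#ifndef SWAP_FLAG_DISCARD
#define SWAP_FLAG_DISCARD	0x10000
#endif
#ifndef SWAP_FLAG_DISCARD_PAGES
#define SWAP_FLAG_DISCARD_PAGES	0x40000
#endif

int main(void)
{
	if (swapon("/dev/sdb2", SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES)) {
		perror("swapon");
		return 1;
	}
	return 0;
}
```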
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d365724feb05..91a10472a39a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) | |||
292 | va = rb_entry(n, struct vmap_area, rb_node); | 292 | va = rb_entry(n, struct vmap_area, rb_node); |
293 | if (addr < va->va_start) | 293 | if (addr < va->va_start) |
294 | n = n->rb_left; | 294 | n = n->rb_left; |
295 | else if (addr > va->va_start) | 295 | else if (addr >= va->va_end) |
296 | n = n->rb_right; | 296 | n = n->rb_right; |
297 | else | 297 | else |
298 | return va; | 298 | return va; |
@@ -1322,13 +1322,6 @@ static void clear_vm_unlist(struct vm_struct *vm) | |||
1322 | vm->flags &= ~VM_UNLIST; | 1322 | vm->flags &= ~VM_UNLIST; |
1323 | } | 1323 | } |
1324 | 1324 | ||
1325 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
1326 | unsigned long flags, const void *caller) | ||
1327 | { | ||
1328 | setup_vmalloc_vm(vm, va, flags, caller); | ||
1329 | clear_vm_unlist(vm); | ||
1330 | } | ||
1331 | |||
1332 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1325 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1333 | unsigned long align, unsigned long flags, unsigned long start, | 1326 | unsigned long align, unsigned long flags, unsigned long start, |
1334 | unsigned long end, int node, gfp_t gfp_mask, const void *caller) | 1327 | unsigned long end, int node, gfp_t gfp_mask, const void *caller) |
@@ -1337,16 +1330,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1337 | struct vm_struct *area; | 1330 | struct vm_struct *area; |
1338 | 1331 | ||
1339 | BUG_ON(in_interrupt()); | 1332 | BUG_ON(in_interrupt()); |
1340 | if (flags & VM_IOREMAP) { | 1333 | if (flags & VM_IOREMAP) |
1341 | int bit = fls(size); | 1334 | align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); |
1342 | |||
1343 | if (bit > IOREMAP_MAX_ORDER) | ||
1344 | bit = IOREMAP_MAX_ORDER; | ||
1345 | else if (bit < PAGE_SHIFT) | ||
1346 | bit = PAGE_SHIFT; | ||
1347 | |||
1348 | align = 1ul << bit; | ||
1349 | } | ||
1350 | 1335 | ||
1351 | size = PAGE_ALIGN(size); | 1336 | size = PAGE_ALIGN(size); |
1352 | if (unlikely(!size)) | 1337 | if (unlikely(!size)) |
@@ -1367,16 +1352,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1367 | return NULL; | 1352 | return NULL; |
1368 | } | 1353 | } |
1369 | 1354 | ||
1370 | /* | 1355 | setup_vmalloc_vm(area, va, flags, caller); |
1371 | * When this function is called from __vmalloc_node_range, | ||
1372 | * we add VM_UNLIST flag to avoid accessing uninitialized | ||
1373 | * members of vm_struct such as pages and nr_pages fields. | ||
1374 | * They will be set later. | ||
1375 | */ | ||
1376 | if (flags & VM_UNLIST) | ||
1377 | setup_vmalloc_vm(area, va, flags, caller); | ||
1378 | else | ||
1379 | insert_vmalloc_vm(area, va, flags, caller); | ||
1380 | 1356 | ||
1381 | return area; | 1357 | return area; |
1382 | } | 1358 | } |
@@ -1476,10 +1452,9 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1476 | if (!addr) | 1452 | if (!addr) |
1477 | return; | 1453 | return; |
1478 | 1454 | ||
1479 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | 1455 | if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", |
1480 | WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | 1456 | addr)) |
1481 | return; | 1457 | return; |
1482 | } | ||
1483 | 1458 | ||
1484 | area = remove_vm_area(addr); | 1459 | area = remove_vm_area(addr); |
1485 | if (unlikely(!area)) { | 1460 | if (unlikely(!area)) { |
@@ -2148,42 +2123,43 @@ finished: | |||
2148 | } | 2123 | } |
2149 | 2124 | ||
2150 | /** | 2125 | /** |
2151 | * remap_vmalloc_range - map vmalloc pages to userspace | 2126 | * remap_vmalloc_range_partial - map vmalloc pages to userspace |
2152 | * @vma: vma to cover (map full range of vma) | 2127 | * @vma: vma to cover |
2153 | * @addr: vmalloc memory | 2128 | * @uaddr: target user address to start at |
2154 | * @pgoff: number of pages into addr before first page to map | 2129 | * @kaddr: virtual address of vmalloc kernel memory |
2130 | * @size: size of map area | ||
2155 | * | 2131 | * |
2156 | * Returns: 0 for success, -Exxx on failure | 2132 | * Returns: 0 for success, -Exxx on failure |
2157 | * | 2133 | * |
2158 | * This function checks that addr is a valid vmalloc'ed area, and | 2134 | * This function checks that @kaddr is a valid vmalloc'ed area, |
2159 | * that it is big enough to cover the vma. Will return failure if | 2135 | * and that it is big enough to cover the range starting at |
2160 | * that criteria isn't met. | 2136 | * @uaddr in @vma. Will return failure if that criteria isn't |
2137 | * met. | ||
2161 | * | 2138 | * |
2162 | * Similar to remap_pfn_range() (see mm/memory.c) | 2139 | * Similar to remap_pfn_range() (see mm/memory.c) |
2163 | */ | 2140 | */ |
2164 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | 2141 | int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, |
2165 | unsigned long pgoff) | 2142 | void *kaddr, unsigned long size) |
2166 | { | 2143 | { |
2167 | struct vm_struct *area; | 2144 | struct vm_struct *area; |
2168 | unsigned long uaddr = vma->vm_start; | ||
2169 | unsigned long usize = vma->vm_end - vma->vm_start; | ||
2170 | 2145 | ||
2171 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 2146 | size = PAGE_ALIGN(size); |
2147 | |||
2148 | if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) | ||
2172 | return -EINVAL; | 2149 | return -EINVAL; |
2173 | 2150 | ||
2174 | area = find_vm_area(addr); | 2151 | area = find_vm_area(kaddr); |
2175 | if (!area) | 2152 | if (!area) |
2176 | return -EINVAL; | 2153 | return -EINVAL; |
2177 | 2154 | ||
2178 | if (!(area->flags & VM_USERMAP)) | 2155 | if (!(area->flags & VM_USERMAP)) |
2179 | return -EINVAL; | 2156 | return -EINVAL; |
2180 | 2157 | ||
2181 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 2158 | if (kaddr + size > area->addr + area->size) |
2182 | return -EINVAL; | 2159 | return -EINVAL; |
2183 | 2160 | ||
2184 | addr += pgoff << PAGE_SHIFT; | ||
2185 | do { | 2161 | do { |
2186 | struct page *page = vmalloc_to_page(addr); | 2162 | struct page *page = vmalloc_to_page(kaddr); |
2187 | int ret; | 2163 | int ret; |
2188 | 2164 | ||
2189 | ret = vm_insert_page(vma, uaddr, page); | 2165 | ret = vm_insert_page(vma, uaddr, page); |
@@ -2191,14 +2167,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
2191 | return ret; | 2167 | return ret; |
2192 | 2168 | ||
2193 | uaddr += PAGE_SIZE; | 2169 | uaddr += PAGE_SIZE; |
2194 | addr += PAGE_SIZE; | 2170 | kaddr += PAGE_SIZE; |
2195 | usize -= PAGE_SIZE; | 2171 | size -= PAGE_SIZE; |
2196 | } while (usize > 0); | 2172 | } while (size > 0); |
2197 | 2173 | ||
2198 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; | 2174 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
2199 | 2175 | ||
2200 | return 0; | 2176 | return 0; |
2201 | } | 2177 | } |
2178 | EXPORT_SYMBOL(remap_vmalloc_range_partial); | ||
2179 | |||
2180 | /** | ||
2181 | * remap_vmalloc_range - map vmalloc pages to userspace | ||
2182 | * @vma: vma to cover (map full range of vma) | ||
2183 | * @addr: vmalloc memory | ||
2184 | * @pgoff: number of pages into addr before first page to map | ||
2185 | * | ||
2186 | * Returns: 0 for success, -Exxx on failure | ||
2187 | * | ||
2188 | * This function checks that addr is a valid vmalloc'ed area, and | ||
2189 | * that it is big enough to cover the vma. Will return failure if | ||
2190 | * that criteria isn't met. | ||
2191 | * | ||
2192 | * Similar to remap_pfn_range() (see mm/memory.c) | ||
2193 | */ | ||
2194 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | ||
2195 | unsigned long pgoff) | ||
2196 | { | ||
2197 | return remap_vmalloc_range_partial(vma, vma->vm_start, | ||
2198 | addr + (pgoff << PAGE_SHIFT), | ||
2199 | vma->vm_end - vma->vm_start); | ||
2200 | } | ||
2202 | EXPORT_SYMBOL(remap_vmalloc_range); | 2201 | EXPORT_SYMBOL(remap_vmalloc_range); |
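remap_vmalloc_range() is now a thin wrapper around remap_vmalloc_range_partial(), which lets a caller (the vmcore/kdump code is the intended user) map a page-aligned slice of a VM_USERMAP vmalloc buffer at any user address inside the vma. A hedged sketch of a driver mmap handler using it (mydev_buf and its allocation are assumptions):

```c
/* Sketch: expose everything past the first page of a vmalloc_user() buffer. */
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/fs.h>

static void *mydev_buf;		/* allocated elsewhere with vmalloc_user() */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long len = vma->vm_end - vma->vm_start;

	/* map 'len' bytes starting one page into the kernel buffer */
	return remap_vmalloc_range_partial(vma, vma->vm_start,
					   mydev_buf + PAGE_SIZE, len);
}
```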
2203 | 2202 | ||
2204 | /* | 2203 | /* |
@@ -2512,8 +2511,8 @@ found: | |||
2512 | 2511 | ||
2513 | /* insert all vm's */ | 2512 | /* insert all vm's */ |
2514 | for (area = 0; area < nr_vms; area++) | 2513 | for (area = 0; area < nr_vms; area++) |
2515 | insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, | 2514 | setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, |
2516 | pcpu_get_vm_areas); | 2515 | pcpu_get_vm_areas); |
2517 | 2516 | ||
2518 | kfree(vas); | 2517 | kfree(vas); |
2519 | return vms; | 2518 | return vms; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index fa6a85378ee4..99b3ac7771ad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
546 | void putback_lru_page(struct page *page) | 546 | void putback_lru_page(struct page *page) |
547 | { | 547 | { |
548 | int lru; | 548 | int lru; |
549 | int active = !!TestClearPageActive(page); | ||
550 | int was_unevictable = PageUnevictable(page); | 549 | int was_unevictable = PageUnevictable(page); |
551 | 550 | ||
552 | VM_BUG_ON(PageLRU(page)); | 551 | VM_BUG_ON(PageLRU(page)); |
@@ -561,8 +560,8 @@ redo: | |||
561 | * unevictable page on [in]active list. | 560 | * unevictable page on [in]active list. |
562 | * We know how to handle that. | 561 | * We know how to handle that. |
563 | */ | 562 | */ |
564 | lru = active + page_lru_base_type(page); | 563 | lru = page_lru_base_type(page); |
565 | lru_cache_add_lru(page, lru); | 564 | lru_cache_add(page); |
566 | } else { | 565 | } else { |
567 | /* | 566 | /* |
568 | * Put unevictable pages directly on zone's unevictable | 567 | * Put unevictable pages directly on zone's unevictable |
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page, | |||
669 | return PAGEREF_RECLAIM; | 668 | return PAGEREF_RECLAIM; |
670 | } | 669 | } |
671 | 670 | ||
671 | /* Check if a page is dirty or under writeback */ | ||
672 | static void page_check_dirty_writeback(struct page *page, | ||
673 | bool *dirty, bool *writeback) | ||
674 | { | ||
675 | struct address_space *mapping; | ||
676 | |||
677 | /* | ||
678 | * Anonymous pages are not handled by flushers and must be written | ||
679 | * from reclaim context. Do not stall reclaim based on them | ||
680 | */ | ||
681 | if (!page_is_file_cache(page)) { | ||
682 | *dirty = false; | ||
683 | *writeback = false; | ||
684 | return; | ||
685 | } | ||
686 | |||
687 | /* By default assume that the page flags are accurate */ | ||
688 | *dirty = PageDirty(page); | ||
689 | *writeback = PageWriteback(page); | ||
690 | |||
691 | /* Verify dirty/writeback state if the filesystem supports it */ | ||
692 | if (!page_has_private(page)) | ||
693 | return; | ||
694 | |||
695 | mapping = page_mapping(page); | ||
696 | if (mapping && mapping->a_ops->is_dirty_writeback) | ||
697 | mapping->a_ops->is_dirty_writeback(page, dirty, writeback); | ||
698 | } | ||
699 | |||
672 | /* | 700 | /* |
673 | * shrink_page_list() returns the number of reclaimed pages | 701 | * shrink_page_list() returns the number of reclaimed pages |
674 | */ | 702 | */ |
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
677 | struct scan_control *sc, | 705 | struct scan_control *sc, |
678 | enum ttu_flags ttu_flags, | 706 | enum ttu_flags ttu_flags, |
679 | unsigned long *ret_nr_dirty, | 707 | unsigned long *ret_nr_dirty, |
708 | unsigned long *ret_nr_unqueued_dirty, | ||
709 | unsigned long *ret_nr_congested, | ||
680 | unsigned long *ret_nr_writeback, | 710 | unsigned long *ret_nr_writeback, |
711 | unsigned long *ret_nr_immediate, | ||
681 | bool force_reclaim) | 712 | bool force_reclaim) |
682 | { | 713 | { |
683 | LIST_HEAD(ret_pages); | 714 | LIST_HEAD(ret_pages); |
684 | LIST_HEAD(free_pages); | 715 | LIST_HEAD(free_pages); |
685 | int pgactivate = 0; | 716 | int pgactivate = 0; |
717 | unsigned long nr_unqueued_dirty = 0; | ||
686 | unsigned long nr_dirty = 0; | 718 | unsigned long nr_dirty = 0; |
687 | unsigned long nr_congested = 0; | 719 | unsigned long nr_congested = 0; |
688 | unsigned long nr_reclaimed = 0; | 720 | unsigned long nr_reclaimed = 0; |
689 | unsigned long nr_writeback = 0; | 721 | unsigned long nr_writeback = 0; |
722 | unsigned long nr_immediate = 0; | ||
690 | 723 | ||
691 | cond_resched(); | 724 | cond_resched(); |
692 | 725 | ||
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
696 | struct page *page; | 729 | struct page *page; |
697 | int may_enter_fs; | 730 | int may_enter_fs; |
698 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | 731 | enum page_references references = PAGEREF_RECLAIM_CLEAN; |
732 | bool dirty, writeback; | ||
699 | 733 | ||
700 | cond_resched(); | 734 | cond_resched(); |
701 | 735 | ||
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
723 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || | 757 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || |
724 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 758 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
725 | 759 | ||
760 | /* | ||
761 | * The number of dirty pages determines if a zone is marked | ||
762 | * reclaim_congested which affects wait_iff_congested. kswapd | ||
763 | * will stall and start writing pages if the tail of the LRU | ||
764 | * is all dirty unqueued pages. | ||
765 | */ | ||
766 | page_check_dirty_writeback(page, &dirty, &writeback); | ||
767 | if (dirty || writeback) | ||
768 | nr_dirty++; | ||
769 | |||
770 | if (dirty && !writeback) | ||
771 | nr_unqueued_dirty++; | ||
772 | |||
773 | /* | ||
774 | * Treat this page as congested if the underlying BDI is or if | ||
775 | * pages are cycling through the LRU so quickly that the | ||
776 | * pages marked for immediate reclaim are making it to the | ||
777 | * end of the LRU a second time. | ||
778 | */ | ||
779 | mapping = page_mapping(page); | ||
780 | if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || | ||
781 | (writeback && PageReclaim(page))) | ||
782 | nr_congested++; | ||
783 | |||
784 | /* | ||
785 | * If a page at the tail of the LRU is under writeback, there | ||
786 | * are three cases to consider. | ||
787 | * | ||
788 | * 1) If reclaim is encountering an excessive number of pages | ||
789 | * under writeback and this page is both under writeback and | ||
790 | * PageReclaim then it indicates that pages are being queued | ||
791 | * for IO but are being recycled through the LRU before the | ||
792 | * IO can complete. Waiting on the page itself risks an | ||
793 | * indefinite stall if it is impossible to writeback the | ||
794 | * page due to IO error or disconnected storage so instead | ||
795 | * note that the LRU is being scanned too quickly and the | ||
796 | * caller can stall after page list has been processed. | ||
797 | * | ||
798 | * 2) Global reclaim encounters a page, memcg encounters a | ||
799 | * page that is not marked for immediate reclaim or | ||
800 | * the caller does not have __GFP_IO. In this case mark | ||
801 | * the page for immediate reclaim and continue scanning. | ||
802 | * | ||
803 | * __GFP_IO is checked because a loop driver thread might | ||
804 | * enter reclaim, and deadlock if it waits on a page for | ||
805 | * which it is needed to do the write (loop masks off | ||
806 | * __GFP_IO|__GFP_FS for this reason); but more thought | ||
807 | * would probably show more reasons. | ||
808 | * | ||
809 | * Don't require __GFP_FS, since we're not going into the | ||
810 | * FS, just waiting on its writeback completion. Worryingly, | ||
811 | * ext4 gfs2 and xfs allocate pages with | ||
812 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing | ||
813 | * may_enter_fs here is liable to OOM on them. | ||
814 | * | ||
815 | * 3) memcg encounters a page that is not already marked | ||
816 | * PageReclaim. memcg does not have any dirty pages | ||
817 | * throttling so we could easily OOM just because too many | ||
818 | * pages are in writeback and there is nothing else to | ||
819 | * reclaim. Wait for the writeback to complete. | ||
820 | */ | ||
726 | if (PageWriteback(page)) { | 821 | if (PageWriteback(page)) { |
727 | /* | 822 | /* Case 1 above */ |
728 | * memcg doesn't have any dirty pages throttling so we | 823 | if (current_is_kswapd() && |
729 | * could easily OOM just because too many pages are in | 824 | PageReclaim(page) && |
730 | * writeback and there is nothing else to reclaim. | 825 | zone_is_reclaim_writeback(zone)) { |
731 | * | 826 | nr_immediate++; |
732 | * Check __GFP_IO, certainly because a loop driver | 827 | goto keep_locked; |
733 | * thread might enter reclaim, and deadlock if it waits | 828 | |
734 | * on a page for which it is needed to do the write | 829 | /* Case 2 above */ |
735 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | 830 | } else if (global_reclaim(sc) || |
736 | * but more thought would probably show more reasons. | ||
737 | * | ||
738 | * Don't require __GFP_FS, since we're not going into | ||
739 | * the FS, just waiting on its writeback completion. | ||
740 | * Worryingly, ext4 gfs2 and xfs allocate pages with | ||
741 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
742 | * testing may_enter_fs here is liable to OOM on them. | ||
743 | */ | ||
744 | if (global_reclaim(sc) || | ||
745 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { | 831 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { |
746 | /* | 832 | /* |
747 | * This is slightly racy - end_page_writeback() | 833 | * This is slightly racy - end_page_writeback() |
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
756 | */ | 842 | */ |
757 | SetPageReclaim(page); | 843 | SetPageReclaim(page); |
758 | nr_writeback++; | 844 | nr_writeback++; |
845 | |||
759 | goto keep_locked; | 846 | goto keep_locked; |
847 | |||
848 | /* Case 3 above */ | ||
849 | } else { | ||
850 | wait_on_page_writeback(page); | ||
760 | } | 851 | } |
761 | wait_on_page_writeback(page); | ||
762 | } | 852 | } |
763 | 853 | ||
764 | if (!force_reclaim) | 854 | if (!force_reclaim) |
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
784 | if (!add_to_swap(page, page_list)) | 874 | if (!add_to_swap(page, page_list)) |
785 | goto activate_locked; | 875 | goto activate_locked; |
786 | may_enter_fs = 1; | 876 | may_enter_fs = 1; |
787 | } | ||
788 | 877 | ||
789 | mapping = page_mapping(page); | 878 | /* Adding to swap updated mapping */ |
879 | mapping = page_mapping(page); | ||
880 | } | ||
790 | 881 | ||
791 | /* | 882 | /* |
792 | * The page is mapped into the page tables of one or more | 883 | * The page is mapped into the page tables of one or more |
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
806 | } | 897 | } |
807 | 898 | ||
808 | if (PageDirty(page)) { | 899 | if (PageDirty(page)) { |
809 | nr_dirty++; | ||
810 | |||
811 | /* | 900 | /* |
812 | * Only kswapd can writeback filesystem pages to | 901 | * Only kswapd can writeback filesystem pages to |
813 | * avoid risk of stack overflow but do not writeback | 902 | * avoid risk of stack overflow but only writeback |
814 | * unless under significant pressure. | 903 | * if many dirty pages have been encountered. |
815 | */ | 904 | */ |
816 | if (page_is_file_cache(page) && | 905 | if (page_is_file_cache(page) && |
817 | (!current_is_kswapd() || | 906 | (!current_is_kswapd() || |
818 | sc->priority >= DEF_PRIORITY - 2)) { | 907 | !zone_is_reclaim_dirty(zone))) { |
819 | /* | 908 | /* |
820 | * Immediately reclaim when written back. | 909 | * Immediately reclaim when written back. |
821 | * Similar in principle to deactivate_page() | 910 | * Similar in principle to deactivate_page() |
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
838 | /* Page is dirty, try to write it out here */ | 927 | /* Page is dirty, try to write it out here */ |
839 | switch (pageout(page, mapping, sc)) { | 928 | switch (pageout(page, mapping, sc)) { |
840 | case PAGE_KEEP: | 929 | case PAGE_KEEP: |
841 | nr_congested++; | ||
842 | goto keep_locked; | 930 | goto keep_locked; |
843 | case PAGE_ACTIVATE: | 931 | case PAGE_ACTIVATE: |
844 | goto activate_locked; | 932 | goto activate_locked; |
@@ -946,22 +1034,16 @@ keep: | |||
946 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 1034 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
947 | } | 1035 | } |
948 | 1036 | ||
949 | /* | ||
950 | * Tag a zone as congested if all the dirty pages encountered were | ||
951 | * backed by a congested BDI. In this case, reclaimers should just | ||
952 | * back off and wait for congestion to clear because further reclaim | ||
953 | * will encounter the same problem | ||
954 | */ | ||
955 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) | ||
956 | zone_set_flag(zone, ZONE_CONGESTED); | ||
957 | |||
958 | free_hot_cold_page_list(&free_pages, 1); | 1037 | free_hot_cold_page_list(&free_pages, 1); |
959 | 1038 | ||
960 | list_splice(&ret_pages, page_list); | 1039 | list_splice(&ret_pages, page_list); |
961 | count_vm_events(PGACTIVATE, pgactivate); | 1040 | count_vm_events(PGACTIVATE, pgactivate); |
962 | mem_cgroup_uncharge_end(); | 1041 | mem_cgroup_uncharge_end(); |
963 | *ret_nr_dirty += nr_dirty; | 1042 | *ret_nr_dirty += nr_dirty; |
1043 | *ret_nr_congested += nr_congested; | ||
1044 | *ret_nr_unqueued_dirty += nr_unqueued_dirty; | ||
964 | *ret_nr_writeback += nr_writeback; | 1045 | *ret_nr_writeback += nr_writeback; |
1046 | *ret_nr_immediate += nr_immediate; | ||
965 | return nr_reclaimed; | 1047 | return nr_reclaimed; |
966 | } | 1048 | } |
967 | 1049 | ||
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
973 | .priority = DEF_PRIORITY, | 1055 | .priority = DEF_PRIORITY, |
974 | .may_unmap = 1, | 1056 | .may_unmap = 1, |
975 | }; | 1057 | }; |
976 | unsigned long ret, dummy1, dummy2; | 1058 | unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; |
977 | struct page *page, *next; | 1059 | struct page *page, *next; |
978 | LIST_HEAD(clean_pages); | 1060 | LIST_HEAD(clean_pages); |
979 | 1061 | ||
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
985 | } | 1067 | } |
986 | 1068 | ||
987 | ret = shrink_page_list(&clean_pages, zone, &sc, | 1069 | ret = shrink_page_list(&clean_pages, zone, &sc, |
988 | TTU_UNMAP|TTU_IGNORE_ACCESS, | 1070 | TTU_UNMAP|TTU_IGNORE_ACCESS, |
989 | &dummy1, &dummy2, true); | 1071 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); |
990 | list_splice(&clean_pages, page_list); | 1072 | list_splice(&clean_pages, page_list); |
991 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | 1073 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); |
992 | return ret; | 1074 | return ret; |
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1281 | unsigned long nr_reclaimed = 0; | 1363 | unsigned long nr_reclaimed = 0; |
1282 | unsigned long nr_taken; | 1364 | unsigned long nr_taken; |
1283 | unsigned long nr_dirty = 0; | 1365 | unsigned long nr_dirty = 0; |
1366 | unsigned long nr_congested = 0; | ||
1367 | unsigned long nr_unqueued_dirty = 0; | ||
1284 | unsigned long nr_writeback = 0; | 1368 | unsigned long nr_writeback = 0; |
1369 | unsigned long nr_immediate = 0; | ||
1285 | isolate_mode_t isolate_mode = 0; | 1370 | isolate_mode_t isolate_mode = 0; |
1286 | int file = is_file_lru(lru); | 1371 | int file = is_file_lru(lru); |
1287 | struct zone *zone = lruvec_zone(lruvec); | 1372 | struct zone *zone = lruvec_zone(lruvec); |
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1323 | return 0; | 1408 | return 0; |
1324 | 1409 | ||
1325 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, | 1410 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, |
1326 | &nr_dirty, &nr_writeback, false); | 1411 | &nr_dirty, &nr_unqueued_dirty, &nr_congested, |
1412 | &nr_writeback, &nr_immediate, | ||
1413 | false); | ||
1327 | 1414 | ||
1328 | spin_lock_irq(&zone->lru_lock); | 1415 | spin_lock_irq(&zone->lru_lock); |
1329 | 1416 | ||
@@ -1357,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1357 | * same way balance_dirty_pages() manages. | 1444 | * same way balance_dirty_pages() manages. |
1358 | * | 1445 | * |
1359 | * This scales the number of dirty pages that must be under writeback | 1446 | * This scales the number of dirty pages that must be under writeback |
1360 | * before throttling depending on priority. It is a simple backoff | 1447 | * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff |
1361 | * function that has the most effect in the range DEF_PRIORITY to | 1448 | * function that has the most effect in the range DEF_PRIORITY to |
1362 | * DEF_PRIORITY-2, the priority at which reclaim is considered | 1449 | * DEF_PRIORITY-2, the priority at which reclaim is considered |
1363 | * to be in trouble. | 1450 | * to be in trouble. |
@@ -1368,9 +1455,53 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1368 | * ... | 1455 | * ... |
1369 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | 1456 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any |
1370 | * isolated page is PageWriteback | 1457 | * isolated page is PageWriteback |
1458 | * | ||
1459 | * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number | ||
1460 | * of pages under pages flagged for immediate reclaim and stall if any | ||
1461 | * are encountered in the nr_immediate check below. | ||
1371 | */ | 1462 | */ |
1372 | if (nr_writeback && nr_writeback >= | 1463 | if (nr_writeback && nr_writeback >= |
1373 | (nr_taken >> (DEF_PRIORITY - sc->priority))) | 1464 | (nr_taken >> (DEF_PRIORITY - sc->priority))) |
1465 | zone_set_flag(zone, ZONE_WRITEBACK); | ||
1466 | |||
1467 | /* | ||
1468 | * memcg will stall in page writeback so only consider forcibly | ||
1469 | * stalling for global reclaim | ||
1470 | */ | ||
1471 | if (global_reclaim(sc)) { | ||
1472 | /* | ||
1473 | * Tag a zone as congested if all the dirty pages scanned were | ||
1474 | * backed by a congested BDI and wait_iff_congested will stall. | ||
1475 | */ | ||
1476 | if (nr_dirty && nr_dirty == nr_congested) | ||
1477 | zone_set_flag(zone, ZONE_CONGESTED); | ||
1478 | |||
1479 | /* | ||
1480 | * If dirty pages are scanned that are not queued for IO, it | ||
1481 | * implies that flushers are not keeping up. In this case, flag | ||
1482 | * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing | ||
1483 | * pages from reclaim context. It will forcibly stall in the | ||
1484 | * next check. | ||
1485 | */ | ||
1486 | if (nr_unqueued_dirty == nr_taken) | ||
1487 | zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); | ||
1488 | |||
1489 | /* | ||
1490 | * In addition, if kswapd scans pages marked for | ||
1491 | * immediate reclaim and under writeback (nr_immediate), it | ||
1492 | * implies that pages are cycling through the LRU faster than | ||
1493 | * they are written so also forcibly stall. | ||
1494 | */ | ||
1495 | if (nr_unqueued_dirty == nr_taken || nr_immediate) | ||
1496 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
1497 | } | ||
1498 | |||
1499 | /* | ||
1500 | * Stall direct reclaim for IO completions if underlying BDIs or zone | ||
1501 | * is congested. Allow kswapd to continue until it starts encountering | ||
1502 | * unqueued dirty pages or cycling through the LRU too quickly. | ||
1503 | */ | ||
1504 | if (!sc->hibernation_mode && !current_is_kswapd()) | ||
1374 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1505 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1375 | 1506 | ||
1376 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1507 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
@@ -1822,17 +1953,25 @@ out: | |||
1822 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | 1953 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) |
1823 | { | 1954 | { |
1824 | unsigned long nr[NR_LRU_LISTS]; | 1955 | unsigned long nr[NR_LRU_LISTS]; |
1956 | unsigned long targets[NR_LRU_LISTS]; | ||
1825 | unsigned long nr_to_scan; | 1957 | unsigned long nr_to_scan; |
1826 | enum lru_list lru; | 1958 | enum lru_list lru; |
1827 | unsigned long nr_reclaimed = 0; | 1959 | unsigned long nr_reclaimed = 0; |
1828 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1960 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1829 | struct blk_plug plug; | 1961 | struct blk_plug plug; |
1962 | bool scan_adjusted = false; | ||
1830 | 1963 | ||
1831 | get_scan_count(lruvec, sc, nr); | 1964 | get_scan_count(lruvec, sc, nr); |
1832 | 1965 | ||
1966 | /* Record the original scan target for proportional adjustments later */ | ||
1967 | memcpy(targets, nr, sizeof(nr)); | ||
1968 | |||
1833 | blk_start_plug(&plug); | 1969 | blk_start_plug(&plug); |
1834 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1970 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1835 | nr[LRU_INACTIVE_FILE]) { | 1971 | nr[LRU_INACTIVE_FILE]) { |
1972 | unsigned long nr_anon, nr_file, percentage; | ||
1973 | unsigned long nr_scanned; | ||
1974 | |||
1836 | for_each_evictable_lru(lru) { | 1975 | for_each_evictable_lru(lru) { |
1837 | if (nr[lru]) { | 1976 | if (nr[lru]) { |
1838 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); | 1977 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); |
@@ -1842,17 +1981,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
1842 | lruvec, sc); | 1981 | lruvec, sc); |
1843 | } | 1982 | } |
1844 | } | 1983 | } |
1984 | |||
1985 | if (nr_reclaimed < nr_to_reclaim || scan_adjusted) | ||
1986 | continue; | ||
1987 | |||
1845 | /* | 1988 | /* |
1846 | * On large memory systems, scan >> priority can become | 1989 | * For global direct reclaim, reclaim only the number of pages |
1847 | * really large. This is fine for the starting priority; | 1990 | * requested. Less care is taken to scan proportionally as it |
1848 | * we want to put equal scanning pressure on each zone. | 1991 | * is more important to minimise direct reclaim stall latency |
1849 | * However, if the VM has a harder time of freeing pages, | 1992 | * than it is to properly age the LRU lists. |
1850 | * with multiple processes reclaiming pages, the total | ||
1851 | * freeing target can get unreasonably large. | ||
1852 | */ | 1993 | */ |
1853 | if (nr_reclaimed >= nr_to_reclaim && | 1994 | if (global_reclaim(sc) && !current_is_kswapd()) |
1854 | sc->priority < DEF_PRIORITY) | ||
1855 | break; | 1995 | break; |
1996 | |||
1997 | /* | ||
1998 | * For kswapd and memcg, reclaim at least the number of pages | ||
1999 | * requested. Ensure that the anon and file LRUs shrink | ||
2000 | * proportionally what was requested by get_scan_count(). We | ||
2001 | * stop reclaiming one LRU and reduce the amount scanning | ||
2002 | * proportional to the original scan target. | ||
2003 | */ | ||
2004 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; | ||
2005 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; | ||
2006 | |||
2007 | if (nr_file > nr_anon) { | ||
2008 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + | ||
2009 | targets[LRU_ACTIVE_ANON] + 1; | ||
2010 | lru = LRU_BASE; | ||
2011 | percentage = nr_anon * 100 / scan_target; | ||
2012 | } else { | ||
2013 | unsigned long scan_target = targets[LRU_INACTIVE_FILE] + | ||
2014 | targets[LRU_ACTIVE_FILE] + 1; | ||
2015 | lru = LRU_FILE; | ||
2016 | percentage = nr_file * 100 / scan_target; | ||
2017 | } | ||
2018 | |||
2019 | /* Stop scanning the smaller of the two LRUs */ | ||
2020 | nr[lru] = 0; | ||
2021 | nr[lru + LRU_ACTIVE] = 0; | ||
2022 | |||
2023 | /* | ||
2024 | * Recalculate the other LRU scan count based on its original | ||
2025 | * scan target and the percentage scanning already complete | ||
2026 | */ | ||
2027 | lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; | ||
2028 | nr_scanned = targets[lru] - nr[lru]; | ||
2029 | nr[lru] = targets[lru] * (100 - percentage) / 100; | ||
2030 | nr[lru] -= min(nr[lru], nr_scanned); | ||
2031 | |||
2032 | lru += LRU_ACTIVE; | ||
2033 | nr_scanned = targets[lru] - nr[lru]; | ||
2034 | nr[lru] = targets[lru] * (100 - percentage) / 100; | ||
2035 | nr[lru] -= min(nr[lru], nr_scanned); | ||
2036 | |||
2037 | scan_adjusted = true; | ||
1856 | } | 2038 | } |
1857 | blk_finish_plug(&plug); | 2039 | blk_finish_plug(&plug); |
1858 | sc->nr_reclaimed += nr_reclaimed; | 2040 | sc->nr_reclaimed += nr_reclaimed; |
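The proportional rescaling added in this hunk can be illustrated with a small standalone sketch (a userspace approximation, not part of the patch; the rescale_pair() helper and the sample scan targets are made up):

#include <stdio.h>

enum { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, NR_LISTS };

/*
 * Rescale the remaining scan budget of one anon/file pair so that it stops
 * at the same fraction of its original target that the other pair reached.
 */
static void rescale_pair(unsigned long nr[], const unsigned long targets[],
			 int inactive, unsigned long percentage)
{
	int i;

	for (i = inactive; i <= inactive + 1; i++) {
		unsigned long scanned = targets[i] - nr[i];

		nr[i] = targets[i] * (100 - percentage) / 100;
		nr[i] -= (nr[i] < scanned) ? nr[i] : scanned;
	}
}

int main(void)
{
	unsigned long targets[NR_LISTS] = { 100, 100, 400, 400 };
	unsigned long nr[NR_LISTS]      = {  68,  68, 272, 272 };
	unsigned long nr_anon = nr[INACTIVE_ANON] + nr[ACTIVE_ANON];
	/* percentage of the anon target still unscanned, as in the hunk */
	unsigned long percentage = nr_anon * 100 /
		(targets[INACTIVE_ANON] + targets[ACTIVE_ANON] + 1);

	/* file pair is larger: stop anon, trim file to the same proportion */
	nr[INACTIVE_ANON] = nr[ACTIVE_ANON] = 0;
	rescale_pair(nr, targets, INACTIVE_FILE, percentage);
	printf("remaining file scan: inactive=%lu active=%lu\n",
	       nr[INACTIVE_FILE], nr[ACTIVE_FILE]);
	return 0;	/* prints inactive=4 active=4 */
}

With these numbers the anon pair stops after covering roughly a third of its target, and the file pair is trimmed so it only scans up to the same fraction of its own target instead of its full remainder.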
@@ -2222,17 +2404,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2222 | WB_REASON_TRY_TO_FREE_PAGES); | 2404 | WB_REASON_TRY_TO_FREE_PAGES); |
2223 | sc->may_writepage = 1; | 2405 | sc->may_writepage = 1; |
2224 | } | 2406 | } |
2225 | |||
2226 | /* Take a nap, wait for some writeback to complete */ | ||
2227 | if (!sc->hibernation_mode && sc->nr_scanned && | ||
2228 | sc->priority < DEF_PRIORITY - 2) { | ||
2229 | struct zone *preferred_zone; | ||
2230 | |||
2231 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | ||
2232 | &cpuset_current_mems_allowed, | ||
2233 | &preferred_zone); | ||
2234 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | ||
2235 | } | ||
2236 | } while (--sc->priority >= 0); | 2407 | } while (--sc->priority >= 0); |
2237 | 2408 | ||
2238 | out: | 2409 | out: |
@@ -2601,6 +2772,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2601 | } | 2772 | } |
2602 | 2773 | ||
2603 | /* | 2774 | /* |
2775 | * kswapd shrinks the zone by the number of pages required to reach | ||
2776 | * the high watermark. | ||
2777 | * | ||
2778 | * Returns true if kswapd scanned at least the requested number of pages to | ||
2779 | * reclaim or if the lack of progress was due to pages under writeback. | ||
2780 | * This is used to determine if the scanning priority needs to be raised. | ||
2781 | */ | ||
2782 | static bool kswapd_shrink_zone(struct zone *zone, | ||
2783 | int classzone_idx, | ||
2784 | struct scan_control *sc, | ||
2785 | unsigned long lru_pages, | ||
2786 | unsigned long *nr_attempted) | ||
2787 | { | ||
2788 | unsigned long nr_slab; | ||
2789 | int testorder = sc->order; | ||
2790 | unsigned long balance_gap; | ||
2791 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2792 | struct shrink_control shrink = { | ||
2793 | .gfp_mask = sc->gfp_mask, | ||
2794 | }; | ||
2795 | bool lowmem_pressure; | ||
2796 | |||
2797 | /* Reclaim above the high watermark. */ | ||
2798 | sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); | ||
2799 | |||
2800 | /* | ||
2801 | * Kswapd reclaims only single pages with compaction enabled. Trying | ||
2802 | * too hard to reclaim until contiguous free pages have become | ||
2803 | * available can hurt performance by evicting too much useful data | ||
2804 | * from memory. Do not reclaim more than needed for compaction. | ||
2805 | */ | ||
2806 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && | ||
2807 | compaction_suitable(zone, sc->order) != | ||
2808 | COMPACT_SKIPPED) | ||
2809 | testorder = 0; | ||
2810 | |||
2811 | /* | ||
2812 | * We put equal pressure on every zone, unless one zone has way too | ||
2813 | * many pages free already. The "too many pages" is defined as the | ||
2814 | * high wmark plus a "gap" where the gap is either the low | ||
2815 | * watermark or 1% of the zone, whichever is smaller. | ||
2816 | */ | ||
2817 | balance_gap = min(low_wmark_pages(zone), | ||
2818 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2819 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2820 | |||
2821 | /* | ||
2822 | * If there is no low memory pressure or the zone is balanced then no | ||
2823 | * reclaim is necessary | ||
2824 | */ | ||
2825 | lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); | ||
2826 | if (!lowmem_pressure && zone_balanced(zone, testorder, | ||
2827 | balance_gap, classzone_idx)) | ||
2828 | return true; | ||
2829 | |||
2830 | shrink_zone(zone, sc); | ||
2831 | |||
2832 | reclaim_state->reclaimed_slab = 0; | ||
2833 | nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
2834 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2835 | |||
2836 | /* Account for the number of pages we attempted to reclaim */ | ||
2837 | *nr_attempted += sc->nr_to_reclaim; | ||
2838 | |||
2839 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2840 | zone->all_unreclaimable = 1; | ||
2841 | |||
2842 | zone_clear_flag(zone, ZONE_WRITEBACK); | ||
2843 | |||
2844 | /* | ||
2845 | * If a zone reaches its high watermark, consider it to be no longer | ||
2846 | * congested. It's possible there are dirty pages backed by congested | ||
2847 | * BDIs but as pressure is relieved, speculatively avoid congestion | ||
2848 | * waits. | ||
2849 | */ | ||
2850 | if (!zone->all_unreclaimable && | ||
2851 | zone_balanced(zone, testorder, 0, classzone_idx)) { | ||
2852 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2853 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | ||
2854 | } | ||
2855 | |||
2856 | return sc->nr_scanned >= sc->nr_to_reclaim; | ||
2857 | } | ||
2858 | |||
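The balance-gap calculation in kswapd_shrink_zone() above amounts to the following standalone sketch (illustrative only; the ratio of 100 corresponds to the "1% of the zone" mentioned in the comment, and the sample zone sizes are hypothetical):

#include <stdio.h>

#define ZONE_BALANCE_GAP_RATIO 100	/* "1% of the zone", per the comment above */

/* Smaller of the low watermark and ~1% of managed pages, rounded up. */
static unsigned long balance_gap(unsigned long managed_pages,
				 unsigned long low_wmark)
{
	unsigned long one_percent = (managed_pages + ZONE_BALANCE_GAP_RATIO - 1) /
				    ZONE_BALANCE_GAP_RATIO;

	return (low_wmark < one_percent) ? low_wmark : one_percent;
}

int main(void)
{
	/* hypothetical zone: 1M managed pages (~4GB), 16384-page low watermark */
	printf("balance gap = %lu pages\n", balance_gap(1UL << 20, 16384));
	return 0;	/* prints 10486: the 1% cap wins over the low watermark */
}

Reclaim for the zone is then skipped only when it is already balanced against the high watermark plus this gap, which is what "way too many pages free already" refers to.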
2859 | /* | ||
2604 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2860 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2605 | * they are all at high_wmark_pages(zone). | 2861 | * they are all at high_wmark_pages(zone). |
2606 | * | 2862 | * |
@@ -2624,35 +2880,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2624 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2880 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2625 | int *classzone_idx) | 2881 | int *classzone_idx) |
2626 | { | 2882 | { |
2627 | bool pgdat_is_balanced = false; | ||
2628 | int i; | 2883 | int i; |
2629 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2884 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2630 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2631 | unsigned long nr_soft_reclaimed; | 2885 | unsigned long nr_soft_reclaimed; |
2632 | unsigned long nr_soft_scanned; | 2886 | unsigned long nr_soft_scanned; |
2633 | struct scan_control sc = { | 2887 | struct scan_control sc = { |
2634 | .gfp_mask = GFP_KERNEL, | 2888 | .gfp_mask = GFP_KERNEL, |
2889 | .priority = DEF_PRIORITY, | ||
2635 | .may_unmap = 1, | 2890 | .may_unmap = 1, |
2636 | .may_swap = 1, | 2891 | .may_swap = 1, |
2637 | /* | 2892 | .may_writepage = !laptop_mode, |
2638 | * kswapd doesn't want to be bailed out while reclaim. because | ||
2639 | * we want to put equal scanning pressure on each zone. | ||
2640 | */ | ||
2641 | .nr_to_reclaim = ULONG_MAX, | ||
2642 | .order = order, | 2893 | .order = order, |
2643 | .target_mem_cgroup = NULL, | 2894 | .target_mem_cgroup = NULL, |
2644 | }; | 2895 | }; |
2645 | struct shrink_control shrink = { | ||
2646 | .gfp_mask = sc.gfp_mask, | ||
2647 | }; | ||
2648 | loop_again: | ||
2649 | sc.priority = DEF_PRIORITY; | ||
2650 | sc.nr_reclaimed = 0; | ||
2651 | sc.may_writepage = !laptop_mode; | ||
2652 | count_vm_event(PAGEOUTRUN); | 2896 | count_vm_event(PAGEOUTRUN); |
2653 | 2897 | ||
2654 | do { | 2898 | do { |
2655 | unsigned long lru_pages = 0; | 2899 | unsigned long lru_pages = 0; |
2900 | unsigned long nr_attempted = 0; | ||
2901 | bool raise_priority = true; | ||
2902 | bool pgdat_needs_compaction = (order > 0); | ||
2903 | |||
2904 | sc.nr_reclaimed = 0; | ||
2656 | 2905 | ||
2657 | /* | 2906 | /* |
2658 | * Scan in the highmem->dma direction for the highest | 2907 | * Scan in the highmem->dma direction for the highest |
@@ -2689,23 +2938,46 @@ loop_again: | |||
2689 | end_zone = i; | 2938 | end_zone = i; |
2690 | break; | 2939 | break; |
2691 | } else { | 2940 | } else { |
2692 | /* If balanced, clear the congested flag */ | 2941 | /* |
2942 | * If balanced, clear the dirty and congested | ||
2943 | * flags | ||
2944 | */ | ||
2693 | zone_clear_flag(zone, ZONE_CONGESTED); | 2945 | zone_clear_flag(zone, ZONE_CONGESTED); |
2946 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | ||
2694 | } | 2947 | } |
2695 | } | 2948 | } |
2696 | 2949 | ||
2697 | if (i < 0) { | 2950 | if (i < 0) |
2698 | pgdat_is_balanced = true; | ||
2699 | goto out; | 2951 | goto out; |
2700 | } | ||
2701 | 2952 | ||
2702 | for (i = 0; i <= end_zone; i++) { | 2953 | for (i = 0; i <= end_zone; i++) { |
2703 | struct zone *zone = pgdat->node_zones + i; | 2954 | struct zone *zone = pgdat->node_zones + i; |
2704 | 2955 | ||
2956 | if (!populated_zone(zone)) | ||
2957 | continue; | ||
2958 | |||
2705 | lru_pages += zone_reclaimable_pages(zone); | 2959 | lru_pages += zone_reclaimable_pages(zone); |
2960 | |||
2961 | /* | ||
2962 | * If any zone is currently balanced then kswapd will | ||
2963 | * not call compaction as it is expected that the | ||
2964 | * necessary pages are already available. | ||
2965 | */ | ||
2966 | if (pgdat_needs_compaction && | ||
2967 | zone_watermark_ok(zone, order, | ||
2968 | low_wmark_pages(zone), | ||
2969 | *classzone_idx, 0)) | ||
2970 | pgdat_needs_compaction = false; | ||
2706 | } | 2971 | } |
2707 | 2972 | ||
2708 | /* | 2973 | /* |
2974 | * If we're having trouble reclaiming, start doing writepage | ||
2975 | * even in laptop mode. | ||
2976 | */ | ||
2977 | if (sc.priority < DEF_PRIORITY - 2) | ||
2978 | sc.may_writepage = 1; | ||
2979 | |||
2980 | /* | ||
2709 | * Now scan the zone in the dma->highmem direction, stopping | 2981 | * Now scan the zone in the dma->highmem direction, stopping |
2710 | * at the last zone which needs scanning. | 2982 | * at the last zone which needs scanning. |
2711 | * | 2983 | * |
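The pgdat_needs_compaction handling added in this hunk can be read as the following simplified gate (a hypothetical sketch; the fake_zone structure and field names are stand-ins for the real zone and watermark helpers):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for struct zone plus the zone_watermark_ok() result. */
struct fake_zone {
	bool populated;
	bool low_wmark_ok;	/* order-sized allocation would succeed now */
};

/*
 * kswapd only compacts the node at the end of a balancing pass when the
 * requested order is non-zero and no populated zone already satisfies
 * the order-sized low watermark check.
 */
static bool node_needs_compaction(const struct fake_zone *zones,
				  size_t nr_zones, int order)
{
	size_t i;

	if (order == 0)
		return false;

	for (i = 0; i < nr_zones; i++) {
		if (zones[i].populated && zones[i].low_wmark_ok)
			return false;
	}
	return true;
}

int main(void)
{
	struct fake_zone zones[] = {
		{ .populated = true,  .low_wmark_ok = false },
		{ .populated = true,  .low_wmark_ok = true  },	/* already balanced */
		{ .populated = false, .low_wmark_ok = false },
	};

	printf("needs compaction: %d\n",
	       node_needs_compaction(zones, 3, 2));	/* prints 0 */
	return 0;
}

Later in the loop, compact_pgdat() is only invoked when this condition stays true and the pass reclaimed more than it set out to (sc.nr_reclaimed > nr_attempted).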
@@ -2716,8 +2988,6 @@ loop_again: | |||
2716 | */ | 2988 | */ |
2717 | for (i = 0; i <= end_zone; i++) { | 2989 | for (i = 0; i <= end_zone; i++) { |
2718 | struct zone *zone = pgdat->node_zones + i; | 2990 | struct zone *zone = pgdat->node_zones + i; |
2719 | int nr_slab, testorder; | ||
2720 | unsigned long balance_gap; | ||
2721 | 2991 | ||
2722 | if (!populated_zone(zone)) | 2992 | if (!populated_zone(zone)) |
2723 | continue; | 2993 | continue; |
@@ -2738,65 +3008,14 @@ loop_again: | |||
2738 | sc.nr_reclaimed += nr_soft_reclaimed; | 3008 | sc.nr_reclaimed += nr_soft_reclaimed; |
2739 | 3009 | ||
2740 | /* | 3010 | /* |
2741 | * We put equal pressure on every zone, unless | 3011 | * There should be no need to raise the scanning |
2742 | * one zone has way too many pages free | 3012 | * priority if enough pages are already being scanned |
2743 | * already. The "too many pages" is defined | 3013 | * that the high watermark would be met at 100% |
2744 | * as the high wmark plus a "gap" where the | 3014 | * efficiency. |
2745 | * gap is either the low watermark or 1% | ||
2746 | * of the zone, whichever is smaller. | ||
2747 | */ | 3015 | */ |
2748 | balance_gap = min(low_wmark_pages(zone), | 3016 | if (kswapd_shrink_zone(zone, end_zone, &sc, |
2749 | (zone->managed_pages + | 3017 | lru_pages, &nr_attempted)) |
2750 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 3018 | raise_priority = false; |
2751 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2752 | /* | ||
2753 | * Kswapd reclaims only single pages with compaction | ||
2754 | * enabled. Trying too hard to reclaim until contiguous | ||
2755 | * free pages have become available can hurt performance | ||
2756 | * by evicting too much useful data from memory. | ||
2757 | * Do not reclaim more than needed for compaction. | ||
2758 | */ | ||
2759 | testorder = order; | ||
2760 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | ||
2761 | compaction_suitable(zone, order) != | ||
2762 | COMPACT_SKIPPED) | ||
2763 | testorder = 0; | ||
2764 | |||
2765 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || | ||
2766 | !zone_balanced(zone, testorder, | ||
2767 | balance_gap, end_zone)) { | ||
2768 | shrink_zone(zone, &sc); | ||
2769 | |||
2770 | reclaim_state->reclaimed_slab = 0; | ||
2771 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | ||
2772 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2773 | |||
2774 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2775 | zone->all_unreclaimable = 1; | ||
2776 | } | ||
2777 | |||
2778 | /* | ||
2779 | * If we're getting trouble reclaiming, start doing | ||
2780 | * writepage even in laptop mode. | ||
2781 | */ | ||
2782 | if (sc.priority < DEF_PRIORITY - 2) | ||
2783 | sc.may_writepage = 1; | ||
2784 | |||
2785 | if (zone->all_unreclaimable) { | ||
2786 | if (end_zone && end_zone == i) | ||
2787 | end_zone--; | ||
2788 | continue; | ||
2789 | } | ||
2790 | |||
2791 | if (zone_balanced(zone, testorder, 0, end_zone)) | ||
2792 | /* | ||
2793 | * If a zone reaches its high watermark, | ||
2794 | * consider it to be no longer congested. It's | ||
2795 | * possible there are dirty pages backed by | ||
2796 | * congested BDIs but as pressure is relieved, | ||
2797 | * speculatively avoid congestion waits | ||
2798 | */ | ||
2799 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2800 | } | 3019 | } |
2801 | 3020 | ||
2802 | /* | 3021 | /* |
@@ -2808,74 +3027,38 @@ loop_again: | |||
2808 | pfmemalloc_watermark_ok(pgdat)) | 3027 | pfmemalloc_watermark_ok(pgdat)) |
2809 | wake_up(&pgdat->pfmemalloc_wait); | 3028 | wake_up(&pgdat->pfmemalloc_wait); |
2810 | 3029 | ||
2811 | if (pgdat_balanced(pgdat, order, *classzone_idx)) { | ||
2812 | pgdat_is_balanced = true; | ||
2813 | break; /* kswapd: all done */ | ||
2814 | } | ||
2815 | |||
2816 | /* | 3030 | /* |
2817 | * We do this so kswapd doesn't build up large priorities for | 3031 | * Fragmentation may mean that the system cannot be rebalanced |
2818 | * example when it is freeing in parallel with allocators. It | 3032 | * for high-order allocations in all zones. If twice the |
2819 | * matches the direct reclaim path behaviour in terms of impact | 3033 | * allocation size has been reclaimed and the zones are still |
2820 | * on zone->*_priority. | 3034 | * not balanced then recheck the watermarks at order-0 to |
3035 | * prevent kswapd reclaiming excessively. Assume that a process | ||
3036 | * that requested a high-order allocation can direct reclaim/compact. | ||
2821 | */ | 3037 | */ |
2822 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 3038 | if (order && sc.nr_reclaimed >= 2UL << order) |
2823 | break; | 3039 | order = sc.order = 0; |
2824 | } while (--sc.priority >= 0); | ||
2825 | |||
2826 | out: | ||
2827 | if (!pgdat_is_balanced) { | ||
2828 | cond_resched(); | ||
2829 | 3040 | ||
2830 | try_to_freeze(); | 3041 | /* Check if kswapd should be suspending */ |
3042 | if (try_to_freeze() || kthread_should_stop()) | ||
3043 | break; | ||
2831 | 3044 | ||
2832 | /* | 3045 | /* |
2833 | * Fragmentation may mean that the system cannot be | 3046 | * Compact if necessary and kswapd is reclaiming at least the |
2834 | * rebalanced for high-order allocations in all zones. | 3047 | * high watermark number of pages as requested |
2835 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, | ||
2836 | * it means the zones have been fully scanned and are still | ||
2837 | * not balanced. For high-order allocations, there is | ||
2838 | * little point trying all over again as kswapd may | ||
2839 | * infinite loop. | ||
2840 | * | ||
2841 | * Instead, recheck all watermarks at order-0 as they | ||
2842 | * are the most important. If watermarks are ok, kswapd will go | ||
2843 | * back to sleep. High-order users can still perform direct | ||
2844 | * reclaim if they wish. | ||
2845 | */ | 3048 | */ |
2846 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) | 3049 | if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) |
2847 | order = sc.order = 0; | ||
2848 | |||
2849 | goto loop_again; | ||
2850 | } | ||
2851 | |||
2852 | /* | ||
2853 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2854 | * sleeping without all zones being balanced. Before it does, it must | ||
2855 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2856 | * that the congestion flags are cleared. The congestion flag must | ||
2857 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2858 | * and it is potentially going to sleep here. | ||
2859 | */ | ||
2860 | if (order) { | ||
2861 | int zones_need_compaction = 1; | ||
2862 | |||
2863 | for (i = 0; i <= end_zone; i++) { | ||
2864 | struct zone *zone = pgdat->node_zones + i; | ||
2865 | |||
2866 | if (!populated_zone(zone)) | ||
2867 | continue; | ||
2868 | |||
2869 | /* Check if the memory needs to be defragmented. */ | ||
2870 | if (zone_watermark_ok(zone, order, | ||
2871 | low_wmark_pages(zone), *classzone_idx, 0)) | ||
2872 | zones_need_compaction = 0; | ||
2873 | } | ||
2874 | |||
2875 | if (zones_need_compaction) | ||
2876 | compact_pgdat(pgdat, order); | 3050 | compact_pgdat(pgdat, order); |
2877 | } | ||
2878 | 3051 | ||
3052 | /* | ||
3053 | * Raise priority if scanning rate is too low or there was no | ||
3054 | * progress in reclaiming pages | ||
3055 | */ | ||
3056 | if (raise_priority || !sc.nr_reclaimed) | ||
3057 | sc.priority--; | ||
3058 | } while (sc.priority >= 1 && | ||
3059 | !pgdat_balanced(pgdat, order, *classzone_idx)); | ||
3060 | |||
3061 | out: | ||
2879 | /* | 3062 | /* |
2880 | * Return the order we were reclaiming at so prepare_kswapd_sleep() | 3063 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2881 | * makes a decision on the order we were last reclaiming at. However, | 3064 | * makes a decision on the order we were last reclaiming at. However, |