author     Linus Torvalds <torvalds@linux-foundation.org>  2016-10-08 00:38:00 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-10-08 00:38:00 -0400
commit     b66484cd74706fa8681d051840fe4b18a3da40ff (patch)
tree       e8215e7c25661d25f84abc4b98140c2062d6d5de /mm
parent     c913fc4146ba7c280e074558d0a461e5c6f07c8a (diff)
parent     05fd007e46296afb24d15c7d589d535e5a5b9d5c (diff)

Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- fsnotify updates
- ocfs2 updates
- all of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (127 commits)
console: don't prefer first registered if DT specifies stdout-path
cred: simpler, 1D supplementary groups
CREDITS: update Pavel's information, add GPG key, remove snail mail address
mailmap: add Johan Hovold
.gitattributes: set git diff driver for C source code files
uprobes: remove function declarations from arch/{mips,s390}
spelling.txt: "modeled" is spelt correctly
nmi_backtrace: generate one-line reports for idle cpus
arch/tile: adopt the new nmi_backtrace framework
nmi_backtrace: do a local dump_stack() instead of a self-NMI
nmi_backtrace: add more trigger_*_cpu_backtrace() methods
min/max: remove sparse warnings when they're nested
Documentation/filesystems/proc.txt: add more description for maps/smaps
mm, proc: fix region lost in /proc/self/smaps
proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self
proc: add LSM hook checks to /proc/<tid>/timerslack_ns
proc: relax /proc/<tid>/timerslack_ns capability requirements
meminfo: break apart a very long seq_printf with #ifdefs
seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char
proc: faster /proc/*/status
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c        |  14
-rw-r--r--  mm/compaction.c     | 205
-rw-r--r--  mm/debug.c          |   5
-rw-r--r--  mm/filemap.c        |   8
-rw-r--r--  mm/huge_memory.c    |  81
-rw-r--r--  mm/hugetlb.c        |  53
-rw-r--r--  mm/internal.h       |   3
-rw-r--r--  mm/ksm.c            |   7
-rw-r--r--  mm/memblock.c       |   5
-rw-r--r--  mm/memcontrol.c     | 154
-rw-r--r--  mm/memory.c         |  21
-rw-r--r--  mm/memory_hotplug.c |   4
-rw-r--r--  mm/mempolicy.c      |   2
-rw-r--r--  mm/migrate.c        |   2
-rw-r--r--  mm/mincore.c        |   5
-rw-r--r--  mm/mlock.c          |  52
-rw-r--r--  mm/mmap.c           | 238
-rw-r--r--  mm/mprotect.c       |   3
-rw-r--r--  mm/nobootmem.c      |  20
-rw-r--r--  mm/oom_kill.c       | 381
-rw-r--r--  mm/page-writeback.c |  34
-rw-r--r--  mm/page_alloc.c     | 281
-rw-r--r--  mm/page_ext.c       |  45
-rw-r--r--  mm/page_io.c        |   7
-rw-r--r--  mm/page_isolation.c |   2
-rw-r--r--  mm/page_owner.c     | 156
-rw-r--r--  mm/shmem.c          |   2
-rw-r--r--  mm/swap.c           |   4
-rw-r--r--  mm/swap_state.c     |  14
-rw-r--r--  mm/swapfile.c       | 137
-rw-r--r--  mm/vmacache.c       |   8
-rw-r--r--  mm/vmalloc.c        |  22
-rw-r--r--  mm/vmscan.c         |  53
-rw-r--r--  mm/vmstat.c         |  95
34 files changed, 1250 insertions, 873 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0aa7dda52402..a869f84f44d3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -11,15 +11,12 @@
 #include <linux/init.h>
 #include <linux/pfn.h>
 #include <linux/slab.h>
-#include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
-#include <linux/memblock.h>
 #include <linux/bug.h>
 #include <linux/io.h>
-
-#include <asm/processor.h>
+#include <linux/bootmem.h>
 
 #include "internal.h"
 
@@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 	void *ptr;
 
 	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
@@ -738,9 +735,6 @@ again:
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 					unsigned long align, unsigned long goal)
 {
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
 	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
 }
 
@@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 }
 
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
-
 /**
  * __alloc_bootmem_low - allocate low boot memory
  * @size: size of the request in bytes
diff --git a/mm/compaction.c b/mm/compaction.c
index 9affb2908304..0409a4ad6ea1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 #ifdef CONFIG_COMPACTION
 
 /* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+static bool suitable_migration_target(struct compact_control *cc,
+							struct page *page)
 {
+	if (cc->ignore_block_suitable)
+		return true;
+
 	/* If the page is a large free page, then disallow migration */
 	if (PageBuddy(page)) {
 		/*
@@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc)
 			continue;
 
 		/* Check the block is suitable for migration */
-		if (!suitable_migration_target(page))
+		if (!suitable_migration_target(cc, page))
 			continue;
 
 		/* If isolation recently failed, do not retry */
@@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		return COMPACT_CONTINUE;
 
 	/* Compaction run is not finished if the watermark is not met */
-	watermark = low_wmark_pages(zone);
+	watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
 
 	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
 							cc->alloc_flags))
@@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 
 		/* Job done if page is free of the right migratetype */
 		if (!list_empty(&area->free_list[migratetype]))
-			return COMPACT_PARTIAL;
+			return COMPACT_SUCCESS;
 
 #ifdef CONFIG_CMA
 		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
 		if (migratetype == MIGRATE_MOVABLE &&
 			!list_empty(&area->free_list[MIGRATE_CMA]))
-			return COMPACT_PARTIAL;
+			return COMPACT_SUCCESS;
 #endif
 		/*
 		 * Job done if allocation would steal freepages from
@@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		 */
 		if (find_suitable_fallback(area, order, migratetype,
 						true, &can_steal) != -1)
-			return COMPACT_PARTIAL;
+			return COMPACT_SUCCESS;
 	}
 
 	return COMPACT_NO_SUITABLE_PAGE;
@@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone,
  * compaction_suitable: Is this suitable to run compaction on this zone now?
  * Returns
  *   COMPACT_SKIPPED - If there are too few free pages for compaction
- *   COMPACT_PARTIAL - If the allocation would succeed without compaction
+ *   COMPACT_SUCCESS - If the allocation would succeed without compaction
  *   COMPACT_CONTINUE - If compaction should run now
  */
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
@@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					int classzone_idx,
 					unsigned long wmark_target)
 {
-	int fragindex;
 	unsigned long watermark;
 
 	if (is_via_compact_memory(order))
 		return COMPACT_CONTINUE;
 
-	watermark = low_wmark_pages(zone);
+	watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
 	/*
 	 * If watermarks for high-order allocation are already met, there
 	 * should be no need for compaction at all.
 	 */
 	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
 								alloc_flags))
-		return COMPACT_PARTIAL;
+		return COMPACT_SUCCESS;
 
 	/*
-	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
-	 * This is because during migration, copies of pages need to be
-	 * allocated and for a short time, the footprint is higher
+	 * Watermarks for order-0 must be met for compaction to be able to
+	 * isolate free pages for migration targets. This means that the
+	 * watermark and alloc_flags have to match, or be more pessimistic than
+	 * the check in __isolate_free_page(). We don't use the direct
+	 * compactor's alloc_flags, as they are not relevant for freepage
+	 * isolation. We however do use the direct compactor's classzone_idx to
+	 * skip over zones where lowmem reserves would prevent allocation even
+	 * if compaction succeeds.
+	 * For costly orders, we require low watermark instead of min for
+	 * compaction to proceed to increase its chances.
+	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+	 * suitable migration targets
 	 */
-	watermark += (2UL << order);
+	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+				low_wmark_pages(zone) : min_wmark_pages(zone);
+	watermark += compact_gap(order);
 	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
-						alloc_flags, wmark_target))
+						ALLOC_CMA, wmark_target))
 		return COMPACT_SKIPPED;
 
-	/*
-	 * fragmentation index determines if allocation failures are due to
-	 * low memory or external fragmentation
-	 *
-	 * index of -1000 would imply allocations might succeed depending on
-	 * watermarks, but we already failed the high-order watermark check
-	 * index towards 0 implies failure is due to lack of memory
-	 * index towards 1000 implies failure is due to fragmentation
-	 *
-	 * Only compact if a failure would be due to fragmentation.
-	 */
-	fragindex = fragmentation_index(zone, order);
-	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-		return COMPACT_NOT_SUITABLE_ZONE;
-
 	return COMPACT_CONTINUE;
 }
 
@@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 					int classzone_idx)
 {
 	enum compact_result ret;
+	int fragindex;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
 				    zone_page_state(zone, NR_FREE_PAGES));
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1000 would imply allocations might succeed depending on
+	 * watermarks, but we already failed the high-order watermark check
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation. Also
+	 * ignore fragindex for non-costly orders where the alternative to
+	 * a successful reclaim/compaction is OOM. Fragindex and the
+	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
+	 * excessive compaction for costly orders, but it should not be at the
+	 * expense of system stability.
+	 */
+	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
+		fragindex = fragmentation_index(zone, order);
+		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+			ret = COMPACT_NOT_SUITABLE_ZONE;
+	}
+
 	trace_mm_compaction_suitable(zone, order, ret);
 	if (ret == COMPACT_NOT_SUITABLE_ZONE)
 		ret = COMPACT_SKIPPED;
@@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 		compact_result = __compaction_suitable(zone, order, alloc_flags,
 				ac_classzone_idx(ac), available);
-		if (compact_result != COMPACT_SKIPPED &&
-		    compact_result != COMPACT_NOT_SUITABLE_ZONE)
+		if (compact_result != COMPACT_SKIPPED)
 			return true;
 	}
 
@@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
 							cc->classzone_idx);
 	/* Compaction is likely to fail */
-	if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
+	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
 		return ret;
 
 	/* huh, compaction_suitable is returning something unexpected */
@@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 
 	/*
 	 * Setup to move all movable pages to the end of the zone. Used cached
-	 * information on where the scanners should start but check that it
-	 * is initialised by ensuring the values are within zone boundaries.
+	 * information on where the scanners should start (unless we explicitly
+	 * want to compact the whole zone), but check that it is initialised
+	 * by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
-	cc->free_pfn = zone->compact_cached_free_pfn;
-	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
-		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
-		zone->compact_cached_free_pfn = cc->free_pfn;
-	}
-	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+	if (cc->whole_zone) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
-		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
-	}
+		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+	} else {
+		cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+		cc->free_pfn = zone->compact_cached_free_pfn;
+		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+			zone->compact_cached_free_pfn = cc->free_pfn;
+		}
+		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+			cc->migrate_pfn = start_pfn;
+			zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+			zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+		}
 
-	if (cc->migrate_pfn == start_pfn)
-		cc->whole_zone = true;
+		if (cc->migrate_pfn == start_pfn)
+			cc->whole_zone = true;
+	}
 
 	cc->last_migrated_pfn = 0;
 
@@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
 		.alloc_flags = alloc_flags,
 		.classzone_idx = classzone_idx,
 		.direct_compaction = true,
+		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
+		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 							ac->nodemask) {
 		enum compact_result status;
 
-		if (compaction_deferred(zone, order)) {
+		if (prio > MIN_COMPACT_PRIORITY
+					&& compaction_deferred(zone, order)) {
 			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
 			continue;
 		}
@@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 					alloc_flags, ac_classzone_idx(ac));
 		rc = max(status, rc);
 
-		/* If a normal allocation would succeed, stop compacting */
-		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
-					ac_classzone_idx(ac), alloc_flags)) {
+		/* The allocation should succeed, stop compacting */
+		if (status == COMPACT_SUCCESS) {
 			/*
 			 * We think the allocation will succeed in this zone,
 			 * but it is not certain, hence the false. The caller
@@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 
 
 /* Compact all zones within a node */
-static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void compact_node(int nid)
 {
+	pg_data_t *pgdat = NODE_DATA(nid);
 	int zoneid;
 	struct zone *zone;
+	struct compact_control cc = {
+		.order = -1,
+		.mode = MIGRATE_SYNC,
+		.ignore_skip_hint = true,
+		.whole_zone = true,
+	};
+
 
 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 
@@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 		if (!populated_zone(zone))
 			continue;
 
-		cc->nr_freepages = 0;
-		cc->nr_migratepages = 0;
-		cc->zone = zone;
-		INIT_LIST_HEAD(&cc->freepages);
-		INIT_LIST_HEAD(&cc->migratepages);
-
-		/*
-		 * When called via /proc/sys/vm/compact_memory
-		 * this makes sure we compact the whole zone regardless of
-		 * cached scanner positions.
-		 */
-		if (is_via_compact_memory(cc->order))
-			__reset_isolation_suitable(zone);
-
-		if (is_via_compact_memory(cc->order) ||
-				!compaction_deferred(zone, cc->order))
-			compact_zone(zone, cc);
-
-		VM_BUG_ON(!list_empty(&cc->freepages));
-		VM_BUG_ON(!list_empty(&cc->migratepages));
+		cc.nr_freepages = 0;
+		cc.nr_migratepages = 0;
+		cc.zone = zone;
+		INIT_LIST_HEAD(&cc.freepages);
+		INIT_LIST_HEAD(&cc.migratepages);
 
-		if (is_via_compact_memory(cc->order))
-			continue;
+		compact_zone(zone, &cc);
 
-		if (zone_watermark_ok(zone, cc->order,
-					low_wmark_pages(zone), 0, 0))
-			compaction_defer_reset(zone, cc->order, false);
+		VM_BUG_ON(!list_empty(&cc.freepages));
+		VM_BUG_ON(!list_empty(&cc.migratepages));
 	}
 }
 
-void compact_pgdat(pg_data_t *pgdat, int order)
-{
-	struct compact_control cc = {
-		.order = order,
-		.mode = MIGRATE_ASYNC,
-	};
-
-	if (!order)
-		return;
-
-	__compact_pgdat(pgdat, &cc);
-}
-
-static void compact_node(int nid)
-{
-	struct compact_control cc = {
-		.order = -1,
-		.mode = MIGRATE_SYNC,
-		.ignore_skip_hint = true,
-	};
-
-	__compact_pgdat(NODE_DATA(nid), &cc);
-}
-
 /* Compact all nodes in the system */
 static void compact_nodes(void)
 {
@@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		.ignore_skip_hint = true,
 
 	};
-	bool success = false;
-
 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
 							cc.classzone_idx);
 	count_vm_event(KCOMPACTD_WAKE);
@@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 			return;
 		status = compact_zone(zone, &cc);
 
-		if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
-						cc.classzone_idx, 0)) {
-			success = true;
+		if (status == COMPACT_SUCCESS) {
 			compaction_defer_reset(zone, cc.order, false);
 		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
 			/*
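For orientation on the compaction changes above: the order-0 gate in __compaction_suitable() now picks the low watermark for costly orders and the min watermark otherwise, then adds compact_gap() headroom for the temporary pages compaction itself needs while migrating. A rough standalone sketch of that check follows; the helper and the costly-order constant are illustrative assumptions, not the exact kernel code.

/*
 * Illustrative sketch only -- not kernel code. compact_gap() is assumed
 * to keep the previous 2UL << order headroom.
 */
#define SKETCH_COSTLY_ORDER	3	/* assumed stand-in for PAGE_ALLOC_COSTLY_ORDER */

static unsigned long sketch_compact_gap(unsigned int order)
{
	return 2UL << order;
}

static int sketch_order0_watermark_ok(unsigned long free_pages,
				      unsigned long min_wmark,
				      unsigned long low_wmark,
				      unsigned int order)
{
	/* costly orders demand more headroom before compaction proceeds */
	unsigned long watermark = (order > SKETCH_COSTLY_ORDER) ? low_wmark : min_wmark;

	watermark += sketch_compact_gap(order);
	return free_pages >= watermark;
}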
diff --git a/mm/debug.c b/mm/debug.c
index 74c7cae4f683..9feb699c5d25 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
+	/*
+	 * Avoid VM_BUG_ON() in page_mapcount().
+	 * page->_mapcount space in struct page is used by sl[aou]b pages to
+	 * encode own info.
+	 */
 	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
diff --git a/mm/filemap.c b/mm/filemap.c
index 68f1813fbdc3..2f7b7783bd6b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
 	unsigned int prev_offset;
 	int error = 0;
 
+	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+		return -EINVAL;
+	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+
 	index = *ppos >> PAGE_SHIFT;
 	prev_index = ra->prev_pos >> PAGE_SHIFT;
 	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
@@ -1721,7 +1725,9 @@ find_page:
 			 * wait_on_page_locked is used to avoid unnecessarily
 			 * serialisations and why it's safe.
 			 */
-			wait_on_page_locked_killable(page);
+			error = wait_on_page_locked_killable(page);
+			if (unlikely(error))
+				goto readpage_error;
 			if (PageUptodate(page))
 				goto page_ok;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 283583fcb1e7..cdcd25cb30fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page)
 	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
 
+unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+		loff_t off, unsigned long flags, unsigned long size)
+{
+	unsigned long addr;
+	loff_t off_end = off + len;
+	loff_t off_align = round_up(off, size);
+	unsigned long len_pad;
+
+	if (off_end <= off_align || (off_end - off_align) < size)
+		return 0;
+
+	len_pad = len + size;
+	if (len_pad < len || (off + len_pad) < off)
+		return 0;
+
+	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+					      off >> PAGE_SHIFT, flags);
+	if (IS_ERR_VALUE(addr))
+		return 0;
+
+	addr += (off - addr) & (size - 1);
+	return addr;
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+	if (addr)
+		goto out;
+	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+		goto out;
+
+	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+	if (addr)
+		return addr;
+
+out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+
 static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
 		gfp_t gfp)
 {
@@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1038,7 +1099,6 @@ alloc:
 	update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 	if (!page) {
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		put_huge_zero_page();
 	} else {
 		VM_BUG_ON_PAGE(!PageHead(page), page);
 		page_remove_rmap(page, true);
@@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
@@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			if (soft_dirty)
 				entry = pte_swp_mksoft_dirty(entry);
 		} else {
-			entry = mk_pte(page + i, vma->vm_page_prot);
+			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
 			entry = maybe_mkwrite(entry, vma);
 			if (!write)
 				entry = pte_wrprotect(entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 87e11d8ad536..ec49d9ef1eef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -567,13 +567,13 @@ retry:
  * appear as a "reserved" entry instead of simply dangling with incorrect
  * counts.
  */
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
 {
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	long rsv_adjust;
 
 	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-	if (restore_reserve && rsv_adjust) {
+	if (rsv_adjust) {
 		struct hstate *h = hstate_inode(inode);
 
 		hugetlb_acct_memory(h, 1);
@@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
 		nr_nodes--)
 
-#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
 	((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
 	defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
@@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 
 /*
  * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
+ * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
+ * number of free hugepages would be reduced below the number of reserved
+ * hugepages.
  */
-static void dissolve_free_huge_page(struct page *page)
+static int dissolve_free_huge_page(struct page *page)
 {
+	int rc = 0;
+
 	spin_lock(&hugetlb_lock);
 	if (PageHuge(page) && !page_count(page)) {
-		struct hstate *h = page_hstate(page);
-		int nid = page_to_nid(page);
-		list_del(&page->lru);
+		struct page *head = compound_head(page);
+		struct hstate *h = page_hstate(head);
+		int nid = page_to_nid(head);
+		if (h->free_huge_pages - h->resv_huge_pages == 0) {
+			rc = -EBUSY;
+			goto out;
+		}
+		list_del(&head->lru);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
 		h->max_huge_pages--;
-		update_and_free_page(h, page);
+		update_and_free_page(h, head);
 	}
+out:
 	spin_unlock(&hugetlb_lock);
+	return rc;
 }
 
 /*
  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
  * make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
  */
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
+	struct page *page;
+	int rc = 0;
 
 	if (!hugepages_supported())
-		return;
+		return rc;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+		page = pfn_to_page(pfn);
+		if (PageHuge(page) && !page_count(page)) {
+			rc = dissolve_free_huge_page(page);
+			if (rc)
+				break;
+		}
+	}
 
-	VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
-	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
-		dissolve_free_huge_page(pfn_to_page(pfn));
+	return rc;
 }
 
 /*
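The hugetlb change above turns dissolve_free_huge_pages() into an int-returning call so that hotplug-style paths can back off when dissolving a free hugepage would dip into the reserved pool. A minimal sketch of how a caller might consume that return value follows; the caller itself is hypothetical and is not the mm/memory_hotplug.c change from this merge.

/* Hypothetical caller: bail out of an offline attempt on -EBUSY. */
static int sketch_offline_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret = dissolve_free_huge_pages(start_pfn, end_pfn);

	if (ret)
		return ret;	/* free hugepages could not be dissolved safely */

	/* ... continue with the rest of the offline work ... */
	return 0;
}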
diff --git a/mm/internal.h b/mm/internal.h
index 1501304f87a4..537ac9951f5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -178,8 +178,9 @@ struct compact_control {
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
+	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
-	bool whole_zone;		/* Whole zone has been scanned */
+	bool whole_zone;		/* Whole zone should/has been scanned */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -299,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
 
 static inline struct stable_node *alloc_stable_node(void)
 {
-	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+	/*
+	 * The allocation can take too long with GFP_KERNEL when memory is under
+	 * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
+	 * grants access to memory reserves, helping to avoid this problem.
+	 */
+	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
 }
 
 static inline void free_stable_node(struct stable_node *stable_node)
diff --git a/mm/memblock.c b/mm/memblock.c
index 483197ef613f..c8dfa430342b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void)
 	return memblock.memory.total_size;
 }
 
+phys_addr_t __init_memblock memblock_reserved_size(void)
+{
+	return memblock.reserved.total_size;
+}
+
 phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
 {
 	unsigned long pages = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be518d4e68a..ae052b5e3315 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -921,6 +921,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) | |||
921 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 921 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
922 | 922 | ||
923 | /** | 923 | /** |
924 | * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy | ||
925 | * @memcg: hierarchy root | ||
926 | * @fn: function to call for each task | ||
927 | * @arg: argument passed to @fn | ||
928 | * | ||
929 | * This function iterates over tasks attached to @memcg or to any of its | ||
930 | * descendants and calls @fn for each task. If @fn returns a non-zero | ||
931 | * value, the function breaks the iteration loop and returns the value. | ||
932 | * Otherwise, it will iterate over all tasks and return 0. | ||
933 | * | ||
934 | * This function must not be called for the root memory cgroup. | ||
935 | */ | ||
936 | int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, | ||
937 | int (*fn)(struct task_struct *, void *), void *arg) | ||
938 | { | ||
939 | struct mem_cgroup *iter; | ||
940 | int ret = 0; | ||
941 | |||
942 | BUG_ON(memcg == root_mem_cgroup); | ||
943 | |||
944 | for_each_mem_cgroup_tree(iter, memcg) { | ||
945 | struct css_task_iter it; | ||
946 | struct task_struct *task; | ||
947 | |||
948 | css_task_iter_start(&iter->css, &it); | ||
949 | while (!ret && (task = css_task_iter_next(&it))) | ||
950 | ret = fn(task, arg); | ||
951 | css_task_iter_end(&it); | ||
952 | if (ret) { | ||
953 | mem_cgroup_iter_break(memcg, iter); | ||
954 | break; | ||
955 | } | ||
956 | } | ||
957 | return ret; | ||
958 | } | ||
959 | |||
960 | /** | ||
924 | * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page | 961 | * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page |
925 | * @page: the page | 962 | * @page: the page |
926 | * @zone: zone of the page | 963 | * @zone: zone of the page |
@@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1178 | /* | 1215 | /* |
1179 | * Return the memory (and swap, if configured) limit for a memcg. | 1216 | * Return the memory (and swap, if configured) limit for a memcg. |
1180 | */ | 1217 | */ |
1181 | static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1218 | unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1182 | { | 1219 | { |
1183 | unsigned long limit; | 1220 | unsigned long limit; |
1184 | 1221 | ||
@@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1205 | .gfp_mask = gfp_mask, | 1242 | .gfp_mask = gfp_mask, |
1206 | .order = order, | 1243 | .order = order, |
1207 | }; | 1244 | }; |
1208 | struct mem_cgroup *iter; | 1245 | bool ret; |
1209 | unsigned long chosen_points = 0; | ||
1210 | unsigned long totalpages; | ||
1211 | unsigned int points = 0; | ||
1212 | struct task_struct *chosen = NULL; | ||
1213 | 1246 | ||
1214 | mutex_lock(&oom_lock); | 1247 | mutex_lock(&oom_lock); |
1215 | 1248 | ret = out_of_memory(&oc); | |
1216 | /* | ||
1217 | * If current has a pending SIGKILL or is exiting, then automatically | ||
1218 | * select it. The goal is to allow it to allocate so that it may | ||
1219 | * quickly exit and free its memory. | ||
1220 | */ | ||
1221 | if (task_will_free_mem(current)) { | ||
1222 | mark_oom_victim(current); | ||
1223 | wake_oom_reaper(current); | ||
1224 | goto unlock; | ||
1225 | } | ||
1226 | |||
1227 | check_panic_on_oom(&oc, CONSTRAINT_MEMCG); | ||
1228 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; | ||
1229 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1230 | struct css_task_iter it; | ||
1231 | struct task_struct *task; | ||
1232 | |||
1233 | css_task_iter_start(&iter->css, &it); | ||
1234 | while ((task = css_task_iter_next(&it))) { | ||
1235 | switch (oom_scan_process_thread(&oc, task)) { | ||
1236 | case OOM_SCAN_SELECT: | ||
1237 | if (chosen) | ||
1238 | put_task_struct(chosen); | ||
1239 | chosen = task; | ||
1240 | chosen_points = ULONG_MAX; | ||
1241 | get_task_struct(chosen); | ||
1242 | /* fall through */ | ||
1243 | case OOM_SCAN_CONTINUE: | ||
1244 | continue; | ||
1245 | case OOM_SCAN_ABORT: | ||
1246 | css_task_iter_end(&it); | ||
1247 | mem_cgroup_iter_break(memcg, iter); | ||
1248 | if (chosen) | ||
1249 | put_task_struct(chosen); | ||
1250 | /* Set a dummy value to return "true". */ | ||
1251 | chosen = (void *) 1; | ||
1252 | goto unlock; | ||
1253 | case OOM_SCAN_OK: | ||
1254 | break; | ||
1255 | }; | ||
1256 | points = oom_badness(task, memcg, NULL, totalpages); | ||
1257 | if (!points || points < chosen_points) | ||
1258 | continue; | ||
1259 | /* Prefer thread group leaders for display purposes */ | ||
1260 | if (points == chosen_points && | ||
1261 | thread_group_leader(chosen)) | ||
1262 | continue; | ||
1263 | |||
1264 | if (chosen) | ||
1265 | put_task_struct(chosen); | ||
1266 | chosen = task; | ||
1267 | chosen_points = points; | ||
1268 | get_task_struct(chosen); | ||
1269 | } | ||
1270 | css_task_iter_end(&it); | ||
1271 | } | ||
1272 | |||
1273 | if (chosen) { | ||
1274 | points = chosen_points * 1000 / totalpages; | ||
1275 | oom_kill_process(&oc, chosen, points, totalpages, | ||
1276 | "Memory cgroup out of memory"); | ||
1277 | } | ||
1278 | unlock: | ||
1279 | mutex_unlock(&oom_lock); | 1249 | mutex_unlock(&oom_lock); |
1280 | return chosen; | 1250 | return ret; |
1281 | } | 1251 | } |
1282 | 1252 | ||
1283 | #if MAX_NUMNODES > 1 | 1253 | #if MAX_NUMNODES > 1 |
@@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
1600 | if (!memcg) | 1570 | if (!memcg) |
1601 | return false; | 1571 | return false; |
1602 | 1572 | ||
1603 | if (!handle || oom_killer_disabled) | 1573 | if (!handle) |
1604 | goto cleanup; | 1574 | goto cleanup; |
1605 | 1575 | ||
1606 | owait.memcg = memcg; | 1576 | owait.memcg = memcg; |
@@ -2969,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) | |||
2969 | /* | 2939 | /* |
2970 | * The active flag needs to be written after the static_key | 2940 | * The active flag needs to be written after the static_key |
2971 | * update. This is what guarantees that the socket activation | 2941 | * update. This is what guarantees that the socket activation |
2972 | * function is the last one to run. See sock_update_memcg() for | 2942 | * function is the last one to run. See mem_cgroup_sk_alloc() |
2973 | * details, and note that we don't mark any socket as belonging | 2943 | * for details, and note that we don't mark any socket as |
2974 | * to this memcg until that flag is up. | 2944 | * belonging to this memcg until that flag is up. |
2975 | * | 2945 | * |
2976 | * We need to do this, because static_keys will span multiple | 2946 | * We need to do this, because static_keys will span multiple |
2977 | * sites, but we can't control their order. If we mark a socket | 2947 | * sites, but we can't control their order. If we mark a socket |
2978 | * as accounted, but the accounting functions are not patched in | 2948 | * as accounted, but the accounting functions are not patched in |
2979 | * yet, we'll lose accounting. | 2949 | * yet, we'll lose accounting. |
2980 | * | 2950 | * |
2981 | * We never race with the readers in sock_update_memcg(), | 2951 | * We never race with the readers in mem_cgroup_sk_alloc(), |
2982 | * because when this value change, the code to process it is not | 2952 | * because when this value change, the code to process it is not |
2983 | * patched in yet. | 2953 | * patched in yet. |
2984 | */ | 2954 | */ |
@@ -4092,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr); | |||
4092 | 4062 | ||
4093 | static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) | 4063 | static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) |
4094 | { | 4064 | { |
4065 | VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); | ||
4095 | atomic_add(n, &memcg->id.ref); | 4066 | atomic_add(n, &memcg->id.ref); |
4096 | } | 4067 | } |
4097 | 4068 | ||
4098 | static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) | 4069 | static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) |
4099 | { | 4070 | { |
4071 | VM_BUG_ON(atomic_read(&memcg->id.ref) < n); | ||
4100 | if (atomic_sub_and_test(n, &memcg->id.ref)) { | 4072 | if (atomic_sub_and_test(n, &memcg->id.ref)) { |
4101 | idr_remove(&mem_cgroup_idr, memcg->id.id); | 4073 | idr_remove(&mem_cgroup_idr, memcg->id.id); |
4102 | memcg->id.id = 0; | 4074 | memcg->id.id = 0; |
@@ -4285,8 +4257,10 @@ fail: | |||
4285 | 4257 | ||
4286 | static int mem_cgroup_css_online(struct cgroup_subsys_state *css) | 4258 | static int mem_cgroup_css_online(struct cgroup_subsys_state *css) |
4287 | { | 4259 | { |
4260 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||
4261 | |||
4288 | /* Online state pins memcg ID, memcg ID pins CSS */ | 4262 | /* Online state pins memcg ID, memcg ID pins CSS */ |
4289 | mem_cgroup_id_get(mem_cgroup_from_css(css)); | 4263 | atomic_set(&memcg->id.ref, 1); |
4290 | css_get(css); | 4264 | css_get(css); |
4291 | return 0; | 4265 | return 0; |
4292 | } | 4266 | } |
@@ -4434,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
4434 | * Because lookup_swap_cache() updates some statistics counter, | 4408 | * Because lookup_swap_cache() updates some statistics counter, |
4435 | * we call find_get_page() with swapper_space directly. | 4409 | * we call find_get_page() with swapper_space directly. |
4436 | */ | 4410 | */ |
4437 | page = find_get_page(swap_address_space(ent), ent.val); | 4411 | page = find_get_page(swap_address_space(ent), swp_offset(ent)); |
4438 | if (do_memsw_account()) | 4412 | if (do_memsw_account()) |
4439 | entry->val = ent.val; | 4413 | entry->val = ent.val; |
4440 | 4414 | ||
@@ -4472,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
4472 | swp_entry_t swp = radix_to_swp_entry(page); | 4446 | swp_entry_t swp = radix_to_swp_entry(page); |
4473 | if (do_memsw_account()) | 4447 | if (do_memsw_account()) |
4474 | *entry = swp; | 4448 | *entry = swp; |
4475 | page = find_get_page(swap_address_space(swp), swp.val); | 4449 | page = find_get_page(swap_address_space(swp), |
4450 | swp_offset(swp)); | ||
4476 | } | 4451 | } |
4477 | } else | 4452 | } else |
4478 | page = find_get_page(mapping, pgoff); | 4453 | page = find_get_page(mapping, pgoff); |
@@ -4707,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4707 | .mm = mm, | 4682 | .mm = mm, |
4708 | }; | 4683 | }; |
4709 | down_read(&mm->mmap_sem); | 4684 | down_read(&mm->mmap_sem); |
4710 | walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); | 4685 | walk_page_range(0, mm->highest_vm_end, |
4686 | &mem_cgroup_count_precharge_walk); | ||
4711 | up_read(&mm->mmap_sem); | 4687 | up_read(&mm->mmap_sem); |
4712 | 4688 | ||
4713 | precharge = mc.precharge; | 4689 | precharge = mc.precharge; |
@@ -4995,7 +4971,8 @@ retry: | |||
4995 | * When we have consumed all precharges and failed in doing | 4971 | * When we have consumed all precharges and failed in doing |
4996 | * additional charge, the page walk just aborts. | 4972 | * additional charge, the page walk just aborts. |
4997 | */ | 4973 | */ |
4998 | walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); | 4974 | walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); |
4975 | |||
4999 | up_read(&mc.mm->mmap_sem); | 4976 | up_read(&mc.mm->mmap_sem); |
5000 | atomic_dec(&mc.from->moving_account); | 4977 | atomic_dec(&mc.from->moving_account); |
5001 | } | 4978 | } |
@@ -5674,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) | |||
5674 | DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); | 5651 | DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); |
5675 | EXPORT_SYMBOL(memcg_sockets_enabled_key); | 5652 | EXPORT_SYMBOL(memcg_sockets_enabled_key); |
5676 | 5653 | ||
5677 | void sock_update_memcg(struct sock *sk) | 5654 | void mem_cgroup_sk_alloc(struct sock *sk) |
5678 | { | 5655 | { |
5679 | struct mem_cgroup *memcg; | 5656 | struct mem_cgroup *memcg; |
5680 | 5657 | ||
5681 | /* Socket cloning can throw us here with sk_cgrp already | 5658 | if (!mem_cgroup_sockets_enabled) |
5659 | return; | ||
5660 | |||
5661 | /* | ||
5662 | * Socket cloning can throw us here with sk_memcg already | ||
5682 | * filled. It won't, however, necessarily happen from | 5663 | * filled. It won't, however, necessarily happen from |
5683 | * process context. So the test for root memcg given | 5664 | * process context. So the test for root memcg given |
5684 | * the current task's memcg won't help us in this case. | 5665 | * the current task's memcg won't help us in this case. |
@@ -5703,12 +5684,11 @@ void sock_update_memcg(struct sock *sk) | |||
5703 | out: | 5684 | out: |
5704 | rcu_read_unlock(); | 5685 | rcu_read_unlock(); |
5705 | } | 5686 | } |
5706 | EXPORT_SYMBOL(sock_update_memcg); | ||
5707 | 5687 | ||
5708 | void sock_release_memcg(struct sock *sk) | 5688 | void mem_cgroup_sk_free(struct sock *sk) |
5709 | { | 5689 | { |
5710 | WARN_ON(!sk->sk_memcg); | 5690 | if (sk->sk_memcg) |
5711 | css_put(&sk->sk_memcg->css); | 5691 | css_put(&sk->sk_memcg->css); |
5712 | } | 5692 | } |
5713 | 5693 | ||
5714 | /** | 5694 | /** |
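The mm/memcontrol.c hunks above switch the swap-cache lookups from passing the raw entry value to passing swp_offset(entry) into find_get_page(): the space is still selected per swap device, but the page index inside it is now just the in-device offset. A minimal, self-contained model of that split follows; it is not kernel code, and the bit layout is assumed purely for illustration.

	/* Model only: a swap entry packs a device "type" and an in-device offset.
	 * The cache page lives in the space selected by the type, at an index
	 * given by the offset alone. */
	#include <stdio.h>

	#define SWP_TYPE_SHIFT 58                       /* assumed layout */

	static unsigned swp_type(unsigned long long val)
	{
	        return (unsigned)(val >> SWP_TYPE_SHIFT);
	}

	static unsigned long long swp_offset(unsigned long long val)
	{
	        return val & ((1ULL << SWP_TYPE_SHIFT) - 1);
	}

	int main(void)
	{
	        unsigned long long ent = (1ULL << SWP_TYPE_SHIFT) | 0x2a;

	        /* old style: index by ent itself; new style: by swp_offset(ent) */
	        printf("device %u, cache index %#llx\n", swp_type(ent), swp_offset(ent));
	        return 0;
	}

The same swp_offset() conversion shows up again in the mm/mincore.c hunk further down.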
diff --git a/mm/memory.c b/mm/memory.c index f1a68049edff..fc1987dfd8cc 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot); | |||
1649 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1649 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1650 | pfn_t pfn) | 1650 | pfn_t pfn) |
1651 | { | 1651 | { |
1652 | pgprot_t pgprot = vma->vm_page_prot; | ||
1653 | |||
1652 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); | 1654 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); |
1653 | 1655 | ||
1654 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1656 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1655 | return -EFAULT; | 1657 | return -EFAULT; |
1658 | if (track_pfn_insert(vma, &pgprot, pfn)) | ||
1659 | return -EINVAL; | ||
1656 | 1660 | ||
1657 | /* | 1661 | /* |
1658 | * If we don't have pte special, then we have to use the pfn_valid() | 1662 | * If we don't have pte special, then we have to use the pfn_valid() |
@@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | |||
1670 | * result in pfn_t_has_page() == false. | 1674 | * result in pfn_t_has_page() == false. |
1671 | */ | 1675 | */ |
1672 | page = pfn_to_page(pfn_t_to_pfn(pfn)); | 1676 | page = pfn_to_page(pfn_t_to_pfn(pfn)); |
1673 | return insert_page(vma, addr, page, vma->vm_page_prot); | 1677 | return insert_page(vma, addr, page, pgprot); |
1674 | } | 1678 | } |
1675 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); | 1679 | return insert_pfn(vma, addr, pfn, pgprot); |
1676 | } | 1680 | } |
1677 | EXPORT_SYMBOL(vm_insert_mixed); | 1681 | EXPORT_SYMBOL(vm_insert_mixed); |
1678 | 1682 | ||
@@ -3658,6 +3662,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3658 | mem_cgroup_oom_synchronize(false); | 3662 | mem_cgroup_oom_synchronize(false); |
3659 | } | 3663 | } |
3660 | 3664 | ||
3665 | /* | ||
3666 | * This mm has already been reaped by the oom reaper and so the | ||
3667 | * refault cannot be trusted in general. Anonymous refaults would | ||
3668 | * lose data and give a zero page instead, e.g. This is especially a | ||
3669 | * problem for use_mm() because regular tasks will just die and | ||
3670 | * the corrupted data will not be visible anywhere while a kthread | ||
3671 | * will outlive the oom victim and potentially propagate the data | ||
3672 | * further. | ||
3673 | */ | ||
3674 | if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) | ||
3675 | && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) | ||
3676 | ret = VM_FAULT_SIGBUS; | ||
3677 | |||
3661 | return ret; | 3678 | return ret; |
3662 | } | 3679 | } |
3663 | EXPORT_SYMBOL_GPL(handle_mm_fault); | 3680 | EXPORT_SYMBOL_GPL(handle_mm_fault); |
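The handle_mm_fault() addition above refuses to let a kernel thread (for example one that borrowed an mm via use_mm()) silently refault zero pages on an mm the oom reaper has already unmapped, returning VM_FAULT_SIGBUS instead. A standalone sketch of just that predicate; the flag values are made-up stand-ins, only the shape of the check mirrors the hunk.

	#include <stdbool.h>
	#include <stdio.h>

	/* Model flag values; the real ones live in the kernel headers. */
	#define PF_KTHREAD     0x00200000u
	#define MMF_UNSTABLE   22
	#define VM_FAULT_ERROR 0x0400u

	/* Force SIGBUS only for kernel threads faulting on an already-reaped mm
	 * when the fault did not already fail for another reason. */
	static bool force_sigbus(unsigned int task_flags, unsigned long mm_flags,
	                         unsigned int ret)
	{
	        return (task_flags & PF_KTHREAD) &&
	               !(ret & VM_FAULT_ERROR) &&
	               (mm_flags & (1UL << MMF_UNSTABLE));
	}

	int main(void)
	{
	        printf("%d\n", force_sigbus(PF_KTHREAD, 1UL << MMF_UNSTABLE, 0)); /* 1 */
	        printf("%d\n", force_sigbus(0, 1UL << MMF_UNSTABLE, 0));          /* 0: user task */
	        return 0;
	}

The producer side of this flag is in the mm/oom_kill.c hunks below, where the reaper sets MMF_UNSTABLE before unmapping.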
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9d29ba0f7192..962927309b6e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1945,7 +1945,9 @@ repeat: | |||
1945 | * dissolve free hugepages in the memory block before doing offlining | 1945 | * dissolve free hugepages in the memory block before doing offlining |
1946 | * actually in order to make hugetlbfs's object counting consistent. | 1946 | * actually in order to make hugetlbfs's object counting consistent. |
1947 | */ | 1947 | */ |
1948 | dissolve_free_huge_pages(start_pfn, end_pfn); | 1948 | ret = dissolve_free_huge_pages(start_pfn, end_pfn); |
1949 | if (ret) | ||
1950 | goto failed_removal; | ||
1949 | /* check again */ | 1951 | /* check again */ |
1950 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | 1952 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); |
1951 | if (offlined_pages < 0) { | 1953 | if (offlined_pages < 0) { |
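dissolve_free_huge_pages() can now fail, and the offlining path above bails out through its failure label instead of pressing on. The error-return-plus-goto-cleanup shape, reduced to a standalone sketch with stand-in function bodies:

	#include <stdio.h>

	/* Stand-in: returns 0 on success, a negative errno-style value on failure. */
	static int dissolve_free_huge_pages(unsigned long start_pfn,
	                                    unsigned long end_pfn)
	{
	        (void)start_pfn;
	        (void)end_pfn;
	        return -16;     /* pretend a huge page was still in use */
	}

	static int offline_pages(unsigned long start_pfn, unsigned long end_pfn)
	{
	        int ret;

	        ret = dissolve_free_huge_pages(start_pfn, end_pfn);
	        if (ret)
	                goto failed_removal;    /* propagate instead of ignoring */

	        /* ... isolation re-check and the actual offlining would follow ... */
	        return 0;

	failed_removal:
	        printf("offlining aborted: %d\n", ret);
	        return ret;
	}

	int main(void)
	{
	        return offline_pages(0x1000, 0x2000) ? 1 : 0;
	}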
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2da72a5b6ecc..ad1c96ac313c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void) | |||
1749 | */ | 1749 | */ |
1750 | struct zonelist *zonelist; | 1750 | struct zonelist *zonelist; |
1751 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); | 1751 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); |
1752 | zonelist = &NODE_DATA(node)->node_zonelists[0]; | 1752 | zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; |
1753 | z = first_zones_zonelist(zonelist, highest_zoneidx, | 1753 | z = first_zones_zonelist(zonelist, highest_zoneidx, |
1754 | &policy->v.nodes); | 1754 | &policy->v.nodes); |
1755 | return z->zone ? z->zone->node : node; | 1755 | return z->zone ? z->zone->node : node; |
diff --git a/mm/migrate.c b/mm/migrate.c index f7ee04a5ae27..99250aee1ac1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
234 | goto unlock; | 234 | goto unlock; |
235 | 235 | ||
236 | get_page(new); | 236 | get_page(new); |
237 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 237 | pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); |
238 | if (pte_swp_soft_dirty(*ptep)) | 238 | if (pte_swp_soft_dirty(*ptep)) |
239 | pte = pte_mksoft_dirty(pte); | 239 | pte = pte_mksoft_dirty(pte); |
240 | 240 | ||
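remove_migration_pte() now reads vma->vm_page_prot through READ_ONCE() because the vma_set_page_prot() hunk further down publishes updates with WRITE_ONCE(), and rmap walkers read the field without holding mmap_sem. A tiny userspace model of that pairing, with C11 relaxed atomics standing in for the kernel's READ_ONCE()/WRITE_ONCE() macros (purely illustrative):

	#include <stdatomic.h>
	#include <stdio.h>

	/* Model: one field published by a writer, read concurrently without a lock. */
	struct vma_model {
	        _Atomic unsigned long vm_page_prot;
	};

	static void vma_set_page_prot(struct vma_model *vma, unsigned long prot)
	{
	        /* single publication point, like WRITE_ONCE() in the kernel */
	        atomic_store_explicit(&vma->vm_page_prot, prot, memory_order_relaxed);
	}

	static unsigned long vma_read_page_prot(struct vma_model *vma)
	{
	        /* lockless reader side, like READ_ONCE() */
	        return atomic_load_explicit(&vma->vm_page_prot, memory_order_relaxed);
	}

	int main(void)
	{
	        struct vma_model vma = { .vm_page_prot = 0 };

	        vma_set_page_prot(&vma, 0x25);
	        printf("%#lx\n", vma_read_page_prot(&vma));
	        return 0;
	}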
diff --git a/mm/mincore.c b/mm/mincore.c index c0b5ba965200..bfb866435478 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
66 | */ | 66 | */ |
67 | if (radix_tree_exceptional_entry(page)) { | 67 | if (radix_tree_exceptional_entry(page)) { |
68 | swp_entry_t swp = radix_to_swp_entry(page); | 68 | swp_entry_t swp = radix_to_swp_entry(page); |
69 | page = find_get_page(swap_address_space(swp), swp.val); | 69 | page = find_get_page(swap_address_space(swp), |
70 | swp_offset(swp)); | ||
70 | } | 71 | } |
71 | } else | 72 | } else |
72 | page = find_get_page(mapping, pgoff); | 73 | page = find_get_page(mapping, pgoff); |
@@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
150 | } else { | 151 | } else { |
151 | #ifdef CONFIG_SWAP | 152 | #ifdef CONFIG_SWAP |
152 | *vec = mincore_page(swap_address_space(entry), | 153 | *vec = mincore_page(swap_address_space(entry), |
153 | entry.val); | 154 | swp_offset(entry)); |
154 | #else | 155 | #else |
155 | WARN_ON(1); | 156 | WARN_ON(1); |
156 | *vec = 1; | 157 | *vec = 1; |
diff --git a/mm/mlock.c b/mm/mlock.c index 14645be06e30..145a4258ddbc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
516 | int nr_pages; | 516 | int nr_pages; |
517 | int ret = 0; | 517 | int ret = 0; |
518 | int lock = !!(newflags & VM_LOCKED); | 518 | int lock = !!(newflags & VM_LOCKED); |
519 | vm_flags_t old_flags = vma->vm_flags; | ||
519 | 520 | ||
520 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || | 521 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
521 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) | 522 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) |
@@ -550,6 +551,8 @@ success: | |||
550 | nr_pages = (end - start) >> PAGE_SHIFT; | 551 | nr_pages = (end - start) >> PAGE_SHIFT; |
551 | if (!lock) | 552 | if (!lock) |
552 | nr_pages = -nr_pages; | 553 | nr_pages = -nr_pages; |
554 | else if (old_flags & VM_LOCKED) | ||
555 | nr_pages = 0; | ||
553 | mm->locked_vm += nr_pages; | 556 | mm->locked_vm += nr_pages; |
554 | 557 | ||
555 | /* | 558 | /* |
@@ -617,6 +620,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, | |||
617 | return error; | 620 | return error; |
618 | } | 621 | } |
619 | 622 | ||
623 | /* | ||
624 | * Go through the vma areas and sum the size of the mlocked | ||
625 | * vma pages. | ||
626 | * Note that the deferred memory locking case (mlock2(,,MLOCK_ONFAULT)) | ||
627 | * is also counted. | ||
628 | * Return value: count of previously mlocked pages | ||
629 | */ | ||
630 | static int count_mm_mlocked_page_nr(struct mm_struct *mm, | ||
631 | unsigned long start, size_t len) | ||
632 | { | ||
633 | struct vm_area_struct *vma; | ||
634 | int count = 0; | ||
635 | |||
636 | if (mm == NULL) | ||
637 | mm = current->mm; | ||
638 | |||
639 | vma = find_vma(mm, start); | ||
640 | if (vma == NULL) | ||
641 | vma = mm->mmap; | ||
642 | |||
643 | for (; vma ; vma = vma->vm_next) { | ||
644 | if (start >= vma->vm_end) | ||
645 | continue; | ||
646 | if (start + len <= vma->vm_start) | ||
647 | break; | ||
648 | if (vma->vm_flags & VM_LOCKED) { | ||
649 | if (start > vma->vm_start) | ||
650 | count -= (start - vma->vm_start); | ||
651 | if (start + len < vma->vm_end) { | ||
652 | count += start + len - vma->vm_start; | ||
653 | break; | ||
654 | } | ||
655 | count += vma->vm_end - vma->vm_start; | ||
656 | } | ||
657 | } | ||
658 | |||
659 | return count >> PAGE_SHIFT; | ||
660 | } | ||
661 | |||
620 | static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) | 662 | static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) |
621 | { | 663 | { |
622 | unsigned long locked; | 664 | unsigned long locked; |
@@ -639,6 +681,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla | |||
639 | return -EINTR; | 681 | return -EINTR; |
640 | 682 | ||
641 | locked += current->mm->locked_vm; | 683 | locked += current->mm->locked_vm; |
684 | if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { | ||
685 | /* | ||
686 | * It is possible that the regions requested intersect with | ||
687 | * previously mlocked areas; that part is already accounted in | ||
688 | * "mm->locked_vm" and should not be counted again toward the new | ||
689 | * mlock increment. So check and adjust the locked count if necessary. | ||
690 | */ | ||
691 | locked -= count_mm_mlocked_page_nr(current->mm, | ||
692 | start, len); | ||
693 | } | ||
642 | 694 | ||
643 | /* check against resource limits */ | 695 | /* check against resource limits */ |
644 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 696 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
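count_mm_mlocked_page_nr() above walks the VMAs that intersect [start, start + len) and sums the bytes already marked VM_LOCKED, so do_mlock() does not charge them a second time against RLIMIT_MEMLOCK. A self-contained model of that overlap arithmetic, using plain ranges instead of VMAs and an assumed 4 KiB page size:

	#include <stddef.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12   /* assumed 4 KiB pages for the model */

	struct range { unsigned long start, end; int locked; };

	/* Return how many already-locked pages fall inside [start, start + len). */
	static unsigned long count_locked_pages(const struct range *r, size_t n,
	                                        unsigned long start, size_t len)
	{
	        unsigned long count = 0;
	        unsigned long end = start + len;

	        for (size_t i = 0; i < n; i++) {
	                unsigned long lo, hi;

	                if (!r[i].locked || r[i].end <= start || r[i].start >= end)
	                        continue;
	                lo = r[i].start > start ? r[i].start : start;  /* clamp */
	                hi = r[i].end < end ? r[i].end : end;
	                count += hi - lo;
	        }
	        return count >> PAGE_SHIFT;
	}

	int main(void)
	{
	        struct range vmas[] = {
	                { 0x10000, 0x20000, 1 },   /* locked */
	                { 0x20000, 0x30000, 0 },   /* not locked */
	        };

	        /* request overlaps the locked vma by 0x8000 bytes, i.e. 8 pages */
	        printf("%lu\n", count_locked_pages(vmas, 2, 0x18000, 0x10000));
	        return 0;
	}

The kernel version accumulates the same quantity with running additions and subtractions while walking vm_next; the clamped-overlap form above gives the same result and is only a model.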
@@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) | |||
116 | void vma_set_page_prot(struct vm_area_struct *vma) | 116 | void vma_set_page_prot(struct vm_area_struct *vma) |
117 | { | 117 | { |
118 | unsigned long vm_flags = vma->vm_flags; | 118 | unsigned long vm_flags = vma->vm_flags; |
119 | pgprot_t vm_page_prot; | ||
119 | 120 | ||
120 | vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); | 121 | vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); |
121 | if (vma_wants_writenotify(vma)) { | 122 | if (vma_wants_writenotify(vma, vm_page_prot)) { |
122 | vm_flags &= ~VM_SHARED; | 123 | vm_flags &= ~VM_SHARED; |
123 | vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, | 124 | vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); |
124 | vm_flags); | ||
125 | } | 125 | } |
126 | /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ | ||
127 | WRITE_ONCE(vma->vm_page_prot, vm_page_prot); | ||
126 | } | 128 | } |
127 | 129 | ||
128 | /* | 130 | /* |
@@ -400,15 +402,9 @@ static inline void vma_rb_insert(struct vm_area_struct *vma, | |||
400 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | 402 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
401 | } | 403 | } |
402 | 404 | ||
403 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | 405 | static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) |
404 | { | 406 | { |
405 | /* | 407 | /* |
406 | * All rb_subtree_gap values must be consistent prior to erase, | ||
407 | * with the possible exception of the vma being erased. | ||
408 | */ | ||
409 | validate_mm_rb(root, vma); | ||
410 | |||
411 | /* | ||
412 | * Note rb_erase_augmented is a fairly large inline function, | 408 | * Note rb_erase_augmented is a fairly large inline function, |
413 | * so make sure we instantiate it only once with our desired | 409 | * so make sure we instantiate it only once with our desired |
414 | * augmented rbtree callbacks. | 410 | * augmented rbtree callbacks. |
@@ -416,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | |||
416 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | 412 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
417 | } | 413 | } |
418 | 414 | ||
415 | static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, | ||
416 | struct rb_root *root, | ||
417 | struct vm_area_struct *ignore) | ||
418 | { | ||
419 | /* | ||
420 | * All rb_subtree_gap values must be consistent prior to erase, | ||
421 | * with the possible exception of the "next" vma being erased if | ||
422 | * next->vm_start was reduced. | ||
423 | */ | ||
424 | validate_mm_rb(root, ignore); | ||
425 | |||
426 | __vma_rb_erase(vma, root); | ||
427 | } | ||
428 | |||
429 | static __always_inline void vma_rb_erase(struct vm_area_struct *vma, | ||
430 | struct rb_root *root) | ||
431 | { | ||
432 | /* | ||
433 | * All rb_subtree_gap values must be consistent prior to erase, | ||
434 | * with the possible exception of the vma being erased. | ||
435 | */ | ||
436 | validate_mm_rb(root, vma); | ||
437 | |||
438 | __vma_rb_erase(vma, root); | ||
439 | } | ||
440 | |||
419 | /* | 441 | /* |
420 | * vma has some anon_vma assigned, and is already inserted on that | 442 | * vma has some anon_vma assigned, and is already inserted on that |
421 | * anon_vma's interval trees. | 443 | * anon_vma's interval trees. |
@@ -599,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | |||
599 | mm->map_count++; | 621 | mm->map_count++; |
600 | } | 622 | } |
601 | 623 | ||
602 | static inline void | 624 | static __always_inline void __vma_unlink_common(struct mm_struct *mm, |
603 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | 625 | struct vm_area_struct *vma, |
604 | struct vm_area_struct *prev) | 626 | struct vm_area_struct *prev, |
627 | bool has_prev, | ||
628 | struct vm_area_struct *ignore) | ||
605 | { | 629 | { |
606 | struct vm_area_struct *next; | 630 | struct vm_area_struct *next; |
607 | 631 | ||
608 | vma_rb_erase(vma, &mm->mm_rb); | 632 | vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); |
609 | prev->vm_next = next = vma->vm_next; | 633 | next = vma->vm_next; |
634 | if (has_prev) | ||
635 | prev->vm_next = next; | ||
636 | else { | ||
637 | prev = vma->vm_prev; | ||
638 | if (prev) | ||
639 | prev->vm_next = next; | ||
640 | else | ||
641 | mm->mmap = next; | ||
642 | } | ||
610 | if (next) | 643 | if (next) |
611 | next->vm_prev = prev; | 644 | next->vm_prev = prev; |
612 | 645 | ||
@@ -614,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
614 | vmacache_invalidate(mm); | 647 | vmacache_invalidate(mm); |
615 | } | 648 | } |
616 | 649 | ||
650 | static inline void __vma_unlink_prev(struct mm_struct *mm, | ||
651 | struct vm_area_struct *vma, | ||
652 | struct vm_area_struct *prev) | ||
653 | { | ||
654 | __vma_unlink_common(mm, vma, prev, true, vma); | ||
655 | } | ||
656 | |||
617 | /* | 657 | /* |
618 | * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that | 658 | * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that |
619 | * is already present in an i_mmap tree without adjusting the tree. | 659 | * is already present in an i_mmap tree without adjusting the tree. |
@@ -621,11 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
621 | * are necessary. The "insert" vma (if any) is to be inserted | 661 | * are necessary. The "insert" vma (if any) is to be inserted |
622 | * before we drop the necessary locks. | 662 | * before we drop the necessary locks. |
623 | */ | 663 | */ |
624 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, | 664 | int __vma_adjust(struct vm_area_struct *vma, unsigned long start, |
625 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 665 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, |
666 | struct vm_area_struct *expand) | ||
626 | { | 667 | { |
627 | struct mm_struct *mm = vma->vm_mm; | 668 | struct mm_struct *mm = vma->vm_mm; |
628 | struct vm_area_struct *next = vma->vm_next; | 669 | struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; |
629 | struct address_space *mapping = NULL; | 670 | struct address_space *mapping = NULL; |
630 | struct rb_root *root = NULL; | 671 | struct rb_root *root = NULL; |
631 | struct anon_vma *anon_vma = NULL; | 672 | struct anon_vma *anon_vma = NULL; |
@@ -641,9 +682,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
641 | /* | 682 | /* |
642 | * vma expands, overlapping all the next, and | 683 | * vma expands, overlapping all the next, and |
643 | * perhaps the one after too (mprotect case 6). | 684 | * perhaps the one after too (mprotect case 6). |
685 | * The only other cases that get here are | ||
686 | * case 1, case 7 and case 8. | ||
644 | */ | 687 | */ |
645 | remove_next = 1 + (end > next->vm_end); | 688 | if (next == expand) { |
646 | end = next->vm_end; | 689 | /* |
690 | * The only case where we don't expand "vma" | ||
691 | * and we expand "next" instead is case 8. | ||
692 | */ | ||
693 | VM_WARN_ON(end != next->vm_end); | ||
694 | /* | ||
695 | * remove_next == 3 means we're | ||
696 | * removing "vma" and that to do so we | ||
697 | * swapped "vma" and "next". | ||
698 | */ | ||
699 | remove_next = 3; | ||
700 | VM_WARN_ON(file != next->vm_file); | ||
701 | swap(vma, next); | ||
702 | } else { | ||
703 | VM_WARN_ON(expand != vma); | ||
704 | /* | ||
705 | * case 1, 6, 7, remove_next == 2 is case 6, | ||
706 | * remove_next == 1 is case 1 or 7. | ||
707 | */ | ||
708 | remove_next = 1 + (end > next->vm_end); | ||
709 | VM_WARN_ON(remove_next == 2 && | ||
710 | end != next->vm_next->vm_end); | ||
711 | VM_WARN_ON(remove_next == 1 && | ||
712 | end != next->vm_end); | ||
713 | /* trim end to next, for case 6 first pass */ | ||
714 | end = next->vm_end; | ||
715 | } | ||
716 | |||
647 | exporter = next; | 717 | exporter = next; |
648 | importer = vma; | 718 | importer = vma; |
649 | 719 | ||
@@ -651,7 +721,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
651 | * If next doesn't have anon_vma, import from vma after | 721 | * If next doesn't have anon_vma, import from vma after |
652 | * next, if the vma overlaps with it. | 722 | * next, if the vma overlaps with it. |
653 | */ | 723 | */ |
654 | if (remove_next == 2 && next && !next->anon_vma) | 724 | if (remove_next == 2 && !next->anon_vma) |
655 | exporter = next->vm_next; | 725 | exporter = next->vm_next; |
656 | 726 | ||
657 | } else if (end > next->vm_start) { | 727 | } else if (end > next->vm_start) { |
@@ -662,6 +732,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
662 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; | 732 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; |
663 | exporter = next; | 733 | exporter = next; |
664 | importer = vma; | 734 | importer = vma; |
735 | VM_WARN_ON(expand != importer); | ||
665 | } else if (end < vma->vm_end) { | 736 | } else if (end < vma->vm_end) { |
666 | /* | 737 | /* |
667 | * vma shrinks, and !insert tells it's not | 738 | * vma shrinks, and !insert tells it's not |
@@ -671,6 +742,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
671 | adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); | 742 | adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); |
672 | exporter = vma; | 743 | exporter = vma; |
673 | importer = next; | 744 | importer = next; |
745 | VM_WARN_ON(expand != importer); | ||
674 | } | 746 | } |
675 | 747 | ||
676 | /* | 748 | /* |
@@ -688,7 +760,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
688 | } | 760 | } |
689 | } | 761 | } |
690 | again: | 762 | again: |
691 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 763 | vma_adjust_trans_huge(orig_vma, start, end, adjust_next); |
692 | 764 | ||
693 | if (file) { | 765 | if (file) { |
694 | mapping = file->f_mapping; | 766 | mapping = file->f_mapping; |
@@ -714,8 +786,8 @@ again: | |||
714 | if (!anon_vma && adjust_next) | 786 | if (!anon_vma && adjust_next) |
715 | anon_vma = next->anon_vma; | 787 | anon_vma = next->anon_vma; |
716 | if (anon_vma) { | 788 | if (anon_vma) { |
717 | VM_BUG_ON_VMA(adjust_next && next->anon_vma && | 789 | VM_WARN_ON(adjust_next && next->anon_vma && |
718 | anon_vma != next->anon_vma, next); | 790 | anon_vma != next->anon_vma); |
719 | anon_vma_lock_write(anon_vma); | 791 | anon_vma_lock_write(anon_vma); |
720 | anon_vma_interval_tree_pre_update_vma(vma); | 792 | anon_vma_interval_tree_pre_update_vma(vma); |
721 | if (adjust_next) | 793 | if (adjust_next) |
@@ -755,7 +827,19 @@ again: | |||
755 | * vma_merge has merged next into vma, and needs | 827 | * vma_merge has merged next into vma, and needs |
756 | * us to remove next before dropping the locks. | 828 | * us to remove next before dropping the locks. |
757 | */ | 829 | */ |
758 | __vma_unlink(mm, next, vma); | 830 | if (remove_next != 3) |
831 | __vma_unlink_prev(mm, next, vma); | ||
832 | else | ||
833 | /* | ||
834 | * vma is not before next if they've been | ||
835 | * swapped. | ||
836 | * | ||
837 | * pre-swap() next->vm_start was reduced so | ||
838 | * tell validate_mm_rb to ignore pre-swap() | ||
839 | * "next" (which is stored in post-swap() | ||
840 | * "vma"). | ||
841 | */ | ||
842 | __vma_unlink_common(mm, next, NULL, false, vma); | ||
759 | if (file) | 843 | if (file) |
760 | __remove_shared_vm_struct(next, file, mapping); | 844 | __remove_shared_vm_struct(next, file, mapping); |
761 | } else if (insert) { | 845 | } else if (insert) { |
@@ -807,7 +891,27 @@ again: | |||
807 | * we must remove another next too. It would clutter | 891 | * we must remove another next too. It would clutter |
808 | * up the code too much to do both in one go. | 892 | * up the code too much to do both in one go. |
809 | */ | 893 | */ |
810 | next = vma->vm_next; | 894 | if (remove_next != 3) { |
895 | /* | ||
896 | * If "next" was removed and vma->vm_end was | ||
897 | * expanded (up) over it, in turn | ||
898 | * "next->vm_prev->vm_end" changed and the | ||
899 | * "vma->vm_next" gap must be updated. | ||
900 | */ | ||
901 | next = vma->vm_next; | ||
902 | } else { | ||
903 | /* | ||
904 | * For the scope of the comment "next" and | ||
905 | * "vma" considered pre-swap(): if "vma" was | ||
906 | * removed, next->vm_start was expanded (down) | ||
907 | * over it and the "next" gap must be updated. | ||
908 | * Because of the swap() the post-swap() "vma" | ||
909 | * actually points to pre-swap() "next" | ||
910 | * (the post-swap() "next", by contrast, is now a | ||
911 | * dangling pointer). | ||
912 | */ | ||
913 | next = vma; | ||
914 | } | ||
811 | if (remove_next == 2) { | 915 | if (remove_next == 2) { |
812 | remove_next = 1; | 916 | remove_next = 1; |
813 | end = next->vm_end; | 917 | end = next->vm_end; |
@@ -815,8 +919,28 @@ again: | |||
815 | } | 919 | } |
816 | else if (next) | 920 | else if (next) |
817 | vma_gap_update(next); | 921 | vma_gap_update(next); |
818 | else | 922 | else { |
819 | mm->highest_vm_end = end; | 923 | /* |
924 | * If remove_next == 2 we obviously can't | ||
925 | * reach this path. | ||
926 | * | ||
927 | * If remove_next == 3 we can't reach this | ||
928 | * path because pre-swap() next is always not | ||
929 | * NULL. pre-swap() "next" is not being | ||
930 | * removed and its next->vm_end is not altered | ||
931 | * (and furthermore "end" already matches | ||
932 | * next->vm_end in remove_next == 3). | ||
933 | * | ||
934 | * We reach this only in the remove_next == 1 | ||
935 | * case if the "next" vma that was removed was | ||
936 | * the highest vma of the mm. However in such | ||
937 | * case next->vm_end == "end" and the extended | ||
938 | * "vma" has vma->vm_end == next->vm_end so | ||
939 | * mm->highest_vm_end doesn't need any update | ||
940 | * in remove_next == 1 case. | ||
941 | */ | ||
942 | VM_WARN_ON(mm->highest_vm_end != end); | ||
943 | } | ||
820 | } | 944 | } |
821 | if (insert && file) | 945 | if (insert && file) |
822 | uprobe_mmap(insert); | 946 | uprobe_mmap(insert); |
@@ -936,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | |||
936 | * cannot merge might become might become might become | 1060 | * cannot merge might become might become might become |
937 | * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or | 1061 | * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or |
938 | * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or | 1062 | * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or |
939 | * mremap move: PPPPNNNNNNNN 8 | 1063 | * mremap move: PPPPXXXXXXXX 8 |
940 | * AAAA | 1064 | * AAAA |
941 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN | 1065 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN |
942 | * might become case 1 below case 2 below case 3 below | 1066 | * might become case 1 below case 2 below case 3 below |
943 | * | 1067 | * |
944 | * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: | 1068 | * It is important for case 8 that the vma NNNN overlapping the |
945 | * mprotect_fixup updates vm_flags & vm_page_prot on successful return. | 1069 | * region AAAA is never going to be extended over XXXX. Instead XXXX must |
1070 | * be extended in region AAAA and NNNN must be removed. This way in | ||
1071 | * all cases where vma_merge succeeds, the moment vma_adjust drops the | ||
1072 | * rmap_locks, the properties of the merged vma will be already | ||
1073 | * correct for the whole merged range. Some of those properties like | ||
1074 | * vm_page_prot/vm_flags may be accessed by rmap_walks and they must | ||
1075 | * be correct for the whole merged range immediately after the | ||
1076 | * rmap_locks are released. Otherwise if XXXX would be removed and | ||
1077 | * NNNN would be extended over the XXXX range, remove_migration_ptes | ||
1078 | * or other rmap walkers (if working on addresses beyond the "end" | ||
1079 | * parameter) may establish ptes with the wrong permissions of NNNN | ||
1080 | * instead of the right permissions of XXXX. | ||
946 | */ | 1081 | */ |
947 | struct vm_area_struct *vma_merge(struct mm_struct *mm, | 1082 | struct vm_area_struct *vma_merge(struct mm_struct *mm, |
948 | struct vm_area_struct *prev, unsigned long addr, | 1083 | struct vm_area_struct *prev, unsigned long addr, |
@@ -967,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
967 | else | 1102 | else |
968 | next = mm->mmap; | 1103 | next = mm->mmap; |
969 | area = next; | 1104 | area = next; |
970 | if (next && next->vm_end == end) /* cases 6, 7, 8 */ | 1105 | if (area && area->vm_end == end) /* cases 6, 7, 8 */ |
971 | next = next->vm_next; | 1106 | next = next->vm_next; |
972 | 1107 | ||
1108 | /* verify some invariant that must be enforced by the caller */ | ||
1109 | VM_WARN_ON(prev && addr <= prev->vm_start); | ||
1110 | VM_WARN_ON(area && end > area->vm_end); | ||
1111 | VM_WARN_ON(addr >= end); | ||
1112 | |||
973 | /* | 1113 | /* |
974 | * Can it merge with the predecessor? | 1114 | * Can it merge with the predecessor? |
975 | */ | 1115 | */ |
@@ -990,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
990 | is_mergeable_anon_vma(prev->anon_vma, | 1130 | is_mergeable_anon_vma(prev->anon_vma, |
991 | next->anon_vma, NULL)) { | 1131 | next->anon_vma, NULL)) { |
992 | /* cases 1, 6 */ | 1132 | /* cases 1, 6 */ |
993 | err = vma_adjust(prev, prev->vm_start, | 1133 | err = __vma_adjust(prev, prev->vm_start, |
994 | next->vm_end, prev->vm_pgoff, NULL); | 1134 | next->vm_end, prev->vm_pgoff, NULL, |
1135 | prev); | ||
995 | } else /* cases 2, 5, 7 */ | 1136 | } else /* cases 2, 5, 7 */ |
996 | err = vma_adjust(prev, prev->vm_start, | 1137 | err = __vma_adjust(prev, prev->vm_start, |
997 | end, prev->vm_pgoff, NULL); | 1138 | end, prev->vm_pgoff, NULL, prev); |
998 | if (err) | 1139 | if (err) |
999 | return NULL; | 1140 | return NULL; |
1000 | khugepaged_enter_vma_merge(prev, vm_flags); | 1141 | khugepaged_enter_vma_merge(prev, vm_flags); |
@@ -1010,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1010 | anon_vma, file, pgoff+pglen, | 1151 | anon_vma, file, pgoff+pglen, |
1011 | vm_userfaultfd_ctx)) { | 1152 | vm_userfaultfd_ctx)) { |
1012 | if (prev && addr < prev->vm_end) /* case 4 */ | 1153 | if (prev && addr < prev->vm_end) /* case 4 */ |
1013 | err = vma_adjust(prev, prev->vm_start, | 1154 | err = __vma_adjust(prev, prev->vm_start, |
1014 | addr, prev->vm_pgoff, NULL); | 1155 | addr, prev->vm_pgoff, NULL, next); |
1015 | else /* cases 3, 8 */ | 1156 | else { /* cases 3, 8 */ |
1016 | err = vma_adjust(area, addr, next->vm_end, | 1157 | err = __vma_adjust(area, addr, next->vm_end, |
1017 | next->vm_pgoff - pglen, NULL); | 1158 | next->vm_pgoff - pglen, NULL, next); |
1159 | /* | ||
1160 | * In case 3 area is already equal to next and | ||
1161 | * this is a noop, but in case 8 "area" has | ||
1162 | * been removed and next was expanded over it. | ||
1163 | */ | ||
1164 | area = next; | ||
1165 | } | ||
1018 | if (err) | 1166 | if (err) |
1019 | return NULL; | 1167 | return NULL; |
1020 | khugepaged_enter_vma_merge(area, vm_flags); | 1168 | khugepaged_enter_vma_merge(area, vm_flags); |
@@ -1386,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | |||
1386 | * to the private version (using protection_map[] without the | 1534 | * to the private version (using protection_map[] without the |
1387 | * VM_SHARED bit). | 1535 | * VM_SHARED bit). |
1388 | */ | 1536 | */ |
1389 | int vma_wants_writenotify(struct vm_area_struct *vma) | 1537 | int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) |
1390 | { | 1538 | { |
1391 | vm_flags_t vm_flags = vma->vm_flags; | 1539 | vm_flags_t vm_flags = vma->vm_flags; |
1392 | const struct vm_operations_struct *vm_ops = vma->vm_ops; | 1540 | const struct vm_operations_struct *vm_ops = vma->vm_ops; |
@@ -1401,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1401 | 1549 | ||
1402 | /* The open routine did something to the protections that pgprot_modify | 1550 | /* The open routine did something to the protections that pgprot_modify |
1403 | * won't preserve? */ | 1551 | * won't preserve? */ |
1404 | if (pgprot_val(vma->vm_page_prot) != | 1552 | if (pgprot_val(vm_page_prot) != |
1405 | pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) | 1553 | pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) |
1406 | return 0; | 1554 | return 0; |
1407 | 1555 | ||
1408 | /* Do we need to track softdirty? */ | 1556 | /* Do we need to track softdirty? */ |
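In these mm/mmap.c hunks vma_adjust() grows an extra "expand" argument and becomes __vma_adjust(), so vma_merge() can say explicitly which vma must be the one extended (the case 8 swap discussed in the comment above). Call sites that have no preference presumably keep the old five-argument form through a thin wrapper that passes expand == NULL; the sketch below shows that compatibility shape with simplified stand-in types, and is an assumption about the wrapper rather than a copy of the kernel header.

	#include <stdio.h>

	/* Simplified stand-ins for the kernel types; sketch only. */
	struct vm_area_struct { unsigned long vm_start, vm_end; };
	typedef unsigned long pgoff_t;

	/* New six-argument worker: "expand" names the vma whose range should grow. */
	static int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
	                        unsigned long end, pgoff_t pgoff,
	                        struct vm_area_struct *insert,
	                        struct vm_area_struct *expand)
	{
	        (void)pgoff; (void)insert; (void)expand;
	        vma->vm_start = start;
	        vma->vm_end = end;
	        return 0;
	}

	/* Old-style entry point: "no expansion preference" maps to expand == NULL. */
	static int vma_adjust(struct vm_area_struct *vma, unsigned long start,
	                      unsigned long end, pgoff_t pgoff,
	                      struct vm_area_struct *insert)
	{
	        return __vma_adjust(vma, start, end, pgoff, insert, NULL);
	}

	int main(void)
	{
	        struct vm_area_struct v = { 0x1000, 0x2000 };

	        vma_adjust(&v, 0x1000, 0x3000, 0, NULL);
	        printf("[%#lx, %#lx)\n", v.vm_start, v.vm_end);
	        return 0;
	}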
diff --git a/mm/mprotect.c b/mm/mprotect.c index a4830f0325fe..ec91dfd3f900 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
304 | vma->vm_userfaultfd_ctx); | 304 | vma->vm_userfaultfd_ctx); |
305 | if (*pprev) { | 305 | if (*pprev) { |
306 | vma = *pprev; | 306 | vma = *pprev; |
307 | VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); | ||
307 | goto success; | 308 | goto success; |
308 | } | 309 | } |
309 | 310 | ||
@@ -327,7 +328,7 @@ success: | |||
327 | * held in write mode. | 328 | * held in write mode. |
328 | */ | 329 | */ |
329 | vma->vm_flags = newflags; | 330 | vma->vm_flags = newflags; |
330 | dirty_accountable = vma_wants_writenotify(vma); | 331 | dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); |
331 | vma_set_page_prot(vma); | 332 | vma_set_page_prot(vma); |
332 | 333 | ||
333 | change_protection(vma, start, end, vma->vm_page_prot, | 334 | change_protection(vma, start, end, vma->vm_page_prot, |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd05a70f44b9..ba609b684d7a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -11,18 +11,21 @@ | |||
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/export.h> | 14 | #include <linux/export.h> |
16 | #include <linux/kmemleak.h> | 15 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 16 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 17 | #include <linux/memblock.h> |
18 | #include <linux/bootmem.h> | ||
19 | 19 | ||
20 | #include <asm/bug.h> | 20 | #include <asm/bug.h> |
21 | #include <asm/io.h> | 21 | #include <asm/io.h> |
22 | #include <asm/processor.h> | ||
23 | 22 | ||
24 | #include "internal.h" | 23 | #include "internal.h" |
25 | 24 | ||
25 | #ifndef CONFIG_HAVE_MEMBLOCK | ||
26 | #error CONFIG_HAVE_MEMBLOCK not defined | ||
27 | #endif | ||
28 | |||
26 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 29 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
27 | struct pglist_data __refdata contig_page_data; | 30 | struct pglist_data __refdata contig_page_data; |
28 | EXPORT_SYMBOL(contig_page_data); | 31 | EXPORT_SYMBOL(contig_page_data); |
@@ -134,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void) | |||
134 | for_each_reserved_mem_region(i, &start, &end) | 137 | for_each_reserved_mem_region(i, &start, &end) |
135 | reserve_bootmem_region(start, end); | 138 | reserve_bootmem_region(start, end); |
136 | 139 | ||
140 | /* | ||
141 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id | ||
142 | * because in some cases, e.g. when Node0 has no RAM installed, | ||
143 | * low ram will be on Node1 | ||
144 | */ | ||
137 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, | 145 | for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, |
138 | NULL) | 146 | NULL) |
139 | count += __free_memory_core(start, end); | 147 | count += __free_memory_core(start, end); |
@@ -191,11 +199,6 @@ unsigned long __init free_all_bootmem(void) | |||
191 | 199 | ||
192 | reset_all_zones_managed_pages(); | 200 | reset_all_zones_managed_pages(); |
193 | 201 | ||
194 | /* | ||
195 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id | ||
196 | * because in some case like Node0 doesn't have RAM installed | ||
197 | * low ram will be on Node1 | ||
198 | */ | ||
199 | pages = free_low_memory_core_early(); | 202 | pages = free_low_memory_core_early(); |
200 | totalram_pages += pages; | 203 | totalram_pages += pages; |
201 | 204 | ||
@@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
395 | return __alloc_bootmem_node(pgdat, size, align, goal); | 398 | return __alloc_bootmem_node(pgdat, size, align, goal); |
396 | } | 399 | } |
397 | 400 | ||
398 | #ifndef ARCH_LOW_ADDRESS_LIMIT | ||
399 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | ||
400 | #endif | ||
401 | 401 | ||
402 | /** | 402 | /** |
403 | * __alloc_bootmem_low - allocate low boot memory | 403 | * __alloc_bootmem_low - allocate low boot memory |
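The mm/nobootmem.c hunk reorders the includes so <linux/memblock.h> is pulled in before <linux/bootmem.h> and adds a hard compile-time guard: this file only makes sense when CONFIG_HAVE_MEMBLOCK is set. The guard idiom in isolation looks like the sketch below; the symbol name is reused only for illustration, and you would compile it with -DCONFIG_HAVE_MEMBLOCK to get past the check.

	/* Fail the build loudly if a required configuration symbol is missing,
	 * instead of producing a subtly broken object file. */
	#include <stdio.h>

	#ifndef CONFIG_HAVE_MEMBLOCK
	#error "CONFIG_HAVE_MEMBLOCK not defined"
	#endif

	int main(void)
	{
	        puts("built with memblock support");
	        return 0;
	}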
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d53a9aa00977..ec9f11d4f094 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc) | |||
132 | return oc->order == -1; | 132 | return oc->order == -1; |
133 | } | 133 | } |
134 | 134 | ||
135 | static inline bool is_memcg_oom(struct oom_control *oc) | ||
136 | { | ||
137 | return oc->memcg != NULL; | ||
138 | } | ||
139 | |||
135 | /* return true if the task is not adequate as candidate victim task. */ | 140 | /* return true if the task is not adequate as candidate victim task. */ |
136 | static bool oom_unkillable_task(struct task_struct *p, | 141 | static bool oom_unkillable_task(struct task_struct *p, |
137 | struct mem_cgroup *memcg, const nodemask_t *nodemask) | 142 | struct mem_cgroup *memcg, const nodemask_t *nodemask) |
@@ -181,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
181 | */ | 186 | */ |
182 | adj = (long)p->signal->oom_score_adj; | 187 | adj = (long)p->signal->oom_score_adj; |
183 | if (adj == OOM_SCORE_ADJ_MIN || | 188 | if (adj == OOM_SCORE_ADJ_MIN || |
184 | test_bit(MMF_OOM_REAPED, &p->mm->flags) || | 189 | test_bit(MMF_OOM_SKIP, &p->mm->flags) || |
185 | in_vfork(p)) { | 190 | in_vfork(p)) { |
186 | task_unlock(p); | 191 | task_unlock(p); |
187 | return 0; | 192 | return 0; |
@@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
213 | return points > 0 ? points : 1; | 218 | return points > 0 ? points : 1; |
214 | } | 219 | } |
215 | 220 | ||
221 | enum oom_constraint { | ||
222 | CONSTRAINT_NONE, | ||
223 | CONSTRAINT_CPUSET, | ||
224 | CONSTRAINT_MEMORY_POLICY, | ||
225 | CONSTRAINT_MEMCG, | ||
226 | }; | ||
227 | |||
216 | /* | 228 | /* |
217 | * Determine the type of allocation constraint. | 229 | * Determine the type of allocation constraint. |
218 | */ | 230 | */ |
219 | #ifdef CONFIG_NUMA | 231 | static enum oom_constraint constrained_alloc(struct oom_control *oc) |
220 | static enum oom_constraint constrained_alloc(struct oom_control *oc, | ||
221 | unsigned long *totalpages) | ||
222 | { | 232 | { |
223 | struct zone *zone; | 233 | struct zone *zone; |
224 | struct zoneref *z; | 234 | struct zoneref *z; |
@@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, | |||
226 | bool cpuset_limited = false; | 236 | bool cpuset_limited = false; |
227 | int nid; | 237 | int nid; |
228 | 238 | ||
239 | if (is_memcg_oom(oc)) { | ||
240 | oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; | ||
241 | return CONSTRAINT_MEMCG; | ||
242 | } | ||
243 | |||
229 | /* Default to all available memory */ | 244 | /* Default to all available memory */ |
230 | *totalpages = totalram_pages + total_swap_pages; | 245 | oc->totalpages = totalram_pages + total_swap_pages; |
246 | |||
247 | if (!IS_ENABLED(CONFIG_NUMA)) | ||
248 | return CONSTRAINT_NONE; | ||
231 | 249 | ||
232 | if (!oc->zonelist) | 250 | if (!oc->zonelist) |
233 | return CONSTRAINT_NONE; | 251 | return CONSTRAINT_NONE; |
@@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, | |||
246 | */ | 264 | */ |
247 | if (oc->nodemask && | 265 | if (oc->nodemask && |
248 | !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { | 266 | !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { |
249 | *totalpages = total_swap_pages; | 267 | oc->totalpages = total_swap_pages; |
250 | for_each_node_mask(nid, *oc->nodemask) | 268 | for_each_node_mask(nid, *oc->nodemask) |
251 | *totalpages += node_spanned_pages(nid); | 269 | oc->totalpages += node_spanned_pages(nid); |
252 | return CONSTRAINT_MEMORY_POLICY; | 270 | return CONSTRAINT_MEMORY_POLICY; |
253 | } | 271 | } |
254 | 272 | ||
@@ -259,98 +277,84 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, | |||
259 | cpuset_limited = true; | 277 | cpuset_limited = true; |
260 | 278 | ||
261 | if (cpuset_limited) { | 279 | if (cpuset_limited) { |
262 | *totalpages = total_swap_pages; | 280 | oc->totalpages = total_swap_pages; |
263 | for_each_node_mask(nid, cpuset_current_mems_allowed) | 281 | for_each_node_mask(nid, cpuset_current_mems_allowed) |
264 | *totalpages += node_spanned_pages(nid); | 282 | oc->totalpages += node_spanned_pages(nid); |
265 | return CONSTRAINT_CPUSET; | 283 | return CONSTRAINT_CPUSET; |
266 | } | 284 | } |
267 | return CONSTRAINT_NONE; | 285 | return CONSTRAINT_NONE; |
268 | } | 286 | } |
269 | #else | ||
270 | static enum oom_constraint constrained_alloc(struct oom_control *oc, | ||
271 | unsigned long *totalpages) | ||
272 | { | ||
273 | *totalpages = totalram_pages + total_swap_pages; | ||
274 | return CONSTRAINT_NONE; | ||
275 | } | ||
276 | #endif | ||
277 | 287 | ||
278 | enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, | 288 | static int oom_evaluate_task(struct task_struct *task, void *arg) |
279 | struct task_struct *task) | ||
280 | { | 289 | { |
290 | struct oom_control *oc = arg; | ||
291 | unsigned long points; | ||
292 | |||
281 | if (oom_unkillable_task(task, NULL, oc->nodemask)) | 293 | if (oom_unkillable_task(task, NULL, oc->nodemask)) |
282 | return OOM_SCAN_CONTINUE; | 294 | goto next; |
283 | 295 | ||
284 | /* | 296 | /* |
285 | * This task already has access to memory reserves and is being killed. | 297 | * This task already has access to memory reserves and is being killed. |
286 | * Don't allow any other task to have access to the reserves unless | 298 | * Don't allow any other task to have access to the reserves unless |
287 | * the task has MMF_OOM_REAPED because chances that it would release | 299 | * the task has MMF_OOM_SKIP because chances that it would release |
288 | * any memory is quite low. | 300 | * any memory is quite low. |
289 | */ | 301 | */ |
290 | if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { | 302 | if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { |
291 | struct task_struct *p = find_lock_task_mm(task); | 303 | if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) |
292 | enum oom_scan_t ret = OOM_SCAN_ABORT; | 304 | goto next; |
293 | 305 | goto abort; | |
294 | if (p) { | ||
295 | if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) | ||
296 | ret = OOM_SCAN_CONTINUE; | ||
297 | task_unlock(p); | ||
298 | } | ||
299 | |||
300 | return ret; | ||
301 | } | 306 | } |
302 | 307 | ||
303 | /* | 308 | /* |
304 | * If task is allocating a lot of memory and has been marked to be | 309 | * If task is allocating a lot of memory and has been marked to be |
305 | * killed first if it triggers an oom, then select it. | 310 | * killed first if it triggers an oom, then select it. |
306 | */ | 311 | */ |
307 | if (oom_task_origin(task)) | 312 | if (oom_task_origin(task)) { |
308 | return OOM_SCAN_SELECT; | 313 | points = ULONG_MAX; |
314 | goto select; | ||
315 | } | ||
309 | 316 | ||
310 | return OOM_SCAN_OK; | 317 | points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); |
318 | if (!points || points < oc->chosen_points) | ||
319 | goto next; | ||
320 | |||
321 | /* Prefer thread group leaders for display purposes */ | ||
322 | if (points == oc->chosen_points && thread_group_leader(oc->chosen)) | ||
323 | goto next; | ||
324 | select: | ||
325 | if (oc->chosen) | ||
326 | put_task_struct(oc->chosen); | ||
327 | get_task_struct(task); | ||
328 | oc->chosen = task; | ||
329 | oc->chosen_points = points; | ||
330 | next: | ||
331 | return 0; | ||
332 | abort: | ||
333 | if (oc->chosen) | ||
334 | put_task_struct(oc->chosen); | ||
335 | oc->chosen = (void *)-1UL; | ||
336 | return 1; | ||
311 | } | 337 | } |
312 | 338 | ||
313 | /* | 339 | /* |
314 | * Simple selection loop. We chose the process with the highest | 340 | * Simple selection loop. We choose the process with the highest number of |
315 | * number of 'points'. Returns -1 on scan abort. | 341 | * 'points'. In case scan was aborted, oc->chosen is set to -1. |
316 | */ | 342 | */ |
317 | static struct task_struct *select_bad_process(struct oom_control *oc, | 343 | static void select_bad_process(struct oom_control *oc) |
318 | unsigned int *ppoints, unsigned long totalpages) | ||
319 | { | 344 | { |
320 | struct task_struct *p; | 345 | if (is_memcg_oom(oc)) |
321 | struct task_struct *chosen = NULL; | 346 | mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); |
322 | unsigned long chosen_points = 0; | 347 | else { |
323 | 348 | struct task_struct *p; | |
324 | rcu_read_lock(); | ||
325 | for_each_process(p) { | ||
326 | unsigned int points; | ||
327 | |||
328 | switch (oom_scan_process_thread(oc, p)) { | ||
329 | case OOM_SCAN_SELECT: | ||
330 | chosen = p; | ||
331 | chosen_points = ULONG_MAX; | ||
332 | /* fall through */ | ||
333 | case OOM_SCAN_CONTINUE: | ||
334 | continue; | ||
335 | case OOM_SCAN_ABORT: | ||
336 | rcu_read_unlock(); | ||
337 | return (struct task_struct *)(-1UL); | ||
338 | case OOM_SCAN_OK: | ||
339 | break; | ||
340 | }; | ||
341 | points = oom_badness(p, NULL, oc->nodemask, totalpages); | ||
342 | if (!points || points < chosen_points) | ||
343 | continue; | ||
344 | 349 | ||
345 | chosen = p; | 350 | rcu_read_lock(); |
346 | chosen_points = points; | 351 | for_each_process(p) |
352 | if (oom_evaluate_task(p, oc)) | ||
353 | break; | ||
354 | rcu_read_unlock(); | ||
347 | } | 355 | } |
348 | if (chosen) | ||
349 | get_task_struct(chosen); | ||
350 | rcu_read_unlock(); | ||
351 | 356 | ||
352 | *ppoints = chosen_points * 1000 / totalpages; | 357 | oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; |
353 | return chosen; | ||
354 | } | 358 | } |
355 | 359 | ||
356 | /** | 360 | /** |
@@ -399,9 +403,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
399 | 403 | ||
400 | static void dump_header(struct oom_control *oc, struct task_struct *p) | 404 | static void dump_header(struct oom_control *oc, struct task_struct *p) |
401 | { | 405 | { |
402 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", | 406 | nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed; |
403 | current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, | 407 | |
408 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", | ||
409 | current->comm, oc->gfp_mask, &oc->gfp_mask, | ||
410 | nodemask_pr_args(nm), oc->order, | ||
404 | current->signal->oom_score_adj); | 411 | current->signal->oom_score_adj); |
412 | if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) | ||
413 | pr_warn("COMPACTION is disabled!!!\n"); | ||
405 | 414 | ||
406 | cpuset_print_current_mems_allowed(); | 415 | cpuset_print_current_mems_allowed(); |
407 | dump_stack(); | 416 | dump_stack(); |
@@ -419,7 +428,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) | |||
419 | static atomic_t oom_victims = ATOMIC_INIT(0); | 428 | static atomic_t oom_victims = ATOMIC_INIT(0); |
420 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | 429 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); |
421 | 430 | ||
422 | bool oom_killer_disabled __read_mostly; | 431 | static bool oom_killer_disabled __read_mostly; |
423 | 432 | ||
424 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 433 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
425 | 434 | ||
@@ -452,12 +461,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); | |||
452 | static struct task_struct *oom_reaper_list; | 461 | static struct task_struct *oom_reaper_list; |
453 | static DEFINE_SPINLOCK(oom_reaper_lock); | 462 | static DEFINE_SPINLOCK(oom_reaper_lock); |
454 | 463 | ||
455 | static bool __oom_reap_task(struct task_struct *tsk) | 464 | static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) |
456 | { | 465 | { |
457 | struct mmu_gather tlb; | 466 | struct mmu_gather tlb; |
458 | struct vm_area_struct *vma; | 467 | struct vm_area_struct *vma; |
459 | struct mm_struct *mm = NULL; | ||
460 | struct task_struct *p; | ||
461 | struct zap_details details = {.check_swap_entries = true, | 468 | struct zap_details details = {.check_swap_entries = true, |
462 | .ignore_dirty = true}; | 469 | .ignore_dirty = true}; |
463 | bool ret = true; | 470 | bool ret = true; |
@@ -465,7 +472,7 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
465 | /* | 472 | /* |
466 | * We have to make sure to not race with the victim exit path | 473 | * We have to make sure to not race with the victim exit path |
467 | * and cause premature new oom victim selection: | 474 | * and cause premature new oom victim selection: |
468 | * __oom_reap_task exit_mm | 475 | * __oom_reap_task_mm exit_mm |
469 | * mmget_not_zero | 476 | * mmget_not_zero |
470 | * mmput | 477 | * mmput |
471 | * atomic_dec_and_test | 478 | * atomic_dec_and_test |
@@ -478,22 +485,9 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
478 | */ | 485 | */ |
479 | mutex_lock(&oom_lock); | 486 | mutex_lock(&oom_lock); |
480 | 487 | ||
481 | /* | ||
482 | * Make sure we find the associated mm_struct even when the particular | ||
483 | * thread has already terminated and cleared its mm. | ||
484 | * We might have race with exit path so consider our work done if there | ||
485 | * is no mm. | ||
486 | */ | ||
487 | p = find_lock_task_mm(tsk); | ||
488 | if (!p) | ||
489 | goto unlock_oom; | ||
490 | mm = p->mm; | ||
491 | atomic_inc(&mm->mm_count); | ||
492 | task_unlock(p); | ||
493 | |||
494 | if (!down_read_trylock(&mm->mmap_sem)) { | 488 | if (!down_read_trylock(&mm->mmap_sem)) { |
495 | ret = false; | 489 | ret = false; |
496 | goto mm_drop; | 490 | goto unlock_oom; |
497 | } | 491 | } |
498 | 492 | ||
499 | /* | 493 | /* |
@@ -503,9 +497,17 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
503 | */ | 497 | */ |
504 | if (!mmget_not_zero(mm)) { | 498 | if (!mmget_not_zero(mm)) { |
505 | up_read(&mm->mmap_sem); | 499 | up_read(&mm->mmap_sem); |
506 | goto mm_drop; | 500 | goto unlock_oom; |
507 | } | 501 | } |
508 | 502 | ||
503 | /* | ||
504 | * Tell all users of get_user/copy_from_user etc... that the content | ||
505 | * is no longer stable. No barriers really needed because unmapping | ||
506 | * should imply barriers already and the reader would hit a page fault | ||
507 | * if it stumbled over reaped memory. | ||
508 | */ | ||
509 | set_bit(MMF_UNSTABLE, &mm->flags); | ||
510 | |||
509 | tlb_gather_mmu(&tlb, mm, 0, -1); | 511 | tlb_gather_mmu(&tlb, mm, 0, -1); |
510 | for (vma = mm->mmap ; vma; vma = vma->vm_next) { | 512 | for (vma = mm->mmap ; vma; vma = vma->vm_next) { |
511 | if (is_vm_hugetlb_page(vma)) | 513 | if (is_vm_hugetlb_page(vma)) |
@@ -541,18 +543,11 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
541 | up_read(&mm->mmap_sem); | 543 | up_read(&mm->mmap_sem); |
542 | 544 | ||
543 | /* | 545 | /* |
544 | * This task can be safely ignored because we cannot do much more | ||
545 | * to release its memory. | ||
546 | */ | ||
547 | set_bit(MMF_OOM_REAPED, &mm->flags); | ||
548 | /* | ||
549 | * Drop our reference but make sure the mmput slow path is called from a | 546 | * Drop our reference but make sure the mmput slow path is called from a |
550 | * different context because we shouldn't risk we get stuck there and | 547 | * different context because we shouldn't risk we get stuck there and |
551 | * put the oom_reaper out of the way. | 548 | * put the oom_reaper out of the way. |
552 | */ | 549 | */ |
553 | mmput_async(mm); | 550 | mmput_async(mm); |
554 | mm_drop: | ||
555 | mmdrop(mm); | ||
556 | unlock_oom: | 551 | unlock_oom: |
557 | mutex_unlock(&oom_lock); | 552 | mutex_unlock(&oom_lock); |
558 | return ret; | 553 | return ret; |
@@ -562,44 +557,28 @@ unlock_oom: | |||
562 | static void oom_reap_task(struct task_struct *tsk) | 557 | static void oom_reap_task(struct task_struct *tsk) |
563 | { | 558 | { |
564 | int attempts = 0; | 559 | int attempts = 0; |
560 | struct mm_struct *mm = tsk->signal->oom_mm; | ||
565 | 561 | ||
566 | /* Retry the down_read_trylock(mmap_sem) a few times */ | 562 | /* Retry the down_read_trylock(mmap_sem) a few times */ |
567 | while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) | 563 | while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) |
568 | schedule_timeout_idle(HZ/10); | 564 | schedule_timeout_idle(HZ/10); |
569 | 565 | ||
570 | if (attempts > MAX_OOM_REAP_RETRIES) { | 566 | if (attempts <= MAX_OOM_REAP_RETRIES) |
571 | struct task_struct *p; | 567 | goto done; |
572 | 568 | ||
573 | pr_info("oom_reaper: unable to reap pid:%d (%s)\n", | ||
574 | task_pid_nr(tsk), tsk->comm); | ||
575 | 569 | ||
576 | /* | 570 | pr_info("oom_reaper: unable to reap pid:%d (%s)\n", |
577 | * If we've already tried to reap this task in the past and | 571 | task_pid_nr(tsk), tsk->comm); |
578 | * failed it probably doesn't make much sense to try yet again | 572 | debug_show_all_locks(); |
579 | * so hide the mm from the oom killer so that it can move on | ||
580 | * to another task with a different mm struct. | ||
581 | */ | ||
582 | p = find_lock_task_mm(tsk); | ||
583 | if (p) { | ||
584 | if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { | ||
585 | pr_info("oom_reaper: giving up pid:%d (%s)\n", | ||
586 | task_pid_nr(tsk), tsk->comm); | ||
587 | set_bit(MMF_OOM_REAPED, &p->mm->flags); | ||
588 | } | ||
589 | task_unlock(p); | ||
590 | } | ||
591 | 573 | ||
592 | debug_show_all_locks(); | 574 | done: |
593 | } | 575 | tsk->oom_reaper_list = NULL; |
594 | 576 | ||
595 | /* | 577 | /* |
596 | * Clear TIF_MEMDIE because the task shouldn't be sitting on a | 578 | * Hide this mm from OOM killer because it has been either reaped or |
597 | * reasonably reclaimable memory anymore or it is not a good candidate | 579 | * somebody can't call up_write(mmap_sem). |
598 | * for the oom victim right now because it cannot release its memory | ||
599 | * itself nor by the oom reaper. | ||
600 | */ | 580 | */ |
601 | tsk->oom_reaper_list = NULL; | 581 | set_bit(MMF_OOM_SKIP, &mm->flags); |
602 | exit_oom_victim(tsk); | ||
603 | 582 | ||
604 | /* Drop a reference taken by wake_oom_reaper */ | 583 | /* Drop a reference taken by wake_oom_reaper */ |
605 | put_task_struct(tsk); | 584 | put_task_struct(tsk); |
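oom_reap_task() above now retries __oom_reap_task_mm() a bounded number of times and, whether or not reaping succeeded, finally sets MMF_OOM_SKIP so the next OOM selection can move past this mm. The retry-then-flag shape, reduced to a runnable model in which the attempt limit, the bit number and the "sleep" are placeholders:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_OOM_REAP_RETRIES 10         /* placeholder limit */
	#define MMF_OOM_SKIP         21         /* placeholder bit   */

	static bool try_reap(int attempt)
	{
	        /* Model: pretend the mmap_sem trylock succeeds on the third try. */
	        return attempt >= 3;
	}

	static void oom_reap_task(unsigned long *mm_flags)
	{
	        int attempts = 0;

	        while (attempts++ < MAX_OOM_REAP_RETRIES && !try_reap(attempts))
	                ;                       /* the kernel sleeps ~HZ/10 here */

	        if (attempts > MAX_OOM_REAP_RETRIES)
	                printf("unable to reap\n");

	        /* Either way, hide this mm from further OOM victim selection. */
	        *mm_flags |= 1UL << MMF_OOM_SKIP;
	}

	int main(void)
	{
	        unsigned long flags = 0;

	        oom_reap_task(&flags);
	        printf("MMF_OOM_SKIP set: %d\n", !!(flags & (1UL << MMF_OOM_SKIP)));
	        return 0;
	}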
@@ -607,8 +586,6 @@ static void oom_reap_task(struct task_struct *tsk) | |||
607 | 586 | ||
608 | static int oom_reaper(void *unused) | 587 | static int oom_reaper(void *unused) |
609 | { | 588 | { |
610 | set_freezable(); | ||
611 | |||
612 | while (true) { | 589 | while (true) { |
613 | struct task_struct *tsk = NULL; | 590 | struct task_struct *tsk = NULL; |
614 | 591 | ||
@@ -627,7 +604,7 @@ static int oom_reaper(void *unused) | |||
627 | return 0; | 604 | return 0; |
628 | } | 605 | } |
629 | 606 | ||
630 | void wake_oom_reaper(struct task_struct *tsk) | 607 | static void wake_oom_reaper(struct task_struct *tsk) |
631 | { | 608 | { |
632 | if (!oom_reaper_th) | 609 | if (!oom_reaper_th) |
633 | return; | 610 | return; |
@@ -656,7 +633,11 @@ static int __init oom_init(void) | |||
656 | return 0; | 633 | return 0; |
657 | } | 634 | } |
658 | subsys_initcall(oom_init) | 635 | subsys_initcall(oom_init) |
659 | #endif | 636 | #else |
637 | static inline void wake_oom_reaper(struct task_struct *tsk) | ||
638 | { | ||
639 | } | ||
640 | #endif /* CONFIG_MMU */ | ||
660 | 641 | ||
661 | /** | 642 | /** |
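wake_oom_reaper() becomes static here and gains an empty static inline stub for the !CONFIG_MMU build, so its callers need no #ifdefs of their own. The idiom in isolation, with generic names; compiling with or without -DCONFIG_MMU exercises the two branches.

	#include <stdio.h>

	struct task_struct { int pid; };

	#ifdef CONFIG_MMU
	static void wake_oom_reaper(struct task_struct *tsk)
	{
	        printf("queueing pid %d for the reaper\n", tsk->pid);
	}
	#else
	/* No MMU, no reaper: keep call sites unconditional with an empty stub. */
	static inline void wake_oom_reaper(struct task_struct *tsk)
	{
	        (void)tsk;
	}
	#endif

	int main(void)
	{
	        struct task_struct tsk = { .pid = 42 };

	        wake_oom_reaper(&tsk);          /* compiles identically either way */
	        return 0;
	}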
662 | * mark_oom_victim - mark the given task as OOM victim | 643 | * mark_oom_victim - mark the given task as OOM victim |
@@ -664,14 +645,23 @@ subsys_initcall(oom_init) | |||
664 | * | 645 | * |
665 | * Has to be called with oom_lock held and never after | 646 | * Has to be called with oom_lock held and never after |
666 | * oom has been disabled already. | 647 | * oom has been disabled already. |
648 | * | ||
649 | * tsk->mm has to be non NULL and caller has to guarantee it is stable (either | ||
650 | * under task_lock or operate on the current). | ||
667 | */ | 651 | */ |
668 | void mark_oom_victim(struct task_struct *tsk) | 652 | static void mark_oom_victim(struct task_struct *tsk) |
669 | { | 653 | { |
654 | struct mm_struct *mm = tsk->mm; | ||
655 | |||
670 | WARN_ON(oom_killer_disabled); | 656 | WARN_ON(oom_killer_disabled); |
671 | /* OOM killer might race with memcg OOM */ | 657 | /* OOM killer might race with memcg OOM */ |
672 | if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) | 658 | if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) |
673 | return; | 659 | return; |
674 | atomic_inc(&tsk->signal->oom_victims); | 660 | |
661 | /* oom_mm is bound to the signal struct lifetime. */ | ||
662 | if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) | ||
663 | atomic_inc(&tsk->signal->oom_mm->mm_count); | ||
664 | |||
675 | /* | 665 | /* |
676 | * Make sure that the task is woken up from uninterruptible sleep | 666 | * Make sure that the task is woken up from uninterruptible sleep |
677 | * if it is frozen because OOM killer wouldn't be able to free | 667 | * if it is frozen because OOM killer wouldn't be able to free |
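mark_oom_victim() now publishes tsk->signal->oom_mm at most once with cmpxchg() and takes the extra mm_count reference only when it is the first to install the pointer, so the mm stays pinned for the reaper even after the task exits. A userspace sketch of the publish-once-and-pin idea using C11 atomics (struct and symbol names are invented):

#include <stdatomic.h>
#include <stdio.h>

struct mm { atomic_int refcount; };

/* Shared slot, analogous to signal->oom_mm; published at most once. */
static _Atomic(struct mm *) oom_mm;

static void publish_victim_mm(struct mm *mm)
{
        struct mm *expected = NULL;

        /* Only the winner of the race pins the mm with an extra reference. */
        if (atomic_compare_exchange_strong(&oom_mm, &expected, mm))
                atomic_fetch_add(&mm->refcount, 1);
}

int main(void)
{
        struct mm m = { .refcount = 1 };

        publish_victim_mm(&m);
        publish_victim_mm(&m);          /* second call is a no-op */
        printf("refcount=%d\n", atomic_load(&m.refcount));      /* 2, not 3 */
        return 0;
}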
@@ -685,21 +675,29 @@ void mark_oom_victim(struct task_struct *tsk) | |||
685 | /** | 675 | /** |
686 | * exit_oom_victim - note the exit of an OOM victim | 676 | * exit_oom_victim - note the exit of an OOM victim |
687 | */ | 677 | */ |
688 | void exit_oom_victim(struct task_struct *tsk) | 678 | void exit_oom_victim(void) |
689 | { | 679 | { |
690 | if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) | 680 | clear_thread_flag(TIF_MEMDIE); |
691 | return; | ||
692 | atomic_dec(&tsk->signal->oom_victims); | ||
693 | 681 | ||
694 | if (!atomic_dec_return(&oom_victims)) | 682 | if (!atomic_dec_return(&oom_victims)) |
695 | wake_up_all(&oom_victims_wait); | 683 | wake_up_all(&oom_victims_wait); |
696 | } | 684 | } |
697 | 685 | ||
698 | /** | 686 | /** |
687 | * oom_killer_enable - enable OOM killer | ||
688 | */ | ||
689 | void oom_killer_enable(void) | ||
690 | { | ||
691 | oom_killer_disabled = false; | ||
692 | } | ||
693 | |||
694 | /** | ||
699 | * oom_killer_disable - disable OOM killer | 695 | * oom_killer_disable - disable OOM killer |
696 | * @timeout: maximum timeout to wait for oom victims in jiffies | ||
700 | * | 697 | * |
701 | * Forces all page allocations to fail rather than trigger OOM killer. | 698 | * Forces all page allocations to fail rather than trigger OOM killer. |
702 | * Will block and wait until all OOM victims are killed. | 699 | * Will block and wait until all OOM victims are killed or the given |
700 | * timeout expires. | ||
703 | * | 701 | * |
704 | * The function cannot be called when there are runnable user tasks because | 702 | * The function cannot be called when there are runnable user tasks because |
705 | * the userspace would see unexpected allocation failures as a result. Any | 703 | * the userspace would see unexpected allocation failures as a result. Any |
@@ -708,8 +706,10 @@ void exit_oom_victim(struct task_struct *tsk) | |||
708 | * Returns true if successful and false if the OOM killer cannot be | 706 | * Returns true if successful and false if the OOM killer cannot be |
709 | * disabled. | 707 | * disabled. |
710 | */ | 708 | */ |
711 | bool oom_killer_disable(void) | 709 | bool oom_killer_disable(signed long timeout) |
712 | { | 710 | { |
711 | signed long ret; | ||
712 | |||
713 | /* | 713 | /* |
714 | * Make sure to not race with an ongoing OOM killer. Check that the | 714 | * Make sure to not race with an ongoing OOM killer. Check that the |
715 | * current is not killed (possibly due to sharing the victim's memory). | 715 | * current is not killed (possibly due to sharing the victim's memory). |
@@ -719,19 +719,16 @@ bool oom_killer_disable(void) | |||
719 | oom_killer_disabled = true; | 719 | oom_killer_disabled = true; |
720 | mutex_unlock(&oom_lock); | 720 | mutex_unlock(&oom_lock); |
721 | 721 | ||
722 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); | 722 | ret = wait_event_interruptible_timeout(oom_victims_wait, |
723 | !atomic_read(&oom_victims), timeout); | ||
724 | if (ret <= 0) { | ||
725 | oom_killer_enable(); | ||
726 | return false; | ||
727 | } | ||
723 | 728 | ||
724 | return true; | 729 | return true; |
725 | } | 730 | } |
726 | 731 | ||
727 | /** | ||
728 | * oom_killer_enable - enable OOM killer | ||
729 | */ | ||
730 | void oom_killer_enable(void) | ||
731 | { | ||
732 | oom_killer_disabled = false; | ||
733 | } | ||
734 | |||
735 | static inline bool __task_will_free_mem(struct task_struct *task) | 732 | static inline bool __task_will_free_mem(struct task_struct *task) |
736 | { | 733 | { |
737 | struct signal_struct *sig = task->signal; | 734 | struct signal_struct *sig = task->signal; |
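oom_killer_disable() now takes a timeout and waits with wait_event_interruptible_timeout(); if the victims do not drain in time (or the wait is interrupted) the disable is rolled back through oom_killer_enable() and false is returned. Roughly the same disable/wait-with-deadline/roll-back shape in userspace C with pthreads (all names here are local to the sketch):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int victims;                     /* analogous to oom_victims */
static bool killer_disabled;            /* read elsewhere in the real code */

static bool killer_disable(long timeout_ms)
{
        struct timespec deadline;
        int err = 0;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec  += timeout_ms / 1000;
        deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
        if (deadline.tv_nsec >= 1000000000L) {
                deadline.tv_sec++;
                deadline.tv_nsec -= 1000000000L;
        }

        pthread_mutex_lock(&lock);
        killer_disabled = true;
        while (victims > 0 && err == 0)
                err = pthread_cond_timedwait(&cond, &lock, &deadline);
        if (victims > 0) {
                killer_disabled = false;    /* roll back, like oom_killer_enable() */
                pthread_mutex_unlock(&lock);
                return false;
        }
        pthread_mutex_unlock(&lock);
        return true;
}

int main(void)
{
        victims = 1;                    /* pretend one victim never exits */
        printf("disable %s\n", killer_disable(100) ? "succeeded" : "timed out");
        return 0;
}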
@@ -760,7 +757,7 @@ static inline bool __task_will_free_mem(struct task_struct *task) | |||
760 | * Caller has to make sure that task->mm is stable (hold task_lock or | 757 | * Caller has to make sure that task->mm is stable (hold task_lock or |
761 | * it operates on the current). | 758 | * it operates on the current). |
762 | */ | 759 | */ |
763 | bool task_will_free_mem(struct task_struct *task) | 760 | static bool task_will_free_mem(struct task_struct *task) |
764 | { | 761 | { |
765 | struct mm_struct *mm = task->mm; | 762 | struct mm_struct *mm = task->mm; |
766 | struct task_struct *p; | 763 | struct task_struct *p; |
@@ -781,15 +778,16 @@ bool task_will_free_mem(struct task_struct *task) | |||
781 | * This task has already been drained by the oom reaper so there are | 778 | * This task has already been drained by the oom reaper so there are |
782 | * only small chances it will free some more | 779 | * only small chances it will free some more |
783 | */ | 780 | */ |
784 | if (test_bit(MMF_OOM_REAPED, &mm->flags)) | 781 | if (test_bit(MMF_OOM_SKIP, &mm->flags)) |
785 | return false; | 782 | return false; |
786 | 783 | ||
787 | if (atomic_read(&mm->mm_users) <= 1) | 784 | if (atomic_read(&mm->mm_users) <= 1) |
788 | return true; | 785 | return true; |
789 | 786 | ||
790 | /* | 787 | /* |
791 | * This is really pessimistic but we do not have any reliable way | 788 | * Make sure that all tasks which share the mm with the given tasks |
792 | * to check that external processes share with our mm | 789 | * are dying as well to make sure that a) nobody pins its mm and |
790 | * b) the task is also reapable by the oom reaper. | ||
793 | */ | 791 | */ |
794 | rcu_read_lock(); | 792 | rcu_read_lock(); |
795 | for_each_process(p) { | 793 | for_each_process(p) { |
@@ -806,14 +804,10 @@ bool task_will_free_mem(struct task_struct *task) | |||
806 | return ret; | 804 | return ret; |
807 | } | 805 | } |
808 | 806 | ||
809 | /* | 807 | static void oom_kill_process(struct oom_control *oc, const char *message) |
810 | * Must be called while holding a reference to p, which will be released upon | ||
811 | * returning. | ||
812 | */ | ||
813 | void oom_kill_process(struct oom_control *oc, struct task_struct *p, | ||
814 | unsigned int points, unsigned long totalpages, | ||
815 | const char *message) | ||
816 | { | 808 | { |
809 | struct task_struct *p = oc->chosen; | ||
810 | unsigned int points = oc->chosen_points; | ||
817 | struct task_struct *victim = p; | 811 | struct task_struct *victim = p; |
818 | struct task_struct *child; | 812 | struct task_struct *child; |
819 | struct task_struct *t; | 813 | struct task_struct *t; |
@@ -860,7 +854,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
860 | * oom_badness() returns 0 if the thread is unkillable | 854 | * oom_badness() returns 0 if the thread is unkillable |
861 | */ | 855 | */ |
862 | child_points = oom_badness(child, | 856 | child_points = oom_badness(child, |
863 | oc->memcg, oc->nodemask, totalpages); | 857 | oc->memcg, oc->nodemask, oc->totalpages); |
864 | if (child_points > victim_points) { | 858 | if (child_points > victim_points) { |
865 | put_task_struct(victim); | 859 | put_task_struct(victim); |
866 | victim = child; | 860 | victim = child; |
@@ -913,20 +907,20 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
913 | continue; | 907 | continue; |
914 | if (same_thread_group(p, victim)) | 908 | if (same_thread_group(p, victim)) |
915 | continue; | 909 | continue; |
916 | if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { | 910 | if (is_global_init(p)) { |
917 | /* | ||
918 | * We cannot use oom_reaper for the mm shared by this | ||
919 | * process because it wouldn't get killed and so the | ||
920 | * memory might be still used. Hide the mm from the oom | ||
921 | * killer to guarantee OOM forward progress. | ||
922 | */ | ||
923 | can_oom_reap = false; | 911 | can_oom_reap = false; |
924 | set_bit(MMF_OOM_REAPED, &mm->flags); | 912 | set_bit(MMF_OOM_SKIP, &mm->flags); |
925 | pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", | 913 | pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", |
926 | task_pid_nr(victim), victim->comm, | 914 | task_pid_nr(victim), victim->comm, |
927 | task_pid_nr(p), p->comm); | 915 | task_pid_nr(p), p->comm); |
928 | continue; | 916 | continue; |
929 | } | 917 | } |
918 | /* | ||
919 | * No use_mm() user needs to read from the userspace so we are | ||
920 | * ok to reap it. | ||
921 | */ | ||
922 | if (unlikely(p->flags & PF_KTHREAD)) | ||
923 | continue; | ||
930 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 924 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
931 | } | 925 | } |
932 | rcu_read_unlock(); | 926 | rcu_read_unlock(); |
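oom_kill_process() is reduced to two parameters: the chosen task, its badness points and totalpages now travel inside struct oom_control, and kernel threads sharing the mm are simply skipped (only init still forces MMF_OOM_SKIP). A tiny sketch of the parameter-object refactoring with invented struct names:

#include <stdio.h>

struct victim { const char *name; };

/* All selection state travels in one control structure. */
struct control {
        struct victim *chosen;
        unsigned int   chosen_points;
        unsigned long  totalpages;
};

static void kill_process(struct control *c, const char *message)
{
        printf("%s: killing %s (score %u of %lu pages)\n",
               message, c->chosen->name, c->chosen_points, c->totalpages);
}

int main(void)
{
        struct victim v = { "worker" };
        struct control c = { .chosen = &v, .chosen_points = 42, .totalpages = 1024 };

        kill_process(&c, "Out of memory");
        return 0;
}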
@@ -942,7 +936,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
942 | /* | 936 | /* |
943 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 937 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
944 | */ | 938 | */ |
945 | void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) | 939 | static void check_panic_on_oom(struct oom_control *oc, |
940 | enum oom_constraint constraint) | ||
946 | { | 941 | { |
947 | if (likely(!sysctl_panic_on_oom)) | 942 | if (likely(!sysctl_panic_on_oom)) |
948 | return; | 943 | return; |
@@ -988,19 +983,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); | |||
988 | */ | 983 | */ |
989 | bool out_of_memory(struct oom_control *oc) | 984 | bool out_of_memory(struct oom_control *oc) |
990 | { | 985 | { |
991 | struct task_struct *p; | ||
992 | unsigned long totalpages; | ||
993 | unsigned long freed = 0; | 986 | unsigned long freed = 0; |
994 | unsigned int uninitialized_var(points); | ||
995 | enum oom_constraint constraint = CONSTRAINT_NONE; | 987 | enum oom_constraint constraint = CONSTRAINT_NONE; |
996 | 988 | ||
997 | if (oom_killer_disabled) | 989 | if (oom_killer_disabled) |
998 | return false; | 990 | return false; |
999 | 991 | ||
1000 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 992 | if (!is_memcg_oom(oc)) { |
1001 | if (freed > 0) | 993 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
1002 | /* Got some memory back in the last second. */ | 994 | if (freed > 0) |
1003 | return true; | 995 | /* Got some memory back in the last second. */ |
996 | return true; | ||
997 | } | ||
1004 | 998 | ||
1005 | /* | 999 | /* |
1006 | * If current has a pending SIGKILL or is exiting, then automatically | 1000 | * If current has a pending SIGKILL or is exiting, then automatically |
@@ -1024,37 +1018,38 @@ bool out_of_memory(struct oom_control *oc) | |||
1024 | 1018 | ||
1025 | /* | 1019 | /* |
1026 | * Check if there were limitations on the allocation (only relevant for | 1020 | * Check if there were limitations on the allocation (only relevant for |
1027 | * NUMA) that may require different handling. | 1021 | * NUMA and memcg) that may require different handling. |
1028 | */ | 1022 | */ |
1029 | constraint = constrained_alloc(oc, &totalpages); | 1023 | constraint = constrained_alloc(oc); |
1030 | if (constraint != CONSTRAINT_MEMORY_POLICY) | 1024 | if (constraint != CONSTRAINT_MEMORY_POLICY) |
1031 | oc->nodemask = NULL; | 1025 | oc->nodemask = NULL; |
1032 | check_panic_on_oom(oc, constraint); | 1026 | check_panic_on_oom(oc, constraint); |
1033 | 1027 | ||
1034 | if (sysctl_oom_kill_allocating_task && current->mm && | 1028 | if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && |
1035 | !oom_unkillable_task(current, NULL, oc->nodemask) && | 1029 | current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && |
1036 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { | 1030 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
1037 | get_task_struct(current); | 1031 | get_task_struct(current); |
1038 | oom_kill_process(oc, current, 0, totalpages, | 1032 | oc->chosen = current; |
1039 | "Out of memory (oom_kill_allocating_task)"); | 1033 | oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); |
1040 | return true; | 1034 | return true; |
1041 | } | 1035 | } |
1042 | 1036 | ||
1043 | p = select_bad_process(oc, &points, totalpages); | 1037 | select_bad_process(oc); |
1044 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 1038 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
1045 | if (!p && !is_sysrq_oom(oc)) { | 1039 | if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { |
1046 | dump_header(oc, NULL); | 1040 | dump_header(oc, NULL); |
1047 | panic("Out of memory and no killable processes...\n"); | 1041 | panic("Out of memory and no killable processes...\n"); |
1048 | } | 1042 | } |
1049 | if (p && p != (void *)-1UL) { | 1043 | if (oc->chosen && oc->chosen != (void *)-1UL) { |
1050 | oom_kill_process(oc, p, points, totalpages, "Out of memory"); | 1044 | oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : |
1045 | "Memory cgroup out of memory"); | ||
1051 | /* | 1046 | /* |
1052 | * Give the killed process a good chance to exit before trying | 1047 | * Give the killed process a good chance to exit before trying |
1053 | * to allocate memory again. | 1048 | * to allocate memory again. |
1054 | */ | 1049 | */ |
1055 | schedule_timeout_killable(1); | 1050 | schedule_timeout_killable(1); |
1056 | } | 1051 | } |
1057 | return true; | 1052 | return !!oc->chosen; |
1058 | } | 1053 | } |
1059 | 1054 | ||
1060 | /* | 1055 | /* |
@@ -1077,16 +1072,6 @@ void pagefault_out_of_memory(void) | |||
1077 | 1072 | ||
1078 | if (!mutex_trylock(&oom_lock)) | 1073 | if (!mutex_trylock(&oom_lock)) |
1079 | return; | 1074 | return; |
1080 | 1075 | out_of_memory(&oc); | |
1081 | if (!out_of_memory(&oc)) { | ||
1082 | /* | ||
1083 | * There shouldn't be any user tasks runnable while the | ||
1084 | * OOM killer is disabled, so the current task has to | ||
1085 | * be a racing OOM victim for which oom_killer_disable() | ||
1086 | * is waiting for. | ||
1087 | */ | ||
1088 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
1089 | } | ||
1090 | |||
1091 | mutex_unlock(&oom_lock); | 1076 | mutex_unlock(&oom_lock); |
1092 | } | 1077 | } |
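With these changes out_of_memory() serves both global and memcg OOMs from one path: the notifier chain and the oom_kill_allocating_task shortcut apply only to the global case, the victim is returned through oc->chosen, and the function reports whether any victim was found. A heavily condensed, non-kernel sketch of that branch structure (every helper here is a stub):

#include <stdbool.h>
#include <stdio.h>

struct ctl {
        bool  is_memcg;
        void *chosen;
};

/* Stubs standing in for the notifier chain, victim selection and the kill. */
static bool notifiers_freed_memory(void)        { return false; }
static void select_bad_process(struct ctl *oc)  { oc->chosen = oc; }
static void kill_victim(struct ctl *oc, const char *msg)
{
        (void)oc;
        printf("%s\n", msg);
}

static bool handle_oom(struct ctl *oc)
{
        /* Global-only shortcut: the memcg charge path skips the notifiers. */
        if (!oc->is_memcg && notifiers_freed_memory())
                return true;

        select_bad_process(oc);
        if (oc->chosen)
                kill_victim(oc, oc->is_memcg ? "Memory cgroup out of memory"
                                             : "Out of memory");

        return oc->chosen != NULL;      /* mirrors "return !!oc->chosen" */
}

int main(void)
{
        struct ctl oc = { .is_memcg = true };

        return handle_oom(&oc) ? 0 : 1;
}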
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 28d6f36a2d79..439cc63ad903 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) | |||
1965 | return false; | 1965 | return false; |
1966 | } | 1966 | } |
1967 | 1967 | ||
1968 | void throttle_vm_writeout(gfp_t gfp_mask) | ||
1969 | { | ||
1970 | unsigned long background_thresh; | ||
1971 | unsigned long dirty_thresh; | ||
1972 | |||
1973 | for ( ; ; ) { | ||
1974 | global_dirty_limits(&background_thresh, &dirty_thresh); | ||
1975 | dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); | ||
1976 | |||
1977 | /* | ||
1978 | * Boost the allowable dirty threshold a bit for page | ||
1979 | * allocators so they don't get DoS'ed by heavy writers | ||
1980 | */ | ||
1981 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | ||
1982 | |||
1983 | if (global_node_page_state(NR_UNSTABLE_NFS) + | ||
1984 | global_node_page_state(NR_WRITEBACK) <= dirty_thresh) | ||
1985 | break; | ||
1986 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
1987 | |||
1988 | /* | ||
1989 | * The caller might hold locks which can prevent IO completion | ||
1990 | * or progress in the filesystem. So we cannot just sit here | ||
1991 | * waiting for IO to complete. | ||
1992 | */ | ||
1993 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) | ||
1994 | break; | ||
1995 | } | ||
1996 | } | ||
1997 | |||
1998 | /* | 1968 | /* |
1999 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 1969 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
2000 | */ | 1970 | */ |
@@ -2746,7 +2716,7 @@ int test_clear_page_writeback(struct page *page) | |||
2746 | int ret; | 2716 | int ret; |
2747 | 2717 | ||
2748 | lock_page_memcg(page); | 2718 | lock_page_memcg(page); |
2749 | if (mapping) { | 2719 | if (mapping && mapping_use_writeback_tags(mapping)) { |
2750 | struct inode *inode = mapping->host; | 2720 | struct inode *inode = mapping->host; |
2751 | struct backing_dev_info *bdi = inode_to_bdi(inode); | 2721 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
2752 | unsigned long flags; | 2722 | unsigned long flags; |
@@ -2789,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2789 | int ret; | 2759 | int ret; |
2790 | 2760 | ||
2791 | lock_page_memcg(page); | 2761 | lock_page_memcg(page); |
2792 | if (mapping) { | 2762 | if (mapping && mapping_use_writeback_tags(mapping)) { |
2793 | struct inode *inode = mapping->host; | 2763 | struct inode *inode = mapping->host; |
2794 | struct backing_dev_info *bdi = inode_to_bdi(inode); | 2764 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
2795 | unsigned long flags; | 2765 | unsigned long flags; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..ca423cc20b59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -607,6 +607,9 @@ static bool need_debug_guardpage(void) | |||
607 | if (!debug_pagealloc_enabled()) | 607 | if (!debug_pagealloc_enabled()) |
608 | return false; | 608 | return false; |
609 | 609 | ||
610 | if (!debug_guardpage_minorder()) | ||
611 | return false; | ||
612 | |||
610 | return true; | 613 | return true; |
611 | } | 614 | } |
612 | 615 | ||
@@ -615,6 +618,9 @@ static void init_debug_guardpage(void) | |||
615 | if (!debug_pagealloc_enabled()) | 618 | if (!debug_pagealloc_enabled()) |
616 | return; | 619 | return; |
617 | 620 | ||
621 | if (!debug_guardpage_minorder()) | ||
622 | return; | ||
623 | |||
618 | _debug_guardpage_enabled = true; | 624 | _debug_guardpage_enabled = true; |
619 | } | 625 | } |
620 | 626 | ||
@@ -635,19 +641,22 @@ static int __init debug_guardpage_minorder_setup(char *buf) | |||
635 | pr_info("Setting debug_guardpage_minorder to %lu\n", res); | 641 | pr_info("Setting debug_guardpage_minorder to %lu\n", res); |
636 | return 0; | 642 | return 0; |
637 | } | 643 | } |
638 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 644 | early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); |
639 | 645 | ||
640 | static inline void set_page_guard(struct zone *zone, struct page *page, | 646 | static inline bool set_page_guard(struct zone *zone, struct page *page, |
641 | unsigned int order, int migratetype) | 647 | unsigned int order, int migratetype) |
642 | { | 648 | { |
643 | struct page_ext *page_ext; | 649 | struct page_ext *page_ext; |
644 | 650 | ||
645 | if (!debug_guardpage_enabled()) | 651 | if (!debug_guardpage_enabled()) |
646 | return; | 652 | return false; |
653 | |||
654 | if (order >= debug_guardpage_minorder()) | ||
655 | return false; | ||
647 | 656 | ||
648 | page_ext = lookup_page_ext(page); | 657 | page_ext = lookup_page_ext(page); |
649 | if (unlikely(!page_ext)) | 658 | if (unlikely(!page_ext)) |
650 | return; | 659 | return false; |
651 | 660 | ||
652 | __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | 661 | __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); |
653 | 662 | ||
@@ -655,6 +664,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page, | |||
655 | set_page_private(page, order); | 664 | set_page_private(page, order); |
656 | /* Guard pages are not available for any usage */ | 665 | /* Guard pages are not available for any usage */ |
657 | __mod_zone_freepage_state(zone, -(1 << order), migratetype); | 666 | __mod_zone_freepage_state(zone, -(1 << order), migratetype); |
667 | |||
668 | return true; | ||
658 | } | 669 | } |
659 | 670 | ||
660 | static inline void clear_page_guard(struct zone *zone, struct page *page, | 671 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
@@ -676,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, | |||
676 | __mod_zone_freepage_state(zone, (1 << order), migratetype); | 687 | __mod_zone_freepage_state(zone, (1 << order), migratetype); |
677 | } | 688 | } |
678 | #else | 689 | #else |
679 | struct page_ext_operations debug_guardpage_ops = { NULL, }; | 690 | struct page_ext_operations debug_guardpage_ops; |
680 | static inline void set_page_guard(struct zone *zone, struct page *page, | 691 | static inline bool set_page_guard(struct zone *zone, struct page *page, |
681 | unsigned int order, int migratetype) {} | 692 | unsigned int order, int migratetype) { return false; } |
682 | static inline void clear_page_guard(struct zone *zone, struct page *page, | 693 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
683 | unsigned int order, int migratetype) {} | 694 | unsigned int order, int migratetype) {} |
684 | #endif | 695 | #endif |
@@ -1393,15 +1404,18 @@ static void __init deferred_free_range(struct page *page, | |||
1393 | return; | 1404 | return; |
1394 | 1405 | ||
1395 | /* Free a large naturally-aligned chunk if possible */ | 1406 | /* Free a large naturally-aligned chunk if possible */ |
1396 | if (nr_pages == MAX_ORDER_NR_PAGES && | 1407 | if (nr_pages == pageblock_nr_pages && |
1397 | (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { | 1408 | (pfn & (pageblock_nr_pages - 1)) == 0) { |
1398 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 1409 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
1399 | __free_pages_boot_core(page, MAX_ORDER-1); | 1410 | __free_pages_boot_core(page, pageblock_order); |
1400 | return; | 1411 | return; |
1401 | } | 1412 | } |
1402 | 1413 | ||
1403 | for (i = 0; i < nr_pages; i++, page++) | 1414 | for (i = 0; i < nr_pages; i++, page++, pfn++) { |
1415 | if ((pfn & (pageblock_nr_pages - 1)) == 0) | ||
1416 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
1404 | __free_pages_boot_core(page, 0); | 1417 | __free_pages_boot_core(page, 0); |
1418 | } | ||
1405 | } | 1419 | } |
1406 | 1420 | ||
1407 | /* Completion tracking for deferred_init_memmap() threads */ | 1421 | /* Completion tracking for deferred_init_memmap() threads */ |
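deferred_free_range() now works in pageblock units instead of MAX_ORDER units: a naturally aligned, full pageblock is handed back as one high-order free, and the per-page fallback sets the migratetype each time it crosses a pageblock boundary. A runnable sketch of the alignment test and the two paths (BLOCK_PAGES and the free_* helpers are invented):

#include <stdio.h>

#define BLOCK_PAGES 8UL                 /* stand-in for pageblock_nr_pages */

static void free_block(unsigned long pfn)  { printf("free block  @%lu\n", pfn); }
static void free_single(unsigned long pfn) { printf("free single @%lu\n", pfn); }

static void free_range(unsigned long pfn, unsigned long nr_pages)
{
        unsigned long i;

        /* Whole, naturally aligned block: hand it back in one go. */
        if (nr_pages == BLOCK_PAGES && (pfn & (BLOCK_PAGES - 1)) == 0) {
                free_block(pfn);
                return;
        }

        for (i = 0; i < nr_pages; i++, pfn++) {
                if ((pfn & (BLOCK_PAGES - 1)) == 0)
                        printf("block boundary @%lu\n", pfn);   /* set migratetype here */
                free_single(pfn);
        }
}

int main(void)
{
        free_range(16, 8);              /* aligned full block */
        free_range(14, 5);              /* crosses a boundary at pfn 16 */
        return 0;
}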
@@ -1469,9 +1483,9 @@ static int __init deferred_init_memmap(void *data) | |||
1469 | 1483 | ||
1470 | /* | 1484 | /* |
1471 | * Ensure pfn_valid is checked every | 1485 | * Ensure pfn_valid is checked every |
1472 | * MAX_ORDER_NR_PAGES for memory holes | 1486 | * pageblock_nr_pages for memory holes |
1473 | */ | 1487 | */ |
1474 | if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { | 1488 | if ((pfn & (pageblock_nr_pages - 1)) == 0) { |
1475 | if (!pfn_valid(pfn)) { | 1489 | if (!pfn_valid(pfn)) { |
1476 | page = NULL; | 1490 | page = NULL; |
1477 | goto free_range; | 1491 | goto free_range; |
@@ -1484,7 +1498,7 @@ static int __init deferred_init_memmap(void *data) | |||
1484 | } | 1498 | } |
1485 | 1499 | ||
1486 | /* Minimise pfn page lookups and scheduler checks */ | 1500 | /* Minimise pfn page lookups and scheduler checks */ |
1487 | if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { | 1501 | if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { |
1488 | page++; | 1502 | page++; |
1489 | } else { | 1503 | } else { |
1490 | nr_pages += nr_to_free; | 1504 | nr_pages += nr_to_free; |
@@ -1520,6 +1534,9 @@ free_range: | |||
1520 | free_base_page = NULL; | 1534 | free_base_page = NULL; |
1521 | free_base_pfn = nr_to_free = 0; | 1535 | free_base_pfn = nr_to_free = 0; |
1522 | } | 1536 | } |
1537 | /* Free the last block of pages to allocator */ | ||
1538 | nr_pages += nr_to_free; | ||
1539 | deferred_free_range(free_base_page, free_base_pfn, nr_to_free); | ||
1523 | 1540 | ||
1524 | first_init_pfn = max(end_pfn, first_init_pfn); | 1541 | first_init_pfn = max(end_pfn, first_init_pfn); |
1525 | } | 1542 | } |
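The three added lines above deal with the batch that deferred_init_memmap() accumulates in free_base_page/nr_to_free: it used to be flushed only from inside the loop, so the final partial block was never handed to the allocator; the fix repeats the flush once after the loop. The classic accumulate-and-flush pattern, including the post-loop drain, as a small C example:

#include <stdio.h>

#define BATCH 4

static void flush(int base, int count)
{
        if (count)
                printf("flush %d items starting at %d\n", count, base);
}

int main(void)
{
        int base = 0, pending = 0;

        for (int i = 0; i < 10; i++) {
                if (pending == 0)
                        base = i;
                pending++;
                if (pending == BATCH) {         /* in-loop flush */
                        flush(base, pending);
                        pending = 0;
                }
        }
        flush(base, pending);           /* the fix: drain the final partial batch */
        return 0;
}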
@@ -1616,18 +1633,15 @@ static inline void expand(struct zone *zone, struct page *page, | |||
1616 | size >>= 1; | 1633 | size >>= 1; |
1617 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); | 1634 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
1618 | 1635 | ||
1619 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && | 1636 | /* |
1620 | debug_guardpage_enabled() && | 1637 | * Mark as guard pages (or page), that will allow to |
1621 | high < debug_guardpage_minorder()) { | 1638 | * merge back to allocator when buddy will be freed. |
1622 | /* | 1639 | * Corresponding page table entries will not be touched, |
1623 | * Mark as guard pages (or page), that will allow to | 1640 | * pages will stay not present in virtual address space |
1624 | * merge back to allocator when buddy will be freed. | 1641 | */ |
1625 | * Corresponding page table entries will not be touched, | 1642 | if (set_page_guard(zone, &page[size], high, migratetype)) |
1626 | * pages will stay not present in virtual address space | ||
1627 | */ | ||
1628 | set_page_guard(zone, &page[size], high, migratetype); | ||
1629 | continue; | 1643 | continue; |
1630 | } | 1644 | |
1631 | list_add(&page[size].lru, &area->free_list[migratetype]); | 1645 | list_add(&page[size].lru, &area->free_list[migratetype]); |
1632 | area->nr_free++; | 1646 | area->nr_free++; |
1633 | set_page_order(&page[size], high); | 1647 | set_page_order(&page[size], high); |
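The earlier hunk makes set_page_guard() return whether it actually converted the page (it now checks debug_guardpage_minorder() itself, and the !DEBUG_PAGEALLOC stub returns false), so expand() above no longer open-codes the eligibility test and simply continues when the call succeeds. The pattern in miniature (try_guard() is a made-up name):

#include <stdbool.h>
#include <stdio.h>

#define GUARD_MAX_ORDER 2

/* Returns true only when the page was turned into a guard page. */
static bool try_guard(int order)
{
        if (order >= GUARD_MAX_ORDER)
                return false;
        printf("order %d: guarded\n", order);
        return true;
}

int main(void)
{
        for (int order = 3; order >= 0; order--) {
                if (try_guard(order))
                        continue;       /* caller stays branch-free */
                printf("order %d: put on free list\n", order);
        }
        return 0;
}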
@@ -2489,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
2489 | mt = get_pageblock_migratetype(page); | 2503 | mt = get_pageblock_migratetype(page); |
2490 | 2504 | ||
2491 | if (!is_migrate_isolate(mt)) { | 2505 | if (!is_migrate_isolate(mt)) { |
2492 | /* Obey watermarks as if the page was being allocated */ | 2506 | /* |
2493 | watermark = low_wmark_pages(zone) + (1 << order); | 2507 | * Obey watermarks as if the page was being allocated. We can |
2494 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 2508 | * emulate a high-order watermark check with a raised order-0 |
2509 | * watermark, because we already know our high-order page | ||
2510 | * exists. | ||
2511 | */ | ||
2512 | watermark = min_wmark_pages(zone) + (1UL << order); | ||
2513 | if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) | ||
2495 | return 0; | 2514 | return 0; |
2496 | 2515 | ||
2497 | __mod_zone_freepage_state(zone, -(1UL << order), mt); | 2516 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
@@ -2960,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, | |||
2960 | DEFAULT_RATELIMIT_INTERVAL, | 2979 | DEFAULT_RATELIMIT_INTERVAL, |
2961 | DEFAULT_RATELIMIT_BURST); | 2980 | DEFAULT_RATELIMIT_BURST); |
2962 | 2981 | ||
2963 | void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) | 2982 | void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) |
2964 | { | 2983 | { |
2965 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 2984 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
2985 | struct va_format vaf; | ||
2986 | va_list args; | ||
2966 | 2987 | ||
2967 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || | 2988 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || |
2968 | debug_guardpage_minorder() > 0) | 2989 | debug_guardpage_minorder() > 0) |
@@ -2980,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) | |||
2980 | if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) | 3001 | if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) |
2981 | filter &= ~SHOW_MEM_FILTER_NODES; | 3002 | filter &= ~SHOW_MEM_FILTER_NODES; |
2982 | 3003 | ||
2983 | if (fmt) { | 3004 | pr_warn("%s: ", current->comm); |
2984 | struct va_format vaf; | ||
2985 | va_list args; | ||
2986 | 3005 | ||
2987 | va_start(args, fmt); | 3006 | va_start(args, fmt); |
3007 | vaf.fmt = fmt; | ||
3008 | vaf.va = &args; | ||
3009 | pr_cont("%pV", &vaf); | ||
3010 | va_end(args); | ||
2988 | 3011 | ||
2989 | vaf.fmt = fmt; | 3012 | pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); |
2990 | vaf.va = &args; | ||
2991 | 3013 | ||
2992 | pr_warn("%pV", &vaf); | ||
2993 | |||
2994 | va_end(args); | ||
2995 | } | ||
2996 | |||
2997 | pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", | ||
2998 | current->comm, order, gfp_mask, &gfp_mask); | ||
2999 | dump_stack(); | 3014 | dump_stack(); |
3000 | if (!should_suppress_show_mem()) | 3015 | if (!should_suppress_show_mem()) |
3001 | show_mem(filter); | 3016 | show_mem(filter); |
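warn_alloc_failed() becomes warn_alloc(): the order argument is gone, the caller supplies the whole message as a printf-style format, and the helper prefixes current->comm and appends the gfp mask. A userspace sketch of such a variadic wrapper (the function here is a local stand-in, not the kernel symbol):

#include <stdarg.h>
#include <stdio.h>

static void warn_alloc(unsigned int gfp_mask, const char *fmt, ...)
{
        va_list args;

        fprintf(stderr, "%s: ", "myprog");      /* the kernel uses current->comm */
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fprintf(stderr, ", mode:%#x\n", gfp_mask);
}

int main(void)
{
        warn_alloc(0x24000c0, "page allocation failure: order:%u", 3U);
        warn_alloc(0x24000c0, "page allocation stalls for %ums, order:%u", 10000U, 2U);
        return 0;
}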
@@ -3137,6 +3152,65 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3137 | return NULL; | 3152 | return NULL; |
3138 | } | 3153 | } |
3139 | 3154 | ||
3155 | static inline bool | ||
3156 | should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, | ||
3157 | enum compact_result compact_result, | ||
3158 | enum compact_priority *compact_priority, | ||
3159 | int *compaction_retries) | ||
3160 | { | ||
3161 | int max_retries = MAX_COMPACT_RETRIES; | ||
3162 | int min_priority; | ||
3163 | |||
3164 | if (!order) | ||
3165 | return false; | ||
3166 | |||
3167 | if (compaction_made_progress(compact_result)) | ||
3168 | (*compaction_retries)++; | ||
3169 | |||
3170 | /* | ||
3171 | * compaction considers all the zone as desperately out of memory | ||
3172 | * so it doesn't really make much sense to retry except when the | ||
3173 | * failure could be caused by insufficient priority | ||
3174 | */ | ||
3175 | if (compaction_failed(compact_result)) | ||
3176 | goto check_priority; | ||
3177 | |||
3178 | /* | ||
3179 | * make sure the compaction wasn't deferred or didn't bail out early | ||
3180 | * due to lock contention before we declare that we should give up. | ||
3181 | * But do not retry if the given zonelist is not suitable for | ||
3182 | * compaction. | ||
3183 | */ | ||
3184 | if (compaction_withdrawn(compact_result)) | ||
3185 | return compaction_zonelist_suitable(ac, order, alloc_flags); | ||
3186 | |||
3187 | /* | ||
3188 | * !costly requests are much more important than __GFP_REPEAT | ||
3189 | * costly ones because they are de facto nofail and invoke OOM | ||
3190 | * killer to move on while costly can fail and users are ready | ||
3191 | * to cope with that. 1/4 retries is rather arbitrary but we | ||
3192 | * would need much more detailed feedback from compaction to | ||
3193 | * make a better decision. | ||
3194 | */ | ||
3195 | if (order > PAGE_ALLOC_COSTLY_ORDER) | ||
3196 | max_retries /= 4; | ||
3197 | if (*compaction_retries <= max_retries) | ||
3198 | return true; | ||
3199 | |||
3200 | /* | ||
3201 | * Make sure there are attempts at the highest priority if we exhausted | ||
3202 | * all retries or failed at the lower priorities. | ||
3203 | */ | ||
3204 | check_priority: | ||
3205 | min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? | ||
3206 | MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; | ||
3207 | if (*compact_priority > min_priority) { | ||
3208 | (*compact_priority)--; | ||
3209 | *compaction_retries = 0; | ||
3210 | return true; | ||
3211 | } | ||
3212 | return false; | ||
3213 | } | ||
3140 | #else | 3214 | #else |
3141 | static inline struct page * | 3215 | static inline struct page * |
3142 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 3216 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
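The new should_compact_retry() above counts a retry only when compaction reported progress, grants costly orders a quarter of MAX_COMPACT_RETRIES, and, once retries are exhausted (or compaction failed outright), escalates to a stronger compaction priority and resets the counter; a withdrawn compaction defers to compaction_zonelist_suitable() instead. A reduced, runnable sketch of just the retry/priority bookkeeping (constants and names are local to the example):

#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES   16
#define COSTLY_ORDER  3
#define MIN_PRIORITY  0                 /* strongest priority in this sketch */

static bool should_retry(int order, bool made_progress,
                         int *priority, int *retries)
{
        int max_retries = MAX_RETRIES;

        if (!order)
                return false;
        if (made_progress)
                (*retries)++;

        if (order > COSTLY_ORDER)
                max_retries /= 4;       /* costly orders get fewer retries */
        if (*retries <= max_retries)
                return true;

        /* Out of retries: escalate priority once more if we can. */
        if (*priority > MIN_PRIORITY) {
                (*priority)--;
                *retries = 0;
                return true;
        }
        return false;
}

int main(void)
{
        int priority = 2, retries = 0;

        while (should_retry(4, true, &priority, &retries))
                ;                       /* terminates once priority bottoms out */
        printf("gave up at priority %d\n", priority);
        return 0;
}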
@@ -3147,13 +3221,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3147 | return NULL; | 3221 | return NULL; |
3148 | } | 3222 | } |
3149 | 3223 | ||
3150 | #endif /* CONFIG_COMPACTION */ | ||
3151 | |||
3152 | static inline bool | 3224 | static inline bool |
3153 | should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, | 3225 | should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, |
3154 | enum compact_result compact_result, | 3226 | enum compact_result compact_result, |
3155 | enum compact_priority *compact_priority, | 3227 | enum compact_priority *compact_priority, |
3156 | int compaction_retries) | 3228 | int *compaction_retries) |
3157 | { | 3229 | { |
3158 | struct zone *zone; | 3230 | struct zone *zone; |
3159 | struct zoneref *z; | 3231 | struct zoneref *z; |
@@ -3175,6 +3247,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla | |||
3175 | } | 3247 | } |
3176 | return false; | 3248 | return false; |
3177 | } | 3249 | } |
3250 | #endif /* CONFIG_COMPACTION */ | ||
3178 | 3251 | ||
3179 | /* Perform direct synchronous page reclaim */ | 3252 | /* Perform direct synchronous page reclaim */ |
3180 | static int | 3253 | static int |
@@ -3325,16 +3398,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
3325 | static inline bool | 3398 | static inline bool |
3326 | should_reclaim_retry(gfp_t gfp_mask, unsigned order, | 3399 | should_reclaim_retry(gfp_t gfp_mask, unsigned order, |
3327 | struct alloc_context *ac, int alloc_flags, | 3400 | struct alloc_context *ac, int alloc_flags, |
3328 | bool did_some_progress, int no_progress_loops) | 3401 | bool did_some_progress, int *no_progress_loops) |
3329 | { | 3402 | { |
3330 | struct zone *zone; | 3403 | struct zone *zone; |
3331 | struct zoneref *z; | 3404 | struct zoneref *z; |
3332 | 3405 | ||
3333 | /* | 3406 | /* |
3407 | * Costly allocations might have made progress but this doesn't mean | ||
3408 | * their order will become available due to high fragmentation so | ||
3409 | * always increment the no progress counter for them | ||
3410 | */ | ||
3411 | if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) | ||
3412 | *no_progress_loops = 0; | ||
3413 | else | ||
3414 | (*no_progress_loops)++; | ||
3415 | |||
3416 | /* | ||
3334 | * Make sure we converge to OOM if we cannot make any progress | 3417 | * Make sure we converge to OOM if we cannot make any progress |
3335 | * several times in the row. | 3418 | * several times in the row. |
3336 | */ | 3419 | */ |
3337 | if (no_progress_loops > MAX_RECLAIM_RETRIES) | 3420 | if (*no_progress_loops > MAX_RECLAIM_RETRIES) |
3338 | return false; | 3421 | return false; |
3339 | 3422 | ||
3340 | /* | 3423 | /* |
@@ -3349,7 +3432,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, | |||
3349 | unsigned long reclaimable; | 3432 | unsigned long reclaimable; |
3350 | 3433 | ||
3351 | available = reclaimable = zone_reclaimable_pages(zone); | 3434 | available = reclaimable = zone_reclaimable_pages(zone); |
3352 | available -= DIV_ROUND_UP(no_progress_loops * available, | 3435 | available -= DIV_ROUND_UP((*no_progress_loops) * available, |
3353 | MAX_RECLAIM_RETRIES); | 3436 | MAX_RECLAIM_RETRIES); |
3354 | available += zone_page_state_snapshot(zone, NR_FREE_PAGES); | 3437 | available += zone_page_state_snapshot(zone, NR_FREE_PAGES); |
3355 | 3438 | ||
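should_reclaim_retry() now owns the no-progress counter: it resets it when a non-costly allocation made progress, increments it otherwise, gives up past MAX_RECLAIM_RETRIES, and discounts the reclaimable estimate by no_progress_loops/MAX_RECLAIM_RETRIES so repeated fruitless passes converge on failure. The counter handling and the decay, as a standalone sketch (all values invented):

#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES   16
#define COSTLY_ORDER  3

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static bool should_retry(int order, bool progress, unsigned long reclaimable,
                         unsigned long free, unsigned long watermark,
                         int *no_progress_loops)
{
        unsigned long available;

        /* Progress on a non-costly order resets the counter; otherwise bump it. */
        if (progress && order <= COSTLY_ORDER)
                *no_progress_loops = 0;
        else
                (*no_progress_loops)++;

        if (*no_progress_loops > MAX_RETRIES)
                return false;

        /* Assume a shrinking fraction of "reclaimable" is really attainable. */
        available = reclaimable
                  - DIV_ROUND_UP(*no_progress_loops * reclaimable, MAX_RETRIES)
                  + free;
        return available > watermark;
}

int main(void)
{
        int loops = 0;

        while (should_retry(0, false, 1000, 50, 100, &loops))
                ;
        printf("stopped after %d no-progress loops\n", loops);
        return 0;
}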
@@ -3410,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3410 | enum compact_result compact_result; | 3493 | enum compact_result compact_result; |
3411 | int compaction_retries = 0; | 3494 | int compaction_retries = 0; |
3412 | int no_progress_loops = 0; | 3495 | int no_progress_loops = 0; |
3496 | unsigned long alloc_start = jiffies; | ||
3497 | unsigned int stall_timeout = 10 * HZ; | ||
3413 | 3498 | ||
3414 | /* | 3499 | /* |
3415 | * In the slowpath, we sanity check order to avoid ever trying to | 3500 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -3554,9 +3639,6 @@ retry: | |||
3554 | if (page) | 3639 | if (page) |
3555 | goto got_pg; | 3640 | goto got_pg; |
3556 | 3641 | ||
3557 | if (order && compaction_made_progress(compact_result)) | ||
3558 | compaction_retries++; | ||
3559 | |||
3560 | /* Do not loop if specifically requested */ | 3642 | /* Do not loop if specifically requested */ |
3561 | if (gfp_mask & __GFP_NORETRY) | 3643 | if (gfp_mask & __GFP_NORETRY) |
3562 | goto nopage; | 3644 | goto nopage; |
@@ -3568,18 +3650,16 @@ retry: | |||
3568 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) | 3650 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) |
3569 | goto nopage; | 3651 | goto nopage; |
3570 | 3652 | ||
3571 | /* | 3653 | /* Make sure we know about allocations which stall for too long */ |
3572 | * Costly allocations might have made a progress but this doesn't mean | 3654 | if (time_after(jiffies, alloc_start + stall_timeout)) { |
3573 | * their order will become available due to high fragmentation so | 3655 | warn_alloc(gfp_mask, |
3574 | * always increment the no progress counter for them | 3656 | "page allocation stalls for %ums, order:%u\n", |
3575 | */ | 3657 | jiffies_to_msecs(jiffies-alloc_start), order); |
3576 | if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) | 3658 | stall_timeout += 10 * HZ; |
3577 | no_progress_loops = 0; | 3659 | } |
3578 | else | ||
3579 | no_progress_loops++; | ||
3580 | 3660 | ||
3581 | if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, | 3661 | if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, |
3582 | did_some_progress > 0, no_progress_loops)) | 3662 | did_some_progress > 0, &no_progress_loops)) |
3583 | goto retry; | 3663 | goto retry; |
3584 | 3664 | ||
3585 | /* | 3665 | /* |
@@ -3591,7 +3671,7 @@ retry: | |||
3591 | if (did_some_progress > 0 && | 3671 | if (did_some_progress > 0 && |
3592 | should_compact_retry(ac, order, alloc_flags, | 3672 | should_compact_retry(ac, order, alloc_flags, |
3593 | compact_result, &compact_priority, | 3673 | compact_result, &compact_priority, |
3594 | compaction_retries)) | 3674 | &compaction_retries)) |
3595 | goto retry; | 3675 | goto retry; |
3596 | 3676 | ||
3597 | /* Reclaim has failed us, start killing things */ | 3677 | /* Reclaim has failed us, start killing things */ |
@@ -3606,7 +3686,8 @@ retry: | |||
3606 | } | 3686 | } |
3607 | 3687 | ||
3608 | nopage: | 3688 | nopage: |
3609 | warn_alloc_failed(gfp_mask, order, NULL); | 3689 | warn_alloc(gfp_mask, |
3690 | "page allocation failure: order:%u", order); | ||
3610 | got_pg: | 3691 | got_pg: |
3611 | return page; | 3692 | return page; |
3612 | } | 3693 | } |
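The slowpath now records alloc_start and, once a single allocation has looped for longer than stall_timeout (initially 10*HZ), emits a warn_alloc() stall report and pushes the timeout out by another 10 seconds so the warning repeats without flooding the log. A compressed userspace sketch of that timing pattern:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        time_t start = time(NULL);
        unsigned int stall_timeout = 2;         /* seconds; the kernel uses 10*HZ jiffies */

        for (int attempt = 0; attempt < 5; attempt++) {
                sleep(1);                       /* pretend one slow retry iteration */
                if (time(NULL) - start >= stall_timeout) {
                        fprintf(stderr, "allocation stalls for %lds\n",
                                (long)(time(NULL) - start));
                        stall_timeout += 2;     /* next warning only after 2 more seconds */
                }
        }
        return 0;
}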
@@ -4555,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
4555 | int j; | 4636 | int j; |
4556 | struct zonelist *zonelist; | 4637 | struct zonelist *zonelist; |
4557 | 4638 | ||
4558 | zonelist = &pgdat->node_zonelists[0]; | 4639 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; |
4559 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) | 4640 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) |
4560 | ; | 4641 | ; |
4561 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); | 4642 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); |
@@ -4571,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) | |||
4571 | int j; | 4652 | int j; |
4572 | struct zonelist *zonelist; | 4653 | struct zonelist *zonelist; |
4573 | 4654 | ||
4574 | zonelist = &pgdat->node_zonelists[1]; | 4655 | zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; |
4575 | j = build_zonelists_node(pgdat, zonelist, 0); | 4656 | j = build_zonelists_node(pgdat, zonelist, 0); |
4576 | zonelist->_zonerefs[j].zone = NULL; | 4657 | zonelist->_zonerefs[j].zone = NULL; |
4577 | zonelist->_zonerefs[j].zone_idx = 0; | 4658 | zonelist->_zonerefs[j].zone_idx = 0; |
@@ -4592,7 +4673,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | |||
4592 | struct zone *z; | 4673 | struct zone *z; |
4593 | struct zonelist *zonelist; | 4674 | struct zonelist *zonelist; |
4594 | 4675 | ||
4595 | zonelist = &pgdat->node_zonelists[0]; | 4676 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; |
4596 | pos = 0; | 4677 | pos = 0; |
4597 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { | 4678 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { |
4598 | for (j = 0; j < nr_nodes; j++) { | 4679 | for (j = 0; j < nr_nodes; j++) { |
@@ -4727,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
4727 | 4808 | ||
4728 | local_node = pgdat->node_id; | 4809 | local_node = pgdat->node_id; |
4729 | 4810 | ||
4730 | zonelist = &pgdat->node_zonelists[0]; | 4811 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; |
4731 | j = build_zonelists_node(pgdat, zonelist, 0); | 4812 | j = build_zonelists_node(pgdat, zonelist, 0); |
4732 | 4813 | ||
4733 | /* | 4814 | /* |
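The node_zonelists[0]/[1] magic indices are replaced by the ZONELIST_FALLBACK and ZONELIST_NOFALLBACK enumerators, which documents intent at each call site. The same idea in a few lines (the enum here is local, not the kernel's):

enum zonelist_index { ZONELIST_FALLBACK, ZONELIST_NOFALLBACK, MAX_ZONELISTS };

struct node { int zonelists[MAX_ZONELISTS]; };

int main(void)
{
        struct node n = { { 0, 0 } };

        n.zonelists[ZONELIST_FALLBACK] = 1;     /* reads better than n.zonelists[0] = 1 */
        return n.zonelists[ZONELIST_NOFALLBACK];
}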
@@ -5000,15 +5081,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
5000 | 5081 | ||
5001 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5082 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
5002 | /* | 5083 | /* |
5003 | * If not mirrored_kernelcore and ZONE_MOVABLE exists, range | ||
5004 | * from zone_movable_pfn[nid] to end of each node should be | ||
5005 | * ZONE_MOVABLE not ZONE_NORMAL. skip it. | ||
5006 | */ | ||
5007 | if (!mirrored_kernelcore && zone_movable_pfn[nid]) | ||
5008 | if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) | ||
5009 | continue; | ||
5010 | |||
5011 | /* | ||
5012 | * Check given memblock attribute by firmware which can affect | 5084 | * Check given memblock attribute by firmware which can affect |
5013 | * kernel memory layout. If zone==ZONE_MOVABLE but memory is | 5085 | * kernel memory layout. If zone==ZONE_MOVABLE but memory is |
5014 | * mirrored, it's an overlapped memmap init. skip it. | 5086 | * mirrored, it's an overlapped memmap init. skip it. |
@@ -5451,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, | |||
5451 | *zone_end_pfn = min(node_end_pfn, | 5523 | *zone_end_pfn = min(node_end_pfn, |
5452 | arch_zone_highest_possible_pfn[movable_zone]); | 5524 | arch_zone_highest_possible_pfn[movable_zone]); |
5453 | 5525 | ||
5526 | /* Adjust for ZONE_MOVABLE starting within this range */ | ||
5527 | } else if (!mirrored_kernelcore && | ||
5528 | *zone_start_pfn < zone_movable_pfn[nid] && | ||
5529 | *zone_end_pfn > zone_movable_pfn[nid]) { | ||
5530 | *zone_end_pfn = zone_movable_pfn[nid]; | ||
5531 | |||
5454 | /* Check if this whole range is within ZONE_MOVABLE */ | 5532 | /* Check if this whole range is within ZONE_MOVABLE */ |
5455 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 5533 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
5456 | *zone_start_pfn = *zone_end_pfn; | 5534 | *zone_start_pfn = *zone_end_pfn; |
@@ -5554,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5554 | * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages | 5632 | * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages |
5555 | * and vice versa. | 5633 | * and vice versa. |
5556 | */ | 5634 | */ |
5557 | if (zone_movable_pfn[nid]) { | 5635 | if (mirrored_kernelcore && zone_movable_pfn[nid]) { |
5558 | if (mirrored_kernelcore) { | 5636 | unsigned long start_pfn, end_pfn; |
5559 | unsigned long start_pfn, end_pfn; | 5637 | struct memblock_region *r; |
5560 | struct memblock_region *r; | 5638 | |
5561 | 5639 | for_each_memblock(memory, r) { | |
5562 | for_each_memblock(memory, r) { | 5640 | start_pfn = clamp(memblock_region_memory_base_pfn(r), |
5563 | start_pfn = clamp(memblock_region_memory_base_pfn(r), | 5641 | zone_start_pfn, zone_end_pfn); |
5564 | zone_start_pfn, zone_end_pfn); | 5642 | end_pfn = clamp(memblock_region_memory_end_pfn(r), |
5565 | end_pfn = clamp(memblock_region_memory_end_pfn(r), | 5643 | zone_start_pfn, zone_end_pfn); |
5566 | zone_start_pfn, zone_end_pfn); | 5644 | |
5567 | 5645 | if (zone_type == ZONE_MOVABLE && | |
5568 | if (zone_type == ZONE_MOVABLE && | 5646 | memblock_is_mirror(r)) |
5569 | memblock_is_mirror(r)) | 5647 | nr_absent += end_pfn - start_pfn; |
5570 | nr_absent += end_pfn - start_pfn; | 5648 | |
5571 | 5649 | if (zone_type == ZONE_NORMAL && | |
5572 | if (zone_type == ZONE_NORMAL && | 5650 | !memblock_is_mirror(r)) |
5573 | !memblock_is_mirror(r)) | 5651 | nr_absent += end_pfn - start_pfn; |
5574 | nr_absent += end_pfn - start_pfn; | ||
5575 | } | ||
5576 | } else { | ||
5577 | if (zone_type == ZONE_NORMAL) | ||
5578 | nr_absent += node_end_pfn - zone_movable_pfn[nid]; | ||
5579 | } | 5652 | } |
5580 | } | 5653 | } |
5581 | 5654 | ||
@@ -6929,6 +7002,17 @@ static int __init set_hashdist(char *str) | |||
6929 | __setup("hashdist=", set_hashdist); | 7002 | __setup("hashdist=", set_hashdist); |
6930 | #endif | 7003 | #endif |
6931 | 7004 | ||
7005 | #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES | ||
7006 | /* | ||
7007 | * Returns the number of pages that arch has reserved but | ||
7008 | * is not known to alloc_large_system_hash(). | ||
7009 | */ | ||
7010 | static unsigned long __init arch_reserved_kernel_pages(void) | ||
7011 | { | ||
7012 | return 0; | ||
7013 | } | ||
7014 | #endif | ||
7015 | |||
6932 | /* | 7016 | /* |
6933 | * allocate a large system hash table from bootmem | 7017 | * allocate a large system hash table from bootmem |
6934 | * - it is assumed that the hash table must contain an exact power-of-2 | 7018 | * - it is assumed that the hash table must contain an exact power-of-2 |
@@ -6953,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
6953 | if (!numentries) { | 7037 | if (!numentries) { |
6954 | /* round applicable memory size up to nearest megabyte */ | 7038 | /* round applicable memory size up to nearest megabyte */ |
6955 | numentries = nr_kernel_pages; | 7039 | numentries = nr_kernel_pages; |
7040 | numentries -= arch_reserved_kernel_pages(); | ||
6956 | 7041 | ||
6957 | /* It isn't necessary when PAGE_SIZE >= 1MB */ | 7042 | /* It isn't necessary when PAGE_SIZE >= 1MB */ |
6958 | if (PAGE_SHIFT < 20) | 7043 | if (PAGE_SHIFT < 20) |
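arch_reserved_kernel_pages() gets a zero-returning generic default that an architecture can override by defining __HAVE_ARCH_RESERVED_KERNEL_PAGES, and alloc_large_system_hash() subtracts the result unconditionally. A toy version of the override convention (macro and names mimic the shape, not the kernel headers):

#include <stdio.h>

/* An "architecture" would provide both the macro and its own definition:
 * #define HAVE_ARCH_RESERVED_PAGES
 * static unsigned long arch_reserved_pages(void) { return 4096; }
 */
#ifndef HAVE_ARCH_RESERVED_PAGES
static unsigned long arch_reserved_pages(void)
{
        return 0;                       /* generic default: nothing reserved */
}
#endif

int main(void)
{
        unsigned long numentries = 1 << 20;

        numentries -= arch_reserved_pages();    /* callers never need an #ifdef */
        printf("%lu\n", numentries);
        return 0;
}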
diff --git a/mm/page_ext.c b/mm/page_ext.c index 44a4c029c8e7..121dcffc4ec1 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c | |||
@@ -42,6 +42,11 @@ | |||
42 | * and page extension core can skip to allocate memory. As result, | 42 | * and page extension core can skip to allocate memory. As result, |
43 | * none of memory is wasted. | 43 | * none of memory is wasted. |
44 | * | 44 | * |
45 | * When need callback returns true, page_ext checks if there is a request for | ||
46 | * extra memory through size in struct page_ext_operations. If it is non-zero, | ||
47 | * extra space is allocated for each page_ext entry and offset is returned to | ||
48 | * user through offset in struct page_ext_operations. | ||
49 | * | ||
45 | * The init callback is used to do proper initialization after page extension | 50 | * The init callback is used to do proper initialization after page extension |
46 | * is completely initialized. In sparse memory system, extra memory is | 51 | * is completely initialized. In sparse memory system, extra memory is |
47 | * allocated some time later than memmap is allocated. In other words, lifetime | 52 | * allocated some time later than memmap is allocated. In other words, lifetime |
@@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = { | |||
66 | }; | 71 | }; |
67 | 72 | ||
68 | static unsigned long total_usage; | 73 | static unsigned long total_usage; |
74 | static unsigned long extra_mem; | ||
69 | 75 | ||
70 | static bool __init invoke_need_callbacks(void) | 76 | static bool __init invoke_need_callbacks(void) |
71 | { | 77 | { |
72 | int i; | 78 | int i; |
73 | int entries = ARRAY_SIZE(page_ext_ops); | 79 | int entries = ARRAY_SIZE(page_ext_ops); |
80 | bool need = false; | ||
74 | 81 | ||
75 | for (i = 0; i < entries; i++) { | 82 | for (i = 0; i < entries; i++) { |
76 | if (page_ext_ops[i]->need && page_ext_ops[i]->need()) | 83 | if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { |
77 | return true; | 84 | page_ext_ops[i]->offset = sizeof(struct page_ext) + |
85 | extra_mem; | ||
86 | extra_mem += page_ext_ops[i]->size; | ||
87 | need = true; | ||
88 | } | ||
78 | } | 89 | } |
79 | 90 | ||
80 | return false; | 91 | return need; |
81 | } | 92 | } |
82 | 93 | ||
83 | static void __init invoke_init_callbacks(void) | 94 | static void __init invoke_init_callbacks(void) |
@@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void) | |||
91 | } | 102 | } |
92 | } | 103 | } |
93 | 104 | ||
105 | static unsigned long get_entry_size(void) | ||
106 | { | ||
107 | return sizeof(struct page_ext) + extra_mem; | ||
108 | } | ||
109 | |||
110 | static inline struct page_ext *get_entry(void *base, unsigned long index) | ||
111 | { | ||
112 | return base + get_entry_size() * index; | ||
113 | } | ||
114 | |||
94 | #if !defined(CONFIG_SPARSEMEM) | 115 | #if !defined(CONFIG_SPARSEMEM) |
95 | 116 | ||
96 | 117 | ||
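page_ext entries are no longer a bare struct page_ext: every client whose need() returns true registers extra bytes, is handed a cumulative offset, and lookups step through the array with get_entry_size() = sizeof(struct page_ext) + extra_mem rather than plain struct pointer arithmetic (page_owner below uses exactly this to move its fields out of struct page_ext). A self-contained sketch of the registration and stride-based lookup (client structs and names are invented):

#include <stdio.h>
#include <stdlib.h>

struct entry_hdr { unsigned long flags; };       /* plays struct page_ext */
struct client_a  { unsigned long order; };       /* one client's extra data */
struct client_b  { unsigned long timestamp; };   /* another client's extra data */

static size_t extra_mem;
static size_t off_a, off_b;

static size_t entry_size(void)
{
        return sizeof(struct entry_hdr) + extra_mem;
}

static struct entry_hdr *get_entry(void *base, unsigned long index)
{
        return (struct entry_hdr *)((char *)base + entry_size() * index);
}

int main(void)
{
        /* "need" phase: each client appends its size and records its offset. */
        off_a = sizeof(struct entry_hdr) + extra_mem;
        extra_mem += sizeof(struct client_a);
        off_b = sizeof(struct entry_hdr) + extra_mem;
        extra_mem += sizeof(struct client_b);

        void *base = calloc(4, entry_size());    /* one stride per "page" */
        struct entry_hdr *e = get_entry(base, 2);
        struct client_a  *a = (struct client_a *)((char *)e + off_a);
        struct client_b  *b = (struct client_b *)((char *)e + off_b);

        a->order = 3;
        b->timestamp = 12345;
        printf("entry 2: order=%lu ts=%lu stride=%zu\n",
               a->order, b->timestamp, entry_size());
        free(base);
        return 0;
}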
@@ -102,7 +123,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | |||
102 | struct page_ext *lookup_page_ext(struct page *page) | 123 | struct page_ext *lookup_page_ext(struct page *page) |
103 | { | 124 | { |
104 | unsigned long pfn = page_to_pfn(page); | 125 | unsigned long pfn = page_to_pfn(page); |
105 | unsigned long offset; | 126 | unsigned long index; |
106 | struct page_ext *base; | 127 | struct page_ext *base; |
107 | 128 | ||
108 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | 129 | base = NODE_DATA(page_to_nid(page))->node_page_ext; |
@@ -119,9 +140,9 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
119 | if (unlikely(!base)) | 140 | if (unlikely(!base)) |
120 | return NULL; | 141 | return NULL; |
121 | #endif | 142 | #endif |
122 | offset = pfn - round_down(node_start_pfn(page_to_nid(page)), | 143 | index = pfn - round_down(node_start_pfn(page_to_nid(page)), |
123 | MAX_ORDER_NR_PAGES); | 144 | MAX_ORDER_NR_PAGES); |
124 | return base + offset; | 145 | return get_entry(base, index); |
125 | } | 146 | } |
126 | 147 | ||
127 | static int __init alloc_node_page_ext(int nid) | 148 | static int __init alloc_node_page_ext(int nid) |
@@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid) | |||
143 | !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) | 164 | !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) |
144 | nr_pages += MAX_ORDER_NR_PAGES; | 165 | nr_pages += MAX_ORDER_NR_PAGES; |
145 | 166 | ||
146 | table_size = sizeof(struct page_ext) * nr_pages; | 167 | table_size = get_entry_size() * nr_pages; |
147 | 168 | ||
148 | base = memblock_virt_alloc_try_nid_nopanic( | 169 | base = memblock_virt_alloc_try_nid_nopanic( |
149 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | 170 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
@@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
196 | if (!section->page_ext) | 217 | if (!section->page_ext) |
197 | return NULL; | 218 | return NULL; |
198 | #endif | 219 | #endif |
199 | return section->page_ext + pfn; | 220 | return get_entry(section->page_ext, pfn); |
200 | } | 221 | } |
201 | 222 | ||
202 | static void *__meminit alloc_page_ext(size_t size, int nid) | 223 | static void *__meminit alloc_page_ext(size_t size, int nid) |
@@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) | |||
229 | if (section->page_ext) | 250 | if (section->page_ext) |
230 | return 0; | 251 | return 0; |
231 | 252 | ||
232 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | 253 | table_size = get_entry_size() * PAGES_PER_SECTION; |
233 | base = alloc_page_ext(table_size, nid); | 254 | base = alloc_page_ext(table_size, nid); |
234 | 255 | ||
235 | /* | 256 | /* |
@@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) | |||
249 | * we need to apply a mask. | 270 | * we need to apply a mask. |
250 | */ | 271 | */ |
251 | pfn &= PAGE_SECTION_MASK; | 272 | pfn &= PAGE_SECTION_MASK; |
252 | section->page_ext = base - pfn; | 273 | section->page_ext = (void *)base - get_entry_size() * pfn; |
253 | total_usage += table_size; | 274 | total_usage += table_size; |
254 | return 0; | 275 | return 0; |
255 | } | 276 | } |
@@ -262,7 +283,7 @@ static void free_page_ext(void *addr) | |||
262 | struct page *page = virt_to_page(addr); | 283 | struct page *page = virt_to_page(addr); |
263 | size_t table_size; | 284 | size_t table_size; |
264 | 285 | ||
265 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | 286 | table_size = get_entry_size() * PAGES_PER_SECTION; |
266 | 287 | ||
267 | BUG_ON(PageReserved(page)); | 288 | BUG_ON(PageReserved(page)); |
268 | free_pages_exact(addr, table_size); | 289 | free_pages_exact(addr, table_size); |
@@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn) | |||
277 | ms = __pfn_to_section(pfn); | 298 | ms = __pfn_to_section(pfn); |
278 | if (!ms || !ms->page_ext) | 299 | if (!ms || !ms->page_ext) |
279 | return; | 300 | return; |
280 | base = ms->page_ext + pfn; | 301 | base = get_entry(ms->page_ext, pfn); |
281 | free_page_ext(base); | 302 | free_page_ext(base); |
282 | ms->page_ext = NULL; | 303 | ms->page_ext = NULL; |
283 | } | 304 | } |
diff --git a/mm/page_io.c b/mm/page_io.c index eafe5ddc2b54..a2651f58c86a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
264 | int ret; | 264 | int ret; |
265 | struct swap_info_struct *sis = page_swap_info(page); | 265 | struct swap_info_struct *sis = page_swap_info(page); |
266 | 266 | ||
267 | BUG_ON(!PageSwapCache(page)); | 267 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
268 | if (sis->flags & SWP_FILE) { | 268 | if (sis->flags & SWP_FILE) { |
269 | struct kiocb kiocb; | 269 | struct kiocb kiocb; |
270 | struct file *swap_file = sis->swap_file; | 270 | struct file *swap_file = sis->swap_file; |
@@ -338,7 +338,7 @@ int swap_readpage(struct page *page) | |||
338 | int ret = 0; | 338 | int ret = 0; |
339 | struct swap_info_struct *sis = page_swap_info(page); | 339 | struct swap_info_struct *sis = page_swap_info(page); |
340 | 340 | ||
341 | BUG_ON(!PageSwapCache(page)); | 341 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
342 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 342 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
343 | VM_BUG_ON_PAGE(PageUptodate(page), page); | 343 | VM_BUG_ON_PAGE(PageUptodate(page), page); |
344 | if (frontswap_load(page) == 0) { | 344 | if (frontswap_load(page) == 0) { |
@@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page) | |||
388 | 388 | ||
389 | if (sis->flags & SWP_FILE) { | 389 | if (sis->flags & SWP_FILE) { |
390 | struct address_space *mapping = sis->swap_file->f_mapping; | 390 | struct address_space *mapping = sis->swap_file->f_mapping; |
391 | BUG_ON(!PageSwapCache(page)); | 391 | |
392 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | ||
392 | return mapping->a_ops->set_page_dirty(page); | 393 | return mapping->a_ops->set_page_dirty(page); |
393 | } else { | 394 | } else { |
394 | return __set_page_dirty_no_writeback(page); | 395 | return __set_page_dirty_no_writeback(page); |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 064b7fb6e0b5..a5594bfcc5ed 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page, | |||
55 | ret = 0; | 55 | ret = 0; |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * immobile means "not-on-lru" paes. If immobile is larger than | 58 | * immobile means "not-on-lru" pages. If immobile is larger than |
59 | * removable-by-driver pages reported by notifier, we'll fail. | 59 | * removable-by-driver pages reported by notifier, we'll fail. |
60 | */ | 60 | */ |
61 | 61 | ||
diff --git a/mm/page_owner.c b/mm/page_owner.c index ec6dc1886f71..60634dc53a88 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/jump_label.h> | 8 | #include <linux/jump_label.h> |
9 | #include <linux/migrate.h> | 9 | #include <linux/migrate.h> |
10 | #include <linux/stackdepot.h> | 10 | #include <linux/stackdepot.h> |
11 | #include <linux/seq_file.h> | ||
11 | 12 | ||
12 | #include "internal.h" | 13 | #include "internal.h" |
13 | 14 | ||
@@ -17,6 +18,13 @@ | |||
17 | */ | 18 | */ |
18 | #define PAGE_OWNER_STACK_DEPTH (16) | 19 | #define PAGE_OWNER_STACK_DEPTH (16) |
19 | 20 | ||
21 | struct page_owner { | ||
22 | unsigned int order; | ||
23 | gfp_t gfp_mask; | ||
24 | int last_migrate_reason; | ||
25 | depot_stack_handle_t handle; | ||
26 | }; | ||
27 | |||
20 | static bool page_owner_disabled = true; | 28 | static bool page_owner_disabled = true; |
21 | DEFINE_STATIC_KEY_FALSE(page_owner_inited); | 29 | DEFINE_STATIC_KEY_FALSE(page_owner_inited); |
22 | 30 | ||
@@ -85,10 +93,16 @@ static void init_page_owner(void) | |||
85 | } | 93 | } |
86 | 94 | ||
87 | struct page_ext_operations page_owner_ops = { | 95 | struct page_ext_operations page_owner_ops = { |
96 | .size = sizeof(struct page_owner), | ||
88 | .need = need_page_owner, | 97 | .need = need_page_owner, |
89 | .init = init_page_owner, | 98 | .init = init_page_owner, |
90 | }; | 99 | }; |
91 | 100 | ||
101 | static inline struct page_owner *get_page_owner(struct page_ext *page_ext) | ||
102 | { | ||
103 | return (void *)page_ext + page_owner_ops.offset; | ||
104 | } | ||
105 | |||
92 | void __reset_page_owner(struct page *page, unsigned int order) | 106 | void __reset_page_owner(struct page *page, unsigned int order) |
93 | { | 107 | { |
94 | int i; | 108 | int i; |
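The interesting part of this hunk is that page_owner's per-page data no longer lives in struct page_ext itself: the client declares its footprint with .size in page_ext_operations, the page_ext core assigns it an offset, and get_page_owner() is plain base-plus-offset arithmetic. A self-contained sketch of that "opaque extension blob after a shared header" pattern, using invented names (ext_header, register_client, owner_info) purely for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Shared per-object header; clients append opaque blobs after it. */
struct ext_header { unsigned long flags; };

struct client_ops { size_t size; size_t offset; };

static size_t total_size = sizeof(struct ext_header);

/* Hand each client a slice after the header, like page_ext does with .size/.offset. */
static void register_client(struct client_ops *ops)
{
        ops->offset = total_size;
        total_size += ops->size;
}

struct owner_info { unsigned int order; int last_migrate_reason; };

static struct client_ops owner_ops = { .size = sizeof(struct owner_info) };

static struct owner_info *get_owner(struct ext_header *ext)
{
        return (void *)((char *)ext + owner_ops.offset);
}

int main(void)
{
        register_client(&owner_ops);

        struct ext_header *ext = calloc(1, total_size);
        struct owner_info *owner = get_owner(ext);

        owner->order = 3;
        owner->last_migrate_reason = -1;
        printf("offset=%zu order=%u\n", owner_ops.offset, owner->order);
        free(ext);
        return 0;
}

The apparent payoff is that struct page_ext stays small when a client is not enabled, since space is only reserved for clients whose .need() callback asks for it.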
@@ -155,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order, | |||
155 | gfp_t gfp_mask) | 169 | gfp_t gfp_mask) |
156 | { | 170 | { |
157 | struct page_ext *page_ext = lookup_page_ext(page); | 171 | struct page_ext *page_ext = lookup_page_ext(page); |
172 | struct page_owner *page_owner; | ||
158 | 173 | ||
159 | if (unlikely(!page_ext)) | 174 | if (unlikely(!page_ext)) |
160 | return; | 175 | return; |
161 | 176 | ||
162 | page_ext->handle = save_stack(gfp_mask); | 177 | page_owner = get_page_owner(page_ext); |
163 | page_ext->order = order; | 178 | page_owner->handle = save_stack(gfp_mask); |
164 | page_ext->gfp_mask = gfp_mask; | 179 | page_owner->order = order; |
165 | page_ext->last_migrate_reason = -1; | 180 | page_owner->gfp_mask = gfp_mask; |
181 | page_owner->last_migrate_reason = -1; | ||
166 | 182 | ||
167 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 183 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
168 | } | 184 | } |
@@ -170,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order, | |||
170 | void __set_page_owner_migrate_reason(struct page *page, int reason) | 186 | void __set_page_owner_migrate_reason(struct page *page, int reason) |
171 | { | 187 | { |
172 | struct page_ext *page_ext = lookup_page_ext(page); | 188 | struct page_ext *page_ext = lookup_page_ext(page); |
189 | struct page_owner *page_owner; | ||
190 | |||
173 | if (unlikely(!page_ext)) | 191 | if (unlikely(!page_ext)) |
174 | return; | 192 | return; |
175 | 193 | ||
176 | page_ext->last_migrate_reason = reason; | 194 | page_owner = get_page_owner(page_ext); |
195 | page_owner->last_migrate_reason = reason; | ||
177 | } | 196 | } |
178 | 197 | ||
179 | void __split_page_owner(struct page *page, unsigned int order) | 198 | void __split_page_owner(struct page *page, unsigned int order) |
180 | { | 199 | { |
181 | int i; | 200 | int i; |
182 | struct page_ext *page_ext = lookup_page_ext(page); | 201 | struct page_ext *page_ext = lookup_page_ext(page); |
202 | struct page_owner *page_owner; | ||
183 | 203 | ||
184 | if (unlikely(!page_ext)) | 204 | if (unlikely(!page_ext)) |
185 | return; | 205 | return; |
186 | 206 | ||
187 | page_ext->order = 0; | 207 | page_owner = get_page_owner(page_ext); |
208 | page_owner->order = 0; | ||
188 | for (i = 1; i < (1 << order); i++) | 209 | for (i = 1; i < (1 << order); i++) |
189 | __copy_page_owner(page, page + i); | 210 | __copy_page_owner(page, page + i); |
190 | } | 211 | } |
@@ -193,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) | |||
193 | { | 214 | { |
194 | struct page_ext *old_ext = lookup_page_ext(oldpage); | 215 | struct page_ext *old_ext = lookup_page_ext(oldpage); |
195 | struct page_ext *new_ext = lookup_page_ext(newpage); | 216 | struct page_ext *new_ext = lookup_page_ext(newpage); |
217 | struct page_owner *old_page_owner, *new_page_owner; | ||
196 | 218 | ||
197 | if (unlikely(!old_ext || !new_ext)) | 219 | if (unlikely(!old_ext || !new_ext)) |
198 | return; | 220 | return; |
199 | 221 | ||
200 | new_ext->order = old_ext->order; | 222 | old_page_owner = get_page_owner(old_ext); |
201 | new_ext->gfp_mask = old_ext->gfp_mask; | 223 | new_page_owner = get_page_owner(new_ext); |
202 | new_ext->last_migrate_reason = old_ext->last_migrate_reason; | 224 | new_page_owner->order = old_page_owner->order; |
203 | new_ext->handle = old_ext->handle; | 225 | new_page_owner->gfp_mask = old_page_owner->gfp_mask; |
226 | new_page_owner->last_migrate_reason = | ||
227 | old_page_owner->last_migrate_reason; | ||
228 | new_page_owner->handle = old_page_owner->handle; | ||
204 | 229 | ||
205 | /* | 230 | /* |
206 | * We don't clear the bit on the oldpage as it's going to be freed | 231 | * We don't clear the bit on the oldpage as it's going to be freed |
@@ -214,9 +239,88 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) | |||
214 | __set_bit(PAGE_EXT_OWNER, &new_ext->flags); | 239 | __set_bit(PAGE_EXT_OWNER, &new_ext->flags); |
215 | } | 240 | } |
216 | 241 | ||
242 | void pagetypeinfo_showmixedcount_print(struct seq_file *m, | ||
243 | pg_data_t *pgdat, struct zone *zone) | ||
244 | { | ||
245 | struct page *page; | ||
246 | struct page_ext *page_ext; | ||
247 | struct page_owner *page_owner; | ||
248 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
249 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
250 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
251 | int pageblock_mt, page_mt; | ||
252 | int i; | ||
253 | |||
254 | /* Scan block by block. First and last block may be incomplete */ | ||
255 | pfn = zone->zone_start_pfn; | ||
256 | |||
257 | /* | ||
258 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
259 | * a zone boundary, it will be double counted between zones. This does | ||
260 | * not matter as the mixed block count will still be correct | ||
261 | */ | ||
262 | for (; pfn < end_pfn; ) { | ||
263 | if (!pfn_valid(pfn)) { | ||
264 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
265 | continue; | ||
266 | } | ||
267 | |||
268 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
269 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
270 | |||
271 | page = pfn_to_page(pfn); | ||
272 | pageblock_mt = get_pageblock_migratetype(page); | ||
273 | |||
274 | for (; pfn < block_end_pfn; pfn++) { | ||
275 | if (!pfn_valid_within(pfn)) | ||
276 | continue; | ||
277 | |||
278 | page = pfn_to_page(pfn); | ||
279 | |||
280 | if (page_zone(page) != zone) | ||
281 | continue; | ||
282 | |||
283 | if (PageBuddy(page)) { | ||
284 | pfn += (1UL << page_order(page)) - 1; | ||
285 | continue; | ||
286 | } | ||
287 | |||
288 | if (PageReserved(page)) | ||
289 | continue; | ||
290 | |||
291 | page_ext = lookup_page_ext(page); | ||
292 | if (unlikely(!page_ext)) | ||
293 | continue; | ||
294 | |||
295 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
296 | continue; | ||
297 | |||
298 | page_owner = get_page_owner(page_ext); | ||
299 | page_mt = gfpflags_to_migratetype( | ||
300 | page_owner->gfp_mask); | ||
301 | if (pageblock_mt != page_mt) { | ||
302 | if (is_migrate_cma(pageblock_mt)) | ||
303 | count[MIGRATE_MOVABLE]++; | ||
304 | else | ||
305 | count[pageblock_mt]++; | ||
306 | |||
307 | pfn = block_end_pfn; | ||
308 | break; | ||
309 | } | ||
310 | pfn += (1UL << page_owner->order) - 1; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* Print counts */ | ||
315 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
316 | for (i = 0; i < MIGRATE_TYPES; i++) | ||
317 | seq_printf(m, "%12lu ", count[i]); | ||
318 | seq_putc(m, '\n'); | ||
319 | } | ||
320 | |||
217 | static ssize_t | 321 | static ssize_t |
218 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, | 322 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, |
219 | struct page *page, struct page_ext *page_ext, | 323 | struct page *page, struct page_owner *page_owner, |
220 | depot_stack_handle_t handle) | 324 | depot_stack_handle_t handle) |
221 | { | 325 | { |
222 | int ret; | 326 | int ret; |
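pagetypeinfo_showmixedcount_print(), moved into page_owner.c here (its removal from vmstat.c appears further down), walks the zone one pageblock at a time, skips buddy and reserved pages, and abandons a block as soon as it finds a page whose allocation migratetype disagrees with the block's. The skeleton of that two-level walk, reduced to a runnable sketch over plain arrays (BLOCK and the type arrays are stand-ins for pageblock_nr_pages and the real migratetype queries):

#include <stdio.h>

#define BLOCK 4   /* stand-in for pageblock_nr_pages */

/* Count blocks whose declared type disagrees with any page inside them. */
static int count_mixed_blocks(const int *block_type, const int *page_type, int npages)
{
        int mixed = 0;

        for (int pfn = 0; pfn < npages; ) {
                int block_end = pfn + BLOCK < npages ? pfn + BLOCK : npages;
                int bt = block_type[pfn / BLOCK];

                for (; pfn < block_end; pfn++) {
                        if (page_type[pfn] != bt) {
                                mixed++;
                                pfn = block_end;   /* one mismatch is enough for this block */
                                break;
                        }
                }
        }
        return mixed;
}

int main(void)
{
        int block_type[] = { 0, 1 };                       /* per-block migratetype */
        int page_type[]  = { 0, 0, 0, 0, 1, 1, 0, 1 };     /* per-page migratetype */

        printf("mixed blocks: %d\n", count_mixed_blocks(block_type, page_type, 8));
        return 0;
}

The kernel version additionally jumps over whole buddy chunks (pfn += 1 << page_order) and, with this series, reads the allocation type out of struct page_owner rather than page_ext.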
@@ -236,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
236 | 340 | ||
237 | ret = snprintf(kbuf, count, | 341 | ret = snprintf(kbuf, count, |
238 | "Page allocated via order %u, mask %#x(%pGg)\n", | 342 | "Page allocated via order %u, mask %#x(%pGg)\n", |
239 | page_ext->order, page_ext->gfp_mask, | 343 | page_owner->order, page_owner->gfp_mask, |
240 | &page_ext->gfp_mask); | 344 | &page_owner->gfp_mask); |
241 | 345 | ||
242 | if (ret >= count) | 346 | if (ret >= count) |
243 | goto err; | 347 | goto err; |
244 | 348 | ||
245 | /* Print information relevant to grouping pages by mobility */ | 349 | /* Print information relevant to grouping pages by mobility */ |
246 | pageblock_mt = get_pageblock_migratetype(page); | 350 | pageblock_mt = get_pageblock_migratetype(page); |
247 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | 351 | page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); |
248 | ret += snprintf(kbuf + ret, count - ret, | 352 | ret += snprintf(kbuf + ret, count - ret, |
249 | "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", | 353 | "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", |
250 | pfn, | 354 | pfn, |
@@ -261,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
261 | if (ret >= count) | 365 | if (ret >= count) |
262 | goto err; | 366 | goto err; |
263 | 367 | ||
264 | if (page_ext->last_migrate_reason != -1) { | 368 | if (page_owner->last_migrate_reason != -1) { |
265 | ret += snprintf(kbuf + ret, count - ret, | 369 | ret += snprintf(kbuf + ret, count - ret, |
266 | "Page has been migrated, last migrate reason: %s\n", | 370 | "Page has been migrated, last migrate reason: %s\n", |
267 | migrate_reason_names[page_ext->last_migrate_reason]); | 371 | migrate_reason_names[page_owner->last_migrate_reason]); |
268 | if (ret >= count) | 372 | if (ret >= count) |
269 | goto err; | 373 | goto err; |
270 | } | 374 | } |
@@ -287,6 +391,7 @@ err: | |||
287 | void __dump_page_owner(struct page *page) | 391 | void __dump_page_owner(struct page *page) |
288 | { | 392 | { |
289 | struct page_ext *page_ext = lookup_page_ext(page); | 393 | struct page_ext *page_ext = lookup_page_ext(page); |
394 | struct page_owner *page_owner; | ||
290 | unsigned long entries[PAGE_OWNER_STACK_DEPTH]; | 395 | unsigned long entries[PAGE_OWNER_STACK_DEPTH]; |
291 | struct stack_trace trace = { | 396 | struct stack_trace trace = { |
292 | .nr_entries = 0, | 397 | .nr_entries = 0, |
@@ -302,7 +407,9 @@ void __dump_page_owner(struct page *page) | |||
302 | pr_alert("There is not page extension available.\n"); | 407 | pr_alert("There is not page extension available.\n"); |
303 | return; | 408 | return; |
304 | } | 409 | } |
305 | gfp_mask = page_ext->gfp_mask; | 410 | |
411 | page_owner = get_page_owner(page_ext); | ||
412 | gfp_mask = page_owner->gfp_mask; | ||
306 | mt = gfpflags_to_migratetype(gfp_mask); | 413 | mt = gfpflags_to_migratetype(gfp_mask); |
307 | 414 | ||
308 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { | 415 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { |
@@ -310,7 +417,7 @@ void __dump_page_owner(struct page *page) | |||
310 | return; | 417 | return; |
311 | } | 418 | } |
312 | 419 | ||
313 | handle = READ_ONCE(page_ext->handle); | 420 | handle = READ_ONCE(page_owner->handle); |
314 | if (!handle) { | 421 | if (!handle) { |
315 | pr_alert("page_owner info is not active (free page?)\n"); | 422 | pr_alert("page_owner info is not active (free page?)\n"); |
316 | return; | 423 | return; |
@@ -318,12 +425,12 @@ void __dump_page_owner(struct page *page) | |||
318 | 425 | ||
319 | depot_fetch_stack(handle, &trace); | 426 | depot_fetch_stack(handle, &trace); |
320 | pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", | 427 | pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", |
321 | page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); | 428 | page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); |
322 | print_stack_trace(&trace, 0); | 429 | print_stack_trace(&trace, 0); |
323 | 430 | ||
324 | if (page_ext->last_migrate_reason != -1) | 431 | if (page_owner->last_migrate_reason != -1) |
325 | pr_alert("page has been migrated, last migrate reason: %s\n", | 432 | pr_alert("page has been migrated, last migrate reason: %s\n", |
326 | migrate_reason_names[page_ext->last_migrate_reason]); | 433 | migrate_reason_names[page_owner->last_migrate_reason]); |
327 | } | 434 | } |
328 | 435 | ||
329 | static ssize_t | 436 | static ssize_t |
@@ -332,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
332 | unsigned long pfn; | 439 | unsigned long pfn; |
333 | struct page *page; | 440 | struct page *page; |
334 | struct page_ext *page_ext; | 441 | struct page_ext *page_ext; |
442 | struct page_owner *page_owner; | ||
335 | depot_stack_handle_t handle; | 443 | depot_stack_handle_t handle; |
336 | 444 | ||
337 | if (!static_branch_unlikely(&page_owner_inited)) | 445 | if (!static_branch_unlikely(&page_owner_inited)) |
@@ -381,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
381 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | 489 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) |
382 | continue; | 490 | continue; |
383 | 491 | ||
492 | page_owner = get_page_owner(page_ext); | ||
493 | |||
384 | /* | 494 | /* |
385 | * Access to page_ext->handle isn't synchronous so we should | 495 | * Access to page_ext->handle isn't synchronous so we should |
386 | * be careful to access it. | 496 | * be careful to access it. |
387 | */ | 497 | */ |
388 | handle = READ_ONCE(page_ext->handle); | 498 | handle = READ_ONCE(page_owner->handle); |
389 | if (!handle) | 499 | if (!handle) |
390 | continue; | 500 | continue; |
391 | 501 | ||
@@ -393,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
393 | *ppos = (pfn - min_low_pfn) + 1; | 503 | *ppos = (pfn - min_low_pfn) + 1; |
394 | 504 | ||
395 | return print_page_owner(buf, count, pfn, page, | 505 | return print_page_owner(buf, count, pfn, page, |
396 | page_ext, handle); | 506 | page_owner, handle); |
397 | } | 507 | } |
398 | 508 | ||
399 | return 0; | 509 | return 0; |
diff --git a/mm/shmem.c b/mm/shmem.c index d86b5e455fef..0e9901e69d24 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -3965,7 +3965,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); | |||
3965 | 3965 | ||
3966 | /* common code */ | 3966 | /* common code */ |
3967 | 3967 | ||
3968 | static struct dentry_operations anon_ops = { | 3968 | static const struct dentry_operations anon_ops = { |
3969 | .d_dname = simple_dname | 3969 | .d_dname = simple_dname |
3970 | }; | 3970 | }; |
3971 | 3971 | ||
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
748 | locked_pgdat = NULL; | 748 | locked_pgdat = NULL; |
749 | } | 749 | } |
750 | 750 | ||
751 | if (is_huge_zero_page(page)) { | 751 | if (is_huge_zero_page(page)) |
752 | put_huge_zero_page(); | ||
753 | continue; | 752 | continue; |
754 | } | ||
755 | 753 | ||
756 | page = compound_head(page); | 754 | page = compound_head(page); |
757 | if (!put_page_testzero(page)) | 755 | if (!put_page_testzero(page)) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index c8310a37be3a..35d7e0ee1c77 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { | |||
37 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 37 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
38 | .i_mmap_writable = ATOMIC_INIT(0), | 38 | .i_mmap_writable = ATOMIC_INIT(0), |
39 | .a_ops = &swap_aops, | 39 | .a_ops = &swap_aops, |
40 | /* swap cache doesn't use writeback related tags */ | ||
41 | .flags = 1 << AS_NO_WRITEBACK_TAGS, | ||
40 | } | 42 | } |
41 | }; | 43 | }; |
42 | 44 | ||
@@ -92,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
92 | address_space = swap_address_space(entry); | 94 | address_space = swap_address_space(entry); |
93 | spin_lock_irq(&address_space->tree_lock); | 95 | spin_lock_irq(&address_space->tree_lock); |
94 | error = radix_tree_insert(&address_space->page_tree, | 96 | error = radix_tree_insert(&address_space->page_tree, |
95 | entry.val, page); | 97 | swp_offset(entry), page); |
96 | if (likely(!error)) { | 98 | if (likely(!error)) { |
97 | address_space->nrpages++; | 99 | address_space->nrpages++; |
98 | __inc_node_page_state(page, NR_FILE_PAGES); | 100 | __inc_node_page_state(page, NR_FILE_PAGES); |
@@ -143,7 +145,7 @@ void __delete_from_swap_cache(struct page *page) | |||
143 | 145 | ||
144 | entry.val = page_private(page); | 146 | entry.val = page_private(page); |
145 | address_space = swap_address_space(entry); | 147 | address_space = swap_address_space(entry); |
146 | radix_tree_delete(&address_space->page_tree, page_private(page)); | 148 | radix_tree_delete(&address_space->page_tree, swp_offset(entry)); |
147 | set_page_private(page, 0); | 149 | set_page_private(page, 0); |
148 | ClearPageSwapCache(page); | 150 | ClearPageSwapCache(page); |
149 | address_space->nrpages--; | 151 | address_space->nrpages--; |
@@ -252,9 +254,7 @@ static inline void free_swap_cache(struct page *page) | |||
252 | void free_page_and_swap_cache(struct page *page) | 254 | void free_page_and_swap_cache(struct page *page) |
253 | { | 255 | { |
254 | free_swap_cache(page); | 256 | free_swap_cache(page); |
255 | if (is_huge_zero_page(page)) | 257 | if (!is_huge_zero_page(page)) |
256 | put_huge_zero_page(); | ||
257 | else | ||
258 | put_page(page); | 258 | put_page(page); |
259 | } | 259 | } |
260 | 260 | ||
@@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
283 | { | 283 | { |
284 | struct page *page; | 284 | struct page *page; |
285 | 285 | ||
286 | page = find_get_page(swap_address_space(entry), entry.val); | 286 | page = find_get_page(swap_address_space(entry), swp_offset(entry)); |
287 | 287 | ||
288 | if (page) { | 288 | if (page) { |
289 | INC_CACHE_INFO(find_success); | 289 | INC_CACHE_INFO(find_success); |
@@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
310 | * called after lookup_swap_cache() failed, re-calling | 310 | * called after lookup_swap_cache() failed, re-calling |
311 | * that would confuse statistics. | 311 | * that would confuse statistics. |
312 | */ | 312 | */ |
313 | found_page = find_get_page(swapper_space, entry.val); | 313 | found_page = find_get_page(swapper_space, swp_offset(entry)); |
314 | if (found_page) | 314 | if (found_page) |
315 | break; | 315 | break; |
316 | 316 | ||
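These swap_state.c hunks stop keying the radix tree by the full entry.val and use swp_offset(entry) instead; since there is already one swapper address space per swap type (swapper_spaces[MAX_SWAPFILES] above, selected by swap_address_space()), the type component is redundant in the key. A swp_entry_t packs a type and an offset into one word, roughly as below; the exact bit split is config- and arch-dependent, so OFFSET_BITS here is purely illustrative:

#include <stdio.h>

/* Illustrative swap-entry packing: type in the high bits, offset in the low bits. */
#define OFFSET_BITS 58UL   /* assumption for illustration; the kernel derives this */

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
        return (swp_entry_t){ (type << OFFSET_BITS) | offset };
}

static unsigned long swp_type(swp_entry_t e)   { return e.val >> OFFSET_BITS; }
static unsigned long swp_offset(swp_entry_t e) { return e.val & ((1UL << OFFSET_BITS) - 1); }

int main(void)
{
        swp_entry_t e = swp_entry(2, 12345);

        /* swp_type() picks the per-type address space; swp_offset() is the tree key. */
        printf("type=%lu offset=%lu val=%#lx\n", swp_type(e), swp_offset(e), e.val);
        return 0;
}

Using the offset alone keeps the keys dense within each per-type tree instead of scattering them across the full encoded-value space.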
diff --git a/mm/swapfile.c b/mm/swapfile.c index 2657accc6e2b..2210de290b54 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | |||
105 | struct page *page; | 105 | struct page *page; |
106 | int ret = 0; | 106 | int ret = 0; |
107 | 107 | ||
108 | page = find_get_page(swap_address_space(entry), entry.val); | 108 | page = find_get_page(swap_address_space(entry), swp_offset(entry)); |
109 | if (!page) | 109 | if (!page) |
110 | return 0; | 110 | return 0; |
111 | /* | 111 | /* |
@@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info) | |||
257 | info->data = 0; | 257 | info->data = 0; |
258 | } | 258 | } |
259 | 259 | ||
260 | static inline bool cluster_list_empty(struct swap_cluster_list *list) | ||
261 | { | ||
262 | return cluster_is_null(&list->head); | ||
263 | } | ||
264 | |||
265 | static inline unsigned int cluster_list_first(struct swap_cluster_list *list) | ||
266 | { | ||
267 | return cluster_next(&list->head); | ||
268 | } | ||
269 | |||
270 | static void cluster_list_init(struct swap_cluster_list *list) | ||
271 | { | ||
272 | cluster_set_null(&list->head); | ||
273 | cluster_set_null(&list->tail); | ||
274 | } | ||
275 | |||
276 | static void cluster_list_add_tail(struct swap_cluster_list *list, | ||
277 | struct swap_cluster_info *ci, | ||
278 | unsigned int idx) | ||
279 | { | ||
280 | if (cluster_list_empty(list)) { | ||
281 | cluster_set_next_flag(&list->head, idx, 0); | ||
282 | cluster_set_next_flag(&list->tail, idx, 0); | ||
283 | } else { | ||
284 | unsigned int tail = cluster_next(&list->tail); | ||
285 | |||
286 | cluster_set_next(&ci[tail], idx); | ||
287 | cluster_set_next_flag(&list->tail, idx, 0); | ||
288 | } | ||
289 | } | ||
290 | |||
291 | static unsigned int cluster_list_del_first(struct swap_cluster_list *list, | ||
292 | struct swap_cluster_info *ci) | ||
293 | { | ||
294 | unsigned int idx; | ||
295 | |||
296 | idx = cluster_next(&list->head); | ||
297 | if (cluster_next(&list->tail) == idx) { | ||
298 | cluster_set_null(&list->head); | ||
299 | cluster_set_null(&list->tail); | ||
300 | } else | ||
301 | cluster_set_next_flag(&list->head, | ||
302 | cluster_next(&ci[idx]), 0); | ||
303 | |||
304 | return idx; | ||
305 | } | ||
306 | |||
260 | /* Add a cluster to discard list and schedule it to do discard */ | 307 | /* Add a cluster to discard list and schedule it to do discard */ |
261 | static void swap_cluster_schedule_discard(struct swap_info_struct *si, | 308 | static void swap_cluster_schedule_discard(struct swap_info_struct *si, |
262 | unsigned int idx) | 309 | unsigned int idx) |
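The new cluster_list_* helpers above fold four copies of open-coded head/tail manipulation into one place. The list itself is unusual: it is threaded through the cluster_info array by index, with head and tail stored as swap_cluster_info entries whose "next" field points back into the array. A plain-C sketch of that index-threaded FIFO; NIL, next[] and the struct names are stand-ins, not the kernel types:

#include <stdio.h>

#define NIL       (~0u)
#define NCLUSTERS 8

struct idx_list { unsigned int head, tail; };

/* next[i] is the index of the cluster queued after cluster i. */
static unsigned int next[NCLUSTERS];

static void list_init(struct idx_list *l)        { l->head = l->tail = NIL; }
static int  list_empty(const struct idx_list *l) { return l->head == NIL; }

static void list_add_tail(struct idx_list *l, unsigned int idx)
{
        next[idx] = NIL;
        if (list_empty(l))
                l->head = idx;
        else
                next[l->tail] = idx;
        l->tail = idx;
}

static unsigned int list_del_first(struct idx_list *l)
{
        unsigned int idx = l->head;

        if (l->tail == idx)             /* last element: list becomes empty */
                l->head = l->tail = NIL;
        else
                l->head = next[idx];
        return idx;
}

int main(void)
{
        struct idx_list free_clusters;

        list_init(&free_clusters);
        list_add_tail(&free_clusters, 3);
        list_add_tail(&free_clusters, 5);
        list_add_tail(&free_clusters, 1);

        while (!list_empty(&free_clusters))
                printf("got cluster %u\n", list_del_first(&free_clusters));
        return 0;
}

The payoff shows up in the hunks below: free_clusters and discard_clusters shrink to single cluster_list_add_tail()/cluster_list_del_first() calls.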
@@ -270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, | |||
270 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, | 317 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, |
271 | SWAP_MAP_BAD, SWAPFILE_CLUSTER); | 318 | SWAP_MAP_BAD, SWAPFILE_CLUSTER); |
272 | 319 | ||
273 | if (cluster_is_null(&si->discard_cluster_head)) { | 320 | cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); |
274 | cluster_set_next_flag(&si->discard_cluster_head, | ||
275 | idx, 0); | ||
276 | cluster_set_next_flag(&si->discard_cluster_tail, | ||
277 | idx, 0); | ||
278 | } else { | ||
279 | unsigned int tail = cluster_next(&si->discard_cluster_tail); | ||
280 | cluster_set_next(&si->cluster_info[tail], idx); | ||
281 | cluster_set_next_flag(&si->discard_cluster_tail, | ||
282 | idx, 0); | ||
283 | } | ||
284 | 321 | ||
285 | schedule_work(&si->discard_work); | 322 | schedule_work(&si->discard_work); |
286 | } | 323 | } |
@@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) | |||
296 | 333 | ||
297 | info = si->cluster_info; | 334 | info = si->cluster_info; |
298 | 335 | ||
299 | while (!cluster_is_null(&si->discard_cluster_head)) { | 336 | while (!cluster_list_empty(&si->discard_clusters)) { |
300 | idx = cluster_next(&si->discard_cluster_head); | 337 | idx = cluster_list_del_first(&si->discard_clusters, info); |
301 | |||
302 | cluster_set_next_flag(&si->discard_cluster_head, | ||
303 | cluster_next(&info[idx]), 0); | ||
304 | if (cluster_next(&si->discard_cluster_tail) == idx) { | ||
305 | cluster_set_null(&si->discard_cluster_head); | ||
306 | cluster_set_null(&si->discard_cluster_tail); | ||
307 | } | ||
308 | spin_unlock(&si->lock); | 338 | spin_unlock(&si->lock); |
309 | 339 | ||
310 | discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, | 340 | discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, |
@@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) | |||
312 | 342 | ||
313 | spin_lock(&si->lock); | 343 | spin_lock(&si->lock); |
314 | cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); | 344 | cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); |
315 | if (cluster_is_null(&si->free_cluster_head)) { | 345 | cluster_list_add_tail(&si->free_clusters, info, idx); |
316 | cluster_set_next_flag(&si->free_cluster_head, | ||
317 | idx, 0); | ||
318 | cluster_set_next_flag(&si->free_cluster_tail, | ||
319 | idx, 0); | ||
320 | } else { | ||
321 | unsigned int tail; | ||
322 | |||
323 | tail = cluster_next(&si->free_cluster_tail); | ||
324 | cluster_set_next(&info[tail], idx); | ||
325 | cluster_set_next_flag(&si->free_cluster_tail, | ||
326 | idx, 0); | ||
327 | } | ||
328 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, | 346 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, |
329 | 0, SWAPFILE_CLUSTER); | 347 | 0, SWAPFILE_CLUSTER); |
330 | } | 348 | } |
@@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, | |||
353 | if (!cluster_info) | 371 | if (!cluster_info) |
354 | return; | 372 | return; |
355 | if (cluster_is_free(&cluster_info[idx])) { | 373 | if (cluster_is_free(&cluster_info[idx])) { |
356 | VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); | 374 | VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); |
357 | cluster_set_next_flag(&p->free_cluster_head, | 375 | cluster_list_del_first(&p->free_clusters, cluster_info); |
358 | cluster_next(&cluster_info[idx]), 0); | ||
359 | if (cluster_next(&p->free_cluster_tail) == idx) { | ||
360 | cluster_set_null(&p->free_cluster_tail); | ||
361 | cluster_set_null(&p->free_cluster_head); | ||
362 | } | ||
363 | cluster_set_count_flag(&cluster_info[idx], 0, 0); | 376 | cluster_set_count_flag(&cluster_info[idx], 0, 0); |
364 | } | 377 | } |
365 | 378 | ||
@@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, | |||
398 | } | 411 | } |
399 | 412 | ||
400 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); | 413 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); |
401 | if (cluster_is_null(&p->free_cluster_head)) { | 414 | cluster_list_add_tail(&p->free_clusters, cluster_info, idx); |
402 | cluster_set_next_flag(&p->free_cluster_head, idx, 0); | ||
403 | cluster_set_next_flag(&p->free_cluster_tail, idx, 0); | ||
404 | } else { | ||
405 | unsigned int tail = cluster_next(&p->free_cluster_tail); | ||
406 | cluster_set_next(&cluster_info[tail], idx); | ||
407 | cluster_set_next_flag(&p->free_cluster_tail, idx, 0); | ||
408 | } | ||
409 | } | 415 | } |
410 | } | 416 | } |
411 | 417 | ||
@@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, | |||
421 | bool conflict; | 427 | bool conflict; |
422 | 428 | ||
423 | offset /= SWAPFILE_CLUSTER; | 429 | offset /= SWAPFILE_CLUSTER; |
424 | conflict = !cluster_is_null(&si->free_cluster_head) && | 430 | conflict = !cluster_list_empty(&si->free_clusters) && |
425 | offset != cluster_next(&si->free_cluster_head) && | 431 | offset != cluster_list_first(&si->free_clusters) && |
426 | cluster_is_free(&si->cluster_info[offset]); | 432 | cluster_is_free(&si->cluster_info[offset]); |
427 | 433 | ||
428 | if (!conflict) | 434 | if (!conflict) |
@@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, | |||
447 | new_cluster: | 453 | new_cluster: |
448 | cluster = this_cpu_ptr(si->percpu_cluster); | 454 | cluster = this_cpu_ptr(si->percpu_cluster); |
449 | if (cluster_is_null(&cluster->index)) { | 455 | if (cluster_is_null(&cluster->index)) { |
450 | if (!cluster_is_null(&si->free_cluster_head)) { | 456 | if (!cluster_list_empty(&si->free_clusters)) { |
451 | cluster->index = si->free_cluster_head; | 457 | cluster->index = si->free_clusters.head; |
452 | cluster->next = cluster_next(&cluster->index) * | 458 | cluster->next = cluster_next(&cluster->index) * |
453 | SWAPFILE_CLUSTER; | 459 | SWAPFILE_CLUSTER; |
454 | } else if (!cluster_is_null(&si->discard_cluster_head)) { | 460 | } else if (!cluster_list_empty(&si->discard_clusters)) { |
455 | /* | 461 | /* |
456 | * we don't have free cluster but have some clusters in | 462 | * we don't have free cluster but have some clusters in |
457 | * discarding, do discard now and reclaim them | 463 | * discarding, do discard now and reclaim them |
@@ -999,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
999 | if (p) { | 1005 | if (p) { |
1000 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { | 1006 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
1001 | page = find_get_page(swap_address_space(entry), | 1007 | page = find_get_page(swap_address_space(entry), |
1002 | entry.val); | 1008 | swp_offset(entry)); |
1003 | if (page && !trylock_page(page)) { | 1009 | if (page && !trylock_page(page)) { |
1004 | put_page(page); | 1010 | put_page(page); |
1005 | page = NULL; | 1011 | page = NULL; |
@@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2292 | 2298 | ||
2293 | nr_good_pages = maxpages - 1; /* omit header page */ | 2299 | nr_good_pages = maxpages - 1; /* omit header page */ |
2294 | 2300 | ||
2295 | cluster_set_null(&p->free_cluster_head); | 2301 | cluster_list_init(&p->free_clusters); |
2296 | cluster_set_null(&p->free_cluster_tail); | 2302 | cluster_list_init(&p->discard_clusters); |
2297 | cluster_set_null(&p->discard_cluster_head); | ||
2298 | cluster_set_null(&p->discard_cluster_tail); | ||
2299 | 2303 | ||
2300 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 2304 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
2301 | unsigned int page_nr = swap_header->info.badpages[i]; | 2305 | unsigned int page_nr = swap_header->info.badpages[i]; |
@@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2341 | for (i = 0; i < nr_clusters; i++) { | 2345 | for (i = 0; i < nr_clusters; i++) { |
2342 | if (!cluster_count(&cluster_info[idx])) { | 2346 | if (!cluster_count(&cluster_info[idx])) { |
2343 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); | 2347 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); |
2344 | if (cluster_is_null(&p->free_cluster_head)) { | 2348 | cluster_list_add_tail(&p->free_clusters, cluster_info, |
2345 | cluster_set_next_flag(&p->free_cluster_head, | 2349 | idx); |
2346 | idx, 0); | ||
2347 | cluster_set_next_flag(&p->free_cluster_tail, | ||
2348 | idx, 0); | ||
2349 | } else { | ||
2350 | unsigned int tail; | ||
2351 | |||
2352 | tail = cluster_next(&p->free_cluster_tail); | ||
2353 | cluster_set_next(&cluster_info[tail], idx); | ||
2354 | cluster_set_next_flag(&p->free_cluster_tail, | ||
2355 | idx, 0); | ||
2356 | } | ||
2357 | } | 2350 | } |
2358 | idx++; | 2351 | idx++; |
2359 | if (idx == nr_clusters) | 2352 | if (idx == nr_clusters) |
diff --git a/mm/vmacache.c b/mm/vmacache.c index fd09dc9c6812..035fdeb35b43 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
87 | { | 87 | { |
88 | int i; | 88 | int i; |
89 | 89 | ||
90 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
91 | |||
90 | if (!vmacache_valid(mm)) | 92 | if (!vmacache_valid(mm)) |
91 | return NULL; | 93 | return NULL; |
92 | 94 | ||
93 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
94 | |||
95 | for (i = 0; i < VMACACHE_SIZE; i++) { | 95 | for (i = 0; i < VMACACHE_SIZE; i++) { |
96 | struct vm_area_struct *vma = current->vmacache[i]; | 96 | struct vm_area_struct *vma = current->vmacache[i]; |
97 | 97 | ||
@@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | |||
115 | { | 115 | { |
116 | int i; | 116 | int i; |
117 | 117 | ||
118 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
119 | |||
118 | if (!vmacache_valid(mm)) | 120 | if (!vmacache_valid(mm)) |
119 | return NULL; | 121 | return NULL; |
120 | 122 | ||
121 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
122 | |||
123 | for (i = 0; i < VMACACHE_SIZE; i++) { | 123 | for (i = 0; i < VMACACHE_SIZE; i++) { |
124 | struct vm_area_struct *vma = current->vmacache[i]; | 124 | struct vm_area_struct *vma = current->vmacache[i]; |
125 | 125 | ||
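In vmacache.c the VMACACHE_FIND_CALLS event is now bumped before the vmacache_valid() check rather than after it, presumably so that lookups rejected because the cache was invalidated still count as lookups and the FIND_HITS/FIND_CALLS ratio stays meaningful. A trivial numeric illustration of why the placement matters (the counts are made up):

#include <stdio.h>

/* Hit rate only means something if every lookup bumps the calls counter,
 * including the ones rejected because the cache was invalid. */
int main(void)
{
        unsigned long hits = 60, valid_calls = 80, invalidated_calls = 20;

        printf("counted after the check:  %.0f%%\n", 100.0 * hits / valid_calls);
        printf("counted before the check: %.0f%%\n",
               100.0 * hits / (valid_calls + invalidated_calls));
        return 0;
}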
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 91f44e78c516..f2481cb4e6b2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1359 | struct vm_struct *area; | 1359 | struct vm_struct *area; |
1360 | 1360 | ||
1361 | BUG_ON(in_interrupt()); | 1361 | BUG_ON(in_interrupt()); |
1362 | if (flags & VM_IOREMAP) | ||
1363 | align = 1ul << clamp_t(int, fls_long(size), | ||
1364 | PAGE_SHIFT, IOREMAP_MAX_ORDER); | ||
1365 | |||
1366 | size = PAGE_ALIGN(size); | 1362 | size = PAGE_ALIGN(size); |
1367 | if (unlikely(!size)) | 1363 | if (unlikely(!size)) |
1368 | return NULL; | 1364 | return NULL; |
1369 | 1365 | ||
1366 | if (flags & VM_IOREMAP) | ||
1367 | align = 1ul << clamp_t(int, get_count_order_long(size), | ||
1368 | PAGE_SHIFT, IOREMAP_MAX_ORDER); | ||
1369 | |||
1370 | area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 1370 | area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
1371 | if (unlikely(!area)) | 1371 | if (unlikely(!area)) |
1372 | return NULL; | 1372 | return NULL; |
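Two things change in __get_vm_area_node() above: the VM_IOREMAP alignment is now computed after the size has been page-aligned, and it uses get_count_order_long() (a ceiling log2) instead of fls_long(), which for an exact power-of-two size returns one bit more than the order and therefore doubled the alignment. A user-space comparison, with the two helpers approximated by compiler builtins (approximations of the kernel helpers, not their real implementations, and without the PAGE_SHIFT/IOREMAP_MAX_ORDER clamp):

#include <stdio.h>

/* fls_long(x): index of the highest set bit, 1-based (0 for x == 0). */
static int fls_long(unsigned long x)
{
        return x ? (int)(8 * sizeof(x)) - __builtin_clzl(x) : 0;
}

/* get_count_order_long(x): smallest order with (1UL << order) >= x. */
static int get_count_order_long(unsigned long x)
{
        return x <= 1 ? 0 : (int)(8 * sizeof(x)) - __builtin_clzl(x - 1);
}

int main(void)
{
        unsigned long size = 4096;   /* one page, already a power of two */

        printf("fls_long             -> align %lu\n", 1UL << fls_long(size));
        printf("get_count_order_long -> align %lu\n", 1UL << get_count_order_long(size));
        return 0;
}

For a 4096-byte ioremap request the old calculation yields an 8192-byte alignment, the new one 4096, which is the over-alignment this hunk removes.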
@@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1601 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1601 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1602 | pgprot_t prot, int node) | 1602 | pgprot_t prot, int node) |
1603 | { | 1603 | { |
1604 | const int order = 0; | ||
1605 | struct page **pages; | 1604 | struct page **pages; |
1606 | unsigned int nr_pages, array_size, i; | 1605 | unsigned int nr_pages, array_size, i; |
1607 | const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | 1606 | const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; |
@@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1629 | struct page *page; | 1628 | struct page *page; |
1630 | 1629 | ||
1631 | if (node == NUMA_NO_NODE) | 1630 | if (node == NUMA_NO_NODE) |
1632 | page = alloc_pages(alloc_mask, order); | 1631 | page = alloc_page(alloc_mask); |
1633 | else | 1632 | else |
1634 | page = alloc_pages_node(node, alloc_mask, order); | 1633 | page = alloc_pages_node(node, alloc_mask, 0); |
1635 | 1634 | ||
1636 | if (unlikely(!page)) { | 1635 | if (unlikely(!page)) { |
1637 | /* Successfully allocated i pages, free them in __vunmap() */ | 1636 | /* Successfully allocated i pages, free them in __vunmap() */ |
@@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1648 | return area->addr; | 1647 | return area->addr; |
1649 | 1648 | ||
1650 | fail: | 1649 | fail: |
1651 | warn_alloc_failed(gfp_mask, order, | 1650 | warn_alloc(gfp_mask, |
1652 | "vmalloc: allocation failure, allocated %ld of %ld bytes\n", | 1651 | "vmalloc: allocation failure, allocated %ld of %ld bytes", |
1653 | (area->nr_pages*PAGE_SIZE), area->size); | 1652 | (area->nr_pages*PAGE_SIZE), area->size); |
1654 | vfree(area->addr); | 1653 | vfree(area->addr); |
1655 | return NULL; | 1654 | return NULL; |
@@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1710 | return addr; | 1709 | return addr; |
1711 | 1710 | ||
1712 | fail: | 1711 | fail: |
1713 | warn_alloc_failed(gfp_mask, 0, | 1712 | warn_alloc(gfp_mask, |
1714 | "vmalloc: allocation failure: %lu bytes\n", | 1713 | "vmalloc: allocation failure: %lu bytes", real_size); |
1715 | real_size); | ||
1716 | return NULL; | 1714 | return NULL; |
1717 | } | 1715 | } |
1718 | 1716 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fe8b7113868..744f926af442 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc | |||
2418 | if (inactive_list_is_low(lruvec, false, sc)) | 2418 | if (inactive_list_is_low(lruvec, false, sc)) |
2419 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 2419 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2420 | sc, LRU_ACTIVE_ANON); | 2420 | sc, LRU_ACTIVE_ANON); |
2421 | |||
2422 | throttle_vm_writeout(sc->gfp_mask); | ||
2423 | } | 2421 | } |
2424 | 2422 | ||
2425 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 2423 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
@@ -2480,7 +2478,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, | |||
2480 | * If we have not reclaimed enough pages for compaction and the | 2478 | * If we have not reclaimed enough pages for compaction and the |
2481 | * inactive lists are large enough, continue reclaiming | 2479 | * inactive lists are large enough, continue reclaiming |
2482 | */ | 2480 | */ |
2483 | pages_for_compaction = (2UL << sc->order); | 2481 | pages_for_compaction = compact_gap(sc->order); |
2484 | inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); | 2482 | inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); |
2485 | if (get_nr_swap_pages() > 0) | 2483 | if (get_nr_swap_pages() > 0) |
2486 | inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); | 2484 | inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); |
@@ -2495,7 +2493,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, | |||
2495 | continue; | 2493 | continue; |
2496 | 2494 | ||
2497 | switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { | 2495 | switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { |
2498 | case COMPACT_PARTIAL: | 2496 | case COMPACT_SUCCESS: |
2499 | case COMPACT_CONTINUE: | 2497 | case COMPACT_CONTINUE: |
2500 | return false; | 2498 | return false; |
2501 | default: | 2499 | default: |
@@ -2598,38 +2596,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2598 | } | 2596 | } |
2599 | 2597 | ||
2600 | /* | 2598 | /* |
2601 | * Returns true if compaction should go ahead for a high-order request, or | 2599 | * Returns true if compaction should go ahead for a costly-order request, or |
2602 | * the high-order allocation would succeed without compaction. | 2600 | * the allocation would already succeed without compaction. Return false if we |
2601 | * should reclaim first. | ||
2603 | */ | 2602 | */ |
2604 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | 2603 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) |
2605 | { | 2604 | { |
2606 | unsigned long watermark; | 2605 | unsigned long watermark; |
2607 | bool watermark_ok; | 2606 | enum compact_result suitable; |
2608 | 2607 | ||
2609 | /* | 2608 | suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); |
2610 | * Compaction takes time to run and there are potentially other | 2609 | if (suitable == COMPACT_SUCCESS) |
2611 | * callers using the pages just freed. Continue reclaiming until | 2610 | /* Allocation should succeed already. Don't reclaim. */ |
2612 | * there is a buffer of free pages available to give compaction | 2611 | return true; |
2613 | * a reasonable chance of completing and allocating the page | 2612 | if (suitable == COMPACT_SKIPPED) |
2614 | */ | 2613 | /* Compaction cannot yet proceed. Do reclaim. */ |
2615 | watermark = high_wmark_pages(zone) + (2UL << sc->order); | 2614 | return false; |
2616 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); | ||
2617 | |||
2618 | /* | ||
2619 | * If compaction is deferred, reclaim up to a point where | ||
2620 | * compaction will have a chance of success when re-enabled | ||
2621 | */ | ||
2622 | if (compaction_deferred(zone, sc->order)) | ||
2623 | return watermark_ok; | ||
2624 | 2615 | ||
2625 | /* | 2616 | /* |
2626 | * If compaction is not ready to start and allocation is not likely | 2617 | * Compaction is already possible, but it takes time to run and there |
2627 | * to succeed without it, then keep reclaiming. | 2618 | * are potentially other callers using the pages just freed. So proceed |
2619 | * with reclaim to make a buffer of free pages available to give | ||
2620 | * compaction a reasonable chance of completing and allocating the page. | ||
2621 | * Note that we won't actually reclaim the whole buffer in one attempt | ||
2622 | * as the target watermark in should_continue_reclaim() is lower. But if | ||
2623 | * we are already above the high+gap watermark, don't reclaim at all. | ||
2628 | */ | 2624 | */ |
2629 | if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) | 2625 | watermark = high_wmark_pages(zone) + compact_gap(sc->order); |
2630 | return false; | ||
2631 | 2626 | ||
2632 | return watermark_ok; | 2627 | return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); |
2633 | } | 2628 | } |
2634 | 2629 | ||
2635 | /* | 2630 | /* |
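The rewritten compaction_ready() above expresses the decision directly in terms of compaction_suitable(): COMPACT_SUCCESS means the allocation would already succeed, so stop reclaiming; COMPACT_SKIPPED means compaction cannot run yet, so keep reclaiming; otherwise reclaim only until the zone is above the high watermark plus compact_gap(order), which at this point in the series appears to encapsulate the same 2UL << order buffer the old code spelled out. The resulting flow, reduced to a stand-alone sketch with stubbed-out predicates (the enum values mirror the kernel's, the stubs and the compact_gap() body are assumptions):

#include <stdbool.h>
#include <stdio.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_CONTINUE, COMPACT_SUCCESS };

/* Stubs standing in for the real zone and watermark queries. */
static enum compact_result compaction_suitable(int order) { (void)order; return COMPACT_CONTINUE; }
static unsigned long high_wmark_pages(void)               { return 1024; }
static unsigned long zone_free_pages(void)                { return 1100; }

static unsigned long compact_gap(int order) { return 2UL << order; } /* assumed definition */

/* true: compaction can go ahead, stop reclaiming; false: reclaim some more first. */
static bool compaction_ready(int order)
{
        enum compact_result suitable = compaction_suitable(order);

        if (suitable == COMPACT_SUCCESS)
                return true;            /* allocation would already succeed */
        if (suitable == COMPACT_SKIPPED)
                return false;           /* compaction cannot run yet, reclaim */

        /* Compaction is possible; build a free-page buffer for it first. */
        return zone_free_pages() >= high_wmark_pages() + compact_gap(order);
}

int main(void)
{
        for (int order = 0; order <= 9; order++)
                printf("order %d: %s\n", order,
                       compaction_ready(order) ? "compact" : "reclaim");
        return 0;
}

The notable behavioural change is the dropped compaction_deferred() special case: the decision now rests entirely on suitability plus the watermark-and-gap test.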
@@ -3041,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3041 | */ | 3036 | */ |
3042 | nid = mem_cgroup_select_victim_node(memcg); | 3037 | nid = mem_cgroup_select_victim_node(memcg); |
3043 | 3038 | ||
3044 | zonelist = NODE_DATA(nid)->node_zonelists; | 3039 | zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; |
3045 | 3040 | ||
3046 | trace_mm_vmscan_memcg_reclaim_begin(0, | 3041 | trace_mm_vmscan_memcg_reclaim_begin(0, |
3047 | sc.may_writepage, | 3042 | sc.may_writepage, |
@@ -3169,7 +3164,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, | |||
3169 | * excessive reclaim. Assume that a process requested a high-order | 3164 | * excessive reclaim. Assume that a process requested a high-order |
3170 | * can direct reclaim/compact. | 3165 | * can direct reclaim/compact. |
3171 | */ | 3166 | */ |
3172 | if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) | 3167 | if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) |
3173 | sc->order = 0; | 3168 | sc->order = 0; |
3174 | 3169 | ||
3175 | return sc->nr_scanned >= sc->nr_to_reclaim; | 3170 | return sc->nr_scanned >= sc->nr_to_reclaim; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 89cec42d19ff..604f26a4f696 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | |||
1254 | return 0; | 1254 | return 0; |
1255 | } | 1255 | } |
1256 | 1256 | ||
1257 | #ifdef CONFIG_PAGE_OWNER | ||
1258 | static void pagetypeinfo_showmixedcount_print(struct seq_file *m, | ||
1259 | pg_data_t *pgdat, | ||
1260 | struct zone *zone) | ||
1261 | { | ||
1262 | struct page *page; | ||
1263 | struct page_ext *page_ext; | ||
1264 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
1265 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
1266 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
1267 | int pageblock_mt, page_mt; | ||
1268 | int i; | ||
1269 | |||
1270 | /* Scan block by block. First and last block may be incomplete */ | ||
1271 | pfn = zone->zone_start_pfn; | ||
1272 | |||
1273 | /* | ||
1274 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
1275 | * a zone boundary, it will be double counted between zones. This does | ||
1276 | * not matter as the mixed block count will still be correct | ||
1277 | */ | ||
1278 | for (; pfn < end_pfn; ) { | ||
1279 | if (!pfn_valid(pfn)) { | ||
1280 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
1281 | continue; | ||
1282 | } | ||
1283 | |||
1284 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
1285 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
1286 | |||
1287 | page = pfn_to_page(pfn); | ||
1288 | pageblock_mt = get_pageblock_migratetype(page); | ||
1289 | |||
1290 | for (; pfn < block_end_pfn; pfn++) { | ||
1291 | if (!pfn_valid_within(pfn)) | ||
1292 | continue; | ||
1293 | |||
1294 | page = pfn_to_page(pfn); | ||
1295 | |||
1296 | if (page_zone(page) != zone) | ||
1297 | continue; | ||
1298 | |||
1299 | if (PageBuddy(page)) { | ||
1300 | pfn += (1UL << page_order(page)) - 1; | ||
1301 | continue; | ||
1302 | } | ||
1303 | |||
1304 | if (PageReserved(page)) | ||
1305 | continue; | ||
1306 | |||
1307 | page_ext = lookup_page_ext(page); | ||
1308 | if (unlikely(!page_ext)) | ||
1309 | continue; | ||
1310 | |||
1311 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
1312 | continue; | ||
1313 | |||
1314 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
1315 | if (pageblock_mt != page_mt) { | ||
1316 | if (is_migrate_cma(pageblock_mt)) | ||
1317 | count[MIGRATE_MOVABLE]++; | ||
1318 | else | ||
1319 | count[pageblock_mt]++; | ||
1320 | |||
1321 | pfn = block_end_pfn; | ||
1322 | break; | ||
1323 | } | ||
1324 | pfn += (1UL << page_ext->order) - 1; | ||
1325 | } | ||
1326 | } | ||
1327 | |||
1328 | /* Print counts */ | ||
1329 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
1330 | for (i = 0; i < MIGRATE_TYPES; i++) | ||
1331 | seq_printf(m, "%12lu ", count[i]); | ||
1332 | seq_putc(m, '\n'); | ||
1333 | } | ||
1334 | #endif /* CONFIG_PAGE_OWNER */ | ||
1335 | |||
1336 | /* | 1257 | /* |
1337 | * Print out the number of pageblocks for each migratetype that contain pages | 1258 | * Print out the number of pageblocks for each migratetype that contain pages |
1338 | * of other types. This gives an indication of how well fallbacks are being | 1259 | * of other types. This gives an indication of how well fallbacks are being |
@@ -1592,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg) | |||
1592 | { | 1513 | { |
1593 | unsigned long *l = arg; | 1514 | unsigned long *l = arg; |
1594 | unsigned long off = l - (unsigned long *)m->private; | 1515 | unsigned long off = l - (unsigned long *)m->private; |
1595 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | 1516 | |
1517 | seq_puts(m, vmstat_text[off]); | ||
1518 | seq_put_decimal_ull(m, " ", *l); | ||
1519 | seq_putc(m, '\n'); | ||
1596 | return 0; | 1520 | return 0; |
1597 | } | 1521 | } |
1598 | 1522 | ||
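vmstat_show() now emits the counter name with seq_puts() and the value with seq_put_decimal_ull(), which in this series takes a string separator (note the " " argument above), so the common path no longer parses a printf format string for every counter. The core of such a helper is a manual integer-to-decimal conversion straight into the output buffer; a user-space sketch of the idea, with invented names rather than the seq_file API:

#include <stdio.h>
#include <string.h>

/* Append "<sep><decimal>" to buf without any format-string parsing. */
static size_t put_decimal_ull(char *buf, const char *sep, unsigned long long v)
{
        char tmp[24];
        size_t n = 0, len = strlen(sep);

        memcpy(buf, sep, len);
        do {                                   /* digits come out in reverse order */
                tmp[n++] = '0' + (char)(v % 10);
                v /= 10;
        } while (v);
        for (size_t i = 0; i < n; i++)         /* copy them back reversed */
                buf[len + i] = tmp[n - 1 - i];
        buf[len + n] = '\0';
        return len + n;
}

int main(void)
{
        char line[64] = "nr_free_pages";

        put_decimal_ull(line + strlen(line), " ", 123456);
        puts(line);
        return 0;
}

Skipping the format parser is what makes the "faster /proc/*/status" style of change in this series pay off on files that print hundreds of numbers.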
@@ -1794,6 +1718,16 @@ static void __init start_shepherd_timer(void) | |||
1794 | round_jiffies_relative(sysctl_stat_interval)); | 1718 | round_jiffies_relative(sysctl_stat_interval)); |
1795 | } | 1719 | } |
1796 | 1720 | ||
1721 | static void __init init_cpu_node_state(void) | ||
1722 | { | ||
1723 | int cpu; | ||
1724 | |||
1725 | get_online_cpus(); | ||
1726 | for_each_online_cpu(cpu) | ||
1727 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
1728 | put_online_cpus(); | ||
1729 | } | ||
1730 | |||
1797 | static void vmstat_cpu_dead(int node) | 1731 | static void vmstat_cpu_dead(int node) |
1798 | { | 1732 | { |
1799 | int cpu; | 1733 | int cpu; |
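init_cpu_node_state(), added above and called from setup_vmstat() below, walks the CPUs already online when vmstat is set up and marks their nodes with N_CPU, under get_online_cpus()/put_online_cpus() so the online set cannot change mid-walk. Boiled down, it is a "fold per-CPU information into a per-node state mask" loop; cpu_to_node(), cpu_online() and the mask below are stand-ins for the real topology helpers and node_states[]:

#include <stdio.h>

#define NR_CPUS  8
#define NR_NODES 4

/* Stand-in topology: which NUMA node each CPU sits on, and which CPUs are up. */
static int cpu_to_node(int cpu) { return cpu / 2 % NR_NODES; }
static int cpu_online(int cpu)  { return cpu != 5; }   /* pretend CPU 5 is offline */

int main(void)
{
        unsigned long n_cpu_mask = 0;   /* one bit per node, like node_states[N_CPU] */

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpu_online(cpu))
                        n_cpu_mask |= 1UL << cpu_to_node(cpu);

        for (int node = 0; node < NR_NODES; node++)
                printf("node %d: %s\n", node,
                       (n_cpu_mask >> node) & 1 ? "has CPUs" : "no CPUs");
        return 0;
}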
@@ -1851,6 +1785,7 @@ static int __init setup_vmstat(void) | |||
1851 | #ifdef CONFIG_SMP | 1785 | #ifdef CONFIG_SMP |
1852 | cpu_notifier_register_begin(); | 1786 | cpu_notifier_register_begin(); |
1853 | __register_cpu_notifier(&vmstat_notifier); | 1787 | __register_cpu_notifier(&vmstat_notifier); |
1788 | init_cpu_node_state(); | ||
1854 | 1789 | ||
1855 | start_shepherd_timer(); | 1790 | start_shepherd_timer(); |
1856 | cpu_notifier_register_done(); | 1791 | cpu_notifier_register_done(); |