path: root/mm
author    Linus Torvalds <torvalds@linux-foundation.org>  2016-10-08 00:38:00 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-10-08 00:38:00 -0400
commit    b66484cd74706fa8681d051840fe4b18a3da40ff (patch)
tree      e8215e7c25661d25f84abc4b98140c2062d6d5de /mm
parent    c913fc4146ba7c280e074558d0a461e5c6f07c8a (diff)
parent    05fd007e46296afb24d15c7d589d535e5a5b9d5c (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - fsnotify updates

 - ocfs2 updates

 - all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (127 commits)
  console: don't prefer first registered if DT specifies stdout-path
  cred: simpler, 1D supplementary groups
  CREDITS: update Pavel's information, add GPG key, remove snail mail address
  mailmap: add Johan Hovold
  .gitattributes: set git diff driver for C source code files
  uprobes: remove function declarations from arch/{mips,s390}
  spelling.txt: "modeled" is spelt correctly
  nmi_backtrace: generate one-line reports for idle cpus
  arch/tile: adopt the new nmi_backtrace framework
  nmi_backtrace: do a local dump_stack() instead of a self-NMI
  nmi_backtrace: add more trigger_*_cpu_backtrace() methods
  min/max: remove sparse warnings when they're nested
  Documentation/filesystems/proc.txt: add more description for maps/smaps
  mm, proc: fix region lost in /proc/self/smaps
  proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self
  proc: add LSM hook checks to /proc/<tid>/timerslack_ns
  proc: relax /proc/<tid>/timerslack_ns capability requirements
  meminfo: break apart a very long seq_printf with #ifdefs
  seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char
  proc: faster /proc/*/status
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c         |  14
-rw-r--r--  mm/compaction.c      | 205
-rw-r--r--  mm/debug.c           |   5
-rw-r--r--  mm/filemap.c         |   8
-rw-r--r--  mm/huge_memory.c     |  81
-rw-r--r--  mm/hugetlb.c         |  53
-rw-r--r--  mm/internal.h        |   3
-rw-r--r--  mm/ksm.c             |   7
-rw-r--r--  mm/memblock.c        |   5
-rw-r--r--  mm/memcontrol.c      | 154
-rw-r--r--  mm/memory.c          |  21
-rw-r--r--  mm/memory_hotplug.c  |   4
-rw-r--r--  mm/mempolicy.c       |   2
-rw-r--r--  mm/migrate.c         |   2
-rw-r--r--  mm/mincore.c         |   5
-rw-r--r--  mm/mlock.c           |  52
-rw-r--r--  mm/mmap.c            | 238
-rw-r--r--  mm/mprotect.c        |   3
-rw-r--r--  mm/nobootmem.c       |  20
-rw-r--r--  mm/oom_kill.c        | 381
-rw-r--r--  mm/page-writeback.c  |  34
-rw-r--r--  mm/page_alloc.c      | 281
-rw-r--r--  mm/page_ext.c        |  45
-rw-r--r--  mm/page_io.c         |   7
-rw-r--r--  mm/page_isolation.c  |   2
-rw-r--r--  mm/page_owner.c      | 156
-rw-r--r--  mm/shmem.c           |   2
-rw-r--r--  mm/swap.c            |   4
-rw-r--r--  mm/swap_state.c      |  14
-rw-r--r--  mm/swapfile.c        | 137
-rw-r--r--  mm/vmacache.c        |   8
-rw-r--r--  mm/vmalloc.c         |  22
-rw-r--r--  mm/vmscan.c          |  53
-rw-r--r--  mm/vmstat.c          |  95
34 files changed, 1250 insertions, 873 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0aa7dda52402..a869f84f44d3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -11,15 +11,12 @@
 #include <linux/init.h>
 #include <linux/pfn.h>
 #include <linux/slab.h>
-#include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
-#include <linux/memblock.h>
 #include <linux/bug.h>
 #include <linux/io.h>
-
-#include <asm/processor.h>
+#include <linux/bootmem.h>
 
 #include "internal.h"
 
@@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 	void *ptr;
 
 	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
@@ -738,9 +735,6 @@ again:
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
 	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
 }
 
@@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 }
 
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
-#endif
-
 /**
  * __alloc_bootmem_low - allocate low boot memory
  * @size: size of the request in bytes
diff --git a/mm/compaction.c b/mm/compaction.c
index 9affb2908304..0409a4ad6ea1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 #ifdef CONFIG_COMPACTION
 
 /* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+static bool suitable_migration_target(struct compact_control *cc,
+							struct page *page)
 {
+	if (cc->ignore_block_suitable)
+		return true;
+
 	/* If the page is a large free page, then disallow migration */
 	if (PageBuddy(page)) {
 		/*
@@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc)
 			continue;
 
 		/* Check the block is suitable for migration */
-		if (!suitable_migration_target(page))
+		if (!suitable_migration_target(cc, page))
 			continue;
 
 		/* If isolation recently failed, do not retry */
@@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		return COMPACT_CONTINUE;
 
 	/* Compaction run is not finished if the watermark is not met */
-	watermark = low_wmark_pages(zone);
+	watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
 
 	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
 							cc->alloc_flags))
@@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 
 		/* Job done if page is free of the right migratetype */
 		if (!list_empty(&area->free_list[migratetype]))
-			return COMPACT_PARTIAL;
+			return COMPACT_SUCCESS;
 
 #ifdef CONFIG_CMA
 		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
 		if (migratetype == MIGRATE_MOVABLE &&
 			!list_empty(&area->free_list[MIGRATE_CMA]))
-			return COMPACT_PARTIAL;
+			return COMPACT_SUCCESS;
 #endif
 		/*
 		 * Job done if allocation would steal freepages from
@@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		 */
 		if (find_suitable_fallback(area, order, migratetype,
 						true, &can_steal) != -1)
-			return COMPACT_PARTIAL;
+			return COMPACT_SUCCESS;
 	}
 
 	return COMPACT_NO_SUITABLE_PAGE;
@@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone,
  * compaction_suitable: Is this suitable to run compaction on this zone now?
  * Returns
  *   COMPACT_SKIPPED  - If there are too few free pages for compaction
- *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
  *   COMPACT_CONTINUE - If compaction should run now
  */
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
@@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					int classzone_idx,
 					unsigned long wmark_target)
 {
-	int fragindex;
 	unsigned long watermark;
 
 	if (is_via_compact_memory(order))
 		return COMPACT_CONTINUE;
 
-	watermark = low_wmark_pages(zone);
+	watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
 	/*
 	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
 	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
								alloc_flags))
-		return COMPACT_PARTIAL;
+		return COMPACT_SUCCESS;
 
 	/*
-	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
-	 * This is because during migration, copies of pages need to be
-	 * allocated and for a short time, the footprint is higher
+	 * Watermarks for order-0 must be met for compaction to be able to
+	 * isolate free pages for migration targets. This means that the
+	 * watermark and alloc_flags have to match, or be more pessimistic than
+	 * the check in __isolate_free_page(). We don't use the direct
+	 * compactor's alloc_flags, as they are not relevant for freepage
+	 * isolation. We however do use the direct compactor's classzone_idx to
+	 * skip over zones where lowmem reserves would prevent allocation even
+	 * if compaction succeeds.
+	 * For costly orders, we require low watermark instead of min for
+	 * compaction to proceed to increase its chances.
+	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+	 * suitable migration targets
 	 */
-	watermark += (2UL << order);
+	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+				low_wmark_pages(zone) : min_wmark_pages(zone);
+	watermark += compact_gap(order);
 	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
-				 alloc_flags, wmark_target))
+						ALLOC_CMA, wmark_target))
 		return COMPACT_SKIPPED;
 
-	/*
-	 * fragmentation index determines if allocation failures are due to
-	 * low memory or external fragmentation
-	 *
-	 * index of -1000 would imply allocations might succeed depending on
-	 * watermarks, but we already failed the high-order watermark check
-	 * index towards 0 implies failure is due to lack of memory
-	 * index towards 1000 implies failure is due to fragmentation
-	 *
-	 * Only compact if a failure would be due to fragmentation.
-	 */
-	fragindex = fragmentation_index(zone, order);
-	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-		return COMPACT_NOT_SUITABLE_ZONE;
-
 	return COMPACT_CONTINUE;
 }
 
@@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 					int classzone_idx)
 {
 	enum compact_result ret;
+	int fragindex;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
 				    zone_page_state(zone, NR_FREE_PAGES));
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1000 would imply allocations might succeed depending on
+	 * watermarks, but we already failed the high-order watermark check
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation. Also
+	 * ignore fragindex for non-costly orders where the alternative to
+	 * a successful reclaim/compaction is OOM. Fragindex and the
+	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
+	 * excessive compaction for costly orders, but it should not be at the
+	 * expense of system stability.
+	 */
+	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
+		fragindex = fragmentation_index(zone, order);
+		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+			ret = COMPACT_NOT_SUITABLE_ZONE;
+	}
+
 	trace_mm_compaction_suitable(zone, order, ret);
 	if (ret == COMPACT_NOT_SUITABLE_ZONE)
 		ret = COMPACT_SKIPPED;
@@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 		compact_result = __compaction_suitable(zone, order, alloc_flags,
 				ac_classzone_idx(ac), available);
-		if (compact_result != COMPACT_SKIPPED &&
-		    compact_result != COMPACT_NOT_SUITABLE_ZONE)
+		if (compact_result != COMPACT_SKIPPED)
 			return true;
 	}
 
@@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
 							cc->classzone_idx);
 	/* Compaction is likely to fail */
-	if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
+	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
 		return ret;
 
 	/* huh, compaction_suitable is returning something unexpected */
@@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 
 	/*
 	 * Setup to move all movable pages to the end of the zone. Used cached
-	 * information on where the scanners should start but check that it
-	 * is initialised by ensuring the values are within zone boundaries.
+	 * information on where the scanners should start (unless we explicitly
+	 * want to compact the whole zone), but check that it is initialised
+	 * by ensuring the values are within zone boundaries.
 	 */
-	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
-	cc->free_pfn = zone->compact_cached_free_pfn;
-	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
-		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
-		zone->compact_cached_free_pfn = cc->free_pfn;
-	}
-	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+	if (cc->whole_zone) {
 		cc->migrate_pfn = start_pfn;
-		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
-		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
-	}
+		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+	} else {
+		cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+		cc->free_pfn = zone->compact_cached_free_pfn;
+		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+			zone->compact_cached_free_pfn = cc->free_pfn;
+		}
+		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+			cc->migrate_pfn = start_pfn;
+			zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+			zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+		}
 
-	if (cc->migrate_pfn == start_pfn)
-		cc->whole_zone = true;
+		if (cc->migrate_pfn == start_pfn)
+			cc->whole_zone = true;
+	}
 
 	cc->last_migrated_pfn = 0;
 
@@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
 		.alloc_flags = alloc_flags,
 		.classzone_idx = classzone_idx,
 		.direct_compaction = true,
+		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
+		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 					ac->nodemask) {
 		enum compact_result status;
 
-		if (compaction_deferred(zone, order)) {
+		if (prio > MIN_COMPACT_PRIORITY
+					&& compaction_deferred(zone, order)) {
 			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
 			continue;
 		}
@@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 				alloc_flags, ac_classzone_idx(ac));
 		rc = max(status, rc);
 
-		/* If a normal allocation would succeed, stop compacting */
-		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
-					ac_classzone_idx(ac), alloc_flags)) {
+		/* The allocation should succeed, stop compacting */
+		if (status == COMPACT_SUCCESS) {
 			/*
 			 * We think the allocation will succeed in this zone,
 			 * but it is not certain, hence the false. The caller
@@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 
 
 /* Compact all zones within a node */
-static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void compact_node(int nid)
 {
+	pg_data_t *pgdat = NODE_DATA(nid);
 	int zoneid;
 	struct zone *zone;
+	struct compact_control cc = {
+		.order = -1,
+		.mode = MIGRATE_SYNC,
+		.ignore_skip_hint = true,
+		.whole_zone = true,
+	};
+
 
 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 
@@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 		if (!populated_zone(zone))
 			continue;
 
-		cc->nr_freepages = 0;
-		cc->nr_migratepages = 0;
-		cc->zone = zone;
-		INIT_LIST_HEAD(&cc->freepages);
-		INIT_LIST_HEAD(&cc->migratepages);
-
-		/*
-		 * When called via /proc/sys/vm/compact_memory
-		 * this makes sure we compact the whole zone regardless of
-		 * cached scanner positions.
-		 */
-		if (is_via_compact_memory(cc->order))
-			__reset_isolation_suitable(zone);
-
-		if (is_via_compact_memory(cc->order) ||
-				!compaction_deferred(zone, cc->order))
-			compact_zone(zone, cc);
-
-		VM_BUG_ON(!list_empty(&cc->freepages));
-		VM_BUG_ON(!list_empty(&cc->migratepages));
+		cc.nr_freepages = 0;
+		cc.nr_migratepages = 0;
+		cc.zone = zone;
+		INIT_LIST_HEAD(&cc.freepages);
+		INIT_LIST_HEAD(&cc.migratepages);
 
-		if (is_via_compact_memory(cc->order))
-			continue;
+		compact_zone(zone, &cc);
 
-		if (zone_watermark_ok(zone, cc->order,
-					low_wmark_pages(zone), 0, 0))
-			compaction_defer_reset(zone, cc->order, false);
+		VM_BUG_ON(!list_empty(&cc.freepages));
+		VM_BUG_ON(!list_empty(&cc.migratepages));
 	}
 }
 
-void compact_pgdat(pg_data_t *pgdat, int order)
-{
-	struct compact_control cc = {
-		.order = order,
-		.mode = MIGRATE_ASYNC,
-	};
-
-	if (!order)
-		return;
-
-	__compact_pgdat(pgdat, &cc);
-}
-
-static void compact_node(int nid)
-{
-	struct compact_control cc = {
-		.order = -1,
-		.mode = MIGRATE_SYNC,
-		.ignore_skip_hint = true,
-	};
-
-	__compact_pgdat(NODE_DATA(nid), &cc);
-}
-
 /* Compact all nodes in the system */
 static void compact_nodes(void)
 {
@@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		.ignore_skip_hint = true,
 
 	};
-	bool success = false;
-
 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
 							cc.classzone_idx);
 	count_vm_event(KCOMPACTD_WAKE);
@@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 			return;
 		status = compact_zone(zone, &cc);
 
-		if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
-						cc.classzone_idx, 0)) {
-			success = true;
+		if (status == COMPACT_SUCCESS) {
 			compaction_defer_reset(zone, cc.order, false);
 		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
 			/*
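The new __compaction_suitable() check above picks the min or low watermark depending on whether the order is costly, then adds a gap for the temporary footprint of page migration. The following is a minimal standalone C sketch of just that arithmetic, with toy watermark values and with compact_gap() assumed to keep the "2UL << order" heuristic that the removed open-coded line used; none of it is kernel API.

/* toy model of the order-0 watermark gate in __compaction_suitable() */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

struct toy_zone {
	unsigned long min_wmark;	/* pages */
	unsigned long low_wmark;	/* pages */
	unsigned long free_pages;	/* pages */
};

static unsigned long compact_gap(int order)
{
	return 2UL << order;		/* assumption, mirrors the old open-coded gap */
}

/* true if compaction may proceed, false if it would be COMPACT_SKIPPED */
static bool order0_watermark_ok(const struct toy_zone *z, int order)
{
	unsigned long watermark;

	/* costly orders demand the low watermark, cheaper ones only min */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
			z->low_wmark : z->min_wmark;
	watermark += compact_gap(order);

	return z->free_pages >= watermark;
}

int main(void)
{
	struct toy_zone zone = { .min_wmark = 128, .low_wmark = 160,
				 .free_pages = 200 };

	for (int order = 1; order <= 6; order++)
		printf("order %d -> %s\n", order,
		       order0_watermark_ok(&zone, order) ? "continue" : "skipped");
	return 0;
}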
diff --git a/mm/debug.c b/mm/debug.c
index 74c7cae4f683..9feb699c5d25 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
+	/*
+	 * Avoid VM_BUG_ON() in page_mapcount().
+	 * page->_mapcount space in struct page is used by sl[aou]b pages to
+	 * encode own info.
+	 */
 	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
diff --git a/mm/filemap.c b/mm/filemap.c
index 68f1813fbdc3..2f7b7783bd6b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
 	unsigned int prev_offset;
 	int error = 0;
 
+	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+		return -EINVAL;
+	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+
 	index = *ppos >> PAGE_SHIFT;
 	prev_index = ra->prev_pos >> PAGE_SHIFT;
 	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
@@ -1721,7 +1725,9 @@ find_page:
 			 * wait_on_page_locked is used to avoid unnecessarily
 			 * serialisations and why it's safe.
 			 */
-			wait_on_page_locked_killable(page);
+			error = wait_on_page_locked_killable(page);
+			if (unlikely(error))
+				goto readpage_error;
 			if (PageUptodate(page))
 				goto page_ok;
 
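The first filemap hunk rejects reads that start at or beyond s_maxbytes and truncates the iterator so a read never crosses that limit. A minimal userspace sketch of the same clamp follows; the maxbytes value and function names are purely illustrative, not kernel API.

/* sketch of the s_maxbytes clamp added to do_generic_file_read() */
#include <errno.h>
#include <stdio.h>

static long clamp_read(long long pos, unsigned long count, long long maxbytes)
{
	if (pos >= maxbytes)
		return -EINVAL;			/* nothing readable at all */
	if (count > (unsigned long)(maxbytes - pos))
		count = maxbytes - pos;		/* iov_iter_truncate() analogue */
	return (long)count;
}

int main(void)
{
	long long maxbytes = 1LL << 20;		/* pretend s_maxbytes is 1 MiB */

	printf("%ld\n", clamp_read(0, 4096, maxbytes));			/* 4096 */
	printf("%ld\n", clamp_read(maxbytes - 100, 4096, maxbytes));	/* 100 */
	printf("%ld\n", clamp_read(maxbytes, 4096, maxbytes));		/* -EINVAL */
	return 0;
}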
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 283583fcb1e7..cdcd25cb30fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page)
 	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
 
+unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+		loff_t off, unsigned long flags, unsigned long size)
+{
+	unsigned long addr;
+	loff_t off_end = off + len;
+	loff_t off_align = round_up(off, size);
+	unsigned long len_pad;
+
+	if (off_end <= off_align || (off_end - off_align) < size)
+		return 0;
+
+	len_pad = len + size;
+	if (len_pad < len || (off + len_pad) < off)
+		return 0;
+
+	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+					      off >> PAGE_SHIFT, flags);
+	if (IS_ERR_VALUE(addr))
+		return 0;
+
+	addr += (off - addr) & (size - 1);
+	return addr;
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+	if (addr)
+		goto out;
+	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+		goto out;
+
+	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+	if (addr)
+		return addr;
+
+ out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+
 static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
 		gfp_t gfp)
 {
@@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1038,7 +1099,6 @@ alloc:
 	update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 	if (!page) {
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		put_huge_zero_page();
 	} else {
 		VM_BUG_ON_PAGE(!PageHead(page), page);
 		page_remove_rmap(page, true);
@@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
@@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			if (soft_dirty)
 				entry = pte_swp_mksoft_dirty(entry);
 		} else {
-			entry = mk_pte(page + i, vma->vm_page_prot);
+			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
 			entry = maybe_mkwrite(entry, vma);
 			if (!write)
 				entry = pte_wrprotect(entry);
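__thp_get_unmapped_area() above over-allocates the mapping by "size" and then bumps the returned address so that the virtual address and the file offset agree in their low bits, which is what later allows PMD-sized mappings. A short userspace C illustration of that arithmetic follows; the base address and offset are made up, and PMD_SIZE is only assumed to be 2 MiB here for the sake of the example.

/* illustration of the "addr += (off - addr) & (size - 1)" alignment step */
#include <stdio.h>

int main(void)
{
	unsigned long size = 1UL << 21;			  /* assumed 2 MiB PMD size */
	unsigned long off  = (123UL << 12) + (5UL << 21); /* some file offset */
	unsigned long base = 0x7f0000001000UL;		  /* pretend allocator result */

	/* the kernel asked for len + size, so this bump always stays in range */
	unsigned long addr = base + ((off - base) & (size - 1));

	printf("off  %% size = %#lx\n", off & (size - 1));
	printf("addr %% size = %#lx\n", addr & (size - 1));	/* same value */
	return 0;
}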
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 87e11d8ad536..ec49d9ef1eef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -567,13 +567,13 @@ retry:
  * appear as a "reserved" entry instead of simply dangling with incorrect
  * counts.
  */
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
 {
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	long rsv_adjust;
 
 	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-	if (restore_reserve && rsv_adjust) {
+	if (rsv_adjust) {
 		struct hstate *h = hstate_inode(inode);
 
 		hugetlb_acct_memory(h, 1);
@@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
 		nr_nodes--)
 
-#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
 	((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
 	defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
@@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 
 /*
  * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
+ * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
+ * number of free hugepages would be reduced below the number of reserved
+ * hugepages.
  */
-static void dissolve_free_huge_page(struct page *page)
+static int dissolve_free_huge_page(struct page *page)
 {
+	int rc = 0;
+
 	spin_lock(&hugetlb_lock);
 	if (PageHuge(page) && !page_count(page)) {
-		struct hstate *h = page_hstate(page);
-		int nid = page_to_nid(page);
-		list_del(&page->lru);
+		struct page *head = compound_head(page);
+		struct hstate *h = page_hstate(head);
+		int nid = page_to_nid(head);
+		if (h->free_huge_pages - h->resv_huge_pages == 0) {
+			rc = -EBUSY;
+			goto out;
+		}
+		list_del(&head->lru);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
 		h->max_huge_pages--;
-		update_and_free_page(h, page);
+		update_and_free_page(h, head);
 	}
+out:
 	spin_unlock(&hugetlb_lock);
+	return rc;
 }
 
 /*
  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
  * make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
 */
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
+	struct page *page;
+	int rc = 0;
 
 	if (!hugepages_supported())
-		return;
+		return rc;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+		page = pfn_to_page(pfn);
+		if (PageHuge(page) && !page_count(page)) {
+			rc = dissolve_free_huge_page(page);
+			if (rc)
+				break;
+		}
+	}
 
-	VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
-	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
-		dissolve_free_huge_page(pfn_to_page(pfn));
+	return rc;
 }
 
 /*
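dissolve_free_huge_page() now refuses to shrink the free hugepage pool below the reserved count and reports -EBUSY instead. Below is a toy C model of just that guard, using counter names that mirror the hstate fields but nothing else from the kernel.

/* toy model of the reservation guard added to dissolve_free_huge_page() */
#include <errno.h>
#include <stdio.h>

struct toy_hstate {
	unsigned long free_huge_pages;
	unsigned long resv_huge_pages;
	unsigned long max_huge_pages;
};

static int dissolve_one(struct toy_hstate *h)
{
	if (h->free_huge_pages - h->resv_huge_pages == 0)
		return -EBUSY;		/* would eat into reservations */
	h->free_huge_pages--;
	h->max_huge_pages--;
	return 0;
}

int main(void)
{
	struct toy_hstate h = { .free_huge_pages = 2, .resv_huge_pages = 1,
				.max_huge_pages = 4 };

	printf("%d\n", dissolve_one(&h));	/* 0: one page to spare */
	printf("%d\n", dissolve_one(&h));	/* -EBUSY: only the reserve left */
	return 0;
}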
diff --git a/mm/internal.h b/mm/internal.h
index 1501304f87a4..537ac9951f5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -178,8 +178,9 @@ struct compact_control {
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
+	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
-	bool whole_zone;		/* Whole zone has been scanned */
+	bool whole_zone;		/* Whole zone should/has been scanned */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
diff --git a/mm/ksm.c b/mm/ksm.c
index 5048083b60f2..9ae6011a41f8 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -299,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
 
 static inline struct stable_node *alloc_stable_node(void)
 {
-	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+	/*
+	 * The allocation can take too long with GFP_KERNEL when memory is under
+	 * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
+	 * grants access to memory reserves, helping to avoid this problem.
+	 */
+	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
 }
 
 static inline void free_stable_node(struct stable_node *stable_node)
diff --git a/mm/memblock.c b/mm/memblock.c
index 483197ef613f..c8dfa430342b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void)
 	return memblock.memory.total_size;
 }
 
+phys_addr_t __init_memblock memblock_reserved_size(void)
+{
+	return memblock.reserved.total_size;
+}
+
 phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
 {
 	unsigned long pages = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4be518d4e68a..ae052b5e3315 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -921,6 +921,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
 /**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			  int (*fn)(struct task_struct *, void *), void *arg)
+{
+	struct mem_cgroup *iter;
+	int ret = 0;
+
+	BUG_ON(memcg == root_mem_cgroup);
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		struct css_task_iter it;
+		struct task_struct *task;
+
+		css_task_iter_start(&iter->css, &it);
+		while (!ret && (task = css_task_iter_next(&it)))
+			ret = fn(task, arg);
+		css_task_iter_end(&it);
+		if (ret) {
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+	}
+	return ret;
+}
+
+/**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
  * @zone: zone of the page
@@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
 */
-static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
 	unsigned long limit;
 
@@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		.gfp_mask = gfp_mask,
 		.order = order,
 	};
-	struct mem_cgroup *iter;
-	unsigned long chosen_points = 0;
-	unsigned long totalpages;
-	unsigned int points = 0;
-	struct task_struct *chosen = NULL;
+	bool ret;
 
 	mutex_lock(&oom_lock);
-
-	/*
-	 * If current has a pending SIGKILL or is exiting, then automatically
-	 * select it. The goal is to allow it to allocate so that it may
-	 * quickly exit and free its memory.
-	 */
-	if (task_will_free_mem(current)) {
-		mark_oom_victim(current);
-		wake_oom_reaper(current);
-		goto unlock;
-	}
-
-	check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
-	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
-	for_each_mem_cgroup_tree(iter, memcg) {
-		struct css_task_iter it;
-		struct task_struct *task;
-
-		css_task_iter_start(&iter->css, &it);
-		while ((task = css_task_iter_next(&it))) {
-			switch (oom_scan_process_thread(&oc, task)) {
-			case OOM_SCAN_SELECT:
-				if (chosen)
-					put_task_struct(chosen);
-				chosen = task;
-				chosen_points = ULONG_MAX;
-				get_task_struct(chosen);
-				/* fall through */
-			case OOM_SCAN_CONTINUE:
-				continue;
-			case OOM_SCAN_ABORT:
-				css_task_iter_end(&it);
-				mem_cgroup_iter_break(memcg, iter);
-				if (chosen)
-					put_task_struct(chosen);
-				/* Set a dummy value to return "true". */
-				chosen = (void *) 1;
-				goto unlock;
-			case OOM_SCAN_OK:
-				break;
-			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (!points || points < chosen_points)
-				continue;
-			/* Prefer thread group leaders for display purposes */
-			if (points == chosen_points &&
-			    thread_group_leader(chosen))
-				continue;
-
-			if (chosen)
-				put_task_struct(chosen);
-			chosen = task;
-			chosen_points = points;
-			get_task_struct(chosen);
-		}
-		css_task_iter_end(&it);
-	}
-
-	if (chosen) {
-		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(&oc, chosen, points, totalpages,
-				 "Memory cgroup out of memory");
-	}
-unlock:
+	ret = out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
-	return chosen;
+	return ret;
 }
 
 #if MAX_NUMNODES > 1
@@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!memcg)
 		return false;
 
-	if (!handle || oom_killer_disabled)
+	if (!handle)
 		goto cleanup;
 
 	owait.memcg = memcg;
@@ -2969,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
 		/*
 		 * The active flag needs to be written after the static_key
 		 * update. This is what guarantees that the socket activation
-		 * function is the last one to run. See sock_update_memcg() for
-		 * details, and note that we don't mark any socket as belonging
-		 * to this memcg until that flag is up.
+		 * function is the last one to run. See mem_cgroup_sk_alloc()
+		 * for details, and note that we don't mark any socket as
+		 * belonging to this memcg until that flag is up.
 		 *
 		 * We need to do this, because static_keys will span multiple
 		 * sites, but we can't control their order. If we mark a socket
 		 * as accounted, but the accounting functions are not patched in
 		 * yet, we'll lose accounting.
 		 *
-		 * We never race with the readers in sock_update_memcg(),
+		 * We never race with the readers in mem_cgroup_sk_alloc(),
 		 * because when this value change, the code to process it is not
 		 * patched in yet.
 		 */
@@ -4092,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr);
 
 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
+	VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
 	atomic_add(n, &memcg->id.ref);
 }
 
 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
+	VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
 	if (atomic_sub_and_test(n, &memcg->id.ref)) {
 		idr_remove(&mem_cgroup_idr, memcg->id.id);
 		memcg->id.id = 0;
@@ -4285,8 +4257,10 @@ fail:
 
 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 {
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
 	/* Online state pins memcg ID, memcg ID pins CSS */
-	mem_cgroup_id_get(mem_cgroup_from_css(css));
+	atomic_set(&memcg->id.ref, 1);
 	css_get(css);
 	return 0;
 }
@@ -4434,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 	 * Because lookup_swap_cache() updates some statistics counter,
 	 * we call find_get_page() with swapper_space directly.
 	 */
-	page = find_get_page(swap_address_space(ent), ent.val);
+	page = find_get_page(swap_address_space(ent), swp_offset(ent));
 	if (do_memsw_account())
 		entry->val = ent.val;
 
@@ -4472,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 			swp_entry_t swp = radix_to_swp_entry(page);
 			if (do_memsw_account())
 				*entry = swp;
-			page = find_get_page(swap_address_space(swp), swp.val);
+			page = find_get_page(swap_address_space(swp),
+					     swp_offset(swp));
 		}
 	} else
 		page = find_get_page(mapping, pgoff);
@@ -4707,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 		.mm = mm,
 	};
 	down_read(&mm->mmap_sem);
-	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
+	walk_page_range(0, mm->highest_vm_end,
+			&mem_cgroup_count_precharge_walk);
 	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
@@ -4995,7 +4971,8 @@ retry:
 	 * When we have consumed all precharges and failed in doing
 	 * additional charge, the page walk just aborts.
 	 */
-	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
+	walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+
 	up_read(&mc.mm->mmap_sem);
 	atomic_dec(&mc.from->moving_account);
 }
@@ -5674,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
 EXPORT_SYMBOL(memcg_sockets_enabled_key);
 
-void sock_update_memcg(struct sock *sk)
+void mem_cgroup_sk_alloc(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
 
-	/* Socket cloning can throw us here with sk_cgrp already
+	if (!mem_cgroup_sockets_enabled)
+		return;
+
+	/*
+	 * Socket cloning can throw us here with sk_memcg already
 	 * filled. It won't however, necessarily happen from
 	 * process context. So the test for root memcg given
 	 * the current task's memcg won't help us in this case.
@@ -5703,12 +5684,11 @@ out:
 out:
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(sock_update_memcg);
 
-void sock_release_memcg(struct sock *sk)
+void mem_cgroup_sk_free(struct sock *sk)
 {
-	WARN_ON(!sk->sk_memcg);
-	css_put(&sk->sk_memcg->css);
+	if (sk->sk_memcg)
+		css_put(&sk->sk_memcg->css);
 }
 
 /**
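The kerneldoc added for mem_cgroup_scan_tasks() defines a simple contract: call the callback for every task until it returns non-zero, then stop and hand that value back. The following is a generic userspace C rendering of that contract over a plain int array, purely as an illustration of the callback pattern; none of it is kernel API.

/* generic "scan until the callback returns non-zero" pattern */
#include <stddef.h>
#include <stdio.h>

static int scan(const int *vals, size_t n,
		int (*fn)(int val, void *arg), void *arg)
{
	int ret = 0;

	for (size_t i = 0; i < n && !ret; i++)
		ret = fn(vals[i], arg);	/* first non-zero return wins */
	return ret;
}

/* "select a victim": stop on the first value above a threshold */
static int pick_big(int val, void *arg)
{
	return val > *(int *)arg ? val : 0;
}

int main(void)
{
	int vals[] = { 3, 8, 42, 7 };
	int threshold = 10;

	printf("%d\n", scan(vals, 4, pick_big, &threshold));	/* prints 42 */
	return 0;
}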
diff --git a/mm/memory.c b/mm/memory.c
index f1a68049edff..fc1987dfd8cc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			pfn_t pfn)
 {
+	pgprot_t pgprot = vma->vm_page_prot;
+
 	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
+	if (track_pfn_insert(vma, &pgprot, pfn))
+		return -EINVAL;
 
 	/*
 	 * If we don't have pte special, then we have to use the pfn_valid()
@@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 		 * result in pfn_t_has_page() == false.
 		 */
 		page = pfn_to_page(pfn_t_to_pfn(pfn));
-		return insert_page(vma, addr, page, vma->vm_page_prot);
+		return insert_page(vma, addr, page, pgprot);
 	}
-	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+	return insert_pfn(vma, addr, pfn, pgprot);
 }
 EXPORT_SYMBOL(vm_insert_mixed);
 
@@ -3658,6 +3662,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		mem_cgroup_oom_synchronize(false);
 	}
 
+	/*
+	 * This mm has been already reaped by the oom reaper and so the
+	 * refault cannot be trusted in general. Anonymous refaults would
+	 * lose data and give a zero page instead e.g. This is especially
+	 * problem for use_mm() because regular tasks will just die and
+	 * the corrupted data will not be visible anywhere while kthread
+	 * will outlive the oom victim and potentially propagate the date
+	 * further.
+	 */
+	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
+				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+		ret = VM_FAULT_SIGBUS;
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9d29ba0f7192..962927309b6e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1945,7 +1945,9 @@ repeat:
 	 * dissolve free hugepages in the memory block before doing offlining
 	 * actually in order to make hugetlbfs's object counting consistent.
 	 */
-	dissolve_free_huge_pages(start_pfn, end_pfn);
+	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+	if (ret)
+		goto failed_removal;
 	/* check again */
 	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
 	if (offlined_pages < 0) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2da72a5b6ecc..ad1c96ac313c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void)
 		 */
 		struct zonelist *zonelist;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
-		zonelist = &NODE_DATA(node)->node_zonelists[0];
+		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
 		z = first_zones_zonelist(zonelist, highest_zoneidx,
 							&policy->v.nodes);
 		return z->zone ? z->zone->node : node;
diff --git a/mm/migrate.c b/mm/migrate.c
index f7ee04a5ae27..99250aee1ac1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 		goto unlock;
 
 	get_page(new);
-	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+	pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
 	if (pte_swp_soft_dirty(*ptep))
 		pte = pte_mksoft_dirty(pte);
 
diff --git a/mm/mincore.c b/mm/mincore.c
index c0b5ba965200..bfb866435478 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (radix_tree_exceptional_entry(page)) {
 			swp_entry_t swp = radix_to_swp_entry(page);
-			page = find_get_page(swap_address_space(swp), swp.val);
+			page = find_get_page(swap_address_space(swp),
+					     swp_offset(swp));
 		}
 	} else
 		page = find_get_page(mapping, pgoff);
@@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		} else {
 #ifdef CONFIG_SWAP
 			*vec = mincore_page(swap_address_space(entry),
-					    entry.val);
+					    swp_offset(entry));
 #else
 			WARN_ON(1);
 			*vec = 1;
diff --git a/mm/mlock.c b/mm/mlock.c
index 14645be06e30..145a4258ddbc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
516 int nr_pages; 516 int nr_pages;
517 int ret = 0; 517 int ret = 0;
518 int lock = !!(newflags & VM_LOCKED); 518 int lock = !!(newflags & VM_LOCKED);
519 vm_flags_t old_flags = vma->vm_flags;
519 520
520 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 521 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
521 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) 522 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -550,6 +551,8 @@ success:
550 nr_pages = (end - start) >> PAGE_SHIFT; 551 nr_pages = (end - start) >> PAGE_SHIFT;
551 if (!lock) 552 if (!lock)
552 nr_pages = -nr_pages; 553 nr_pages = -nr_pages;
554 else if (old_flags & VM_LOCKED)
555 nr_pages = 0;
553 mm->locked_vm += nr_pages; 556 mm->locked_vm += nr_pages;
554 557
555 /* 558 /*
@@ -617,6 +620,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
617 return error; 620 return error;
618} 621}
619 622
623/*
624 * Go through the vma areas and sum the size of the mlocked
625 * vma pages.
626 * Note that the deferred memory locking case (mlock2() with
627 * MLOCK_ONFAULT) is also counted.
628 * Return value: the number of previously mlocked pages.
629 */
630static int count_mm_mlocked_page_nr(struct mm_struct *mm,
631 unsigned long start, size_t len)
632{
633 struct vm_area_struct *vma;
634 int count = 0;
635
636 if (mm == NULL)
637 mm = current->mm;
638
639 vma = find_vma(mm, start);
640 if (vma == NULL)
641 vma = mm->mmap;
642
643 for (; vma ; vma = vma->vm_next) {
644 if (start >= vma->vm_end)
645 continue;
646 if (start + len <= vma->vm_start)
647 break;
648 if (vma->vm_flags & VM_LOCKED) {
649 if (start > vma->vm_start)
650 count -= (start - vma->vm_start);
651 if (start + len < vma->vm_end) {
652 count += start + len - vma->vm_start;
653 break;
654 }
655 count += vma->vm_end - vma->vm_start;
656 }
657 }
658
659 return count >> PAGE_SHIFT;
660}
661
620static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) 662static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
621{ 663{
622 unsigned long locked; 664 unsigned long locked;
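count_mm_mlocked_page_nr() above walks the sorted vma list and counts how many pages of [start, start + len) are already VM_LOCKED, so the caller can avoid charging them against the limit twice. Below is a self-contained sketch of the same overlap arithmetic over a sorted array of ranges; struct vma_range and the sample addresses are invented for illustration.

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

/* Simplified stand-in for a vma: [start, end), plus a "locked" flag. */
struct vma_range {
	unsigned long start, end;
	bool locked;
};

/* Count pages of [start, start + len) that are already locked, mirroring
 * the overlap arithmetic of count_mm_mlocked_page_nr() above. */
static long count_locked_pages(const struct vma_range *vmas, int nr,
			       unsigned long start, unsigned long len)
{
	long count = 0;

	for (int i = 0; i < nr; i++) {
		const struct vma_range *vma = &vmas[i];

		if (start >= vma->end)
			continue;	/* vma lies entirely before the range */
		if (start + len <= vma->start)
			break;		/* sorted: everything else lies after */
		if (!vma->locked)
			continue;

		if (start > vma->start)
			count -= start - vma->start;
		if (start + len < vma->end) {
			count += start + len - vma->start;
			break;
		}
		count += vma->end - vma->start;
	}
	return count >> PAGE_SHIFT;
}

int main(void)
{
	/* Two locked ranges and one unlocked one, sorted by address. */
	const struct vma_range vmas[] = {
		{ 0x1000, 0x4000, true },
		{ 0x4000, 0x6000, false },
		{ 0x6000, 0x9000, true },
	};

	/* [0x2000, 0x7000) overlaps 2 locked pages in the first vma
	 * and 1 locked page in the last one. */
	printf("already locked pages: %ld\n",
	       count_locked_pages(vmas, 3, 0x2000, 0x5000));
	return 0;
}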
@@ -639,6 +681,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
639 return -EINTR; 681 return -EINTR;
640 682
641 locked += current->mm->locked_vm; 683 locked += current->mm->locked_vm;
684 if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
685 /*
686 * It is possible that the regions requested intersect with
687 * previously mlocked areas; that part of "mm->locked_vm" should
688 * not be counted again toward the new mlock increment. So check
689 * and adjust the locked count if necessary.
690 */
691 locked -= count_mm_mlocked_page_nr(current->mm,
692 start, len);
693 }
642 694
643 /* check against resource limits */ 695 /* check against resource limits */
644 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 696 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
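The do_mlock() hunk above only changes the accounting when the request would push locked_vm over RLIMIT_MEMLOCK for a caller without CAP_IPC_LOCK. From userspace, the limit being charged against and the resulting mlock() failure mode look roughly like the example below (buffer size is arbitrary and error handling is minimal).

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	size_t len = 4096 * 16;		/* 16 pages, arbitrary */
	void *buf;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0)
		printf("RLIMIT_MEMLOCK: soft=%llu hard=%llu bytes\n",
		       (unsigned long long)rl.rlim_cur,
		       (unsigned long long)rl.rlim_max);

	buf = malloc(len);
	if (!buf)
		return 1;

	/* Typically fails with ENOMEM if the locked total would exceed the
	 * soft limit and the caller lacks CAP_IPC_LOCK; after the change
	 * above, ranges that are already locked are not double-charged. */
	if (mlock(buf, len))
		perror("mlock");
	else
		printf("locked %zu bytes\n", len);

	munlock(buf, len);
	free(buf);
	return 0;
}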
diff --git a/mm/mmap.c b/mm/mmap.c
index 7a0707a48047..1af87c14183d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
116void vma_set_page_prot(struct vm_area_struct *vma) 116void vma_set_page_prot(struct vm_area_struct *vma)
117{ 117{
118 unsigned long vm_flags = vma->vm_flags; 118 unsigned long vm_flags = vma->vm_flags;
119 pgprot_t vm_page_prot;
119 120
120 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); 121 vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
121 if (vma_wants_writenotify(vma)) { 122 if (vma_wants_writenotify(vma, vm_page_prot)) {
122 vm_flags &= ~VM_SHARED; 123 vm_flags &= ~VM_SHARED;
123 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, 124 vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
124 vm_flags);
125 } 125 }
126 /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
127 WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
126} 128}
127 129
128/* 130/*
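vma_set_page_prot() above now computes the new protection in a local variable and publishes it with a single WRITE_ONCE(), which the migrate.c hunk earlier pairs with READ_ONCE(vma->vm_page_prot) on the lockless rmap side. Below is a minimal userspace sketch of that publish/observe idiom; the READ_ONCE/WRITE_ONCE macros are local volatile-access approximations of the kernel helpers (adequate for naturally aligned word-sized values), not their real definitions. Build with -pthread.

#include <pthread.h>
#include <stdio.h>

/* Local approximations of the kernel helpers: force exactly one
 * untorn access per use, so the compiler cannot split or re-read it. */
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

static unsigned long page_prot;		/* models vma->vm_page_prot */

static void *writer(void *arg)
{
	(void)arg;
	for (unsigned long i = 1; i <= 1000000; i++) {
		unsigned long newprot = i;	/* compute in a local... */
		WRITE_ONCE(page_prot, newprot);	/* ...then publish it once */
	}
	return NULL;
}

static void *reader(void *arg)
{
	unsigned long prev = 0, changes = 0;

	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		/* One load per iteration; without READ_ONCE the compiler
		 * would be free to hoist the read out of the loop. */
		unsigned long p = READ_ONCE(page_prot);

		if (p != prev) {
			changes++;
			prev = p;
		}
	}
	printf("reader observed %lu distinct updates\n", changes);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	printf("final value: %lu\n", READ_ONCE(page_prot));
	return 0;
}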
@@ -400,15 +402,9 @@ static inline void vma_rb_insert(struct vm_area_struct *vma,
400 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 402 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
401} 403}
402 404
403static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) 405static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
404{ 406{
405 /* 407 /*
406 * All rb_subtree_gap values must be consistent prior to erase,
407 * with the possible exception of the vma being erased.
408 */
409 validate_mm_rb(root, vma);
410
411 /*
412 * Note rb_erase_augmented is a fairly large inline function, 408 * Note rb_erase_augmented is a fairly large inline function,
413 * so make sure we instantiate it only once with our desired 409 * so make sure we instantiate it only once with our desired
414 * augmented rbtree callbacks. 410 * augmented rbtree callbacks.
@@ -416,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
416 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 412 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
417} 413}
418 414
415static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
416 struct rb_root *root,
417 struct vm_area_struct *ignore)
418{
419 /*
420 * All rb_subtree_gap values must be consistent prior to erase,
421 * with the possible exception of the "next" vma being erased if
422 * next->vm_start was reduced.
423 */
424 validate_mm_rb(root, ignore);
425
426 __vma_rb_erase(vma, root);
427}
428
429static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
430 struct rb_root *root)
431{
432 /*
433 * All rb_subtree_gap values must be consistent prior to erase,
434 * with the possible exception of the vma being erased.
435 */
436 validate_mm_rb(root, vma);
437
438 __vma_rb_erase(vma, root);
439}
440
419/* 441/*
420 * vma has some anon_vma assigned, and is already inserted on that 442 * vma has some anon_vma assigned, and is already inserted on that
421 * anon_vma's interval trees. 443 * anon_vma's interval trees.
@@ -599,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
599 mm->map_count++; 621 mm->map_count++;
600} 622}
601 623
602static inline void 624static __always_inline void __vma_unlink_common(struct mm_struct *mm,
603__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 625 struct vm_area_struct *vma,
604 struct vm_area_struct *prev) 626 struct vm_area_struct *prev,
627 bool has_prev,
628 struct vm_area_struct *ignore)
605{ 629{
606 struct vm_area_struct *next; 630 struct vm_area_struct *next;
607 631
608 vma_rb_erase(vma, &mm->mm_rb); 632 vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
609 prev->vm_next = next = vma->vm_next; 633 next = vma->vm_next;
634 if (has_prev)
635 prev->vm_next = next;
636 else {
637 prev = vma->vm_prev;
638 if (prev)
639 prev->vm_next = next;
640 else
641 mm->mmap = next;
642 }
610 if (next) 643 if (next)
611 next->vm_prev = prev; 644 next->vm_prev = prev;
612 645
@@ -614,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
614 vmacache_invalidate(mm); 647 vmacache_invalidate(mm);
615} 648}
616 649
650static inline void __vma_unlink_prev(struct mm_struct *mm,
651 struct vm_area_struct *vma,
652 struct vm_area_struct *prev)
653{
654 __vma_unlink_common(mm, vma, prev, true, vma);
655}
656
617/* 657/*
618 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 658 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
619 * is already present in an i_mmap tree without adjusting the tree. 659 * is already present in an i_mmap tree without adjusting the tree.
@@ -621,11 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
621 * are necessary. The "insert" vma (if any) is to be inserted 661 * are necessary. The "insert" vma (if any) is to be inserted
622 * before we drop the necessary locks. 662 * before we drop the necessary locks.
623 */ 663 */
624int vma_adjust(struct vm_area_struct *vma, unsigned long start, 664int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
625 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 665 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
666 struct vm_area_struct *expand)
626{ 667{
627 struct mm_struct *mm = vma->vm_mm; 668 struct mm_struct *mm = vma->vm_mm;
628 struct vm_area_struct *next = vma->vm_next; 669 struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
629 struct address_space *mapping = NULL; 670 struct address_space *mapping = NULL;
630 struct rb_root *root = NULL; 671 struct rb_root *root = NULL;
631 struct anon_vma *anon_vma = NULL; 672 struct anon_vma *anon_vma = NULL;
@@ -641,9 +682,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
641 /* 682 /*
642 * vma expands, overlapping all the next, and 683 * vma expands, overlapping all the next, and
643 * perhaps the one after too (mprotect case 6). 684 * perhaps the one after too (mprotect case 6).
 685 * The only other cases that get here are
686 * case 1, case 7 and case 8.
644 */ 687 */
645 remove_next = 1 + (end > next->vm_end); 688 if (next == expand) {
646 end = next->vm_end; 689 /*
690 * The only case where we don't expand "vma"
691 * and we expand "next" instead is case 8.
692 */
693 VM_WARN_ON(end != next->vm_end);
694 /*
695 * remove_next == 3 means we're
696 * removing "vma" and that to do so we
697 * swapped "vma" and "next".
698 */
699 remove_next = 3;
700 VM_WARN_ON(file != next->vm_file);
701 swap(vma, next);
702 } else {
703 VM_WARN_ON(expand != vma);
704 /*
705 * case 1, 6, 7, remove_next == 2 is case 6,
706 * remove_next == 1 is case 1 or 7.
707 */
708 remove_next = 1 + (end > next->vm_end);
709 VM_WARN_ON(remove_next == 2 &&
710 end != next->vm_next->vm_end);
711 VM_WARN_ON(remove_next == 1 &&
712 end != next->vm_end);
713 /* trim end to next, for case 6 first pass */
714 end = next->vm_end;
715 }
716
647 exporter = next; 717 exporter = next;
648 importer = vma; 718 importer = vma;
649 719
@@ -651,7 +721,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
651 * If next doesn't have anon_vma, import from vma after 721 * If next doesn't have anon_vma, import from vma after
652 * next, if the vma overlaps with it. 722 * next, if the vma overlaps with it.
653 */ 723 */
654 if (remove_next == 2 && next && !next->anon_vma) 724 if (remove_next == 2 && !next->anon_vma)
655 exporter = next->vm_next; 725 exporter = next->vm_next;
656 726
657 } else if (end > next->vm_start) { 727 } else if (end > next->vm_start) {
@@ -662,6 +732,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
662 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 732 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
663 exporter = next; 733 exporter = next;
664 importer = vma; 734 importer = vma;
735 VM_WARN_ON(expand != importer);
665 } else if (end < vma->vm_end) { 736 } else if (end < vma->vm_end) {
666 /* 737 /*
667 * vma shrinks, and !insert tells it's not 738 * vma shrinks, and !insert tells it's not
@@ -671,6 +742,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
671 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); 742 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
672 exporter = vma; 743 exporter = vma;
673 importer = next; 744 importer = next;
745 VM_WARN_ON(expand != importer);
674 } 746 }
675 747
676 /* 748 /*
@@ -688,7 +760,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
688 } 760 }
689 } 761 }
690again: 762again:
691 vma_adjust_trans_huge(vma, start, end, adjust_next); 763 vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
692 764
693 if (file) { 765 if (file) {
694 mapping = file->f_mapping; 766 mapping = file->f_mapping;
@@ -714,8 +786,8 @@ again:
714 if (!anon_vma && adjust_next) 786 if (!anon_vma && adjust_next)
715 anon_vma = next->anon_vma; 787 anon_vma = next->anon_vma;
716 if (anon_vma) { 788 if (anon_vma) {
717 VM_BUG_ON_VMA(adjust_next && next->anon_vma && 789 VM_WARN_ON(adjust_next && next->anon_vma &&
718 anon_vma != next->anon_vma, next); 790 anon_vma != next->anon_vma);
719 anon_vma_lock_write(anon_vma); 791 anon_vma_lock_write(anon_vma);
720 anon_vma_interval_tree_pre_update_vma(vma); 792 anon_vma_interval_tree_pre_update_vma(vma);
721 if (adjust_next) 793 if (adjust_next)
@@ -755,7 +827,19 @@ again:
755 * vma_merge has merged next into vma, and needs 827 * vma_merge has merged next into vma, and needs
756 * us to remove next before dropping the locks. 828 * us to remove next before dropping the locks.
757 */ 829 */
758 __vma_unlink(mm, next, vma); 830 if (remove_next != 3)
831 __vma_unlink_prev(mm, next, vma);
832 else
833 /*
834 * vma is not before next if they've been
835 * swapped.
836 *
837 * pre-swap() next->vm_start was reduced so
838 * tell validate_mm_rb to ignore pre-swap()
839 * "next" (which is stored in post-swap()
840 * "vma").
841 */
842 __vma_unlink_common(mm, next, NULL, false, vma);
759 if (file) 843 if (file)
760 __remove_shared_vm_struct(next, file, mapping); 844 __remove_shared_vm_struct(next, file, mapping);
761 } else if (insert) { 845 } else if (insert) {
@@ -807,7 +891,27 @@ again:
807 * we must remove another next too. It would clutter 891 * we must remove another next too. It would clutter
808 * up the code too much to do both in one go. 892 * up the code too much to do both in one go.
809 */ 893 */
810 next = vma->vm_next; 894 if (remove_next != 3) {
895 /*
896 * If "next" was removed and vma->vm_end was
897 * expanded (up) over it, in turn
898 * "next->vm_prev->vm_end" changed and the
899 * "vma->vm_next" gap must be updated.
900 */
901 next = vma->vm_next;
902 } else {
903 /*
 904 * For the scope of this comment, "next" and
 905 * "vma" are considered pre-swap(): if "vma" was
 906 * removed, next->vm_start was expanded (down)
 907 * over it and the "next" gap must be updated.
 908 * Because of the swap(), the post-swap() "vma"
 909 * actually points to pre-swap() "next"
 910 * (post-swap() "next", by contrast, is now a
 911 * dangling pointer).
912 */
913 next = vma;
914 }
811 if (remove_next == 2) { 915 if (remove_next == 2) {
812 remove_next = 1; 916 remove_next = 1;
813 end = next->vm_end; 917 end = next->vm_end;
@@ -815,8 +919,28 @@ again:
815 } 919 }
816 else if (next) 920 else if (next)
817 vma_gap_update(next); 921 vma_gap_update(next);
818 else 922 else {
819 mm->highest_vm_end = end; 923 /*
924 * If remove_next == 2 we obviously can't
925 * reach this path.
926 *
927 * If remove_next == 3 we can't reach this
 928 * path because the pre-swap() next is never
 929 * NULL. The pre-swap() "next" is not being
930 * removed and its next->vm_end is not altered
931 * (and furthermore "end" already matches
932 * next->vm_end in remove_next == 3).
933 *
934 * We reach this only in the remove_next == 1
935 * case if the "next" vma that was removed was
936 * the highest vma of the mm. However in such
937 * case next->vm_end == "end" and the extended
938 * "vma" has vma->vm_end == next->vm_end so
939 * mm->highest_vm_end doesn't need any update
940 * in remove_next == 1 case.
941 */
942 VM_WARN_ON(mm->highest_vm_end != end);
943 }
820 } 944 }
821 if (insert && file) 945 if (insert && file)
822 uprobe_mmap(insert); 946 uprobe_mmap(insert);
@@ -936,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
936 * cannot merge might become might become might become 1060 * cannot merge might become might become might become
937 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 1061 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
938 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 1062 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
939 * mremap move: PPPPNNNNNNNN 8 1063 * mremap move: PPPPXXXXXXXX 8
940 * AAAA 1064 * AAAA
941 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 1065 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
942 * might become case 1 below case 2 below case 3 below 1066 * might become case 1 below case 2 below case 3 below
943 * 1067 *
 944 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 1068 * It is important for case 8 that the vma NNNN overlapping the
 945 * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 1069 * region AAAA is never going to be extended over XXXX. Instead XXXX must
1070 * be extended in region AAAA and NNNN must be removed. This way in
1071 * all cases where vma_merge succeeds, the moment vma_adjust drops the
 1072 * rmap_locks, the properties of the merged vma will already be
1073 * correct for the whole merged range. Some of those properties like
1074 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
1075 * be correct for the whole merged range immediately after the
1076 * rmap_locks are released. Otherwise if XXXX would be removed and
1077 * NNNN would be extended over the XXXX range, remove_migration_ptes
1078 * or other rmap walkers (if working on addresses beyond the "end"
1079 * parameter) may establish ptes with the wrong permissions of NNNN
1080 * instead of the right permissions of XXXX.
946 */ 1081 */
947struct vm_area_struct *vma_merge(struct mm_struct *mm, 1082struct vm_area_struct *vma_merge(struct mm_struct *mm,
948 struct vm_area_struct *prev, unsigned long addr, 1083 struct vm_area_struct *prev, unsigned long addr,
@@ -967,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
967 else 1102 else
968 next = mm->mmap; 1103 next = mm->mmap;
969 area = next; 1104 area = next;
970 if (next && next->vm_end == end) /* cases 6, 7, 8 */ 1105 if (area && area->vm_end == end) /* cases 6, 7, 8 */
971 next = next->vm_next; 1106 next = next->vm_next;
972 1107
 1108 /* verify some invariants that must be enforced by the caller */
1109 VM_WARN_ON(prev && addr <= prev->vm_start);
1110 VM_WARN_ON(area && end > area->vm_end);
1111 VM_WARN_ON(addr >= end);
1112
973 /* 1113 /*
974 * Can it merge with the predecessor? 1114 * Can it merge with the predecessor?
975 */ 1115 */
@@ -990,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
990 is_mergeable_anon_vma(prev->anon_vma, 1130 is_mergeable_anon_vma(prev->anon_vma,
991 next->anon_vma, NULL)) { 1131 next->anon_vma, NULL)) {
992 /* cases 1, 6 */ 1132 /* cases 1, 6 */
993 err = vma_adjust(prev, prev->vm_start, 1133 err = __vma_adjust(prev, prev->vm_start,
994 next->vm_end, prev->vm_pgoff, NULL); 1134 next->vm_end, prev->vm_pgoff, NULL,
1135 prev);
995 } else /* cases 2, 5, 7 */ 1136 } else /* cases 2, 5, 7 */
996 err = vma_adjust(prev, prev->vm_start, 1137 err = __vma_adjust(prev, prev->vm_start,
997 end, prev->vm_pgoff, NULL); 1138 end, prev->vm_pgoff, NULL, prev);
998 if (err) 1139 if (err)
999 return NULL; 1140 return NULL;
1000 khugepaged_enter_vma_merge(prev, vm_flags); 1141 khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1010,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1010 anon_vma, file, pgoff+pglen, 1151 anon_vma, file, pgoff+pglen,
1011 vm_userfaultfd_ctx)) { 1152 vm_userfaultfd_ctx)) {
1012 if (prev && addr < prev->vm_end) /* case 4 */ 1153 if (prev && addr < prev->vm_end) /* case 4 */
1013 err = vma_adjust(prev, prev->vm_start, 1154 err = __vma_adjust(prev, prev->vm_start,
1014 addr, prev->vm_pgoff, NULL); 1155 addr, prev->vm_pgoff, NULL, next);
1015 else /* cases 3, 8 */ 1156 else { /* cases 3, 8 */
1016 err = vma_adjust(area, addr, next->vm_end, 1157 err = __vma_adjust(area, addr, next->vm_end,
1017 next->vm_pgoff - pglen, NULL); 1158 next->vm_pgoff - pglen, NULL, next);
1159 /*
1160 * In case 3 area is already equal to next and
1161 * this is a noop, but in case 8 "area" has
1162 * been removed and next was expanded over it.
1163 */
1164 area = next;
1165 }
1018 if (err) 1166 if (err)
1019 return NULL; 1167 return NULL;
1020 khugepaged_enter_vma_merge(area, vm_flags); 1168 khugepaged_enter_vma_merge(area, vm_flags);
@@ -1386,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1386 * to the private version (using protection_map[] without the 1534 * to the private version (using protection_map[] without the
1387 * VM_SHARED bit). 1535 * VM_SHARED bit).
1388 */ 1536 */
1389int vma_wants_writenotify(struct vm_area_struct *vma) 1537int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
1390{ 1538{
1391 vm_flags_t vm_flags = vma->vm_flags; 1539 vm_flags_t vm_flags = vma->vm_flags;
1392 const struct vm_operations_struct *vm_ops = vma->vm_ops; 1540 const struct vm_operations_struct *vm_ops = vma->vm_ops;
@@ -1401,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1401 1549
1402 /* The open routine did something to the protections that pgprot_modify 1550 /* The open routine did something to the protections that pgprot_modify
1403 * won't preserve? */ 1551 * won't preserve? */
1404 if (pgprot_val(vma->vm_page_prot) != 1552 if (pgprot_val(vm_page_prot) !=
1405 pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) 1553 pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
1406 return 0; 1554 return 0;
1407 1555
1408 /* Do we need to track softdirty? */ 1556 /* Do we need to track softdirty? */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a4830f0325fe..ec91dfd3f900 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
304 vma->vm_userfaultfd_ctx); 304 vma->vm_userfaultfd_ctx);
305 if (*pprev) { 305 if (*pprev) {
306 vma = *pprev; 306 vma = *pprev;
307 VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
307 goto success; 308 goto success;
308 } 309 }
309 310
@@ -327,7 +328,7 @@ success:
327 * held in write mode. 328 * held in write mode.
328 */ 329 */
329 vma->vm_flags = newflags; 330 vma->vm_flags = newflags;
330 dirty_accountable = vma_wants_writenotify(vma); 331 dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
331 vma_set_page_prot(vma); 332 vma_set_page_prot(vma);
332 333
333 change_protection(vma, start, end, vma->vm_page_prot, 334 change_protection(vma, start, end, vma->vm_page_prot,
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd05a70f44b9..ba609b684d7a 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -11,18 +11,21 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/bootmem.h>
15#include <linux/export.h> 14#include <linux/export.h>
16#include <linux/kmemleak.h> 15#include <linux/kmemleak.h>
17#include <linux/range.h> 16#include <linux/range.h>
18#include <linux/memblock.h> 17#include <linux/memblock.h>
18#include <linux/bootmem.h>
19 19
20#include <asm/bug.h> 20#include <asm/bug.h>
21#include <asm/io.h> 21#include <asm/io.h>
22#include <asm/processor.h>
23 22
24#include "internal.h" 23#include "internal.h"
25 24
25#ifndef CONFIG_HAVE_MEMBLOCK
26#error CONFIG_HAVE_MEMBLOCK not defined
27#endif
28
26#ifndef CONFIG_NEED_MULTIPLE_NODES 29#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data; 30struct pglist_data __refdata contig_page_data;
28EXPORT_SYMBOL(contig_page_data); 31EXPORT_SYMBOL(contig_page_data);
@@ -134,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void)
134 for_each_reserved_mem_region(i, &start, &end) 137 for_each_reserved_mem_region(i, &start, &end)
135 reserve_bootmem_region(start, end); 138 reserve_bootmem_region(start, end);
136 139
140 /*
141 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
 142 * because in some cases, e.g. when Node0 has no RAM installed,
 143 * the low RAM will be on Node1.
144 */
137 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, 145 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
138 NULL) 146 NULL)
139 count += __free_memory_core(start, end); 147 count += __free_memory_core(start, end);
@@ -191,11 +199,6 @@ unsigned long __init free_all_bootmem(void)
191 199
192 reset_all_zones_managed_pages(); 200 reset_all_zones_managed_pages();
193 201
194 /*
195 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
196 * because in some case like Node0 doesn't have RAM installed
197 * low ram will be on Node1
198 */
199 pages = free_low_memory_core_early(); 202 pages = free_low_memory_core_early();
200 totalram_pages += pages; 203 totalram_pages += pages;
201 204
@@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
395 return __alloc_bootmem_node(pgdat, size, align, goal); 398 return __alloc_bootmem_node(pgdat, size, align, goal);
396} 399}
397 400
398#ifndef ARCH_LOW_ADDRESS_LIMIT
399#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
400#endif
401 401
402/** 402/**
403 * __alloc_bootmem_low - allocate low boot memory 403 * __alloc_bootmem_low - allocate low boot memory
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d53a9aa00977..ec9f11d4f094 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
132 return oc->order == -1; 132 return oc->order == -1;
133} 133}
134 134
135static inline bool is_memcg_oom(struct oom_control *oc)
136{
137 return oc->memcg != NULL;
138}
139
135/* return true if the task is not adequate as candidate victim task. */ 140/* return true if the task is not adequate as candidate victim task. */
136static bool oom_unkillable_task(struct task_struct *p, 141static bool oom_unkillable_task(struct task_struct *p,
137 struct mem_cgroup *memcg, const nodemask_t *nodemask) 142 struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -181,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
181 */ 186 */
182 adj = (long)p->signal->oom_score_adj; 187 adj = (long)p->signal->oom_score_adj;
183 if (adj == OOM_SCORE_ADJ_MIN || 188 if (adj == OOM_SCORE_ADJ_MIN ||
184 test_bit(MMF_OOM_REAPED, &p->mm->flags) || 189 test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
185 in_vfork(p)) { 190 in_vfork(p)) {
186 task_unlock(p); 191 task_unlock(p);
187 return 0; 192 return 0;
@@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
213 return points > 0 ? points : 1; 218 return points > 0 ? points : 1;
214} 219}
215 220
221enum oom_constraint {
222 CONSTRAINT_NONE,
223 CONSTRAINT_CPUSET,
224 CONSTRAINT_MEMORY_POLICY,
225 CONSTRAINT_MEMCG,
226};
227
216/* 228/*
217 * Determine the type of allocation constraint. 229 * Determine the type of allocation constraint.
218 */ 230 */
219#ifdef CONFIG_NUMA 231static enum oom_constraint constrained_alloc(struct oom_control *oc)
220static enum oom_constraint constrained_alloc(struct oom_control *oc,
221 unsigned long *totalpages)
222{ 232{
223 struct zone *zone; 233 struct zone *zone;
224 struct zoneref *z; 234 struct zoneref *z;
@@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
226 bool cpuset_limited = false; 236 bool cpuset_limited = false;
227 int nid; 237 int nid;
228 238
239 if (is_memcg_oom(oc)) {
240 oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
241 return CONSTRAINT_MEMCG;
242 }
243
229 /* Default to all available memory */ 244 /* Default to all available memory */
230 *totalpages = totalram_pages + total_swap_pages; 245 oc->totalpages = totalram_pages + total_swap_pages;
246
247 if (!IS_ENABLED(CONFIG_NUMA))
248 return CONSTRAINT_NONE;
231 249
232 if (!oc->zonelist) 250 if (!oc->zonelist)
233 return CONSTRAINT_NONE; 251 return CONSTRAINT_NONE;
@@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
246 */ 264 */
247 if (oc->nodemask && 265 if (oc->nodemask &&
248 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { 266 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
249 *totalpages = total_swap_pages; 267 oc->totalpages = total_swap_pages;
250 for_each_node_mask(nid, *oc->nodemask) 268 for_each_node_mask(nid, *oc->nodemask)
251 *totalpages += node_spanned_pages(nid); 269 oc->totalpages += node_spanned_pages(nid);
252 return CONSTRAINT_MEMORY_POLICY; 270 return CONSTRAINT_MEMORY_POLICY;
253 } 271 }
254 272
@@ -259,98 +277,84 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
259 cpuset_limited = true; 277 cpuset_limited = true;
260 278
261 if (cpuset_limited) { 279 if (cpuset_limited) {
262 *totalpages = total_swap_pages; 280 oc->totalpages = total_swap_pages;
263 for_each_node_mask(nid, cpuset_current_mems_allowed) 281 for_each_node_mask(nid, cpuset_current_mems_allowed)
264 *totalpages += node_spanned_pages(nid); 282 oc->totalpages += node_spanned_pages(nid);
265 return CONSTRAINT_CPUSET; 283 return CONSTRAINT_CPUSET;
266 } 284 }
267 return CONSTRAINT_NONE; 285 return CONSTRAINT_NONE;
268} 286}
269#else
270static enum oom_constraint constrained_alloc(struct oom_control *oc,
271 unsigned long *totalpages)
272{
273 *totalpages = totalram_pages + total_swap_pages;
274 return CONSTRAINT_NONE;
275}
276#endif
277 287
278enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, 288static int oom_evaluate_task(struct task_struct *task, void *arg)
279 struct task_struct *task)
280{ 289{
290 struct oom_control *oc = arg;
291 unsigned long points;
292
281 if (oom_unkillable_task(task, NULL, oc->nodemask)) 293 if (oom_unkillable_task(task, NULL, oc->nodemask))
282 return OOM_SCAN_CONTINUE; 294 goto next;
283 295
284 /* 296 /*
285 * This task already has access to memory reserves and is being killed. 297 * This task already has access to memory reserves and is being killed.
286 * Don't allow any other task to have access to the reserves unless 298 * Don't allow any other task to have access to the reserves unless
287 * the task has MMF_OOM_REAPED because chances that it would release 299 * the task has MMF_OOM_SKIP because chances that it would release
288 * any memory is quite low. 300 * any memory is quite low.
289 */ 301 */
290 if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { 302 if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
291 struct task_struct *p = find_lock_task_mm(task); 303 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
292 enum oom_scan_t ret = OOM_SCAN_ABORT; 304 goto next;
293 305 goto abort;
294 if (p) {
295 if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
296 ret = OOM_SCAN_CONTINUE;
297 task_unlock(p);
298 }
299
300 return ret;
301 } 306 }
302 307
303 /* 308 /*
304 * If task is allocating a lot of memory and has been marked to be 309 * If task is allocating a lot of memory and has been marked to be
305 * killed first if it triggers an oom, then select it. 310 * killed first if it triggers an oom, then select it.
306 */ 311 */
307 if (oom_task_origin(task)) 312 if (oom_task_origin(task)) {
308 return OOM_SCAN_SELECT; 313 points = ULONG_MAX;
314 goto select;
315 }
309 316
310 return OOM_SCAN_OK; 317 points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
318 if (!points || points < oc->chosen_points)
319 goto next;
320
321 /* Prefer thread group leaders for display purposes */
322 if (points == oc->chosen_points && thread_group_leader(oc->chosen))
323 goto next;
324select:
325 if (oc->chosen)
326 put_task_struct(oc->chosen);
327 get_task_struct(task);
328 oc->chosen = task;
329 oc->chosen_points = points;
330next:
331 return 0;
332abort:
333 if (oc->chosen)
334 put_task_struct(oc->chosen);
335 oc->chosen = (void *)-1UL;
336 return 1;
311} 337}
312 338
313/* 339/*
314 * Simple selection loop. We chose the process with the highest 340 * Simple selection loop. We choose the process with the highest number of
 315 * number of 'points'. Returns -1 on scan abort. 341 * 'points'. If the scan was aborted, oc->chosen is set to -1.
316 */ 342 */
317static struct task_struct *select_bad_process(struct oom_control *oc, 343static void select_bad_process(struct oom_control *oc)
318 unsigned int *ppoints, unsigned long totalpages)
319{ 344{
320 struct task_struct *p; 345 if (is_memcg_oom(oc))
321 struct task_struct *chosen = NULL; 346 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
322 unsigned long chosen_points = 0; 347 else {
323 348 struct task_struct *p;
324 rcu_read_lock();
325 for_each_process(p) {
326 unsigned int points;
327
328 switch (oom_scan_process_thread(oc, p)) {
329 case OOM_SCAN_SELECT:
330 chosen = p;
331 chosen_points = ULONG_MAX;
332 /* fall through */
333 case OOM_SCAN_CONTINUE:
334 continue;
335 case OOM_SCAN_ABORT:
336 rcu_read_unlock();
337 return (struct task_struct *)(-1UL);
338 case OOM_SCAN_OK:
339 break;
340 };
341 points = oom_badness(p, NULL, oc->nodemask, totalpages);
342 if (!points || points < chosen_points)
343 continue;
344 349
345 chosen = p; 350 rcu_read_lock();
346 chosen_points = points; 351 for_each_process(p)
352 if (oom_evaluate_task(p, oc))
353 break;
354 rcu_read_unlock();
347 } 355 }
348 if (chosen)
349 get_task_struct(chosen);
350 rcu_read_unlock();
351 356
352 *ppoints = chosen_points * 1000 / totalpages; 357 oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
353 return chosen;
354} 358}
355 359
356/** 360/**
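oom_evaluate_task() above folds the old OOM_SCAN_* switch into one callback: skip unkillable tasks, abort while an earlier victim that is not yet MMF_OOM_SKIP is still exiting, select oom_task_origin() tasks outright, and otherwise keep the task with the highest oom_badness() score. Below is a self-contained sketch of that select/skip/abort scan over an array; struct task_model and the sample scores are invented for the example and simplify away the thread-group-leader tie-break.

#include <stdio.h>
#include <stdbool.h>
#include <limits.h>

/* Invented model of the per-task state the callback looks at. */
struct task_model {
	const char *name;
	unsigned long badness;	/* models oom_badness() */
	bool unkillable;	/* models oom_unkillable_task() */
	bool origin;		/* models oom_task_origin(): select immediately */
	bool dying;		/* models an existing, not yet skipped victim */
};

struct selection {
	const struct task_model *chosen;
	unsigned long chosen_points;
};

/* Returns nonzero to abort the scan, mirroring oom_evaluate_task(). */
static int evaluate(const struct task_model *t, struct selection *sel)
{
	unsigned long points;

	if (t->unkillable)
		return 0;			/* next */
	if (t->dying) {
		sel->chosen = NULL;		/* abort: let the victim exit */
		return 1;
	}
	points = t->origin ? ULONG_MAX : t->badness;
	if (!points || points < sel->chosen_points)
		return 0;			/* next */
	sel->chosen = t;
	sel->chosen_points = points;
	return 0;
}

int main(void)
{
	const struct task_model tasks[] = {
		{ "init",     10, true,  false, false },
		{ "editor",  120, false, false, false },
		{ "browser", 900, false, false, false },
		{ "batch",    50, false, true,  false },	/* marked as origin */
	};
	struct selection sel = { NULL, 0 };

	for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		if (evaluate(&tasks[i], &sel))
			break;

	if (sel.chosen)
		printf("chosen: %s\n", sel.chosen->name);	/* "batch" wins via origin */
	else
		printf("scan aborted or nothing killable\n");
	return 0;
}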
@@ -399,9 +403,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
399 403
400static void dump_header(struct oom_control *oc, struct task_struct *p) 404static void dump_header(struct oom_control *oc, struct task_struct *p)
401{ 405{
402 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", 406 nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed;
403 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 407
408 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
409 current->comm, oc->gfp_mask, &oc->gfp_mask,
410 nodemask_pr_args(nm), oc->order,
404 current->signal->oom_score_adj); 411 current->signal->oom_score_adj);
412 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
413 pr_warn("COMPACTION is disabled!!!\n");
405 414
406 cpuset_print_current_mems_allowed(); 415 cpuset_print_current_mems_allowed();
407 dump_stack(); 416 dump_stack();
@@ -419,7 +428,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
419static atomic_t oom_victims = ATOMIC_INIT(0); 428static atomic_t oom_victims = ATOMIC_INIT(0);
420static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); 429static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
421 430
422bool oom_killer_disabled __read_mostly; 431static bool oom_killer_disabled __read_mostly;
423 432
424#define K(x) ((x) << (PAGE_SHIFT-10)) 433#define K(x) ((x) << (PAGE_SHIFT-10))
425 434
@@ -452,12 +461,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
452static struct task_struct *oom_reaper_list; 461static struct task_struct *oom_reaper_list;
453static DEFINE_SPINLOCK(oom_reaper_lock); 462static DEFINE_SPINLOCK(oom_reaper_lock);
454 463
455static bool __oom_reap_task(struct task_struct *tsk) 464static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
456{ 465{
457 struct mmu_gather tlb; 466 struct mmu_gather tlb;
458 struct vm_area_struct *vma; 467 struct vm_area_struct *vma;
459 struct mm_struct *mm = NULL;
460 struct task_struct *p;
461 struct zap_details details = {.check_swap_entries = true, 468 struct zap_details details = {.check_swap_entries = true,
462 .ignore_dirty = true}; 469 .ignore_dirty = true};
463 bool ret = true; 470 bool ret = true;
@@ -465,7 +472,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
465 /* 472 /*
466 * We have to make sure to not race with the victim exit path 473 * We have to make sure to not race with the victim exit path
467 * and cause premature new oom victim selection: 474 * and cause premature new oom victim selection:
468 * __oom_reap_task exit_mm 475 * __oom_reap_task_mm exit_mm
469 * mmget_not_zero 476 * mmget_not_zero
470 * mmput 477 * mmput
471 * atomic_dec_and_test 478 * atomic_dec_and_test
@@ -478,22 +485,9 @@ static bool __oom_reap_task(struct task_struct *tsk)
478 */ 485 */
479 mutex_lock(&oom_lock); 486 mutex_lock(&oom_lock);
480 487
481 /*
482 * Make sure we find the associated mm_struct even when the particular
483 * thread has already terminated and cleared its mm.
484 * We might have race with exit path so consider our work done if there
485 * is no mm.
486 */
487 p = find_lock_task_mm(tsk);
488 if (!p)
489 goto unlock_oom;
490 mm = p->mm;
491 atomic_inc(&mm->mm_count);
492 task_unlock(p);
493
494 if (!down_read_trylock(&mm->mmap_sem)) { 488 if (!down_read_trylock(&mm->mmap_sem)) {
495 ret = false; 489 ret = false;
496 goto mm_drop; 490 goto unlock_oom;
497 } 491 }
498 492
499 /* 493 /*
@@ -503,9 +497,17 @@ static bool __oom_reap_task(struct task_struct *tsk)
503 */ 497 */
504 if (!mmget_not_zero(mm)) { 498 if (!mmget_not_zero(mm)) {
505 up_read(&mm->mmap_sem); 499 up_read(&mm->mmap_sem);
506 goto mm_drop; 500 goto unlock_oom;
507 } 501 }
508 502
503 /*
504 * Tell all users of get_user/copy_from_user etc... that the content
 505 * is no longer stable. No barriers are really needed because unmapping
 506 * should imply barriers already and the reader would hit a page fault
 507 * if it stumbled over reaped memory.
508 */
509 set_bit(MMF_UNSTABLE, &mm->flags);
510
509 tlb_gather_mmu(&tlb, mm, 0, -1); 511 tlb_gather_mmu(&tlb, mm, 0, -1);
510 for (vma = mm->mmap ; vma; vma = vma->vm_next) { 512 for (vma = mm->mmap ; vma; vma = vma->vm_next) {
511 if (is_vm_hugetlb_page(vma)) 513 if (is_vm_hugetlb_page(vma))
@@ -541,18 +543,11 @@ static bool __oom_reap_task(struct task_struct *tsk)
541 up_read(&mm->mmap_sem); 543 up_read(&mm->mmap_sem);
542 544
543 /* 545 /*
544 * This task can be safely ignored because we cannot do much more
545 * to release its memory.
546 */
547 set_bit(MMF_OOM_REAPED, &mm->flags);
548 /*
549 * Drop our reference but make sure the mmput slow path is called from a 546 * Drop our reference but make sure the mmput slow path is called from a
550 * different context because we shouldn't risk we get stuck there and 547 * different context because we shouldn't risk we get stuck there and
551 * put the oom_reaper out of the way. 548 * put the oom_reaper out of the way.
552 */ 549 */
553 mmput_async(mm); 550 mmput_async(mm);
554mm_drop:
555 mmdrop(mm);
556unlock_oom: 551unlock_oom:
557 mutex_unlock(&oom_lock); 552 mutex_unlock(&oom_lock);
558 return ret; 553 return ret;
@@ -562,44 +557,28 @@ unlock_oom:
562static void oom_reap_task(struct task_struct *tsk) 557static void oom_reap_task(struct task_struct *tsk)
563{ 558{
564 int attempts = 0; 559 int attempts = 0;
560 struct mm_struct *mm = tsk->signal->oom_mm;
565 561
566 /* Retry the down_read_trylock(mmap_sem) a few times */ 562 /* Retry the down_read_trylock(mmap_sem) a few times */
567 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) 563 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
568 schedule_timeout_idle(HZ/10); 564 schedule_timeout_idle(HZ/10);
569 565
570 if (attempts > MAX_OOM_REAP_RETRIES) { 566 if (attempts <= MAX_OOM_REAP_RETRIES)
571 struct task_struct *p; 567 goto done;
572 568
573 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
574 task_pid_nr(tsk), tsk->comm);
575 569
576 /* 570 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
577 * If we've already tried to reap this task in the past and 571 task_pid_nr(tsk), tsk->comm);
578 * failed it probably doesn't make much sense to try yet again 572 debug_show_all_locks();
579 * so hide the mm from the oom killer so that it can move on
580 * to another task with a different mm struct.
581 */
582 p = find_lock_task_mm(tsk);
583 if (p) {
584 if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
585 pr_info("oom_reaper: giving up pid:%d (%s)\n",
586 task_pid_nr(tsk), tsk->comm);
587 set_bit(MMF_OOM_REAPED, &p->mm->flags);
588 }
589 task_unlock(p);
590 }
591 573
592 debug_show_all_locks(); 574done:
593 } 575 tsk->oom_reaper_list = NULL;
594 576
595 /* 577 /*
 596 * Clear TIF_MEMDIE because the task shouldn't be sitting on a 578 * Hide this mm from the OOM killer because it has either been reaped or
 597 * reasonably reclaimable memory anymore or it is not a good candidate 579 * somebody got stuck and cannot call up_write(mmap_sem).
598 * for the oom victim right now because it cannot release its memory
599 * itself nor by the oom reaper.
600 */ 580 */
601 tsk->oom_reaper_list = NULL; 581 set_bit(MMF_OOM_SKIP, &mm->flags);
602 exit_oom_victim(tsk);
603 582
604 /* Drop a reference taken by wake_oom_reaper */ 583 /* Drop a reference taken by wake_oom_reaper */
605 put_task_struct(tsk); 584 put_task_struct(tsk);
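oom_reap_task() above retries the trylock-based reap a bounded number of times, sleeping briefly between attempts, and finally gives up by setting MMF_OOM_SKIP. Below is a compact userspace sketch of the same bounded trylock-and-retry pattern with pthreads; the 100 ms delay corresponds to the kernel's HZ/10 and MAX_RETRIES is illustrative. Build with -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define MAX_RETRIES 10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* One reap attempt: succeed only if the lock can be taken right now,
 * mirroring down_read_trylock(&mm->mmap_sem) in __oom_reap_task_mm(). */
static bool try_reap(void)
{
	if (pthread_mutex_trylock(&lock))
		return false;		/* somebody else holds it, back off */
	/* ... the real code would unmap anonymous memory here ... */
	pthread_mutex_unlock(&lock);
	return true;
}

int main(void)
{
	struct timespec delay = { 0, 100 * 1000 * 1000 };	/* 100 ms */
	int attempts = 0;
	bool done = false;

	while (attempts++ < MAX_RETRIES && !(done = try_reap()))
		nanosleep(&delay, NULL);

	if (done)
		printf("reaped after %d attempt(s)\n", attempts);
	else
		printf("giving up after %d attempts; skip this mm\n", attempts - 1);
	return 0;
}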
@@ -607,8 +586,6 @@ static void oom_reap_task(struct task_struct *tsk)
607 586
608static int oom_reaper(void *unused) 587static int oom_reaper(void *unused)
609{ 588{
610 set_freezable();
611
612 while (true) { 589 while (true) {
613 struct task_struct *tsk = NULL; 590 struct task_struct *tsk = NULL;
614 591
@@ -627,7 +604,7 @@ static int oom_reaper(void *unused)
627 return 0; 604 return 0;
628} 605}
629 606
630void wake_oom_reaper(struct task_struct *tsk) 607static void wake_oom_reaper(struct task_struct *tsk)
631{ 608{
632 if (!oom_reaper_th) 609 if (!oom_reaper_th)
633 return; 610 return;
@@ -656,7 +633,11 @@ static int __init oom_init(void)
656 return 0; 633 return 0;
657} 634}
658subsys_initcall(oom_init) 635subsys_initcall(oom_init)
659#endif 636#else
637static inline void wake_oom_reaper(struct task_struct *tsk)
638{
639}
640#endif /* CONFIG_MMU */
660 641
661/** 642/**
662 * mark_oom_victim - mark the given task as OOM victim 643 * mark_oom_victim - mark the given task as OOM victim
@@ -664,14 +645,23 @@ subsys_initcall(oom_init)
664 * 645 *
665 * Has to be called with oom_lock held and never after 646 * Has to be called with oom_lock held and never after
666 * oom has been disabled already. 647 * oom has been disabled already.
648 *
 649 * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
 650 * (either by holding task_lock or by operating on current).
667 */ 651 */
668void mark_oom_victim(struct task_struct *tsk) 652static void mark_oom_victim(struct task_struct *tsk)
669{ 653{
654 struct mm_struct *mm = tsk->mm;
655
670 WARN_ON(oom_killer_disabled); 656 WARN_ON(oom_killer_disabled);
671 /* OOM killer might race with memcg OOM */ 657 /* OOM killer might race with memcg OOM */
672 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) 658 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
673 return; 659 return;
674 atomic_inc(&tsk->signal->oom_victims); 660
661 /* oom_mm is bound to the signal struct life time. */
662 if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
663 atomic_inc(&tsk->signal->oom_mm->mm_count);
664
675 /* 665 /*
676 * Make sure that the task is woken up from uninterruptible sleep 666 * Make sure that the task is woken up from uninterruptible sleep
677 * if it is frozen because OOM killer wouldn't be able to free 667 * if it is frozen because OOM killer wouldn't be able to free
@@ -685,21 +675,29 @@ void mark_oom_victim(struct task_struct *tsk)
685/** 675/**
686 * exit_oom_victim - note the exit of an OOM victim 676 * exit_oom_victim - note the exit of an OOM victim
687 */ 677 */
688void exit_oom_victim(struct task_struct *tsk) 678void exit_oom_victim(void)
689{ 679{
690 if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) 680 clear_thread_flag(TIF_MEMDIE);
691 return;
692 atomic_dec(&tsk->signal->oom_victims);
693 681
694 if (!atomic_dec_return(&oom_victims)) 682 if (!atomic_dec_return(&oom_victims))
695 wake_up_all(&oom_victims_wait); 683 wake_up_all(&oom_victims_wait);
696} 684}
697 685
698/** 686/**
687 * oom_killer_enable - enable OOM killer
688 */
689void oom_killer_enable(void)
690{
691 oom_killer_disabled = false;
692}
693
694/**
699 * oom_killer_disable - disable OOM killer 695 * oom_killer_disable - disable OOM killer
696 * @timeout: maximum timeout to wait for oom victims in jiffies
700 * 697 *
701 * Forces all page allocations to fail rather than trigger OOM killer. 698 * Forces all page allocations to fail rather than trigger OOM killer.
702 * Will block and wait until all OOM victims are killed. 699 * Will block and wait until all OOM victims are killed or the given
700 * timeout expires.
703 * 701 *
704 * The function cannot be called when there are runnable user tasks because 702 * The function cannot be called when there are runnable user tasks because
705 * the userspace would see unexpected allocation failures as a result. Any 703 * the userspace would see unexpected allocation failures as a result. Any
@@ -708,8 +706,10 @@ void exit_oom_victim(struct task_struct *tsk)
708 * Returns true if successful and false if the OOM killer cannot be 706 * Returns true if successful and false if the OOM killer cannot be
709 * disabled. 707 * disabled.
710 */ 708 */
711bool oom_killer_disable(void) 709bool oom_killer_disable(signed long timeout)
712{ 710{
711 signed long ret;
712
713 /* 713 /*
714 * Make sure to not race with an ongoing OOM killer. Check that the 714 * Make sure to not race with an ongoing OOM killer. Check that the
715 * current is not killed (possibly due to sharing the victim's memory). 715 * current is not killed (possibly due to sharing the victim's memory).
@@ -719,19 +719,16 @@ bool oom_killer_disable(void)
719 oom_killer_disabled = true; 719 oom_killer_disabled = true;
720 mutex_unlock(&oom_lock); 720 mutex_unlock(&oom_lock);
721 721
722 wait_event(oom_victims_wait, !atomic_read(&oom_victims)); 722 ret = wait_event_interruptible_timeout(oom_victims_wait,
723 !atomic_read(&oom_victims), timeout);
724 if (ret <= 0) {
725 oom_killer_enable();
726 return false;
727 }
723 728
724 return true; 729 return true;
725} 730}
726 731
727/**
728 * oom_killer_enable - enable OOM killer
729 */
730void oom_killer_enable(void)
731{
732 oom_killer_disabled = false;
733}
734
735static inline bool __task_will_free_mem(struct task_struct *task) 732static inline bool __task_will_free_mem(struct task_struct *task)
736{ 733{
737 struct signal_struct *sig = task->signal; 734 struct signal_struct *sig = task->signal;
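oom_killer_disable() above now waits with a timeout and re-enables the killer when the victims do not exit in time. Below is a userspace approximation of that "wait for a counter to reach zero or give up after a timeout" pattern using a condition variable; the names and the 2-second timeout are invented for the sketch. Build with -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int victims = 1;			/* models atomic_read(&oom_victims) */
static bool disabled;			/* models oom_killer_disabled */

/* Models oom_killer_disable(timeout): wait for victims == 0, or give up
 * and re-enable if the timeout expires first. */
static bool killer_disable(int timeout_sec)
{
	struct timespec deadline;
	bool ok = true;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_sec;

	pthread_mutex_lock(&lock);
	disabled = true;
	while (victims > 0) {
		if (pthread_cond_timedwait(&cond, &lock, &deadline)) {
			disabled = false;	/* timed out: re-enable */
			ok = false;
			break;
		}
	}
	pthread_mutex_unlock(&lock);
	return ok;
}

static void *victim_exits(void *arg)
{
	struct timespec d = { 0, 200 * 1000 * 1000 };	/* exit takes a while */

	(void)arg;
	nanosleep(&d, NULL);
	pthread_mutex_lock(&lock);
	victims--;				/* models exit_oom_victim() */
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, victim_exits, NULL);
	printf("disable %s\n", killer_disable(2) ? "succeeded" : "timed out");
	pthread_join(t, NULL);
	return 0;
}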
@@ -760,7 +757,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
760 * Caller has to make sure that task->mm is stable (hold task_lock or 757 * Caller has to make sure that task->mm is stable (hold task_lock or
761 * it operates on the current). 758 * it operates on the current).
762 */ 759 */
763bool task_will_free_mem(struct task_struct *task) 760static bool task_will_free_mem(struct task_struct *task)
764{ 761{
765 struct mm_struct *mm = task->mm; 762 struct mm_struct *mm = task->mm;
766 struct task_struct *p; 763 struct task_struct *p;
@@ -781,15 +778,16 @@ bool task_will_free_mem(struct task_struct *task)
781 * This task has already been drained by the oom reaper so there are 778 * This task has already been drained by the oom reaper so there are
782 * only small chances it will free some more 779 * only small chances it will free some more
783 */ 780 */
784 if (test_bit(MMF_OOM_REAPED, &mm->flags)) 781 if (test_bit(MMF_OOM_SKIP, &mm->flags))
785 return false; 782 return false;
786 783
787 if (atomic_read(&mm->mm_users) <= 1) 784 if (atomic_read(&mm->mm_users) <= 1)
788 return true; 785 return true;
789 786
790 /* 787 /*
 791 * This is really pessimistic but we do not have any reliable way 788 * Make sure that all tasks which share the mm with the given task
 792 * to check that external processes share with our mm 789 * are dying as well, to ensure that a) nobody pins its mm and
790 * b) the task is also reapable by the oom reaper.
793 */ 791 */
794 rcu_read_lock(); 792 rcu_read_lock();
795 for_each_process(p) { 793 for_each_process(p) {
@@ -806,14 +804,10 @@ bool task_will_free_mem(struct task_struct *task)
806 return ret; 804 return ret;
807} 805}
808 806
809/* 807static void oom_kill_process(struct oom_control *oc, const char *message)
810 * Must be called while holding a reference to p, which will be released upon
811 * returning.
812 */
813void oom_kill_process(struct oom_control *oc, struct task_struct *p,
814 unsigned int points, unsigned long totalpages,
815 const char *message)
816{ 808{
809 struct task_struct *p = oc->chosen;
810 unsigned int points = oc->chosen_points;
817 struct task_struct *victim = p; 811 struct task_struct *victim = p;
818 struct task_struct *child; 812 struct task_struct *child;
819 struct task_struct *t; 813 struct task_struct *t;
@@ -860,7 +854,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
860 * oom_badness() returns 0 if the thread is unkillable 854 * oom_badness() returns 0 if the thread is unkillable
861 */ 855 */
862 child_points = oom_badness(child, 856 child_points = oom_badness(child,
863 oc->memcg, oc->nodemask, totalpages); 857 oc->memcg, oc->nodemask, oc->totalpages);
864 if (child_points > victim_points) { 858 if (child_points > victim_points) {
865 put_task_struct(victim); 859 put_task_struct(victim);
866 victim = child; 860 victim = child;
@@ -913,20 +907,20 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
913 continue; 907 continue;
914 if (same_thread_group(p, victim)) 908 if (same_thread_group(p, victim))
915 continue; 909 continue;
916 if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { 910 if (is_global_init(p)) {
917 /*
918 * We cannot use oom_reaper for the mm shared by this
919 * process because it wouldn't get killed and so the
920 * memory might be still used. Hide the mm from the oom
921 * killer to guarantee OOM forward progress.
922 */
923 can_oom_reap = false; 911 can_oom_reap = false;
924 set_bit(MMF_OOM_REAPED, &mm->flags); 912 set_bit(MMF_OOM_SKIP, &mm->flags);
925 pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", 913 pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
926 task_pid_nr(victim), victim->comm, 914 task_pid_nr(victim), victim->comm,
927 task_pid_nr(p), p->comm); 915 task_pid_nr(p), p->comm);
928 continue; 916 continue;
929 } 917 }
918 /*
919 * No use_mm() user needs to read from the userspace so we are
920 * ok to reap it.
921 */
922 if (unlikely(p->flags & PF_KTHREAD))
923 continue;
930 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 924 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
931 } 925 }
932 rcu_read_unlock(); 926 rcu_read_unlock();
@@ -942,7 +936,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
942/* 936/*
943 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 937 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
944 */ 938 */
945void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) 939static void check_panic_on_oom(struct oom_control *oc,
940 enum oom_constraint constraint)
946{ 941{
947 if (likely(!sysctl_panic_on_oom)) 942 if (likely(!sysctl_panic_on_oom))
948 return; 943 return;
@@ -988,19 +983,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
988 */ 983 */
989bool out_of_memory(struct oom_control *oc) 984bool out_of_memory(struct oom_control *oc)
990{ 985{
991 struct task_struct *p;
992 unsigned long totalpages;
993 unsigned long freed = 0; 986 unsigned long freed = 0;
994 unsigned int uninitialized_var(points);
995 enum oom_constraint constraint = CONSTRAINT_NONE; 987 enum oom_constraint constraint = CONSTRAINT_NONE;
996 988
997 if (oom_killer_disabled) 989 if (oom_killer_disabled)
998 return false; 990 return false;
999 991
1000 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 992 if (!is_memcg_oom(oc)) {
1001 if (freed > 0) 993 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
1002 /* Got some memory back in the last second. */ 994 if (freed > 0)
1003 return true; 995 /* Got some memory back in the last second. */
996 return true;
997 }
1004 998
1005 /* 999 /*
1006 * If current has a pending SIGKILL or is exiting, then automatically 1000 * If current has a pending SIGKILL or is exiting, then automatically
@@ -1024,37 +1018,38 @@ bool out_of_memory(struct oom_control *oc)
1024 1018
1025 /* 1019 /*
1026 * Check if there were limitations on the allocation (only relevant for 1020 * Check if there were limitations on the allocation (only relevant for
1027 * NUMA) that may require different handling. 1021 * NUMA and memcg) that may require different handling.
1028 */ 1022 */
1029 constraint = constrained_alloc(oc, &totalpages); 1023 constraint = constrained_alloc(oc);
1030 if (constraint != CONSTRAINT_MEMORY_POLICY) 1024 if (constraint != CONSTRAINT_MEMORY_POLICY)
1031 oc->nodemask = NULL; 1025 oc->nodemask = NULL;
1032 check_panic_on_oom(oc, constraint); 1026 check_panic_on_oom(oc, constraint);
1033 1027
1034 if (sysctl_oom_kill_allocating_task && current->mm && 1028 if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1035 !oom_unkillable_task(current, NULL, oc->nodemask) && 1029 current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
1036 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 1030 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
1037 get_task_struct(current); 1031 get_task_struct(current);
1038 oom_kill_process(oc, current, 0, totalpages, 1032 oc->chosen = current;
1039 "Out of memory (oom_kill_allocating_task)"); 1033 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
1040 return true; 1034 return true;
1041 } 1035 }
1042 1036
1043 p = select_bad_process(oc, &points, totalpages); 1037 select_bad_process(oc);
1044 /* Found nothing?!?! Either we hang forever, or we panic. */ 1038 /* Found nothing?!?! Either we hang forever, or we panic. */
1045 if (!p && !is_sysrq_oom(oc)) { 1039 if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
1046 dump_header(oc, NULL); 1040 dump_header(oc, NULL);
1047 panic("Out of memory and no killable processes...\n"); 1041 panic("Out of memory and no killable processes...\n");
1048 } 1042 }
1049 if (p && p != (void *)-1UL) { 1043 if (oc->chosen && oc->chosen != (void *)-1UL) {
1050 oom_kill_process(oc, p, points, totalpages, "Out of memory"); 1044 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
1045 "Memory cgroup out of memory");
1051 /* 1046 /*
1052 * Give the killed process a good chance to exit before trying 1047 * Give the killed process a good chance to exit before trying
1053 * to allocate memory again. 1048 * to allocate memory again.
1054 */ 1049 */
1055 schedule_timeout_killable(1); 1050 schedule_timeout_killable(1);
1056 } 1051 }
1057 return true; 1052 return !!oc->chosen;
1058} 1053}
1059 1054
1060/* 1055/*
@@ -1077,16 +1072,6 @@ void pagefault_out_of_memory(void)
1077 1072
1078 if (!mutex_trylock(&oom_lock)) 1073 if (!mutex_trylock(&oom_lock))
1079 return; 1074 return;
1080 1075 out_of_memory(&oc);
1081 if (!out_of_memory(&oc)) {
1082 /*
1083 * There shouldn't be any user tasks runnable while the
1084 * OOM killer is disabled, so the current task has to
1085 * be a racing OOM victim for which oom_killer_disable()
1086 * is waiting for.
1087 */
1088 WARN_ON(test_thread_flag(TIF_MEMDIE));
1089 }
1090
1091 mutex_unlock(&oom_lock); 1076 mutex_unlock(&oom_lock);
1092} 1077}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 28d6f36a2d79..439cc63ad903 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
1965 return false; 1965 return false;
1966} 1966}
1967 1967
1968void throttle_vm_writeout(gfp_t gfp_mask)
1969{
1970 unsigned long background_thresh;
1971 unsigned long dirty_thresh;
1972
1973 for ( ; ; ) {
1974 global_dirty_limits(&background_thresh, &dirty_thresh);
1975 dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
1976
1977 /*
1978 * Boost the allowable dirty threshold a bit for page
1979 * allocators so they don't get DoS'ed by heavy writers
1980 */
1981 dirty_thresh += dirty_thresh / 10; /* wheeee... */
1982
1983 if (global_node_page_state(NR_UNSTABLE_NFS) +
1984 global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
1985 break;
1986 congestion_wait(BLK_RW_ASYNC, HZ/10);
1987
1988 /*
1989 * The caller might hold locks which can prevent IO completion
1990 * or progress in the filesystem. So we cannot just sit here
1991 * waiting for IO to complete.
1992 */
1993 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
1994 break;
1995 }
1996}
1997
1998/* 1968/*
1999 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 1969 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
2000 */ 1970 */
@@ -2746,7 +2716,7 @@ int test_clear_page_writeback(struct page *page)
2746 int ret; 2716 int ret;
2747 2717
2748 lock_page_memcg(page); 2718 lock_page_memcg(page);
2749 if (mapping) { 2719 if (mapping && mapping_use_writeback_tags(mapping)) {
2750 struct inode *inode = mapping->host; 2720 struct inode *inode = mapping->host;
2751 struct backing_dev_info *bdi = inode_to_bdi(inode); 2721 struct backing_dev_info *bdi = inode_to_bdi(inode);
2752 unsigned long flags; 2722 unsigned long flags;
@@ -2789,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2789 int ret; 2759 int ret;
2790 2760
2791 lock_page_memcg(page); 2761 lock_page_memcg(page);
2792 if (mapping) { 2762 if (mapping && mapping_use_writeback_tags(mapping)) {
2793 struct inode *inode = mapping->host; 2763 struct inode *inode = mapping->host;
2794 struct backing_dev_info *bdi = inode_to_bdi(inode); 2764 struct backing_dev_info *bdi = inode_to_bdi(inode);
2795 unsigned long flags; 2765 unsigned long flags;
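
The two writeback hunks above gate the bdi/inode accounting on mapping_use_writeback_tags(). That helper is not shown in this diff; a plausible reading, given the AS_NO_WRITEBACK_TAGS flag the swap cache sets further down in this series, is a simple bit test on mapping->flags. The sketch below is that assumption made concrete, with the bit position and types made up.

/* Sketch only: how a mapping could opt out of writeback tags via a flag bit. */
#include <stdbool.h>
#include <stdio.h>

enum { AS_NO_WRITEBACK_TAGS = 5 };      /* bit position is an assumption */

struct address_space { unsigned long flags; };

static bool mapping_use_writeback_tags(const struct address_space *m)
{
        /* assumed implementation: tags are used unless the opt-out bit is set */
        return !(m->flags & (1UL << AS_NO_WRITEBACK_TAGS));
}

int main(void)
{
        struct address_space file = { .flags = 0 };
        struct address_space swap = { .flags = 1UL << AS_NO_WRITEBACK_TAGS };

        printf("file mapping uses tags: %d\n", mapping_use_writeback_tags(&file));
        printf("swap mapping uses tags: %d\n", mapping_use_writeback_tags(&swap));
        return 0;
}
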
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a2214c64ed3c..ca423cc20b59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -607,6 +607,9 @@ static bool need_debug_guardpage(void)
607 if (!debug_pagealloc_enabled()) 607 if (!debug_pagealloc_enabled())
608 return false; 608 return false;
609 609
610 if (!debug_guardpage_minorder())
611 return false;
612
610 return true; 613 return true;
611} 614}
612 615
@@ -615,6 +618,9 @@ static void init_debug_guardpage(void)
615 if (!debug_pagealloc_enabled()) 618 if (!debug_pagealloc_enabled())
616 return; 619 return;
617 620
621 if (!debug_guardpage_minorder())
622 return;
623
618 _debug_guardpage_enabled = true; 624 _debug_guardpage_enabled = true;
619} 625}
620 626
@@ -635,19 +641,22 @@ static int __init debug_guardpage_minorder_setup(char *buf)
635 pr_info("Setting debug_guardpage_minorder to %lu\n", res); 641 pr_info("Setting debug_guardpage_minorder to %lu\n", res);
636 return 0; 642 return 0;
637} 643}
638__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 644early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
639 645
640static inline void set_page_guard(struct zone *zone, struct page *page, 646static inline bool set_page_guard(struct zone *zone, struct page *page,
641 unsigned int order, int migratetype) 647 unsigned int order, int migratetype)
642{ 648{
643 struct page_ext *page_ext; 649 struct page_ext *page_ext;
644 650
645 if (!debug_guardpage_enabled()) 651 if (!debug_guardpage_enabled())
646 return; 652 return false;
653
654 if (order >= debug_guardpage_minorder())
655 return false;
647 656
648 page_ext = lookup_page_ext(page); 657 page_ext = lookup_page_ext(page);
649 if (unlikely(!page_ext)) 658 if (unlikely(!page_ext))
650 return; 659 return false;
651 660
652 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 661 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
653 662
@@ -655,6 +664,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
655 set_page_private(page, order); 664 set_page_private(page, order);
656 /* Guard pages are not available for any usage */ 665 /* Guard pages are not available for any usage */
657 __mod_zone_freepage_state(zone, -(1 << order), migratetype); 666 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
667
668 return true;
658} 669}
659 670
660static inline void clear_page_guard(struct zone *zone, struct page *page, 671static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -676,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
676 __mod_zone_freepage_state(zone, (1 << order), migratetype); 687 __mod_zone_freepage_state(zone, (1 << order), migratetype);
677} 688}
678#else 689#else
679struct page_ext_operations debug_guardpage_ops = { NULL, }; 690struct page_ext_operations debug_guardpage_ops;
680static inline void set_page_guard(struct zone *zone, struct page *page, 691static inline bool set_page_guard(struct zone *zone, struct page *page,
681 unsigned int order, int migratetype) {} 692 unsigned int order, int migratetype) { return false; }
682static inline void clear_page_guard(struct zone *zone, struct page *page, 693static inline void clear_page_guard(struct zone *zone, struct page *page,
683 unsigned int order, int migratetype) {} 694 unsigned int order, int migratetype) {}
684#endif 695#endif
@@ -1393,15 +1404,18 @@ static void __init deferred_free_range(struct page *page,
1393 return; 1404 return;
1394 1405
1395 /* Free a large naturally-aligned chunk if possible */ 1406 /* Free a large naturally-aligned chunk if possible */
1396 if (nr_pages == MAX_ORDER_NR_PAGES && 1407 if (nr_pages == pageblock_nr_pages &&
1397 (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { 1408 (pfn & (pageblock_nr_pages - 1)) == 0) {
1398 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1409 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1399 __free_pages_boot_core(page, MAX_ORDER-1); 1410 __free_pages_boot_core(page, pageblock_order);
1400 return; 1411 return;
1401 } 1412 }
1402 1413
1403 for (i = 0; i < nr_pages; i++, page++) 1414 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1415 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1416 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1404 __free_pages_boot_core(page, 0); 1417 __free_pages_boot_core(page, 0);
1418 }
1405} 1419}
1406 1420
1407/* Completion tracking for deferred_init_memmap() threads */ 1421/* Completion tracking for deferred_init_memmap() threads */
@@ -1469,9 +1483,9 @@ static int __init deferred_init_memmap(void *data)
1469 1483
1470 /* 1484 /*
1471 * Ensure pfn_valid is checked every 1485 * Ensure pfn_valid is checked every
1472 * MAX_ORDER_NR_PAGES for memory holes 1486 * pageblock_nr_pages for memory holes
1473 */ 1487 */
1474 if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { 1488 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1475 if (!pfn_valid(pfn)) { 1489 if (!pfn_valid(pfn)) {
1476 page = NULL; 1490 page = NULL;
1477 goto free_range; 1491 goto free_range;
@@ -1484,7 +1498,7 @@ static int __init deferred_init_memmap(void *data)
1484 } 1498 }
1485 1499
1486 /* Minimise pfn page lookups and scheduler checks */ 1500 /* Minimise pfn page lookups and scheduler checks */
1487 if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { 1501 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1488 page++; 1502 page++;
1489 } else { 1503 } else {
1490 nr_pages += nr_to_free; 1504 nr_pages += nr_to_free;
@@ -1520,6 +1534,9 @@ free_range:
1520 free_base_page = NULL; 1534 free_base_page = NULL;
1521 free_base_pfn = nr_to_free = 0; 1535 free_base_pfn = nr_to_free = 0;
1522 } 1536 }
1537 /* Free the last block of pages to allocator */
1538 nr_pages += nr_to_free;
1539 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1523 1540
1524 first_init_pfn = max(end_pfn, first_init_pfn); 1541 first_init_pfn = max(end_pfn, first_init_pfn);
1525 } 1542 }
@@ -1616,18 +1633,15 @@ static inline void expand(struct zone *zone, struct page *page,
1616 size >>= 1; 1633 size >>= 1;
1617 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 1634 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1618 1635
1619 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && 1636 /*
1620 debug_guardpage_enabled() && 1637 * Mark as guard pages (or page), that will allow to
1621 high < debug_guardpage_minorder()) { 1638 * merge back to allocator when buddy will be freed.
1622 /* 1639 * Corresponding page table entries will not be touched,
1623 * Mark as guard pages (or page), that will allow to 1640 * pages will stay not present in virtual address space
1624 * merge back to allocator when buddy will be freed. 1641 */
1625 * Corresponding page table entries will not be touched, 1642 if (set_page_guard(zone, &page[size], high, migratetype))
1626 * pages will stay not present in virtual address space
1627 */
1628 set_page_guard(zone, &page[size], high, migratetype);
1629 continue; 1643 continue;
1630 } 1644
1631 list_add(&page[size].lru, &area->free_list[migratetype]); 1645 list_add(&page[size].lru, &area->free_list[migratetype]);
1632 area->nr_free++; 1646 area->nr_free++;
1633 set_page_order(&page[size], high); 1647 set_page_order(&page[size], high);
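
Taken together, the guard-page hunks move the enablement and minorder checks into set_page_guard() and let it report through a bool whether the block really became a guard page, so the caller in expand() collapses to a single conditional. A minimal standalone sketch of that shape, with the page/zone machinery stubbed out by made-up globals:

/* Sketch of a helper that reports via its return value whether it acted. */
#include <stdbool.h>
#include <stdio.h>

static bool guard_enabled = true;
static unsigned int guard_minorder = 2; /* illustrative threshold */

/* Returns true only when the block was really turned into a guard page. */
static bool set_page_guard(unsigned int order)
{
        if (!guard_enabled)
                return false;
        if (order >= guard_minorder)
                return false;
        printf("order-%u block marked as guard page\n", order);
        return true;
}

static void expand(unsigned int high)
{
        while (high--) {
                if (set_page_guard(high))
                        continue;       /* guard pages never hit the free list */
                printf("order-%u block added to free list\n", high);
        }
}

int main(void)
{
        expand(4);
        return 0;
}
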
@@ -2489,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order)
2489 mt = get_pageblock_migratetype(page); 2503 mt = get_pageblock_migratetype(page);
2490 2504
2491 if (!is_migrate_isolate(mt)) { 2505 if (!is_migrate_isolate(mt)) {
2492 /* Obey watermarks as if the page was being allocated */ 2506 /*
2493 watermark = low_wmark_pages(zone) + (1 << order); 2507 * Obey watermarks as if the page was being allocated. We can
2494 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 2508 * emulate a high-order watermark check with a raised order-0
2509 * watermark, because we already know our high-order page
2510 * exists.
2511 */
2512 watermark = min_wmark_pages(zone) + (1UL << order);
2513 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2495 return 0; 2514 return 0;
2496 2515
2497 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2516 __mod_zone_freepage_state(zone, -(1UL << order), mt);
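
The new comment in __isolate_free_page() is worth unpacking: because the high-order page is already known to exist, demanding "min watermark plus 2^order pages" at order 0 is as strong as a true order-N watermark check, and the check now also passes ALLOC_CMA. The toy calculation below shows the raised threshold; all numbers are made up and the real zone_watermark_ok() also considers lowmem reserves and migratetypes.

/* Toy numbers for the raised order-0 watermark used by __isolate_free_page(). */
#include <stdbool.h>
#include <stdio.h>

static bool zone_watermark_ok(unsigned long free_pages, unsigned long mark)
{
        return free_pages > mark;       /* heavily simplified */
}

int main(void)
{
        unsigned long min_wmark = 1024;  /* assumed zone min watermark, in pages */
        unsigned long free_pages = 1500; /* assumed NR_FREE_PAGES snapshot */
        unsigned int order = 9;          /* isolating a 2MB block on 4K pages */

        /* "min watermark plus the whole block" stands in for a high-order check */
        unsigned long watermark = min_wmark + (1UL << order);

        printf("need %lu free pages, have %lu -> %s\n", watermark, free_pages,
               zone_watermark_ok(free_pages, watermark) ? "isolate" : "refuse");
        return 0;
}
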
@@ -2960,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
2960 DEFAULT_RATELIMIT_INTERVAL, 2979 DEFAULT_RATELIMIT_INTERVAL,
2961 DEFAULT_RATELIMIT_BURST); 2980 DEFAULT_RATELIMIT_BURST);
2962 2981
2963void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) 2982void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
2964{ 2983{
2965 unsigned int filter = SHOW_MEM_FILTER_NODES; 2984 unsigned int filter = SHOW_MEM_FILTER_NODES;
2985 struct va_format vaf;
2986 va_list args;
2966 2987
2967 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2988 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2968 debug_guardpage_minorder() > 0) 2989 debug_guardpage_minorder() > 0)
@@ -2980,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
2980 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3001 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
2981 filter &= ~SHOW_MEM_FILTER_NODES; 3002 filter &= ~SHOW_MEM_FILTER_NODES;
2982 3003
2983 if (fmt) { 3004 pr_warn("%s: ", current->comm);
2984 struct va_format vaf;
2985 va_list args;
2986 3005
2987 va_start(args, fmt); 3006 va_start(args, fmt);
3007 vaf.fmt = fmt;
3008 vaf.va = &args;
3009 pr_cont("%pV", &vaf);
3010 va_end(args);
2988 3011
2989 vaf.fmt = fmt; 3012 pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
2990 vaf.va = &args;
2991 3013
2992 pr_warn("%pV", &vaf);
2993
2994 va_end(args);
2995 }
2996
2997 pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
2998 current->comm, order, gfp_mask, &gfp_mask);
2999 dump_stack(); 3014 dump_stack();
3000 if (!should_suppress_show_mem()) 3015 if (!should_suppress_show_mem())
3001 show_mem(filter); 3016 show_mem(filter);
@@ -3137,6 +3152,65 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3137 return NULL; 3152 return NULL;
3138} 3153}
3139 3154
3155static inline bool
3156should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3157 enum compact_result compact_result,
3158 enum compact_priority *compact_priority,
3159 int *compaction_retries)
3160{
3161 int max_retries = MAX_COMPACT_RETRIES;
3162 int min_priority;
3163
3164 if (!order)
3165 return false;
3166
3167 if (compaction_made_progress(compact_result))
3168 (*compaction_retries)++;
3169
3170 /*
3171 * compaction considers all the zone as desperately out of memory
3172 * so it doesn't really make much sense to retry except when the
3173 * failure could be caused by insufficient priority
3174 */
3175 if (compaction_failed(compact_result))
3176 goto check_priority;
3177
3178 /*
3179 * make sure the compaction wasn't deferred or didn't bail out early
3180 * due to locks contention before we declare that we should give up.
3181 * But do not retry if the given zonelist is not suitable for
3182 * compaction.
3183 */
3184 if (compaction_withdrawn(compact_result))
3185 return compaction_zonelist_suitable(ac, order, alloc_flags);
3186
3187 /*
3188 * !costly requests are much more important than __GFP_REPEAT
3189 * costly ones because they are de facto nofail and invoke OOM
3190 * killer to move on while costly can fail and users are ready
3191 * to cope with that. 1/4 retries is rather arbitrary but we
3192 * would need much more detailed feedback from compaction to
3193 * make a better decision.
3194 */
3195 if (order > PAGE_ALLOC_COSTLY_ORDER)
3196 max_retries /= 4;
3197 if (*compaction_retries <= max_retries)
3198 return true;
3199
3200 /*
3201 * Make sure there are attempts at the highest priority if we exhausted
3202 * all retries or failed at the lower priorities.
3203 */
3204check_priority:
3205 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3206 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3207 if (*compact_priority > min_priority) {
3208 (*compact_priority)--;
3209 *compaction_retries = 0;
3210 return true;
3211 }
3212 return false;
3213}
3140#else 3214#else
3141static inline struct page * 3215static inline struct page *
3142__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3216__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
@@ -3147,13 +3221,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3147 return NULL; 3221 return NULL;
3148} 3222}
3149 3223
3150#endif /* CONFIG_COMPACTION */
3151
3152static inline bool 3224static inline bool
3153should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3225should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3154 enum compact_result compact_result, 3226 enum compact_result compact_result,
3155 enum compact_priority *compact_priority, 3227 enum compact_priority *compact_priority,
3156 int compaction_retries) 3228 int *compaction_retries)
3157{ 3229{
3158 struct zone *zone; 3230 struct zone *zone;
3159 struct zoneref *z; 3231 struct zoneref *z;
@@ -3175,6 +3247,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
3175 } 3247 }
3176 return false; 3248 return false;
3177} 3249}
3250#endif /* CONFIG_COMPACTION */
3178 3251
3179/* Perform direct synchronous page reclaim */ 3252/* Perform direct synchronous page reclaim */
3180static int 3253static int
@@ -3325,16 +3398,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3325static inline bool 3398static inline bool
3326should_reclaim_retry(gfp_t gfp_mask, unsigned order, 3399should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3327 struct alloc_context *ac, int alloc_flags, 3400 struct alloc_context *ac, int alloc_flags,
3328 bool did_some_progress, int no_progress_loops) 3401 bool did_some_progress, int *no_progress_loops)
3329{ 3402{
3330 struct zone *zone; 3403 struct zone *zone;
3331 struct zoneref *z; 3404 struct zoneref *z;
3332 3405
3333 /* 3406 /*
3407 * Costly allocations might have made a progress but this doesn't mean
3408 * their order will become available due to high fragmentation so
3409 * always increment the no progress counter for them
3410 */
3411 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3412 *no_progress_loops = 0;
3413 else
3414 (*no_progress_loops)++;
3415
3416 /*
3334 * Make sure we converge to OOM if we cannot make any progress 3417 * Make sure we converge to OOM if we cannot make any progress
3335 * several times in the row. 3418 * several times in the row.
3336 */ 3419 */
3337 if (no_progress_loops > MAX_RECLAIM_RETRIES) 3420 if (*no_progress_loops > MAX_RECLAIM_RETRIES)
3338 return false; 3421 return false;
3339 3422
3340 /* 3423 /*
@@ -3349,7 +3432,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3349 unsigned long reclaimable; 3432 unsigned long reclaimable;
3350 3433
3351 available = reclaimable = zone_reclaimable_pages(zone); 3434 available = reclaimable = zone_reclaimable_pages(zone);
3352 available -= DIV_ROUND_UP(no_progress_loops * available, 3435 available -= DIV_ROUND_UP((*no_progress_loops) * available,
3353 MAX_RECLAIM_RETRIES); 3436 MAX_RECLAIM_RETRIES);
3354 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 3437 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3355 3438
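
should_reclaim_retry() now owns the no-progress bookkeeping (hence the int pointer) and keeps discounting the reclaimable estimate by no_progress_loops/MAX_RECLAIM_RETRIES, so repeated no-progress rounds converge to giving up and invoking the OOM killer. The decay is easiest to see with the formula pulled out into a standalone sketch; the MAX_RECLAIM_RETRIES value and the page counts below are assumptions.

/* How the reclaimable estimate decays as no-progress loops accumulate. */
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16          /* assumed to match the kernel's constant */
#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long reclaimable = 10000; /* made-up zone_reclaimable_pages() value */
        unsigned long free_pages = 200;    /* made-up NR_FREE_PAGES snapshot */

        for (int loops = 0; loops <= MAX_RECLAIM_RETRIES; loops++) {
                unsigned long available = reclaimable;

                /* the more loops without progress, the less of "reclaimable" we trust */
                available -= DIV_ROUND_UP((unsigned long)loops * available,
                                          MAX_RECLAIM_RETRIES);
                available += free_pages;
                printf("loop %2d: assume %5lu pages available\n", loops, available);
        }
        return 0;
}
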
@@ -3410,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3410 enum compact_result compact_result; 3493 enum compact_result compact_result;
3411 int compaction_retries = 0; 3494 int compaction_retries = 0;
3412 int no_progress_loops = 0; 3495 int no_progress_loops = 0;
3496 unsigned long alloc_start = jiffies;
3497 unsigned int stall_timeout = 10 * HZ;
3413 3498
3414 /* 3499 /*
3415 * In the slowpath, we sanity check order to avoid ever trying to 3500 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3554,9 +3639,6 @@ retry:
3554 if (page) 3639 if (page)
3555 goto got_pg; 3640 goto got_pg;
3556 3641
3557 if (order && compaction_made_progress(compact_result))
3558 compaction_retries++;
3559
3560 /* Do not loop if specifically requested */ 3642 /* Do not loop if specifically requested */
3561 if (gfp_mask & __GFP_NORETRY) 3643 if (gfp_mask & __GFP_NORETRY)
3562 goto nopage; 3644 goto nopage;
@@ -3568,18 +3650,16 @@ retry:
3568 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) 3650 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
3569 goto nopage; 3651 goto nopage;
3570 3652
3571 /* 3653 /* Make sure we know about allocations which stall for too long */
3572 * Costly allocations might have made a progress but this doesn't mean 3654 if (time_after(jiffies, alloc_start + stall_timeout)) {
3573 * their order will become available due to high fragmentation so 3655 warn_alloc(gfp_mask,
3574 * always increment the no progress counter for them 3656 "page allocation stalls for %ums, order:%u\n",
3575 */ 3657 jiffies_to_msecs(jiffies-alloc_start), order);
3576 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 3658 stall_timeout += 10 * HZ;
3577 no_progress_loops = 0; 3659 }
3578 else
3579 no_progress_loops++;
3580 3660
3581 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 3661 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
3582 did_some_progress > 0, no_progress_loops)) 3662 did_some_progress > 0, &no_progress_loops))
3583 goto retry; 3663 goto retry;
3584 3664
3585 /* 3665 /*
@@ -3591,7 +3671,7 @@ retry:
3591 if (did_some_progress > 0 && 3671 if (did_some_progress > 0 &&
3592 should_compact_retry(ac, order, alloc_flags, 3672 should_compact_retry(ac, order, alloc_flags,
3593 compact_result, &compact_priority, 3673 compact_result, &compact_priority,
3594 compaction_retries)) 3674 &compaction_retries))
3595 goto retry; 3675 goto retry;
3596 3676
3597 /* Reclaim has failed us, start killing things */ 3677 /* Reclaim has failed us, start killing things */
@@ -3606,7 +3686,8 @@ retry:
3606 } 3686 }
3607 3687
3608nopage: 3688nopage:
3609 warn_alloc_failed(gfp_mask, order, NULL); 3689 warn_alloc(gfp_mask,
3690 "page allocation failure: order:%u", order);
3610got_pg: 3691got_pg:
3611 return page; 3692 return page;
3612} 3693}
@@ -4555,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
4555 int j; 4636 int j;
4556 struct zonelist *zonelist; 4637 struct zonelist *zonelist;
4557 4638
4558 zonelist = &pgdat->node_zonelists[0]; 4639 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4559 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 4640 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
4560 ; 4641 ;
4561 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 4642 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
@@ -4571,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
4571 int j; 4652 int j;
4572 struct zonelist *zonelist; 4653 struct zonelist *zonelist;
4573 4654
4574 zonelist = &pgdat->node_zonelists[1]; 4655 zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
4575 j = build_zonelists_node(pgdat, zonelist, 0); 4656 j = build_zonelists_node(pgdat, zonelist, 0);
4576 zonelist->_zonerefs[j].zone = NULL; 4657 zonelist->_zonerefs[j].zone = NULL;
4577 zonelist->_zonerefs[j].zone_idx = 0; 4658 zonelist->_zonerefs[j].zone_idx = 0;
@@ -4592,7 +4673,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
4592 struct zone *z; 4673 struct zone *z;
4593 struct zonelist *zonelist; 4674 struct zonelist *zonelist;
4594 4675
4595 zonelist = &pgdat->node_zonelists[0]; 4676 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4596 pos = 0; 4677 pos = 0;
4597 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 4678 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
4598 for (j = 0; j < nr_nodes; j++) { 4679 for (j = 0; j < nr_nodes; j++) {
@@ -4727,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat)
4727 4808
4728 local_node = pgdat->node_id; 4809 local_node = pgdat->node_id;
4729 4810
4730 zonelist = &pgdat->node_zonelists[0]; 4811 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4731 j = build_zonelists_node(pgdat, zonelist, 0); 4812 j = build_zonelists_node(pgdat, zonelist, 0);
4732 4813
4733 /* 4814 /*
@@ -5000,15 +5081,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5000 5081
5001#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5082#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5002 /* 5083 /*
5003 * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
5004 * from zone_movable_pfn[nid] to end of each node should be
5005 * ZONE_MOVABLE not ZONE_NORMAL. skip it.
5006 */
5007 if (!mirrored_kernelcore && zone_movable_pfn[nid])
5008 if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
5009 continue;
5010
5011 /*
5012 * Check given memblock attribute by firmware which can affect 5084 * Check given memblock attribute by firmware which can affect
5013 * kernel memory layout. If zone==ZONE_MOVABLE but memory is 5085 * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5014 * mirrored, it's an overlapped memmap init. skip it. 5086 * mirrored, it's an overlapped memmap init. skip it.
@@ -5451,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
5451 *zone_end_pfn = min(node_end_pfn, 5523 *zone_end_pfn = min(node_end_pfn,
5452 arch_zone_highest_possible_pfn[movable_zone]); 5524 arch_zone_highest_possible_pfn[movable_zone]);
5453 5525
5526 /* Adjust for ZONE_MOVABLE starting within this range */
5527 } else if (!mirrored_kernelcore &&
5528 *zone_start_pfn < zone_movable_pfn[nid] &&
5529 *zone_end_pfn > zone_movable_pfn[nid]) {
5530 *zone_end_pfn = zone_movable_pfn[nid];
5531
5454 /* Check if this whole range is within ZONE_MOVABLE */ 5532 /* Check if this whole range is within ZONE_MOVABLE */
5455 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5533 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
5456 *zone_start_pfn = *zone_end_pfn; 5534 *zone_start_pfn = *zone_end_pfn;
@@ -5554,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
5554 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 5632 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
5555 * and vice versa. 5633 * and vice versa.
5556 */ 5634 */
5557 if (zone_movable_pfn[nid]) { 5635 if (mirrored_kernelcore && zone_movable_pfn[nid]) {
5558 if (mirrored_kernelcore) { 5636 unsigned long start_pfn, end_pfn;
5559 unsigned long start_pfn, end_pfn; 5637 struct memblock_region *r;
5560 struct memblock_region *r; 5638
5561 5639 for_each_memblock(memory, r) {
5562 for_each_memblock(memory, r) { 5640 start_pfn = clamp(memblock_region_memory_base_pfn(r),
5563 start_pfn = clamp(memblock_region_memory_base_pfn(r), 5641 zone_start_pfn, zone_end_pfn);
5564 zone_start_pfn, zone_end_pfn); 5642 end_pfn = clamp(memblock_region_memory_end_pfn(r),
5565 end_pfn = clamp(memblock_region_memory_end_pfn(r), 5643 zone_start_pfn, zone_end_pfn);
5566 zone_start_pfn, zone_end_pfn); 5644
5567 5645 if (zone_type == ZONE_MOVABLE &&
5568 if (zone_type == ZONE_MOVABLE && 5646 memblock_is_mirror(r))
5569 memblock_is_mirror(r)) 5647 nr_absent += end_pfn - start_pfn;
5570 nr_absent += end_pfn - start_pfn; 5648
5571 5649 if (zone_type == ZONE_NORMAL &&
5572 if (zone_type == ZONE_NORMAL && 5650 !memblock_is_mirror(r))
5573 !memblock_is_mirror(r)) 5651 nr_absent += end_pfn - start_pfn;
5574 nr_absent += end_pfn - start_pfn;
5575 }
5576 } else {
5577 if (zone_type == ZONE_NORMAL)
5578 nr_absent += node_end_pfn - zone_movable_pfn[nid];
5579 } 5652 }
5580 } 5653 }
5581 5654
@@ -6929,6 +7002,17 @@ static int __init set_hashdist(char *str)
6929__setup("hashdist=", set_hashdist); 7002__setup("hashdist=", set_hashdist);
6930#endif 7003#endif
6931 7004
7005#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
7006/*
7007 * Returns the number of pages that arch has reserved but
7008 * is not known to alloc_large_system_hash().
7009 */
7010static unsigned long __init arch_reserved_kernel_pages(void)
7011{
7012 return 0;
7013}
7014#endif
7015
6932/* 7016/*
6933 * allocate a large system hash table from bootmem 7017 * allocate a large system hash table from bootmem
6934 * - it is assumed that the hash table must contain an exact power-of-2 7018 * - it is assumed that the hash table must contain an exact power-of-2
@@ -6953,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename,
6953 if (!numentries) { 7037 if (!numentries) {
6954 /* round applicable memory size up to nearest megabyte */ 7038 /* round applicable memory size up to nearest megabyte */
6955 numentries = nr_kernel_pages; 7039 numentries = nr_kernel_pages;
7040 numentries -= arch_reserved_kernel_pages();
6956 7041
6957 /* It isn't necessary when PAGE_SIZE >= 1MB */ 7042 /* It isn't necessary when PAGE_SIZE >= 1MB */
6958 if (PAGE_SHIFT < 20) 7043 if (PAGE_SHIFT < 20)
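
One more page_alloc.c change worth calling out is the stall reporting in the slowpath: the allocation start time is captured once, and each time the elapsed time crosses stall_timeout a warning is emitted via warn_alloc() and the threshold is pushed out by another 10 seconds, so a long stall logs periodically instead of flooding. Below is a standalone sketch of that escalating-threshold pattern, using a simulated millisecond clock in place of jiffies.

/* Escalating "stalled for N ms" warnings, modelled on the slowpath's jiffies check. */
#include <stdio.h>

int main(void)
{
        unsigned long stall_timeout = 10 * 1000; /* first warning after 10s */
        unsigned long alloc_start = 0;           /* pretend start timestamp, ms */

        /* simulate the retry loop polling the clock every 3 seconds */
        for (unsigned long now = alloc_start; now <= 45 * 1000; now += 3 * 1000) {
                if (now - alloc_start > stall_timeout) {
                        printf("page allocation stalls for %lums\n", now - alloc_start);
                        stall_timeout += 10 * 1000; /* next warning 10s later */
                }
        }
        return 0;
}
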
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 44a4c029c8e7..121dcffc4ec1 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -42,6 +42,11 @@
42 * and page extension core can skip to allocate memory. As result, 42 * and page extension core can skip to allocate memory. As result,
43 * none of memory is wasted. 43 * none of memory is wasted.
44 * 44 *
45 * When need callback returns true, page_ext checks if there is a request for
46 * extra memory through size in struct page_ext_operations. If it is non-zero,
47 * extra space is allocated for each page_ext entry and offset is returned to
48 * user through offset in struct page_ext_operations.
49 *
45 * The init callback is used to do proper initialization after page extension 50 * The init callback is used to do proper initialization after page extension
46 * is completely initialized. In sparse memory system, extra memory is 51 * is completely initialized. In sparse memory system, extra memory is
47 * allocated some time later than memmap is allocated. In other words, lifetime 52 * allocated some time later than memmap is allocated. In other words, lifetime
@@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = {
66}; 71};
67 72
68static unsigned long total_usage; 73static unsigned long total_usage;
74static unsigned long extra_mem;
69 75
70static bool __init invoke_need_callbacks(void) 76static bool __init invoke_need_callbacks(void)
71{ 77{
72 int i; 78 int i;
73 int entries = ARRAY_SIZE(page_ext_ops); 79 int entries = ARRAY_SIZE(page_ext_ops);
80 bool need = false;
74 81
75 for (i = 0; i < entries; i++) { 82 for (i = 0; i < entries; i++) {
76 if (page_ext_ops[i]->need && page_ext_ops[i]->need()) 83 if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
77 return true; 84 page_ext_ops[i]->offset = sizeof(struct page_ext) +
85 extra_mem;
86 extra_mem += page_ext_ops[i]->size;
87 need = true;
88 }
78 } 89 }
79 90
80 return false; 91 return need;
81} 92}
82 93
83static void __init invoke_init_callbacks(void) 94static void __init invoke_init_callbacks(void)
@@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void)
91 } 102 }
92} 103}
93 104
105static unsigned long get_entry_size(void)
106{
107 return sizeof(struct page_ext) + extra_mem;
108}
109
110static inline struct page_ext *get_entry(void *base, unsigned long index)
111{
112 return base + get_entry_size() * index;
113}
114
94#if !defined(CONFIG_SPARSEMEM) 115#if !defined(CONFIG_SPARSEMEM)
95 116
96 117
@@ -102,7 +123,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
102struct page_ext *lookup_page_ext(struct page *page) 123struct page_ext *lookup_page_ext(struct page *page)
103{ 124{
104 unsigned long pfn = page_to_pfn(page); 125 unsigned long pfn = page_to_pfn(page);
105 unsigned long offset; 126 unsigned long index;
106 struct page_ext *base; 127 struct page_ext *base;
107 128
108 base = NODE_DATA(page_to_nid(page))->node_page_ext; 129 base = NODE_DATA(page_to_nid(page))->node_page_ext;
@@ -119,9 +140,9 @@ struct page_ext *lookup_page_ext(struct page *page)
119 if (unlikely(!base)) 140 if (unlikely(!base))
120 return NULL; 141 return NULL;
121#endif 142#endif
122 offset = pfn - round_down(node_start_pfn(page_to_nid(page)), 143 index = pfn - round_down(node_start_pfn(page_to_nid(page)),
123 MAX_ORDER_NR_PAGES); 144 MAX_ORDER_NR_PAGES);
124 return base + offset; 145 return get_entry(base, index);
125} 146}
126 147
127static int __init alloc_node_page_ext(int nid) 148static int __init alloc_node_page_ext(int nid)
@@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid)
143 !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) 164 !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
144 nr_pages += MAX_ORDER_NR_PAGES; 165 nr_pages += MAX_ORDER_NR_PAGES;
145 166
146 table_size = sizeof(struct page_ext) * nr_pages; 167 table_size = get_entry_size() * nr_pages;
147 168
148 base = memblock_virt_alloc_try_nid_nopanic( 169 base = memblock_virt_alloc_try_nid_nopanic(
149 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 170 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
@@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page)
196 if (!section->page_ext) 217 if (!section->page_ext)
197 return NULL; 218 return NULL;
198#endif 219#endif
199 return section->page_ext + pfn; 220 return get_entry(section->page_ext, pfn);
200} 221}
201 222
202static void *__meminit alloc_page_ext(size_t size, int nid) 223static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
229 if (section->page_ext) 250 if (section->page_ext)
230 return 0; 251 return 0;
231 252
232 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; 253 table_size = get_entry_size() * PAGES_PER_SECTION;
233 base = alloc_page_ext(table_size, nid); 254 base = alloc_page_ext(table_size, nid);
234 255
235 /* 256 /*
@@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
249 * we need to apply a mask. 270 * we need to apply a mask.
250 */ 271 */
251 pfn &= PAGE_SECTION_MASK; 272 pfn &= PAGE_SECTION_MASK;
252 section->page_ext = base - pfn; 273 section->page_ext = (void *)base - get_entry_size() * pfn;
253 total_usage += table_size; 274 total_usage += table_size;
254 return 0; 275 return 0;
255} 276}
@@ -262,7 +283,7 @@ static void free_page_ext(void *addr)
262 struct page *page = virt_to_page(addr); 283 struct page *page = virt_to_page(addr);
263 size_t table_size; 284 size_t table_size;
264 285
265 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; 286 table_size = get_entry_size() * PAGES_PER_SECTION;
266 287
267 BUG_ON(PageReserved(page)); 288 BUG_ON(PageReserved(page));
268 free_pages_exact(addr, table_size); 289 free_pages_exact(addr, table_size);
@@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn)
277 ms = __pfn_to_section(pfn); 298 ms = __pfn_to_section(pfn);
278 if (!ms || !ms->page_ext) 299 if (!ms || !ms->page_ext)
279 return; 300 return;
280 base = ms->page_ext + pfn; 301 base = get_entry(ms->page_ext, pfn);
281 free_page_ext(base); 302 free_page_ext(base);
282 ms->page_ext = NULL; 303 ms->page_ext = NULL;
283} 304}
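
The page_ext rework above is what makes the page_owner change below possible: each client declares how many extra bytes it needs per entry, invoke_need_callbacks() hands it back an offset past struct page_ext, and lookups multiply an index by the summed entry size instead of indexing an array of fixed-size structs. A compact userspace sketch of that registration and lookup scheme follows; the client names and sizes are illustrative.

/* Sketch of page_ext-style "clients reserve extra bytes per entry" layout. */
#include <stdio.h>
#include <stdlib.h>

struct page_ext { unsigned long flags; };

struct page_ext_operations {
        size_t size;    /* extra bytes this client wants per entry */
        size_t offset;  /* filled in at registration time */
};

static struct page_ext_operations owner_ops = { .size = 24 };
static struct page_ext_operations idle_ops  = { .size = 8 };
static struct page_ext_operations *clients[] = { &owner_ops, &idle_ops };

static size_t extra_mem;

static void register_clients(void)
{
        for (size_t i = 0; i < sizeof(clients) / sizeof(clients[0]); i++) {
                clients[i]->offset = sizeof(struct page_ext) + extra_mem;
                extra_mem += clients[i]->size;
        }
}

static size_t get_entry_size(void) { return sizeof(struct page_ext) + extra_mem; }

static struct page_ext *get_entry(void *base, unsigned long index)
{
        return (struct page_ext *)((char *)base + get_entry_size() * index);
}

int main(void)
{
        register_clients();

        unsigned long nr_pages = 4;
        void *base = calloc(nr_pages, get_entry_size());

        printf("entry size %zu, owner data at offset %zu, idle data at offset %zu\n",
               get_entry_size(), owner_ops.offset, idle_ops.offset);
        printf("entry 3 lives at byte %td\n",
               (char *)get_entry(base, 3) - (char *)base);
        free(base);
        return 0;
}
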
diff --git a/mm/page_io.c b/mm/page_io.c
index eafe5ddc2b54..a2651f58c86a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
264 int ret; 264 int ret;
265 struct swap_info_struct *sis = page_swap_info(page); 265 struct swap_info_struct *sis = page_swap_info(page);
266 266
267 BUG_ON(!PageSwapCache(page)); 267 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
268 if (sis->flags & SWP_FILE) { 268 if (sis->flags & SWP_FILE) {
269 struct kiocb kiocb; 269 struct kiocb kiocb;
270 struct file *swap_file = sis->swap_file; 270 struct file *swap_file = sis->swap_file;
@@ -338,7 +338,7 @@ int swap_readpage(struct page *page)
338 int ret = 0; 338 int ret = 0;
339 struct swap_info_struct *sis = page_swap_info(page); 339 struct swap_info_struct *sis = page_swap_info(page);
340 340
341 BUG_ON(!PageSwapCache(page)); 341 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
342 VM_BUG_ON_PAGE(!PageLocked(page), page); 342 VM_BUG_ON_PAGE(!PageLocked(page), page);
343 VM_BUG_ON_PAGE(PageUptodate(page), page); 343 VM_BUG_ON_PAGE(PageUptodate(page), page);
344 if (frontswap_load(page) == 0) { 344 if (frontswap_load(page) == 0) {
@@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page)
388 388
389 if (sis->flags & SWP_FILE) { 389 if (sis->flags & SWP_FILE) {
390 struct address_space *mapping = sis->swap_file->f_mapping; 390 struct address_space *mapping = sis->swap_file->f_mapping;
391 BUG_ON(!PageSwapCache(page)); 391
392 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
392 return mapping->a_ops->set_page_dirty(page); 393 return mapping->a_ops->set_page_dirty(page);
393 } else { 394 } else {
394 return __set_page_dirty_no_writeback(page); 395 return __set_page_dirty_no_writeback(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 064b7fb6e0b5..a5594bfcc5ed 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page,
55 ret = 0; 55 ret = 0;
56 56
57 /* 57 /*
58 * immobile means "not-on-lru" paes. If immobile is larger than 58 * immobile means "not-on-lru" pages. If immobile is larger than
59 * removable-by-driver pages reported by notifier, we'll fail. 59 * removable-by-driver pages reported by notifier, we'll fail.
60 */ 60 */
61 61
diff --git a/mm/page_owner.c b/mm/page_owner.c
index ec6dc1886f71..60634dc53a88 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -8,6 +8,7 @@
8#include <linux/jump_label.h> 8#include <linux/jump_label.h>
9#include <linux/migrate.h> 9#include <linux/migrate.h>
10#include <linux/stackdepot.h> 10#include <linux/stackdepot.h>
11#include <linux/seq_file.h>
11 12
12#include "internal.h" 13#include "internal.h"
13 14
@@ -17,6 +18,13 @@
17 */ 18 */
18#define PAGE_OWNER_STACK_DEPTH (16) 19#define PAGE_OWNER_STACK_DEPTH (16)
19 20
21struct page_owner {
22 unsigned int order;
23 gfp_t gfp_mask;
24 int last_migrate_reason;
25 depot_stack_handle_t handle;
26};
27
20static bool page_owner_disabled = true; 28static bool page_owner_disabled = true;
21DEFINE_STATIC_KEY_FALSE(page_owner_inited); 29DEFINE_STATIC_KEY_FALSE(page_owner_inited);
22 30
@@ -85,10 +93,16 @@ static void init_page_owner(void)
85} 93}
86 94
87struct page_ext_operations page_owner_ops = { 95struct page_ext_operations page_owner_ops = {
96 .size = sizeof(struct page_owner),
88 .need = need_page_owner, 97 .need = need_page_owner,
89 .init = init_page_owner, 98 .init = init_page_owner,
90}; 99};
91 100
101static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
102{
103 return (void *)page_ext + page_owner_ops.offset;
104}
105
92void __reset_page_owner(struct page *page, unsigned int order) 106void __reset_page_owner(struct page *page, unsigned int order)
93{ 107{
94 int i; 108 int i;
@@ -155,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
155 gfp_t gfp_mask) 169 gfp_t gfp_mask)
156{ 170{
157 struct page_ext *page_ext = lookup_page_ext(page); 171 struct page_ext *page_ext = lookup_page_ext(page);
172 struct page_owner *page_owner;
158 173
159 if (unlikely(!page_ext)) 174 if (unlikely(!page_ext))
160 return; 175 return;
161 176
162 page_ext->handle = save_stack(gfp_mask); 177 page_owner = get_page_owner(page_ext);
163 page_ext->order = order; 178 page_owner->handle = save_stack(gfp_mask);
164 page_ext->gfp_mask = gfp_mask; 179 page_owner->order = order;
165 page_ext->last_migrate_reason = -1; 180 page_owner->gfp_mask = gfp_mask;
181 page_owner->last_migrate_reason = -1;
166 182
167 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 183 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
168} 184}
@@ -170,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
170void __set_page_owner_migrate_reason(struct page *page, int reason) 186void __set_page_owner_migrate_reason(struct page *page, int reason)
171{ 187{
172 struct page_ext *page_ext = lookup_page_ext(page); 188 struct page_ext *page_ext = lookup_page_ext(page);
189 struct page_owner *page_owner;
190
173 if (unlikely(!page_ext)) 191 if (unlikely(!page_ext))
174 return; 192 return;
175 193
176 page_ext->last_migrate_reason = reason; 194 page_owner = get_page_owner(page_ext);
195 page_owner->last_migrate_reason = reason;
177} 196}
178 197
179void __split_page_owner(struct page *page, unsigned int order) 198void __split_page_owner(struct page *page, unsigned int order)
180{ 199{
181 int i; 200 int i;
182 struct page_ext *page_ext = lookup_page_ext(page); 201 struct page_ext *page_ext = lookup_page_ext(page);
202 struct page_owner *page_owner;
183 203
184 if (unlikely(!page_ext)) 204 if (unlikely(!page_ext))
185 return; 205 return;
186 206
187 page_ext->order = 0; 207 page_owner = get_page_owner(page_ext);
208 page_owner->order = 0;
188 for (i = 1; i < (1 << order); i++) 209 for (i = 1; i < (1 << order); i++)
189 __copy_page_owner(page, page + i); 210 __copy_page_owner(page, page + i);
190} 211}
@@ -193,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
193{ 214{
194 struct page_ext *old_ext = lookup_page_ext(oldpage); 215 struct page_ext *old_ext = lookup_page_ext(oldpage);
195 struct page_ext *new_ext = lookup_page_ext(newpage); 216 struct page_ext *new_ext = lookup_page_ext(newpage);
217 struct page_owner *old_page_owner, *new_page_owner;
196 218
197 if (unlikely(!old_ext || !new_ext)) 219 if (unlikely(!old_ext || !new_ext))
198 return; 220 return;
199 221
200 new_ext->order = old_ext->order; 222 old_page_owner = get_page_owner(old_ext);
201 new_ext->gfp_mask = old_ext->gfp_mask; 223 new_page_owner = get_page_owner(new_ext);
202 new_ext->last_migrate_reason = old_ext->last_migrate_reason; 224 new_page_owner->order = old_page_owner->order;
203 new_ext->handle = old_ext->handle; 225 new_page_owner->gfp_mask = old_page_owner->gfp_mask;
226 new_page_owner->last_migrate_reason =
227 old_page_owner->last_migrate_reason;
228 new_page_owner->handle = old_page_owner->handle;
204 229
205 /* 230 /*
206 * We don't clear the bit on the oldpage as it's going to be freed 231 * We don't clear the bit on the oldpage as it's going to be freed
@@ -214,9 +239,88 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
214 __set_bit(PAGE_EXT_OWNER, &new_ext->flags); 239 __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
215} 240}
216 241
242void pagetypeinfo_showmixedcount_print(struct seq_file *m,
243 pg_data_t *pgdat, struct zone *zone)
244{
245 struct page *page;
246 struct page_ext *page_ext;
247 struct page_owner *page_owner;
248 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
249 unsigned long end_pfn = pfn + zone->spanned_pages;
250 unsigned long count[MIGRATE_TYPES] = { 0, };
251 int pageblock_mt, page_mt;
252 int i;
253
254 /* Scan block by block. First and last block may be incomplete */
255 pfn = zone->zone_start_pfn;
256
257 /*
258 * Walk the zone in pageblock_nr_pages steps. If a page block spans
259 * a zone boundary, it will be double counted between zones. This does
260 * not matter as the mixed block count will still be correct
261 */
262 for (; pfn < end_pfn; ) {
263 if (!pfn_valid(pfn)) {
264 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
265 continue;
266 }
267
268 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
269 block_end_pfn = min(block_end_pfn, end_pfn);
270
271 page = pfn_to_page(pfn);
272 pageblock_mt = get_pageblock_migratetype(page);
273
274 for (; pfn < block_end_pfn; pfn++) {
275 if (!pfn_valid_within(pfn))
276 continue;
277
278 page = pfn_to_page(pfn);
279
280 if (page_zone(page) != zone)
281 continue;
282
283 if (PageBuddy(page)) {
284 pfn += (1UL << page_order(page)) - 1;
285 continue;
286 }
287
288 if (PageReserved(page))
289 continue;
290
291 page_ext = lookup_page_ext(page);
292 if (unlikely(!page_ext))
293 continue;
294
295 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
296 continue;
297
298 page_owner = get_page_owner(page_ext);
299 page_mt = gfpflags_to_migratetype(
300 page_owner->gfp_mask);
301 if (pageblock_mt != page_mt) {
302 if (is_migrate_cma(pageblock_mt))
303 count[MIGRATE_MOVABLE]++;
304 else
305 count[pageblock_mt]++;
306
307 pfn = block_end_pfn;
308 break;
309 }
310 pfn += (1UL << page_owner->order) - 1;
311 }
312 }
313
314 /* Print counts */
315 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
316 for (i = 0; i < MIGRATE_TYPES; i++)
317 seq_printf(m, "%12lu ", count[i]);
318 seq_putc(m, '\n');
319}
320
217static ssize_t 321static ssize_t
218print_page_owner(char __user *buf, size_t count, unsigned long pfn, 322print_page_owner(char __user *buf, size_t count, unsigned long pfn,
219 struct page *page, struct page_ext *page_ext, 323 struct page *page, struct page_owner *page_owner,
220 depot_stack_handle_t handle) 324 depot_stack_handle_t handle)
221{ 325{
222 int ret; 326 int ret;
@@ -236,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
236 340
237 ret = snprintf(kbuf, count, 341 ret = snprintf(kbuf, count,
238 "Page allocated via order %u, mask %#x(%pGg)\n", 342 "Page allocated via order %u, mask %#x(%pGg)\n",
239 page_ext->order, page_ext->gfp_mask, 343 page_owner->order, page_owner->gfp_mask,
240 &page_ext->gfp_mask); 344 &page_owner->gfp_mask);
241 345
242 if (ret >= count) 346 if (ret >= count)
243 goto err; 347 goto err;
244 348
245 /* Print information relevant to grouping pages by mobility */ 349 /* Print information relevant to grouping pages by mobility */
246 pageblock_mt = get_pageblock_migratetype(page); 350 pageblock_mt = get_pageblock_migratetype(page);
247 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); 351 page_mt = gfpflags_to_migratetype(page_owner->gfp_mask);
248 ret += snprintf(kbuf + ret, count - ret, 352 ret += snprintf(kbuf + ret, count - ret,
249 "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", 353 "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
250 pfn, 354 pfn,
@@ -261,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
261 if (ret >= count) 365 if (ret >= count)
262 goto err; 366 goto err;
263 367
264 if (page_ext->last_migrate_reason != -1) { 368 if (page_owner->last_migrate_reason != -1) {
265 ret += snprintf(kbuf + ret, count - ret, 369 ret += snprintf(kbuf + ret, count - ret,
266 "Page has been migrated, last migrate reason: %s\n", 370 "Page has been migrated, last migrate reason: %s\n",
267 migrate_reason_names[page_ext->last_migrate_reason]); 371 migrate_reason_names[page_owner->last_migrate_reason]);
268 if (ret >= count) 372 if (ret >= count)
269 goto err; 373 goto err;
270 } 374 }
@@ -287,6 +391,7 @@ err:
287void __dump_page_owner(struct page *page) 391void __dump_page_owner(struct page *page)
288{ 392{
289 struct page_ext *page_ext = lookup_page_ext(page); 393 struct page_ext *page_ext = lookup_page_ext(page);
394 struct page_owner *page_owner;
290 unsigned long entries[PAGE_OWNER_STACK_DEPTH]; 395 unsigned long entries[PAGE_OWNER_STACK_DEPTH];
291 struct stack_trace trace = { 396 struct stack_trace trace = {
292 .nr_entries = 0, 397 .nr_entries = 0,
@@ -302,7 +407,9 @@ void __dump_page_owner(struct page *page)
302 pr_alert("There is not page extension available.\n"); 407 pr_alert("There is not page extension available.\n");
303 return; 408 return;
304 } 409 }
305 gfp_mask = page_ext->gfp_mask; 410
411 page_owner = get_page_owner(page_ext);
412 gfp_mask = page_owner->gfp_mask;
306 mt = gfpflags_to_migratetype(gfp_mask); 413 mt = gfpflags_to_migratetype(gfp_mask);
307 414
308 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { 415 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
@@ -310,7 +417,7 @@ void __dump_page_owner(struct page *page)
310 return; 417 return;
311 } 418 }
312 419
313 handle = READ_ONCE(page_ext->handle); 420 handle = READ_ONCE(page_owner->handle);
314 if (!handle) { 421 if (!handle) {
315 pr_alert("page_owner info is not active (free page?)\n"); 422 pr_alert("page_owner info is not active (free page?)\n");
316 return; 423 return;
@@ -318,12 +425,12 @@ void __dump_page_owner(struct page *page)
318 425
319 depot_fetch_stack(handle, &trace); 426 depot_fetch_stack(handle, &trace);
320 pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", 427 pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
321 page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); 428 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
322 print_stack_trace(&trace, 0); 429 print_stack_trace(&trace, 0);
323 430
324 if (page_ext->last_migrate_reason != -1) 431 if (page_owner->last_migrate_reason != -1)
325 pr_alert("page has been migrated, last migrate reason: %s\n", 432 pr_alert("page has been migrated, last migrate reason: %s\n",
326 migrate_reason_names[page_ext->last_migrate_reason]); 433 migrate_reason_names[page_owner->last_migrate_reason]);
327} 434}
328 435
329static ssize_t 436static ssize_t
@@ -332,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
332 unsigned long pfn; 439 unsigned long pfn;
333 struct page *page; 440 struct page *page;
334 struct page_ext *page_ext; 441 struct page_ext *page_ext;
442 struct page_owner *page_owner;
335 depot_stack_handle_t handle; 443 depot_stack_handle_t handle;
336 444
337 if (!static_branch_unlikely(&page_owner_inited)) 445 if (!static_branch_unlikely(&page_owner_inited))
@@ -381,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
381 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 489 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
382 continue; 490 continue;
383 491
492 page_owner = get_page_owner(page_ext);
493
384 /* 494 /*
385 * Access to page_ext->handle isn't synchronous so we should 495 * Access to page_ext->handle isn't synchronous so we should
386 * be careful to access it. 496 * be careful to access it.
387 */ 497 */
388 handle = READ_ONCE(page_ext->handle); 498 handle = READ_ONCE(page_owner->handle);
389 if (!handle) 499 if (!handle)
390 continue; 500 continue;
391 501
@@ -393,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
393 *ppos = (pfn - min_low_pfn) + 1; 503 *ppos = (pfn - min_low_pfn) + 1;
394 504
395 return print_page_owner(buf, count, pfn, page, 505 return print_page_owner(buf, count, pfn, page,
396 page_ext, handle); 506 page_owner, handle);
397 } 507 }
398 508
399 return 0; 509 return 0;
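
On top of the page_ext layout above, page_owner simply reinterprets its reserved bytes: get_page_owner() is pointer arithmetic from the page_ext entry to a private struct page_owner at page_owner_ops.offset. A tiny self-contained sketch of that accessor is below; the struct fields follow the diff, while the types they are stubbed with and the fixed offset are assumptions.

/* Sketch: viewing a client's reserved bytes behind page_ext as its own struct. */
#include <stdio.h>
#include <stdlib.h>

struct page_ext { unsigned long flags; };

struct page_owner {             /* fields as introduced in the diff */
        unsigned int order;
        unsigned int gfp_mask;          /* gfp_t stubbed as unsigned int here */
        int last_migrate_reason;
        unsigned long handle;           /* depot_stack_handle_t stubbed */
};

/* the offset would normally be assigned by the page_ext registration pass */
static const size_t page_owner_offset = sizeof(struct page_ext);

static struct page_owner *get_page_owner(struct page_ext *page_ext)
{
        return (struct page_owner *)((char *)page_ext + page_owner_offset);
}

int main(void)
{
        void *entry = calloc(1, sizeof(struct page_ext) + sizeof(struct page_owner));
        struct page_owner *po = get_page_owner(entry);

        po->order = 2;
        po->last_migrate_reason = -1;
        printf("order %u, last_migrate_reason %d\n",
               po->order, po->last_migrate_reason);
        free(entry);
        return 0;
}
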
diff --git a/mm/shmem.c b/mm/shmem.c
index d86b5e455fef..0e9901e69d24 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3965,7 +3965,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
3965 3965
3966/* common code */ 3966/* common code */
3967 3967
3968static struct dentry_operations anon_ops = { 3968static const struct dentry_operations anon_ops = {
3969 .d_dname = simple_dname 3969 .d_dname = simple_dname
3970}; 3970};
3971 3971
diff --git a/mm/swap.c b/mm/swap.c
index 75c63bb2a1da..4dcf852e1e6d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
748 locked_pgdat = NULL; 748 locked_pgdat = NULL;
749 } 749 }
750 750
751 if (is_huge_zero_page(page)) { 751 if (is_huge_zero_page(page))
752 put_huge_zero_page();
753 continue; 752 continue;
754 }
755 753
756 page = compound_head(page); 754 page = compound_head(page);
757 if (!put_page_testzero(page)) 755 if (!put_page_testzero(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c8310a37be3a..35d7e0ee1c77 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = {
37 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 37 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
38 .i_mmap_writable = ATOMIC_INIT(0), 38 .i_mmap_writable = ATOMIC_INIT(0),
39 .a_ops = &swap_aops, 39 .a_ops = &swap_aops,
40 /* swap cache doesn't use writeback related tags */
41 .flags = 1 << AS_NO_WRITEBACK_TAGS,
40 } 42 }
41}; 43};
42 44
@@ -92,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
92 address_space = swap_address_space(entry); 94 address_space = swap_address_space(entry);
93 spin_lock_irq(&address_space->tree_lock); 95 spin_lock_irq(&address_space->tree_lock);
94 error = radix_tree_insert(&address_space->page_tree, 96 error = radix_tree_insert(&address_space->page_tree,
95 entry.val, page); 97 swp_offset(entry), page);
96 if (likely(!error)) { 98 if (likely(!error)) {
97 address_space->nrpages++; 99 address_space->nrpages++;
98 __inc_node_page_state(page, NR_FILE_PAGES); 100 __inc_node_page_state(page, NR_FILE_PAGES);
@@ -143,7 +145,7 @@ void __delete_from_swap_cache(struct page *page)
143 145
144 entry.val = page_private(page); 146 entry.val = page_private(page);
145 address_space = swap_address_space(entry); 147 address_space = swap_address_space(entry);
146 radix_tree_delete(&address_space->page_tree, page_private(page)); 148 radix_tree_delete(&address_space->page_tree, swp_offset(entry));
147 set_page_private(page, 0); 149 set_page_private(page, 0);
148 ClearPageSwapCache(page); 150 ClearPageSwapCache(page);
149 address_space->nrpages--; 151 address_space->nrpages--;
@@ -252,9 +254,7 @@ static inline void free_swap_cache(struct page *page)
252void free_page_and_swap_cache(struct page *page) 254void free_page_and_swap_cache(struct page *page)
253{ 255{
254 free_swap_cache(page); 256 free_swap_cache(page);
255 if (is_huge_zero_page(page)) 257 if (!is_huge_zero_page(page))
256 put_huge_zero_page();
257 else
258 put_page(page); 258 put_page(page);
259} 259}
260 260
@@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
283{ 283{
284 struct page *page; 284 struct page *page;
285 285
286 page = find_get_page(swap_address_space(entry), entry.val); 286 page = find_get_page(swap_address_space(entry), swp_offset(entry));
287 287
288 if (page) { 288 if (page) {
289 INC_CACHE_INFO(find_success); 289 INC_CACHE_INFO(find_success);
@@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
310 * called after lookup_swap_cache() failed, re-calling 310 * called after lookup_swap_cache() failed, re-calling
311 * that would confuse statistics. 311 * that would confuse statistics.
312 */ 312 */
313 found_page = find_get_page(swapper_space, entry.val); 313 found_page = find_get_page(swapper_space, swp_offset(entry));
314 if (found_page) 314 if (found_page)
315 break; 315 break;
316 316
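
The swap cache hunks above change the radix-tree key from the full entry.val to just swp_offset(entry): since swap_address_space() already selects a per-type tree out of swapper_spaces[], the type bits are redundant in the key and dropping them keeps the indices small and dense. A standalone sketch of that split is below; the entry encoding is made up, as the real swp_entry_t layout is arch-dependent.

/* Sketch: per-type swap cache keyed by offset only, not by the full entry value. */
#include <stdio.h>

#define MAX_SWAPFILES   32
#define SWP_TYPE_SHIFT  24              /* made-up split for illustration */

typedef struct { unsigned long val; } swp_entry_t;

static unsigned int swp_type(swp_entry_t e)    { return e.val >> SWP_TYPE_SHIFT; }
static unsigned long swp_offset(swp_entry_t e) { return e.val & ((1UL << SWP_TYPE_SHIFT) - 1); }

struct address_space { const char *name; };
static struct address_space swapper_spaces[MAX_SWAPFILES] = {
        { "swap0" }, { "swap1" },
};

static struct address_space *swap_address_space(swp_entry_t e)
{
        return &swapper_spaces[swp_type(e)];
}

int main(void)
{
        swp_entry_t e = { .val = (1UL << SWP_TYPE_SHIFT) | 12345 };

        /* the tree is chosen by type, so only the offset is needed as the key */
        printf("lookup in %s at index %lu\n",
               swap_address_space(e)->name, swp_offset(e));
        return 0;
}
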
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2657accc6e2b..2210de290b54 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
105 struct page *page; 105 struct page *page;
106 int ret = 0; 106 int ret = 0;
107 107
108 page = find_get_page(swap_address_space(entry), entry.val); 108 page = find_get_page(swap_address_space(entry), swp_offset(entry));
109 if (!page) 109 if (!page)
110 return 0; 110 return 0;
111 /* 111 /*
@@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
257 info->data = 0; 257 info->data = 0;
258} 258}
259 259
260static inline bool cluster_list_empty(struct swap_cluster_list *list)
261{
262 return cluster_is_null(&list->head);
263}
264
265static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
266{
267 return cluster_next(&list->head);
268}
269
270static void cluster_list_init(struct swap_cluster_list *list)
271{
272 cluster_set_null(&list->head);
273 cluster_set_null(&list->tail);
274}
275
276static void cluster_list_add_tail(struct swap_cluster_list *list,
277 struct swap_cluster_info *ci,
278 unsigned int idx)
279{
280 if (cluster_list_empty(list)) {
281 cluster_set_next_flag(&list->head, idx, 0);
282 cluster_set_next_flag(&list->tail, idx, 0);
283 } else {
284 unsigned int tail = cluster_next(&list->tail);
285
286 cluster_set_next(&ci[tail], idx);
287 cluster_set_next_flag(&list->tail, idx, 0);
288 }
289}
290
291static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
292 struct swap_cluster_info *ci)
293{
294 unsigned int idx;
295
296 idx = cluster_next(&list->head);
297 if (cluster_next(&list->tail) == idx) {
298 cluster_set_null(&list->head);
299 cluster_set_null(&list->tail);
300 } else
301 cluster_set_next_flag(&list->head,
302 cluster_next(&ci[idx]), 0);
303
304 return idx;
305}
306
260/* Add a cluster to discard list and schedule it to do discard */ 307/* Add a cluster to discard list and schedule it to do discard */
261static void swap_cluster_schedule_discard(struct swap_info_struct *si, 308static void swap_cluster_schedule_discard(struct swap_info_struct *si,
262 unsigned int idx) 309 unsigned int idx)
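
The helpers added in this hunk exist so the discard and free paths below can stop open-coding head/tail updates: a cluster list is just a head/tail pair of indices threaded through the cluster_info array. The standalone sketch below mirrors that index-linked FIFO with plain unsigned ints in place of the packed swap_cluster_info bitfields; the usage in main() loosely follows the discard flow shown in the next hunks.

/* Index-linked FIFO over an array, in the spirit of the new cluster_list helpers. */
#include <stdbool.h>
#include <stdio.h>

#define NIL         (~0u)
#define NR_CLUSTERS 8

static unsigned int next_of[NR_CLUSTERS];       /* stands in for cluster_info[].next */

struct cluster_list { unsigned int head, tail; };

static void cluster_list_init(struct cluster_list *l) { l->head = l->tail = NIL; }
static bool cluster_list_empty(const struct cluster_list *l) { return l->head == NIL; }

static void cluster_list_add_tail(struct cluster_list *l, unsigned int idx)
{
        next_of[idx] = NIL;
        if (cluster_list_empty(l))
                l->head = idx;
        else
                next_of[l->tail] = idx;
        l->tail = idx;
}

static unsigned int cluster_list_del_first(struct cluster_list *l)
{
        unsigned int idx = l->head;

        if (l->tail == idx)
                cluster_list_init(l);           /* list became empty */
        else
                l->head = next_of[idx];
        return idx;
}

int main(void)
{
        struct cluster_list discard_clusters;

        cluster_list_init(&discard_clusters);
        cluster_list_add_tail(&discard_clusters, 3);    /* schedule clusters for discard */
        cluster_list_add_tail(&discard_clusters, 5);

        while (!cluster_list_empty(&discard_clusters))
                printf("discarding cluster %u\n",
                       cluster_list_del_first(&discard_clusters));
        return 0;
}
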
@@ -270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
270 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 317 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
271 SWAP_MAP_BAD, SWAPFILE_CLUSTER); 318 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
272 319
273 if (cluster_is_null(&si->discard_cluster_head)) { 320 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
274 cluster_set_next_flag(&si->discard_cluster_head,
275 idx, 0);
276 cluster_set_next_flag(&si->discard_cluster_tail,
277 idx, 0);
278 } else {
279 unsigned int tail = cluster_next(&si->discard_cluster_tail);
280 cluster_set_next(&si->cluster_info[tail], idx);
281 cluster_set_next_flag(&si->discard_cluster_tail,
282 idx, 0);
283 }
284 321
285 schedule_work(&si->discard_work); 322 schedule_work(&si->discard_work);
286} 323}
@@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
296 333
297 info = si->cluster_info; 334 info = si->cluster_info;
298 335
299 while (!cluster_is_null(&si->discard_cluster_head)) { 336 while (!cluster_list_empty(&si->discard_clusters)) {
300 idx = cluster_next(&si->discard_cluster_head); 337 idx = cluster_list_del_first(&si->discard_clusters, info);
301
302 cluster_set_next_flag(&si->discard_cluster_head,
303 cluster_next(&info[idx]), 0);
304 if (cluster_next(&si->discard_cluster_tail) == idx) {
305 cluster_set_null(&si->discard_cluster_head);
306 cluster_set_null(&si->discard_cluster_tail);
307 }
308 spin_unlock(&si->lock); 338 spin_unlock(&si->lock);
309 339
310 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 340 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
@@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
312 342
313 spin_lock(&si->lock); 343 spin_lock(&si->lock);
314 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); 344 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
315 if (cluster_is_null(&si->free_cluster_head)) { 345 cluster_list_add_tail(&si->free_clusters, info, idx);
316 cluster_set_next_flag(&si->free_cluster_head,
317 idx, 0);
318 cluster_set_next_flag(&si->free_cluster_tail,
319 idx, 0);
320 } else {
321 unsigned int tail;
322
323 tail = cluster_next(&si->free_cluster_tail);
324 cluster_set_next(&info[tail], idx);
325 cluster_set_next_flag(&si->free_cluster_tail,
326 idx, 0);
327 }
328 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 346 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
329 0, SWAPFILE_CLUSTER); 347 0, SWAPFILE_CLUSTER);
330 } 348 }
@@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
353 if (!cluster_info) 371 if (!cluster_info)
354 return; 372 return;
355 if (cluster_is_free(&cluster_info[idx])) { 373 if (cluster_is_free(&cluster_info[idx])) {
356 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); 374 VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
357 cluster_set_next_flag(&p->free_cluster_head, 375 cluster_list_del_first(&p->free_clusters, cluster_info);
358 cluster_next(&cluster_info[idx]), 0);
359 if (cluster_next(&p->free_cluster_tail) == idx) {
360 cluster_set_null(&p->free_cluster_tail);
361 cluster_set_null(&p->free_cluster_head);
362 }
363 cluster_set_count_flag(&cluster_info[idx], 0, 0); 376 cluster_set_count_flag(&cluster_info[idx], 0, 0);
364 } 377 }
365 378
@@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
398 } 411 }
399 412
400 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 413 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
401 if (cluster_is_null(&p->free_cluster_head)) { 414 cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
402 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
403 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
404 } else {
405 unsigned int tail = cluster_next(&p->free_cluster_tail);
406 cluster_set_next(&cluster_info[tail], idx);
407 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
408 }
409 } 415 }
410} 416}
411 417
@@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
421 bool conflict; 427 bool conflict;
422 428
423 offset /= SWAPFILE_CLUSTER; 429 offset /= SWAPFILE_CLUSTER;
424 conflict = !cluster_is_null(&si->free_cluster_head) && 430 conflict = !cluster_list_empty(&si->free_clusters) &&
425 offset != cluster_next(&si->free_cluster_head) && 431 offset != cluster_list_first(&si->free_clusters) &&
426 cluster_is_free(&si->cluster_info[offset]); 432 cluster_is_free(&si->cluster_info[offset]);
427 433
428 if (!conflict) 434 if (!conflict)
@@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
447new_cluster: 453new_cluster:
448 cluster = this_cpu_ptr(si->percpu_cluster); 454 cluster = this_cpu_ptr(si->percpu_cluster);
449 if (cluster_is_null(&cluster->index)) { 455 if (cluster_is_null(&cluster->index)) {
450 if (!cluster_is_null(&si->free_cluster_head)) { 456 if (!cluster_list_empty(&si->free_clusters)) {
451 cluster->index = si->free_cluster_head; 457 cluster->index = si->free_clusters.head;
452 cluster->next = cluster_next(&cluster->index) * 458 cluster->next = cluster_next(&cluster->index) *
453 SWAPFILE_CLUSTER; 459 SWAPFILE_CLUSTER;
454 } else if (!cluster_is_null(&si->discard_cluster_head)) { 460 } else if (!cluster_list_empty(&si->discard_clusters)) {
455 /* 461 /*
456 * we don't have free cluster but have some clusters in 462 * we don't have free cluster but have some clusters in
457 * discarding, do discard now and reclaim them 463 * discarding, do discard now and reclaim them
@@ -999,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry)
999 if (p) { 1005 if (p) {
1000 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { 1006 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
1001 page = find_get_page(swap_address_space(entry), 1007 page = find_get_page(swap_address_space(entry),
1002 entry.val); 1008 swp_offset(entry));
1003 if (page && !trylock_page(page)) { 1009 if (page && !trylock_page(page)) {
1004 put_page(page); 1010 put_page(page);
1005 page = NULL; 1011 page = NULL;
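The free_swap_and_cache() hunk above switches the swap-cache lookup key from entry.val (the packed type+offset word) to swp_offset(entry), in line with the rest of this series, which keys each per-type swap address space by offset alone. Below is a minimal userland sketch of the packing, shown only to make the distinction concrete; the 5-bit type field and its position are assumptions for the demo, since the kernel derives the real layout per architecture.

#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

/* Assumed layout for the demo: type in the top 5 bits, offset below. */
#define DEMO_TYPE_SHIFT	(sizeof(unsigned long) * 8 - 5)

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e = { .val = (type << DEMO_TYPE_SHIFT) | offset };
	return e;
}

static unsigned long swp_type(swp_entry_t e)
{
	return e.val >> DEMO_TYPE_SHIFT;
}

static unsigned long swp_offset(swp_entry_t e)
{
	return e.val & ((1UL << DEMO_TYPE_SHIFT) - 1);
}

int main(void)
{
	swp_entry_t e = swp_entry(2, 12345);

	/* The packed value and the offset only coincide for type 0. */
	printf("val=%lu type=%lu offset=%lu\n",
	       e.val, swp_type(e), swp_offset(e));
	assert(swp_offset(e) == 12345);
	return 0;
}

A find_get_page() lookup only matches pages inserted under the same key, so keying both insertion and lookup by swp_offset() keeps the per-type address spaces consistent.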
@@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2292 2298
2293 nr_good_pages = maxpages - 1; /* omit header page */ 2299 nr_good_pages = maxpages - 1; /* omit header page */
2294 2300
2295 cluster_set_null(&p->free_cluster_head); 2301 cluster_list_init(&p->free_clusters);
2296 cluster_set_null(&p->free_cluster_tail); 2302 cluster_list_init(&p->discard_clusters);
2297 cluster_set_null(&p->discard_cluster_head);
2298 cluster_set_null(&p->discard_cluster_tail);
2299 2303
2300 for (i = 0; i < swap_header->info.nr_badpages; i++) { 2304 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2301 unsigned int page_nr = swap_header->info.badpages[i]; 2305 unsigned int page_nr = swap_header->info.badpages[i];
@@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2341 for (i = 0; i < nr_clusters; i++) { 2345 for (i = 0; i < nr_clusters; i++) {
2342 if (!cluster_count(&cluster_info[idx])) { 2346 if (!cluster_count(&cluster_info[idx])) {
2343 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 2347 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2344 if (cluster_is_null(&p->free_cluster_head)) { 2348 cluster_list_add_tail(&p->free_clusters, cluster_info,
2345 cluster_set_next_flag(&p->free_cluster_head, 2349 idx);
2346 idx, 0);
2347 cluster_set_next_flag(&p->free_cluster_tail,
2348 idx, 0);
2349 } else {
2350 unsigned int tail;
2351
2352 tail = cluster_next(&p->free_cluster_tail);
2353 cluster_set_next(&cluster_info[tail], idx);
2354 cluster_set_next_flag(&p->free_cluster_tail,
2355 idx, 0);
2356 }
2357 } 2350 }
2358 idx++; 2351 idx++;
2359 if (idx == nr_clusters) 2352 if (idx == nr_clusters)
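Taken together, the swapfile.c hunks above replace the open-coded head/tail juggling of the free and discard cluster chains with cluster_list_init()/cluster_list_add_tail()/cluster_list_del_first() helpers on a dedicated list type, removing the same four-way branch that was duplicated at every add site. The sketch below is a plain userland model of the underlying pattern: a singly linked list of cluster indices threaded through a per-cluster "next" array, with head and tail kept so that add-to-tail and pop-from-head are O(1). The NIL marker and the exact field names are assumptions for illustration, not the kernel's swap_cluster_info/cluster list definitions.

#include <assert.h>
#include <stdio.h>

#define NIL	(~0u)		/* assumed "no cluster" marker for the demo */

struct cluster_list {
	unsigned int head;
	unsigned int tail;
};

static void cluster_list_init(struct cluster_list *l)
{
	l->head = l->tail = NIL;
}

static int cluster_list_empty(const struct cluster_list *l)
{
	return l->head == NIL;
}

/* next[] stands in for the per-cluster info array that links clusters. */
static void cluster_list_add_tail(struct cluster_list *l,
				  unsigned int *next, unsigned int idx)
{
	next[idx] = NIL;
	if (cluster_list_empty(l))
		l->head = idx;
	else
		next[l->tail] = idx;
	l->tail = idx;
}

/* Caller checks cluster_list_empty() first, as the kernel code does. */
static unsigned int cluster_list_del_first(struct cluster_list *l,
					   unsigned int *next)
{
	unsigned int idx = l->head;

	l->head = next[idx];
	if (l->head == NIL)
		l->tail = NIL;
	return idx;
}

int main(void)
{
	unsigned int next[8];
	struct cluster_list free_clusters;

	cluster_list_init(&free_clusters);
	cluster_list_add_tail(&free_clusters, next, 3);
	cluster_list_add_tail(&free_clusters, next, 5);

	assert(cluster_list_del_first(&free_clusters, next) == 3);
	assert(cluster_list_del_first(&free_clusters, next) == 5);
	assert(cluster_list_empty(&free_clusters));
	printf("cluster list demo ok\n");
	return 0;
}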
diff --git a/mm/vmacache.c b/mm/vmacache.c
index fd09dc9c6812..035fdeb35b43 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
87{ 87{
88 int i; 88 int i;
89 89
90 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
91
90 if (!vmacache_valid(mm)) 92 if (!vmacache_valid(mm))
91 return NULL; 93 return NULL;
92 94
93 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
94
95 for (i = 0; i < VMACACHE_SIZE; i++) { 95 for (i = 0; i < VMACACHE_SIZE; i++) {
96 struct vm_area_struct *vma = current->vmacache[i]; 96 struct vm_area_struct *vma = current->vmacache[i];
97 97
@@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
115{ 115{
116 int i; 116 int i;
117 117
118 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
119
118 if (!vmacache_valid(mm)) 120 if (!vmacache_valid(mm))
119 return NULL; 121 return NULL;
120 122
121 count_vm_vmacache_event(VMACACHE_FIND_CALLS);
122
123 for (i = 0; i < VMACACHE_SIZE; i++) { 123 for (i = 0; i < VMACACHE_SIZE; i++) {
124 struct vm_area_struct *vma = current->vmacache[i]; 124 struct vm_area_struct *vma = current->vmacache[i];
125 125
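Both vmacache.c hunks make the same one-line move: the VMACACHE_FIND_CALLS event is bumped before the vmacache_valid() check, so lookups that bail out on an invalidated cache still count as calls and the hit rate derived from the vmstat counters reflects every lookup. A small, self-contained illustration of the accounting difference follows; the counter names and the hit-rate arithmetic are for the demo only, the kernel keeps these as per-cpu vm_event counters.

#include <stdbool.h>
#include <stdio.h>

static unsigned long find_calls, find_hits;

static bool lookup(bool cache_valid, bool would_hit)
{
	find_calls++;			/* counted even if we bail out below */
	if (!cache_valid)
		return false;
	if (would_hit) {
		find_hits++;
		return true;
	}
	return false;
}

int main(void)
{
	/* Four lookups: two against an invalidated cache, one hit, one miss. */
	lookup(false, true);
	lookup(false, true);
	lookup(true, true);
	lookup(true, false);

	printf("calls=%lu hits=%lu hit-rate=%.0f%%\n",
	       find_calls, find_hits, 100.0 * find_hits / find_calls);
	return 0;
}

With the counter placed after the validity check, the same run would report two calls and a 50% hit rate, overstating how often the cache actually helps.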
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 91f44e78c516..f2481cb4e6b2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1359 struct vm_struct *area; 1359 struct vm_struct *area;
1360 1360
1361 BUG_ON(in_interrupt()); 1361 BUG_ON(in_interrupt());
1362 if (flags & VM_IOREMAP)
1363 align = 1ul << clamp_t(int, fls_long(size),
1364 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1365
1366 size = PAGE_ALIGN(size); 1362 size = PAGE_ALIGN(size);
1367 if (unlikely(!size)) 1363 if (unlikely(!size))
1368 return NULL; 1364 return NULL;
1369 1365
1366 if (flags & VM_IOREMAP)
1367 align = 1ul << clamp_t(int, get_count_order_long(size),
1368 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1369
1370 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1370 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1371 if (unlikely(!area)) 1371 if (unlikely(!area))
1372 return NULL; 1372 return NULL;
@@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1601static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1601static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1602 pgprot_t prot, int node) 1602 pgprot_t prot, int node)
1603{ 1603{
1604 const int order = 0;
1605 struct page **pages; 1604 struct page **pages;
1606 unsigned int nr_pages, array_size, i; 1605 unsigned int nr_pages, array_size, i;
1607 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1606 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1629 struct page *page; 1628 struct page *page;
1630 1629
1631 if (node == NUMA_NO_NODE) 1630 if (node == NUMA_NO_NODE)
1632 page = alloc_pages(alloc_mask, order); 1631 page = alloc_page(alloc_mask);
1633 else 1632 else
1634 page = alloc_pages_node(node, alloc_mask, order); 1633 page = alloc_pages_node(node, alloc_mask, 0);
1635 1634
1636 if (unlikely(!page)) { 1635 if (unlikely(!page)) {
1637 /* Successfully allocated i pages, free them in __vunmap() */ 1636 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1648 return area->addr; 1647 return area->addr;
1649 1648
1650fail: 1649fail:
1651 warn_alloc_failed(gfp_mask, order, 1650 warn_alloc(gfp_mask,
1652 "vmalloc: allocation failure, allocated %ld of %ld bytes\n", 1651 "vmalloc: allocation failure, allocated %ld of %ld bytes",
1653 (area->nr_pages*PAGE_SIZE), area->size); 1652 (area->nr_pages*PAGE_SIZE), area->size);
1654 vfree(area->addr); 1653 vfree(area->addr);
1655 return NULL; 1654 return NULL;
@@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1710 return addr; 1709 return addr;
1711 1710
1712fail: 1711fail:
1713 warn_alloc_failed(gfp_mask, 0, 1712 warn_alloc(gfp_mask,
1714 "vmalloc: allocation failure: %lu bytes\n", 1713 "vmalloc: allocation failure: %lu bytes", real_size);
1715 real_size);
1716 return NULL; 1714 return NULL;
1717} 1715}
1718 1716
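Two things happen in __get_vm_area_node() above: the VM_IOREMAP alignment is now computed from the already page-aligned size, and it uses get_count_order_long() instead of fls_long(). fls_long(n) is the 1-based index of the highest set bit, so for an exact power of two it is one larger than log2(n) and the old code doubled the alignment of, say, a 64KiB ioremap; get_count_order_long(n) is effectively ceil(log2(n)), giving exact powers of two their natural alignment. The comparison below uses portable stand-ins written for this demo (and omits the PAGE_SHIFT/IOREMAP_MAX_ORDER clamp that the real code applies).

#include <stdio.h>

static int demo_fls_long(unsigned long x)
{
	int bit = 0;

	while (x) {			/* 1-based index of the highest set bit */
		bit++;
		x >>= 1;
	}
	return bit;
}

static int demo_get_count_order_long(unsigned long x)
{
	int order = demo_fls_long(x) - 1;

	if (x & (x - 1))		/* not a power of two: round up */
		order++;
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 4096, 6000, 65536 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long sz = sizes[i];

		printf("size=%6lu  old align=%7lu  new align=%7lu\n", sz,
		       1ul << demo_fls_long(sz),
		       1ul << demo_get_count_order_long(sz));
	}
	return 0;
}

The remaining vmalloc.c hunks are mechanical: the constant order variable is dropped in favour of alloc_page()/alloc_pages_node(..., 0), and the failure paths move from warn_alloc_failed() to the warn_alloc() helper, which evidently supplies the trailing newline itself, since the "\n" is dropped from both format strings.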
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0fe8b7113868..744f926af442 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
2418 if (inactive_list_is_low(lruvec, false, sc)) 2418 if (inactive_list_is_low(lruvec, false, sc))
2419 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2419 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2420 sc, LRU_ACTIVE_ANON); 2420 sc, LRU_ACTIVE_ANON);
2421
2422 throttle_vm_writeout(sc->gfp_mask);
2423} 2421}
2424 2422
2425/* Use reclaim/compaction for costly allocs or under memory pressure */ 2423/* Use reclaim/compaction for costly allocs or under memory pressure */
@@ -2480,7 +2478,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2480 * If we have not reclaimed enough pages for compaction and the 2478 * If we have not reclaimed enough pages for compaction and the
2481 * inactive lists are large enough, continue reclaiming 2479 * inactive lists are large enough, continue reclaiming
2482 */ 2480 */
2483 pages_for_compaction = (2UL << sc->order); 2481 pages_for_compaction = compact_gap(sc->order);
2484 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 2482 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2485 if (get_nr_swap_pages() > 0) 2483 if (get_nr_swap_pages() > 0)
2486 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 2484 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
@@ -2495,7 +2493,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2495 continue; 2493 continue;
2496 2494
2497 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { 2495 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2498 case COMPACT_PARTIAL: 2496 case COMPACT_SUCCESS:
2499 case COMPACT_CONTINUE: 2497 case COMPACT_CONTINUE:
2500 return false; 2498 return false;
2501 default: 2499 default:
@@ -2598,38 +2596,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2598} 2596}
2599 2597
2600/* 2598/*
2601 * Returns true if compaction should go ahead for a high-order request, or 2599 * Returns true if compaction should go ahead for a costly-order request, or
2602 * the high-order allocation would succeed without compaction. 2600 * the allocation would already succeed without compaction. Return false if we
2601 * should reclaim first.
2603 */ 2602 */
2604static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 2603static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2605{ 2604{
2606 unsigned long watermark; 2605 unsigned long watermark;
2607 bool watermark_ok; 2606 enum compact_result suitable;
2608 2607
2609 /* 2608 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2610 * Compaction takes time to run and there are potentially other 2609 if (suitable == COMPACT_SUCCESS)
2611 * callers using the pages just freed. Continue reclaiming until 2610 /* Allocation should succeed already. Don't reclaim. */
2612 * there is a buffer of free pages available to give compaction 2611 return true;
2613 * a reasonable chance of completing and allocating the page 2612 if (suitable == COMPACT_SKIPPED)
2614 */ 2613 /* Compaction cannot yet proceed. Do reclaim. */
2615 watermark = high_wmark_pages(zone) + (2UL << sc->order); 2614 return false;
2616 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2617
2618 /*
2619 * If compaction is deferred, reclaim up to a point where
2620 * compaction will have a chance of success when re-enabled
2621 */
2622 if (compaction_deferred(zone, sc->order))
2623 return watermark_ok;
2624 2615
2625 /* 2616 /*
2626 * If compaction is not ready to start and allocation is not likely 2617 * Compaction is already possible, but it takes time to run and there
2627 * to succeed without it, then keep reclaiming. 2618 * are potentially other callers using the pages just freed. So proceed
2619 * with reclaim to make a buffer of free pages available to give
2620 * compaction a reasonable chance of completing and allocating the page.
2621 * Note that we won't actually reclaim the whole buffer in one attempt
2622 * as the target watermark in should_continue_reclaim() is lower. But if
2623 * we are already above the high+gap watermark, don't reclaim at all.
2628 */ 2624 */
2629 if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) 2625 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2630 return false;
2631 2626
2632 return watermark_ok; 2627 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2633} 2628}
2634 2629
2635/* 2630/*
@@ -3041,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3041 */ 3036 */
3042 nid = mem_cgroup_select_victim_node(memcg); 3037 nid = mem_cgroup_select_victim_node(memcg);
3043 3038
3044 zonelist = NODE_DATA(nid)->node_zonelists; 3039 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3045 3040
3046 trace_mm_vmscan_memcg_reclaim_begin(0, 3041 trace_mm_vmscan_memcg_reclaim_begin(0,
3047 sc.may_writepage, 3042 sc.may_writepage,
@@ -3169,7 +3164,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
3169 * excessive reclaim. Assume that a process requested a high-order 3164 * excessive reclaim. Assume that a process requested a high-order
3170 * can direct reclaim/compact. 3165 * can direct reclaim/compact.
3171 */ 3166 */
3172 if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) 3167 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3173 sc->order = 0; 3168 sc->order = 0;
3174 3169
3175 return sc->nr_scanned >= sc->nr_to_reclaim; 3170 return sc->nr_scanned >= sc->nr_to_reclaim;
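Most of the vmscan.c changes revolve around one idea: the ad-hoc "2UL << sc->order" buffer used when deciding how much to reclaim ahead of compaction becomes a named compact_gap() helper, COMPACT_PARTIAL becomes COMPACT_SUCCESS, and compaction_ready() is restructured around compaction_suitable(): if the allocation would already succeed, stop reclaiming; if compaction cannot even start, keep reclaiming; otherwise reclaim only until free pages exceed the high watermark plus the gap. Judging by the expressions it replaces, compact_gap(order) presumably wraps the same twice-the-request-size heuristic; the sketch below models the rewritten decision with stand-in types, so the enum values and the helper body are assumptions rather than the kernel's definitions.

#include <stdbool.h>
#include <stdio.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_CONTINUE, COMPACT_SUCCESS };

/* Assumed to mirror the old open-coded "2UL << order" buffer. */
static unsigned long compact_gap(unsigned int order)
{
	return 2UL << order;
}

/*
 * Stand-alone model of the rewritten compaction_ready(): true means
 * reclaim can stop and compaction (or the allocation itself) should
 * be given a chance.
 */
static bool compaction_ready(enum compact_result suitable,
			     unsigned long free_pages,
			     unsigned long high_wmark,
			     unsigned int order)
{
	if (suitable == COMPACT_SUCCESS)
		return true;		/* allocation would already succeed */
	if (suitable == COMPACT_SKIPPED)
		return false;		/* not enough memory even to compact */

	/* Compaction could run; stop reclaiming once a gap is built up. */
	return free_pages >= high_wmark + compact_gap(order);
}

int main(void)
{
	printf("order-3 gap: %lu pages\n", compact_gap(3));
	printf("ready with no buffer?   %d\n",
	       compaction_ready(COMPACT_CONTINUE, 1000, 1000, 3));
	printf("ready with a full gap?  %d\n",
	       compaction_ready(COMPACT_CONTINUE, 1100, 1000, 3));
	return 0;
}

The other two hunks are smaller cleanups: shrink_node_memcg() drops its throttle_vm_writeout() call, and the memcg reclaim path selects the ZONELIST_FALLBACK zonelist explicitly instead of relying on it being the first array entry.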
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 89cec42d19ff..604f26a4f696 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1254 return 0; 1254 return 0;
1255} 1255}
1256 1256
1257#ifdef CONFIG_PAGE_OWNER
1258static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
1259 pg_data_t *pgdat,
1260 struct zone *zone)
1261{
1262 struct page *page;
1263 struct page_ext *page_ext;
1264 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
1265 unsigned long end_pfn = pfn + zone->spanned_pages;
1266 unsigned long count[MIGRATE_TYPES] = { 0, };
1267 int pageblock_mt, page_mt;
1268 int i;
1269
1270 /* Scan block by block. First and last block may be incomplete */
1271 pfn = zone->zone_start_pfn;
1272
1273 /*
1274 * Walk the zone in pageblock_nr_pages steps. If a page block spans
1275 * a zone boundary, it will be double counted between zones. This does
1276 * not matter as the mixed block count will still be correct
1277 */
1278 for (; pfn < end_pfn; ) {
1279 if (!pfn_valid(pfn)) {
1280 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
1281 continue;
1282 }
1283
1284 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
1285 block_end_pfn = min(block_end_pfn, end_pfn);
1286
1287 page = pfn_to_page(pfn);
1288 pageblock_mt = get_pageblock_migratetype(page);
1289
1290 for (; pfn < block_end_pfn; pfn++) {
1291 if (!pfn_valid_within(pfn))
1292 continue;
1293
1294 page = pfn_to_page(pfn);
1295
1296 if (page_zone(page) != zone)
1297 continue;
1298
1299 if (PageBuddy(page)) {
1300 pfn += (1UL << page_order(page)) - 1;
1301 continue;
1302 }
1303
1304 if (PageReserved(page))
1305 continue;
1306
1307 page_ext = lookup_page_ext(page);
1308 if (unlikely(!page_ext))
1309 continue;
1310
1311 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
1312 continue;
1313
1314 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
1315 if (pageblock_mt != page_mt) {
1316 if (is_migrate_cma(pageblock_mt))
1317 count[MIGRATE_MOVABLE]++;
1318 else
1319 count[pageblock_mt]++;
1320
1321 pfn = block_end_pfn;
1322 break;
1323 }
1324 pfn += (1UL << page_ext->order) - 1;
1325 }
1326 }
1327
1328 /* Print counts */
1329 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1330 for (i = 0; i < MIGRATE_TYPES; i++)
1331 seq_printf(m, "%12lu ", count[i]);
1332 seq_putc(m, '\n');
1333}
1334#endif /* CONFIG_PAGE_OWNER */
1335
1336/* 1257/*
1337 * Print out the number of pageblocks for each migratetype that contain pages 1258 * Print out the number of pageblocks for each migratetype that contain pages
1338 * of other types. This gives an indication of how well fallbacks are being 1259 * of other types. This gives an indication of how well fallbacks are being
@@ -1592,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg)
1592{ 1513{
1593 unsigned long *l = arg; 1514 unsigned long *l = arg;
1594 unsigned long off = l - (unsigned long *)m->private; 1515 unsigned long off = l - (unsigned long *)m->private;
1595 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 1516
1517 seq_puts(m, vmstat_text[off]);
1518 seq_put_decimal_ull(m, " ", *l);
1519 seq_putc(m, '\n');
1596 return 0; 1520 return 0;
1597} 1521}
1598 1522
@@ -1794,6 +1718,16 @@ static void __init start_shepherd_timer(void)
1794 round_jiffies_relative(sysctl_stat_interval)); 1718 round_jiffies_relative(sysctl_stat_interval));
1795} 1719}
1796 1720
1721static void __init init_cpu_node_state(void)
1722{
1723 int cpu;
1724
1725 get_online_cpus();
1726 for_each_online_cpu(cpu)
1727 node_set_state(cpu_to_node(cpu), N_CPU);
1728 put_online_cpus();
1729}
1730
1797static void vmstat_cpu_dead(int node) 1731static void vmstat_cpu_dead(int node)
1798{ 1732{
1799 int cpu; 1733 int cpu;
@@ -1851,6 +1785,7 @@ static int __init setup_vmstat(void)
1851#ifdef CONFIG_SMP 1785#ifdef CONFIG_SMP
1852 cpu_notifier_register_begin(); 1786 cpu_notifier_register_begin();
1853 __register_cpu_notifier(&vmstat_notifier); 1787 __register_cpu_notifier(&vmstat_notifier);
1788 init_cpu_node_state();
1854 1789
1855 start_shepherd_timer(); 1790 start_shepherd_timer();
1856 cpu_notifier_register_done(); 1791 cpu_notifier_register_done();
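The vmstat.c changes are largely independent of one another: the CONFIG_PAGE_OWNER mixed-count printer leaves this file (the diffstat's growth of mm/page_owner.c suggests that is where it moved), each /proc/vmstat line is emitted with seq_puts()/seq_put_decimal_ull()/seq_putc() instead of a printf-style seq_printf(), and a new init_cpu_node_state() marks the node of every already-online CPU as N_CPU at init time rather than only via later hotplug callbacks. The formatting change produces byte-for-byte identical output; the userland sketch below assembles the equivalent line piecewise, using simple buffer-appending stand-ins for the seq_file helpers (the names and the hand-rolled decimal conversion are for the demo only).

#include <stdio.h>
#include <string.h>

static void put_str(char *buf, const char *s)
{
	strcat(buf, s);
}

/* Append a delimiter and the decimal form of num, no format parsing. */
static void put_decimal_ull(char *buf, const char *delimiter,
			    unsigned long long num)
{
	char digits[24];
	int i = 0;

	strcat(buf, delimiter);
	do {
		digits[i++] = (char)('0' + num % 10);
		num /= 10;
	} while (num);
	while (i--)
		strncat(buf, &digits[i], 1);
}

int main(void)
{
	char line[128] = "";

	put_str(line, "nr_free_pages");
	put_decimal_ull(line, " ", 123456);
	put_str(line, "\n");

	/* Same text the old seq_printf(m, "%s %lu\n", ...) path produced. */
	fputs(line, stdout);
	return 0;
}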