1 files changed, 200 insertions, 59 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
 #include <linux/sysfs.h>
 #include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
 /*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
        unsigned long nr_migratepages;  /* Number of pages to migrate */
        unsigned long free_pfn;         /* isolate_freepages search base */
        unsigned long migrate_pfn;      /* isolate_migratepages search base */
+        bool sync;                      /* Synchronous migration */
        /* Account for isolated anon and file pages */
        unsigned long nr_anon;
@@ -60,7 +64,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
                                struct list_head *freelist)
 {
        unsigned long zone_end_pfn, end_pfn;
-        int total_isolated = 0;
+        int nr_scanned = 0, total_isolated = 0;
        struct page *cursor;
        /* Get the last PFN we should scan for free pages at */
@@ -81,6 +85,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
                if (!pfn_valid_within(blockpfn))
                        continue;
+                nr_scanned++;
                if (!PageBuddy(page))
                        continue;
@@ -100,6 +105,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
                }
        }
+        trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
        return total_isolated;
 }
@@ -138,16 +144,26 @@ static void isolate_freepages(struct zone *zone,
        int nr_freepages = cc->nr_freepages;
        struct list_head *freelist = &cc->freepages;
+        /*
+         * Initialise the free scanner. The starting point is where we last
+         * scanned from (or the end of the zone if starting). The low point
+         * is the end of the pageblock the migration scanner is using.
+         */
        pfn = cc->free_pfn;
        low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-        high_pfn = low_pfn;
+        /*
+         * Take care that if the migration scanner is at the end of the zone
+         * that the free scanner does not accidentally move to the next zone
+         * in the next isolation cycle.
+         */
+        high_pfn = min(low_pfn, pfn);
        /*
         * Isolate free pages until enough are available to migrate the
         * pages on cc->migratepages. We stop searching if the migrate
         * and free page scanners meet or enough free pages are isolated.
         */
-        spin_lock_irqsave(&zone->lock, flags);
        for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
                                        pfn -= pageblock_nr_pages) {
                unsigned long isolated;
@@ -170,9 +186,19 @@ static void isolate_freepages(struct zone *zone,
                if (!suitable_migration_target(page))
                        continue;
-                /* Found a block suitable for isolating free pages from */
+                /*
-                isolated = isolate_freepages_block(zone, pfn, freelist);
+                 * Found a block suitable for isolating free pages from. Now
-                nr_freepages += isolated;
+                 * we disabled interrupts, double check things are ok and
+                 * isolate the pages. This is to minimise the time IRQs
+                 * are disabled
+                 */
+                isolated = 0;
+                spin_lock_irqsave(&zone->lock, flags);
+                if (suitable_migration_target(page)) {
+                        isolated = isolate_freepages_block(zone, pfn, freelist);
+                        nr_freepages += isolated;
+                }
+                spin_unlock_irqrestore(&zone->lock, flags);
                /*
                 * Record the highest PFN we isolated pages from. When next
@@ -182,7 +208,6 @@ static void isolate_freepages(struct zone *zone,
                if (isolated)
                        high_pfn = max(high_pfn, pfn);
        }
-        spin_unlock_irqrestore(&zone->lock, flags);
        /* split_free_page does not map the pages */
        list_for_each_entry(page, freelist, lru) {
@@ -226,14 +251,23 @@ static bool too_many_isolated(struct zone *zone)
        return isolated > (inactive + active) / 2;
 }
+/* Possible outcomes of isolate_migratepages(), used by compact_zone() */
+typedef enum {
+        ISOLATE_ABORT,          /* Abort compaction now, e.g. fatal signal */
+        ISOLATE_NONE,           /* No pages isolated, continue scanning */
+        ISOLATE_SUCCESS,        /* Pages isolated, migrate */
+} isolate_migrate_t;
 /*
 * Isolate all pages that can be migrated from the block pointed to by
 * the migrate scanner within compact_control.
 */
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
 {
        unsigned long low_pfn, end_pfn;
+        unsigned long last_pageblock_nr = 0, pageblock_nr;
+        unsigned long nr_scanned = 0, nr_isolated = 0;
        struct list_head *migratelist = &cc->migratepages;
        /* Do not scan outside zone boundaries */
@@ -245,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
        /* Do not cross the free scanner or scan within a memory hole */
        if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
                cc->migrate_pfn = end_pfn;
-                return 0;
+                return ISOLATE_NONE;
        }
        /*
@@ -254,33 +288,85 @@ static unsigned long isolate_migratepages(struct zone *zone,
         * delay for some time until fewer pages are isolated
         */
        while (unlikely(too_many_isolated(zone))) {
+                /* async migration should just abort */
+                if (!cc->sync)
+                        return ISOLATE_ABORT;
                congestion_wait(BLK_RW_ASYNC, HZ/10);
                if (fatal_signal_pending(current))
-                        return 0;
+                        return ISOLATE_ABORT;
        }
        /* Time to isolate some pages for migration */
+        cond_resched();
        spin_lock_irq(&zone->lru_lock);
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
+                bool locked = true;
+                /* give a chance to irqs before checking need_resched() */
+                if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+                        spin_unlock_irq(&zone->lru_lock);
+                        locked = false;
+                }
+                if (need_resched() || spin_is_contended(&zone->lru_lock)) {
+                        if (locked)
+                                spin_unlock_irq(&zone->lru_lock);
+                        cond_resched();
+                        spin_lock_irq(&zone->lru_lock);
+                        if (fatal_signal_pending(current))
+                                break;
+                } else if (!locked)
+                        spin_lock_irq(&zone->lru_lock);
                if (!pfn_valid_within(low_pfn))
                        continue;
+                nr_scanned++;
                /* Get the page and skip if free */
                page = pfn_to_page(low_pfn);
                if (PageBuddy(page))
                        continue;
+                /*
+                 * For async migration, also only scan in MOVABLE blocks. Async
+                 * migration is optimistic to see if the minimum amount of work
+                 * satisfies the allocation
+                 */
+                pageblock_nr = low_pfn >> pageblock_order;
+                if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+                                get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+                        low_pfn += pageblock_nr_pages;
+                        low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+                        last_pageblock_nr = pageblock_nr;
+                        continue;
+                }
+                if (!PageLRU(page))
+                        continue;
+                /*
+                 * PageLRU is set, and lru_lock excludes isolation,
+                 * splitting and collapsing (collapsing has already
+                 * happened if PageLRU is set).
+                 */
+                if (PageTransHuge(page)) {
+                        low_pfn += (1 << compound_order(page)) - 1;
+                        continue;
+                }
                /* Try isolate the page */
                if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
                        continue;
+                VM_BUG_ON(PageTransCompound(page));
                /* Successfully isolated */
                del_page_from_lru_list(zone, page, page_lru(page));
                list_add(&page->lru, migratelist);
-                mem_cgroup_del_lru(page);
                cc->nr_migratepages++;
+                nr_isolated++;
                /* Avoid isolating too much */
                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,7 +378,9 @@ static unsigned long isolate_migratepages(struct zone *zone,
        spin_unlock_irq(&zone->lru_lock);
        cc->migrate_pfn = low_pfn;
-        return cc->nr_migratepages;
+        trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+        return ISOLATE_SUCCESS;
 }
 /*
@@ -342,10 +430,10 @@ static void update_nr_listpages(struct compact_control *cc)
 }
 static int compact_finished(struct zone *zone,
-                                                struct compact_control *cc)
+                            struct compact_control *cc)
 {
        unsigned int order;
-        unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+        unsigned long watermark;
        if (fatal_signal_pending(current))
                return COMPACT_PARTIAL;
@@ -354,11 +442,18 @@ static int compact_finished(struct zone *zone,
        if (cc->free_pfn <= cc->migrate_pfn)
                return COMPACT_COMPLETE;
-        /* Compaction run is not finished if the watermark is not met */
+        /*
-        if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+         * order == -1 is expected when compacting via
+         * /proc/sys/vm/compact_memory
+         */
+        if (cc->order == -1)
                return COMPACT_CONTINUE;
-        if (cc->order == -1)
+        /* Compaction run is not finished if the watermark is not met */
+        watermark = low_wmark_pages(zone);
+        watermark += (1 << cc->order);
+        if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
                return COMPACT_CONTINUE;
        /* Direct compactor: Is a suitable page free? */
@@ -375,10 +470,71 @@ static int compact_finished(struct zone *zone,
        return COMPACT_CONTINUE;
 }
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - Too few free pages, or fragmentation is not the cause
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - Compaction should run now (always for order == -1)
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+        int fragindex;
+        unsigned long watermark;
+        /*
+         * order == -1 is expected when compacting via
+         * /proc/sys/vm/compact_memory; no suitability checks apply then
+         */
+        if (order == -1)
+                return COMPACT_CONTINUE;
+        /*
+         * The order-0 low watermark plus twice the request size (note the
+         * 2UL) must be met: during migration, copies of pages need to be
+         * allocated, so for a short time the footprint is higher
+         */
+        watermark = low_wmark_pages(zone) + (2UL << order);
+        if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+                return COMPACT_SKIPPED;
+        /*
+         * fragmentation index determines if allocation failures are due to
+         * low memory or external fragmentation:
+         * index of -1000 implies allocations might succeed depending on
+         * watermarks; that case is re-checked at the requested order below
+         * index towards 0 implies failure is due to lack of memory
+         * index towards 1000 implies failure is due to fragmentation
+         *
+         * Only compact if a failure would be due to fragmentation: an index
+         * in [0, sysctl_extfrag_threshold] returns COMPACT_SKIPPED.
+         */
+        fragindex = fragmentation_index(zone, order);
+        if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+                return COMPACT_SKIPPED;
+        if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+            0, 0))
+                return COMPACT_PARTIAL;
+        return COMPACT_CONTINUE;
+}
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
        int ret;
+        ret = compaction_suitable(zone, cc->order);
+        switch (ret) {
+        case COMPACT_PARTIAL:
+        case COMPACT_SKIPPED:
+                /* Compaction is likely to fail */
+                return ret;
+        case COMPACT_CONTINUE:
+                /* Fall through to compaction */
+                ;
+        }
        /* Setup to move all movable pages to the end of the zone */
        cc->migrate_pfn = zone->zone_start_pfn;
        cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -388,13 +544,22 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
                unsigned long nr_migrate, nr_remaining;
+                int err;
-                if (!isolate_migratepages(zone, cc))
+                switch (isolate_migratepages(zone, cc)) {
+                case ISOLATE_ABORT:
+                        ret = COMPACT_PARTIAL;
+                        goto out;
+                case ISOLATE_NONE:
                        continue;
+                case ISOLATE_SUCCESS:
+                        ;
+                }
                nr_migrate = cc->nr_migratepages;
-                migrate_pages(&cc->migratepages, compaction_alloc,
+                err = migrate_pages(&cc->migratepages, compaction_alloc,
-                                                (unsigned long)cc, 0);
+                                (unsigned long)cc, false,
+                                cc->sync);
                update_nr_listpages(cc);
                nr_remaining = cc->nr_migratepages;
@@ -402,15 +567,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
                if (nr_remaining)
                        count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+                trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+                                                nr_remaining);
                /* Release LRU pages not migrated */
-                if (!list_empty(&cc->migratepages)) {
+                if (err) {
                        putback_lru_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                }
        }
+out:
        /* Release free pages and check accounting */
        cc->nr_freepages -= release_freepages(&cc->freepages);
        VM_BUG_ON(cc->nr_freepages != 0);
@@ -418,8 +586,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        return ret;
 }
-static unsigned long compact_zone_order(struct zone *zone,
+unsigned long compact_zone_order(struct zone *zone,
-                                                int order, gfp_t gfp_mask)
+                                 int order, gfp_t gfp_mask,
+                                 bool sync)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -427,6 +596,7 @@ static unsigned long compact_zone_order(struct zone *zone,
                .order = order,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
+                .sync = sync,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +612,17 @@ int sysctl_extfrag_threshold = 500;
 * @order: The order of the current allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
 *
 * This is the main entry point for direct page compaction.
 */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
-                        int order, gfp_t gfp_mask, nodemask_t *nodemask)
+                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
+                        bool sync)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
        int may_perform_io = gfp_mask & __GFP_IO;
-        unsigned long watermark;
        struct zoneref *z;
        struct zone *zone;
        int rc = COMPACT_SKIPPED;
@@ -461,7 +632,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
         * made because an assumption is made that the page allocator can satisfy
         * the "cheaper" orders without taking special steps
         */
-        if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+        if (!order || !may_enter_fs || !may_perform_io)
                return rc;
        count_vm_event(COMPACTSTALL);
@@ -469,43 +640,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
        /* Compact each zone in the list */
        for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
                                                                nodemask) {
-                int fragindex;
                int status;
-                /*
+                status = compact_zone_order(zone, order, gfp_mask, sync);
-                 * Watermarks for order-0 must be met for compaction. Note
-                 * the 2UL. This is because during migration, copies of
-                 * pages need to be allocated and for a short time, the
-                 * footprint is higher
-                 */
-                watermark = low_wmark_pages(zone) + (2UL << order);
-                if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-                        continue;
-                /*
-                 * fragmentation index determines if allocation failures are
-                 * due to low memory or external fragmentation
-                 *
-                 * index of -1 implies allocations might succeed depending
-                 *      on watermarks
-                 * index towards 0 implies failure is due to lack of memory
-                 * index towards 1000 implies failure is due to fragmentation
-                 *
-                 * Only compact if a failure would be due to fragmentation.
-                 */
-                fragindex = fragmentation_index(zone, order);
-                if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-                        continue;
-                if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-                        rc = COMPACT_PARTIAL;
-                        break;
-                }
-                status = compact_zone_order(zone, order, gfp_mask);
                rc = max(status, rc);
-                if (zone_watermark_ok(zone, order, watermark, 0, 0))
+                /* If a normal allocation would succeed, stop compacting */
+                if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
                        break;
        }