8 files changed, 133 insertions, 45 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c
index b8024fa7101d..6c707bfe02fd 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -126,6 +126,17 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
                 */
                start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
                end_index = (endbyte >> PAGE_SHIFT);
+                if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
+                        /* First page is tricky as 0 - 1 = -1, but pgoff_t
+                         * is unsigned, so the end_index >= start_index
+                         * check below would be true and we'll discard the whole
+                         * file cache which is not what was asked.
+                         */
+                        if (end_index == 0)
+                                break;
+                        end_index--;
+                }
                if (end_index >= start_index) {
                        unsigned long count = invalidate_mapping_pages(mapping,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d26162e81fea..388c2bb9b55c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -832,8 +832,27 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
         * Only the process that called mmap() has reserves for
         * private mappings.
         */
-        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
-                return true;
+                /*
+                 * Like the shared case above, a hole punch or truncate
+                 * could have been performed on the private mapping.
+                 * Examine the value of chg to determine if reserves
+                 * actually exist or were previously consumed.
+                 * Very Subtle - The value of chg comes from a previous
+                 * call to vma_needs_reserves().  The reserve map for
+                 * private mappings has different (opposite) semantics
+                 * than that of shared mappings.  vma_needs_reserves()
+                 * has already taken this difference in semantics into
+                 * account.  Therefore, the meaning of chg is the same
+                 * as in the shared case above.  Code could easily be
+                 * combined, but keeping it separate draws attention to
+                 * subtle differences.
+                 */
+                if (chg)
+                        return false;
+                else
+                        return true;
+        }
        return false;
 }
@@ -1816,6 +1835,25 @@ static long __vma_reservation_common(struct hstate *h,
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
+        else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
+                /*
+                 * In most cases, reserves always exist for private mappings.
+                 * However, a file associated with the mapping could have been
+                 * hole punched or truncated after reserves were consumed.
+                 * A subsequent fault on such a range will not use reserves.
+                 * Subtle - The reserve map for private mappings has the
+                 * opposite meaning to that of shared mappings.  If NO
+                 * entry is in the reserve map, it means a reservation exists.
+                 * If an entry exists in the reserve map, it means the
+                 * reservation has already been consumed.  As a result, the
+                 * return value of this routine is the opposite of the
+                 * value returned from reserve map manipulation routines above.
+                 */
+                if (ret)
+                        return 0;
+                else
+                        return 1;
+        }
        else
                return ret < 0 ? ret : 0;
 }
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 18b6a2b8d183..28439acda6ec 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -763,8 +763,8 @@ static int kasan_mem_notifier(struct notifier_block *nb,
 static int __init kasan_memhotplug_init(void)
 {
-        pr_err("WARNING: KASAN doesn't support memory hot-add\n");
+        pr_info("WARNING: KASAN doesn't support memory hot-add\n");
-        pr_err("Memory hot-add will be disabled\n");
+        pr_info("Memory hot-add will be disabled\n");
        hotplug_memory_notifier(kasan_mem_notifier, 0);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 58c69c94402a..75e74408cc8f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1608,7 +1608,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        if (!current->memcg_may_oom || current->memcg_in_oom)
+        if (!current->memcg_may_oom)
                return;
        /*
         * We are in the middle of the charge context here, so we
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b9956fdee8f5..e2481949494c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -373,8 +373,9 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
        struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
        unsigned long bytes = vm_dirty_bytes;
        unsigned long bg_bytes = dirty_background_bytes;
-        unsigned long ratio = vm_dirty_ratio;
+        /* convert ratios to per-PAGE_SIZE for higher precision */
-        unsigned long bg_ratio = dirty_background_ratio;
+        unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
+        unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
        unsigned long thresh;
        unsigned long bg_thresh;
        struct task_struct *tsk;
@@ -386,26 +387,28 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
                /*
                 * The byte settings can't be applied directly to memcg
                 * domains.  Convert them to ratios by scaling against
-                 * globally available memory.
+                 * globally available memory.  As the ratios are in
+                 * per-PAGE_SIZE, they can be obtained by dividing bytes by
+                 * number of pages.
                 */
                if (bytes)
-                        ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+                        ratio = min(DIV_ROUND_UP(bytes, global_avail),
-                                    global_avail, 100UL);
+                                    PAGE_SIZE);
                if (bg_bytes)
-                        bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+                        bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
-                                       global_avail, 100UL);
+                                       PAGE_SIZE);
                bytes = bg_bytes = 0;
        }
        if (bytes)
                thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
-                thresh = (ratio * available_memory) / 100;
+                thresh = (ratio * available_memory) / PAGE_SIZE;
        if (bg_bytes)
                bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
-                bg_thresh = (bg_ratio * available_memory) / 100;
+                bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
        if (bg_thresh >= thresh)
                bg_thresh = thresh / 2;
diff --git a/mm/percpu.c b/mm/percpu.c
index 0c59684f1ff2..9903830aaebb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -112,7 +112,7 @@ struct pcpu_chunk {
        int                     map_used;       /* # of map entries used before the sentry */
        int                     map_alloc;      /* # of map entries allocated */
        int                     *map;           /* allocation map */
-        struct work_struct      map_extend_work;/* async ->map[] extension */
+        struct list_head        map_extend_list;/* on pcpu_map_extend_chunks */
        void                    *data;          /* chunk data */
        int                     first_free;     /* no free below this */
@@ -162,10 +162,13 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 static DEFINE_SPINLOCK(pcpu_lock);      /* all internal data structures */
-static DEFINE_MUTEX(pcpu_alloc_mutex);  /* chunk create/destroy, [de]pop */
+static DEFINE_MUTEX(pcpu_alloc_mutex);  /* chunk create/destroy, [de]pop, map ext */
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/* chunks which need their map areas extended, protected by pcpu_lock */
+static LIST_HEAD(pcpu_map_extend_chunks);
 /*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
@@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
 {
        int margin, new_alloc;
+        lockdep_assert_held(&pcpu_lock);
        if (is_atomic) {
                margin = 3;
                if (chunk->map_alloc <
-                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
-                    pcpu_async_enabled)
+                        if (list_empty(&chunk->map_extend_list)) {
-                        schedule_work(&chunk->map_extend_work);
+                                list_add_tail(&chunk->map_extend_list,
+                                              &pcpu_map_extend_chunks);
+                                pcpu_schedule_balance_work();
+                        }
+                }
        } else {
                margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
        }
@@ -435,6 +444,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
        size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
        unsigned long flags;
+        lockdep_assert_held(&pcpu_alloc_mutex);
        new = pcpu_mem_zalloc(new_size);
        if (!new)
                return -ENOMEM;
@@ -467,20 +478,6 @@ out_unlock:
        return 0;
 }
-static void pcpu_map_extend_workfn(struct work_struct *work)
-{
-        struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
-                                                map_extend_work);
-        int new_alloc;
-        spin_lock_irq(&pcpu_lock);
-        new_alloc = pcpu_need_to_extend(chunk, false);
-        spin_unlock_irq(&pcpu_lock);
-        if (new_alloc)
-                pcpu_extend_area_map(chunk, new_alloc);
-}
 /**
 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
 * @chunk: chunk the candidate area belongs to
@@ -740,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
        chunk->map_used = 1;
        INIT_LIST_HEAD(&chunk->list);
-        INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
+        INIT_LIST_HEAD(&chunk->map_extend_list);
        chunk->free_size = pcpu_unit_size;
        chunk->contig_hint = pcpu_unit_size;
@@ -895,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                return NULL;
        }
+        if (!is_atomic)
+                mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irqsave(&pcpu_lock, flags);
        /* serve reserved allocations from the reserved chunk if available */
@@ -967,12 +967,9 @@ restart:
        if (is_atomic)
                goto fail;
-        mutex_lock(&pcpu_alloc_mutex);
        if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
                chunk = pcpu_create_chunk();
                if (!chunk) {
-                        mutex_unlock(&pcpu_alloc_mutex);
                        err = "failed to allocate new chunk";
                        goto fail;
                }
@@ -983,7 +980,6 @@ restart:
                spin_lock_irqsave(&pcpu_lock, flags);
        }
-        mutex_unlock(&pcpu_alloc_mutex);
        goto restart;
 area_found:
@@ -993,8 +989,6 @@ area_found:
        if (!is_atomic) {
                int page_start, page_end, rs, re;
-                mutex_lock(&pcpu_alloc_mutex);
                page_start = PFN_DOWN(off);
                page_end = PFN_UP(off + size);
@@ -1005,7 +999,6 @@ area_found:
                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
-                                mutex_unlock(&pcpu_alloc_mutex);
                                pcpu_free_area(chunk, off, &occ_pages);
                                err = "failed to populate";
                                goto fail_unlock;
@@ -1045,6 +1038,8 @@ fail:
                /* see the flag handling in pcpu_blance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
+        } else {
+                mutex_unlock(&pcpu_alloc_mutex);
        }
        return NULL;
 }
@@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;
+                list_del_init(&chunk->map_extend_list);
                list_move(&chunk->list, &to_free);
        }
@@ -1146,6 +1142,25 @@ static void pcpu_balance_workfn(struct work_struct *work)
                pcpu_destroy_chunk(chunk);
        }
+        /* service chunks which requested async area map extension */
+        do {
+                int new_alloc = 0;
+                spin_lock_irq(&pcpu_lock);
+                chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
+                                        struct pcpu_chunk, map_extend_list);
+                if (chunk) {
+                        list_del_init(&chunk->map_extend_list);
+                        new_alloc = pcpu_need_to_extend(chunk, false);
+                }
+                spin_unlock_irq(&pcpu_lock);
+                if (new_alloc)
+                        pcpu_extend_area_map(chunk, new_alloc);
+        } while (chunk);
        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
@@ -1644,7 +1659,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
         */
        schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
        INIT_LIST_HEAD(&schunk->list);
-        INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
+        INIT_LIST_HEAD(&schunk->map_extend_list);
        schunk->base_addr = base_addr;
        schunk->map = smap;
        schunk->map_alloc = ARRAY_SIZE(smap);
@@ -1673,7 +1688,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        if (dyn_size) {
                dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
                INIT_LIST_HEAD(&dchunk->list);
-                INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
+                INIT_LIST_HEAD(&dchunk->map_extend_list);
                dchunk->base_addr = base_addr;
                dchunk->map = dmap;
                dchunk->map_alloc = ARRAY_SIZE(dmap);
diff --git a/mm/swap.c b/mm/swap.c
index 95916142fc46..59f5fafa6e1f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -667,6 +667,24 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+/*
+ * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
+ * workqueue, aiding in getting memory freed.
+ */
+static struct workqueue_struct *lru_add_drain_wq;
+static int __init lru_init(void)
+{
+        lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
+        if (WARN(!lru_add_drain_wq,
+                "Failed to create workqueue lru_add_drain_wq"))
+                return -ENOMEM;
+        return 0;
+}
+early_initcall(lru_init);
 void lru_add_drain_all(void)
 {
        static DEFINE_MUTEX(lock);
@@ -686,7 +704,7 @@ void lru_add_drain_all(void)
                    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
-                        schedule_work_on(cpu, work);
+                        queue_work_on(cpu, lru_add_drain_wq, work);
                        cpumask_set_cpu(cpu, &has_work);
                }
        }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0d457e7db8d6..c99463ac02fb 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -252,7 +252,10 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
        free_swap_cache(page);
-        put_page(page);
+        if (is_huge_zero_page(page))
+                put_huge_zero_page();
+        else
+                put_page(page);
 }
 /*