path: root/mm
author     Linus Torvalds <torvalds@linux-foundation.org>  2012-03-28 20:19:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-03-28 20:19:28 -0400
commit     532bfc851a7475fb6a36c1e953aa395798a7cca7 (patch)
tree       a7892e5a31330dd59f31959efbe9fda1803784fd /mm
parent     0195c00244dc2e9f522475868fa278c473ba7339 (diff)
parent     8da00edc1069f01c34510fa405dc15d96c090a3f (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge third batch of patches from Andrew Morton:

 - Some MM stragglers

 - core SMP library cleanups (on_each_cpu_mask)

 - Some IPI optimisations

 - kexec

 - kdump

 - IPMI

 - the radix-tree iterator work

 - various other misc bits.

"That'll do for -rc1.  I still have ~10 patches for 3.4, will send those
along when they've baked a little more."

* emailed from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
  backlight: fix typo in tosa_lcd.c
  crc32: add help text for the algorithm select option
  mm: move hugepage test examples to tools/testing/selftests/vm
  mm: move slabinfo.c to tools/vm
  mm: move page-types.c from Documentation to tools/vm
  selftests/Makefile: make `run_tests' depend on `all'
  selftests: launch individual selftests from the main Makefile
  radix-tree: use iterators in find_get_pages* functions
  radix-tree: rewrite gang lookup using iterator
  radix-tree: introduce bit-optimized iterator
  fs/proc/namespaces.c: prevent crash when ns_entries[] is empty
  nbd: rename the nbd_device variable from lo to nbd
  pidns: add reboot_pid_ns() to handle the reboot syscall
  sysctl: use bitmap library functions
  ipmi: use locks on watchdog timeout set on reboot
  ipmi: simplify locking
  ipmi: fix message handling during panics
  ipmi: use a tasklet for handling received messages
  ipmi: increase KCS timeouts
  ipmi: decrease the IPMI message transaction time in interrupt mode
  ...
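As background for the mm/filemap.c hunks below, the radix-tree iterator work replaces the old radix_tree_gang_lookup_slot() calls with a slot-walking macro. The following is a minimal sketch of that idiom, written against the 3.4-era API as merged here; count_present() is a hypothetical helper used only to illustrate the pattern and is not part of this merge.

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

/* Hypothetical example: count non-empty slots at or above 'first'. */
static unsigned long count_present(struct radix_tree_root *root,
                                   unsigned long first)
{
        struct radix_tree_iter iter;
        void **slot;
        unsigned long nr;

        rcu_read_lock();
restart:
        nr = 0;
        radix_tree_for_each_slot(slot, root, &iter, first) {
                void *entry = radix_tree_deref_slot(slot);

                if (!entry)
                        continue;
                if (radix_tree_deref_retry(entry)) {
                        /* slot moved under us: start over, dropping the partial count */
                        goto restart;
                }
                nr++;
        }
        rcu_read_unlock();
        return nr;
}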
Diffstat (limited to 'mm')
 -rw-r--r--   mm/filemap.c      86
 -rw-r--r--   mm/memcontrol.c    4
 -rw-r--r--   mm/page_alloc.c   44
 -rw-r--r--   mm/slub.c         10
 -rw-r--r--   mm/swapfile.c      3
 -rw-r--r--   mm/truncate.c     40
6 files changed, 136 insertions, 51 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index c3811bc6b9e3..79c4b2b0b14e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
                            unsigned int nr_pages, struct page **pages)
 {
-        unsigned int i;
-        unsigned int ret;
-        unsigned int nr_found, nr_skip;
+        struct radix_tree_iter iter;
+        void **slot;
+        unsigned ret = 0;
+
+        if (unlikely(!nr_pages))
+                return 0;

         rcu_read_lock();
 restart:
-        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                                (void ***)pages, NULL, start, nr_pages);
-        ret = 0;
-        nr_skip = 0;
-        for (i = 0; i < nr_found; i++) {
+        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                 struct page *page;
 repeat:
-                page = radix_tree_deref_slot((void **)pages[i]);
+                page = radix_tree_deref_slot(slot);
                 if (unlikely(!page))
                         continue;

@@ -837,7 +836,7 @@ repeat:
                          * when entry at index 0 moves out of or back
                          * to root: none yet gotten, safe to restart.
                          */
-                        WARN_ON(start | i);
+                        WARN_ON(iter.index);
                         goto restart;
                 }
                 /*
@@ -845,7 +844,6 @@ repeat:
                          * here as an exceptional entry: so skip over it -
                          * we only reach this from invalidate_mapping_pages().
                          */
-                        nr_skip++;
                         continue;
                 }

@@ -853,21 +851,16 @@ repeat:
                         goto repeat;

                 /* Has the page moved? */
-                if (unlikely(page != *((void **)pages[i]))) {
+                if (unlikely(page != *slot)) {
                         page_cache_release(page);
                         goto repeat;
                 }

                 pages[ret] = page;
-                ret++;
+                if (++ret == nr_pages)
+                        break;
         }

-        /*
-         * If all entries were removed before we could secure them,
-         * try again, because callers stop trying once 0 is returned.
-         */
-        if (unlikely(!ret && nr_found > nr_skip))
-                goto restart;
         rcu_read_unlock();
         return ret;
 }
@@ -887,21 +880,22 @@ repeat:
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                                unsigned int nr_pages, struct page **pages)
 {
-        unsigned int i;
-        unsigned int ret;
-        unsigned int nr_found;
+        struct radix_tree_iter iter;
+        void **slot;
+        unsigned int ret = 0;
+
+        if (unlikely(!nr_pages))
+                return 0;

         rcu_read_lock();
 restart:
-        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                                (void ***)pages, NULL, index, nr_pages);
-        ret = 0;
-        for (i = 0; i < nr_found; i++) {
+        radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
                 struct page *page;
 repeat:
-                page = radix_tree_deref_slot((void **)pages[i]);
+                page = radix_tree_deref_slot(slot);
+                /* The hole, there no reason to continue */
                 if (unlikely(!page))
-                        continue;
+                        break;

                 if (radix_tree_exception(page)) {
                         if (radix_tree_deref_retry(page)) {
@@ -924,7 +918,7 @@ repeat:
                         goto repeat;

                 /* Has the page moved? */
-                if (unlikely(page != *((void **)pages[i]))) {
+                if (unlikely(page != *slot)) {
                         page_cache_release(page);
                         goto repeat;
                 }
@@ -934,14 +928,14 @@ repeat:
                  * otherwise we can get both false positives and false
                  * negatives, which is just confusing to the caller.
                  */
-                if (page->mapping == NULL || page->index != index) {
+                if (page->mapping == NULL || page->index != iter.index) {
                         page_cache_release(page);
                         break;
                 }

                 pages[ret] = page;
-                ret++;
-                index++;
+                if (++ret == nr_pages)
+                        break;
         }
         rcu_read_unlock();
         return ret;
@@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
                         int tag, unsigned int nr_pages, struct page **pages)
 {
-        unsigned int i;
-        unsigned int ret;
-        unsigned int nr_found;
+        struct radix_tree_iter iter;
+        void **slot;
+        unsigned ret = 0;
+
+        if (unlikely(!nr_pages))
+                return 0;

         rcu_read_lock();
 restart:
-        nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
-                                (void ***)pages, *index, nr_pages, tag);
-        ret = 0;
-        for (i = 0; i < nr_found; i++) {
+        radix_tree_for_each_tagged(slot, &mapping->page_tree,
+                                   &iter, *index, tag) {
                 struct page *page;
 repeat:
-                page = radix_tree_deref_slot((void **)pages[i]);
+                page = radix_tree_deref_slot(slot);
                 if (unlikely(!page))
                         continue;

@@ -998,21 +993,16 @@ repeat:
                         goto repeat;

                 /* Has the page moved? */
-                if (unlikely(page != *((void **)pages[i]))) {
+                if (unlikely(page != *slot)) {
                         page_cache_release(page);
                         goto repeat;
                 }

                 pages[ret] = page;
-                ret++;
+                if (++ret == nr_pages)
+                        break;
         }

-        /*
-         * If all entries were removed before we could secure them,
-         * try again, because callers stop trying once 0 is returned.
-         */
-        if (unlikely(!ret && nr_found))
-                goto restart;
         rcu_read_unlock();

         if (ret)
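For reference, the "callers stop trying once 0 is returned" contract mentioned in the removed comments looks roughly like the loop below: a caller pulls pages in batches, records where to resume, and drops its references. The old gang-lookup code had to restart when every returned slot turned out stale so it would not return a spurious 0; the iterator simply keeps walking, so that fallback goes away. This is a hedged sketch of such a caller; walk_mapping() is an invented name, not a kernel function.

#include <linux/kernel.h>
#include <linux/pagemap.h>

/* Hypothetical example: visit every page currently cached in 'mapping'. */
static void walk_mapping(struct address_space *mapping)
{
        struct page *pages[16];
        pgoff_t index = 0;
        unsigned nr, i;

        while ((nr = find_get_pages(mapping, index, ARRAY_SIZE(pages), pages))) {
                for (i = 0; i < nr; i++) {
                        /* remember where to resume before dropping the reference */
                        index = pages[i]->index + 1;
                        page_cache_release(pages[i]);
                }
        }
}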
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2ee6df0e9bb..7d698df4a067 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                 return 0;
         }

+        if (pmd_trans_unstable(pmd))
+                return 0;
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; pte++, addr += PAGE_SIZE)
                 if (get_mctgt_type(vma, addr, *pte, NULL))
@@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                 return 0;
         }

+        if (pmd_trans_unstable(pmd))
+                return 0;
 retry:
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caea788628e4..a712fb9e04ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg)
 }

 /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
  */
 void drain_all_pages(void)
 {
-        on_each_cpu(drain_local_pages, NULL, 1);
+        int cpu;
+        struct per_cpu_pageset *pcp;
+        struct zone *zone;
+
+        /*
+         * Allocate in the BSS so we wont require allocation in
+         * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+         */
+        static cpumask_t cpus_with_pcps;
+
+        /*
+         * We don't care about racing with CPU hotplug event
+         * as offline notification will cause the notified
+         * cpu to drain that CPU pcps and on_each_cpu_mask
+         * disables preemption as part of its processing
+         */
+        for_each_online_cpu(cpu) {
+                bool has_pcps = false;
+                for_each_populated_zone(zone) {
+                        pcp = per_cpu_ptr(zone->pageset, cpu);
+                        if (pcp->pcp.count) {
+                                has_pcps = true;
+                                break;
+                        }
+                }
+                if (has_pcps)
+                        cpumask_set_cpu(cpu, &cpus_with_pcps);
+                else
+                        cpumask_clear_cpu(cpu, &cpus_with_pcps);
+        }
+        on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
 }

 #ifdef CONFIG_HIBERNATION
@@ -2308,6 +2344,10 @@ rebalance:
         if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                 if (oom_killer_disabled)
                         goto nopage;
+                /* Coredumps can quickly deplete all memory reserves */
+                if ((current->flags & PF_DUMPCORE) &&
+                    !(gfp_mask & __GFP_NOFAIL))
+                        goto nopage;
                 page = __alloc_pages_may_oom(gfp_mask, order,
                                                 zonelist, high_zoneidx,
                                                 nodemask, preferred_zone,
diff --git a/mm/slub.c b/mm/slub.c
index 64d9966d16bc..ffe13fdf8144 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2035,9 +2035,17 @@ static void flush_cpu_slab(void *d)
         __flush_cpu_slab(s, smp_processor_id());
 }

+static bool has_cpu_slab(int cpu, void *info)
+{
+        struct kmem_cache *s = info;
+        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+        return !!(c->page);
+}
+
 static void flush_all(struct kmem_cache *s)
 {
-        on_each_cpu(flush_cpu_slab, s, 1);
+        on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
 }

 /*
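The on_each_cpu_cond() call above comes from the SMP library cleanups in this same batch: a predicate is evaluated for every online CPU, and the IPI is sent only to CPUs for which it returns true. The sketch below illustrates that pairing, assuming the 3.4-era prototype on_each_cpu_cond(cond_func, func, info, wait, gfp_flags); the per-CPU variable and function names are invented for illustration.

#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/gfp.h>

static DEFINE_PER_CPU(unsigned int, pending_work);

/* Predicate: does this CPU have anything queued? Runs without an IPI. */
static bool cpu_has_pending(int cpu, void *info)
{
        return per_cpu(pending_work, cpu) != 0;
}

/* IPI handler: runs only on the CPUs selected by the predicate. */
static void flush_pending(void *info)
{
        this_cpu_write(pending_work, 0);
}

static void flush_all_pending(void)
{
        on_each_cpu_cond(cpu_has_pending, flush_pending, NULL, 1, GFP_ATOMIC);
}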
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dae42f380d6e..fafc26d1b1dc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         struct page *page = NULL;
         struct inode *inode = NULL;

+        if (swap_flags & ~SWAP_FLAGS_VALID)
+                return -EINVAL;
+
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;

diff --git a/mm/truncate.c b/mm/truncate.c
index 18aded3a89fc..61a183b89df6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)

         return 0;
 }
+
+/**
+ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
+ * @inode: inode
+ * @lstart: offset of beginning of hole
+ * @lend: offset of last byte of hole
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+        struct address_space *mapping = inode->i_mapping;
+        loff_t unmap_start = round_up(lstart, PAGE_SIZE);
+        loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
+        /*
+         * This rounding is currently just for example: unmap_mapping_range
+         * expands its hole outwards, whereas we want it to contract the hole
+         * inwards.  However, existing callers of truncate_pagecache_range are
+         * doing their own page rounding first; and truncate_inode_pages_range
+         * currently BUGs if lend is not pagealigned-1 (it handles partial
+         * page at start of hole, but not partial page at end of hole).  Note
+         * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+         */
+
+        /*
+         * Unlike in truncate_pagecache, unmap_mapping_range is called only
+         * once (before truncating pagecache), and without "even_cows" flag:
+         * hole-punching should not remove private COWed pages from the hole.
+         */
+        if ((u64)unmap_end > (u64)unmap_start)
+                unmap_mapping_range(mapping, unmap_start,
+                                    1 + unmap_end - unmap_start, 0);
+        truncate_inode_pages_range(mapping, lstart, lend);
+}
+EXPORT_SYMBOL(truncate_pagecache_range);
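As the kerneldoc above suggests, a filesystem's hole-punch path would drop the pagecache over the hole first and only then free the underlying blocks. This is a hedged sketch of such a caller; myfs_punch_hole() and myfs_free_blocks() are invented names for an imaginary filesystem and are not part of this merge.

#include <linux/fs.h>
#include <linux/mm.h>

/* Filesystem-specific block deallocation, assumed to exist elsewhere. */
static int myfs_free_blocks(struct inode *inode, loff_t offset, loff_t len);

/* Hypothetical hole-punch helper for an imaginary filesystem. */
static int myfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        loff_t end = offset + len - 1;

        /* Keep pagecache coherent with disk: drop cached pages first... */
        truncate_pagecache_range(inode, offset, end);

        /* ...then deallocate the on-disk blocks. */
        return myfs_free_blocks(inode, offset, len);
}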