author     Jens Axboe <axboe@kernel.dk>    2013-04-02 04:04:39 -0400
committer  Jens Axboe <axboe@kernel.dk>    2013-04-02 04:04:39 -0400
commit     64f8de4da7d3962632f152d3d702d68bb8accc29
tree       c90a872a6d91c824635d59572e1e578980f4bc98 /mm
parent     f1fb3449efd5c49b48e35746bc7283eb9c73e3a0
parent     b5c872ddb7083c7909fb76a170c3807e04564bb3
Merge branch 'writeback-workqueue' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq into for-3.10/core
Tejun writes:
-----
This is the pull request for the earlier patchset[1] with the same
name. It's only three patches (the first one was committed to
workqueue tree) but the merge strategy is a bit involved due to the
dependencies.
* Because the conversion needs features from wq/for-3.10,
block/for-3.10/core is based on rc3, and wq/for-3.10 has conflicts
with rc3, I pulled mainline (rc5) into wq/for-3.10 to prevent those
workqueue conflicts from flaring up in block tree.
* Resolving the issue that Jan and Dave raised about debugging
requires arch-wide changes. The patchset is being worked on[2] but
it'll have to go through -mm after these changes show up in -next,
so it is not included in this pull request.
The three commits are located in the following git branch.
git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git writeback-workqueue
Pulling it into block/for-3.10/core produces a conflict in
drivers/md/raid5.c between the following two commits.
e3620a3ad5 ("MD RAID5: Avoid accessing gendisk or queue structs when not available")
2f6db2a707 ("raid5: use bio_reset()")
The conflict is trivial - one removes an "if ()" conditional while the
other removes "rbi->bi_next = NULL" right above it. We just need to
remove both. The merged branch is available at
git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git block-test-merge
so that you can use it for verification. The test merge commit has
a proper merge description.
While these changes are a bit of pain to route, they make code simpler
and even have a minute but measurable performance gain[3] even on a
workload which isn't particularly favorable to showing the benefits of
this conversion.
----
Fixed up the conflict.
Conflicts:
drivers/md/raid5.c
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c     | 259
-rw-r--r--  mm/fremap.c          |  12
-rw-r--r--  mm/hugetlb.c         |   8
-rw-r--r--  mm/memory_hotplug.c  |   6
-rw-r--r--  mm/mlock.c           |  11
-rw-r--r--  mm/mmap.c            |   4
6 files changed, 50 insertions, 250 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41733c5dc820..502517492258 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 static struct class *bdi_class;
 
 /*
- * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
- * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
  * locking.
  */
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
-LIST_HEAD(bdi_pending_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
 
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					      WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!bdi_wq)
+		return -ENOMEM;
+
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
 	return wb_has_dirty_io(&bdi->wb);
 }
 
-static void wakeup_timer_fn(unsigned long data)
-{
-	struct backing_dev_info *bdi = (struct backing_dev_info *)data;
-
-	spin_lock_bh(&bdi->wb_lock);
-	if (bdi->wb.task) {
-		trace_writeback_wake_thread(bdi);
-		wake_up_process(bdi->wb.task);
-	} else if (bdi->dev) {
-		/*
-		 * When bdi tasks are inactive for long time, they are killed.
-		 * In this case we have to wake-up the forker thread which
-		 * should create and run the bdi thread.
-		 */
-		trace_writeback_wake_forker_thread(bdi);
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-	spin_unlock_bh(&bdi->wb_lock);
-}
-
 /*
  * This function is used when the first inode for this bdi is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
 	unsigned long timeout;
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
-}
-
-/*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
- */
-static unsigned long bdi_longest_inactive(void)
-{
-	unsigned long interval;
-
-	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
-	return max(5UL * 60 * HZ, interval);
-}
-
-/*
- * Clear pending bit and wakeup anybody waiting for flusher thread creation or
- * shutdown
- */
-static void bdi_clear_pending(struct backing_dev_info *bdi)
-{
-	clear_bit(BDI_pending, &bdi->state);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&bdi->state, BDI_pending);
-}
-
-static int bdi_forker_thread(void *ptr)
-{
-	struct bdi_writeback *me = ptr;
-
-	current->flags |= PF_SWAPWRITE;
-	set_freezable();
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	for (;;) {
-		struct task_struct *task = NULL;
-		struct backing_dev_info *bdi;
-		enum {
-			NO_ACTION,   /* Nothing to do */
-			FORK_THREAD, /* Fork bdi thread */
-			KILL_THREAD, /* Kill inactive bdi thread */
-		} action = NO_ACTION;
-
-		/*
-		 * Temporary measure, we want to make sure we don't see
-		 * dirty data on the default backing_dev_info
-		 */
-		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
-			del_timer(&me->wakeup_timer);
-			wb_do_writeback(me, 0);
-		}
-
-		spin_lock_bh(&bdi_lock);
-		/*
-		 * In the following loop we are going to check whether we have
-		 * some work to do without any synchronization with tasks
-		 * waking us up to do work for them. Set the task state here
-		 * so that we don't miss wakeups after verifying conditions.
-		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		list_for_each_entry(bdi, &bdi_list, bdi_list) {
-			bool have_dirty_io;
-
-			if (!bdi_cap_writeback_dirty(bdi) ||
-			     bdi_cap_flush_forker(bdi))
-				continue;
-
-			WARN(!test_bit(BDI_registered, &bdi->state),
-			     "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
-			have_dirty_io = !list_empty(&bdi->work_list) ||
-					wb_has_dirty_io(&bdi->wb);
-
-			/*
-			 * If the bdi has work to do, but the thread does not
-			 * exist - create it.
-			 */
-			if (!bdi->wb.task && have_dirty_io) {
-				/*
-				 * Set the pending bit - if someone will try to
-				 * unregister this bdi - it'll wait on this bit.
-				 */
-				set_bit(BDI_pending, &bdi->state);
-				action = FORK_THREAD;
-				break;
-			}
-
-			spin_lock(&bdi->wb_lock);
-
-			/*
-			 * If there is no work to do and the bdi thread was
-			 * inactive long enough - kill it. The wb_lock is taken
-			 * to make sure no-one adds more work to this bdi and
-			 * wakes the bdi thread up.
-			 */
-			if (bdi->wb.task && !have_dirty_io &&
-			    time_after(jiffies, bdi->wb.last_active +
-						bdi_longest_inactive())) {
-				task = bdi->wb.task;
-				bdi->wb.task = NULL;
-				spin_unlock(&bdi->wb_lock);
-				set_bit(BDI_pending, &bdi->state);
-				action = KILL_THREAD;
-				break;
-			}
-			spin_unlock(&bdi->wb_lock);
-		}
-		spin_unlock_bh(&bdi_lock);
-
-		/* Keep working if default bdi still has things to do */
-		if (!list_empty(&me->bdi->work_list))
-			__set_current_state(TASK_RUNNING);
-
-		switch (action) {
-		case FORK_THREAD:
-			__set_current_state(TASK_RUNNING);
-			task = kthread_create(bdi_writeback_thread, &bdi->wb,
-					      "flush-%s", dev_name(bdi->dev));
-			if (IS_ERR(task)) {
-				/*
-				 * If thread creation fails, force writeout of
-				 * the bdi from the thread. Hopefully 1024 is
-				 * large enough for efficient IO.
-				 */
-				writeback_inodes_wb(&bdi->wb, 1024,
-						    WB_REASON_FORKER_THREAD);
-			} else {
-				/*
-				 * The spinlock makes sure we do not lose
-				 * wake-ups when racing with 'bdi_queue_work()'.
-				 * And as soon as the bdi thread is visible, we
-				 * can start it.
-				 */
-				spin_lock_bh(&bdi->wb_lock);
-				bdi->wb.task = task;
-				spin_unlock_bh(&bdi->wb_lock);
-				wake_up_process(task);
-			}
-			bdi_clear_pending(bdi);
-			break;
-
-		case KILL_THREAD:
-			__set_current_state(TASK_RUNNING);
-			kthread_stop(task);
-			bdi_clear_pending(bdi);
-			break;
-
-		case NO_ACTION:
-			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
-				/*
-				 * There are no dirty data. The only thing we
-				 * should now care about is checking for
-				 * inactive bdi threads and killing them. Thus,
-				 * let's sleep for longer time, save energy and
-				 * be friendly for battery-driven devices.
-				 */
-				schedule_timeout(bdi_longest_inactive());
-			else
-				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-			try_to_freeze();
-			break;
-		}
-	}
-
-	return 0;
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 }
 
 /*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 	spin_unlock_bh(&bdi_lock);
 
 	synchronize_rcu_expedited();
+
+	/* bdi_list is now unused, clear it to mark @bdi dying */
+	INIT_LIST_HEAD(&bdi->bdi_list);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi->dev = dev;
 
-	/*
-	 * Just start the forker thread for our default backing_dev_info,
-	 * and add other bdi's to the list. They will get a thread created
-	 * on-demand when they need it.
-	 */
-	if (bdi_cap_flush_forker(bdi)) {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
-				       dev_name(dev));
-		if (IS_ERR(wb->task))
-			return PTR_ERR(wb->task);
-	}
-
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
 
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-	struct task_struct *task;
-
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
 
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	bdi_remove_from_list(bdi);
 
 	/*
-	 * If setup is pending, wait for that to complete first
+	 * Drain work list and shutdown the delayed_work. At this point,
+	 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
+	 * is dying and its work_list needs to be drained no matter what.
 	 */
-	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
-			TASK_UNINTERRUPTIBLE);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	flush_delayed_work(&bdi->wb.dwork);
+	WARN_ON(!list_empty(&bdi->work_list));
 
 	/*
-	 * Finally, kill the kernel thread. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility.
+	 * This shouldn't be necessary unless @bdi for some reason has
+	 * unflushed dirty IO after work_list is drained. Do it anyway
+	 * just in case.
 	 */
-	spin_lock_bh(&bdi->wb_lock);
-	task = bdi->wb.task;
-	bdi->wb.task = NULL;
-	spin_unlock_bh(&bdi->wb_lock);
-
-	if (task)
-		kthread_stop(task);
+	cancel_delayed_work_sync(&bdi->wb.dwork);
 }
 
 /*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
 		bdi_set_min_ratio(bdi, 0);
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
-		del_timer_sync(&bdi->wb.wakeup_timer);
 
-		if (!bdi_cap_flush_forker(bdi))
-			bdi_wb_shutdown(bdi);
+		bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
 
 		spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
 	spin_lock_init(&wb->list_lock);
-	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
 
 /*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	bdi_unregister(bdi);
 
 	/*
-	 * If bdi_unregister() had already been called earlier, the
-	 * wakeup_timer could still be armed because bdi_prune_sb()
-	 * can race with the bdi_wakeup_thread_delayed() calls from
-	 * __mark_inode_dirty().
+	 * If bdi_unregister() had already been called earlier, the dwork
+	 * could still be pending because bdi_prune_sb() can race with the
+	 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
 	 */
-	del_timer_sync(&bdi->wb.wakeup_timer);
+	cancel_delayed_work_sync(&bdi->wb.dwork);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
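The mm/backing-dev.c change above drops the per-bdi flusher kthreads, the forker
thread and the wakeup timer in favour of a delayed_work queued on the new bdi_wq.
A minimal standalone module sketch of that delayed_work lifecycle, using the same
workqueue APIs the patch uses (the "demo" names and the 5-second interval are
illustrative only, not part of the patch):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_dwork;

/* Periodic work item; re-arms itself the way periodic writeback does. */
static void demo_workfn(struct work_struct *work)
{
	queue_delayed_work(demo_wq, to_delayed_work(work),
			   msecs_to_jiffies(5000));
}

static int __init demo_init(void)
{
	/* Same workqueue flags the patch passes when creating bdi_wq. */
	demo_wq = alloc_workqueue("demo_writeback",
				  WQ_MEM_RECLAIM | WQ_FREEZABLE |
				  WQ_UNBOUND | WQ_SYSFS, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo_dwork, demo_workfn);
	/* mod_delayed_work() arms or re-arms, replacing mod_timer(). */
	mod_delayed_work(demo_wq, &demo_dwork, msecs_to_jiffies(5000));
	return 0;
}

static void __exit demo_exit(void)
{
	/* Run any queued instance now, wait for it, then stop re-arming. */
	mod_delayed_work(demo_wq, &demo_dwork, 0);
	flush_delayed_work(&demo_dwork);
	cancel_delayed_work_sync(&demo_dwork);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The shutdown sequence mirrors bdi_wb_shutdown()/bdi_destroy() above: force the
work to run immediately, flush it, then cancel any self-requeued instance before
tearing the workqueue down.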
diff --git a/mm/fremap.c b/mm/fremap.c
index 4723ac8d2fc2..87da3590c61e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -204,10 +204,8 @@ get_write_lock:
 		unsigned long addr;
 		struct file *file = get_file(vma->vm_file);
 
-		vm_flags = vma->vm_flags;
-		if (!(flags & MAP_NONBLOCK))
-			vm_flags |= VM_POPULATE;
-		addr = mmap_region(file, start, size, vm_flags, pgoff);
+		addr = mmap_region(file, start, size,
+				vma->vm_flags, pgoff);
 		fput(file);
 		if (IS_ERR_VALUE(addr)) {
 			err = addr;
@@ -226,12 +224,6 @@ get_write_lock:
 		mutex_unlock(&mapping->i_mmap_mutex);
 	}
 
-	if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
-		if (!has_write_lock)
-			goto get_write_lock;
-		vma->vm_flags |= VM_POPULATE;
-	}
-
 	if (vma->vm_flags & VM_LOCKED) {
 		/*
 		 * drop PG_Mlocked flag for over-mapped range
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a0be33bb199..ca9a7c6d7e97 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2124,8 +2124,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-	struct hstate *h = &default_hstate;
-	return h->nr_huge_pages * pages_per_huge_page(h);
+	struct hstate *h;
+	unsigned long nr_total_pages = 0;
+
+	for_each_hstate(h)
+		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+	return nr_total_pages;
 }
 
 static int hugetlb_acct_memory(struct hstate *h, long delta)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9597eec8239d..ee3765760818 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1779,7 +1779,11 @@ void try_offline_node(int nid)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
-		if (zone->wait_table)
+		/*
+		 * wait_table may be allocated from boot memory,
+		 * here only free if it's allocated by vmalloc.
+		 */
+		if (is_vmalloc_addr(zone->wait_table))
 			vfree(zone->wait_table);
 	}
 
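The guard added above encodes a small but easy-to-miss rule: a wait_table sized
at boot may live in bootmem, and only vmalloc'ed memory may be handed to vfree().
A tiny restatement of that pattern (the helper name is hypothetical, not from the
patch):

#include <linux/mm.h>		/* is_vmalloc_addr() */
#include <linux/vmalloc.h>	/* vfree() */

/*
 * Release @table only when it actually came from vmalloc(); bootmem
 * allocations must never be passed to vfree().
 */
static void free_table_if_vmalloced(void *table)
{
	if (is_vmalloc_addr(table))
		vfree(table);
}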
diff --git a/mm/mlock.c b/mm/mlock.c
index 1c5e33fce639..79b7cf7d1bca 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
 
 		newflags = vma->vm_flags & ~VM_LOCKED;
 		if (on)
-			newflags |= VM_LOCKED | VM_POPULATE;
+			newflags |= VM_LOCKED;
 
 		tmp = vma->vm_end;
 		if (tmp > end)
@@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 		 * range with the first VMA. Also, skip undesirable VMA types.
 		 */
 		nend = min(end, vma->vm_end);
-		if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
-				VM_POPULATE)
+		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 			continue;
 		if (nstart < vma->vm_start)
 			nstart = vma->vm_start;
@@ -492,9 +491,9 @@ static int do_mlockall(int flags)
 	struct vm_area_struct * vma, * prev = NULL;
 
 	if (flags & MCL_FUTURE)
-		current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
+		current->mm->def_flags |= VM_LOCKED;
 	else
-		current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
+		current->mm->def_flags &= ~VM_LOCKED;
 	if (flags == MCL_FUTURE)
 		goto out;
 
@@ -503,7 +502,7 @@ static int do_mlockall(int flags)
 
 		newflags = vma->vm_flags & ~VM_LOCKED;
 		if (flags & MCL_CURRENT)
-			newflags |= VM_LOCKED | VM_POPULATE;
+			newflags |= VM_LOCKED;
 
 		/* Ignore errors */
 		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1306,7 +1306,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	}
 
 	addr = mmap_region(file, addr, len, vm_flags, pgoff);
-	if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
+	if (!IS_ERR_VALUE(addr) &&
+	    ((vm_flags & VM_LOCKED) ||
+	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
 	return addr;
 }
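The new condition in do_mmap_pgoff() expresses the user-visible rule that a
mapping is pre-faulted when it is locked, or when MAP_POPULATE is given without
MAP_NONBLOCK. A hedged userspace illustration of those flags (a standalone demo,
not part of the patch):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;

	/* MAP_POPULATE alone: the kernel pre-faults the whole range
	 * (do_mmap_pgoff() reports len back through *populate). */
	void *eager = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	/* MAP_POPULATE | MAP_NONBLOCK: the condition above is false,
	 * so pages are faulted in lazily on first touch instead. */
	void *lazy = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS |
			  MAP_POPULATE | MAP_NONBLOCK, -1, 0);

	if (eager == MAP_FAILED || lazy == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* VM_LOCKED mappings (mlock()/mlockall(MCL_FUTURE)) are populated
	 * regardless of MAP_POPULATE, per the first half of the check. */
	memset(eager, 0, len);
	memset(lazy, 0, len);

	munmap(eager, len);
	munmap(lazy, len);
	return 0;
}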