Diffstat (limited to 'fs/buffer.c')
-rw-r--r--  fs/buffer.c  195
1 files changed, 93 insertions, 102 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..1a80b048ade8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -54,23 +55,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
-static int sync_buffer(void *word)
+static int sleep_on_buffer(void *word)
 {
-	struct block_device *bd;
-	struct buffer_head *bh
-		= container_of(word, struct buffer_head, b_state);
-
-	smp_mb();
-	bd = bh->b_bdev;
-	if (bd)
-		blk_run_address_space(bd->bd_inode->i_mapping);
 	io_schedule();
 	return 0;
 }
 
 void __lock_buffer(struct buffer_head *bh)
 {
-	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_buffer);
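
For reference, wait_on_bit() and wait_on_bit_lock() in this kernel take an action callback that decides how the caller sleeps while the bit is set; the rename above reflects that the callback no longer kicks a request queue, it only schedules. A minimal, hedged sketch of the callback pattern follows, with illustrative names (my_bit_action, my_wait_for_bit and MY_BIT are not from this patch; wait_on_bit(), io_schedule() and TASK_UNINTERRUPTIBLE are real kernel interfaces):

	/* Illustrative sketch: wiring an action callback into wait_on_bit().
	 * my_bit_action()/my_wait_for_bit()/MY_BIT are made-up names.
	 */
	static int my_bit_action(void *word)
	{
		io_schedule();	/* just sleep; no queue unplugging needed */
		return 0;	/* 0 = go back and re-check the bit */
	}

	static void my_wait_for_bit(unsigned long *flags)
	{
		wait_on_bit(flags, MY_BIT, my_bit_action, TASK_UNINTERRUPTIBLE);
	}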
@@ -90,7 +83,7 @@ EXPORT_SYMBOL(unlock_buffer);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
-	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
 
@@ -156,7 +149,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -277,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -749,10 +746,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping, *prev_mapping = NULL;
+	struct address_space *mapping;
 	int err = 0, err2;
+	struct blk_plug plug;
 
 	INIT_LIST_HEAD(&tmp);
+	blk_start_plug(&plug);
 
 	spin_lock(lock);
 	while (!list_empty(list)) {
@@ -775,7 +774,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * still in flight on potentially older
 				 * contents.
 				 */
-				write_dirty_buffer(bh, WRITE_SYNC_PLUG);
+				write_dirty_buffer(bh, WRITE_SYNC);
 
 				/*
 				 * Kick off IO for the previous mapping. Note
@@ -783,16 +782,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * wait_on_buffer() will do that for us
 				 * through sync_buffer().
 				 */
-				if (prev_mapping && prev_mapping != mapping)
-					blk_run_address_space(prev_mapping);
-				prev_mapping = mapping;
-
 				brelse(bh);
 				spin_lock(lock);
 			}
 		}
 	}
 
+	spin_unlock(lock);
+	blk_finish_plug(&plug);
+	spin_lock(lock);
+
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
 		get_bh(bh);
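
The removed prev_mapping/blk_run_address_space() bookkeeping is replaced by on-stack block plugging: submissions between blk_start_plug() and blk_finish_plug() are batched per task and flushed when the plug is finished (or when the task sleeps). A hedged sketch of the same pattern in isolation, with an illustrative function name (my_write_buffers() and its arguments are made up; blk_start_plug(), blk_finish_plug(), write_dirty_buffer() and WRITE_SYNC are real interfaces):

	/* Illustrative sketch of the on-stack plugging idiom used above. */
	static void my_write_buffers(struct buffer_head **bhs, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);		/* start batching this task's I/O */
		for (i = 0; i < nr; i++)
			write_dirty_buffer(bhs[i], WRITE_SYNC);
		blk_finish_plug(&plug);		/* flush the batch to the driver */
	}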
@@ -905,7 +904,6 @@ try_again:
 
 		bh->b_state = 0;
 		atomic_set(&bh->b_count, 0);
-		bh->b_private = NULL;
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
@@ -1145,7 +1143,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {
@@ -1271,12 +1269,10 @@ static inline void check_irqs_on(void)
 static void bh_lru_install(struct buffer_head *bh)
 {
 	struct buffer_head *evictee = NULL;
-	struct bh_lru *lru;
 
 	check_irqs_on();
 	bh_lru_lock();
-	lru = &__get_cpu_var(bh_lrus);
-	if (lru->bhs[0] != bh) {
+	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
 		struct buffer_head *bhs[BH_LRU_SIZE];
 		int in;
 		int out = 0;
@@ -1284,7 +1280,8 @@ static void bh_lru_install(struct buffer_head *bh)
 		get_bh(bh);
 		bhs[out++] = bh;
 		for (in = 0; in < BH_LRU_SIZE; in++) {
-			struct buffer_head *bh2 = lru->bhs[in];
+			struct buffer_head *bh2 =
+				__this_cpu_read(bh_lrus.bhs[in]);
 
 			if (bh2 == bh) {
 				__brelse(bh2);
@@ -1299,7 +1296,7 @@ static void bh_lru_install(struct buffer_head *bh)
 		}
 		while (out < BH_LRU_SIZE)
 			bhs[out++] = NULL;
-		memcpy(lru->bhs, bhs, sizeof(bhs));
+		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
 	}
 	bh_lru_unlock();
 
@@ -1314,23 +1311,22 @@ static struct buffer_head *
 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
 {
 	struct buffer_head *ret = NULL;
-	struct bh_lru *lru;
 	unsigned int i;
 
 	check_irqs_on();
 	bh_lru_lock();
-	lru = &__get_cpu_var(bh_lrus);
 	for (i = 0; i < BH_LRU_SIZE; i++) {
-		struct buffer_head *bh = lru->bhs[i];
+		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
 
 		if (bh && bh->b_bdev == bdev &&
 				bh->b_blocknr == block && bh->b_size == size) {
 			if (i) {
 				while (i) {
-					lru->bhs[i] = lru->bhs[i - 1];
+					__this_cpu_write(bh_lrus.bhs[i],
+						__this_cpu_read(bh_lrus.bhs[i - 1]));
 					i--;
 				}
-				lru->bhs[0] = bh;
+				__this_cpu_write(bh_lrus.bhs[0], bh);
 			}
 			get_bh(bh);
 			ret = bh;
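
The conversions above drop the cached struct bh_lru pointer from __get_cpu_var() in favour of this_cpu operations, which fold the per-CPU address calculation into each access; the caller still has to keep the task from migrating (bh_lru_lock() does that here). A hedged sketch of the idiom on a made-up per-CPU structure (my_cache, my_cache_peek and my_cache_set are illustrative; DEFINE_PER_CPU, __this_cpu_read and __this_cpu_write are the real interfaces):

	/* Illustrative sketch: per-CPU slots accessed with this_cpu ops.
	 * Callers must disable preemption (or otherwise pin the CPU) around
	 * these helpers, as bh_lru_lock() does in the code above.
	 */
	struct my_cache {
		struct buffer_head *slot[8];
	};
	static DEFINE_PER_CPU(struct my_cache, my_cache);

	static struct buffer_head *my_cache_peek(int i)
	{
		return __this_cpu_read(my_cache.slot[i]);
	}

	static void my_cache_set(int i, struct buffer_head *bh)
	{
		__this_cpu_write(my_cache.slot[i], bh);
	}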
@@ -1617,14 +1613,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * prevents this contention from occurring.
  *
  * If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
- * causes the writes to be flagged as synchronous writes, but the
- * block device queue will NOT be unplugged, since usually many pages
- * will be pushed to the out before the higher-level caller actually
- * waits for the writes to be completed.  The various wait functions,
- * such as wait_on_writeback_range() will ultimately call sync_page()
- * which will ultimately call blk_run_backing_dev(), which will end up
- * unplugging the device queue.
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc,
@@ -1637,7 +1627,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE);
+			WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
@@ -1706,7 +1696,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		 * and kswapd activity, but those code paths have their own
 		 * higher-level throttling.
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else if (!trylock_buffer(bh)) {
 			redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1824,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block)
 {
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
 	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end;
 	sector_t block;
@@ -1910,13 +1902,11 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (unlikely(err)) {
+	if (unlikely(err))
 		page_zero_new_buffers(page, from, to);
-		ClearPageUptodate(page);
-	}
 	return err;
 }
-EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to)
@@ -1953,15 +1943,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	return 0;
 }
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
-{
-	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-
-	return block_prepare_write(page, start, start + len, get_block);
-}
-EXPORT_SYMBOL(__block_write_begin);
-
 /*
  * block_write_begin takes care of the basic task of block allocation and
  * bringing partial write blocks uptodate first.
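
For out-of-tree callers, the net effect of the two hunks above is that block_prepare_write(page, from, to, get_block) is gone and __block_write_begin(page, pos, len, get_block) is the exported primitive; it derives the in-page from/to range from the file position itself. A hedged sketch of a caller-side conversion (my_prepare_page() is an illustrative wrapper, not part of this patch):

	/* Illustrative sketch: converting a block_prepare_write() caller. */
	static int my_prepare_page(struct page *page, loff_t pos, unsigned len,
				   get_block_t *get_block)
	{
		/* Previously:
		 *	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
		 *	return block_prepare_write(page, from, from + len, get_block);
		 * __block_write_begin() now does the offset math internally.
		 */
		return __block_write_begin(page, pos, len, get_block);
	}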
@@ -2353,24 +2334,26 @@ EXPORT_SYMBOL(block_commit_write);
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
  */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	int ret;
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		unlock_page(page);
-		goto out;
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2379,22 +2362,46 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	else
 		end = PAGE_CACHE_SIZE;
 
-	ret = block_prepare_write(page, 0, end, get_block);
+	ret = __block_write_begin(page, 0, end, get_block);
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-	if (unlikely(ret)) {
-		unlock_page(page);
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
-	} else
-		ret = VM_FAULT_LOCKED;
-
-out:
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	/*
+	 * Freezing in progress? We check after the page is marked dirty and
+	 * with page lock held so if the test here fails, we are sure freezing
+	 * code will wait during syncing until the page fault is done - at that
+	 * point page will be dirty and unlocked so freezing code will write it
+	 * and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	wait_on_page_writeback(page);
+	return 0;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+	/*
+	 * This check is racy but catches the common case. The check in
+	 * __block_page_mkwrite() is reliable.
+	 */
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
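
After this split, __block_page_mkwrite() returns 0 or a -Exxx error (leaving the page locked and dirty on success), and the block_page_mkwrite() wrapper adds the racy vfs_check_frozen() check plus the translation to VM_FAULT_* codes via block_page_mkwrite_return(). A hedged sketch of how a filesystem's ->page_mkwrite handler might call the new helper directly, mirroring the wrapper above (myfs_page_mkwrite() and myfs_get_block() are illustrative names):

	/* Illustrative sketch: a ->page_mkwrite built on __block_page_mkwrite(). */
	static int myfs_get_block(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh_result, int create);

	static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
		int ret;

		/* racy check so the page fault does not busyloop on a frozen fs */
		vfs_check_frozen(sb, SB_FREEZE_WRITE);
		ret = __block_page_mkwrite(vma, vmf, myfs_get_block);
		/* 0 / -EFAULT / -EAGAIN / -ENOMEM / others -> VM_FAULT_* */
		return block_page_mkwrite_return(ret);
	}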
@@ -2466,11 +2473,10 @@ int nobh_write_begin(struct address_space *mapping,
 	*fsdata = NULL;
 
 	if (page_has_buffers(page)) {
-		unlock_page(page);
-		page_cache_release(page);
-		*pagep = NULL;
-		return block_write_begin(mapping, pos, len, flags, pagep,
-					 get_block);
+		ret = __block_write_begin(page, pos, len, get_block);
+		if (unlikely(ret))
+			goto out_release;
+		return ret;
 	}
 
 	if (PageMappedToDisk(page))
@@ -2891,7 +2897,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3036,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 		bh->b_end_io = end_buffer_write_sync;
 		ret = submit_bh(rw, bh);
 		wait_on_buffer(bh);
-		if (buffer_eopnotsupp(bh)) {
-			clear_buffer_eopnotsupp(bh);
-			ret = -EOPNOTSUPP;
-		}
 		if (!ret && !buffer_uptodate(bh))
 			ret = -EIO;
 	} else {
@@ -3154,17 +3155,6 @@ out:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
-void block_sync_page(struct page *page)
-{
-	struct address_space *mapping;
-
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping)
-		blk_run_backing_dev(mapping->backing_dev_info, page);
-}
-EXPORT_SYMBOL(block_sync_page);
-
 /*
  * There are no bdflush tunables left.  But distributions are
  * still running obsolete flush daemons, so we terminate them here.
@@ -3217,22 +3207,23 @@ static void recalc_bh_state(void)
 	int i;
 	int tot = 0;
 
-	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
+	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
 		return;
-	__get_cpu_var(bh_accounting).ratelimit = 0;
+	__this_cpu_write(bh_accounting.ratelimit, 0);
 	for_each_online_cpu(i)
 		tot += per_cpu(bh_accounting, i).nr;
 	buffer_heads_over_limit = (tot > max_buffer_heads);
 }
 
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
-		get_cpu_var(bh_accounting).nr++;
+		preempt_disable();
+		__this_cpu_inc(bh_accounting.nr);
 		recalc_bh_state();
-		put_cpu_var(bh_accounting);
+		preempt_enable();
 	}
 	return ret;
 }
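
get_cpu_var()/put_cpu_var() disable preemption and take the per-CPU address; the replacement keeps an explicit preempt_disable()/preempt_enable() pair only because recalc_bh_state() must run on the same CPU as the counter update, and it uses __this_cpu_inc()/__this_cpu_dec() for the update itself. A hedged before/after sketch on a made-up counter (my_counter and my_count_event are illustrative; the per-CPU primitives are real):

	/* Illustrative sketch: updating a per-CPU counter with this_cpu ops. */
	static DEFINE_PER_CPU(unsigned long, my_counter);

	static void my_count_event(void)
	{
		/* Old style:
		 *	get_cpu_var(my_counter)++;
		 *	put_cpu_var(my_counter);
		 */
		preempt_disable();	/* stay on one CPU, as alloc_buffer_head() does */
		__this_cpu_inc(my_counter);
		/* ... any follow-up work that must run on this same CPU ... */
		preempt_enable();
	}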
@@ -3242,9 +3233,10 @@ void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
 	kmem_cache_free(bh_cachep, bh);
-	get_cpu_var(bh_accounting).nr--;
+	preempt_disable();
+	__this_cpu_dec(bh_accounting.nr);
 	recalc_bh_state();
-	put_cpu_var(bh_accounting);
+	preempt_enable();
 }
 EXPORT_SYMBOL(free_buffer_head);
 
@@ -3257,9 +3249,8 @@ static void buffer_exit_cpu(int cpu)
 		brelse(b->bhs[i]);
 		b->bhs[i] = NULL;
 	}
-	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
+	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
 	per_cpu(bh_accounting, cpu).nr = 0;
-	put_cpu_var(bh_accounting);
 }
 
 static int buffer_cpu_notify(struct notifier_block *self,