Diffstat (limited to 'fs/buffer.c')
-rw-r--r--	fs/buffer.c	323
1 file changed, 137 insertions(+), 186 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 891e1c78e4f1..28f320fac4d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -166,151 +166,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * Write out and wait upon all the dirty data associated with a block
- * device via its mapping.  Does not take the superblock lock.
- */
-int sync_blockdev(struct block_device *bdev)
-{
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
-}
-EXPORT_SYMBOL(sync_blockdev);
-
-/*
- * Write out and wait upon all dirty data associated with this
- * device.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
-	struct super_block *sb = get_super(bdev);
-	if (sb) {
-		int res = fsync_super(sb);
-		drop_super(sb);
-		return res;
-	}
-	return sync_blockdev(bdev);
-}
-
-/**
- * freeze_bdev -- lock a filesystem and force it into a consistent state
- * @bdev:	blockdevice to lock
- *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
- */
-struct super_block *freeze_bdev(struct block_device *bdev)
-{
-	struct super_block *sb;
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		bdev->bd_fsfreeze_count++;
-		sb = get_super(bdev);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-	bdev->bd_fsfreeze_count++;
-
-	down(&bdev->bd_mount_sem);
-	sb = get_super(bdev);
-	if (sb && !(sb->s_flags & MS_RDONLY)) {
-		sb->s_frozen = SB_FREEZE_WRITE;
-		smp_wmb();
-
-		__fsync_super(sb);
-
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
-
-		sync_blockdev(sb->s_bdev);
-
-		if (sb->s_op->freeze_fs) {
-			error = sb->s_op->freeze_fs(sb);
-			if (error) {
-				printk(KERN_ERR
-					"VFS:Filesystem freeze failed\n");
-				sb->s_frozen = SB_UNFROZEN;
-				drop_super(sb);
-				up(&bdev->bd_mount_sem);
-				bdev->bd_fsfreeze_count--;
-				mutex_unlock(&bdev->bd_fsfreeze_mutex);
-				return ERR_PTR(error);
-			}
-		}
-	}
-
-	sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
-	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
-}
-EXPORT_SYMBOL(freeze_bdev);
-
-/**
- * thaw_bdev -- unlock filesystem
- * @bdev:	blockdevice to unlock
- * @sb:	associated superblock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
-{
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return -EINVAL;
-	}
-
-	bdev->bd_fsfreeze_count--;
-	if (bdev->bd_fsfreeze_count > 0) {
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return 0;
-	}
-
-	if (sb) {
-		BUG_ON(sb->s_bdev != bdev);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			if (sb->s_op->unfreeze_fs) {
-				error = sb->s_op->unfreeze_fs(sb);
-				if (error) {
-					printk(KERN_ERR
-						"VFS:Filesystem thaw failed\n");
-					sb->s_frozen = SB_FREEZE_TRANS;
-					bdev->bd_fsfreeze_count++;
-					mutex_unlock(&bdev->bd_fsfreeze_mutex);
-					return error;
-				}
-			}
-			sb->s_frozen = SB_UNFROZEN;
-			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
-		}
-		drop_super(sb);
-	}
-
-	up(&bdev->bd_mount_sem);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return 0;
-}
-EXPORT_SYMBOL(thaw_bdev);
-
-/*
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers. To get around this,
  * we get exclusion from try_to_free_buffers with the blockdev mapping's
@@ -344,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	head = page_buffers(page);
 	bh = head;
 	do {
-		if (bh->b_blocknr == block) {
+		if (!buffer_mapped(bh))
+			all_mapped = 0;
+		else if (bh->b_blocknr == block) {
 			ret = bh;
 			get_bh(bh);
 			goto out_unlock;
 		}
-		if (!buffer_mapped(bh))
-			all_mapped = 0;
 		bh = bh->b_this_page;
 	} while (bh != head);
 
@@ -435,7 +290,7 @@ static void free_more_memory(void)
 						&zone);
 		if (zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS);
+						GFP_NOFS, NULL);
 	}
 }
 
@@ -505,7 +360,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	char b[BDEVNAME_SIZE];
 	unsigned long flags;
@@ -583,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
 	set_buffer_async_read(bh);
 }
 
-void mark_buffer_async_write(struct buffer_head *bh)
+void mark_buffer_async_write_endio(struct buffer_head *bh,
+				   bh_end_io_t *handler)
 {
-	bh->b_end_io = end_buffer_async_write;
+	bh->b_end_io = handler;
 	set_buffer_async_write(bh);
 }
+
+void mark_buffer_async_write(struct buffer_head *bh)
+{
+	mark_buffer_async_write_endio(bh, end_buffer_async_write);
+}
 EXPORT_SYMBOL(mark_buffer_async_write);
 
 
@@ -692,6 +553,46 @@ repeat:
 	return err;
 }
 
+void do_thaw_all(struct work_struct *work)
+{
+	struct super_block *sb;
+	char b[BDEVNAME_SIZE];
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+			printk(KERN_WARNING "Emergency Thaw on %s\n",
+			       bdevname(sb->s_bdev, b));
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	kfree(work);
+	printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_thaw_all);
+		schedule_work(work);
+	}
+}
+
 /**
  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
  * @mapping: the mapping which wants those buffers written
@@ -766,14 +667,7 @@ static void __set_page_dirty(struct page *page,
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-
-		if (mapping_cap_account_dirty(mapping)) {
-			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			__inc_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
-			task_dirty_inc(current);
-			task_io_account_write(PAGE_CACHE_SIZE);
-		}
+		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
@@ -856,7 +750,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping;
+	struct address_space *mapping, *prev_mapping = NULL;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -881,7 +775,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				ll_rw_block(SWRITE_SYNC, 1, &bh);
+				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+
+				/*
+				 * Kick off IO for the previous mapping. Note
+				 * that we will not run the very last mapping,
+				 * wait_on_buffer() will do that for us
+				 * through sync_buffer().
+				 */
+				if (prev_mapping && prev_mapping != mapping)
+					blk_run_address_space(prev_mapping);
+				prev_mapping = mapping;
+
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -1180,12 +1085,12 @@ static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
 	/* Size must be multiple of hard sectorsize */
-	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
+	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
 			size);
-		printk(KERN_ERR "hardsect size: %d\n",
-			bdev_hardsect_size(bdev));
+		printk(KERN_ERR "logical block size: %d\n",
+			bdev_logical_block_size(bdev));
 
 		dump_stack();
 		return NULL;
@@ -1260,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
 	if (!test_set_buffer_dirty(bh)) {
 		struct page *page = bh->b_page;
-		if (!TestSetPageDirty(page))
-			__set_page_dirty(page, page_mapping(page), 0);
+		if (!TestSetPageDirty(page)) {
+			struct address_space *mapping = page_mapping(page);
+			if (mapping)
+				__set_page_dirty(page, mapping, 0);
+		}
 	}
 }
 
@@ -1704,9 +1612,20 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
 * locked buffer. This only can happen if someone has written the buffer
 * directly, with submit_bh(). At the address_space level PageWriteback
 * prevents this contention from occurring.
+ *
+ * If block_write_full_page() is called with wbc->sync_mode ==
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
+ * causes the writes to be flagged as synchronous writes, but the
+ * block device queue will NOT be unplugged, since usually many pages
+ * will be pushed to the out before the higher-level caller actually
+ * waits for the writes to be completed.  The various wait functions,
+ * such as wait_on_writeback_range() will ultimately call sync_page()
+ * which will ultimately call blk_run_backing_dev(), which will end up
+ * unplugging the device queue.
 */
 static int __block_write_full_page(struct inode *inode, struct page *page,
-			get_block_t *get_block, struct writeback_control *wbc)
+			get_block_t *get_block, struct writeback_control *wbc,
+			bh_end_io_t *handler)
 {
 	int err;
 	sector_t block;
@@ -1714,6 +1633,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
@@ -1789,7 +1710,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 			continue;
 		}
 		if (test_clear_buffer_dirty(bh)) {
-			mark_buffer_async_write(bh);
+			mark_buffer_async_write_endio(bh, handler);
 		} else {
 			unlock_buffer(bh);
 		}
@@ -1805,7 +1726,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(WRITE, bh);
+			submit_bh(write_op, bh);
 			nr_underway++;
 		}
 		bh = next;
@@ -1842,7 +1763,7 @@ recover:
 		if (buffer_mapped(bh) && buffer_dirty(bh) &&
 		    !buffer_delay(bh)) {
 			lock_buffer(bh);
-			mark_buffer_async_write(bh);
+			mark_buffer_async_write_endio(bh, handler);
 		} else {
 			/*
 			 * The buffer may have been set dirty during
@@ -1859,7 +1780,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(WRITE, bh);
+			submit_bh(write_op, bh);
 			nr_underway++;
 		}
 		bh = next;
@@ -2465,20 +2386,22 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
 * unlock the page.
 */
 int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
 		/* page got truncated out from underneath us */
-		goto out_unlock;
+		unlock_page(page);
+		goto out;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2491,8 +2414,16 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-out_unlock:
-	unlock_page(page);
+	if (unlikely(ret)) {
+		unlock_page(page);
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
+	} else
+		ret = VM_FAULT_LOCKED;
+
+out:
 	return ret;
 }
 
@@ -2760,7 +2691,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
-		ret = __block_write_full_page(inode, page, get_block, wbc);
+		ret = __block_write_full_page(inode, page, get_block, wbc,
+					      end_buffer_async_write);
 	return ret;
 }
 EXPORT_SYMBOL(nobh_writepage);
@@ -2807,6 +2739,8 @@ has_buffers:
 		pos += blocksize;
 	}
 
+	map_bh.b_size = blocksize;
+	map_bh.b_state = 0;
 	err = get_block(inode, iblock, &map_bh, 0);
 	if (err)
 		goto unlock;
@@ -2918,9 +2852,10 @@ out:
 
 /*
 * The generic ->writepage function for buffer-backed address_spaces
+ * this form passes in the end_io handler used to finish the IO.
 */
-int block_write_full_page(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
+int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc, bh_end_io_t *handler)
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
@@ -2929,7 +2864,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
-		return __block_write_full_page(inode, page, get_block, wbc);
+		return __block_write_full_page(inode, page, get_block, wbc,
+					       handler);
 
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2952,9 +2888,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	 * writes to that region are not written out to the file."
 	 */
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-	return __block_write_full_page(inode, page, get_block, wbc);
+	return __block_write_full_page(inode, page, get_block, wbc, handler);
+}
+
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
+int block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc)
+{
+	return block_write_full_page_endio(page, get_block, wbc,
+					   end_buffer_async_write);
 }
 
+
 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 			    get_block_t *get_block)
 {
@@ -2991,6 +2938,8 @@ int submit_bh(int rw, struct buffer_head * bh)
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
+	BUG_ON(buffer_delay(bh));
+	BUG_ON(buffer_unwritten(bh));
 
 	/*
 	 * Mask in barrier bit for a write (could be either a WRITE or a
@@ -3067,12 +3016,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC)
+		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
 			lock_buffer(bh);
 		else if (!trylock_buffer(bh))
 			continue;
 
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
+		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
+		    rw == SWRITE_SYNC_PLUG) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
@@ -3108,7 +3058,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
@@ -3422,11 +3372,12 @@ EXPORT_SYMBOL(block_read_full_page);
 EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
+EXPORT_SYMBOL(block_write_full_page_endio);
 EXPORT_SYMBOL(cont_write_begin);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
+EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(file_fsync);
-EXPORT_SYMBOL(fsync_bdev);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_cont_expand_simple);
 EXPORT_SYMBOL(init_buffer);
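
Usage note: the newly exported block_write_full_page_endio() / end_buffer_async_write() pair lets a filesystem supply its own per-buffer completion handler from ->writepage instead of the stock one. Below is a minimal sketch of such a caller; it is not part of this patch, and all myfs_* names (including the get_block helper) are hypothetical.

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* Assumed to be provided elsewhere by the (hypothetical) filesystem. */
extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

/*
 * Per-buffer write completion: do filesystem-private bookkeeping first,
 * then defer to end_buffer_async_write(), which this patch exports.
 */
static void myfs_end_buffer_write(struct buffer_head *bh, int uptodate)
{
	/* filesystem-private accounting would go here */
	end_buffer_async_write(bh, uptodate);
}

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, myfs_get_block, wbc,
					   myfs_end_buffer_write);
}

A filesystem with no special completion needs keeps calling block_write_full_page(), which now simply wraps the _endio variant with end_buffer_async_write as the handler.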
