aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c3
-rw-r--r--fs/autofs4/expire.c36
-rw-r--r--fs/bio.c13
-rw-r--r--fs/block_dev.c3
-rw-r--r--fs/btrfs/backref.c4
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c9
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/delayed-inode.c12
-rw-r--r--fs/btrfs/delayed-ref.c163
-rw-r--r--fs/btrfs/delayed-ref.h4
-rw-r--r--fs/btrfs/disk-io.c56
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c123
-rw-r--r--fs/btrfs/extent_io.c17
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/inode.c335
-rw-r--r--fs/btrfs/ioctl.c12
-rw-r--r--fs/btrfs/locking.c2
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/qgroup.c12
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/super.c19
-rw-r--r--fs/btrfs/transaction.c10
-rw-r--r--fs/btrfs/volumes.c37
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/buffer.c94
-rw-r--r--fs/ceph/addr.c3
-rw-r--r--fs/ceph/debugfs.c1
-rw-r--r--fs/ceph/dir.c38
-rw-r--r--fs/ceph/file.c62
-rw-r--r--fs/ceph/inode.c15
-rw-r--r--fs/ceph/ioctl.c3
-rw-r--r--fs/ceph/super.h6
-rw-r--r--fs/cifs/cifsglob.h10
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c42
-rw-r--r--fs/cifs/dir.c9
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c324
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/cifs/smb1ops.c24
-rw-r--r--fs/cifs/smb2inode.c39
-rw-r--r--fs/cifs/smb2misc.c16
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.h10
-rw-r--r--fs/cifs/smb2proto.h8
-rw-r--r--fs/cifs/transport.c9
-rw-r--r--fs/compat.c10
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h24
-rw-r--r--fs/ecryptfs/file.c90
-rw-r--r--fs/ecryptfs/inode.c95
-rw-r--r--fs/ecryptfs/main.c22
-rw-r--r--fs/ecryptfs/messaging.c136
-rw-r--r--fs/ecryptfs/miscdev.c98
-rw-r--r--fs/ecryptfs/mmap.c39
-rw-r--r--fs/eventpoll.c2
-rw-r--r--fs/exec.c19
-rw-r--r--fs/exofs/inode.c27
-rw-r--r--fs/exofs/ore.c14
-rw-r--r--fs/exofs/super.c11
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/super.c33
-rw-r--r--fs/ext3/inode.c25
-rw-r--r--fs/ext3/super.c11
-rw-r--r--fs/ext4/balloc.c62
-rw-r--r--fs/ext4/bitmap.c1
-rw-r--r--fs/ext4/extents.c1
-rw-r--r--fs/ext4/inode.c25
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/super.c48
-rw-r--r--fs/fat/file.c15
-rw-r--r--fs/file_table.c4
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c4
-rw-r--r--fs/fuse/dev.c1
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c19
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c44
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/inode.c12
-rw-r--r--fs/internal.h4
-rw-r--r--fs/jbd/journal.c9
-rw-r--r--fs/jbd2/journal.c7
-rw-r--r--fs/lockd/clntproc.c14
-rw-r--r--fs/lockd/svc4proc.c1
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c1
-rw-r--r--fs/locks.c6
-rw-r--r--fs/logfs/dev_bdev.c15
-rw-r--r--fs/logfs/inode.c18
-rw-r--r--fs/logfs/journal.c2
-rw-r--r--fs/logfs/readwrite.c1
-rw-r--r--fs/logfs/segment.c2
-rw-r--r--fs/namei.c323
-rw-r--r--fs/namespace.c97
-rw-r--r--fs/nfsd/nfs4callback.c4
-rw-r--r--fs/nfsd/nfs4recover.c9
-rw-r--r--fs/nfsd/nfsfh.c1
-rw-r--r--fs/nfsd/nfsproc.c9
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/vfs.c79
-rw-r--r--fs/nfsd/vfs.h11
-rw-r--r--fs/nilfs2/file.c18
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/nilfs2/segment.c5
-rw-r--r--fs/nilfs2/super.c4
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ocfs2/file.c11
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/journal.c7
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/refcounttree.c11
-rw-r--r--fs/open.c24
-rw-r--r--fs/pipe.c75
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/reiserfs/bitmap.c2
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/super.c292
-rw-r--r--fs/sysfs/bin.c2
-rw-r--r--fs/ubifs/debug.h2
-rw-r--r--fs/ubifs/file.c10
-rw-r--r--fs/ubifs/lpt.c5
-rw-r--r--fs/ubifs/recovery.c2
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/super.c5
-rw-r--r--fs/udf/file.c35
-rw-r--r--fs/udf/inode.c5
-rw-r--r--fs/udf/super.c7
-rw-r--r--fs/xfs/xfs_aops.c18
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_file.c10
-rw-r--r--fs/xfs/xfs_ialloc.c17
-rw-r--r--fs/xfs/xfs_ioctl.c55
-rw-r--r--fs/xfs/xfs_ioctl32.c12
-rw-r--r--fs/xfs/xfs_iomap.c4
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c2
-rw-r--r--fs/xfs/xfs_sync.c2
-rw-r--r--fs/xfs/xfs_trans.c17
-rw-r--r--fs/xfs/xfs_trans.h2
150 files changed, 2211 insertions, 1677 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index fc06fd27065e..dd6f7ee1e312 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
610 p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", 610 p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
611 page, (unsigned long)filp->private_data); 611 page, (unsigned long)filp->private_data);
612 612
613 /* Update file times before taking page lock */
614 file_update_time(filp);
615
613 v9inode = V9FS_I(inode); 616 v9inode = V9FS_I(inode);
614 /* make sure the cache has finished storing the page */ 617 /* make sure the cache has finished storing the page */
615 v9fs_fscache_wait_on_page_write(inode, page); 618 v9fs_fscache_wait_on_page_write(inode, page);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1feb68ecef95..842d00048a65 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -94,25 +94,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev,
94{ 94{
95 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); 95 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
96 struct list_head *next; 96 struct list_head *next;
97 struct dentry *p, *q; 97 struct dentry *q;
98 98
99 spin_lock(&sbi->lookup_lock); 99 spin_lock(&sbi->lookup_lock);
100 spin_lock(&root->d_lock);
100 101
101 if (prev == NULL) { 102 if (prev)
102 spin_lock(&root->d_lock); 103 next = prev->d_u.d_child.next;
104 else {
103 prev = dget_dlock(root); 105 prev = dget_dlock(root);
104 next = prev->d_subdirs.next; 106 next = prev->d_subdirs.next;
105 p = prev;
106 goto start;
107 } 107 }
108 108
109 p = prev; 109cont:
110 spin_lock(&p->d_lock);
111again:
112 next = p->d_u.d_child.next;
113start:
114 if (next == &root->d_subdirs) { 110 if (next == &root->d_subdirs) {
115 spin_unlock(&p->d_lock); 111 spin_unlock(&root->d_lock);
116 spin_unlock(&sbi->lookup_lock); 112 spin_unlock(&sbi->lookup_lock);
117 dput(prev); 113 dput(prev);
118 return NULL; 114 return NULL;
@@ -121,16 +117,15 @@ start:
121 q = list_entry(next, struct dentry, d_u.d_child); 117 q = list_entry(next, struct dentry, d_u.d_child);
122 118
123 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); 119 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
124 /* Negative dentry - try next */ 120 /* Already gone or negative dentry (under construction) - try next */
125 if (!simple_positive(q)) { 121 if (q->d_count == 0 || !simple_positive(q)) {
126 spin_unlock(&p->d_lock); 122 spin_unlock(&q->d_lock);
127 lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); 123 next = q->d_u.d_child.next;
128 p = q; 124 goto cont;
129 goto again;
130 } 125 }
131 dget_dlock(q); 126 dget_dlock(q);
132 spin_unlock(&q->d_lock); 127 spin_unlock(&q->d_lock);
133 spin_unlock(&p->d_lock); 128 spin_unlock(&root->d_lock);
134 spin_unlock(&sbi->lookup_lock); 129 spin_unlock(&sbi->lookup_lock);
135 130
136 dput(prev); 131 dput(prev);
@@ -404,11 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
404 DPRINTK("checking mountpoint %p %.*s", 399 DPRINTK("checking mountpoint %p %.*s",
405 dentry, (int)dentry->d_name.len, dentry->d_name.name); 400 dentry, (int)dentry->d_name.len, dentry->d_name.name);
406 401
407 /* Path walk currently on this dentry? */
408 ino_count = atomic_read(&ino->count) + 2;
409 if (dentry->d_count > ino_count)
410 goto next;
411
412 /* Can we umount this guy */ 402 /* Can we umount this guy */
413 if (autofs4_mount_busy(mnt, dentry)) 403 if (autofs4_mount_busy(mnt, dentry))
414 goto next; 404 goto next;
diff --git a/fs/bio.c b/fs/bio.c
index 73922abba832..71072ab99128 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{ 73{
74 unsigned int sz = sizeof(struct bio) + extra_size; 74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL; 75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab; 76 struct bio_slab *bslab, *new_bio_slabs;
77 unsigned int i, entry = -1; 77 unsigned int i, entry = -1;
78 78
79 mutex_lock(&bio_slab_lock); 79 mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
97 97
98 if (bio_slab_nr == bio_slab_max && entry == -1) { 98 if (bio_slab_nr == bio_slab_max && entry == -1) {
99 bio_slab_max <<= 1; 99 bio_slab_max <<= 1;
100 bio_slabs = krealloc(bio_slabs, 100 new_bio_slabs = krealloc(bio_slabs,
101 bio_slab_max * sizeof(struct bio_slab), 101 bio_slab_max * sizeof(struct bio_slab),
102 GFP_KERNEL); 102 GFP_KERNEL);
103 if (!bio_slabs) 103 if (!new_bio_slabs)
104 goto out_unlock; 104 goto out_unlock;
105 bio_slabs = new_bio_slabs;
105 } 106 }
106 if (entry == -1) 107 if (entry == -1)
107 entry = bio_slab_nr++; 108 entry = bio_slab_nr++;
@@ -1312,7 +1313,7 @@ EXPORT_SYMBOL(bio_copy_kern);
1312 * Note that this code is very hard to test under normal circumstances because 1313 * Note that this code is very hard to test under normal circumstances because
1313 * direct-io pins the pages with get_user_pages(). This makes 1314 * direct-io pins the pages with get_user_pages(). This makes
1314 * is_page_cache_freeable return false, and the VM will not clean the pages. 1315 * is_page_cache_freeable return false, and the VM will not clean the pages.
1315 * But other code (eg, pdflush) could clean the pages if they are mapped 1316 * But other code (eg, flusher threads) could clean the pages if they are mapped
1316 * pagecache. 1317 * pagecache.
1317 * 1318 *
1318 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the 1319 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45b..38e721b35d45 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1578 unsigned long nr_segs, loff_t pos) 1578 unsigned long nr_segs, loff_t pos)
1579{ 1579{
1580 struct file *file = iocb->ki_filp; 1580 struct file *file = iocb->ki_filp;
1581 struct blk_plug plug;
1581 ssize_t ret; 1582 ssize_t ret;
1582 1583
1583 BUG_ON(iocb->ki_pos != pos); 1584 BUG_ON(iocb->ki_pos != pos);
1584 1585
1586 blk_start_plug(&plug);
1585 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1587 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1586 if (ret > 0 || ret == -EIOCBQUEUED) { 1588 if (ret > 0 || ret == -EIOCBQUEUED) {
1587 ssize_t err; 1589 ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1590 if (err < 0 && ret > 0) 1592 if (err < 0 && ret > 0)
1591 ret = err; 1593 ret = err;
1592 } 1594 }
1595 blk_finish_plug(&plug);
1593 return ret; 1596 return ret;
1594} 1597}
1595EXPORT_SYMBOL_GPL(blkdev_aio_write); 1598EXPORT_SYMBOL_GPL(blkdev_aio_write);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a256f3b2a845..ff6475f409d6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1438,10 +1438,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
1438 ret = extent_from_logical(fs_info, logical, path, 1438 ret = extent_from_logical(fs_info, logical, path,
1439 &found_key); 1439 &found_key);
1440 btrfs_release_path(path); 1440 btrfs_release_path(path);
1441 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1442 ret = -EINVAL;
1443 if (ret < 0) 1441 if (ret < 0)
1444 return ret; 1442 return ret;
1443 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1444 return -EINVAL;
1445 1445
1446 extent_item_pos = logical - found_key.objectid; 1446 extent_item_pos = logical - found_key.objectid;
1447 ret = iterate_extent_inodes(fs_info, found_key.objectid, 1447 ret = iterate_extent_inodes(fs_info, found_key.objectid,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 86eff48dab78..43d1c5a3a030 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -818,6 +818,7 @@ static void free_workspace(int type, struct list_head *workspace)
818 btrfs_compress_op[idx]->free_workspace(workspace); 818 btrfs_compress_op[idx]->free_workspace(workspace);
819 atomic_dec(alloc_workspace); 819 atomic_dec(alloc_workspace);
820wake: 820wake:
821 smp_mb();
821 if (waitqueue_active(workspace_wait)) 822 if (waitqueue_active(workspace_wait))
822 wake_up(workspace_wait); 823 wake_up(workspace_wait);
823} 824}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9d7621f271ff..6d183f60d63a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -421,12 +421,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
421 spin_unlock(&fs_info->tree_mod_seq_lock); 421 spin_unlock(&fs_info->tree_mod_seq_lock);
422 422
423 /* 423 /*
424 * we removed the lowest blocker from the blocker list, so there may be
425 * more processible delayed refs.
426 */
427 wake_up(&fs_info->tree_mod_seq_wait);
428
429 /*
430 * anything that's lower than the lowest existing (read: blocked) 424 * anything that's lower than the lowest existing (read: blocked)
431 * sequence number can be removed from the tree. 425 * sequence number can be removed from the tree.
432 */ 426 */
@@ -631,6 +625,9 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
631 u32 nritems; 625 u32 nritems;
632 int ret; 626 int ret;
633 627
628 if (btrfs_header_level(eb) == 0)
629 return;
630
634 nritems = btrfs_header_nritems(eb); 631 nritems = btrfs_header_nritems(eb);
635 for (i = nritems - 1; i >= 0; i--) { 632 for (i = nritems - 1; i >= 0; i--) {
636 ret = tree_mod_log_insert_key_locked(fs_info, eb, i, 633 ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4bab807227ad..0d195b507660 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1252,7 +1252,6 @@ struct btrfs_fs_info {
1252 atomic_t tree_mod_seq; 1252 atomic_t tree_mod_seq;
1253 struct list_head tree_mod_seq_list; 1253 struct list_head tree_mod_seq_list;
1254 struct seq_list tree_mod_seq_elem; 1254 struct seq_list tree_mod_seq_elem;
1255 wait_queue_head_t tree_mod_seq_wait;
1256 1255
1257 /* this protects tree_mod_log */ 1256 /* this protects tree_mod_log */
1258 rwlock_t tree_mod_log_lock; 1257 rwlock_t tree_mod_log_lock;
@@ -3192,7 +3191,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
3192int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 3191int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
3193 struct bio *bio, u32 *dst); 3192 struct bio *bio, u32 *dst);
3194int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 3193int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
3195 struct bio *bio, u64 logical_offset, u32 *dst); 3194 struct bio *bio, u64 logical_offset);
3196int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 3195int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
3197 struct btrfs_root *root, 3196 struct btrfs_root *root,
3198 u64 objectid, u64 pos, 3197 u64 objectid, u64 pos,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 335605c8ceab..07d5eeb1e6f1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -512,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
512 512
513 rb_erase(&delayed_item->rb_node, root); 513 rb_erase(&delayed_item->rb_node, root);
514 delayed_item->delayed_node->count--; 514 delayed_item->delayed_node->count--;
515 atomic_dec(&delayed_root->items); 515 if (atomic_dec_return(&delayed_root->items) <
516 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && 516 BTRFS_DELAYED_BACKGROUND &&
517 waitqueue_active(&delayed_root->wait)) 517 waitqueue_active(&delayed_root->wait))
518 wake_up(&delayed_root->wait); 518 wake_up(&delayed_root->wait);
519} 519}
@@ -1028,9 +1028,10 @@ do_again:
1028 btrfs_release_delayed_item(prev); 1028 btrfs_release_delayed_item(prev);
1029 ret = 0; 1029 ret = 0;
1030 btrfs_release_path(path); 1030 btrfs_release_path(path);
1031 if (curr) 1031 if (curr) {
1032 mutex_unlock(&node->mutex);
1032 goto do_again; 1033 goto do_again;
1033 else 1034 } else
1034 goto delete_fail; 1035 goto delete_fail;
1035 } 1036 }
1036 1037
@@ -1055,8 +1056,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
1055 delayed_node->count--; 1056 delayed_node->count--;
1056 1057
1057 delayed_root = delayed_node->root->fs_info->delayed_root; 1058 delayed_root = delayed_node->root->fs_info->delayed_root;
1058 atomic_dec(&delayed_root->items); 1059 if (atomic_dec_return(&delayed_root->items) <
1059 if (atomic_read(&delayed_root->items) <
1060 BTRFS_DELAYED_BACKGROUND && 1060 BTRFS_DELAYED_BACKGROUND &&
1061 waitqueue_active(&delayed_root->wait)) 1061 waitqueue_active(&delayed_root->wait))
1062 wake_up(&delayed_root->wait); 1062 wake_up(&delayed_root->wait);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index da7419ed01bb..ae9411773397 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -38,17 +38,14 @@
38static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, 38static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
39 struct btrfs_delayed_tree_ref *ref1) 39 struct btrfs_delayed_tree_ref *ref1)
40{ 40{
41 if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { 41 if (ref1->root < ref2->root)
42 if (ref1->root < ref2->root) 42 return -1;
43 return -1; 43 if (ref1->root > ref2->root)
44 if (ref1->root > ref2->root) 44 return 1;
45 return 1; 45 if (ref1->parent < ref2->parent)
46 } else { 46 return -1;
47 if (ref1->parent < ref2->parent) 47 if (ref1->parent > ref2->parent)
48 return -1; 48 return 1;
49 if (ref1->parent > ref2->parent)
50 return 1;
51 }
52 return 0; 49 return 0;
53} 50}
54 51
@@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
85 * type of the delayed backrefs and content of delayed backrefs. 82 * type of the delayed backrefs and content of delayed backrefs.
86 */ 83 */
87static int comp_entry(struct btrfs_delayed_ref_node *ref2, 84static int comp_entry(struct btrfs_delayed_ref_node *ref2,
88 struct btrfs_delayed_ref_node *ref1) 85 struct btrfs_delayed_ref_node *ref1,
86 bool compare_seq)
89{ 87{
90 if (ref1->bytenr < ref2->bytenr) 88 if (ref1->bytenr < ref2->bytenr)
91 return -1; 89 return -1;
@@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
102 if (ref1->type > ref2->type) 100 if (ref1->type > ref2->type)
103 return 1; 101 return 1;
104 /* merging of sequenced refs is not allowed */ 102 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq) 103 if (compare_seq) {
106 return -1; 104 if (ref1->seq < ref2->seq)
107 if (ref1->seq > ref2->seq) 105 return -1;
108 return 1; 106 if (ref1->seq > ref2->seq)
107 return 1;
108 }
109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
139 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 139 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
140 rb_node); 140 rb_node);
141 141
142 cmp = comp_entry(entry, ins); 142 cmp = comp_entry(entry, ins, 1);
143 if (cmp < 0) 143 if (cmp < 0)
144 p = &(*p)->rb_left; 144 p = &(*p)->rb_left;
145 else if (cmp > 0) 145 else if (cmp > 0)
@@ -233,6 +233,114 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
233 return 0; 233 return 0;
234} 234}
235 235
236static void inline drop_delayed_ref(struct btrfs_trans_handle *trans,
237 struct btrfs_delayed_ref_root *delayed_refs,
238 struct btrfs_delayed_ref_node *ref)
239{
240 rb_erase(&ref->rb_node, &delayed_refs->root);
241 ref->in_tree = 0;
242 btrfs_put_delayed_ref(ref);
243 delayed_refs->num_entries--;
244 if (trans->delayed_ref_updates)
245 trans->delayed_ref_updates--;
246}
247
248static int merge_ref(struct btrfs_trans_handle *trans,
249 struct btrfs_delayed_ref_root *delayed_refs,
250 struct btrfs_delayed_ref_node *ref, u64 seq)
251{
252 struct rb_node *node;
253 int merged = 0;
254 int mod = 0;
255 int done = 0;
256
257 node = rb_prev(&ref->rb_node);
258 while (node) {
259 struct btrfs_delayed_ref_node *next;
260
261 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
262 node = rb_prev(node);
263 if (next->bytenr != ref->bytenr)
264 break;
265 if (seq && next->seq >= seq)
266 break;
267 if (comp_entry(ref, next, 0))
268 continue;
269
270 if (ref->action == next->action) {
271 mod = next->ref_mod;
272 } else {
273 if (ref->ref_mod < next->ref_mod) {
274 struct btrfs_delayed_ref_node *tmp;
275
276 tmp = ref;
277 ref = next;
278 next = tmp;
279 done = 1;
280 }
281 mod = -next->ref_mod;
282 }
283
284 merged++;
285 drop_delayed_ref(trans, delayed_refs, next);
286 ref->ref_mod += mod;
287 if (ref->ref_mod == 0) {
288 drop_delayed_ref(trans, delayed_refs, ref);
289 break;
290 } else {
291 /*
292 * You can't have multiples of the same ref on a tree
293 * block.
294 */
295 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
296 ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
297 }
298
299 if (done)
300 break;
301 node = rb_prev(&ref->rb_node);
302 }
303
304 return merged;
305}
306
307void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
308 struct btrfs_fs_info *fs_info,
309 struct btrfs_delayed_ref_root *delayed_refs,
310 struct btrfs_delayed_ref_head *head)
311{
312 struct rb_node *node;
313 u64 seq = 0;
314
315 spin_lock(&fs_info->tree_mod_seq_lock);
316 if (!list_empty(&fs_info->tree_mod_seq_list)) {
317 struct seq_list *elem;
318
319 elem = list_first_entry(&fs_info->tree_mod_seq_list,
320 struct seq_list, list);
321 seq = elem->seq;
322 }
323 spin_unlock(&fs_info->tree_mod_seq_lock);
324
325 node = rb_prev(&head->node.rb_node);
326 while (node) {
327 struct btrfs_delayed_ref_node *ref;
328
329 ref = rb_entry(node, struct btrfs_delayed_ref_node,
330 rb_node);
331 if (ref->bytenr != head->node.bytenr)
332 break;
333
334 /* We can't merge refs that are outside of our seq count */
335 if (seq && ref->seq >= seq)
336 break;
337 if (merge_ref(trans, delayed_refs, ref, seq))
338 node = rb_prev(&head->node.rb_node);
339 else
340 node = rb_prev(node);
341 }
342}
343
236int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, 344int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
237 struct btrfs_delayed_ref_root *delayed_refs, 345 struct btrfs_delayed_ref_root *delayed_refs,
238 u64 seq) 346 u64 seq)
@@ -336,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
336 * every changing the extent allocation tree. 444 * every changing the extent allocation tree.
337 */ 445 */
338 existing->ref_mod--; 446 existing->ref_mod--;
339 if (existing->ref_mod == 0) { 447 if (existing->ref_mod == 0)
340 rb_erase(&existing->rb_node, 448 drop_delayed_ref(trans, delayed_refs, existing);
341 &delayed_refs->root); 449 else
342 existing->in_tree = 0;
343 btrfs_put_delayed_ref(existing);
344 delayed_refs->num_entries--;
345 if (trans->delayed_ref_updates)
346 trans->delayed_ref_updates--;
347 } else {
348 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 450 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
349 existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 451 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
350 }
351 } else { 452 } else {
352 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 453 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
353 existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 454 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -662,9 +763,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
662 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 763 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
663 num_bytes, parent, ref_root, level, action, 764 num_bytes, parent, ref_root, level, action,
664 for_cow); 765 for_cow);
665 if (!need_ref_seq(for_cow, ref_root) &&
666 waitqueue_active(&fs_info->tree_mod_seq_wait))
667 wake_up(&fs_info->tree_mod_seq_wait);
668 spin_unlock(&delayed_refs->lock); 766 spin_unlock(&delayed_refs->lock);
669 if (need_ref_seq(for_cow, ref_root)) 767 if (need_ref_seq(for_cow, ref_root))
670 btrfs_qgroup_record_ref(trans, &ref->node, extent_op); 768 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
@@ -713,9 +811,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
713 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 811 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
714 num_bytes, parent, ref_root, owner, offset, 812 num_bytes, parent, ref_root, owner, offset,
715 action, for_cow); 813 action, for_cow);
716 if (!need_ref_seq(for_cow, ref_root) &&
717 waitqueue_active(&fs_info->tree_mod_seq_wait))
718 wake_up(&fs_info->tree_mod_seq_wait);
719 spin_unlock(&delayed_refs->lock); 814 spin_unlock(&delayed_refs->lock);
720 if (need_ref_seq(for_cow, ref_root)) 815 if (need_ref_seq(for_cow, ref_root))
721 btrfs_qgroup_record_ref(trans, &ref->node, extent_op); 816 btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
@@ -744,8 +839,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 839 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
745 extent_op->is_data); 840 extent_op->is_data);
746 841
747 if (waitqueue_active(&fs_info->tree_mod_seq_wait))
748 wake_up(&fs_info->tree_mod_seq_wait);
749 spin_unlock(&delayed_refs->lock); 842 spin_unlock(&delayed_refs->lock);
750 return 0; 843 return 0;
751} 844}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 0d7c90c366b6..ab5300595847 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
167 struct btrfs_trans_handle *trans, 167 struct btrfs_trans_handle *trans,
168 u64 bytenr, u64 num_bytes, 168 u64 bytenr, u64 num_bytes,
169 struct btrfs_delayed_extent_op *extent_op); 169 struct btrfs_delayed_extent_op *extent_op);
170void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
171 struct btrfs_fs_info *fs_info,
172 struct btrfs_delayed_ref_root *delayed_refs,
173 struct btrfs_delayed_ref_head *head);
170 174
171struct btrfs_delayed_ref_head * 175struct btrfs_delayed_ref_head *
172btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 176btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fadeba6a5db9..22e98e04c2ea 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -377,9 +377,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
377 ret = read_extent_buffer_pages(io_tree, eb, start, 377 ret = read_extent_buffer_pages(io_tree, eb, start,
378 WAIT_COMPLETE, 378 WAIT_COMPLETE,
379 btree_get_extent, mirror_num); 379 btree_get_extent, mirror_num);
380 if (!ret && !verify_parent_transid(io_tree, eb, 380 if (!ret) {
381 if (!verify_parent_transid(io_tree, eb,
381 parent_transid, 0)) 382 parent_transid, 0))
382 break; 383 break;
384 else
385 ret = -EIO;
386 }
383 387
384 /* 388 /*
385 * This buffer's crc is fine, but its contents are corrupted, so 389 * This buffer's crc is fine, but its contents are corrupted, so
@@ -754,9 +758,7 @@ static void run_one_async_done(struct btrfs_work *work)
754 limit = btrfs_async_submit_limit(fs_info); 758 limit = btrfs_async_submit_limit(fs_info);
755 limit = limit * 2 / 3; 759 limit = limit * 2 / 3;
756 760
757 atomic_dec(&fs_info->nr_async_submits); 761 if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
758
759 if (atomic_read(&fs_info->nr_async_submits) < limit &&
760 waitqueue_active(&fs_info->async_submit_wait)) 762 waitqueue_active(&fs_info->async_submit_wait))
761 wake_up(&fs_info->async_submit_wait); 763 wake_up(&fs_info->async_submit_wait);
762 764
@@ -1614,8 +1616,6 @@ static int cleaner_kthread(void *arg)
1614 struct btrfs_root *root = arg; 1616 struct btrfs_root *root = arg;
1615 1617
1616 do { 1618 do {
1617 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1618
1619 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1619 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1620 mutex_trylock(&root->fs_info->cleaner_mutex)) { 1620 mutex_trylock(&root->fs_info->cleaner_mutex)) {
1621 btrfs_run_delayed_iputs(root); 1621 btrfs_run_delayed_iputs(root);
@@ -1647,7 +1647,6 @@ static int transaction_kthread(void *arg)
1647 do { 1647 do {
1648 cannot_commit = false; 1648 cannot_commit = false;
1649 delay = HZ * 30; 1649 delay = HZ * 30;
1650 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1651 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1650 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1652 1651
1653 spin_lock(&root->fs_info->trans_lock); 1652 spin_lock(&root->fs_info->trans_lock);
@@ -2035,8 +2034,6 @@ int open_ctree(struct super_block *sb,
2035 fs_info->free_chunk_space = 0; 2034 fs_info->free_chunk_space = 0;
2036 fs_info->tree_mod_log = RB_ROOT; 2035 fs_info->tree_mod_log = RB_ROOT;
2037 2036
2038 init_waitqueue_head(&fs_info->tree_mod_seq_wait);
2039
2040 /* readahead state */ 2037 /* readahead state */
2041 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2038 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
2042 spin_lock_init(&fs_info->reada_lock); 2039 spin_lock_init(&fs_info->reada_lock);
@@ -2531,8 +2528,7 @@ retry_root_backup:
2531 goto fail_trans_kthread; 2528 goto fail_trans_kthread;
2532 2529
2533 /* do not make disk changes in broken FS */ 2530 /* do not make disk changes in broken FS */
2534 if (btrfs_super_log_root(disk_super) != 0 && 2531 if (btrfs_super_log_root(disk_super) != 0) {
2535 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
2536 u64 bytenr = btrfs_super_log_root(disk_super); 2532 u64 bytenr = btrfs_super_log_root(disk_super);
2537 2533
2538 if (fs_devices->rw_devices == 0) { 2534 if (fs_devices->rw_devices == 0) {
@@ -3192,30 +3188,14 @@ int close_ctree(struct btrfs_root *root)
3192 /* clear out the rbtree of defraggable inodes */ 3188 /* clear out the rbtree of defraggable inodes */
3193 btrfs_run_defrag_inodes(fs_info); 3189 btrfs_run_defrag_inodes(fs_info);
3194 3190
3195 /*
3196 * Here come 2 situations when btrfs is broken to flip readonly:
3197 *
3198 * 1. when btrfs flips readonly somewhere else before
3199 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
3200 * and btrfs will skip to write sb directly to keep
3201 * ERROR state on disk.
3202 *
3203 * 2. when btrfs flips readonly just in btrfs_commit_super,
3204 * and in such case, btrfs cannot write sb via btrfs_commit_super,
3205 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
3206 * btrfs will cleanup all FS resources first and write sb then.
3207 */
3208 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3191 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3209 ret = btrfs_commit_super(root); 3192 ret = btrfs_commit_super(root);
3210 if (ret) 3193 if (ret)
3211 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3194 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3212 } 3195 }
3213 3196
3214 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 3197 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
3215 ret = btrfs_error_commit_super(root); 3198 btrfs_error_commit_super(root);
3216 if (ret)
3217 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3218 }
3219 3199
3220 btrfs_put_block_group_cache(fs_info); 3200 btrfs_put_block_group_cache(fs_info);
3221 3201
@@ -3437,18 +3417,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3437 if (read_only) 3417 if (read_only)
3438 return 0; 3418 return 0;
3439 3419
3440 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
3441 printk(KERN_WARNING "warning: mount fs with errors, "
3442 "running btrfsck is recommended\n");
3443 }
3444
3445 return 0; 3420 return 0;
3446} 3421}
3447 3422
3448int btrfs_error_commit_super(struct btrfs_root *root) 3423void btrfs_error_commit_super(struct btrfs_root *root)
3449{ 3424{
3450 int ret;
3451
3452 mutex_lock(&root->fs_info->cleaner_mutex); 3425 mutex_lock(&root->fs_info->cleaner_mutex);
3453 btrfs_run_delayed_iputs(root); 3426 btrfs_run_delayed_iputs(root);
3454 mutex_unlock(&root->fs_info->cleaner_mutex); 3427 mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3458,10 +3431,6 @@ int btrfs_error_commit_super(struct btrfs_root *root)
3458 3431
3459 /* cleanup FS via transaction */ 3432 /* cleanup FS via transaction */
3460 btrfs_cleanup_transaction(root); 3433 btrfs_cleanup_transaction(root);
3461
3462 ret = write_ctree_super(NULL, root, 0);
3463
3464 return ret;
3465} 3434}
3466 3435
3467static void btrfs_destroy_ordered_operations(struct btrfs_root *root) 3436static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
@@ -3785,14 +3754,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3785 /* FIXME: cleanup wait for commit */ 3754 /* FIXME: cleanup wait for commit */
3786 t->in_commit = 1; 3755 t->in_commit = 1;
3787 t->blocked = 1; 3756 t->blocked = 1;
3757 smp_mb();
3788 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 3758 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3789 wake_up(&root->fs_info->transaction_blocked_wait); 3759 wake_up(&root->fs_info->transaction_blocked_wait);
3790 3760
3791 t->blocked = 0; 3761 t->blocked = 0;
3762 smp_mb();
3792 if (waitqueue_active(&root->fs_info->transaction_wait)) 3763 if (waitqueue_active(&root->fs_info->transaction_wait))
3793 wake_up(&root->fs_info->transaction_wait); 3764 wake_up(&root->fs_info->transaction_wait);
3794 3765
3795 t->commit_done = 1; 3766 t->commit_done = 1;
3767 smp_mb();
3796 if (waitqueue_active(&t->commit_wait)) 3768 if (waitqueue_active(&t->commit_wait))
3797 wake_up(&t->commit_wait); 3769 wake_up(&t->commit_wait);
3798 3770
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95e147eea239..c5b00a735fef 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, int max_mirrors); 54 struct btrfs_root *root, int max_mirrors);
55struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 55struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
56int btrfs_commit_super(struct btrfs_root *root); 56int btrfs_commit_super(struct btrfs_root *root);
57int btrfs_error_commit_super(struct btrfs_root *root); 57void btrfs_error_commit_super(struct btrfs_root *root);
58struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 58struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
59 u64 bytenr, u32 blocksize); 59 u64 bytenr, u32 blocksize);
60struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 60struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e1b153b7c47..ba58024d40d3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2252,6 +2252,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2252 } 2252 }
2253 2253
2254 /* 2254 /*
2255 * We need to try and merge add/drops of the same ref since we
2256 * can run into issues with relocate dropping the implicit ref
2257 * and then it being added back again before the drop can
2258 * finish. If we merged anything we need to re-loop so we can
2259 * get a good ref.
2260 */
2261 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2262 locked_ref);
2263
2264 /*
2255 * locked_ref is the head node, so we have to go one 2265 * locked_ref is the head node, so we have to go one
2256 * node back for any delayed ref updates 2266 * node back for any delayed ref updates
2257 */ 2267 */
@@ -2318,12 +2328,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2318 ref->in_tree = 0; 2328 ref->in_tree = 0;
2319 rb_erase(&ref->rb_node, &delayed_refs->root); 2329 rb_erase(&ref->rb_node, &delayed_refs->root);
2320 delayed_refs->num_entries--; 2330 delayed_refs->num_entries--;
2321 /* 2331 if (locked_ref) {
2322 * we modified num_entries, but as we're currently running 2332 /*
2323 * delayed refs, skip 2333 * when we play the delayed ref, also correct the
2324 * wake_up(&delayed_refs->seq_wait); 2334 * ref_mod on head
2325 * here. 2335 */
2326 */ 2336 switch (ref->action) {
2337 case BTRFS_ADD_DELAYED_REF:
2338 case BTRFS_ADD_DELAYED_EXTENT:
2339 locked_ref->node.ref_mod -= ref->ref_mod;
2340 break;
2341 case BTRFS_DROP_DELAYED_REF:
2342 locked_ref->node.ref_mod += ref->ref_mod;
2343 break;
2344 default:
2345 WARN_ON(1);
2346 }
2347 }
2327 spin_unlock(&delayed_refs->lock); 2348 spin_unlock(&delayed_refs->lock);
2328 2349
2329 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2350 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2350,22 +2371,6 @@ next:
2350 return count; 2371 return count;
2351} 2372}
2352 2373
2353static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
2354 struct btrfs_delayed_ref_root *delayed_refs,
2355 unsigned long num_refs,
2356 struct list_head *first_seq)
2357{
2358 spin_unlock(&delayed_refs->lock);
2359 pr_debug("waiting for more refs (num %ld, first %p)\n",
2360 num_refs, first_seq);
2361 wait_event(fs_info->tree_mod_seq_wait,
2362 num_refs != delayed_refs->num_entries ||
2363 fs_info->tree_mod_seq_list.next != first_seq);
2364 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2365 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
2366 spin_lock(&delayed_refs->lock);
2367}
2368
2369#ifdef SCRAMBLE_DELAYED_REFS 2374#ifdef SCRAMBLE_DELAYED_REFS
2370/* 2375/*
2371 * Normally delayed refs get processed in ascending bytenr order. This 2376 * Normally delayed refs get processed in ascending bytenr order. This
@@ -2460,13 +2465,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2460 struct btrfs_delayed_ref_root *delayed_refs; 2465 struct btrfs_delayed_ref_root *delayed_refs;
2461 struct btrfs_delayed_ref_node *ref; 2466 struct btrfs_delayed_ref_node *ref;
2462 struct list_head cluster; 2467 struct list_head cluster;
2463 struct list_head *first_seq = NULL;
2464 int ret; 2468 int ret;
2465 u64 delayed_start; 2469 u64 delayed_start;
2466 int run_all = count == (unsigned long)-1; 2470 int run_all = count == (unsigned long)-1;
2467 int run_most = 0; 2471 int run_most = 0;
2468 unsigned long num_refs = 0; 2472 int loops;
2469 int consider_waiting;
2470 2473
2471 /* We'll clean this up in btrfs_cleanup_transaction */ 2474 /* We'll clean this up in btrfs_cleanup_transaction */
2472 if (trans->aborted) 2475 if (trans->aborted)
@@ -2484,7 +2487,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2484 delayed_refs = &trans->transaction->delayed_refs; 2487 delayed_refs = &trans->transaction->delayed_refs;
2485 INIT_LIST_HEAD(&cluster); 2488 INIT_LIST_HEAD(&cluster);
2486again: 2489again:
2487 consider_waiting = 0; 2490 loops = 0;
2488 spin_lock(&delayed_refs->lock); 2491 spin_lock(&delayed_refs->lock);
2489 2492
2490#ifdef SCRAMBLE_DELAYED_REFS 2493#ifdef SCRAMBLE_DELAYED_REFS
@@ -2512,31 +2515,6 @@ again:
2512 if (ret) 2515 if (ret)
2513 break; 2516 break;
2514 2517
2515 if (delayed_start >= delayed_refs->run_delayed_start) {
2516 if (consider_waiting == 0) {
2517 /*
2518 * btrfs_find_ref_cluster looped. let's do one
2519 * more cycle. if we don't run any delayed ref
2520 * during that cycle (because we can't because
2521 * all of them are blocked) and if the number of
2522 * refs doesn't change, we avoid busy waiting.
2523 */
2524 consider_waiting = 1;
2525 num_refs = delayed_refs->num_entries;
2526 first_seq = root->fs_info->tree_mod_seq_list.next;
2527 } else {
2528 wait_for_more_refs(root->fs_info, delayed_refs,
2529 num_refs, first_seq);
2530 /*
2531 * after waiting, things have changed. we
2532 * dropped the lock and someone else might have
2533 * run some refs, built new clusters and so on.
2534 * therefore, we restart staleness detection.
2535 */
2536 consider_waiting = 0;
2537 }
2538 }
2539
2540 ret = run_clustered_refs(trans, root, &cluster); 2518 ret = run_clustered_refs(trans, root, &cluster);
2541 if (ret < 0) { 2519 if (ret < 0) {
2542 spin_unlock(&delayed_refs->lock); 2520 spin_unlock(&delayed_refs->lock);
@@ -2549,9 +2527,26 @@ again:
2549 if (count == 0) 2527 if (count == 0)
2550 break; 2528 break;
2551 2529
2552 if (ret || delayed_refs->run_delayed_start == 0) { 2530 if (delayed_start >= delayed_refs->run_delayed_start) {
2531 if (loops == 0) {
2532 /*
2533 * btrfs_find_ref_cluster looped. let's do one
2534 * more cycle. if we don't run any delayed ref
2535 * during that cycle (because we can't because
2536 * all of them are blocked), bail out.
2537 */
2538 loops = 1;
2539 } else {
2540 /*
2541 * no runnable refs left, stop trying
2542 */
2543 BUG_ON(run_all);
2544 break;
2545 }
2546 }
2547 if (ret) {
2553 /* refs were run, let's reset staleness detection */ 2548 /* refs were run, let's reset staleness detection */
2554 consider_waiting = 0; 2549 loops = 0;
2555 } 2550 }
2556 } 2551 }
2557 2552
@@ -3007,17 +3002,16 @@ again:
3007 } 3002 }
3008 spin_unlock(&block_group->lock); 3003 spin_unlock(&block_group->lock);
3009 3004
3010 num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); 3005 /*
3006 * Try to preallocate enough space based on how big the block group is.
3007 * Keep in mind this has to include any pinned space which could end up
3008 * taking up quite a bit since it's not folded into the other space
3009 * cache.
3010 */
3011 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3011 if (!num_pages) 3012 if (!num_pages)
3012 num_pages = 1; 3013 num_pages = 1;
3013 3014
3014 /*
3015 * Just to make absolutely sure we have enough space, we're going to
3016 * preallocate 12 pages worth of space for each block group. In
3017 * practice we ought to use at most 8, but we need extra space so we can
3018 * add our header and have a terminator between the extents and the
3019 * bitmaps.
3020 */
3021 num_pages *= 16; 3015 num_pages *= 16;
3022 num_pages *= PAGE_CACHE_SIZE; 3016 num_pages *= PAGE_CACHE_SIZE;
3023 3017
@@ -4571,8 +4565,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4571 if (root->fs_info->quota_enabled) { 4565 if (root->fs_info->quota_enabled) {
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4566 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4567 nr_extents * root->leafsize);
4574 if (ret) 4568 if (ret) {
4569 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4575 return ret; 4570 return ret;
4571 }
4576 } 4572 }
4577 4573
4578 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4574 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -5294,9 +5290,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5294 rb_erase(&head->node.rb_node, &delayed_refs->root); 5290 rb_erase(&head->node.rb_node, &delayed_refs->root);
5295 5291
5296 delayed_refs->num_entries--; 5292 delayed_refs->num_entries--;
5297 smp_mb();
5298 if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
5299 wake_up(&root->fs_info->tree_mod_seq_wait);
5300 5293
5301 /* 5294 /*
5302 * we don't take a ref on the node because we're removing it from the 5295 * we don't take a ref on the node because we're removing it from the
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 45c81bb4ac82..4c878476bb91 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2330,23 +2330,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2330 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2330 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2331 ret = tree->ops->readpage_end_io_hook(page, start, end, 2331 ret = tree->ops->readpage_end_io_hook(page, start, end,
2332 state, mirror); 2332 state, mirror);
2333 if (ret) { 2333 if (ret)
2334 /* no IO indicated but software detected errors
2335 * in the block, either checksum errors or
2336 * issues with the contents */
2337 struct btrfs_root *root =
2338 BTRFS_I(page->mapping->host)->root;
2339 struct btrfs_device *device;
2340
2341 uptodate = 0; 2334 uptodate = 0;
2342 device = btrfs_find_device_for_logical( 2335 else
2343 root, start, mirror);
2344 if (device)
2345 btrfs_dev_stat_inc_and_print(device,
2346 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2347 } else {
2348 clean_io_failure(start, page); 2336 clean_io_failure(start, page);
2349 }
2350 } 2337 }
2351 2338
2352 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2339 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b45b9de0c21d..857d93cd01dc 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -272,9 +272,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
272} 272}
273 273
274int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 274int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
275 struct bio *bio, u64 offset, u32 *dst) 275 struct bio *bio, u64 offset)
276{ 276{
277 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); 277 return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
278} 278}
279 279
280int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 280int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9aa01ec2138d..5caf285c6e4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1379 ssize_t err = 0; 1379 ssize_t err = 0;
1380 size_t count, ocount; 1380 size_t count, ocount;
1381 1381
1382 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1382 sb_start_write(inode->i_sb);
1383 1383
1384 mutex_lock(&inode->i_mutex); 1384 mutex_lock(&inode->i_mutex);
1385 1385
@@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1469 num_written = err; 1469 num_written = err;
1470 } 1470 }
1471out: 1471out:
1472 sb_end_write(inode->i_sb);
1472 current->backing_dev_info = NULL; 1473 current->backing_dev_info = NULL;
1473 return num_written ? num_written : err; 1474 return num_written ? num_written : err;
1474} 1475}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48bdfd2591c2..ec154f954646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -324,7 +324,8 @@ static noinline int add_async_extent(struct async_cow *cow,
324 * If this code finds it can't get good compression, it puts an 324 * If this code finds it can't get good compression, it puts an
325 * entry onto the work queue to write the uncompressed bytes. This 325 * entry onto the work queue to write the uncompressed bytes. This
326 * makes sure that both compressed inodes and uncompressed inodes 326 * makes sure that both compressed inodes and uncompressed inodes
327 * are written in the same order that pdflush sent them down. 327 * are written in the same order that the flusher thread sent them
328 * down.
328 */ 329 */
329static noinline int compress_file_range(struct inode *inode, 330static noinline int compress_file_range(struct inode *inode,
330 struct page *locked_page, 331 struct page *locked_page,
@@ -1007,9 +1008,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
1007 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> 1008 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1008 PAGE_CACHE_SHIFT; 1009 PAGE_CACHE_SHIFT;
1009 1010
1010 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); 1011 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1011
1012 if (atomic_read(&root->fs_info->async_delalloc_pages) <
1013 5 * 1024 * 1024 && 1012 5 * 1024 * 1024 &&
1014 waitqueue_active(&root->fs_info->async_submit_wait)) 1013 waitqueue_active(&root->fs_info->async_submit_wait))
1015 wake_up(&root->fs_info->async_submit_wait); 1014 wake_up(&root->fs_info->async_submit_wait);
@@ -1884,8 +1883,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1884 trans = btrfs_join_transaction_nolock(root); 1883 trans = btrfs_join_transaction_nolock(root);
1885 else 1884 else
1886 trans = btrfs_join_transaction(root); 1885 trans = btrfs_join_transaction(root);
1887 if (IS_ERR(trans)) 1886 if (IS_ERR(trans)) {
1888 return PTR_ERR(trans); 1887 ret = PTR_ERR(trans);
1888 trans = NULL;
1889 goto out;
1890 }
1889 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1891 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1890 ret = btrfs_update_inode_fallback(trans, root, inode); 1892 ret = btrfs_update_inode_fallback(trans, root, inode);
1891 if (ret) /* -ENOMEM or corruption */ 1893 if (ret) /* -ENOMEM or corruption */
@@ -3173,7 +3175,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3173 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3175 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3174 inode_inc_iversion(dir); 3176 inode_inc_iversion(dir);
3175 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3177 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3176 ret = btrfs_update_inode(trans, root, dir); 3178 ret = btrfs_update_inode_fallback(trans, root, dir);
3177 if (ret) 3179 if (ret)
3178 btrfs_abort_transaction(trans, root, ret); 3180 btrfs_abort_transaction(trans, root, ret);
3179out: 3181out:
@@ -5773,18 +5775,112 @@ out:
5773 return ret; 5775 return ret;
5774} 5776}
5775 5777
5778static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5779 struct extent_state **cached_state, int writing)
5780{
5781 struct btrfs_ordered_extent *ordered;
5782 int ret = 0;
5783
5784 while (1) {
5785 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5786 0, cached_state);
5787 /*
5788 * We're concerned with the entire range that we're going to be
5789 * doing DIO to, so we need to make sure theres no ordered
5790 * extents in this range.
5791 */
5792 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5793 lockend - lockstart + 1);
5794
5795 /*
5796 * We need to make sure there are no buffered pages in this
5797 * range either, we could have raced between the invalidate in
5798 * generic_file_direct_write and locking the extent. The
5799 * invalidate needs to happen so that reads after a write do not
5800 * get stale data.
5801 */
5802 if (!ordered && (!writing ||
5803 !test_range_bit(&BTRFS_I(inode)->io_tree,
5804 lockstart, lockend, EXTENT_UPTODATE, 0,
5805 *cached_state)))
5806 break;
5807
5808 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5809 cached_state, GFP_NOFS);
5810
5811 if (ordered) {
5812 btrfs_start_ordered_extent(inode, ordered, 1);
5813 btrfs_put_ordered_extent(ordered);
5814 } else {
5815 /* Screw you mmap */
5816 ret = filemap_write_and_wait_range(inode->i_mapping,
5817 lockstart,
5818 lockend);
5819 if (ret)
5820 break;
5821
5822 /*
5823 * If we found a page that couldn't be invalidated just
5824 * fall back to buffered.
5825 */
5826 ret = invalidate_inode_pages2_range(inode->i_mapping,
5827 lockstart >> PAGE_CACHE_SHIFT,
5828 lockend >> PAGE_CACHE_SHIFT);
5829 if (ret)
5830 break;
5831 }
5832
5833 cond_resched();
5834 }
5835
5836 return ret;
5837}
5838
5776static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5839static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5777 struct buffer_head *bh_result, int create) 5840 struct buffer_head *bh_result, int create)
5778{ 5841{
5779 struct extent_map *em; 5842 struct extent_map *em;
5780 struct btrfs_root *root = BTRFS_I(inode)->root; 5843 struct btrfs_root *root = BTRFS_I(inode)->root;
5844 struct extent_state *cached_state = NULL;
5781 u64 start = iblock << inode->i_blkbits; 5845 u64 start = iblock << inode->i_blkbits;
5846 u64 lockstart, lockend;
5782 u64 len = bh_result->b_size; 5847 u64 len = bh_result->b_size;
5783 struct btrfs_trans_handle *trans; 5848 struct btrfs_trans_handle *trans;
5849 int unlock_bits = EXTENT_LOCKED;
5850 int ret;
5851
5852 if (create) {
5853 ret = btrfs_delalloc_reserve_space(inode, len);
5854 if (ret)
5855 return ret;
5856 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
5857 } else {
5858 len = min_t(u64, len, root->sectorsize);
5859 }
5860
5861 lockstart = start;
5862 lockend = start + len - 1;
5863
5864 /*
5865 * If this errors out it's because we couldn't invalidate pagecache for
5866 * this range and we need to fallback to buffered.
5867 */
5868 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
5869 return -ENOTBLK;
5870
5871 if (create) {
5872 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5873 lockend, EXTENT_DELALLOC, NULL,
5874 &cached_state, GFP_NOFS);
5875 if (ret)
5876 goto unlock_err;
5877 }
5784 5878
5785 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5879 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5786 if (IS_ERR(em)) 5880 if (IS_ERR(em)) {
5787 return PTR_ERR(em); 5881 ret = PTR_ERR(em);
5882 goto unlock_err;
5883 }
5788 5884
5789 /* 5885 /*
5790 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 5886 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5803,17 +5899,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5803 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5899 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5804 em->block_start == EXTENT_MAP_INLINE) { 5900 em->block_start == EXTENT_MAP_INLINE) {
5805 free_extent_map(em); 5901 free_extent_map(em);
5806 return -ENOTBLK; 5902 ret = -ENOTBLK;
5903 goto unlock_err;
5807 } 5904 }
5808 5905
5809 /* Just a good old fashioned hole, return */ 5906 /* Just a good old fashioned hole, return */
5810 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5907 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5811 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5908 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5812 free_extent_map(em); 5909 free_extent_map(em);
5813 /* DIO will do one hole at a time, so just unlock a sector */ 5910 ret = 0;
5814 unlock_extent(&BTRFS_I(inode)->io_tree, start, 5911 goto unlock_err;
5815 start + root->sectorsize - 1);
5816 return 0;
5817 } 5912 }
5818 5913
5819 /* 5914 /*
@@ -5826,8 +5921,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5826 * 5921 *
5827 */ 5922 */
5828 if (!create) { 5923 if (!create) {
5829 len = em->len - (start - em->start); 5924 len = min(len, em->len - (start - em->start));
5830 goto map; 5925 lockstart = start + len;
5926 goto unlock;
5831 } 5927 }
5832 5928
5833 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5929 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5859,7 +5955,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5859 btrfs_end_transaction(trans, root); 5955 btrfs_end_transaction(trans, root);
5860 if (ret) { 5956 if (ret) {
5861 free_extent_map(em); 5957 free_extent_map(em);
5862 return ret; 5958 goto unlock_err;
5863 } 5959 }
5864 goto unlock; 5960 goto unlock;
5865 } 5961 }
@@ -5872,14 +5968,12 @@ must_cow:
5872 */ 5968 */
5873 len = bh_result->b_size; 5969 len = bh_result->b_size;
5874 em = btrfs_new_extent_direct(inode, em, start, len); 5970 em = btrfs_new_extent_direct(inode, em, start, len);
5875 if (IS_ERR(em)) 5971 if (IS_ERR(em)) {
5876 return PTR_ERR(em); 5972 ret = PTR_ERR(em);
5973 goto unlock_err;
5974 }
5877 len = min(len, em->len - (start - em->start)); 5975 len = min(len, em->len - (start - em->start));
5878unlock: 5976unlock:
5879 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5880 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5881 0, NULL, GFP_NOFS);
5882map:
5883 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5977 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5884 inode->i_blkbits; 5978 inode->i_blkbits;
5885 bh_result->b_size = len; 5979 bh_result->b_size = len;
@@ -5897,9 +5991,44 @@ map:
5897 i_size_write(inode, start + len); 5991 i_size_write(inode, start + len);
5898 } 5992 }
5899 5993
5994 /*
5995 * In the case of write we need to clear and unlock the entire range,
5996 * in the case of read we need to unlock only the end area that we
5997 * aren't using if there is any left over space.
5998 */
5999 if (lockstart < lockend) {
6000 if (create && len < lockend - lockstart) {
6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6002 lockstart + len - 1, unlock_bits, 1, 0,
6003 &cached_state, GFP_NOFS);
6004 /*
6005 * Beside unlock, we also need to cleanup reserved space
6006 * for the left range by attaching EXTENT_DO_ACCOUNTING.
6007 */
6008 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6009 lockstart + len, lockend,
6010 unlock_bits | EXTENT_DO_ACCOUNTING,
6011 1, 0, NULL, GFP_NOFS);
6012 } else {
6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6014 lockend, unlock_bits, 1, 0,
6015 &cached_state, GFP_NOFS);
6016 }
6017 } else {
6018 free_extent_state(cached_state);
6019 }
6020
5900 free_extent_map(em); 6021 free_extent_map(em);
5901 6022
5902 return 0; 6023 return 0;
6024
6025unlock_err:
6026 if (create)
6027 unlock_bits |= EXTENT_DO_ACCOUNTING;
6028
6029 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6030 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6031 return ret;
5903} 6032}
5904 6033
5905struct btrfs_dio_private { 6034struct btrfs_dio_private {
@@ -5907,7 +6036,6 @@ struct btrfs_dio_private {
5907 u64 logical_offset; 6036 u64 logical_offset;
5908 u64 disk_bytenr; 6037 u64 disk_bytenr;
5909 u64 bytes; 6038 u64 bytes;
5910 u32 *csums;
5911 void *private; 6039 void *private;
5912 6040
5913 /* number of bios pending for this dio */ 6041 /* number of bios pending for this dio */
@@ -5927,7 +6055,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5927 struct inode *inode = dip->inode; 6055 struct inode *inode = dip->inode;
5928 struct btrfs_root *root = BTRFS_I(inode)->root; 6056 struct btrfs_root *root = BTRFS_I(inode)->root;
5929 u64 start; 6057 u64 start;
5930 u32 *private = dip->csums;
5931 6058
5932 start = dip->logical_offset; 6059 start = dip->logical_offset;
5933 do { 6060 do {
@@ -5935,8 +6062,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5935 struct page *page = bvec->bv_page; 6062 struct page *page = bvec->bv_page;
5936 char *kaddr; 6063 char *kaddr;
5937 u32 csum = ~(u32)0; 6064 u32 csum = ~(u32)0;
6065 u64 private = ~(u32)0;
5938 unsigned long flags; 6066 unsigned long flags;
5939 6067
6068 if (get_state_private(&BTRFS_I(inode)->io_tree,
6069 start, &private))
6070 goto failed;
5940 local_irq_save(flags); 6071 local_irq_save(flags);
5941 kaddr = kmap_atomic(page); 6072 kaddr = kmap_atomic(page);
5942 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 6073 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5946,18 +6077,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5946 local_irq_restore(flags); 6077 local_irq_restore(flags);
5947 6078
5948 flush_dcache_page(bvec->bv_page); 6079 flush_dcache_page(bvec->bv_page);
5949 if (csum != *private) { 6080 if (csum != private) {
6081failed:
5950 printk(KERN_ERR "btrfs csum failed ino %llu off" 6082 printk(KERN_ERR "btrfs csum failed ino %llu off"
5951 " %llu csum %u private %u\n", 6083 " %llu csum %u private %u\n",
5952 (unsigned long long)btrfs_ino(inode), 6084 (unsigned long long)btrfs_ino(inode),
5953 (unsigned long long)start, 6085 (unsigned long long)start,
5954 csum, *private); 6086 csum, (unsigned)private);
5955 err = -EIO; 6087 err = -EIO;
5956 } 6088 }
5957 } 6089 }
5958 6090
5959 start += bvec->bv_len; 6091 start += bvec->bv_len;
5960 private++;
5961 bvec++; 6092 bvec++;
5962 } while (bvec <= bvec_end); 6093 } while (bvec <= bvec_end);
5963 6094
@@ -5965,7 +6096,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5965 dip->logical_offset + dip->bytes - 1); 6096 dip->logical_offset + dip->bytes - 1);
5966 bio->bi_private = dip->private; 6097 bio->bi_private = dip->private;
5967 6098
5968 kfree(dip->csums);
5969 kfree(dip); 6099 kfree(dip);
5970 6100
5971 /* If we had a csum failure make sure to clear the uptodate flag */ 6101 /* If we had a csum failure make sure to clear the uptodate flag */
@@ -6071,7 +6201,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
6071 6201
6072static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 6202static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6073 int rw, u64 file_offset, int skip_sum, 6203 int rw, u64 file_offset, int skip_sum,
6074 u32 *csums, int async_submit) 6204 int async_submit)
6075{ 6205{
6076 int write = rw & REQ_WRITE; 6206 int write = rw & REQ_WRITE;
6077 struct btrfs_root *root = BTRFS_I(inode)->root; 6207 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6104,8 +6234,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6104 if (ret) 6234 if (ret)
6105 goto err; 6235 goto err;
6106 } else if (!skip_sum) { 6236 } else if (!skip_sum) {
6107 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, 6237 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
6108 file_offset, csums);
6109 if (ret) 6238 if (ret)
6110 goto err; 6239 goto err;
6111 } 6240 }
@@ -6131,10 +6260,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6131 u64 submit_len = 0; 6260 u64 submit_len = 0;
6132 u64 map_length; 6261 u64 map_length;
6133 int nr_pages = 0; 6262 int nr_pages = 0;
6134 u32 *csums = dip->csums;
6135 int ret = 0; 6263 int ret = 0;
6136 int async_submit = 0; 6264 int async_submit = 0;
6137 int write = rw & REQ_WRITE;
6138 6265
6139 map_length = orig_bio->bi_size; 6266 map_length = orig_bio->bi_size;
6140 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6267 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -6170,16 +6297,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6170 atomic_inc(&dip->pending_bios); 6297 atomic_inc(&dip->pending_bios);
6171 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6298 ret = __btrfs_submit_dio_bio(bio, inode, rw,
6172 file_offset, skip_sum, 6299 file_offset, skip_sum,
6173 csums, async_submit); 6300 async_submit);
6174 if (ret) { 6301 if (ret) {
6175 bio_put(bio); 6302 bio_put(bio);
6176 atomic_dec(&dip->pending_bios); 6303 atomic_dec(&dip->pending_bios);
6177 goto out_err; 6304 goto out_err;
6178 } 6305 }
6179 6306
6180 /* Write's use the ordered csums */
6181 if (!write && !skip_sum)
6182 csums = csums + nr_pages;
6183 start_sector += submit_len >> 9; 6307 start_sector += submit_len >> 9;
6184 file_offset += submit_len; 6308 file_offset += submit_len;
6185 6309
@@ -6209,7 +6333,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6209 6333
6210submit: 6334submit:
6211 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6335 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6212 csums, async_submit); 6336 async_submit);
6213 if (!ret) 6337 if (!ret)
6214 return 0; 6338 return 0;
6215 6339
@@ -6245,17 +6369,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
6245 ret = -ENOMEM; 6369 ret = -ENOMEM;
6246 goto free_ordered; 6370 goto free_ordered;
6247 } 6371 }
6248 dip->csums = NULL;
6249
6250 /* Write's use the ordered csum stuff, so we don't need dip->csums */
6251 if (!write && !skip_sum) {
6252 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
6253 if (!dip->csums) {
6254 kfree(dip);
6255 ret = -ENOMEM;
6256 goto free_ordered;
6257 }
6258 }
6259 6372
6260 dip->private = bio->bi_private; 6373 dip->private = bio->bi_private;
6261 dip->inode = inode; 6374 dip->inode = inode;
@@ -6340,132 +6453,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
6340out: 6453out:
6341 return retval; 6454 return retval;
6342} 6455}
6456
6343static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 6457static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6344 const struct iovec *iov, loff_t offset, 6458 const struct iovec *iov, loff_t offset,
6345 unsigned long nr_segs) 6459 unsigned long nr_segs)
6346{ 6460{
6347 struct file *file = iocb->ki_filp; 6461 struct file *file = iocb->ki_filp;
6348 struct inode *inode = file->f_mapping->host; 6462 struct inode *inode = file->f_mapping->host;
6349 struct btrfs_ordered_extent *ordered;
6350 struct extent_state *cached_state = NULL;
6351 u64 lockstart, lockend;
6352 ssize_t ret;
6353 int writing = rw & WRITE;
6354 int write_bits = 0;
6355 size_t count = iov_length(iov, nr_segs);
6356 6463
6357 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6464 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6358 offset, nr_segs)) { 6465 offset, nr_segs))
6359 return 0; 6466 return 0;
6360 }
6361 6467
6362 lockstart = offset; 6468 return __blockdev_direct_IO(rw, iocb, inode,
6363 lockend = offset + count - 1;
6364
6365 if (writing) {
6366 ret = btrfs_delalloc_reserve_space(inode, count);
6367 if (ret)
6368 goto out;
6369 }
6370
6371 while (1) {
6372 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6373 0, &cached_state);
6374 /*
6375 * We're concerned with the entire range that we're going to be
6376 * doing DIO to, so we need to make sure theres no ordered
6377 * extents in this range.
6378 */
6379 ordered = btrfs_lookup_ordered_range(inode, lockstart,
6380 lockend - lockstart + 1);
6381
6382 /*
6383 * We need to make sure there are no buffered pages in this
6384 * range either, we could have raced between the invalidate in
6385 * generic_file_direct_write and locking the extent. The
6386 * invalidate needs to happen so that reads after a write do not
6387 * get stale data.
6388 */
6389 if (!ordered && (!writing ||
6390 !test_range_bit(&BTRFS_I(inode)->io_tree,
6391 lockstart, lockend, EXTENT_UPTODATE, 0,
6392 cached_state)))
6393 break;
6394
6395 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6396 &cached_state, GFP_NOFS);
6397
6398 if (ordered) {
6399 btrfs_start_ordered_extent(inode, ordered, 1);
6400 btrfs_put_ordered_extent(ordered);
6401 } else {
6402 /* Screw you mmap */
6403 ret = filemap_write_and_wait_range(file->f_mapping,
6404 lockstart,
6405 lockend);
6406 if (ret)
6407 goto out;
6408
6409 /*
6410 * If we found a page that couldn't be invalidated just
6411 * fall back to buffered.
6412 */
6413 ret = invalidate_inode_pages2_range(file->f_mapping,
6414 lockstart >> PAGE_CACHE_SHIFT,
6415 lockend >> PAGE_CACHE_SHIFT);
6416 if (ret) {
6417 if (ret == -EBUSY)
6418 ret = 0;
6419 goto out;
6420 }
6421 }
6422
6423 cond_resched();
6424 }
6425
6426 /*
6427 * we don't use btrfs_set_extent_delalloc because we don't want
6428 * the dirty or uptodate bits
6429 */
6430 if (writing) {
6431 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
6432 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6433 EXTENT_DELALLOC, NULL, &cached_state,
6434 GFP_NOFS);
6435 if (ret) {
6436 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6437 lockend, EXTENT_LOCKED | write_bits,
6438 1, 0, &cached_state, GFP_NOFS);
6439 goto out;
6440 }
6441 }
6442
6443 free_extent_state(cached_state);
6444 cached_state = NULL;
6445
6446 ret = __blockdev_direct_IO(rw, iocb, inode,
6447 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6469 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
6448 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6470 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
6449 btrfs_submit_direct, 0); 6471 btrfs_submit_direct, 0);
6450
6451 if (ret < 0 && ret != -EIOCBQUEUED) {
6452 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
6453 offset + iov_length(iov, nr_segs) - 1,
6454 EXTENT_LOCKED | write_bits, 1, 0,
6455 &cached_state, GFP_NOFS);
6456 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
6457 /*
6458 * We're falling back to buffered, unlock the section we didn't
6459 * do IO on.
6460 */
6461 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
6462 offset + iov_length(iov, nr_segs) - 1,
6463 EXTENT_LOCKED | write_bits, 1, 0,
6464 &cached_state, GFP_NOFS);
6465 }
6466out:
6467 free_extent_state(cached_state);
6468 return ret;
6469} 6472}
6470 6473
6471static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6474static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -6629,6 +6632,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6629 u64 page_start; 6632 u64 page_start;
6630 u64 page_end; 6633 u64 page_end;
6631 6634
6635 sb_start_pagefault(inode->i_sb);
6632 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6636 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6633 if (!ret) { 6637 if (!ret) {
6634 ret = file_update_time(vma->vm_file); 6638 ret = file_update_time(vma->vm_file);
@@ -6718,12 +6722,15 @@ again:
6718 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6719 6723
6720out_unlock: 6724out_unlock:
6721 if (!ret) 6725 if (!ret) {
6726 sb_end_pagefault(inode->i_sb);
6722 return VM_FAULT_LOCKED; 6727 return VM_FAULT_LOCKED;
6728 }
6723 unlock_page(page); 6729 unlock_page(page);
6724out: 6730out:
6725 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6731 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6726out_noreserve: 6732out_noreserve:
6733 sb_end_pagefault(inode->i_sb);
6727 return ret; 6734 return ret;
6728} 6735}
6729 6736
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 43f0012016e3..9df50fa8a078 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -195,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
195 if (!inode_owner_or_capable(inode)) 195 if (!inode_owner_or_capable(inode))
196 return -EACCES; 196 return -EACCES;
197 197
198 ret = mnt_want_write_file(file);
199 if (ret)
200 return ret;
201
198 mutex_lock(&inode->i_mutex); 202 mutex_lock(&inode->i_mutex);
199 203
200 ip_oldflags = ip->flags; 204 ip_oldflags = ip->flags;
@@ -209,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
209 } 213 }
210 } 214 }
211 215
212 ret = mnt_want_write_file(file);
213 if (ret)
214 goto out_unlock;
215
216 if (flags & FS_SYNC_FL) 216 if (flags & FS_SYNC_FL)
217 ip->flags |= BTRFS_INODE_SYNC; 217 ip->flags |= BTRFS_INODE_SYNC;
218 else 218 else
@@ -275,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
275 inode->i_flags = i_oldflags; 275 inode->i_flags = i_oldflags;
276 } 276 }
277 277
278 mnt_drop_write_file(file);
279 out_unlock: 278 out_unlock:
280 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
280 mnt_drop_write_file(file);
281 return ret; 281 return ret;
282} 282}
283 283
@@ -424,7 +424,7 @@ static noinline int create_subvol(struct btrfs_root *root,
424 uuid_le_gen(&new_uuid); 424 uuid_le_gen(&new_uuid);
425 memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); 425 memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
426 root_item.otime.sec = cpu_to_le64(cur_time.tv_sec); 426 root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
427 root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec); 427 root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec);
428 root_item.ctime = root_item.otime; 428 root_item.ctime = root_item.otime;
429 btrfs_set_root_ctransid(&root_item, trans->transid); 429 btrfs_set_root_ctransid(&root_item, trans->transid);
430 btrfs_set_root_otransid(&root_item, trans->transid); 430 btrfs_set_root_otransid(&root_item, trans->transid);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a44eff074805..2a1762c66041 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -67,7 +67,7 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
67{ 67{
68 if (eb->lock_nested) { 68 if (eb->lock_nested) {
69 read_lock(&eb->lock); 69 read_lock(&eb->lock);
70 if (&eb->lock_nested && current->pid == eb->lock_owner) { 70 if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock); 71 read_unlock(&eb->lock);
72 return; 72 return;
73 } 73 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 643335a4fe3c..051c7fe551dd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -596,7 +596,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
596 /* 596 /*
597 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
598 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
599 * for pdflush to find them 599 * for the flusher thread to find them
600 */ 600 */
601 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) 601 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
602 filemap_fdatawrite_range(inode->i_mapping, start, end); 602 filemap_fdatawrite_range(inode->i_mapping, start, end);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bc424ae5a81a..38b42e7bc91d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1364,13 +1364,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1364 spin_lock(&fs_info->qgroup_lock); 1364 spin_lock(&fs_info->qgroup_lock);
1365 1365
1366 dstgroup = add_qgroup_rb(fs_info, objectid); 1366 dstgroup = add_qgroup_rb(fs_info, objectid);
1367 if (!dstgroup) 1367 if (IS_ERR(dstgroup)) {
1368 ret = PTR_ERR(dstgroup);
1368 goto unlock; 1369 goto unlock;
1370 }
1369 1371
1370 if (srcid) { 1372 if (srcid) {
1371 srcgroup = find_qgroup_rb(fs_info, srcid); 1373 srcgroup = find_qgroup_rb(fs_info, srcid);
1372 if (!srcgroup) 1374 if (!srcgroup) {
1375 ret = -EINVAL;
1373 goto unlock; 1376 goto unlock;
1377 }
1374 dstgroup->rfer = srcgroup->rfer - level_size; 1378 dstgroup->rfer = srcgroup->rfer - level_size;
1375 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; 1379 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
1376 srcgroup->excl = level_size; 1380 srcgroup->excl = level_size;
@@ -1379,8 +1383,10 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
1379 qgroup_dirty(fs_info, srcgroup); 1383 qgroup_dirty(fs_info, srcgroup);
1380 } 1384 }
1381 1385
1382 if (!inherit) 1386 if (!inherit) {
1387 ret = -EINVAL;
1383 goto unlock; 1388 goto unlock;
1389 }
1384 1390
1385 i_qgroups = (u64 *)(inherit + 1); 1391 i_qgroups = (u64 *)(inherit + 1);
1386 for (i = 0; i < inherit->num_qgroups; ++i) { 1392 for (i = 0; i < inherit->num_qgroups; ++i) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6bb465cca20f..10d8e4d88071 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -544,8 +544,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
544 struct timespec ct = CURRENT_TIME; 544 struct timespec ct = CURRENT_TIME;
545 545
546 spin_lock(&root->root_times_lock); 546 spin_lock(&root->root_times_lock);
547 item->ctransid = trans->transid; 547 item->ctransid = cpu_to_le64(trans->transid);
548 item->ctime.sec = cpu_to_le64(ct.tv_sec); 548 item->ctime.sec = cpu_to_le64(ct.tv_sec);
549 item->ctime.nsec = cpu_to_le64(ct.tv_nsec); 549 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
550 spin_unlock(&root->root_times_lock); 550 spin_unlock(&root->root_times_lock);
551} 551}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8c6e61d6eed5..83d6f9f9c220 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -100,10 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
100 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; 100 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
101} 101}
102 102
103/* NOTE:
104 * We move write_super stuff at umount in order to avoid deadlock
105 * for umount hold all lock.
106 */
107static void save_error_info(struct btrfs_fs_info *fs_info) 103static void save_error_info(struct btrfs_fs_info *fs_info)
108{ 104{
109 __save_error_info(fs_info); 105 __save_error_info(fs_info);
@@ -842,7 +838,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
842 struct btrfs_trans_handle *trans; 838 struct btrfs_trans_handle *trans;
843 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 839 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
844 struct btrfs_root *root = fs_info->tree_root; 840 struct btrfs_root *root = fs_info->tree_root;
845 int ret;
846 841
847 trace_btrfs_sync_fs(wait); 842 trace_btrfs_sync_fs(wait);
848 843
@@ -853,11 +848,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
853 848
854 btrfs_wait_ordered_extents(root, 0, 0); 849 btrfs_wait_ordered_extents(root, 0, 0);
855 850
856 trans = btrfs_start_transaction(root, 0); 851 spin_lock(&fs_info->trans_lock);
852 if (!fs_info->running_transaction) {
853 spin_unlock(&fs_info->trans_lock);
854 return 0;
855 }
856 spin_unlock(&fs_info->trans_lock);
857
858 trans = btrfs_join_transaction(root);
857 if (IS_ERR(trans)) 859 if (IS_ERR(trans))
858 return PTR_ERR(trans); 860 return PTR_ERR(trans);
859 ret = btrfs_commit_transaction(trans, root); 861 return btrfs_commit_transaction(trans, root);
860 return ret;
861} 862}
862 863
863static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) 864static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1534,6 +1535,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1534 while (cur_devices) { 1535 while (cur_devices) {
1535 head = &cur_devices->devices; 1536 head = &cur_devices->devices;
1536 list_for_each_entry(dev, head, dev_list) { 1537 list_for_each_entry(dev, head, dev_list) {
1538 if (dev->missing)
1539 continue;
1537 if (!first_dev || dev->devid < first_dev->devid) 1540 if (!first_dev || dev->devid < first_dev->devid)
1538 first_dev = dev; 1541 first_dev = dev;
1539 } 1542 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7ac7cdcc294e..27c26004e050 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -335,6 +335,8 @@ again:
335 if (!h) 335 if (!h)
336 return ERR_PTR(-ENOMEM); 336 return ERR_PTR(-ENOMEM);
337 337
338 sb_start_intwrite(root->fs_info->sb);
339
338 if (may_wait_transaction(root, type)) 340 if (may_wait_transaction(root, type))
339 wait_current_trans(root); 341 wait_current_trans(root);
340 342
@@ -345,6 +347,7 @@ again:
345 } while (ret == -EBUSY); 347 } while (ret == -EBUSY);
346 348
347 if (ret < 0) { 349 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb);
348 kmem_cache_free(btrfs_trans_handle_cachep, h); 351 kmem_cache_free(btrfs_trans_handle_cachep, h);
349 return ERR_PTR(ret); 352 return ERR_PTR(ret);
350 } 353 }
@@ -548,6 +551,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
548 btrfs_trans_release_metadata(trans, root); 551 btrfs_trans_release_metadata(trans, root);
549 trans->block_rsv = NULL; 552 trans->block_rsv = NULL;
550 553
554 sb_end_intwrite(root->fs_info->sb);
555
551 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
552 should_end_transaction(trans, root)) { 557 should_end_transaction(trans, root)) {
553 trans->transaction->blocked = 1; 558 trans->transaction->blocked = 1;
@@ -1026,6 +1031,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1026 1031
1027 btrfs_i_size_write(parent_inode, parent_inode->i_size + 1032 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1028 dentry->d_name.len * 2); 1033 dentry->d_name.len * 2);
1034 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1029 ret = btrfs_update_inode(trans, parent_root, parent_inode); 1035 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1030 if (ret) 1036 if (ret)
1031 goto abort_trans_dput; 1037 goto abort_trans_dput;
@@ -1061,7 +1067,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1061 memcpy(new_root_item->parent_uuid, root->root_item.uuid, 1067 memcpy(new_root_item->parent_uuid, root->root_item.uuid,
1062 BTRFS_UUID_SIZE); 1068 BTRFS_UUID_SIZE);
1063 new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); 1069 new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
1064 new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec); 1070 new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
1065 btrfs_set_root_otransid(new_root_item, trans->transid); 1071 btrfs_set_root_otransid(new_root_item, trans->transid);
1066 memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); 1072 memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
1067 memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); 1073 memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
@@ -1578,6 +1584,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1578 put_transaction(cur_trans); 1584 put_transaction(cur_trans);
1579 put_transaction(cur_trans); 1585 put_transaction(cur_trans);
1580 1586
1587 sb_end_intwrite(root->fs_info->sb);
1588
1581 trace_btrfs_transaction_commit(root); 1589 trace_btrfs_transaction_commit(root);
1582 1590
1583 btrfs_scrub_continue(root); 1591 btrfs_scrub_continue(root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b8708f994e67..88b969aeeb71 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -227,9 +227,8 @@ loop_lock:
227 cur = pending; 227 cur = pending;
228 pending = pending->bi_next; 228 pending = pending->bi_next;
229 cur->bi_next = NULL; 229 cur->bi_next = NULL;
230 atomic_dec(&fs_info->nr_async_bios);
231 230
232 if (atomic_read(&fs_info->nr_async_bios) < limit && 231 if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
233 waitqueue_active(&fs_info->async_submit_wait)) 232 waitqueue_active(&fs_info->async_submit_wait))
234 wake_up(&fs_info->async_submit_wait); 233 wake_up(&fs_info->async_submit_wait);
235 234
@@ -569,9 +568,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
569 memcpy(new_device, device, sizeof(*new_device)); 568 memcpy(new_device, device, sizeof(*new_device));
570 569
571 /* Safe because we are under uuid_mutex */ 570 /* Safe because we are under uuid_mutex */
572 name = rcu_string_strdup(device->name->str, GFP_NOFS); 571 if (device->name) {
573 BUG_ON(device->name && !name); /* -ENOMEM */ 572 name = rcu_string_strdup(device->name->str, GFP_NOFS);
574 rcu_assign_pointer(new_device->name, name); 573 BUG_ON(device->name && !name); /* -ENOMEM */
574 rcu_assign_pointer(new_device->name, name);
575 }
575 new_device->bdev = NULL; 576 new_device->bdev = NULL;
576 new_device->writeable = 0; 577 new_device->writeable = 0;
577 new_device->in_fs_metadata = 0; 578 new_device->in_fs_metadata = 0;
@@ -1744,10 +1745,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1744 1745
1745 device->fs_devices = root->fs_info->fs_devices; 1746 device->fs_devices = root->fs_info->fs_devices;
1746 1747
1747 /*
1748 * we don't want write_supers to jump in here with our device
1749 * half setup
1750 */
1751 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1748 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1752 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1749 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1753 list_add(&device->dev_alloc_list, 1750 list_add(&device->dev_alloc_list,
@@ -4609,28 +4606,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
4609 return ret; 4606 return ret;
4610} 4607}
4611 4608
4612struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
4613 u64 logical, int mirror_num)
4614{
4615 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4616 int ret;
4617 u64 map_length = 0;
4618 struct btrfs_bio *bbio = NULL;
4619 struct btrfs_device *device;
4620
4621 BUG_ON(mirror_num == 0);
4622 ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
4623 mirror_num);
4624 if (ret) {
4625 BUG_ON(bbio != NULL);
4626 return NULL;
4627 }
4628 BUG_ON(mirror_num != bbio->mirror_num);
4629 device = bbio->stripes[mirror_num - 1].dev;
4630 kfree(bbio);
4631 return device;
4632}
4633
4634int btrfs_read_chunk_tree(struct btrfs_root *root) 4609int btrfs_read_chunk_tree(struct btrfs_root *root)
4635{ 4610{
4636 struct btrfs_path *path; 4611 struct btrfs_path *path;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5479325987b3..53c06af92e8d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -289,8 +289,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
289int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 289int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
290int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 290int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
291 u64 *start, u64 *max_avail); 291 u64 *start, u64 *max_avail);
292struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
293 u64 logical, int mirror_num);
294void btrfs_dev_stat_print_on_error(struct btrfs_device *device); 292void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
295void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 293void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
296int btrfs_get_dev_stats(struct btrfs_root *root, 294int btrfs_get_dev_stats(struct btrfs_root *root,
diff --git a/fs/buffer.c b/fs/buffer.c
index c7062c896d7c..58e2e7b77372 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
914/* 914/*
915 * Initialise the state of a blockdev page's buffers. 915 * Initialise the state of a blockdev page's buffers.
916 */ 916 */
917static void 917static sector_t
918init_page_buffers(struct page *page, struct block_device *bdev, 918init_page_buffers(struct page *page, struct block_device *bdev,
919 sector_t block, int size) 919 sector_t block, int size)
920{ 920{
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
936 block++; 936 block++;
937 bh = bh->b_this_page; 937 bh = bh->b_this_page;
938 } while (bh != head); 938 } while (bh != head);
939
940 /*
941 * Caller needs to validate requested block against end of device.
942 */
943 return end_block;
939} 944}
940 945
941/* 946/*
942 * Create the page-cache page that contains the requested block. 947 * Create the page-cache page that contains the requested block.
943 * 948 *
944 * This is user purely for blockdev mappings. 949 * This is used purely for blockdev mappings.
945 */ 950 */
946static struct page * 951static int
947grow_dev_page(struct block_device *bdev, sector_t block, 952grow_dev_page(struct block_device *bdev, sector_t block,
948 pgoff_t index, int size) 953 pgoff_t index, int size, int sizebits)
949{ 954{
950 struct inode *inode = bdev->bd_inode; 955 struct inode *inode = bdev->bd_inode;
951 struct page *page; 956 struct page *page;
952 struct buffer_head *bh; 957 struct buffer_head *bh;
958 sector_t end_block;
959 int ret = 0; /* Will call free_more_memory() */
953 960
954 page = find_or_create_page(inode->i_mapping, index, 961 page = find_or_create_page(inode->i_mapping, index,
955 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 962 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
956 if (!page) 963 if (!page)
957 return NULL; 964 return ret;
958 965
959 BUG_ON(!PageLocked(page)); 966 BUG_ON(!PageLocked(page));
960 967
961 if (page_has_buffers(page)) { 968 if (page_has_buffers(page)) {
962 bh = page_buffers(page); 969 bh = page_buffers(page);
963 if (bh->b_size == size) { 970 if (bh->b_size == size) {
964 init_page_buffers(page, bdev, block, size); 971 end_block = init_page_buffers(page, bdev,
965 return page; 972 index << sizebits, size);
973 goto done;
966 } 974 }
967 if (!try_to_free_buffers(page)) 975 if (!try_to_free_buffers(page))
968 goto failed; 976 goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
982 */ 990 */
983 spin_lock(&inode->i_mapping->private_lock); 991 spin_lock(&inode->i_mapping->private_lock);
984 link_dev_buffers(page, bh); 992 link_dev_buffers(page, bh);
985 init_page_buffers(page, bdev, block, size); 993 end_block = init_page_buffers(page, bdev, index << sizebits, size);
986 spin_unlock(&inode->i_mapping->private_lock); 994 spin_unlock(&inode->i_mapping->private_lock);
987 return page; 995done:
988 996 ret = (block < end_block) ? 1 : -ENXIO;
989failed: 997failed:
990 unlock_page(page); 998 unlock_page(page);
991 page_cache_release(page); 999 page_cache_release(page);
992 return NULL; 1000 return ret;
993} 1001}
994 1002
995/* 1003/*
@@ -999,7 +1007,6 @@ failed:
999static int 1007static int
1000grow_buffers(struct block_device *bdev, sector_t block, int size) 1008grow_buffers(struct block_device *bdev, sector_t block, int size)
1001{ 1009{
1002 struct page *page;
1003 pgoff_t index; 1010 pgoff_t index;
1004 int sizebits; 1011 int sizebits;
1005 1012
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
1023 bdevname(bdev, b)); 1030 bdevname(bdev, b));
1024 return -EIO; 1031 return -EIO;
1025 } 1032 }
1026 block = index << sizebits; 1033
1027 /* Create a page with the proper size buffers.. */ 1034 /* Create a page with the proper size buffers.. */
1028 page = grow_dev_page(bdev, block, index, size); 1035 return grow_dev_page(bdev, block, index, size, sizebits);
1029 if (!page)
1030 return 0;
1031 unlock_page(page);
1032 page_cache_release(page);
1033 return 1;
1034} 1036}
1035 1037
1036static struct buffer_head * 1038static struct buffer_head *
1037__getblk_slow(struct block_device *bdev, sector_t block, int size) 1039__getblk_slow(struct block_device *bdev, sector_t block, int size)
1038{ 1040{
1039 int ret;
1040 struct buffer_head *bh;
1041
1042 /* Size must be multiple of hard sectorsize */ 1041 /* Size must be multiple of hard sectorsize */
1043 if (unlikely(size & (bdev_logical_block_size(bdev)-1) || 1042 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1044 (size < 512 || size > PAGE_SIZE))) { 1043 (size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1051 return NULL; 1050 return NULL;
1052 } 1051 }
1053 1052
1054retry: 1053 for (;;) {
1055 bh = __find_get_block(bdev, block, size); 1054 struct buffer_head *bh;
1056 if (bh) 1055 int ret;
1057 return bh;
1058 1056
1059 ret = grow_buffers(bdev, block, size);
1060 if (ret == 0) {
1061 free_more_memory();
1062 goto retry;
1063 } else if (ret > 0) {
1064 bh = __find_get_block(bdev, block, size); 1057 bh = __find_get_block(bdev, block, size);
1065 if (bh) 1058 if (bh)
1066 return bh; 1059 return bh;
1060
1061 ret = grow_buffers(bdev, block, size);
1062 if (ret < 0)
1063 return NULL;
1064 if (ret == 0)
1065 free_more_memory();
1067 } 1066 }
1068 return NULL;
1069} 1067}
1070 1068
1071/* 1069/*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
1321 * which corresponds to the passed block_device, block and size. The 1319 * which corresponds to the passed block_device, block and size. The
1322 * returned buffer has its reference count incremented. 1320 * returned buffer has its reference count incremented.
1323 * 1321 *
1324 * __getblk() cannot fail - it just keeps trying. If you pass it an
1325 * illegal block number, __getblk() will happily return a buffer_head
1326 * which represents the non-existent block. Very weird.
1327 *
1328 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1322 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1329 * attempt is failing. FIXME, perhaps? 1323 * attempt is failing. FIXME, perhaps?
1330 */ 1324 */
@@ -2306,8 +2300,8 @@ EXPORT_SYMBOL(block_commit_write);
2306 * beyond EOF, then the page is guaranteed safe against truncation until we 2300 * beyond EOF, then the page is guaranteed safe against truncation until we
2307 * unlock the page. 2301 * unlock the page.
2308 * 2302 *
2309 * Direct callers of this function should call vfs_check_frozen() so that page 2303 * Direct callers of this function should protect against filesystem freezing
2310 * fault does not busyloop until the fs is thawed. 2304 * using sb_start_write() - sb_end_write() functions.
2311 */ 2305 */
2312int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 2306int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2313 get_block_t get_block) 2307 get_block_t get_block)
@@ -2318,6 +2312,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2318 loff_t size; 2312 loff_t size;
2319 int ret; 2313 int ret;
2320 2314
2315 /*
2316 * Update file times before taking page lock. We may end up failing the
2317 * fault so this update may be superfluous but who really cares...
2318 */
2319 file_update_time(vma->vm_file);
2320
2321 lock_page(page); 2321 lock_page(page);
2322 size = i_size_read(inode); 2322 size = i_size_read(inode);
2323 if ((page->mapping != inode->i_mapping) || 2323 if ((page->mapping != inode->i_mapping) ||
@@ -2339,18 +2339,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2339 2339
2340 if (unlikely(ret < 0)) 2340 if (unlikely(ret < 0))
2341 goto out_unlock; 2341 goto out_unlock;
2342 /*
2343 * Freezing in progress? We check after the page is marked dirty and
2344 * with page lock held so if the test here fails, we are sure freezing
2345 * code will wait during syncing until the page fault is done - at that
2346 * point page will be dirty and unlocked so freezing code will write it
2347 * and writeprotect it again.
2348 */
2349 set_page_dirty(page); 2342 set_page_dirty(page);
2350 if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2351 ret = -EAGAIN;
2352 goto out_unlock;
2353 }
2354 wait_on_page_writeback(page); 2343 wait_on_page_writeback(page);
2355 return 0; 2344 return 0;
2356out_unlock: 2345out_unlock:
@@ -2365,12 +2354,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2365 int ret; 2354 int ret;
2366 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; 2355 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2367 2356
2368 /* 2357 sb_start_pagefault(sb);
2369 * This check is racy but catches the common case. The check in
2370 * __block_page_mkwrite() is reliable.
2371 */
2372 vfs_check_frozen(sb, SB_FREEZE_WRITE);
2373 ret = __block_page_mkwrite(vma, vmf, get_block); 2358 ret = __block_page_mkwrite(vma, vmf, get_block);
2359 sb_end_pagefault(sb);
2374 return block_page_mkwrite_return(ret); 2360 return block_page_mkwrite_return(ret);
2375} 2361}
2376EXPORT_SYMBOL(block_page_mkwrite); 2362EXPORT_SYMBOL(block_page_mkwrite);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b67304e4b80..452e71a1b753 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1184 loff_t size, len; 1184 loff_t size, len;
1185 int ret; 1185 int ret;
1186 1186
1187 /* Update time before taking page lock */
1188 file_update_time(vma->vm_file);
1189
1187 size = i_size_read(inode); 1190 size = i_size_read(inode);
1188 if (off + PAGE_CACHE_SIZE <= size) 1191 if (off + PAGE_CACHE_SIZE <= size)
1189 len = PAGE_CACHE_SIZE; 1192 len = PAGE_CACHE_SIZE;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb962efdacee..6d59006bfa27 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -201,6 +201,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
201 int err = -ENOMEM; 201 int err = -ENOMEM;
202 202
203 dout("ceph_fs_debugfs_init\n"); 203 dout("ceph_fs_debugfs_init\n");
204 BUG_ON(!fsc->client->debugfs_dir);
204 fsc->debugfs_congestion_kb = 205 fsc->debugfs_congestion_kb =
205 debugfs_create_file("writeback_congestion_kb", 206 debugfs_create_file("writeback_congestion_kb",
206 0600, 207 0600,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f391f1e75414..e5b77319c97b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -633,44 +633,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
633 return dentry; 633 return dentry;
634} 634}
635 635
636int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
637 struct file *file, unsigned flags, umode_t mode,
638 int *opened)
639{
640 int err;
641 struct dentry *res = NULL;
642
643 if (!(flags & O_CREAT)) {
644 if (dentry->d_name.len > NAME_MAX)
645 return -ENAMETOOLONG;
646
647 err = ceph_init_dentry(dentry);
648 if (err < 0)
649 return err;
650
651 return ceph_lookup_open(dir, dentry, file, flags, mode, opened);
652 }
653
654 if (d_unhashed(dentry)) {
655 res = ceph_lookup(dir, dentry, 0);
656 if (IS_ERR(res))
657 return PTR_ERR(res);
658
659 if (res)
660 dentry = res;
661 }
662
663 /* We don't deal with positive dentries here */
664 if (dentry->d_inode)
665 return finish_no_open(file, res);
666
667 *opened |= FILE_CREATED;
668 err = ceph_lookup_open(dir, dentry, file, flags, mode, opened);
669 dput(res);
670
671 return err;
672}
673
674/* 636/*
675 * If we do a create but get no trace back from the MDS, follow up with 637 * If we do a create but get no trace back from the MDS, follow up with
676 * a lookup (the VFS expects us to link up the provided dentry). 638 * a lookup (the VFS expects us to link up the provided dentry).
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1b81d6c31878..ecebbc09bfc7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -4,6 +4,7 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/file.h> 6#include <linux/file.h>
7#include <linux/mount.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/writeback.h> 9#include <linux/writeback.h>
9 10
@@ -106,9 +107,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
106} 107}
107 108
108/* 109/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy 110 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the 111 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously) 112 * MDS). We do, however, need to inform the MDS (asynchronously)
@@ -207,24 +205,29 @@ out:
207 205
208 206
209/* 207/*
210 * Do a lookup + open with a single request. 208 * Do a lookup + open with a single request. If we get a non-existent
211 * 209 * file or symlink, return 1 so the VFS can retry.
212 * If this succeeds, but some subsequent check in the vfs
213 * may_open() fails, the struct *file gets cleaned up (i.e.
214 * ceph_release gets called). So fear not!
215 */ 210 */
216int ceph_lookup_open(struct inode *dir, struct dentry *dentry, 211int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
217 struct file *file, unsigned flags, umode_t mode, 212 struct file *file, unsigned flags, umode_t mode,
218 int *opened) 213 int *opened)
219{ 214{
220 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 215 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
221 struct ceph_mds_client *mdsc = fsc->mdsc; 216 struct ceph_mds_client *mdsc = fsc->mdsc;
222 struct ceph_mds_request *req; 217 struct ceph_mds_request *req;
223 struct dentry *ret; 218 struct dentry *dn;
224 int err; 219 int err;
225 220
226 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", 221 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
227 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); 222 dir, dentry, dentry->d_name.len, dentry->d_name.name,
223 d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
224
225 if (dentry->d_name.len > NAME_MAX)
226 return -ENAMETOOLONG;
227
228 err = ceph_init_dentry(dentry);
229 if (err < 0)
230 return err;
228 231
229 /* do the open */ 232 /* do the open */
230 req = prepare_open_request(dir->i_sb, flags, mode); 233 req = prepare_open_request(dir->i_sb, flags, mode);
@@ -241,22 +244,31 @@ int ceph_lookup_open(struct inode *dir, struct dentry *dentry,
241 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
242 req); 245 req);
243 err = ceph_handle_snapdir(req, dentry, err); 246 err = ceph_handle_snapdir(req, dentry, err);
244 if (err) 247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
245 goto out;
246 if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
247 err = ceph_handle_notrace_create(dir, dentry); 248 err = ceph_handle_notrace_create(dir, dentry);
248 if (err)
249 goto out;
250 err = finish_open(file, req->r_dentry, ceph_open, opened);
251out:
252 ret = ceph_finish_lookup(req, dentry, err);
253 ceph_mdsc_put_request(req);
254 dout("ceph_lookup_open result=%p\n", ret);
255 249
256 if (IS_ERR(ret)) 250 if (d_unhashed(dentry)) {
257 return PTR_ERR(ret); 251 dn = ceph_finish_lookup(req, dentry, err);
252 if (IS_ERR(dn))
253 err = PTR_ERR(dn);
254 } else {
255 /* we were given a hashed negative dentry */
256 dn = NULL;
257 }
258 if (err)
259 goto out_err;
260 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
261 /* make vfs retry on splice, ENOENT, or symlink */
262 dout("atomic_open finish_no_open on dn %p\n", dn);
263 err = finish_no_open(file, dn);
264 } else {
265 dout("atomic_open finish_open on dn %p\n", dn);
266 err = finish_open(file, dentry, ceph_open, opened);
267 }
258 268
259 dput(ret); 269out_err:
270 ceph_mdsc_put_request(req);
271 dout("atomic_open result=%d\n", err);
260 return err; 272 return err;
261} 273}
262 274
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9fff9f3b17e4..4b5762ef7c2b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
992 if (rinfo->head->is_dentry) { 992 if (rinfo->head->is_dentry) {
993 struct inode *dir = req->r_locked_dir; 993 struct inode *dir = req->r_locked_dir;
994 994
995 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, 995 if (dir) {
996 session, req->r_request_started, -1, 996 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
997 &req->r_caps_reservation); 997 session, req->r_request_started, -1,
998 if (err < 0) 998 &req->r_caps_reservation);
999 return err; 999 if (err < 0)
1000 return err;
1001 } else {
1002 WARN_ON_ONCE(1);
1003 }
1000 } 1004 }
1001 1005
1002 /* 1006 /*
@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1004 * will have trouble splicing in the virtual snapdir later 1008 * will have trouble splicing in the virtual snapdir later
1005 */ 1009 */
1006 if (rinfo->head->is_dentry && !req->r_aborted && 1010 if (rinfo->head->is_dentry && !req->r_aborted &&
1011 req->r_locked_dir &&
1007 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 1012 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
1008 fsc->mount_options->snapdir_name, 1013 fsc->mount_options->snapdir_name,
1009 req->r_dentry->d_name.len))) { 1014 req->r_dentry->d_name.len))) {
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8e3fb69fbe62..1396ceb46797 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,7 +42,8 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
42 /* validate striping parameters */ 42 /* validate striping parameters */
43 if ((l->object_size & ~PAGE_MASK) || 43 if ((l->object_size & ~PAGE_MASK) ||
44 (l->stripe_unit & ~PAGE_MASK) || 44 (l->stripe_unit & ~PAGE_MASK) ||
45 ((unsigned)l->object_size % (unsigned)l->stripe_unit)) 45 (l->stripe_unit != 0 &&
46 ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
46 return -EINVAL; 47 return -EINVAL;
47 48
48 /* make sure it's a valid data pool */ 49 /* make sure it's a valid data pool */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ebc95cc652be..66ebe720e40d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -806,9 +806,9 @@ extern int ceph_copy_from_page_vector(struct page **pages,
806 loff_t off, size_t len); 806 loff_t off, size_t len);
807extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); 807extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
808extern int ceph_open(struct inode *inode, struct file *file); 808extern int ceph_open(struct inode *inode, struct file *file);
809extern int ceph_lookup_open(struct inode *dir, struct dentry *dentry, 809extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
810 struct file *od, unsigned flags, 810 struct file *file, unsigned flags, umode_t mode,
811 umode_t mode, int *opened); 811 int *opened);
812extern int ceph_release(struct inode *inode, struct file *filp); 812extern int ceph_release(struct inode *inode, struct file *filp);
813 813
814/* dir.c */ 814/* dir.c */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 497da5ce704c..977dc0e85ccb 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -246,6 +246,16 @@ struct smb_version_operations {
246 bool (*can_echo)(struct TCP_Server_Info *); 246 bool (*can_echo)(struct TCP_Server_Info *);
247 /* send echo request */ 247 /* send echo request */
248 int (*echo)(struct TCP_Server_Info *); 248 int (*echo)(struct TCP_Server_Info *);
249 /* create directory */
250 int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
251 struct cifs_sb_info *);
252 /* set info on created directory */
253 void (*mkdir_setinfo)(struct inode *, const char *,
254 struct cifs_sb_info *, struct cifs_tcon *,
255 const unsigned int);
256 /* remove directory */
257 int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *,
258 struct cifs_sb_info *);
249}; 259};
250 260
251struct smb_version_values { 261struct smb_version_values {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index cf7fb185103c..f1bbf8305d3a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -289,18 +289,15 @@ extern int CIFSSMBUnixSetFileInfo(const unsigned int xid,
289 u16 fid, u32 pid_of_opener); 289 u16 fid, u32 pid_of_opener);
290 290
291extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, 291extern int CIFSSMBUnixSetPathInfo(const unsigned int xid,
292 struct cifs_tcon *tcon, char *file_name, 292 struct cifs_tcon *tcon, const char *file_name,
293 const struct cifs_unix_set_info_args *args, 293 const struct cifs_unix_set_info_args *args,
294 const struct nls_table *nls_codepage, 294 const struct nls_table *nls_codepage,
295 int remap_special_chars); 295 int remap);
296 296
297extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, 297extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
298 const char *newName, 298 const char *name, struct cifs_sb_info *cifs_sb);
299 const struct nls_table *nls_codepage,
300 int remap_special_chars);
301extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, 299extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
302 const char *name, const struct nls_table *nls_codepage, 300 const char *name, struct cifs_sb_info *cifs_sb);
303 int remap_special_chars);
304extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, 301extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
305 const char *name, __u16 type, 302 const char *name, __u16 type,
306 const struct nls_table *nls_codepage, 303 const struct nls_table *nls_codepage,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index cabc7a01f5df..f0cf934ba877 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -948,15 +948,15 @@ DelFileRetry:
948} 948}
949 949
950int 950int
951CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, 951CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
952 const char *dirName, const struct nls_table *nls_codepage, 952 struct cifs_sb_info *cifs_sb)
953 int remap)
954{ 953{
955 DELETE_DIRECTORY_REQ *pSMB = NULL; 954 DELETE_DIRECTORY_REQ *pSMB = NULL;
956 DELETE_DIRECTORY_RSP *pSMBr = NULL; 955 DELETE_DIRECTORY_RSP *pSMBr = NULL;
957 int rc = 0; 956 int rc = 0;
958 int bytes_returned; 957 int bytes_returned;
959 int name_len; 958 int name_len;
959 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
960 960
961 cFYI(1, "In CIFSSMBRmDir"); 961 cFYI(1, "In CIFSSMBRmDir");
962RmDirRetry: 962RmDirRetry:
@@ -966,14 +966,15 @@ RmDirRetry:
966 return rc; 966 return rc;
967 967
968 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 968 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
969 name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName, 969 name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
970 PATH_MAX, nls_codepage, remap); 970 PATH_MAX, cifs_sb->local_nls,
971 remap);
971 name_len++; /* trailing null */ 972 name_len++; /* trailing null */
972 name_len *= 2; 973 name_len *= 2;
973 } else { /* BB improve check for buffer overruns BB */ 974 } else { /* BB improve check for buffer overruns BB */
974 name_len = strnlen(dirName, PATH_MAX); 975 name_len = strnlen(name, PATH_MAX);
975 name_len++; /* trailing null */ 976 name_len++; /* trailing null */
976 strncpy(pSMB->DirName, dirName, name_len); 977 strncpy(pSMB->DirName, name, name_len);
977 } 978 }
978 979
979 pSMB->BufferFormat = 0x04; 980 pSMB->BufferFormat = 0x04;
@@ -992,14 +993,15 @@ RmDirRetry:
992} 993}
993 994
994int 995int
995CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, 996CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
996 const char *name, const struct nls_table *nls_codepage, int remap) 997 struct cifs_sb_info *cifs_sb)
997{ 998{
998 int rc = 0; 999 int rc = 0;
999 CREATE_DIRECTORY_REQ *pSMB = NULL; 1000 CREATE_DIRECTORY_REQ *pSMB = NULL;
1000 CREATE_DIRECTORY_RSP *pSMBr = NULL; 1001 CREATE_DIRECTORY_RSP *pSMBr = NULL;
1001 int bytes_returned; 1002 int bytes_returned;
1002 int name_len; 1003 int name_len;
1004 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
1003 1005
1004 cFYI(1, "In CIFSSMBMkDir"); 1006 cFYI(1, "In CIFSSMBMkDir");
1005MkDirRetry: 1007MkDirRetry:
@@ -1010,7 +1012,8 @@ MkDirRetry:
1010 1012
1011 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 1013 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
1012 name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, 1014 name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
1013 PATH_MAX, nls_codepage, remap); 1015 PATH_MAX, cifs_sb->local_nls,
1016 remap);
1014 name_len++; /* trailing null */ 1017 name_len++; /* trailing null */
1015 name_len *= 2; 1018 name_len *= 2;
1016 } else { /* BB improve check for buffer overruns BB */ 1019 } else { /* BB improve check for buffer overruns BB */
@@ -1573,9 +1576,14 @@ cifs_readv_callback(struct mid_q_entry *mid)
1573 /* result already set, check signature */ 1576 /* result already set, check signature */
1574 if (server->sec_mode & 1577 if (server->sec_mode &
1575 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 1578 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1576 if (cifs_verify_signature(rdata->iov, rdata->nr_iov, 1579 int rc = 0;
1577 server, mid->sequence_number + 1)) 1580
1578 cERROR(1, "Unexpected SMB signature"); 1581 rc = cifs_verify_signature(rdata->iov, rdata->nr_iov,
1582 server,
1583 mid->sequence_number + 1);
1584 if (rc)
1585 cERROR(1, "SMB signature verification returned "
1586 "error = %d", rc);
1579 } 1587 }
1580 /* FIXME: should this be counted toward the initiating task? */ 1588 /* FIXME: should this be counted toward the initiating task? */
1581 task_io_account_read(rdata->bytes); 1589 task_io_account_read(rdata->bytes);
@@ -5943,7 +5951,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
5943 5951
5944int 5952int
5945CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, 5953CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
5946 char *fileName, 5954 const char *file_name,
5947 const struct cifs_unix_set_info_args *args, 5955 const struct cifs_unix_set_info_args *args,
5948 const struct nls_table *nls_codepage, int remap) 5956 const struct nls_table *nls_codepage, int remap)
5949{ 5957{
@@ -5964,14 +5972,14 @@ setPermsRetry:
5964 5972
5965 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5973 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5966 name_len = 5974 name_len =
5967 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, 5975 cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
5968 PATH_MAX, nls_codepage, remap); 5976 PATH_MAX, nls_codepage, remap);
5969 name_len++; /* trailing null */ 5977 name_len++; /* trailing null */
5970 name_len *= 2; 5978 name_len *= 2;
5971 } else { /* BB improve the check for buffer overruns BB */ 5979 } else { /* BB improve the check for buffer overruns BB */
5972 name_len = strnlen(fileName, PATH_MAX); 5980 name_len = strnlen(file_name, PATH_MAX);
5973 name_len++; /* trailing null */ 5981 name_len++; /* trailing null */
5974 strncpy(pSMB->FileName, fileName, name_len); 5982 strncpy(pSMB->FileName, file_name, name_len);
5975 } 5983 }
5976 5984
5977 params = 6 + name_len; 5985 params = 6 + name_len;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index cbe709ad6663..781025be48bc 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -356,19 +356,12 @@ cifs_create_get_file_info:
356cifs_create_set_dentry: 356cifs_create_set_dentry:
357 if (rc != 0) { 357 if (rc != 0) {
358 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); 358 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
359 CIFSSMBClose(xid, tcon, *fileHandle);
359 goto out; 360 goto out;
360 } 361 }
361 d_drop(direntry); 362 d_drop(direntry);
362 d_add(direntry, newinode); 363 d_add(direntry, newinode);
363 364
364 /* ENOENT for create? How weird... */
365 rc = -ENOENT;
366 if (!newinode) {
367 CIFSSMBClose(xid, tcon, *fileHandle);
368 goto out;
369 }
370 rc = 0;
371
372out: 365out:
373 kfree(buf); 366 kfree(buf);
374 kfree(full_path); 367 kfree(full_path);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9154192b0683..71e9ad9f5961 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -917,7 +917,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
917 if (!buf) { 917 if (!buf) {
918 mutex_unlock(&cinode->lock_mutex); 918 mutex_unlock(&cinode->lock_mutex);
919 free_xid(xid); 919 free_xid(xid);
920 return rc; 920 return -ENOMEM;
921 } 921 }
922 922
923 for (i = 0; i < 2; i++) { 923 for (i = 0; i < 2; i++) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 35cb6a374a45..cb79c7edecb0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -124,10 +124,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
124{ 124{
125 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 125 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
126 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 126 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
127 unsigned long oldtime = cifs_i->time;
128 127
129 cifs_revalidate_cache(inode, fattr); 128 cifs_revalidate_cache(inode, fattr);
130 129
130 spin_lock(&inode->i_lock);
131 inode->i_atime = fattr->cf_atime; 131 inode->i_atime = fattr->cf_atime;
132 inode->i_mtime = fattr->cf_mtime; 132 inode->i_mtime = fattr->cf_mtime;
133 inode->i_ctime = fattr->cf_ctime; 133 inode->i_ctime = fattr->cf_ctime;
@@ -148,9 +148,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
148 else 148 else
149 cifs_i->time = jiffies; 149 cifs_i->time = jiffies;
150 150
151 cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
152 oldtime, cifs_i->time);
153
154 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 151 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
155 152
156 cifs_i->server_eof = fattr->cf_eof; 153 cifs_i->server_eof = fattr->cf_eof;
@@ -158,7 +155,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
158 * Can't safely change the file size here if the client is writing to 155 * Can't safely change the file size here if the client is writing to
159 * it due to potential races. 156 * it due to potential races.
160 */ 157 */
161 spin_lock(&inode->i_lock);
162 if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) { 158 if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
163 i_size_write(inode, fattr->cf_eof); 159 i_size_write(inode, fattr->cf_eof);
164 160
@@ -859,12 +855,14 @@ struct inode *cifs_root_iget(struct super_block *sb)
859 855
860 if (rc && tcon->ipc) { 856 if (rc && tcon->ipc) {
861 cFYI(1, "ipc connection - fake read inode"); 857 cFYI(1, "ipc connection - fake read inode");
858 spin_lock(&inode->i_lock);
862 inode->i_mode |= S_IFDIR; 859 inode->i_mode |= S_IFDIR;
863 set_nlink(inode, 2); 860 set_nlink(inode, 2);
864 inode->i_op = &cifs_ipc_inode_ops; 861 inode->i_op = &cifs_ipc_inode_ops;
865 inode->i_fop = &simple_dir_operations; 862 inode->i_fop = &simple_dir_operations;
866 inode->i_uid = cifs_sb->mnt_uid; 863 inode->i_uid = cifs_sb->mnt_uid;
867 inode->i_gid = cifs_sb->mnt_gid; 864 inode->i_gid = cifs_sb->mnt_gid;
865 spin_unlock(&inode->i_lock);
868 } else if (rc) { 866 } else if (rc) {
869 iget_failed(inode); 867 iget_failed(inode);
870 inode = ERR_PTR(rc); 868 inode = ERR_PTR(rc);
@@ -1110,6 +1108,15 @@ undo_setattr:
1110 goto out_close; 1108 goto out_close;
1111} 1109}
1112 1110
1111/* copied from fs/nfs/dir.c with small changes */
1112static void
1113cifs_drop_nlink(struct inode *inode)
1114{
1115 spin_lock(&inode->i_lock);
1116 if (inode->i_nlink > 0)
1117 drop_nlink(inode);
1118 spin_unlock(&inode->i_lock);
1119}
1113 1120
1114/* 1121/*
1115 * If dentry->d_inode is null (usually meaning the cached dentry 1122 * If dentry->d_inode is null (usually meaning the cached dentry
@@ -1166,13 +1173,13 @@ retry_std_delete:
1166psx_del_no_retry: 1173psx_del_no_retry:
1167 if (!rc) { 1174 if (!rc) {
1168 if (inode) 1175 if (inode)
1169 drop_nlink(inode); 1176 cifs_drop_nlink(inode);
1170 } else if (rc == -ENOENT) { 1177 } else if (rc == -ENOENT) {
1171 d_drop(dentry); 1178 d_drop(dentry);
1172 } else if (rc == -ETXTBSY) { 1179 } else if (rc == -ETXTBSY) {
1173 rc = cifs_rename_pending_delete(full_path, dentry, xid); 1180 rc = cifs_rename_pending_delete(full_path, dentry, xid);
1174 if (rc == 0) 1181 if (rc == 0)
1175 drop_nlink(inode); 1182 cifs_drop_nlink(inode);
1176 } else if ((rc == -EACCES) && (dosattr == 0) && inode) { 1183 } else if ((rc == -EACCES) && (dosattr == 0) && inode) {
1177 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 1184 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
1178 if (attrs == NULL) { 1185 if (attrs == NULL) {
@@ -1219,16 +1226,154 @@ unlink_out:
1219 return rc; 1226 return rc;
1220} 1227}
1221 1228
1229static int
1230cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode,
1231 const char *full_path, struct cifs_sb_info *cifs_sb,
1232 struct cifs_tcon *tcon, const unsigned int xid)
1233{
1234 int rc = 0;
1235 struct inode *newinode = NULL;
1236
1237 if (tcon->unix_ext)
1238 rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
1239 xid);
1240 else
1241 rc = cifs_get_inode_info(&newinode, full_path, NULL,
1242 inode->i_sb, xid, NULL);
1243 if (rc)
1244 return rc;
1245
1246 d_instantiate(dentry, newinode);
1247 /*
1248 * setting nlink not necessary except in cases where we failed to get it
1249 * from the server or was set bogus
1250 */
1251 spin_lock(&dentry->d_inode->i_lock);
1252 if ((dentry->d_inode) && (dentry->d_inode->i_nlink < 2))
1253 set_nlink(dentry->d_inode, 2);
1254 spin_unlock(&dentry->d_inode->i_lock);
1255 mode &= ~current_umask();
1256 /* must turn on setgid bit if parent dir has it */
1257 if (inode->i_mode & S_ISGID)
1258 mode |= S_ISGID;
1259
1260 if (tcon->unix_ext) {
1261 struct cifs_unix_set_info_args args = {
1262 .mode = mode,
1263 .ctime = NO_CHANGE_64,
1264 .atime = NO_CHANGE_64,
1265 .mtime = NO_CHANGE_64,
1266 .device = 0,
1267 };
1268 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
1269 args.uid = (__u64)current_fsuid();
1270 if (inode->i_mode & S_ISGID)
1271 args.gid = (__u64)inode->i_gid;
1272 else
1273 args.gid = (__u64)current_fsgid();
1274 } else {
1275 args.uid = NO_CHANGE_64;
1276 args.gid = NO_CHANGE_64;
1277 }
1278 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
1279 cifs_sb->local_nls,
1280 cifs_sb->mnt_cifs_flags &
1281 CIFS_MOUNT_MAP_SPECIAL_CHR);
1282 } else {
1283 struct TCP_Server_Info *server = tcon->ses->server;
1284 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1285 (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
1286 server->ops->mkdir_setinfo(newinode, full_path, cifs_sb,
1287 tcon, xid);
1288 if (dentry->d_inode) {
1289 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
1290 dentry->d_inode->i_mode = (mode | S_IFDIR);
1291
1292 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
1293 dentry->d_inode->i_uid = current_fsuid();
1294 if (inode->i_mode & S_ISGID)
1295 dentry->d_inode->i_gid = inode->i_gid;
1296 else
1297 dentry->d_inode->i_gid =
1298 current_fsgid();
1299 }
1300 }
1301 }
1302 return rc;
1303}
1304
1305static int
1306cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
1307 const char *full_path, struct cifs_sb_info *cifs_sb,
1308 struct cifs_tcon *tcon, const unsigned int xid)
1309{
1310 int rc = 0;
1311 u32 oplock = 0;
1312 FILE_UNIX_BASIC_INFO *info = NULL;
1313 struct inode *newinode = NULL;
1314 struct cifs_fattr fattr;
1315
1316 info = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
1317 if (info == NULL) {
1318 rc = -ENOMEM;
1319 goto posix_mkdir_out;
1320 }
1321
1322 mode &= ~current_umask();
1323 rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode,
1324 NULL /* netfid */, info, &oplock, full_path,
1325 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1326 CIFS_MOUNT_MAP_SPECIAL_CHR);
1327 if (rc == -EOPNOTSUPP)
1328 goto posix_mkdir_out;
1329 else if (rc) {
1330 cFYI(1, "posix mkdir returned 0x%x", rc);
1331 d_drop(dentry);
1332 goto posix_mkdir_out;
1333 }
1334
1335 if (info->Type == cpu_to_le32(-1))
1336 /* no return info, go query for it */
1337 goto posix_mkdir_get_info;
1338 /*
1339 * BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if
1340 * need to set uid/gid.
1341 */
1342
1343 cifs_unix_basic_to_fattr(&fattr, info, cifs_sb);
1344 cifs_fill_uniqueid(inode->i_sb, &fattr);
1345 newinode = cifs_iget(inode->i_sb, &fattr);
1346 if (!newinode)
1347 goto posix_mkdir_get_info;
1348
1349 d_instantiate(dentry, newinode);
1350
1351#ifdef CONFIG_CIFS_DEBUG2
1352 cFYI(1, "instantiated dentry %p %s to inode %p", dentry,
1353 dentry->d_name.name, newinode);
1354
1355 if (newinode->i_nlink != 2)
1356 cFYI(1, "unexpected number of links %d", newinode->i_nlink);
1357#endif
1358
1359posix_mkdir_out:
1360 kfree(info);
1361 return rc;
1362posix_mkdir_get_info:
1363 rc = cifs_mkdir_qinfo(inode, dentry, mode, full_path, cifs_sb, tcon,
1364 xid);
1365 goto posix_mkdir_out;
1366}
1367
1222int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) 1368int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
1223{ 1369{
1224 int rc = 0, tmprc; 1370 int rc = 0;
1225 unsigned int xid; 1371 unsigned int xid;
1226 struct cifs_sb_info *cifs_sb; 1372 struct cifs_sb_info *cifs_sb;
1227 struct tcon_link *tlink; 1373 struct tcon_link *tlink;
1228 struct cifs_tcon *tcon; 1374 struct cifs_tcon *tcon;
1229 char *full_path = NULL; 1375 struct TCP_Server_Info *server;
1230 struct inode *newinode = NULL; 1376 char *full_path;
1231 struct cifs_fattr fattr;
1232 1377
1233 cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode); 1378 cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode);
1234 1379
@@ -1248,145 +1393,29 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
1248 1393
1249 if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & 1394 if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
1250 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 1395 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
1251 u32 oplock = 0; 1396 rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb,
1252 FILE_UNIX_BASIC_INFO *pInfo = 1397 tcon, xid);
1253 kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 1398 if (rc != -EOPNOTSUPP)
1254 if (pInfo == NULL) {
1255 rc = -ENOMEM;
1256 goto mkdir_out; 1399 goto mkdir_out;
1257 } 1400 }
1258
1259 mode &= ~current_umask();
1260 rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT,
1261 mode, NULL /* netfid */, pInfo, &oplock,
1262 full_path, cifs_sb->local_nls,
1263 cifs_sb->mnt_cifs_flags &
1264 CIFS_MOUNT_MAP_SPECIAL_CHR);
1265 if (rc == -EOPNOTSUPP) {
1266 kfree(pInfo);
1267 goto mkdir_retry_old;
1268 } else if (rc) {
1269 cFYI(1, "posix mkdir returned 0x%x", rc);
1270 d_drop(direntry);
1271 } else {
1272 if (pInfo->Type == cpu_to_le32(-1)) {
1273 /* no return info, go query for it */
1274 kfree(pInfo);
1275 goto mkdir_get_info;
1276 }
1277/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
1278 to set uid/gid */
1279
1280 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1281 cifs_fill_uniqueid(inode->i_sb, &fattr);
1282 newinode = cifs_iget(inode->i_sb, &fattr);
1283 if (!newinode) {
1284 kfree(pInfo);
1285 goto mkdir_get_info;
1286 }
1287
1288 d_instantiate(direntry, newinode);
1289 1401
1290#ifdef CONFIG_CIFS_DEBUG2 1402 server = tcon->ses->server;
1291 cFYI(1, "instantiated dentry %p %s to inode %p",
1292 direntry, direntry->d_name.name, newinode);
1293 1403
1294 if (newinode->i_nlink != 2) 1404 if (!server->ops->mkdir) {
1295 cFYI(1, "unexpected number of links %d", 1405 rc = -ENOSYS;
1296 newinode->i_nlink);
1297#endif
1298 }
1299 kfree(pInfo);
1300 goto mkdir_out; 1406 goto mkdir_out;
1301 } 1407 }
1302mkdir_retry_old: 1408
1303 /* BB add setting the equivalent of mode via CreateX w/ACLs */ 1409 /* BB add setting the equivalent of mode via CreateX w/ACLs */
1304 rc = CIFSSMBMkDir(xid, tcon, full_path, cifs_sb->local_nls, 1410 rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
1305 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1306 if (rc) { 1411 if (rc) {
1307 cFYI(1, "cifs_mkdir returned 0x%x", rc); 1412 cFYI(1, "cifs_mkdir returned 0x%x", rc);
1308 d_drop(direntry); 1413 d_drop(direntry);
1309 } else { 1414 goto mkdir_out;
1310mkdir_get_info:
1311 if (tcon->unix_ext)
1312 rc = cifs_get_inode_info_unix(&newinode, full_path,
1313 inode->i_sb, xid);
1314 else
1315 rc = cifs_get_inode_info(&newinode, full_path, NULL,
1316 inode->i_sb, xid, NULL);
1317
1318 d_instantiate(direntry, newinode);
1319 /* setting nlink not necessary except in cases where we
1320 * failed to get it from the server or was set bogus */
1321 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1322 set_nlink(direntry->d_inode, 2);
1323
1324 mode &= ~current_umask();
1325 /* must turn on setgid bit if parent dir has it */
1326 if (inode->i_mode & S_ISGID)
1327 mode |= S_ISGID;
1328
1329 if (tcon->unix_ext) {
1330 struct cifs_unix_set_info_args args = {
1331 .mode = mode,
1332 .ctime = NO_CHANGE_64,
1333 .atime = NO_CHANGE_64,
1334 .mtime = NO_CHANGE_64,
1335 .device = 0,
1336 };
1337 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
1338 args.uid = (__u64)current_fsuid();
1339 if (inode->i_mode & S_ISGID)
1340 args.gid = (__u64)inode->i_gid;
1341 else
1342 args.gid = (__u64)current_fsgid();
1343 } else {
1344 args.uid = NO_CHANGE_64;
1345 args.gid = NO_CHANGE_64;
1346 }
1347 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
1348 cifs_sb->local_nls,
1349 cifs_sb->mnt_cifs_flags &
1350 CIFS_MOUNT_MAP_SPECIAL_CHR);
1351 } else {
1352 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1353 (mode & S_IWUGO) == 0) {
1354 FILE_BASIC_INFO pInfo;
1355 struct cifsInodeInfo *cifsInode;
1356 u32 dosattrs;
1357
1358 memset(&pInfo, 0, sizeof(pInfo));
1359 cifsInode = CIFS_I(newinode);
1360 dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
1361 pInfo.Attributes = cpu_to_le32(dosattrs);
1362 tmprc = CIFSSMBSetPathInfo(xid, tcon,
1363 full_path, &pInfo,
1364 cifs_sb->local_nls,
1365 cifs_sb->mnt_cifs_flags &
1366 CIFS_MOUNT_MAP_SPECIAL_CHR);
1367 if (tmprc == 0)
1368 cifsInode->cifsAttrs = dosattrs;
1369 }
1370 if (direntry->d_inode) {
1371 if (cifs_sb->mnt_cifs_flags &
1372 CIFS_MOUNT_DYNPERM)
1373 direntry->d_inode->i_mode =
1374 (mode | S_IFDIR);
1375
1376 if (cifs_sb->mnt_cifs_flags &
1377 CIFS_MOUNT_SET_UID) {
1378 direntry->d_inode->i_uid =
1379 current_fsuid();
1380 if (inode->i_mode & S_ISGID)
1381 direntry->d_inode->i_gid =
1382 inode->i_gid;
1383 else
1384 direntry->d_inode->i_gid =
1385 current_fsgid();
1386 }
1387 }
1388 }
1389 } 1415 }
1416
1417 rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
1418 xid);
1390mkdir_out: 1419mkdir_out:
1391 /* 1420 /*
1392 * Force revalidate to get parent dir info when needed since cached 1421 * Force revalidate to get parent dir info when needed since cached
@@ -1405,7 +1434,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1405 unsigned int xid; 1434 unsigned int xid;
1406 struct cifs_sb_info *cifs_sb; 1435 struct cifs_sb_info *cifs_sb;
1407 struct tcon_link *tlink; 1436 struct tcon_link *tlink;
1408 struct cifs_tcon *pTcon; 1437 struct cifs_tcon *tcon;
1438 struct TCP_Server_Info *server;
1409 char *full_path = NULL; 1439 char *full_path = NULL;
1410 struct cifsInodeInfo *cifsInode; 1440 struct cifsInodeInfo *cifsInode;
1411 1441
@@ -1425,10 +1455,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1425 rc = PTR_ERR(tlink); 1455 rc = PTR_ERR(tlink);
1426 goto rmdir_exit; 1456 goto rmdir_exit;
1427 } 1457 }
1428 pTcon = tlink_tcon(tlink); 1458 tcon = tlink_tcon(tlink);
1459 server = tcon->ses->server;
1460
1461 if (!server->ops->rmdir) {
1462 rc = -ENOSYS;
1463 cifs_put_tlink(tlink);
1464 goto rmdir_exit;
1465 }
1429 1466
1430 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, 1467 rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
1431 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1432 cifs_put_tlink(tlink); 1468 cifs_put_tlink(tlink);
1433 1469
1434 if (!rc) { 1470 if (!rc) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 09e4b3ae4564..e6ce3b112875 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -433,7 +433,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
433 if (old_file->d_inode) { 433 if (old_file->d_inode) {
434 cifsInode = CIFS_I(old_file->d_inode); 434 cifsInode = CIFS_I(old_file->d_inode);
435 if (rc == 0) { 435 if (rc == 0) {
436 spin_lock(&old_file->d_inode->i_lock);
436 inc_nlink(old_file->d_inode); 437 inc_nlink(old_file->d_inode);
438 spin_unlock(&old_file->d_inode->i_lock);
437/* BB should we make this contingent on superblock flag NOATIME? */ 439/* BB should we make this contingent on superblock flag NOATIME? */
438/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ 440/* old_file->d_inode->i_ctime = CURRENT_TIME;*/
439 /* parent dir timestamps will update from srv 441 /* parent dir timestamps will update from srv
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index c40356d24c5c..3129ac74b819 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,6 +586,27 @@ cifs_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
586#endif 586#endif
587} 587}
588 588
589static void
590cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
591 struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
592 const unsigned int xid)
593{
594 FILE_BASIC_INFO info;
595 struct cifsInodeInfo *cifsInode;
596 u32 dosattrs;
597 int rc;
598
599 memset(&info, 0, sizeof(info));
600 cifsInode = CIFS_I(inode);
601 dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
602 info.Attributes = cpu_to_le32(dosattrs);
603 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls,
604 cifs_sb->mnt_cifs_flags &
605 CIFS_MOUNT_MAP_SPECIAL_CHR);
606 if (rc == 0)
607 cifsInode->cifsAttrs = dosattrs;
608}
609
589struct smb_version_operations smb1_operations = { 610struct smb_version_operations smb1_operations = {
590 .send_cancel = send_nt_cancel, 611 .send_cancel = send_nt_cancel,
591 .compare_fids = cifs_compare_fids, 612 .compare_fids = cifs_compare_fids,
@@ -620,6 +641,9 @@ struct smb_version_operations smb1_operations = {
620 .get_srv_inum = cifs_get_srv_inum, 641 .get_srv_inum = cifs_get_srv_inum,
621 .build_path_to_root = cifs_build_path_to_root, 642 .build_path_to_root = cifs_build_path_to_root,
622 .echo = CIFSSMBEcho, 643 .echo = CIFSSMBEcho,
644 .mkdir = CIFSSMBMkDir,
645 .mkdir_setinfo = cifs_mkdir_setinfo,
646 .rmdir = CIFSSMBRmDir,
623}; 647};
624 648
625struct smb_version_values smb1_values = { 649struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1ba5c405315c..2aa5cb08c526 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -122,3 +122,42 @@ out:
122 kfree(smb2_data); 122 kfree(smb2_data);
123 return rc; 123 return rc;
124} 124}
125
126int
127smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
128 struct cifs_sb_info *cifs_sb)
129{
130 return smb2_open_op_close(xid, tcon, cifs_sb, name,
131 FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
132 CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
133}
134
135void
136smb2_mkdir_setinfo(struct inode *inode, const char *name,
137 struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
138 const unsigned int xid)
139{
140 FILE_BASIC_INFO data;
141 struct cifsInodeInfo *cifs_i;
142 u32 dosattrs;
143 int tmprc;
144
145 memset(&data, 0, sizeof(data));
146 cifs_i = CIFS_I(inode);
147 dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
148 data.Attributes = cpu_to_le32(dosattrs);
149 tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
150 FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
151 CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
152 if (tmprc == 0)
153 cifs_i->cifsAttrs = dosattrs;
154}
155
156int
157smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
158 struct cifs_sb_info *cifs_sb)
159{
160 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
161 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
162 NULL, SMB2_OP_DELETE);
163}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index a4ff5d547554..e4d3b9964167 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -52,7 +52,8 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
52 cERROR(1, "Bad protocol string signature header %x", 52 cERROR(1, "Bad protocol string signature header %x",
53 *(unsigned int *) hdr->ProtocolId); 53 *(unsigned int *) hdr->ProtocolId);
54 if (mid != hdr->MessageId) 54 if (mid != hdr->MessageId)
55 cERROR(1, "Mids do not match"); 55 cERROR(1, "Mids do not match: %llu and %llu", mid,
56 hdr->MessageId);
56 } 57 }
57 cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId); 58 cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId);
58 return 1; 59 return 1;
@@ -107,7 +108,7 @@ smb2_check_message(char *buf, unsigned int length)
107 * ie Validate the wct via smb2_struct_sizes table above 108 * ie Validate the wct via smb2_struct_sizes table above
108 */ 109 */
109 110
110 if (length < 2 + sizeof(struct smb2_hdr)) { 111 if (length < sizeof(struct smb2_pdu)) {
111 if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { 112 if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
112 pdu->StructureSize2 = 0; 113 pdu->StructureSize2 = 0;
113 /* 114 /*
@@ -121,15 +122,15 @@ smb2_check_message(char *buf, unsigned int length)
121 return 1; 122 return 1;
122 } 123 }
123 if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) { 124 if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
124 cERROR(1, "SMB length greater than maximum, mid=%lld", mid); 125 cERROR(1, "SMB length greater than maximum, mid=%llu", mid);
125 return 1; 126 return 1;
126 } 127 }
127 128
128 if (check_smb2_hdr(hdr, mid)) 129 if (check_smb2_hdr(hdr, mid))
129 return 1; 130 return 1;
130 131
131 if (hdr->StructureSize != SMB2_HEADER_SIZE) { 132 if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
132 cERROR(1, "Illegal structure size %d", 133 cERROR(1, "Illegal structure size %u",
133 le16_to_cpu(hdr->StructureSize)); 134 le16_to_cpu(hdr->StructureSize));
134 return 1; 135 return 1;
135 } 136 }
@@ -161,8 +162,9 @@ smb2_check_message(char *buf, unsigned int length)
161 if (4 + len != clc_len) { 162 if (4 + len != clc_len) {
162 cFYI(1, "Calculated size %u length %u mismatch mid %llu", 163 cFYI(1, "Calculated size %u length %u mismatch mid %llu",
163 clc_len, 4 + len, mid); 164 clc_len, 4 + len, mid);
164 if (clc_len == 4 + len + 1) /* BB FIXME (fix samba) */ 165 /* server can return one byte more */
165 return 0; /* BB workaround Samba 3 bug SessSetup rsp */ 166 if (clc_len == 4 + len + 1)
167 return 0;
166 return 1; 168 return 1;
167 } 169 }
168 return 0; 170 return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 410cf925ea26..826209bf3684 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -318,6 +318,9 @@ struct smb_version_operations smb21_operations = {
318 .query_path_info = smb2_query_path_info, 318 .query_path_info = smb2_query_path_info,
319 .get_srv_inum = smb2_get_srv_inum, 319 .get_srv_inum = smb2_get_srv_inum,
320 .build_path_to_root = smb2_build_path_to_root, 320 .build_path_to_root = smb2_build_path_to_root,
321 .mkdir = smb2_mkdir,
322 .mkdir_setinfo = smb2_mkdir_setinfo,
323 .rmdir = smb2_rmdir,
321}; 324};
322 325
323struct smb_version_values smb21_values = { 326struct smb_version_values smb21_values = {
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f37a1b41b402..15dc8eea8273 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -87,10 +87,6 @@
87 87
88#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe) 88#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe)
89 89
90#define SMB2_HEADER_SIZE __constant_le16_to_cpu(64)
91
92#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_le16_to_cpu(9)
93
94/* 90/*
95 * SMB2 Header Definition 91 * SMB2 Header Definition
96 * 92 *
@@ -99,6 +95,9 @@
99 * "PDU" : "Protocol Data Unit" (ie a network "frame") 95 * "PDU" : "Protocol Data Unit" (ie a network "frame")
100 * 96 *
101 */ 97 */
98
99#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
100
102struct smb2_hdr { 101struct smb2_hdr {
103 __be32 smb2_buf_length; /* big endian on wire */ 102 __be32 smb2_buf_length; /* big endian on wire */
104 /* length is only two or three bytes - with 103 /* length is only two or three bytes - with
@@ -140,6 +139,9 @@ struct smb2_pdu {
140 * command code name for the struct. Note that structures must be packed. 139 * command code name for the struct. Note that structures must be packed.
141 * 140 *
142 */ 141 */
142
143#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
144
143struct smb2_err_rsp { 145struct smb2_err_rsp {
144 struct smb2_hdr hdr; 146 struct smb2_hdr hdr;
145 __le16 StructureSize; 147 __le16 StructureSize;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 902bbe2b5ad3..bfaa7b148afd 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -52,6 +52,14 @@ extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
52 struct cifs_sb_info *cifs_sb, 52 struct cifs_sb_info *cifs_sb,
53 const char *full_path, FILE_ALL_INFO *data, 53 const char *full_path, FILE_ALL_INFO *data,
54 bool *adjust_tz); 54 bool *adjust_tz);
55extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
56 const char *name, struct cifs_sb_info *cifs_sb);
57extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
58 struct cifs_sb_info *cifs_sb,
59 struct cifs_tcon *tcon, const unsigned int xid);
60extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
61 const char *name, struct cifs_sb_info *cifs_sb);
62
55/* 63/*
56 * SMB2 Worker functions - most of protocol specific implementation details 64 * SMB2 Worker functions - most of protocol specific implementation details
57 * are contained within these calls. 65 * are contained within these calls.
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 83867ef348df..d9b639b95fa8 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -503,13 +503,16 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
503 /* convert the length into a more usable form */ 503 /* convert the length into a more usable form */
504 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 504 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
505 struct kvec iov; 505 struct kvec iov;
506 int rc = 0;
506 507
507 iov.iov_base = mid->resp_buf; 508 iov.iov_base = mid->resp_buf;
508 iov.iov_len = len; 509 iov.iov_len = len;
509 /* FIXME: add code to kill session */ 510 /* FIXME: add code to kill session */
510 if (cifs_verify_signature(&iov, 1, server, 511 rc = cifs_verify_signature(&iov, 1, server,
511 mid->sequence_number + 1) != 0) 512 mid->sequence_number + 1);
512 cERROR(1, "Unexpected SMB signature"); 513 if (rc)
514 cERROR(1, "SMB signature verification returned error = "
515 "%d", rc);
513 } 516 }
514 517
515 /* BB special case reconnect tid and uid here? */ 518 /* BB special case reconnect tid and uid here? */
diff --git a/fs/compat.c b/fs/compat.c
index 6161255fac45..1bdb350ea5d3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1155,11 +1155,14 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1155 struct file *file; 1155 struct file *file;
1156 int fput_needed; 1156 int fput_needed;
1157 ssize_t ret; 1157 ssize_t ret;
1158 loff_t pos;
1158 1159
1159 file = fget_light(fd, &fput_needed); 1160 file = fget_light(fd, &fput_needed);
1160 if (!file) 1161 if (!file)
1161 return -EBADF; 1162 return -EBADF;
1162 ret = compat_readv(file, vec, vlen, &file->f_pos); 1163 pos = file->f_pos;
1164 ret = compat_readv(file, vec, vlen, &pos);
1165 file->f_pos = pos;
1163 fput_light(file, fput_needed); 1166 fput_light(file, fput_needed);
1164 return ret; 1167 return ret;
1165} 1168}
@@ -1221,11 +1224,14 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1221 struct file *file; 1224 struct file *file;
1222 int fput_needed; 1225 int fput_needed;
1223 ssize_t ret; 1226 ssize_t ret;
1227 loff_t pos;
1224 1228
1225 file = fget_light(fd, &fput_needed); 1229 file = fget_light(fd, &fput_needed);
1226 if (!file) 1230 if (!file)
1227 return -EBADF; 1231 return -EBADF;
1228 ret = compat_writev(file, vec, vlen, &file->f_pos); 1232 pos = file->f_pos;
1233 ret = compat_writev(file, vec, vlen, &pos);
1234 file->f_pos = pos;
1229 fput_light(file, fput_needed); 1235 fput_light(file, fput_needed);
1230 return ret; 1236 return ret;
1231} 1237}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f39..f86c720dba0e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1062 unsigned long user_addr; 1062 unsigned long user_addr;
1063 size_t bytes; 1063 size_t bytes;
1064 struct buffer_head map_bh = { 0, }; 1064 struct buffer_head map_bh = { 0, };
1065 struct blk_plug plug;
1065 1066
1066 if (rw & WRITE) 1067 if (rw & WRITE)
1067 rw = WRITE_ODIRECT; 1068 rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1177 PAGE_SIZE - user_addr / PAGE_SIZE); 1178 PAGE_SIZE - user_addr / PAGE_SIZE);
1178 } 1179 }
1179 1180
1181 blk_start_plug(&plug);
1182
1180 for (seg = 0; seg < nr_segs; seg++) { 1183 for (seg = 0; seg < nr_segs; seg++) {
1181 user_addr = (unsigned long)iov[seg].iov_base; 1184 user_addr = (unsigned long)iov[seg].iov_base;
1182 sdio.size += bytes = iov[seg].iov_len; 1185 sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1235 if (sdio.bio) 1238 if (sdio.bio)
1236 dio_bio_submit(dio, &sdio); 1239 dio_bio_submit(dio, &sdio);
1237 1240
1241 blk_finish_plug(&plug);
1242
1238 /* 1243 /*
1239 * It is possible that, we return short IO due to end of file. 1244 * It is possible that, we return short IO due to end of file.
1240 * In that case, we need to release all the pages we got hold on. 1245 * In that case, we need to release all the pages we got hold on.
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 989e034f02bd..cfb4b9fed520 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -385,8 +385,6 @@ struct ecryptfs_msg_ctx {
385 struct mutex mux; 385 struct mutex mux;
386}; 386};
387 387
388struct ecryptfs_daemon;
389
390struct ecryptfs_daemon { 388struct ecryptfs_daemon {
391#define ECRYPTFS_DAEMON_IN_READ 0x00000001 389#define ECRYPTFS_DAEMON_IN_READ 0x00000001
392#define ECRYPTFS_DAEMON_IN_POLL 0x00000002 390#define ECRYPTFS_DAEMON_IN_POLL 0x00000002
@@ -394,10 +392,7 @@ struct ecryptfs_daemon {
394#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 392#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008
395 u32 flags; 393 u32 flags;
396 u32 num_queued_msg_ctx; 394 u32 num_queued_msg_ctx;
397 struct pid *pid; 395 struct file *file;
398 uid_t euid;
399 struct user_namespace *user_ns;
400 struct task_struct *task;
401 struct mutex mux; 396 struct mutex mux;
402 struct list_head msg_ctx_out_queue; 397 struct list_head msg_ctx_out_queue;
403 wait_queue_head_t wait; 398 wait_queue_head_t wait;
@@ -554,6 +549,8 @@ extern struct kmem_cache *ecryptfs_key_tfm_cache;
554struct inode *ecryptfs_get_inode(struct inode *lower_inode, 549struct inode *ecryptfs_get_inode(struct inode *lower_inode,
555 struct super_block *sb); 550 struct super_block *sb);
556void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); 551void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
552int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
553 struct inode *ecryptfs_inode);
557int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 554int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
558 size_t *decrypted_name_size, 555 size_t *decrypted_name_size,
559 struct dentry *ecryptfs_dentry, 556 struct dentry *ecryptfs_dentry,
@@ -607,13 +604,8 @@ int
607ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, 604ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
608 size_t size, int flags); 605 size_t size, int flags);
609int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); 606int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
610int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, 607int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
611 struct pid *pid); 608 struct ecryptfs_message *msg, u32 seq);
612int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
613 struct pid *pid);
614int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
615 struct user_namespace *user_ns, struct pid *pid,
616 u32 seq);
617int ecryptfs_send_message(char *data, int data_len, 609int ecryptfs_send_message(char *data, int data_len,
618 struct ecryptfs_msg_ctx **msg_ctx); 610 struct ecryptfs_msg_ctx **msg_ctx);
619int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, 611int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
@@ -658,8 +650,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
658 struct inode *ecryptfs_inode); 650 struct inode *ecryptfs_inode);
659struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); 651struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
660int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 652int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
661int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 653int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon);
662 struct user_namespace *user_ns);
663int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, 654int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
664 size_t *length_size); 655 size_t *length_size);
665int ecryptfs_write_packet_length(char *dest, size_t size, 656int ecryptfs_write_packet_length(char *dest, size_t size,
@@ -671,8 +662,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
671 u16 msg_flags, struct ecryptfs_daemon *daemon); 662 u16 msg_flags, struct ecryptfs_daemon *daemon);
672void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); 663void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
673int 664int
674ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, 665ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file);
675 struct user_namespace *user_ns, struct pid *pid);
676int ecryptfs_init_kthread(void); 666int ecryptfs_init_kthread(void);
677void ecryptfs_destroy_kthread(void); 667void ecryptfs_destroy_kthread(void);
678int ecryptfs_privileged_open(struct file **lower_file, 668int ecryptfs_privileged_open(struct file **lower_file,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2b17f2f9b121..44ce5c6a541d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -138,29 +138,50 @@ out:
138 return rc; 138 return rc;
139} 139}
140 140
141static void ecryptfs_vma_close(struct vm_area_struct *vma) 141struct kmem_cache *ecryptfs_file_info_cache;
142{
143 filemap_write_and_wait(vma->vm_file->f_mapping);
144}
145
146static const struct vm_operations_struct ecryptfs_file_vm_ops = {
147 .close = ecryptfs_vma_close,
148 .fault = filemap_fault,
149};
150 142
151static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) 143static int read_or_initialize_metadata(struct dentry *dentry)
152{ 144{
145 struct inode *inode = dentry->d_inode;
146 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
147 struct ecryptfs_crypt_stat *crypt_stat;
153 int rc; 148 int rc;
154 149
155 rc = generic_file_mmap(file, vma); 150 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
151 mount_crypt_stat = &ecryptfs_superblock_to_private(
152 inode->i_sb)->mount_crypt_stat;
153 mutex_lock(&crypt_stat->cs_mutex);
154
155 if (crypt_stat->flags & ECRYPTFS_POLICY_APPLIED &&
156 crypt_stat->flags & ECRYPTFS_KEY_VALID) {
157 rc = 0;
158 goto out;
159 }
160
161 rc = ecryptfs_read_metadata(dentry);
156 if (!rc) 162 if (!rc)
157 vma->vm_ops = &ecryptfs_file_vm_ops; 163 goto out;
164
165 if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) {
166 crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
167 | ECRYPTFS_ENCRYPTED);
168 rc = 0;
169 goto out;
170 }
158 171
172 if (!(mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) &&
173 !i_size_read(ecryptfs_inode_to_lower(inode))) {
174 rc = ecryptfs_initialize_file(dentry, inode);
175 if (!rc)
176 goto out;
177 }
178
179 rc = -EIO;
180out:
181 mutex_unlock(&crypt_stat->cs_mutex);
159 return rc; 182 return rc;
160} 183}
161 184
162struct kmem_cache *ecryptfs_file_info_cache;
163
164/** 185/**
165 * ecryptfs_open 186 * ecryptfs_open
166 * @inode: inode speciying file to open 187 * @inode: inode speciying file to open
@@ -236,32 +257,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
236 rc = 0; 257 rc = 0;
237 goto out; 258 goto out;
238 } 259 }
239 mutex_lock(&crypt_stat->cs_mutex); 260 rc = read_or_initialize_metadata(ecryptfs_dentry);
240 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) 261 if (rc)
241 || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { 262 goto out_put;
242 rc = ecryptfs_read_metadata(ecryptfs_dentry);
243 if (rc) {
244 ecryptfs_printk(KERN_DEBUG,
245 "Valid headers not found\n");
246 if (!(mount_crypt_stat->flags
247 & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
248 rc = -EIO;
249 printk(KERN_WARNING "Either the lower file "
250 "is not in a valid eCryptfs format, "
251 "or the key could not be retrieved. "
252 "Plaintext passthrough mode is not "
253 "enabled; returning -EIO\n");
254 mutex_unlock(&crypt_stat->cs_mutex);
255 goto out_put;
256 }
257 rc = 0;
258 crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
259 | ECRYPTFS_ENCRYPTED);
260 mutex_unlock(&crypt_stat->cs_mutex);
261 goto out;
262 }
263 }
264 mutex_unlock(&crypt_stat->cs_mutex);
265 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " 263 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
266 "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, 264 "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
267 (unsigned long long)i_size_read(inode)); 265 (unsigned long long)i_size_read(inode));
@@ -292,15 +290,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
292static int 290static int
293ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 291ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
294{ 292{
295 int rc = 0; 293 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
296
297 rc = generic_file_fsync(file, start, end, datasync);
298 if (rc)
299 goto out;
300 rc = vfs_fsync_range(ecryptfs_file_to_lower(file), start, end,
301 datasync);
302out:
303 return rc;
304} 294}
305 295
306static int ecryptfs_fasync(int fd, struct file *file, int flag) 296static int ecryptfs_fasync(int fd, struct file *file, int flag)
@@ -369,7 +359,7 @@ const struct file_operations ecryptfs_main_fops = {
369#ifdef CONFIG_COMPAT 359#ifdef CONFIG_COMPAT
370 .compat_ioctl = ecryptfs_compat_ioctl, 360 .compat_ioctl = ecryptfs_compat_ioctl,
371#endif 361#endif
372 .mmap = ecryptfs_file_mmap, 362 .mmap = generic_file_mmap,
373 .open = ecryptfs_open, 363 .open = ecryptfs_open,
374 .flush = ecryptfs_flush, 364 .flush = ecryptfs_flush,
375 .release = ecryptfs_release, 365 .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ffa2be57804d..534b129ea676 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -143,6 +143,31 @@ static int ecryptfs_interpose(struct dentry *lower_dentry,
143 return 0; 143 return 0;
144} 144}
145 145
146static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
147 struct inode *inode)
148{
149 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
150 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
151 struct dentry *lower_dir_dentry;
152 int rc;
153
154 dget(lower_dentry);
155 lower_dir_dentry = lock_parent(lower_dentry);
156 rc = vfs_unlink(lower_dir_inode, lower_dentry);
157 if (rc) {
158 printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
159 goto out_unlock;
160 }
161 fsstack_copy_attr_times(dir, lower_dir_inode);
162 set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
163 inode->i_ctime = dir->i_ctime;
164 d_drop(dentry);
165out_unlock:
166 unlock_dir(lower_dir_dentry);
167 dput(lower_dentry);
168 return rc;
169}
170
146/** 171/**
147 * ecryptfs_do_create 172 * ecryptfs_do_create
148 * @directory_inode: inode of the new file's dentry's parent in ecryptfs 173 * @directory_inode: inode of the new file's dentry's parent in ecryptfs
@@ -182,8 +207,10 @@ ecryptfs_do_create(struct inode *directory_inode,
182 } 207 }
183 inode = __ecryptfs_get_inode(lower_dentry->d_inode, 208 inode = __ecryptfs_get_inode(lower_dentry->d_inode,
184 directory_inode->i_sb); 209 directory_inode->i_sb);
185 if (IS_ERR(inode)) 210 if (IS_ERR(inode)) {
211 vfs_unlink(lower_dir_dentry->d_inode, lower_dentry);
186 goto out_lock; 212 goto out_lock;
213 }
187 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); 214 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
188 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 215 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
189out_lock: 216out_lock:
@@ -200,8 +227,8 @@ out:
200 * 227 *
201 * Returns zero on success 228 * Returns zero on success
202 */ 229 */
203static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, 230int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
204 struct inode *ecryptfs_inode) 231 struct inode *ecryptfs_inode)
205{ 232{
206 struct ecryptfs_crypt_stat *crypt_stat = 233 struct ecryptfs_crypt_stat *crypt_stat =
207 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 234 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -264,7 +291,9 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
264 * that this on disk file is prepared to be an ecryptfs file */ 291 * that this on disk file is prepared to be an ecryptfs file */
265 rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); 292 rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
266 if (rc) { 293 if (rc) {
267 drop_nlink(ecryptfs_inode); 294 ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
295 ecryptfs_inode);
296 make_bad_inode(ecryptfs_inode);
268 unlock_new_inode(ecryptfs_inode); 297 unlock_new_inode(ecryptfs_inode);
269 iput(ecryptfs_inode); 298 iput(ecryptfs_inode);
270 goto out; 299 goto out;
@@ -318,21 +347,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
318 struct vfsmount *lower_mnt; 347 struct vfsmount *lower_mnt;
319 int rc = 0; 348 int rc = 0;
320 349
321 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
322 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
323 BUG_ON(!lower_dentry->d_count);
324
325 dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); 350 dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
326 ecryptfs_set_dentry_private(dentry, dentry_info);
327 if (!dentry_info) { 351 if (!dentry_info) {
328 printk(KERN_ERR "%s: Out of memory whilst attempting " 352 printk(KERN_ERR "%s: Out of memory whilst attempting "
329 "to allocate ecryptfs_dentry_info struct\n", 353 "to allocate ecryptfs_dentry_info struct\n",
330 __func__); 354 __func__);
331 dput(lower_dentry); 355 dput(lower_dentry);
332 mntput(lower_mnt);
333 d_drop(dentry);
334 return -ENOMEM; 356 return -ENOMEM;
335 } 357 }
358
359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
361 BUG_ON(!lower_dentry->d_count);
362
363 ecryptfs_set_dentry_private(dentry, dentry_info);
336 ecryptfs_set_dentry_lower(dentry, lower_dentry); 364 ecryptfs_set_dentry_lower(dentry, lower_dentry);
337 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 365 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
338 366
@@ -381,12 +409,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
381 struct dentry *lower_dir_dentry, *lower_dentry; 409 struct dentry *lower_dir_dentry, *lower_dentry;
382 int rc = 0; 410 int rc = 0;
383 411
384 if ((ecryptfs_dentry->d_name.len == 1
385 && !strcmp(ecryptfs_dentry->d_name.name, "."))
386 || (ecryptfs_dentry->d_name.len == 2
387 && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
388 goto out_d_drop;
389 }
390 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 412 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
391 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 413 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
392 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 414 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
@@ -397,8 +419,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
397 rc = PTR_ERR(lower_dentry); 419 rc = PTR_ERR(lower_dentry);
398 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 420 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
399 "[%d] on lower_dentry = [%s]\n", __func__, rc, 421 "[%d] on lower_dentry = [%s]\n", __func__, rc,
400 encrypted_and_encoded_name); 422 ecryptfs_dentry->d_name.name);
401 goto out_d_drop; 423 goto out;
402 } 424 }
403 if (lower_dentry->d_inode) 425 if (lower_dentry->d_inode)
404 goto interpose; 426 goto interpose;
@@ -415,7 +437,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
415 if (rc) { 437 if (rc) {
416 printk(KERN_ERR "%s: Error attempting to encrypt and encode " 438 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
417 "filename; rc = [%d]\n", __func__, rc); 439 "filename; rc = [%d]\n", __func__, rc);
418 goto out_d_drop; 440 goto out;
419 } 441 }
420 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 442 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
421 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 443 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
@@ -427,14 +449,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
427 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 449 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
428 "[%d] on lower_dentry = [%s]\n", __func__, rc, 450 "[%d] on lower_dentry = [%s]\n", __func__, rc,
429 encrypted_and_encoded_name); 451 encrypted_and_encoded_name);
430 goto out_d_drop; 452 goto out;
431 } 453 }
432interpose: 454interpose:
433 rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, 455 rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
434 ecryptfs_dir_inode); 456 ecryptfs_dir_inode);
435 goto out;
436out_d_drop:
437 d_drop(ecryptfs_dentry);
438out: 457out:
439 kfree(encrypted_and_encoded_name); 458 kfree(encrypted_and_encoded_name);
440 return ERR_PTR(rc); 459 return ERR_PTR(rc);
@@ -476,27 +495,7 @@ out_lock:
476 495
477static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) 496static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
478{ 497{
479 int rc = 0; 498 return ecryptfs_do_unlink(dir, dentry, dentry->d_inode);
480 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
481 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
482 struct dentry *lower_dir_dentry;
483
484 dget(lower_dentry);
485 lower_dir_dentry = lock_parent(lower_dentry);
486 rc = vfs_unlink(lower_dir_inode, lower_dentry);
487 if (rc) {
488 printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
489 goto out_unlock;
490 }
491 fsstack_copy_attr_times(dir, lower_dir_inode);
492 set_nlink(dentry->d_inode,
493 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink);
494 dentry->d_inode->i_ctime = dir->i_ctime;
495 d_drop(dentry);
496out_unlock:
497 unlock_dir(lower_dir_dentry);
498 dput(lower_dentry);
499 return rc;
500} 499}
501 500
502static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, 501static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -971,12 +970,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
971 goto out; 970 goto out;
972 } 971 }
973 972
974 if (S_ISREG(inode->i_mode)) {
975 rc = filemap_write_and_wait(inode->i_mapping);
976 if (rc)
977 goto out;
978 fsstack_copy_attr_all(inode, lower_inode);
979 }
980 memcpy(&lower_ia, ia, sizeof(lower_ia)); 973 memcpy(&lower_ia, ia, sizeof(lower_ia));
981 if (ia->ia_valid & ATTR_FILE) 974 if (ia->ia_valid & ATTR_FILE)
982 lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); 975 lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1c0b3b6b75c6..2768138eefee 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -279,6 +279,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
279 char *fnek_src; 279 char *fnek_src;
280 char *cipher_key_bytes_src; 280 char *cipher_key_bytes_src;
281 char *fn_cipher_key_bytes_src; 281 char *fn_cipher_key_bytes_src;
282 u8 cipher_code;
282 283
283 *check_ruid = 0; 284 *check_ruid = 0;
284 285
@@ -420,6 +421,18 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
420 && !fn_cipher_key_bytes_set) 421 && !fn_cipher_key_bytes_set)
421 mount_crypt_stat->global_default_fn_cipher_key_bytes = 422 mount_crypt_stat->global_default_fn_cipher_key_bytes =
422 mount_crypt_stat->global_default_cipher_key_size; 423 mount_crypt_stat->global_default_cipher_key_size;
424
425 cipher_code = ecryptfs_code_for_cipher_string(
426 mount_crypt_stat->global_default_cipher_name,
427 mount_crypt_stat->global_default_cipher_key_size);
428 if (!cipher_code) {
429 ecryptfs_printk(KERN_ERR,
430 "eCryptfs doesn't support cipher: %s",
431 mount_crypt_stat->global_default_cipher_name);
432 rc = -EINVAL;
433 goto out;
434 }
435
423 mutex_lock(&key_tfm_list_mutex); 436 mutex_lock(&key_tfm_list_mutex);
424 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, 437 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
425 NULL)) { 438 NULL)) {
@@ -540,6 +553,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
540 } 553 }
541 554
542 ecryptfs_set_superblock_lower(s, path.dentry->d_sb); 555 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
556
557 /**
558 * Set the POSIX ACL flag based on whether they're enabled in the lower
559 * mount. Force a read-only eCryptfs mount if the lower mount is ro.
560 * Allow a ro eCryptfs mount even when the lower mount is rw.
561 */
562 s->s_flags = flags & ~MS_POSIXACL;
563 s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL);
564
543 s->s_maxbytes = path.dentry->d_sb->s_maxbytes; 565 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
544 s->s_blocksize = path.dentry->d_sb->s_blocksize; 566 s->s_blocksize = path.dentry->d_sb->s_blocksize;
545 s->s_magic = ECRYPTFS_SUPER_MAGIC; 567 s->s_magic = ECRYPTFS_SUPER_MAGIC;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index a750f957b145..b29bb8bfa8d9 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -32,8 +32,8 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
32static struct hlist_head *ecryptfs_daemon_hash; 32static struct hlist_head *ecryptfs_daemon_hash;
33struct mutex ecryptfs_daemon_hash_mux; 33struct mutex ecryptfs_daemon_hash_mux;
34static int ecryptfs_hash_bits; 34static int ecryptfs_hash_bits;
35#define ecryptfs_uid_hash(uid) \ 35#define ecryptfs_current_euid_hash(uid) \
36 hash_long((unsigned long)uid, ecryptfs_hash_bits) 36 hash_long((unsigned long)current_euid(), ecryptfs_hash_bits)
37 37
38static u32 ecryptfs_msg_counter; 38static u32 ecryptfs_msg_counter;
39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; 39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -105,26 +105,24 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
105 105
106/** 106/**
107 * ecryptfs_find_daemon_by_euid 107 * ecryptfs_find_daemon_by_euid
108 * @euid: The effective user id which maps to the desired daemon id
109 * @user_ns: The namespace in which @euid applies
110 * @daemon: If return value is zero, points to the desired daemon pointer 108 * @daemon: If return value is zero, points to the desired daemon pointer
111 * 109 *
112 * Must be called with ecryptfs_daemon_hash_mux held. 110 * Must be called with ecryptfs_daemon_hash_mux held.
113 * 111 *
114 * Search the hash list for the given user id. 112 * Search the hash list for the current effective user id.
115 * 113 *
116 * Returns zero if the user id exists in the list; non-zero otherwise. 114 * Returns zero if the user id exists in the list; non-zero otherwise.
117 */ 115 */
118int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 116int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)
119 struct user_namespace *user_ns)
120{ 117{
121 struct hlist_node *elem; 118 struct hlist_node *elem;
122 int rc; 119 int rc;
123 120
124 hlist_for_each_entry(*daemon, elem, 121 hlist_for_each_entry(*daemon, elem,
125 &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)], 122 &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],
126 euid_chain) { 123 euid_chain) {
127 if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) { 124 if ((*daemon)->file->f_cred->euid == current_euid() &&
125 (*daemon)->file->f_cred->user_ns == current_user_ns()) {
128 rc = 0; 126 rc = 0;
129 goto out; 127 goto out;
130 } 128 }
@@ -137,9 +135,7 @@ out:
137/** 135/**
138 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct 136 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
139 * @daemon: Pointer to set to newly allocated daemon struct 137 * @daemon: Pointer to set to newly allocated daemon struct
140 * @euid: Effective user id for the daemon 138 * @file: File used when opening /dev/ecryptfs
141 * @user_ns: The namespace in which @euid applies
142 * @pid: Process id for the daemon
143 * 139 *
144 * Must be called ceremoniously while in possession of 140 * Must be called ceremoniously while in possession of
145 * ecryptfs_sacred_daemon_hash_mux 141 * ecryptfs_sacred_daemon_hash_mux
@@ -147,8 +143,7 @@ out:
147 * Returns zero on success; non-zero otherwise 143 * Returns zero on success; non-zero otherwise
148 */ 144 */
149int 145int
150ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, 146ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file)
151 struct user_namespace *user_ns, struct pid *pid)
152{ 147{
153 int rc = 0; 148 int rc = 0;
154 149
@@ -159,16 +154,13 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
159 "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); 154 "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
160 goto out; 155 goto out;
161 } 156 }
162 (*daemon)->euid = euid; 157 (*daemon)->file = file;
163 (*daemon)->user_ns = get_user_ns(user_ns);
164 (*daemon)->pid = get_pid(pid);
165 (*daemon)->task = current;
166 mutex_init(&(*daemon)->mux); 158 mutex_init(&(*daemon)->mux);
167 INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); 159 INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue);
168 init_waitqueue_head(&(*daemon)->wait); 160 init_waitqueue_head(&(*daemon)->wait);
169 (*daemon)->num_queued_msg_ctx = 0; 161 (*daemon)->num_queued_msg_ctx = 0;
170 hlist_add_head(&(*daemon)->euid_chain, 162 hlist_add_head(&(*daemon)->euid_chain,
171 &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]); 163 &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()]);
172out: 164out:
173 return rc; 165 return rc;
174} 166}
@@ -188,9 +180,6 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
188 if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) 180 if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ)
189 || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { 181 || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) {
190 rc = -EBUSY; 182 rc = -EBUSY;
191 printk(KERN_WARNING "%s: Attempt to destroy daemon with pid "
192 "[0x%p], but it is in the midst of a read or a poll\n",
193 __func__, daemon->pid);
194 mutex_unlock(&daemon->mux); 183 mutex_unlock(&daemon->mux);
195 goto out; 184 goto out;
196 } 185 }
@@ -203,12 +192,6 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
203 ecryptfs_msg_ctx_alloc_to_free(msg_ctx); 192 ecryptfs_msg_ctx_alloc_to_free(msg_ctx);
204 } 193 }
205 hlist_del(&daemon->euid_chain); 194 hlist_del(&daemon->euid_chain);
206 if (daemon->task)
207 wake_up_process(daemon->task);
208 if (daemon->pid)
209 put_pid(daemon->pid);
210 if (daemon->user_ns)
211 put_user_ns(daemon->user_ns);
212 mutex_unlock(&daemon->mux); 195 mutex_unlock(&daemon->mux);
213 kzfree(daemon); 196 kzfree(daemon);
214out: 197out:
@@ -216,42 +199,9 @@ out:
216} 199}
217 200
218/** 201/**
219 * ecryptfs_process_quit
220 * @euid: The user ID owner of the message
221 * @user_ns: The namespace in which @euid applies
222 * @pid: The process ID for the userspace program that sent the
223 * message
224 *
225 * Deletes the corresponding daemon for the given euid and pid, if
226 * it is the registered that is requesting the deletion. Returns zero
227 * after deleting the desired daemon; non-zero otherwise.
228 */
229int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
230 struct pid *pid)
231{
232 struct ecryptfs_daemon *daemon;
233 int rc;
234
235 mutex_lock(&ecryptfs_daemon_hash_mux);
236 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns);
237 if (rc || !daemon) {
238 rc = -EINVAL;
239 printk(KERN_ERR "Received request from user [%d] to "
240 "unregister unrecognized daemon [0x%p]\n", euid, pid);
241 goto out_unlock;
242 }
243 rc = ecryptfs_exorcise_daemon(daemon);
244out_unlock:
245 mutex_unlock(&ecryptfs_daemon_hash_mux);
246 return rc;
247}
248
249/**
250 * ecryptfs_process_reponse 202 * ecryptfs_process_reponse
251 * @msg: The ecryptfs message received; the caller should sanity check 203 * @msg: The ecryptfs message received; the caller should sanity check
252 * msg->data_len and free the memory 204 * msg->data_len and free the memory
253 * @pid: The process ID of the userspace application that sent the
254 * message
255 * @seq: The sequence number of the message; must match the sequence 205 * @seq: The sequence number of the message; must match the sequence
256 * number for the existing message context waiting for this 206 * number for the existing message context waiting for this
257 * response 207 * response
@@ -270,16 +220,11 @@ out_unlock:
270 * 220 *
271 * Returns zero on success; non-zero otherwise 221 * Returns zero on success; non-zero otherwise
272 */ 222 */
273int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, 223int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
274 struct user_namespace *user_ns, struct pid *pid, 224 struct ecryptfs_message *msg, u32 seq)
275 u32 seq)
276{ 225{
277 struct ecryptfs_daemon *uninitialized_var(daemon);
278 struct ecryptfs_msg_ctx *msg_ctx; 226 struct ecryptfs_msg_ctx *msg_ctx;
279 size_t msg_size; 227 size_t msg_size;
280 struct nsproxy *nsproxy;
281 struct user_namespace *tsk_user_ns;
282 uid_t ctx_euid;
283 int rc; 228 int rc;
284 229
285 if (msg->index >= ecryptfs_message_buf_len) { 230 if (msg->index >= ecryptfs_message_buf_len) {
@@ -292,51 +237,6 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
292 } 237 }
293 msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; 238 msg_ctx = &ecryptfs_msg_ctx_arr[msg->index];
294 mutex_lock(&msg_ctx->mux); 239 mutex_lock(&msg_ctx->mux);
295 mutex_lock(&ecryptfs_daemon_hash_mux);
296 rcu_read_lock();
297 nsproxy = task_nsproxy(msg_ctx->task);
298 if (nsproxy == NULL) {
299 rc = -EBADMSG;
300 printk(KERN_ERR "%s: Receiving process is a zombie. Dropping "
301 "message.\n", __func__);
302 rcu_read_unlock();
303 mutex_unlock(&ecryptfs_daemon_hash_mux);
304 goto wake_up;
305 }
306 tsk_user_ns = __task_cred(msg_ctx->task)->user_ns;
307 ctx_euid = task_euid(msg_ctx->task);
308 rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns);
309 rcu_read_unlock();
310 mutex_unlock(&ecryptfs_daemon_hash_mux);
311 if (rc) {
312 rc = -EBADMSG;
313 printk(KERN_WARNING "%s: User [%d] received a "
314 "message response from process [0x%p] but does "
315 "not have a registered daemon\n", __func__,
316 ctx_euid, pid);
317 goto wake_up;
318 }
319 if (ctx_euid != euid) {
320 rc = -EBADMSG;
321 printk(KERN_WARNING "%s: Received message from user "
322 "[%d]; expected message from user [%d]\n", __func__,
323 euid, ctx_euid);
324 goto unlock;
325 }
326 if (tsk_user_ns != user_ns) {
327 rc = -EBADMSG;
328 printk(KERN_WARNING "%s: Received message from user_ns "
329 "[0x%p]; expected message from user_ns [0x%p]\n",
330 __func__, user_ns, tsk_user_ns);
331 goto unlock;
332 }
333 if (daemon->pid != pid) {
334 rc = -EBADMSG;
335 printk(KERN_ERR "%s: User [%d] sent a message response "
336 "from an unrecognized process [0x%p]\n",
337 __func__, ctx_euid, pid);
338 goto unlock;
339 }
340 if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { 240 if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) {
341 rc = -EINVAL; 241 rc = -EINVAL;
342 printk(KERN_WARNING "%s: Desired context element is not " 242 printk(KERN_WARNING "%s: Desired context element is not "
@@ -359,9 +259,8 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
359 } 259 }
360 memcpy(msg_ctx->msg, msg, msg_size); 260 memcpy(msg_ctx->msg, msg, msg_size);
361 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; 261 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
362 rc = 0;
363wake_up:
364 wake_up_process(msg_ctx->task); 262 wake_up_process(msg_ctx->task);
263 rc = 0;
365unlock: 264unlock:
366 mutex_unlock(&msg_ctx->mux); 265 mutex_unlock(&msg_ctx->mux);
367out: 266out:
@@ -383,14 +282,11 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
383 struct ecryptfs_msg_ctx **msg_ctx) 282 struct ecryptfs_msg_ctx **msg_ctx)
384{ 283{
385 struct ecryptfs_daemon *daemon; 284 struct ecryptfs_daemon *daemon;
386 uid_t euid = current_euid();
387 int rc; 285 int rc;
388 286
389 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); 287 rc = ecryptfs_find_daemon_by_euid(&daemon);
390 if (rc || !daemon) { 288 if (rc || !daemon) {
391 rc = -ENOTCONN; 289 rc = -ENOTCONN;
392 printk(KERN_ERR "%s: User [%d] does not have a daemon "
393 "registered\n", __func__, euid);
394 goto out; 290 goto out;
395 } 291 }
396 mutex_lock(&ecryptfs_msg_ctx_lists_mux); 292 mutex_lock(&ecryptfs_msg_ctx_lists_mux);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index c0038f6566d4..412e6eda25f8 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -33,7 +33,7 @@ static atomic_t ecryptfs_num_miscdev_opens;
33 33
34/** 34/**
35 * ecryptfs_miscdev_poll 35 * ecryptfs_miscdev_poll
36 * @file: dev file (ignored) 36 * @file: dev file
37 * @pt: dev poll table (ignored) 37 * @pt: dev poll table (ignored)
38 * 38 *
39 * Returns the poll mask 39 * Returns the poll mask
@@ -41,20 +41,10 @@ static atomic_t ecryptfs_num_miscdev_opens;
41static unsigned int 41static unsigned int
42ecryptfs_miscdev_poll(struct file *file, poll_table *pt) 42ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
43{ 43{
44 struct ecryptfs_daemon *daemon; 44 struct ecryptfs_daemon *daemon = file->private_data;
45 unsigned int mask = 0; 45 unsigned int mask = 0;
46 uid_t euid = current_euid();
47 int rc;
48 46
49 mutex_lock(&ecryptfs_daemon_hash_mux);
50 /* TODO: Just use file->private_data? */
51 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
52 if (rc || !daemon) {
53 mutex_unlock(&ecryptfs_daemon_hash_mux);
54 return -EINVAL;
55 }
56 mutex_lock(&daemon->mux); 47 mutex_lock(&daemon->mux);
57 mutex_unlock(&ecryptfs_daemon_hash_mux);
58 if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { 48 if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
59 printk(KERN_WARNING "%s: Attempt to poll on zombified " 49 printk(KERN_WARNING "%s: Attempt to poll on zombified "
60 "daemon\n", __func__); 50 "daemon\n", __func__);
@@ -79,7 +69,7 @@ out_unlock_daemon:
79/** 69/**
80 * ecryptfs_miscdev_open 70 * ecryptfs_miscdev_open
81 * @inode: inode of miscdev handle (ignored) 71 * @inode: inode of miscdev handle (ignored)
82 * @file: file for miscdev handle (ignored) 72 * @file: file for miscdev handle
83 * 73 *
84 * Returns zero on success; non-zero otherwise 74 * Returns zero on success; non-zero otherwise
85 */ 75 */
@@ -87,7 +77,6 @@ static int
87ecryptfs_miscdev_open(struct inode *inode, struct file *file) 77ecryptfs_miscdev_open(struct inode *inode, struct file *file)
88{ 78{
89 struct ecryptfs_daemon *daemon = NULL; 79 struct ecryptfs_daemon *daemon = NULL;
90 uid_t euid = current_euid();
91 int rc; 80 int rc;
92 81
93 mutex_lock(&ecryptfs_daemon_hash_mux); 82 mutex_lock(&ecryptfs_daemon_hash_mux);
@@ -98,30 +87,20 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
98 "count; rc = [%d]\n", __func__, rc); 87 "count; rc = [%d]\n", __func__, rc);
99 goto out_unlock_daemon_list; 88 goto out_unlock_daemon_list;
100 } 89 }
101 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); 90 rc = ecryptfs_find_daemon_by_euid(&daemon);
102 if (rc || !daemon) { 91 if (!rc) {
103 rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(),
104 task_pid(current));
105 if (rc) {
106 printk(KERN_ERR "%s: Error attempting to spawn daemon; "
107 "rc = [%d]\n", __func__, rc);
108 goto out_module_put_unlock_daemon_list;
109 }
110 }
111 mutex_lock(&daemon->mux);
112 if (daemon->pid != task_pid(current)) {
113 rc = -EINVAL; 92 rc = -EINVAL;
114 printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], " 93 goto out_unlock_daemon_list;
115 "but pid [0x%p] has attempted to open the handle " 94 }
116 "instead\n", __func__, daemon->pid, daemon->euid, 95 rc = ecryptfs_spawn_daemon(&daemon, file);
117 task_pid(current)); 96 if (rc) {
118 goto out_unlock_daemon; 97 printk(KERN_ERR "%s: Error attempting to spawn daemon; "
98 "rc = [%d]\n", __func__, rc);
99 goto out_module_put_unlock_daemon_list;
119 } 100 }
101 mutex_lock(&daemon->mux);
120 if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { 102 if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) {
121 rc = -EBUSY; 103 rc = -EBUSY;
122 printk(KERN_ERR "%s: Miscellaneous device handle may only be "
123 "opened once per daemon; pid [0x%p] already has this "
124 "handle open\n", __func__, daemon->pid);
125 goto out_unlock_daemon; 104 goto out_unlock_daemon;
126 } 105 }
127 daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; 106 daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN;
@@ -140,7 +119,7 @@ out_unlock_daemon_list:
140/** 119/**
141 * ecryptfs_miscdev_release 120 * ecryptfs_miscdev_release
142 * @inode: inode of fs/ecryptfs/euid handle (ignored) 121 * @inode: inode of fs/ecryptfs/euid handle (ignored)
143 * @file: file for fs/ecryptfs/euid handle (ignored) 122 * @file: file for fs/ecryptfs/euid handle
144 * 123 *
145 * This keeps the daemon registered until the daemon sends another 124 * This keeps the daemon registered until the daemon sends another
146 * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. 125 * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters.
@@ -150,20 +129,18 @@ out_unlock_daemon_list:
150static int 129static int
151ecryptfs_miscdev_release(struct inode *inode, struct file *file) 130ecryptfs_miscdev_release(struct inode *inode, struct file *file)
152{ 131{
153 struct ecryptfs_daemon *daemon = NULL; 132 struct ecryptfs_daemon *daemon = file->private_data;
154 uid_t euid = current_euid();
155 int rc; 133 int rc;
156 134
157 mutex_lock(&ecryptfs_daemon_hash_mux);
158 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
159 if (rc || !daemon)
160 daemon = file->private_data;
161 mutex_lock(&daemon->mux); 135 mutex_lock(&daemon->mux);
162 BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); 136 BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN));
163 daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; 137 daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN;
164 atomic_dec(&ecryptfs_num_miscdev_opens); 138 atomic_dec(&ecryptfs_num_miscdev_opens);
165 mutex_unlock(&daemon->mux); 139 mutex_unlock(&daemon->mux);
140
141 mutex_lock(&ecryptfs_daemon_hash_mux);
166 rc = ecryptfs_exorcise_daemon(daemon); 142 rc = ecryptfs_exorcise_daemon(daemon);
143 mutex_unlock(&ecryptfs_daemon_hash_mux);
167 if (rc) { 144 if (rc) {
168 printk(KERN_CRIT "%s: Fatal error whilst attempting to " 145 printk(KERN_CRIT "%s: Fatal error whilst attempting to "
169 "shut down daemon; rc = [%d]. Please report this " 146 "shut down daemon; rc = [%d]. Please report this "
@@ -171,7 +148,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file)
171 BUG(); 148 BUG();
172 } 149 }
173 module_put(THIS_MODULE); 150 module_put(THIS_MODULE);
174 mutex_unlock(&ecryptfs_daemon_hash_mux);
175 return rc; 151 return rc;
176} 152}
177 153
@@ -248,7 +224,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
248 224
249/** 225/**
250 * ecryptfs_miscdev_read - format and send message from queue 226 * ecryptfs_miscdev_read - format and send message from queue
251 * @file: fs/ecryptfs/euid miscdevfs handle (ignored) 227 * @file: miscdevfs handle
252 * @buf: User buffer into which to copy the next message on the daemon queue 228 * @buf: User buffer into which to copy the next message on the daemon queue
253 * @count: Amount of space available in @buf 229 * @count: Amount of space available in @buf
254 * @ppos: Offset in file (ignored) 230 * @ppos: Offset in file (ignored)
@@ -262,43 +238,27 @@ static ssize_t
262ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, 238ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
263 loff_t *ppos) 239 loff_t *ppos)
264{ 240{
265 struct ecryptfs_daemon *daemon; 241 struct ecryptfs_daemon *daemon = file->private_data;
266 struct ecryptfs_msg_ctx *msg_ctx; 242 struct ecryptfs_msg_ctx *msg_ctx;
267 size_t packet_length_size; 243 size_t packet_length_size;
268 char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE]; 244 char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
269 size_t i; 245 size_t i;
270 size_t total_length; 246 size_t total_length;
271 uid_t euid = current_euid();
272 int rc; 247 int rc;
273 248
274 mutex_lock(&ecryptfs_daemon_hash_mux);
275 /* TODO: Just use file->private_data? */
276 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
277 if (rc || !daemon) {
278 mutex_unlock(&ecryptfs_daemon_hash_mux);
279 return -EINVAL;
280 }
281 mutex_lock(&daemon->mux); 249 mutex_lock(&daemon->mux);
282 if (task_pid(current) != daemon->pid) {
283 mutex_unlock(&daemon->mux);
284 mutex_unlock(&ecryptfs_daemon_hash_mux);
285 return -EPERM;
286 }
287 if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { 250 if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
288 rc = 0; 251 rc = 0;
289 mutex_unlock(&ecryptfs_daemon_hash_mux);
290 printk(KERN_WARNING "%s: Attempt to read from zombified " 252 printk(KERN_WARNING "%s: Attempt to read from zombified "
291 "daemon\n", __func__); 253 "daemon\n", __func__);
292 goto out_unlock_daemon; 254 goto out_unlock_daemon;
293 } 255 }
294 if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { 256 if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) {
295 rc = 0; 257 rc = 0;
296 mutex_unlock(&ecryptfs_daemon_hash_mux);
297 goto out_unlock_daemon; 258 goto out_unlock_daemon;
298 } 259 }
299 /* This daemon will not go away so long as this flag is set */ 260 /* This daemon will not go away so long as this flag is set */
300 daemon->flags |= ECRYPTFS_DAEMON_IN_READ; 261 daemon->flags |= ECRYPTFS_DAEMON_IN_READ;
301 mutex_unlock(&ecryptfs_daemon_hash_mux);
302check_list: 262check_list:
303 if (list_empty(&daemon->msg_ctx_out_queue)) { 263 if (list_empty(&daemon->msg_ctx_out_queue)) {
304 mutex_unlock(&daemon->mux); 264 mutex_unlock(&daemon->mux);
@@ -382,16 +342,12 @@ out_unlock_daemon:
382 * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon 342 * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
383 * @data: Bytes comprising struct ecryptfs_message 343 * @data: Bytes comprising struct ecryptfs_message
384 * @data_size: sizeof(struct ecryptfs_message) + data len 344 * @data_size: sizeof(struct ecryptfs_message) + data len
385 * @euid: Effective user id of miscdevess sending the miscdev response
386 * @user_ns: The namespace in which @euid applies
387 * @pid: Miscdevess id of miscdevess sending the miscdev response
388 * @seq: Sequence number for miscdev response packet 345 * @seq: Sequence number for miscdev response packet
389 * 346 *
390 * Returns zero on success; non-zero otherwise 347 * Returns zero on success; non-zero otherwise
391 */ 348 */
392static int ecryptfs_miscdev_response(char *data, size_t data_size, 349static int ecryptfs_miscdev_response(struct ecryptfs_daemon *daemon, char *data,
393 uid_t euid, struct user_namespace *user_ns, 350 size_t data_size, u32 seq)
394 struct pid *pid, u32 seq)
395{ 351{
396 struct ecryptfs_message *msg = (struct ecryptfs_message *)data; 352 struct ecryptfs_message *msg = (struct ecryptfs_message *)data;
397 int rc; 353 int rc;
@@ -403,7 +359,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
403 rc = -EINVAL; 359 rc = -EINVAL;
404 goto out; 360 goto out;
405 } 361 }
406 rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq); 362 rc = ecryptfs_process_response(daemon, msg, seq);
407 if (rc) 363 if (rc)
408 printk(KERN_ERR 364 printk(KERN_ERR
409 "Error processing response message; rc = [%d]\n", rc); 365 "Error processing response message; rc = [%d]\n", rc);
@@ -413,7 +369,7 @@ out:
413 369
414/** 370/**
415 * ecryptfs_miscdev_write - handle write to daemon miscdev handle 371 * ecryptfs_miscdev_write - handle write to daemon miscdev handle
416 * @file: File for misc dev handle (ignored) 372 * @file: File for misc dev handle
417 * @buf: Buffer containing user data 373 * @buf: Buffer containing user data
418 * @count: Amount of data in @buf 374 * @count: Amount of data in @buf
419 * @ppos: Pointer to offset in file (ignored) 375 * @ppos: Pointer to offset in file (ignored)
@@ -428,7 +384,6 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
428 u32 seq; 384 u32 seq;
429 size_t packet_size, packet_size_length; 385 size_t packet_size, packet_size_length;
430 char *data; 386 char *data;
431 uid_t euid = current_euid();
432 unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE]; 387 unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
433 ssize_t rc; 388 ssize_t rc;
434 389
@@ -488,10 +443,9 @@ memdup:
488 } 443 }
489 memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE); 444 memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
490 seq = be32_to_cpu(counter_nbo); 445 seq = be32_to_cpu(counter_nbo);
491 rc = ecryptfs_miscdev_response( 446 rc = ecryptfs_miscdev_response(file->private_data,
492 &data[PKT_LEN_OFFSET + packet_size_length], 447 &data[PKT_LEN_OFFSET + packet_size_length],
493 packet_size, euid, current_user_ns(), 448 packet_size, seq);
494 task_pid(current), seq);
495 if (rc) { 449 if (rc) {
496 printk(KERN_WARNING "%s: Failed to deliver miscdev " 450 printk(KERN_WARNING "%s: Failed to deliver miscdev "
497 "response to requesting operation; rc = [%zd]\n", 451 "response to requesting operation; rc = [%zd]\n",
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index a46b3a8fee1e..bd1d57f98f74 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -66,18 +66,6 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
66{ 66{
67 int rc; 67 int rc;
68 68
69 /*
70 * Refuse to write the page out if we are called from reclaim context
71 * since our writepage() path may potentially allocate memory when
72 * calling into the lower fs vfs_write() which may in turn invoke
73 * us again.
74 */
75 if (current->flags & PF_MEMALLOC) {
76 redirty_page_for_writepage(wbc, page);
77 rc = 0;
78 goto out;
79 }
80
81 rc = ecryptfs_encrypt_page(page); 69 rc = ecryptfs_encrypt_page(page);
82 if (rc) { 70 if (rc) {
83 ecryptfs_printk(KERN_WARNING, "Error encrypting " 71 ecryptfs_printk(KERN_WARNING, "Error encrypting "
@@ -498,7 +486,6 @@ static int ecryptfs_write_end(struct file *file,
498 struct ecryptfs_crypt_stat *crypt_stat = 486 struct ecryptfs_crypt_stat *crypt_stat =
499 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 487 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
500 int rc; 488 int rc;
501 int need_unlock_page = 1;
502 489
503 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 490 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
504 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); 491 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
@@ -519,26 +506,26 @@ static int ecryptfs_write_end(struct file *file,
519 "zeros in page with index = [0x%.16lx]\n", index); 506 "zeros in page with index = [0x%.16lx]\n", index);
520 goto out; 507 goto out;
521 } 508 }
522 set_page_dirty(page); 509 rc = ecryptfs_encrypt_page(page);
523 unlock_page(page); 510 if (rc) {
524 need_unlock_page = 0; 511 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
512 "index [0x%.16lx])\n", index);
513 goto out;
514 }
525 if (pos + copied > i_size_read(ecryptfs_inode)) { 515 if (pos + copied > i_size_read(ecryptfs_inode)) {
526 i_size_write(ecryptfs_inode, pos + copied); 516 i_size_write(ecryptfs_inode, pos + copied);
527 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 517 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
528 "[0x%.16llx]\n", 518 "[0x%.16llx]\n",
529 (unsigned long long)i_size_read(ecryptfs_inode)); 519 (unsigned long long)i_size_read(ecryptfs_inode));
530 balance_dirty_pages_ratelimited(mapping);
531 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
532 if (rc) {
533 printk(KERN_ERR "Error writing inode size to metadata; "
534 "rc = [%d]\n", rc);
535 goto out;
536 }
537 } 520 }
538 rc = copied; 521 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
522 if (rc)
523 printk(KERN_ERR "Error writing inode size to metadata; "
524 "rc = [%d]\n", rc);
525 else
526 rc = copied;
539out: 527out:
540 if (need_unlock_page) 528 unlock_page(page);
541 unlock_page(page);
542 page_cache_release(page); 529 page_cache_release(page);
543 return rc; 530 return rc;
544} 531}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1c8b55670804..eedec84c1809 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1654,8 +1654,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1654 error = PTR_ERR(file); 1654 error = PTR_ERR(file);
1655 goto out_free_fd; 1655 goto out_free_fd;
1656 } 1656 }
1657 fd_install(fd, file);
1658 ep->file = file; 1657 ep->file = file;
1658 fd_install(fd, file);
1659 return fd; 1659 return fd;
1660 1660
1661out_free_fd: 1661out_free_fd:
diff --git a/fs/exec.c b/fs/exec.c
index 3684353ebd5f..574cf4de4ec3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file)
2069 */ 2069 */
2070static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) 2070static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
2071{ 2071{
2072 struct file *rp, *wp; 2072 struct file *files[2];
2073 struct fdtable *fdt; 2073 struct fdtable *fdt;
2074 struct coredump_params *cp = (struct coredump_params *)info->data; 2074 struct coredump_params *cp = (struct coredump_params *)info->data;
2075 struct files_struct *cf = current->files; 2075 struct files_struct *cf = current->files;
2076 int err = create_pipe_files(files, 0);
2077 if (err)
2078 return err;
2076 2079
2077 wp = create_write_pipe(0); 2080 cp->file = files[1];
2078 if (IS_ERR(wp))
2079 return PTR_ERR(wp);
2080
2081 rp = create_read_pipe(wp, 0);
2082 if (IS_ERR(rp)) {
2083 free_write_pipe(wp);
2084 return PTR_ERR(rp);
2085 }
2086
2087 cp->file = wp;
2088 2081
2089 sys_close(0); 2082 sys_close(0);
2090 fd_install(0, rp); 2083 fd_install(0, files[0]);
2091 spin_lock(&cf->file_lock); 2084 spin_lock(&cf->file_lock);
2092 fdt = files_fdtable(cf); 2085 fdt = files_fdtable(cf);
2093 __set_open_fd(0, fdt); 2086 __set_open_fd(0, fdt);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 5badb0c039de..1562c27a2fab 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,12 @@
37 37
38#define EXOFS_DBGMSG2(M...) do {} while (0) 38#define EXOFS_DBGMSG2(M...) do {} while (0)
39 39
40enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
41
42unsigned exofs_max_io_pages(struct ore_layout *layout, 40unsigned exofs_max_io_pages(struct ore_layout *layout,
43 unsigned expected_pages) 41 unsigned expected_pages)
44{ 42{
45 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 43 unsigned pages = min_t(unsigned, expected_pages,
44 layout->max_io_length / PAGE_SIZE);
46 45
47 /* TODO: easily support bio chaining */
48 pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
49 return pages; 46 return pages;
50} 47}
51 48
@@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol)
101 * it might not end here. don't be left with nothing 98 * it might not end here. don't be left with nothing
102 */ 99 */
103 if (!pcol->expected_pages) 100 if (!pcol->expected_pages)
104 pcol->expected_pages = MAX_PAGES_KMALLOC; 101 pcol->expected_pages =
102 exofs_max_io_pages(&pcol->sbi->layout, ~0);
105} 103}
106 104
107static int pcol_try_alloc(struct page_collect *pcol) 105static int pcol_try_alloc(struct page_collect *pcol)
@@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page)
389 size_t len; 387 size_t len;
390 int ret; 388 int ret;
391 389
390 BUG_ON(!PageLocked(page));
391
392 /* FIXME: Just for debugging, will be removed */ 392 /* FIXME: Just for debugging, will be removed */
393 if (PageUptodate(page)) 393 if (PageUptodate(page))
394 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, 394 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
@@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
572 572
573 if (!pcol->that_locked_page || 573 if (!pcol->that_locked_page ||
574 (pcol->that_locked_page->index != index)) { 574 (pcol->that_locked_page->index != index)) {
575 struct page *page = find_get_page(pcol->inode->i_mapping, index); 575 struct page *page;
576 loff_t i_size = i_size_read(pcol->inode);
577
578 if (offset >= i_size) {
579 *uptodate = true;
580 EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index);
581 return ZERO_PAGE(0);
582 }
576 583
584 page = find_get_page(pcol->inode->i_mapping, index);
577 if (!page) { 585 if (!page) {
578 page = find_or_create_page(pcol->inode->i_mapping, 586 page = find_or_create_page(pcol->inode->i_mapping,
579 index, GFP_NOFS); 587 index, GFP_NOFS);
@@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page)
602{ 610{
603 struct page_collect *pcol = priv; 611 struct page_collect *pcol = priv;
604 612
605 if (pcol->that_locked_page != page) { 613 if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
606 EXOFS_DBGMSG("index=0x%lx\n", page->index); 614 EXOFS_DBGMSG("index=0x%lx\n", page->index);
607 page_cache_release(page); 615 page_cache_release(page);
608 return; 616 return;
609 } 617 }
610 EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); 618 EXOFS_DBGMSG("that_locked_page index=0x%lx\n",
619 ZERO_PAGE(0) == page ? -1 : page->index);
611} 620}
612 621
613static const struct _ore_r4w_op _r4w_op = { 622static const struct _ore_r4w_op _r4w_op = {
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 24a49d47e935..1585db1aa365 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -837,11 +837,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
837 bio->bi_rw |= REQ_WRITE; 837 bio->bi_rw |= REQ_WRITE;
838 } 838 }
839 839
840 osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, 840 osd_req_write(or, _ios_obj(ios, cur_comp),
841 bio, per_dev->length); 841 per_dev->offset, bio, per_dev->length);
842 ORE_DBGMSG("write(0x%llx) offset=0x%llx " 842 ORE_DBGMSG("write(0x%llx) offset=0x%llx "
843 "length=0x%llx dev=%d\n", 843 "length=0x%llx dev=%d\n",
844 _LLU(_ios_obj(ios, dev)->id), 844 _LLU(_ios_obj(ios, cur_comp)->id),
845 _LLU(per_dev->offset), 845 _LLU(per_dev->offset),
846 _LLU(per_dev->length), dev); 846 _LLU(per_dev->length), dev);
847 } else if (ios->kern_buff) { 847 } else if (ios->kern_buff) {
@@ -853,20 +853,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
853 (ios->si.unit_off + ios->length > 853 (ios->si.unit_off + ios->length >
854 ios->layout->stripe_unit)); 854 ios->layout->stripe_unit));
855 855
856 ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), 856 ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
857 per_dev->offset, 857 per_dev->offset,
858 ios->kern_buff, ios->length); 858 ios->kern_buff, ios->length);
859 if (unlikely(ret)) 859 if (unlikely(ret))
860 goto out; 860 goto out;
861 ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " 861 ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
862 "length=0x%llx dev=%d\n", 862 "length=0x%llx dev=%d\n",
863 _LLU(_ios_obj(ios, dev)->id), 863 _LLU(_ios_obj(ios, cur_comp)->id),
864 _LLU(per_dev->offset), 864 _LLU(per_dev->offset),
865 _LLU(ios->length), per_dev->dev); 865 _LLU(ios->length), per_dev->dev);
866 } else { 866 } else {
867 osd_req_set_attributes(or, _ios_obj(ios, dev)); 867 osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
868 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 868 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
869 _LLU(_ios_obj(ios, dev)->id), 869 _LLU(_ios_obj(ios, cur_comp)->id),
870 ios->out_attr_len, dev); 870 ios->out_attr_len, dev);
871 } 871 }
872 872
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 433783624d10..dde41a75c7c8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -400,8 +400,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
400 ret = ore_write(ios); 400 ret = ore_write(ios);
401 if (unlikely(ret)) 401 if (unlikely(ret))
402 EXOFS_ERR("%s: ore_write failed.\n", __func__); 402 EXOFS_ERR("%s: ore_write failed.\n", __func__);
403 else
404 sb->s_dirt = 0;
405 403
406 404
407 unlock_super(sb); 405 unlock_super(sb);
@@ -412,14 +410,6 @@ out:
412 return ret; 410 return ret;
413} 411}
414 412
415static void exofs_write_super(struct super_block *sb)
416{
417 if (!(sb->s_flags & MS_RDONLY))
418 exofs_sync_fs(sb, 1);
419 else
420 sb->s_dirt = 0;
421}
422
423static void _exofs_print_device(const char *msg, const char *dev_path, 413static void _exofs_print_device(const char *msg, const char *dev_path,
424 struct osd_dev *od, u64 pid) 414 struct osd_dev *od, u64 pid)
425{ 415{
@@ -952,7 +942,6 @@ static const struct super_operations exofs_sops = {
952 .write_inode = exofs_write_inode, 942 .write_inode = exofs_write_inode,
953 .evict_inode = exofs_evict_inode, 943 .evict_inode = exofs_evict_inode,
954 .put_super = exofs_put_super, 944 .put_super = exofs_put_super,
955 .write_super = exofs_write_super,
956 .sync_fs = exofs_sync_fs, 945 .sync_fs = exofs_sync_fs,
957 .statfs = exofs_statfs, 946 .statfs = exofs_statfs,
958}; 947};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 264d315f6c47..6363ac66fafa 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode)
79 truncate_inode_pages(&inode->i_data, 0); 79 truncate_inode_pages(&inode->i_data, 0);
80 80
81 if (want_delete) { 81 if (want_delete) {
82 sb_start_intwrite(inode->i_sb);
82 /* set dtime */ 83 /* set dtime */
83 EXT2_I(inode)->i_dtime = get_seconds(); 84 EXT2_I(inode)->i_dtime = get_seconds();
84 mark_inode_dirty(inode); 85 mark_inode_dirty(inode);
@@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode)
98 if (unlikely(rsv)) 99 if (unlikely(rsv))
99 kfree(rsv); 100 kfree(rsv);
100 101
101 if (want_delete) 102 if (want_delete) {
102 ext2_free_inode(inode); 103 ext2_free_inode(inode);
104 sb_end_intwrite(inode->i_sb);
105 }
103} 106}
104 107
105typedef struct { 108typedef struct {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9f311d27b16f..af74d9e27b71 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb,
42static int ext2_remount (struct super_block * sb, int * flags, char * data); 42static int ext2_remount (struct super_block * sb, int * flags, char * data);
43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
44static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
45static int ext2_freeze(struct super_block *sb);
46static int ext2_unfreeze(struct super_block *sb);
45 47
46void ext2_error(struct super_block *sb, const char *function, 48void ext2_error(struct super_block *sb, const char *function,
47 const char *fmt, ...) 49 const char *fmt, ...)
@@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = {
305 .evict_inode = ext2_evict_inode, 307 .evict_inode = ext2_evict_inode,
306 .put_super = ext2_put_super, 308 .put_super = ext2_put_super,
307 .sync_fs = ext2_sync_fs, 309 .sync_fs = ext2_sync_fs,
310 .freeze_fs = ext2_freeze,
311 .unfreeze_fs = ext2_unfreeze,
308 .statfs = ext2_statfs, 312 .statfs = ext2_statfs,
309 .remount_fs = ext2_remount, 313 .remount_fs = ext2_remount,
310 .show_options = ext2_show_options, 314 .show_options = ext2_show_options,
@@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
1200 return 0; 1204 return 0;
1201} 1205}
1202 1206
1207static int ext2_freeze(struct super_block *sb)
1208{
1209 struct ext2_sb_info *sbi = EXT2_SB(sb);
1210
1211 /*
1212 * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared
1213 * because we have unattached inodes and thus filesystem is not fully
1214 * consistent.
1215 */
1216 if (atomic_long_read(&sb->s_remove_count)) {
1217 ext2_sync_fs(sb, 1);
1218 return 0;
1219 }
1220 /* Set EXT2_FS_VALID flag */
1221 spin_lock(&sbi->s_lock);
1222 sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state);
1223 spin_unlock(&sbi->s_lock);
1224 ext2_sync_super(sb, sbi->s_es, 1);
1225
1226 return 0;
1227}
1228
1229static int ext2_unfreeze(struct super_block *sb)
1230{
1231 /* Just write sb to clear EXT2_VALID_FS flag */
1232 ext2_write_super(sb);
1233
1234 return 0;
1235}
1203 1236
1204void ext2_write_super(struct super_block *sb) 1237void ext2_write_super(struct super_block *sb)
1205{ 1238{
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9a4a5c48b1c9..ff574b4e345e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3072,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle,
3072 struct ext3_inode_info *ei = EXT3_I(inode); 3072 struct ext3_inode_info *ei = EXT3_I(inode);
3073 struct buffer_head *bh = iloc->bh; 3073 struct buffer_head *bh = iloc->bh;
3074 int err = 0, rc, block; 3074 int err = 0, rc, block;
3075 int need_datasync = 0;
3076 __le32 disksize;
3075 uid_t i_uid; 3077 uid_t i_uid;
3076 gid_t i_gid; 3078 gid_t i_gid;
3077 3079
@@ -3113,7 +3115,11 @@ again:
3113 raw_inode->i_gid_high = 0; 3115 raw_inode->i_gid_high = 0;
3114 } 3116 }
3115 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 3117 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3116 raw_inode->i_size = cpu_to_le32(ei->i_disksize); 3118 disksize = cpu_to_le32(ei->i_disksize);
3119 if (disksize != raw_inode->i_size) {
3120 need_datasync = 1;
3121 raw_inode->i_size = disksize;
3122 }
3117 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 3123 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
3118 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 3124 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
3119 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 3125 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -3129,8 +3135,11 @@ again:
3129 if (!S_ISREG(inode->i_mode)) { 3135 if (!S_ISREG(inode->i_mode)) {
3130 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 3136 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
3131 } else { 3137 } else {
3132 raw_inode->i_size_high = 3138 disksize = cpu_to_le32(ei->i_disksize >> 32);
3133 cpu_to_le32(ei->i_disksize >> 32); 3139 if (disksize != raw_inode->i_size_high) {
3140 raw_inode->i_size_high = disksize;
3141 need_datasync = 1;
3142 }
3134 if (ei->i_disksize > 0x7fffffffULL) { 3143 if (ei->i_disksize > 0x7fffffffULL) {
3135 struct super_block *sb = inode->i_sb; 3144 struct super_block *sb = inode->i_sb;
3136 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, 3145 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -3183,6 +3192,8 @@ again:
3183 ext3_clear_inode_state(inode, EXT3_STATE_NEW); 3192 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3184 3193
3185 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3194 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3195 if (need_datasync)
3196 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
3186out_brelse: 3197out_brelse:
3187 brelse (bh); 3198 brelse (bh);
3188 ext3_std_error(inode->i_sb, err); 3199 ext3_std_error(inode->i_sb, err);
@@ -3459,14 +3470,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3459 * inode out, but prune_icache isn't a user-visible syncing function. 3470 * inode out, but prune_icache isn't a user-visible syncing function.
3460 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 3471 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3461 * we start and wait on commits. 3472 * we start and wait on commits.
3462 *
3463 * Is this efficient/effective? Well, we're being nice to the system
3464 * by cleaning up our inodes proactively so they can be reaped
3465 * without I/O. But we are potentially leaving up to five seconds'
3466 * worth of inodes floating about which prune_icache wants us to
3467 * write out. One way to fix that would be to get prune_icache()
3468 * to do a write_super() to free up some memory. It has the desired
3469 * effect.
3470 */ 3473 */
3471int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) 3474int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3472{ 3475{
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index ff9bcdc5b0d5..8c892e93d8e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -64,11 +64,6 @@ static int ext3_freeze(struct super_block *sb);
64 64
65/* 65/*
66 * Wrappers for journal_start/end. 66 * Wrappers for journal_start/end.
67 *
68 * The only special thing we need to do here is to make sure that all
69 * journal_end calls result in the superblock being marked dirty, so
70 * that sync() will call the filesystem's write_super callback if
71 * appropriate.
72 */ 67 */
73handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) 68handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
74{ 69{
@@ -90,12 +85,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
90 return journal_start(journal, nblocks); 85 return journal_start(journal, nblocks);
91} 86}
92 87
93/*
94 * The only special thing we need to do here is to make sure that all
95 * journal_stop calls result in the superblock being marked dirty, so
96 * that sync() will call the filesystem's write_super callback if
97 * appropriate.
98 */
99int __ext3_journal_stop(const char *where, handle_t *handle) 88int __ext3_journal_stop(const char *where, handle_t *handle)
100{ 89{
101 struct super_block *sb; 90 struct super_block *sb;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d23b31ca9d7a..1b5089067d01 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -280,14 +280,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
280 return desc; 280 return desc;
281} 281}
282 282
283static int ext4_valid_block_bitmap(struct super_block *sb, 283/*
284 struct ext4_group_desc *desc, 284 * Return the block number which was discovered to be invalid, or 0 if
285 unsigned int block_group, 285 * the block bitmap is valid.
286 struct buffer_head *bh) 286 */
287static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
288 struct ext4_group_desc *desc,
289 unsigned int block_group,
290 struct buffer_head *bh)
287{ 291{
288 ext4_grpblk_t offset; 292 ext4_grpblk_t offset;
289 ext4_grpblk_t next_zero_bit; 293 ext4_grpblk_t next_zero_bit;
290 ext4_fsblk_t bitmap_blk; 294 ext4_fsblk_t blk;
291 ext4_fsblk_t group_first_block; 295 ext4_fsblk_t group_first_block;
292 296
293 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 297 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
@@ -297,37 +301,33 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
297 * or it has to also read the block group where the bitmaps 301 * or it has to also read the block group where the bitmaps
298 * are located to verify they are set. 302 * are located to verify they are set.
299 */ 303 */
300 return 1; 304 return 0;
301 } 305 }
302 group_first_block = ext4_group_first_block_no(sb, block_group); 306 group_first_block = ext4_group_first_block_no(sb, block_group);
303 307
304 /* check whether block bitmap block number is set */ 308 /* check whether block bitmap block number is set */
305 bitmap_blk = ext4_block_bitmap(sb, desc); 309 blk = ext4_block_bitmap(sb, desc);
306 offset = bitmap_blk - group_first_block; 310 offset = blk - group_first_block;
307 if (!ext4_test_bit(offset, bh->b_data)) 311 if (!ext4_test_bit(offset, bh->b_data))
308 /* bad block bitmap */ 312 /* bad block bitmap */
309 goto err_out; 313 return blk;
310 314
311 /* check whether the inode bitmap block number is set */ 315 /* check whether the inode bitmap block number is set */
312 bitmap_blk = ext4_inode_bitmap(sb, desc); 316 blk = ext4_inode_bitmap(sb, desc);
313 offset = bitmap_blk - group_first_block; 317 offset = blk - group_first_block;
314 if (!ext4_test_bit(offset, bh->b_data)) 318 if (!ext4_test_bit(offset, bh->b_data))
315 /* bad block bitmap */ 319 /* bad block bitmap */
316 goto err_out; 320 return blk;
317 321
318 /* check whether the inode table block number is set */ 322 /* check whether the inode table block number is set */
319 bitmap_blk = ext4_inode_table(sb, desc); 323 blk = ext4_inode_table(sb, desc);
320 offset = bitmap_blk - group_first_block; 324 offset = blk - group_first_block;
321 next_zero_bit = ext4_find_next_zero_bit(bh->b_data, 325 next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
322 offset + EXT4_SB(sb)->s_itb_per_group, 326 offset + EXT4_SB(sb)->s_itb_per_group,
323 offset); 327 offset);
324 if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) 328 if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
325 /* good bitmap for inode tables */ 329 /* bad bitmap for inode tables */
326 return 1; 330 return blk;
327
328err_out:
329 ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
330 block_group, bitmap_blk);
331 return 0; 331 return 0;
332} 332}
333 333
@@ -336,14 +336,26 @@ void ext4_validate_block_bitmap(struct super_block *sb,
336 unsigned int block_group, 336 unsigned int block_group,
337 struct buffer_head *bh) 337 struct buffer_head *bh)
338{ 338{
339 ext4_fsblk_t blk;
340
339 if (buffer_verified(bh)) 341 if (buffer_verified(bh))
340 return; 342 return;
341 343
342 ext4_lock_group(sb, block_group); 344 ext4_lock_group(sb, block_group);
343 if (ext4_valid_block_bitmap(sb, desc, block_group, bh) && 345 blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
344 ext4_block_bitmap_csum_verify(sb, block_group, desc, bh, 346 if (unlikely(blk != 0)) {
345 EXT4_BLOCKS_PER_GROUP(sb) / 8)) 347 ext4_unlock_group(sb, block_group);
346 set_buffer_verified(bh); 348 ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
349 block_group, blk);
350 return;
351 }
352 if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
353 desc, bh, EXT4_BLOCKS_PER_GROUP(sb) / 8))) {
354 ext4_unlock_group(sb, block_group);
355 ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
356 return;
357 }
358 set_buffer_verified(bh);
347 ext4_unlock_group(sb, block_group); 359 ext4_unlock_group(sb, block_group);
348} 360}
349 361
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index f8716eab9995..5c2d1813ebe9 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -79,7 +79,6 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
79 if (provided == calculated) 79 if (provided == calculated)
80 return 1; 80 return 1;
81 81
82 ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
83 return 0; 82 return 0;
84} 83}
85 84
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cd0c7ed06772..aabbb3f53683 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2662,6 +2662,7 @@ cont:
2662 } 2662 }
2663 path[0].p_depth = depth; 2663 path[0].p_depth = depth;
2664 path[0].p_hdr = ext_inode_hdr(inode); 2664 path[0].p_hdr = ext_inode_hdr(inode);
2665 i = 0;
2665 2666
2666 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2667 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2667 err = -EIO; 2668 err = -EIO;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89b59cb7f9b8..dff171c3a123 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode)
233 if (is_bad_inode(inode)) 233 if (is_bad_inode(inode))
234 goto no_delete; 234 goto no_delete;
235 235
236 /*
237 * Protect us against freezing - iput() caller didn't have to have any
238 * protection against it
239 */
240 sb_start_intwrite(inode->i_sb);
236 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); 241 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
237 if (IS_ERR(handle)) { 242 if (IS_ERR(handle)) {
238 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 243 ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode)
242 * cleaned up. 247 * cleaned up.
243 */ 248 */
244 ext4_orphan_del(NULL, inode); 249 ext4_orphan_del(NULL, inode);
250 sb_end_intwrite(inode->i_sb);
245 goto no_delete; 251 goto no_delete;
246 } 252 }
247 253
@@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode)
273 stop_handle: 279 stop_handle:
274 ext4_journal_stop(handle); 280 ext4_journal_stop(handle);
275 ext4_orphan_del(NULL, inode); 281 ext4_orphan_del(NULL, inode);
282 sb_end_intwrite(inode->i_sb);
276 goto no_delete; 283 goto no_delete;
277 } 284 }
278 } 285 }
@@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode)
301 else 308 else
302 ext4_free_inode(handle, inode); 309 ext4_free_inode(handle, inode);
303 ext4_journal_stop(handle); 310 ext4_journal_stop(handle);
311 sb_end_intwrite(inode->i_sb);
304 return; 312 return;
305no_delete: 313no_delete:
306 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 314 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -1962,7 +1970,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
1962 * This function can get called via... 1970 * This function can get called via...
1963 * - ext4_da_writepages after taking page lock (have journal handle) 1971 * - ext4_da_writepages after taking page lock (have journal handle)
1964 * - journal_submit_inode_data_buffers (no journal handle) 1972 * - journal_submit_inode_data_buffers (no journal handle)
1965 * - shrink_page_list via pdflush (no journal handle) 1973 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
1966 * - grab_page_cache when doing write_begin (have journal handle) 1974 * - grab_page_cache when doing write_begin (have journal handle)
1967 * 1975 *
1968 * We don't do any block allocation in this function. If we have page with 1976 * We don't do any block allocation in this function. If we have page with
@@ -4581,14 +4589,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
4581 * inode out, but prune_icache isn't a user-visible syncing function. 4589 * inode out, but prune_icache isn't a user-visible syncing function.
4582 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 4590 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
4583 * we start and wait on commits. 4591 * we start and wait on commits.
4584 *
4585 * Is this efficient/effective? Well, we're being nice to the system
4586 * by cleaning up our inodes proactively so they can be reaped
4587 * without I/O. But we are potentially leaving up to five seconds'
4588 * worth of inodes floating about which prune_icache wants us to
4589 * write out. One way to fix that would be to get prune_icache()
4590 * to do a write_super() to free up some memory. It has the desired
4591 * effect.
4592 */ 4592 */
4593int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 4593int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4594{ 4594{
@@ -4779,11 +4779,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4779 get_block_t *get_block; 4779 get_block_t *get_block;
4780 int retries = 0; 4780 int retries = 0;
4781 4781
4782 /* 4782 sb_start_pagefault(inode->i_sb);
4783 * This check is racy but catches the common case. We rely on
4784 * __block_page_mkwrite() to do a reliable check.
4785 */
4786 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
4787 /* Delalloc case is easy... */ 4783 /* Delalloc case is easy... */
4788 if (test_opt(inode->i_sb, DELALLOC) && 4784 if (test_opt(inode->i_sb, DELALLOC) &&
4789 !ext4_should_journal_data(inode) && 4785 !ext4_should_journal_data(inode) &&
@@ -4851,5 +4847,6 @@ retry_alloc:
4851out_ret: 4847out_ret:
4852 ret = block_page_mkwrite_return(ret); 4848 ret = block_page_mkwrite_return(ret);
4853out: 4849out:
4850 sb_end_pagefault(inode->i_sb);
4854 return ret; 4851 return ret;
4855} 4852}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f99a1311e847..fe7c63f4717e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
44{ 44{
45 struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); 45 struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
46 46
47 /*
48 * We protect against freezing so that we don't create dirty buffers
49 * on frozen filesystem.
50 */
51 sb_start_write(sb);
47 ext4_mmp_csum_set(sb, mmp); 52 ext4_mmp_csum_set(sb, mmp);
48 mark_buffer_dirty(bh); 53 mark_buffer_dirty(bh);
49 lock_buffer(bh); 54 lock_buffer(bh);
@@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
51 get_bh(bh); 56 get_bh(bh);
52 submit_bh(WRITE_SYNC, bh); 57 submit_bh(WRITE_SYNC, bh);
53 wait_on_buffer(bh); 58 wait_on_buffer(bh);
59 sb_end_write(sb);
54 if (unlikely(!buffer_uptodate(bh))) 60 if (unlikely(!buffer_uptodate(bh)))
55 return 1; 61 return 1;
56 62
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2d51cd9af225..c6e0cb3d1f4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -326,38 +326,17 @@ static void ext4_put_nojournal(handle_t *handle)
326 326
327/* 327/*
328 * Wrappers for jbd2_journal_start/end. 328 * Wrappers for jbd2_journal_start/end.
329 *
330 * The only special thing we need to do here is to make sure that all
331 * journal_end calls result in the superblock being marked dirty, so
332 * that sync() will call the filesystem's write_super callback if
333 * appropriate.
334 *
335 * To avoid j_barrier hold in userspace when a user calls freeze(),
336 * ext4 prevents a new handle from being started by s_frozen, which
337 * is in an upper layer.
338 */ 329 */
339handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 330handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
340{ 331{
341 journal_t *journal; 332 journal_t *journal;
342 handle_t *handle;
343 333
344 trace_ext4_journal_start(sb, nblocks, _RET_IP_); 334 trace_ext4_journal_start(sb, nblocks, _RET_IP_);
345 if (sb->s_flags & MS_RDONLY) 335 if (sb->s_flags & MS_RDONLY)
346 return ERR_PTR(-EROFS); 336 return ERR_PTR(-EROFS);
347 337
338 WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
348 journal = EXT4_SB(sb)->s_journal; 339 journal = EXT4_SB(sb)->s_journal;
349 handle = ext4_journal_current_handle();
350
351 /*
352 * If a handle has been started, it should be allowed to
353 * finish, otherwise deadlock could happen between freeze
354 * and others(e.g. truncate) due to the restart of the
355 * journal handle if the filesystem is forzen and active
356 * handles are not stopped.
357 */
358 if (!handle)
359 vfs_check_frozen(sb, SB_FREEZE_TRANS);
360
361 if (!journal) 340 if (!journal)
362 return ext4_get_nojournal(); 341 return ext4_get_nojournal();
363 /* 342 /*
@@ -372,12 +351,6 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
372 return jbd2_journal_start(journal, nblocks); 351 return jbd2_journal_start(journal, nblocks);
373} 352}
374 353
375/*
376 * The only special thing we need to do here is to make sure that all
377 * jbd2_journal_stop calls result in the superblock being marked dirty, so
378 * that sync() will call the filesystem's write_super callback if
379 * appropriate.
380 */
381int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) 354int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
382{ 355{
383 struct super_block *sb; 356 struct super_block *sb;
@@ -975,6 +948,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
975 ei->i_reserved_meta_blocks = 0; 948 ei->i_reserved_meta_blocks = 0;
976 ei->i_allocated_meta_blocks = 0; 949 ei->i_allocated_meta_blocks = 0;
977 ei->i_da_metadata_calc_len = 0; 950 ei->i_da_metadata_calc_len = 0;
951 ei->i_da_metadata_calc_last_lblock = 0;
978 spin_lock_init(&(ei->i_block_reservation_lock)); 952 spin_lock_init(&(ei->i_block_reservation_lock));
979#ifdef CONFIG_QUOTA 953#ifdef CONFIG_QUOTA
980 ei->i_reserved_quota = 0; 954 ei->i_reserved_quota = 0;
@@ -2747,6 +2721,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2747 sb = elr->lr_super; 2721 sb = elr->lr_super;
2748 ngroups = EXT4_SB(sb)->s_groups_count; 2722 ngroups = EXT4_SB(sb)->s_groups_count;
2749 2723
2724 sb_start_write(sb);
2750 for (group = elr->lr_next_group; group < ngroups; group++) { 2725 for (group = elr->lr_next_group; group < ngroups; group++) {
2751 gdp = ext4_get_group_desc(sb, group, NULL); 2726 gdp = ext4_get_group_desc(sb, group, NULL);
2752 if (!gdp) { 2727 if (!gdp) {
@@ -2773,6 +2748,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2773 elr->lr_next_sched = jiffies + elr->lr_timeout; 2748 elr->lr_next_sched = jiffies + elr->lr_timeout;
2774 elr->lr_next_group = group + 1; 2749 elr->lr_next_group = group + 1;
2775 } 2750 }
2751 sb_end_write(sb);
2776 2752
2777 return ret; 2753 return ret;
2778} 2754}
@@ -3133,6 +3109,10 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
3133 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 3109 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3134 int s, j, count = 0; 3110 int s, j, count = 0;
3135 3111
3112 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
3113 return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3114 sbi->s_itb_per_group + 2);
3115
3136 first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + 3116 first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3137 (grp * EXT4_BLOCKS_PER_GROUP(sb)); 3117 (grp * EXT4_BLOCKS_PER_GROUP(sb));
3138 last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; 3118 last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
@@ -4444,6 +4424,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
4444 ext4_commit_super(sb, 1); 4424 ext4_commit_super(sb, 1);
4445 4425
4446 jbd2_journal_clear_err(journal); 4426 jbd2_journal_clear_err(journal);
4427 jbd2_journal_update_sb_errno(journal);
4447 } 4428 }
4448} 4429}
4449 4430
@@ -4460,10 +4441,8 @@ int ext4_force_commit(struct super_block *sb)
4460 return 0; 4441 return 0;
4461 4442
4462 journal = EXT4_SB(sb)->s_journal; 4443 journal = EXT4_SB(sb)->s_journal;
4463 if (journal) { 4444 if (journal)
4464 vfs_check_frozen(sb, SB_FREEZE_TRANS);
4465 ret = ext4_journal_force_commit(journal); 4445 ret = ext4_journal_force_commit(journal);
4466 }
4467 4446
4468 return ret; 4447 return ret;
4469} 4448}
@@ -4493,9 +4472,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4493 * gives us a chance to flush the journal completely and mark the fs clean. 4472 * gives us a chance to flush the journal completely and mark the fs clean.
4494 * 4473 *
4495 * Note that only this function cannot bring a filesystem to be in a clean 4474 * Note that only this function cannot bring a filesystem to be in a clean
4496 * state independently, because ext4 prevents a new handle from being started 4475 * state independently. It relies on upper layer to stop all data & metadata
4497 * by @sb->s_frozen, which stays in an upper layer. It thus needs help from 4476 * modifications.
4498 * the upper layer.
4499 */ 4477 */
4500static int ext4_freeze(struct super_block *sb) 4478static int ext4_freeze(struct super_block *sb)
4501{ 4479{
@@ -4522,7 +4500,7 @@ static int ext4_freeze(struct super_block *sb)
4522 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4500 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4523 error = ext4_commit_super(sb, 1); 4501 error = ext4_commit_super(sb, 1);
4524out: 4502out:
4525 /* we rely on s_frozen to stop further updates */ 4503 /* we rely on upper layer to stop further updates */
4526 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4504 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4527 return error; 4505 return error;
4528} 4506}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a71fe3715ee8..e007b8bd8e5e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
43 if (err) 43 if (err)
44 goto out; 44 goto out;
45 45
46 mutex_lock(&inode->i_mutex);
47 err = mnt_want_write_file(file); 46 err = mnt_want_write_file(file);
48 if (err) 47 if (err)
49 goto out_unlock_inode; 48 goto out;
49 mutex_lock(&inode->i_mutex);
50 50
51 /* 51 /*
52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also 52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
73 /* The root directory has no attributes */ 73 /* The root directory has no attributes */
74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
75 err = -EINVAL; 75 err = -EINVAL;
76 goto out_drop_write; 76 goto out_unlock_inode;
77 } 77 }
78 78
79 if (sbi->options.sys_immutable && 79 if (sbi->options.sys_immutable &&
80 ((attr | oldattr) & ATTR_SYS) && 80 ((attr | oldattr) & ATTR_SYS) &&
81 !capable(CAP_LINUX_IMMUTABLE)) { 81 !capable(CAP_LINUX_IMMUTABLE)) {
82 err = -EPERM; 82 err = -EPERM;
83 goto out_drop_write; 83 goto out_unlock_inode;
84 } 84 }
85 85
86 /* 86 /*
@@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
90 */ 90 */
91 err = security_inode_setattr(file->f_path.dentry, &ia); 91 err = security_inode_setattr(file->f_path.dentry, &ia);
92 if (err) 92 if (err)
93 goto out_drop_write; 93 goto out_unlock_inode;
94 94
95 /* This MUST be done before doing anything irreversible... */ 95 /* This MUST be done before doing anything irreversible... */
96 err = fat_setattr(file->f_path.dentry, &ia); 96 err = fat_setattr(file->f_path.dentry, &ia);
97 if (err) 97 if (err)
98 goto out_drop_write; 98 goto out_unlock_inode;
99 99
100 fsnotify_change(file->f_path.dentry, ia.ia_valid); 100 fsnotify_change(file->f_path.dentry, ia.ia_valid);
101 if (sbi->options.sys_immutable) { 101 if (sbi->options.sys_immutable) {
@@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
107 107
108 fat_save_attrs(inode, attr); 108 fat_save_attrs(inode, attr);
109 mark_inode_dirty(inode); 109 mark_inode_dirty(inode);
110out_drop_write:
111 mnt_drop_write_file(file);
112out_unlock_inode: 110out_unlock_inode:
113 mutex_unlock(&inode->i_mutex); 111 mutex_unlock(&inode->i_mutex);
112 mnt_drop_write_file(file);
114out: 113out:
115 return err; 114 return err;
116} 115}
diff --git a/fs/file_table.c b/fs/file_table.c
index b3fc4d67a26b..701985e4ccda 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly;
43 43
44static struct percpu_counter nr_files __cacheline_aligned_in_smp; 44static struct percpu_counter nr_files __cacheline_aligned_in_smp;
45 45
46static inline void file_free_rcu(struct rcu_head *head) 46static void file_free_rcu(struct rcu_head *head)
47{ 47{
48 struct file *f = container_of(head, struct file, f_u.fu_rcuhead); 48 struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
49 49
@@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file)
217 return; 217 return;
218 if (file_check_writeable(file) != 0) 218 if (file_check_writeable(file) != 0)
219 return; 219 return;
220 mnt_drop_write(mnt); 220 __mnt_drop_write(mnt);
221 file_release_write(file); 221 file_release_write(file);
222} 222}
223 223
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 03ff5b1eba93..75a20c092dd4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -117,7 +117,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
117 const char __user *buf, 117 const char __user *buf,
118 size_t count, loff_t *ppos) 118 size_t count, loff_t *ppos)
119{ 119{
120 unsigned val; 120 unsigned uninitialized_var(val);
121 ssize_t ret; 121 ssize_t ret;
122 122
123 ret = fuse_conn_limit_write(file, buf, count, ppos, &val, 123 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -154,7 +154,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
154 const char __user *buf, 154 const char __user *buf,
155 size_t count, loff_t *ppos) 155 size_t count, loff_t *ppos)
156{ 156{
157 unsigned val; 157 unsigned uninitialized_var(val);
158 ssize_t ret; 158 ssize_t ret;
159 159
160 ret = fuse_conn_limit_write(file, buf, count, ppos, &val, 160 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3426521f3205..ee8d55042298 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -396,7 +396,7 @@ err_device:
396err_region: 396err_region:
397 unregister_chrdev_region(devt, 1); 397 unregister_chrdev_region(devt, 1);
398err: 398err:
399 fc->conn_error = 1; 399 fuse_conn_kill(fc);
400 goto out; 400 goto out;
401} 401}
402 402
@@ -532,8 +532,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
532 cdev_del(cc->cdev); 532 cdev_del(cc->cdev);
533 } 533 }
534 534
535 /* kill connection and shutdown channel */
536 fuse_conn_kill(&cc->fc);
537 rc = fuse_dev_release(inode, file); /* puts the base reference */ 535 rc = fuse_dev_release(inode, file); /* puts the base reference */
538 536
539 return rc; 537 return rc;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7df2b5e8fbe1..f4246cfc8d87 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1576,6 +1576,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1576 req->pages[req->num_pages] = page; 1576 req->pages[req->num_pages] = page;
1577 req->num_pages++; 1577 req->num_pages++;
1578 1578
1579 offset = 0;
1579 num -= this_num; 1580 num -= this_num;
1580 total_len += this_num; 1581 total_len += this_num;
1581 index++; 1582 index++;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8964cf3999b2..324bc0850534 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -383,6 +383,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
383 struct fuse_entry_out outentry; 383 struct fuse_entry_out outentry;
384 struct fuse_file *ff; 384 struct fuse_file *ff;
385 385
386 /* Userspace expects S_IFREG in create mode */
387 BUG_ON((mode & S_IFMT) != S_IFREG);
388
386 forget = fuse_alloc_forget(); 389 forget = fuse_alloc_forget();
387 err = -ENOMEM; 390 err = -ENOMEM;
388 if (!forget) 391 if (!forget)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b321a688cde7..aba15f1b7ad2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -703,13 +703,16 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
703 unsigned long nr_segs, loff_t pos) 703 unsigned long nr_segs, loff_t pos)
704{ 704{
705 struct inode *inode = iocb->ki_filp->f_mapping->host; 705 struct inode *inode = iocb->ki_filp->f_mapping->host;
706 struct fuse_conn *fc = get_fuse_conn(inode);
706 707
707 if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) { 708 /*
709 * In auto invalidate mode, always update attributes on read.
710 * Otherwise, only update if we attempt to read past EOF (to ensure
711 * i_size is up to date).
712 */
713 if (fc->auto_inval_data ||
714 (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
708 int err; 715 int err;
709 /*
710 * If trying to read past EOF, make sure the i_size
711 * attribute is up-to-date.
712 */
713 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); 716 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
714 if (err) 717 if (err)
715 return err; 718 return err;
@@ -944,9 +947,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
944 return err; 947 return err;
945 948
946 count = ocount; 949 count = ocount;
947 950 sb_start_write(inode->i_sb);
948 mutex_lock(&inode->i_mutex); 951 mutex_lock(&inode->i_mutex);
949 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
950 952
951 /* We can write back this queue in page reclaim */ 953 /* We can write back this queue in page reclaim */
952 current->backing_dev_info = mapping->backing_dev_info; 954 current->backing_dev_info = mapping->backing_dev_info;
@@ -1004,6 +1006,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1004out: 1006out:
1005 current->backing_dev_info = NULL; 1007 current->backing_dev_info = NULL;
1006 mutex_unlock(&inode->i_mutex); 1008 mutex_unlock(&inode->i_mutex);
1009 sb_end_write(inode->i_sb);
1007 1010
1008 return written ? written : err; 1011 return written ? written : err;
1009} 1012}
@@ -1700,7 +1703,7 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1700 size_t n; 1703 size_t n;
1701 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; 1704 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1702 1705
1703 for (n = 0; n < count; n++) { 1706 for (n = 0; n < count; n++, iov++) {
1704 if (iov->iov_len > (size_t) max) 1707 if (iov->iov_len > (size_t) max)
1705 return -ENOMEM; 1708 return -ENOMEM;
1706 max -= iov->iov_len; 1709 max -= iov->iov_len;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 771fb6322c07..e24dd74e3068 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -484,6 +484,9 @@ struct fuse_conn {
484 /** Is fallocate not implemented by fs? */ 484 /** Is fallocate not implemented by fs? */
485 unsigned no_fallocate:1; 485 unsigned no_fallocate:1;
486 486
487 /** Use enhanced/automatic page cache invalidation. */
488 unsigned auto_inval_data:1;
489
487 /** The number of requests waiting for completion */ 490 /** The number of requests waiting for completion */
488 atomic_t num_waiting; 491 atomic_t num_waiting;
489 492
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1cd61652018c..fca222dabe3c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -197,6 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
197 struct fuse_conn *fc = get_fuse_conn(inode); 197 struct fuse_conn *fc = get_fuse_conn(inode);
198 struct fuse_inode *fi = get_fuse_inode(inode); 198 struct fuse_inode *fi = get_fuse_inode(inode);
199 loff_t oldsize; 199 loff_t oldsize;
200 struct timespec old_mtime;
200 201
201 spin_lock(&fc->lock); 202 spin_lock(&fc->lock);
202 if (attr_version != 0 && fi->attr_version > attr_version) { 203 if (attr_version != 0 && fi->attr_version > attr_version) {
@@ -204,15 +205,35 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
204 return; 205 return;
205 } 206 }
206 207
208 old_mtime = inode->i_mtime;
207 fuse_change_attributes_common(inode, attr, attr_valid); 209 fuse_change_attributes_common(inode, attr, attr_valid);
208 210
209 oldsize = inode->i_size; 211 oldsize = inode->i_size;
210 i_size_write(inode, attr->size); 212 i_size_write(inode, attr->size);
211 spin_unlock(&fc->lock); 213 spin_unlock(&fc->lock);
212 214
213 if (S_ISREG(inode->i_mode) && oldsize != attr->size) { 215 if (S_ISREG(inode->i_mode)) {
214 truncate_pagecache(inode, oldsize, attr->size); 216 bool inval = false;
215 invalidate_inode_pages2(inode->i_mapping); 217
218 if (oldsize != attr->size) {
219 truncate_pagecache(inode, oldsize, attr->size);
220 inval = true;
221 } else if (fc->auto_inval_data) {
222 struct timespec new_mtime = {
223 .tv_sec = attr->mtime,
224 .tv_nsec = attr->mtimensec,
225 };
226
227 /*
228 * Auto inval mode also checks and invalidates if mtime
229 * has changed.
230 */
231 if (!timespec_equal(&old_mtime, &new_mtime))
232 inval = true;
233 }
234
235 if (inval)
236 invalidate_inode_pages2(inode->i_mapping);
216 } 237 }
217} 238}
218 239
@@ -346,11 +367,6 @@ void fuse_conn_kill(struct fuse_conn *fc)
346 wake_up_all(&fc->waitq); 367 wake_up_all(&fc->waitq);
347 wake_up_all(&fc->blocked_waitq); 368 wake_up_all(&fc->blocked_waitq);
348 wake_up_all(&fc->reserved_req_waitq); 369 wake_up_all(&fc->reserved_req_waitq);
349 mutex_lock(&fuse_mutex);
350 list_del(&fc->entry);
351 fuse_ctl_remove_conn(fc);
352 mutex_unlock(&fuse_mutex);
353 fuse_bdi_destroy(fc);
354} 370}
355EXPORT_SYMBOL_GPL(fuse_conn_kill); 371EXPORT_SYMBOL_GPL(fuse_conn_kill);
356 372
@@ -359,7 +375,14 @@ static void fuse_put_super(struct super_block *sb)
359 struct fuse_conn *fc = get_fuse_conn_super(sb); 375 struct fuse_conn *fc = get_fuse_conn_super(sb);
360 376
361 fuse_send_destroy(fc); 377 fuse_send_destroy(fc);
378
362 fuse_conn_kill(fc); 379 fuse_conn_kill(fc);
380 mutex_lock(&fuse_mutex);
381 list_del(&fc->entry);
382 fuse_ctl_remove_conn(fc);
383 mutex_unlock(&fuse_mutex);
384 fuse_bdi_destroy(fc);
385
363 fuse_conn_put(fc); 386 fuse_conn_put(fc);
364} 387}
365 388
@@ -834,6 +857,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
834 fc->big_writes = 1; 857 fc->big_writes = 1;
835 if (arg->flags & FUSE_DONT_MASK) 858 if (arg->flags & FUSE_DONT_MASK)
836 fc->dont_mask = 1; 859 fc->dont_mask = 1;
860 if (arg->flags & FUSE_AUTO_INVAL_DATA)
861 fc->auto_inval_data = 1;
837 } else { 862 } else {
838 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 863 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
839 fc->no_lock = 1; 864 fc->no_lock = 1;
@@ -859,7 +884,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
859 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 884 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
860 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 885 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
861 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | 886 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
862 FUSE_FLOCK_LOCKS; 887 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
888 FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
863 req->in.h.opcode = FUSE_INIT; 889 req->in.h.opcode = FUSE_INIT;
864 req->in.numargs = 1; 890 req->in.numargs = 1;
865 req->in.args[0].size = sizeof(*arg); 891 req->in.args[0].size = sizeof(*arg);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 9aa6af13823c..d1d791ef38de 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -373,11 +373,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
373 loff_t size; 373 loff_t size;
374 int ret; 374 int ret;
375 375
376 /* Wait if fs is frozen. This is racy so we check again later on 376 sb_start_pagefault(inode->i_sb);
377 * and retry if the fs has been frozen after the page lock has 377
378 * been acquired 378 /* Update file times before taking page lock */
379 */ 379 file_update_time(vma->vm_file);
380 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
381 380
382 ret = gfs2_rs_alloc(ip); 381 ret = gfs2_rs_alloc(ip);
383 if (ret) 382 if (ret)
@@ -462,14 +461,9 @@ out:
462 gfs2_holder_uninit(&gh); 461 gfs2_holder_uninit(&gh);
463 if (ret == 0) { 462 if (ret == 0) {
464 set_page_dirty(page); 463 set_page_dirty(page);
465 /* This check must be post dropping of transaction lock */ 464 wait_on_page_writeback(page);
466 if (inode->i_sb->s_frozen == SB_UNFROZEN) {
467 wait_on_page_writeback(page);
468 } else {
469 ret = -EAGAIN;
470 unlock_page(page);
471 }
472 } 465 }
466 sb_end_pagefault(inode->i_sb);
473 return block_page_mkwrite_return(ret); 467 return block_page_mkwrite_return(ret);
474} 468}
475 469
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3a56c8d94de0..22255d96b27e 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -52,7 +52,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
52 /* 52 /*
53 * If it's a fully non-blocking write attempt and we cannot 53 * If it's a fully non-blocking write attempt and we cannot
54 * lock the buffer then redirty the page. Note that this can 54 * lock the buffer then redirty the page. Note that this can
55 * potentially cause a busy-wait loop from pdflush and kswapd 55 * potentially cause a busy-wait loop from flusher thread and kswapd
56 * activity, but those code paths have their own higher-level 56 * activity, but those code paths have their own higher-level
57 * throttling. 57 * throttling.
58 */ 58 */
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ad3e2fb763d7..adbd27875ef9 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
50 if (revokes) 50 if (revokes)
51 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 51 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
52 sizeof(u64)); 52 sizeof(u64));
53 sb_start_intwrite(sdp->sd_vfs);
53 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 54 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
54 55
55 error = gfs2_glock_nq(&tr->tr_t_gh); 56 error = gfs2_glock_nq(&tr->tr_t_gh);
@@ -68,6 +69,7 @@ fail_gunlock:
68 gfs2_glock_dq(&tr->tr_t_gh); 69 gfs2_glock_dq(&tr->tr_t_gh);
69 70
70fail_holder_uninit: 71fail_holder_uninit:
72 sb_end_intwrite(sdp->sd_vfs);
71 gfs2_holder_uninit(&tr->tr_t_gh); 73 gfs2_holder_uninit(&tr->tr_t_gh);
72 kfree(tr); 74 kfree(tr);
73 75
@@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
116 gfs2_holder_uninit(&tr->tr_t_gh); 118 gfs2_holder_uninit(&tr->tr_t_gh);
117 kfree(tr); 119 kfree(tr);
118 } 120 }
121 sb_end_intwrite(sdp->sd_vfs);
119 return; 122 return;
120 } 123 }
121 124
@@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
136 139
137 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) 140 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
138 gfs2_log_flush(sdp, NULL); 141 gfs2_log_flush(sdp, NULL);
142 sb_end_intwrite(sdp->sd_vfs);
139} 143}
140 144
141/** 145/**
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 5fd51a5833ff..b7ec224910c5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -236,10 +236,10 @@ out:
236 * hfs_mdb_commit() 236 * hfs_mdb_commit()
237 * 237 *
238 * Description: 238 * Description:
239 * This updates the MDB on disk (look also at hfs_write_super()). 239 * This updates the MDB on disk.
240 * It does not check, if the superblock has been modified, or 240 * It does not check, if the superblock has been modified, or
241 * if the filesystem has been mounted read-only. It is mainly 241 * if the filesystem has been mounted read-only. It is mainly
242 * called by hfs_write_super() and hfs_btree_extend(). 242 * called by hfs_sync_fs() and flush_mdb().
243 * Input Variable(s): 243 * Input Variable(s):
244 * struct hfs_mdb *mdb: Pointer to the hfs MDB 244 * struct hfs_mdb *mdb: Pointer to the hfs MDB
245 * int backup; 245 * int backup;
diff --git a/fs/inode.c b/fs/inode.c
index 3cc504320467..ac8d904b3f16 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1542,9 +1542,11 @@ void touch_atime(struct path *path)
1542 if (timespec_equal(&inode->i_atime, &now)) 1542 if (timespec_equal(&inode->i_atime, &now))
1543 return; 1543 return;
1544 1544
1545 if (mnt_want_write(mnt)) 1545 if (!sb_start_write_trylock(inode->i_sb))
1546 return; 1546 return;
1547 1547
1548 if (__mnt_want_write(mnt))
1549 goto skip_update;
1548 /* 1550 /*
1549 * File systems can error out when updating inodes if they need to 1551 * File systems can error out when updating inodes if they need to
1550 * allocate new space to modify an inode (such is the case for 1552 * allocate new space to modify an inode (such is the case for
@@ -1555,7 +1557,9 @@ void touch_atime(struct path *path)
1555 * of the fs read only, e.g. subvolumes in Btrfs. 1557 * of the fs read only, e.g. subvolumes in Btrfs.
1556 */ 1558 */
1557 update_time(inode, &now, S_ATIME); 1559 update_time(inode, &now, S_ATIME);
1558 mnt_drop_write(mnt); 1560 __mnt_drop_write(mnt);
1561skip_update:
1562 sb_end_write(inode->i_sb);
1559} 1563}
1560EXPORT_SYMBOL(touch_atime); 1564EXPORT_SYMBOL(touch_atime);
1561 1565
@@ -1662,11 +1666,11 @@ int file_update_time(struct file *file)
1662 return 0; 1666 return 0;
1663 1667
1664 /* Finally allowed to write? Takes lock. */ 1668 /* Finally allowed to write? Takes lock. */
1665 if (mnt_want_write_file(file)) 1669 if (__mnt_want_write_file(file))
1666 return 0; 1670 return 0;
1667 1671
1668 ret = update_time(inode, &now, sync_it); 1672 ret = update_time(inode, &now, sync_it);
1669 mnt_drop_write_file(file); 1673 __mnt_drop_write_file(file);
1670 1674
1671 return ret; 1675 return ret;
1672} 1676}
diff --git a/fs/internal.h b/fs/internal.h
index a6fd56c68b11..371bcc4b1697 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,6 +61,10 @@ extern void __init mnt_init(void);
61 61
62extern struct lglock vfsmount_lock; 62extern struct lglock vfsmount_lock;
63 63
64extern int __mnt_want_write(struct vfsmount *);
65extern int __mnt_want_write_file(struct file *);
66extern void __mnt_drop_write(struct vfsmount *);
67extern void __mnt_drop_write_file(struct file *);
64 68
65/* 69/*
66 * fs_struct.c 70 * fs_struct.c
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 425c2f2cf170..a2862339323b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -534,8 +534,8 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
534 ret = 1; 534 ret = 1;
535 } else if (journal->j_committing_transaction) { 535 } else if (journal->j_committing_transaction) {
536 /* 536 /*
537 * If ext3_write_super() recently started a commit, then we 537 * If commit has been started, then we have to wait for
538 * have to wait for completion of that transaction 538 * completion of that transaction.
539 */ 539 */
540 if (ptid) 540 if (ptid)
541 *ptid = journal->j_committing_transaction->t_tid; 541 *ptid = journal->j_committing_transaction->t_tid;
@@ -1113,6 +1113,11 @@ static void mark_journal_empty(journal_t *journal)
1113 1113
1114 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1114 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1115 spin_lock(&journal->j_state_lock); 1115 spin_lock(&journal->j_state_lock);
1116 /* Is it already empty? */
1117 if (sb->s_start == 0) {
1118 spin_unlock(&journal->j_state_lock);
1119 return;
1120 }
1116 jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", 1121 jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
1117 journal->j_tail_sequence); 1122 journal->j_tail_sequence);
1118 1123
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e9a3c4c85594..e149b99a7ffb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -612,8 +612,8 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
612 ret = 1; 612 ret = 1;
613 } else if (journal->j_committing_transaction) { 613 } else if (journal->j_committing_transaction) {
614 /* 614 /*
615 * If ext3_write_super() recently started a commit, then we 615 * If commit has been started, then we have to wait for
616 * have to wait for completion of that transaction 616 * completion of that transaction.
617 */ 617 */
618 if (ptid) 618 if (ptid)
619 *ptid = journal->j_committing_transaction->t_tid; 619 *ptid = journal->j_committing_transaction->t_tid;
@@ -1377,7 +1377,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
1377 * Update a journal's errno. Write updated superblock to disk waiting for IO 1377 * Update a journal's errno. Write updated superblock to disk waiting for IO
1378 * to complete. 1378 * to complete.
1379 */ 1379 */
1380static void jbd2_journal_update_sb_errno(journal_t *journal) 1380void jbd2_journal_update_sb_errno(journal_t *journal)
1381{ 1381{
1382 journal_superblock_t *sb = journal->j_superblock; 1382 journal_superblock_t *sb = journal->j_superblock;
1383 1383
@@ -1390,6 +1390,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
1390 1390
1391 jbd2_write_superblock(journal, WRITE_SYNC); 1391 jbd2_write_superblock(journal, WRITE_SYNC);
1392} 1392}
1393EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
1393 1394
1394/* 1395/*
1395 * Read the superblock for a given journal, performing initial 1396 * Read the superblock for a given journal, performing initial
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 8392cb85bd54..05d29124c6ab 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -156,12 +156,16 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
156 struct nlm_rqst *call; 156 struct nlm_rqst *call;
157 int status; 157 int status;
158 158
159 nlm_get_host(host);
160 call = nlm_alloc_call(host); 159 call = nlm_alloc_call(host);
161 if (call == NULL) 160 if (call == NULL)
162 return -ENOMEM; 161 return -ENOMEM;
163 162
164 nlmclnt_locks_init_private(fl, host); 163 nlmclnt_locks_init_private(fl, host);
164 if (!fl->fl_u.nfs_fl.owner) {
165 /* lockowner allocation has failed */
166 nlmclnt_release_call(call);
167 return -ENOMEM;
168 }
165 /* Set up the argument struct */ 169 /* Set up the argument struct */
166 nlmclnt_setlockargs(call, fl); 170 nlmclnt_setlockargs(call, fl);
167 171
@@ -185,9 +189,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc);
185 189
186/* 190/*
187 * Allocate an NLM RPC call struct 191 * Allocate an NLM RPC call struct
188 *
189 * Note: the caller must hold a reference to host. In case of failure,
190 * this reference will be released.
191 */ 192 */
192struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) 193struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
193{ 194{
@@ -199,7 +200,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
199 atomic_set(&call->a_count, 1); 200 atomic_set(&call->a_count, 1);
200 locks_init_lock(&call->a_args.lock.fl); 201 locks_init_lock(&call->a_args.lock.fl);
201 locks_init_lock(&call->a_res.lock.fl); 202 locks_init_lock(&call->a_res.lock.fl);
202 call->a_host = host; 203 call->a_host = nlm_get_host(host);
203 return call; 204 return call;
204 } 205 }
205 if (signalled()) 206 if (signalled())
@@ -207,7 +208,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
207 printk("nlm_alloc_call: failed, waiting for memory\n"); 208 printk("nlm_alloc_call: failed, waiting for memory\n");
208 schedule_timeout_interruptible(5*HZ); 209 schedule_timeout_interruptible(5*HZ);
209 } 210 }
210 nlmclnt_release_host(host);
211 return NULL; 211 return NULL;
212} 212}
213 213
@@ -750,7 +750,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
750 dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" 750 dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"
751 " Attempting to cancel lock.\n"); 751 " Attempting to cancel lock.\n");
752 752
753 req = nlm_alloc_call(nlm_get_host(host)); 753 req = nlm_alloc_call(host);
754 if (!req) 754 if (!req)
755 return -ENOMEM; 755 return -ENOMEM;
756 req->a_flags = RPC_TASK_ASYNC; 756 req->a_flags = RPC_TASK_ASYNC;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a43d253c045..b147d1ae71fd 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -257,6 +257,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
257 return rpc_system_err; 257 return rpc_system_err;
258 258
259 call = nlm_alloc_call(host); 259 call = nlm_alloc_call(host);
260 nlmsvc_release_host(host);
260 if (call == NULL) 261 if (call == NULL)
261 return rpc_system_err; 262 return rpc_system_err;
262 263
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index afe4488c33d8..fb1a2bedbe97 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
219 struct nlm_block *block; 219 struct nlm_block *block;
220 struct nlm_rqst *call = NULL; 220 struct nlm_rqst *call = NULL;
221 221
222 nlm_get_host(host);
223 call = nlm_alloc_call(host); 222 call = nlm_alloc_call(host);
224 if (call == NULL) 223 if (call == NULL)
225 return NULL; 224 return NULL;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index de8f2caa2235..3009a365e082 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -297,6 +297,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
297 return rpc_system_err; 297 return rpc_system_err;
298 298
299 call = nlm_alloc_call(host); 299 call = nlm_alloc_call(host);
300 nlmsvc_release_host(host);
300 if (call == NULL) 301 if (call == NULL)
301 return rpc_system_err; 302 return rpc_system_err;
302 303
diff --git a/fs/locks.c b/fs/locks.c
index cdcf219a7391..7e81bfc75164 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -200,11 +200,7 @@ void locks_release_private(struct file_lock *fl)
200 fl->fl_ops->fl_release_private(fl); 200 fl->fl_ops->fl_release_private(fl);
201 fl->fl_ops = NULL; 201 fl->fl_ops = NULL;
202 } 202 }
203 if (fl->fl_lmops) { 203 fl->fl_lmops = NULL;
204 if (fl->fl_lmops->lm_release_private)
205 fl->fl_lmops->lm_release_private(fl);
206 fl->fl_lmops = NULL;
207 }
208 204
209} 205}
210EXPORT_SYMBOL_GPL(locks_release_private); 206EXPORT_SYMBOL_GPL(locks_release_private);
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index df0de27c2733..e784a217b500 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -26,6 +26,7 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
26 struct completion complete; 26 struct completion complete;
27 27
28 bio_init(&bio); 28 bio_init(&bio);
29 bio.bi_max_vecs = 1;
29 bio.bi_io_vec = &bio_vec; 30 bio.bi_io_vec = &bio_vec;
30 bio_vec.bv_page = page; 31 bio_vec.bv_page = page;
31 bio_vec.bv_len = PAGE_SIZE; 32 bio_vec.bv_len = PAGE_SIZE;
@@ -95,12 +96,11 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
95 struct address_space *mapping = super->s_mapping_inode->i_mapping; 96 struct address_space *mapping = super->s_mapping_inode->i_mapping;
96 struct bio *bio; 97 struct bio *bio;
97 struct page *page; 98 struct page *page;
98 struct request_queue *q = bdev_get_queue(sb->s_bdev); 99 unsigned int max_pages;
99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
100 int i; 100 int i;
101 101
102 if (max_pages > BIO_MAX_PAGES) 102 max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
103 max_pages = BIO_MAX_PAGES; 103
104 bio = bio_alloc(GFP_NOFS, max_pages); 104 bio = bio_alloc(GFP_NOFS, max_pages);
105 BUG_ON(!bio); 105 BUG_ON(!bio);
106 106
@@ -190,12 +190,11 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
190{ 190{
191 struct logfs_super *super = logfs_super(sb); 191 struct logfs_super *super = logfs_super(sb);
192 struct bio *bio; 192 struct bio *bio;
193 struct request_queue *q = bdev_get_queue(sb->s_bdev); 193 unsigned int max_pages;
194 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
195 int i; 194 int i;
196 195
197 if (max_pages > BIO_MAX_PAGES) 196 max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
198 max_pages = BIO_MAX_PAGES; 197
199 bio = bio_alloc(GFP_NOFS, max_pages); 198 bio = bio_alloc(GFP_NOFS, max_pages);
200 BUG_ON(!bio); 199 BUG_ON(!bio);
201 200
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index a422f42238b2..6984562738d3 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -156,10 +156,26 @@ static void __logfs_destroy_inode(struct inode *inode)
156 call_rcu(&inode->i_rcu, logfs_i_callback); 156 call_rcu(&inode->i_rcu, logfs_i_callback);
157} 157}
158 158
159static void __logfs_destroy_meta_inode(struct inode *inode)
160{
161 struct logfs_inode *li = logfs_inode(inode);
162 BUG_ON(li->li_block);
163 call_rcu(&inode->i_rcu, logfs_i_callback);
164}
165
159static void logfs_destroy_inode(struct inode *inode) 166static void logfs_destroy_inode(struct inode *inode)
160{ 167{
161 struct logfs_inode *li = logfs_inode(inode); 168 struct logfs_inode *li = logfs_inode(inode);
162 169
170 if (inode->i_ino < LOGFS_RESERVED_INOS) {
171 /*
172 * The reserved inodes are never destroyed unless we are in
173 * unmont path.
174 */
175 __logfs_destroy_meta_inode(inode);
176 return;
177 }
178
163 BUG_ON(list_empty(&li->li_freeing_list)); 179 BUG_ON(list_empty(&li->li_freeing_list));
164 spin_lock(&logfs_inode_lock); 180 spin_lock(&logfs_inode_lock);
165 li->li_refcount--; 181 li->li_refcount--;
@@ -373,8 +389,8 @@ static void logfs_put_super(struct super_block *sb)
373{ 389{
374 struct logfs_super *super = logfs_super(sb); 390 struct logfs_super *super = logfs_super(sb);
375 /* kill the meta-inodes */ 391 /* kill the meta-inodes */
376 iput(super->s_master_inode);
377 iput(super->s_segfile_inode); 392 iput(super->s_segfile_inode);
393 iput(super->s_master_inode);
378 iput(super->s_mapping_inode); 394 iput(super->s_mapping_inode);
379} 395}
380 396
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 1e1c369df22b..2a09b8d73989 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -565,7 +565,7 @@ static void write_wbuf(struct super_block *sb, struct logfs_area *area,
565 index = ofs >> PAGE_SHIFT; 565 index = ofs >> PAGE_SHIFT;
566 page_ofs = ofs & (PAGE_SIZE - 1); 566 page_ofs = ofs & (PAGE_SIZE - 1);
567 567
568 page = find_lock_page(mapping, index); 568 page = find_or_create_page(mapping, index, GFP_NOFS);
569 BUG_ON(!page); 569 BUG_ON(!page);
570 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); 570 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
571 unlock_page(page); 571 unlock_page(page);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index f1cb512c5019..5be0abef603d 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2189,7 +2189,6 @@ void logfs_evict_inode(struct inode *inode)
2189 return; 2189 return;
2190 } 2190 }
2191 2191
2192 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
2193 page = inode_to_page(inode); 2192 page = inode_to_page(inode);
2194 BUG_ON(!page); /* FIXME: Use emergency page */ 2193 BUG_ON(!page); /* FIXME: Use emergency page */
2195 logfs_put_write_page(page); 2194 logfs_put_write_page(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index e28d090c98d6..038da0991794 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -886,7 +886,7 @@ static struct logfs_area *alloc_area(struct super_block *sb)
886 886
887static void map_invalidatepage(struct page *page, unsigned long l) 887static void map_invalidatepage(struct page *page, unsigned long l)
888{ 888{
889 BUG(); 889 return;
890} 890}
891 891
892static int map_releasepage(struct page *page, gfp_t g) 892static int map_releasepage(struct page *page, gfp_t g)
diff --git a/fs/namei.c b/fs/namei.c
index 2ccc35c4dc24..dd1ed1b8e98e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -352,6 +352,7 @@ int __inode_permission(struct inode *inode, int mask)
352/** 352/**
353 * sb_permission - Check superblock-level permissions 353 * sb_permission - Check superblock-level permissions
354 * @sb: Superblock of inode to check permission on 354 * @sb: Superblock of inode to check permission on
355 * @inode: Inode to check permission on
355 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 356 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
356 * 357 *
357 * Separate out file-system wide checks from inode-specific permission checks. 358 * Separate out file-system wide checks from inode-specific permission checks.
@@ -650,6 +651,122 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki
650 path_put(link); 651 path_put(link);
651} 652}
652 653
654int sysctl_protected_symlinks __read_mostly = 1;
655int sysctl_protected_hardlinks __read_mostly = 1;
656
657/**
658 * may_follow_link - Check symlink following for unsafe situations
659 * @link: The path of the symlink
660 * @nd: nameidata pathwalk data
661 *
662 * In the case of the sysctl_protected_symlinks sysctl being enabled,
663 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
664 * in a sticky world-writable directory. This is to protect privileged
665 * processes from failing races against path names that may change out
666 * from under them by way of other users creating malicious symlinks.
667 * It will permit symlinks to be followed only when outside a sticky
668 * world-writable directory, or when the uid of the symlink and follower
669 * match, or when the directory owner matches the symlink's owner.
670 *
671 * Returns 0 if following the symlink is allowed, -ve on error.
672 */
673static inline int may_follow_link(struct path *link, struct nameidata *nd)
674{
675 const struct inode *inode;
676 const struct inode *parent;
677
678 if (!sysctl_protected_symlinks)
679 return 0;
680
681 /* Allowed if owner and follower match. */
682 inode = link->dentry->d_inode;
683 if (current_cred()->fsuid == inode->i_uid)
684 return 0;
685
686 /* Allowed if parent directory not sticky and world-writable. */
687 parent = nd->path.dentry->d_inode;
688 if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
689 return 0;
690
691 /* Allowed if parent directory and link owner match. */
692 if (parent->i_uid == inode->i_uid)
693 return 0;
694
695 path_put_conditional(link, nd);
696 path_put(&nd->path);
697 audit_log_link_denied("follow_link", link);
698 return -EACCES;
699}
700
701/**
702 * safe_hardlink_source - Check for safe hardlink conditions
703 * @inode: the source inode to hardlink from
704 *
705 * Return false if at least one of the following conditions:
706 * - inode is not a regular file
707 * - inode is setuid
708 * - inode is setgid and group-exec
709 * - access failure for read and write
710 *
711 * Otherwise returns true.
712 */
713static bool safe_hardlink_source(struct inode *inode)
714{
715 umode_t mode = inode->i_mode;
716
717 /* Special files should not get pinned to the filesystem. */
718 if (!S_ISREG(mode))
719 return false;
720
721 /* Setuid files should not get pinned to the filesystem. */
722 if (mode & S_ISUID)
723 return false;
724
725 /* Executable setgid files should not get pinned to the filesystem. */
726 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
727 return false;
728
729 /* Hardlinking to unreadable or unwritable sources is dangerous. */
730 if (inode_permission(inode, MAY_READ | MAY_WRITE))
731 return false;
732
733 return true;
734}
735
736/**
737 * may_linkat - Check permissions for creating a hardlink
738 * @link: the source to hardlink from
739 *
740 * Block hardlink when all of:
741 * - sysctl_protected_hardlinks enabled
742 * - fsuid does not match inode
743 * - hardlink source is unsafe (see safe_hardlink_source() above)
744 * - not CAP_FOWNER
745 *
746 * Returns 0 if successful, -ve on error.
747 */
748static int may_linkat(struct path *link)
749{
750 const struct cred *cred;
751 struct inode *inode;
752
753 if (!sysctl_protected_hardlinks)
754 return 0;
755
756 cred = current_cred();
757 inode = link->dentry->d_inode;
758
759 /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
760 * otherwise, it must be a safe source.
761 */
762 if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) ||
763 capable(CAP_FOWNER))
764 return 0;
765
766 audit_log_link_denied("linkat", link);
767 return -EPERM;
768}
769
653static __always_inline int 770static __always_inline int
654follow_link(struct path *link, struct nameidata *nd, void **p) 771follow_link(struct path *link, struct nameidata *nd, void **p)
655{ 772{
@@ -1818,6 +1935,9 @@ static int path_lookupat(int dfd, const char *name,
1818 while (err > 0) { 1935 while (err > 0) {
1819 void *cookie; 1936 void *cookie;
1820 struct path link = path; 1937 struct path link = path;
1938 err = may_follow_link(&link, nd);
1939 if (unlikely(err))
1940 break;
1821 nd->flags |= LOOKUP_PARENT; 1941 nd->flags |= LOOKUP_PARENT;
1822 err = follow_link(&link, nd, &cookie); 1942 err = follow_link(&link, nd, &cookie);
1823 if (err) 1943 if (err)
@@ -2277,7 +2397,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
2277static int atomic_open(struct nameidata *nd, struct dentry *dentry, 2397static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2278 struct path *path, struct file *file, 2398 struct path *path, struct file *file,
2279 const struct open_flags *op, 2399 const struct open_flags *op,
2280 bool *want_write, bool need_lookup, 2400 bool got_write, bool need_lookup,
2281 int *opened) 2401 int *opened)
2282{ 2402{
2283 struct inode *dir = nd->path.dentry->d_inode; 2403 struct inode *dir = nd->path.dentry->d_inode;
@@ -2296,11 +2416,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2296 goto out; 2416 goto out;
2297 } 2417 }
2298 2418
2299 mode = op->mode & S_IALLUGO; 2419 mode = op->mode;
2300 if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) 2420 if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
2301 mode &= ~current_umask(); 2421 mode &= ~current_umask();
2302 2422
2303 if (open_flag & O_EXCL) { 2423 if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
2304 open_flag &= ~O_TRUNC; 2424 open_flag &= ~O_TRUNC;
2305 *opened |= FILE_CREATED; 2425 *opened |= FILE_CREATED;
2306 } 2426 }
@@ -2314,12 +2434,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2314 * Another problem is returing the "right" error value (e.g. for an 2434 * Another problem is returing the "right" error value (e.g. for an
2315 * O_EXCL open we want to return EEXIST not EROFS). 2435 * O_EXCL open we want to return EEXIST not EROFS).
2316 */ 2436 */
2317 if ((open_flag & (O_CREAT | O_TRUNC)) || 2437 if (((open_flag & (O_CREAT | O_TRUNC)) ||
2318 (open_flag & O_ACCMODE) != O_RDONLY) { 2438 (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
2319 error = mnt_want_write(nd->path.mnt); 2439 if (!(open_flag & O_CREAT)) {
2320 if (!error) {
2321 *want_write = true;
2322 } else if (!(open_flag & O_CREAT)) {
2323 /* 2440 /*
2324 * No O_CREATE -> atomicity not a requirement -> fall 2441 * No O_CREATE -> atomicity not a requirement -> fall
2325 * back to lookup + open 2442 * back to lookup + open
@@ -2327,17 +2444,17 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2327 goto no_open; 2444 goto no_open;
2328 } else if (open_flag & (O_EXCL | O_TRUNC)) { 2445 } else if (open_flag & (O_EXCL | O_TRUNC)) {
2329 /* Fall back and fail with the right error */ 2446 /* Fall back and fail with the right error */
2330 create_error = error; 2447 create_error = -EROFS;
2331 goto no_open; 2448 goto no_open;
2332 } else { 2449 } else {
2333 /* No side effects, safe to clear O_CREAT */ 2450 /* No side effects, safe to clear O_CREAT */
2334 create_error = error; 2451 create_error = -EROFS;
2335 open_flag &= ~O_CREAT; 2452 open_flag &= ~O_CREAT;
2336 } 2453 }
2337 } 2454 }
2338 2455
2339 if (open_flag & O_CREAT) { 2456 if (open_flag & O_CREAT) {
2340 error = may_o_create(&nd->path, dentry, op->mode); 2457 error = may_o_create(&nd->path, dentry, mode);
2341 if (error) { 2458 if (error) {
2342 create_error = error; 2459 create_error = error;
2343 if (open_flag & O_EXCL) 2460 if (open_flag & O_EXCL)
@@ -2374,6 +2491,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2374 dput(dentry); 2491 dput(dentry);
2375 dentry = file->f_path.dentry; 2492 dentry = file->f_path.dentry;
2376 } 2493 }
2494 if (create_error && dentry->d_inode == NULL) {
2495 error = create_error;
2496 goto out;
2497 }
2377 goto looked_up; 2498 goto looked_up;
2378 } 2499 }
2379 2500
@@ -2438,7 +2559,7 @@ looked_up:
2438static int lookup_open(struct nameidata *nd, struct path *path, 2559static int lookup_open(struct nameidata *nd, struct path *path,
2439 struct file *file, 2560 struct file *file,
2440 const struct open_flags *op, 2561 const struct open_flags *op,
2441 bool *want_write, int *opened) 2562 bool got_write, int *opened)
2442{ 2563{
2443 struct dentry *dir = nd->path.dentry; 2564 struct dentry *dir = nd->path.dentry;
2444 struct inode *dir_inode = dir->d_inode; 2565 struct inode *dir_inode = dir->d_inode;
@@ -2456,7 +2577,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
2456 goto out_no_open; 2577 goto out_no_open;
2457 2578
2458 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { 2579 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
2459 return atomic_open(nd, dentry, path, file, op, want_write, 2580 return atomic_open(nd, dentry, path, file, op, got_write,
2460 need_lookup, opened); 2581 need_lookup, opened);
2461 } 2582 }
2462 2583
@@ -2480,10 +2601,10 @@ static int lookup_open(struct nameidata *nd, struct path *path,
2480 * a permanent write count is taken through 2601 * a permanent write count is taken through
2481 * the 'struct file' in finish_open(). 2602 * the 'struct file' in finish_open().
2482 */ 2603 */
2483 error = mnt_want_write(nd->path.mnt); 2604 if (!got_write) {
2484 if (error) 2605 error = -EROFS;
2485 goto out_dput; 2606 goto out_dput;
2486 *want_write = true; 2607 }
2487 *opened |= FILE_CREATED; 2608 *opened |= FILE_CREATED;
2488 error = security_path_mknod(&nd->path, dentry, mode, 0); 2609 error = security_path_mknod(&nd->path, dentry, mode, 0);
2489 if (error) 2610 if (error)
@@ -2513,7 +2634,7 @@ static int do_last(struct nameidata *nd, struct path *path,
2513 struct dentry *dir = nd->path.dentry; 2634 struct dentry *dir = nd->path.dentry;
2514 int open_flag = op->open_flag; 2635 int open_flag = op->open_flag;
2515 bool will_truncate = (open_flag & O_TRUNC) != 0; 2636 bool will_truncate = (open_flag & O_TRUNC) != 0;
2516 bool want_write = false; 2637 bool got_write = false;
2517 int acc_mode = op->acc_mode; 2638 int acc_mode = op->acc_mode;
2518 struct inode *inode; 2639 struct inode *inode;
2519 bool symlink_ok = false; 2640 bool symlink_ok = false;
@@ -2582,8 +2703,18 @@ static int do_last(struct nameidata *nd, struct path *path,
2582 } 2703 }
2583 2704
2584retry_lookup: 2705retry_lookup:
2706 if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
2707 error = mnt_want_write(nd->path.mnt);
2708 if (!error)
2709 got_write = true;
2710 /*
2711 * do _not_ fail yet - we might not need that or fail with
2712 * a different error; let lookup_open() decide; we'll be
2713 * dropping this one anyway.
2714 */
2715 }
2585 mutex_lock(&dir->d_inode->i_mutex); 2716 mutex_lock(&dir->d_inode->i_mutex);
2586 error = lookup_open(nd, path, file, op, &want_write, opened); 2717 error = lookup_open(nd, path, file, op, got_write, opened);
2587 mutex_unlock(&dir->d_inode->i_mutex); 2718 mutex_unlock(&dir->d_inode->i_mutex);
2588 2719
2589 if (error <= 0) { 2720 if (error <= 0) {
@@ -2608,22 +2739,23 @@ retry_lookup:
2608 } 2739 }
2609 2740
2610 /* 2741 /*
2611 * It already exists. 2742 * create/update audit record if it already exists.
2612 */ 2743 */
2613 audit_inode(pathname, path->dentry); 2744 if (path->dentry->d_inode)
2745 audit_inode(pathname, path->dentry);
2614 2746
2615 /* 2747 /*
2616 * If atomic_open() acquired write access it is dropped now due to 2748 * If atomic_open() acquired write access it is dropped now due to
2617 * possible mount and symlink following (this might be optimized away if 2749 * possible mount and symlink following (this might be optimized away if
2618 * necessary...) 2750 * necessary...)
2619 */ 2751 */
2620 if (want_write) { 2752 if (got_write) {
2621 mnt_drop_write(nd->path.mnt); 2753 mnt_drop_write(nd->path.mnt);
2622 want_write = false; 2754 got_write = false;
2623 } 2755 }
2624 2756
2625 error = -EEXIST; 2757 error = -EEXIST;
2626 if (open_flag & O_EXCL) 2758 if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
2627 goto exit_dput; 2759 goto exit_dput;
2628 2760
2629 error = follow_managed(path, nd->flags); 2761 error = follow_managed(path, nd->flags);
@@ -2684,7 +2816,7 @@ finish_open:
2684 error = mnt_want_write(nd->path.mnt); 2816 error = mnt_want_write(nd->path.mnt);
2685 if (error) 2817 if (error)
2686 goto out; 2818 goto out;
2687 want_write = true; 2819 got_write = true;
2688 } 2820 }
2689finish_open_created: 2821finish_open_created:
2690 error = may_open(&nd->path, acc_mode, open_flag); 2822 error = may_open(&nd->path, acc_mode, open_flag);
@@ -2711,7 +2843,7 @@ opened:
2711 goto exit_fput; 2843 goto exit_fput;
2712 } 2844 }
2713out: 2845out:
2714 if (want_write) 2846 if (got_write)
2715 mnt_drop_write(nd->path.mnt); 2847 mnt_drop_write(nd->path.mnt);
2716 path_put(&save_parent); 2848 path_put(&save_parent);
2717 terminate_walk(nd); 2849 terminate_walk(nd);
@@ -2735,9 +2867,9 @@ stale_open:
2735 nd->inode = dir->d_inode; 2867 nd->inode = dir->d_inode;
2736 save_parent.mnt = NULL; 2868 save_parent.mnt = NULL;
2737 save_parent.dentry = NULL; 2869 save_parent.dentry = NULL;
2738 if (want_write) { 2870 if (got_write) {
2739 mnt_drop_write(nd->path.mnt); 2871 mnt_drop_write(nd->path.mnt);
2740 want_write = false; 2872 got_write = false;
2741 } 2873 }
2742 retried = true; 2874 retried = true;
2743 goto retry_lookup; 2875 goto retry_lookup;
@@ -2777,6 +2909,9 @@ static struct file *path_openat(int dfd, const char *pathname,
2777 error = -ELOOP; 2909 error = -ELOOP;
2778 break; 2910 break;
2779 } 2911 }
2912 error = may_follow_link(&link, nd);
2913 if (unlikely(error))
2914 break;
2780 nd->flags |= LOOKUP_PARENT; 2915 nd->flags |= LOOKUP_PARENT;
2781 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); 2916 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2782 error = follow_link(&link, nd, &cookie); 2917 error = follow_link(&link, nd, &cookie);
@@ -2846,6 +2981,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
2846{ 2981{
2847 struct dentry *dentry = ERR_PTR(-EEXIST); 2982 struct dentry *dentry = ERR_PTR(-EEXIST);
2848 struct nameidata nd; 2983 struct nameidata nd;
2984 int err2;
2849 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 2985 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
2850 if (error) 2986 if (error)
2851 return ERR_PTR(error); 2987 return ERR_PTR(error);
@@ -2859,16 +2995,19 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
2859 nd.flags &= ~LOOKUP_PARENT; 2995 nd.flags &= ~LOOKUP_PARENT;
2860 nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; 2996 nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
2861 2997
2998 /* don't fail immediately if it's r/o, at least try to report other errors */
2999 err2 = mnt_want_write(nd.path.mnt);
2862 /* 3000 /*
2863 * Do the final lookup. 3001 * Do the final lookup.
2864 */ 3002 */
2865 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3003 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2866 dentry = lookup_hash(&nd); 3004 dentry = lookup_hash(&nd);
2867 if (IS_ERR(dentry)) 3005 if (IS_ERR(dentry))
2868 goto fail; 3006 goto unlock;
2869 3007
3008 error = -EEXIST;
2870 if (dentry->d_inode) 3009 if (dentry->d_inode)
2871 goto eexist; 3010 goto fail;
2872 /* 3011 /*
2873 * Special case - lookup gave negative, but... we had foo/bar/ 3012 * Special case - lookup gave negative, but... we had foo/bar/
2874 * From the vfs_mknod() POV we just have a negative dentry - 3013 * From the vfs_mknod() POV we just have a negative dentry -
@@ -2876,23 +3015,37 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
2876 * been asking for (non-existent) directory. -ENOENT for you. 3015 * been asking for (non-existent) directory. -ENOENT for you.
2877 */ 3016 */
2878 if (unlikely(!is_dir && nd.last.name[nd.last.len])) { 3017 if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
2879 dput(dentry); 3018 error = -ENOENT;
2880 dentry = ERR_PTR(-ENOENT); 3019 goto fail;
3020 }
3021 if (unlikely(err2)) {
3022 error = err2;
2881 goto fail; 3023 goto fail;
2882 } 3024 }
2883 *path = nd.path; 3025 *path = nd.path;
2884 return dentry; 3026 return dentry;
2885eexist:
2886 dput(dentry);
2887 dentry = ERR_PTR(-EEXIST);
2888fail: 3027fail:
3028 dput(dentry);
3029 dentry = ERR_PTR(error);
3030unlock:
2889 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3031 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3032 if (!err2)
3033 mnt_drop_write(nd.path.mnt);
2890out: 3034out:
2891 path_put(&nd.path); 3035 path_put(&nd.path);
2892 return dentry; 3036 return dentry;
2893} 3037}
2894EXPORT_SYMBOL(kern_path_create); 3038EXPORT_SYMBOL(kern_path_create);
2895 3039
3040void done_path_create(struct path *path, struct dentry *dentry)
3041{
3042 dput(dentry);
3043 mutex_unlock(&path->dentry->d_inode->i_mutex);
3044 mnt_drop_write(path->mnt);
3045 path_put(path);
3046}
3047EXPORT_SYMBOL(done_path_create);
3048
2896struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 3049struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
2897{ 3050{
2898 char *tmp = getname(pathname); 3051 char *tmp = getname(pathname);
@@ -2956,8 +3109,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
2956 struct path path; 3109 struct path path;
2957 int error; 3110 int error;
2958 3111
2959 if (S_ISDIR(mode)) 3112 error = may_mknod(mode);
2960 return -EPERM; 3113 if (error)
3114 return error;
2961 3115
2962 dentry = user_path_create(dfd, filename, &path, 0); 3116 dentry = user_path_create(dfd, filename, &path, 0);
2963 if (IS_ERR(dentry)) 3117 if (IS_ERR(dentry))
@@ -2965,15 +3119,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
2965 3119
2966 if (!IS_POSIXACL(path.dentry->d_inode)) 3120 if (!IS_POSIXACL(path.dentry->d_inode))
2967 mode &= ~current_umask(); 3121 mode &= ~current_umask();
2968 error = may_mknod(mode);
2969 if (error)
2970 goto out_dput;
2971 error = mnt_want_write(path.mnt);
2972 if (error)
2973 goto out_dput;
2974 error = security_path_mknod(&path, dentry, mode, dev); 3122 error = security_path_mknod(&path, dentry, mode, dev);
2975 if (error) 3123 if (error)
2976 goto out_drop_write; 3124 goto out;
2977 switch (mode & S_IFMT) { 3125 switch (mode & S_IFMT) {
2978 case 0: case S_IFREG: 3126 case 0: case S_IFREG:
2979 error = vfs_create(path.dentry->d_inode,dentry,mode,true); 3127 error = vfs_create(path.dentry->d_inode,dentry,mode,true);
@@ -2986,13 +3134,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
2986 error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); 3134 error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
2987 break; 3135 break;
2988 } 3136 }
2989out_drop_write: 3137out:
2990 mnt_drop_write(path.mnt); 3138 done_path_create(&path, dentry);
2991out_dput:
2992 dput(dentry);
2993 mutex_unlock(&path.dentry->d_inode->i_mutex);
2994 path_put(&path);
2995
2996 return error; 3139 return error;
2997} 3140}
2998 3141
@@ -3038,19 +3181,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3038 3181
3039 if (!IS_POSIXACL(path.dentry->d_inode)) 3182 if (!IS_POSIXACL(path.dentry->d_inode))
3040 mode &= ~current_umask(); 3183 mode &= ~current_umask();
3041 error = mnt_want_write(path.mnt);
3042 if (error)
3043 goto out_dput;
3044 error = security_path_mkdir(&path, dentry, mode); 3184 error = security_path_mkdir(&path, dentry, mode);
3045 if (error) 3185 if (!error)
3046 goto out_drop_write; 3186 error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3047 error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3187 done_path_create(&path, dentry);
3048out_drop_write:
3049 mnt_drop_write(path.mnt);
3050out_dput:
3051 dput(dentry);
3052 mutex_unlock(&path.dentry->d_inode->i_mutex);
3053 path_put(&path);
3054 return error; 3188 return error;
3055} 3189}
3056 3190
@@ -3144,6 +3278,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
3144 } 3278 }
3145 3279
3146 nd.flags &= ~LOOKUP_PARENT; 3280 nd.flags &= ~LOOKUP_PARENT;
3281 error = mnt_want_write(nd.path.mnt);
3282 if (error)
3283 goto exit1;
3147 3284
3148 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3285 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3149 dentry = lookup_hash(&nd); 3286 dentry = lookup_hash(&nd);
@@ -3154,19 +3291,15 @@ static long do_rmdir(int dfd, const char __user *pathname)
3154 error = -ENOENT; 3291 error = -ENOENT;
3155 goto exit3; 3292 goto exit3;
3156 } 3293 }
3157 error = mnt_want_write(nd.path.mnt);
3158 if (error)
3159 goto exit3;
3160 error = security_path_rmdir(&nd.path, dentry); 3294 error = security_path_rmdir(&nd.path, dentry);
3161 if (error) 3295 if (error)
3162 goto exit4; 3296 goto exit3;
3163 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 3297 error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
3164exit4:
3165 mnt_drop_write(nd.path.mnt);
3166exit3: 3298exit3:
3167 dput(dentry); 3299 dput(dentry);
3168exit2: 3300exit2:
3169 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3301 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3302 mnt_drop_write(nd.path.mnt);
3170exit1: 3303exit1:
3171 path_put(&nd.path); 3304 path_put(&nd.path);
3172 putname(name); 3305 putname(name);
@@ -3233,6 +3366,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
3233 goto exit1; 3366 goto exit1;
3234 3367
3235 nd.flags &= ~LOOKUP_PARENT; 3368 nd.flags &= ~LOOKUP_PARENT;
3369 error = mnt_want_write(nd.path.mnt);
3370 if (error)
3371 goto exit1;
3236 3372
3237 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3373 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3238 dentry = lookup_hash(&nd); 3374 dentry = lookup_hash(&nd);
@@ -3245,21 +3381,17 @@ static long do_unlinkat(int dfd, const char __user *pathname)
3245 if (!inode) 3381 if (!inode)
3246 goto slashes; 3382 goto slashes;
3247 ihold(inode); 3383 ihold(inode);
3248 error = mnt_want_write(nd.path.mnt);
3249 if (error)
3250 goto exit2;
3251 error = security_path_unlink(&nd.path, dentry); 3384 error = security_path_unlink(&nd.path, dentry);
3252 if (error) 3385 if (error)
3253 goto exit3; 3386 goto exit2;
3254 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 3387 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
3255exit3: 3388exit2:
3256 mnt_drop_write(nd.path.mnt);
3257 exit2:
3258 dput(dentry); 3389 dput(dentry);
3259 } 3390 }
3260 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3391 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3261 if (inode) 3392 if (inode)
3262 iput(inode); /* truncate the inode here */ 3393 iput(inode); /* truncate the inode here */
3394 mnt_drop_write(nd.path.mnt);
3263exit1: 3395exit1:
3264 path_put(&nd.path); 3396 path_put(&nd.path);
3265 putname(name); 3397 putname(name);
@@ -3324,19 +3456,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3324 if (IS_ERR(dentry)) 3456 if (IS_ERR(dentry))
3325 goto out_putname; 3457 goto out_putname;
3326 3458
3327 error = mnt_want_write(path.mnt);
3328 if (error)
3329 goto out_dput;
3330 error = security_path_symlink(&path, dentry, from); 3459 error = security_path_symlink(&path, dentry, from);
3331 if (error) 3460 if (!error)
3332 goto out_drop_write; 3461 error = vfs_symlink(path.dentry->d_inode, dentry, from);
3333 error = vfs_symlink(path.dentry->d_inode, dentry, from); 3462 done_path_create(&path, dentry);
3334out_drop_write:
3335 mnt_drop_write(path.mnt);
3336out_dput:
3337 dput(dentry);
3338 mutex_unlock(&path.dentry->d_inode->i_mutex);
3339 path_put(&path);
3340out_putname: 3463out_putname:
3341 putname(from); 3464 putname(from);
3342 return error; 3465 return error;
@@ -3436,19 +3559,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3436 error = -EXDEV; 3559 error = -EXDEV;
3437 if (old_path.mnt != new_path.mnt) 3560 if (old_path.mnt != new_path.mnt)
3438 goto out_dput; 3561 goto out_dput;
3439 error = mnt_want_write(new_path.mnt); 3562 error = may_linkat(&old_path);
3440 if (error) 3563 if (unlikely(error))
3441 goto out_dput; 3564 goto out_dput;
3442 error = security_path_link(old_path.dentry, &new_path, new_dentry); 3565 error = security_path_link(old_path.dentry, &new_path, new_dentry);
3443 if (error) 3566 if (error)
3444 goto out_drop_write; 3567 goto out_dput;
3445 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3568 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
3446out_drop_write:
3447 mnt_drop_write(new_path.mnt);
3448out_dput: 3569out_dput:
3449 dput(new_dentry); 3570 done_path_create(&new_path, new_dentry);
3450 mutex_unlock(&new_path.dentry->d_inode->i_mutex);
3451 path_put(&new_path);
3452out: 3571out:
3453 path_put(&old_path); 3572 path_put(&old_path);
3454 3573
@@ -3644,6 +3763,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3644 if (newnd.last_type != LAST_NORM) 3763 if (newnd.last_type != LAST_NORM)
3645 goto exit2; 3764 goto exit2;
3646 3765
3766 error = mnt_want_write(oldnd.path.mnt);
3767 if (error)
3768 goto exit2;
3769
3647 oldnd.flags &= ~LOOKUP_PARENT; 3770 oldnd.flags &= ~LOOKUP_PARENT;
3648 newnd.flags &= ~LOOKUP_PARENT; 3771 newnd.flags &= ~LOOKUP_PARENT;
3649 newnd.flags |= LOOKUP_RENAME_TARGET; 3772 newnd.flags |= LOOKUP_RENAME_TARGET;
@@ -3679,23 +3802,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3679 if (new_dentry == trap) 3802 if (new_dentry == trap)
3680 goto exit5; 3803 goto exit5;
3681 3804
3682 error = mnt_want_write(oldnd.path.mnt);
3683 if (error)
3684 goto exit5;
3685 error = security_path_rename(&oldnd.path, old_dentry, 3805 error = security_path_rename(&oldnd.path, old_dentry,
3686 &newnd.path, new_dentry); 3806 &newnd.path, new_dentry);
3687 if (error) 3807 if (error)
3688 goto exit6; 3808 goto exit5;
3689 error = vfs_rename(old_dir->d_inode, old_dentry, 3809 error = vfs_rename(old_dir->d_inode, old_dentry,
3690 new_dir->d_inode, new_dentry); 3810 new_dir->d_inode, new_dentry);
3691exit6:
3692 mnt_drop_write(oldnd.path.mnt);
3693exit5: 3811exit5:
3694 dput(new_dentry); 3812 dput(new_dentry);
3695exit4: 3813exit4:
3696 dput(old_dentry); 3814 dput(old_dentry);
3697exit3: 3815exit3:
3698 unlock_rename(new_dir, old_dir); 3816 unlock_rename(new_dir, old_dir);
3817 mnt_drop_write(oldnd.path.mnt);
3699exit2: 3818exit2:
3700 path_put(&newnd.path); 3819 path_put(&newnd.path);
3701 putname(to); 3820 putname(to);
diff --git a/fs/namespace.c b/fs/namespace.c
index c53d3381b0d0..4d31f73e2561 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt)
283} 283}
284 284
285/* 285/*
286 * Most r/o checks on a fs are for operations that take 286 * Most r/o & frozen checks on a fs are for operations that take discrete
287 * discrete amounts of time, like a write() or unlink(). 287 * amounts of time, like a write() or unlink(). We must keep track of when
288 * We must keep track of when those operations start 288 * those operations start (for permission checks) and when they end, so that we
289 * (for permission checks) and when they end, so that 289 * can determine when writes are able to occur to a filesystem.
290 * we can determine when writes are able to occur to
291 * a filesystem.
292 */ 290 */
293/** 291/**
294 * mnt_want_write - get write access to a mount 292 * __mnt_want_write - get write access to a mount without freeze protection
295 * @m: the mount on which to take a write 293 * @m: the mount on which to take a write
296 * 294 *
297 * This tells the low-level filesystem that a write is 295 * This tells the low-level filesystem that a write is about to be performed to
298 * about to be performed to it, and makes sure that 296 * it, and makes sure that writes are allowed (mnt is read-write) before
299 * writes are allowed before returning success. When 297 * returning success. This operation does not protect against filesystem being
300 * the write operation is finished, mnt_drop_write() 298 * frozen. When the write operation is finished, __mnt_drop_write() must be
301 * must be called. This is effectively a refcount. 299 * called. This is effectively a refcount.
302 */ 300 */
303int mnt_want_write(struct vfsmount *m) 301int __mnt_want_write(struct vfsmount *m)
304{ 302{
305 struct mount *mnt = real_mount(m); 303 struct mount *mnt = real_mount(m);
306 int ret = 0; 304 int ret = 0;
@@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m)
326 ret = -EROFS; 324 ret = -EROFS;
327 } 325 }
328 preempt_enable(); 326 preempt_enable();
327
328 return ret;
329}
330
331/**
332 * mnt_want_write - get write access to a mount
333 * @m: the mount on which to take a write
334 *
335 * This tells the low-level filesystem that a write is about to be performed to
336 * it, and makes sure that writes are allowed (mount is read-write, filesystem
337 * is not frozen) before returning success. When the write operation is
338 * finished, mnt_drop_write() must be called. This is effectively a refcount.
339 */
340int mnt_want_write(struct vfsmount *m)
341{
342 int ret;
343
344 sb_start_write(m->mnt_sb);
345 ret = __mnt_want_write(m);
346 if (ret)
347 sb_end_write(m->mnt_sb);
329 return ret; 348 return ret;
330} 349}
331EXPORT_SYMBOL_GPL(mnt_want_write); 350EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt)
355EXPORT_SYMBOL_GPL(mnt_clone_write); 374EXPORT_SYMBOL_GPL(mnt_clone_write);
356 375
357/** 376/**
358 * mnt_want_write_file - get write access to a file's mount 377 * __mnt_want_write_file - get write access to a file's mount
359 * @file: the file who's mount on which to take a write 378 * @file: the file who's mount on which to take a write
360 * 379 *
361 * This is like mnt_want_write, but it takes a file and can 380 * This is like __mnt_want_write, but it takes a file and can
362 * do some optimisations if the file is open for write already 381 * do some optimisations if the file is open for write already
363 */ 382 */
364int mnt_want_write_file(struct file *file) 383int __mnt_want_write_file(struct file *file)
365{ 384{
366 struct inode *inode = file->f_dentry->d_inode; 385 struct inode *inode = file->f_dentry->d_inode;
386
367 if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) 387 if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
368 return mnt_want_write(file->f_path.mnt); 388 return __mnt_want_write(file->f_path.mnt);
369 else 389 else
370 return mnt_clone_write(file->f_path.mnt); 390 return mnt_clone_write(file->f_path.mnt);
371} 391}
392
393/**
394 * mnt_want_write_file - get write access to a file's mount
395 * @file: the file whose mount on which to take a write
396 *
397 * This is like mnt_want_write, but it takes a file and can
398 * do some optimisations if the file is open for write already
399 */
400int mnt_want_write_file(struct file *file)
401{
402 int ret;
403
404 sb_start_write(file->f_path.mnt->mnt_sb);
405 ret = __mnt_want_write_file(file);
406 if (ret)
407 sb_end_write(file->f_path.mnt->mnt_sb);
408 return ret;
409}
372EXPORT_SYMBOL_GPL(mnt_want_write_file); 410EXPORT_SYMBOL_GPL(mnt_want_write_file);
373 411
374/** 412/**
375 * mnt_drop_write - give up write access to a mount 413 * __mnt_drop_write - give up write access to a mount
376 * @mnt: the mount on which to give up write access 414 * @mnt: the mount on which to give up write access
377 * 415 *
378 * Tells the low-level filesystem that we are done 416 * Tells the low-level filesystem that we are done
379 * performing writes to it. Must be matched with 417 * performing writes to it. Must be matched with
380 * mnt_want_write() call above. 418 * __mnt_want_write() call above.
381 */ 419 */
382void mnt_drop_write(struct vfsmount *mnt) 420void __mnt_drop_write(struct vfsmount *mnt)
383{ 421{
384 preempt_disable(); 422 preempt_disable();
385 mnt_dec_writers(real_mount(mnt)); 423 mnt_dec_writers(real_mount(mnt));
386 preempt_enable(); 424 preempt_enable();
387} 425}
426
427/**
428 * mnt_drop_write - give up write access to a mount
429 * @mnt: the mount on which to give up write access
430 *
431 * Tells the low-level filesystem that we are done performing writes to it and
432 * also allows filesystem to be frozen again. Must be matched with
433 * mnt_want_write() call above.
434 */
435void mnt_drop_write(struct vfsmount *mnt)
436{
437 __mnt_drop_write(mnt);
438 sb_end_write(mnt->mnt_sb);
439}
388EXPORT_SYMBOL_GPL(mnt_drop_write); 440EXPORT_SYMBOL_GPL(mnt_drop_write);
389 441
442void __mnt_drop_write_file(struct file *file)
443{
444 __mnt_drop_write(file->f_path.mnt);
445}
446
390void mnt_drop_write_file(struct file *file) 447void mnt_drop_write_file(struct file *file)
391{ 448{
392 mnt_drop_write(file->f_path.mnt); 449 mnt_drop_write(file->f_path.mnt);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cbaf4f8bb7b7..4c7bd35b1876 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -651,12 +651,12 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
651 651
652 if (clp->cl_minorversion == 0) { 652 if (clp->cl_minorversion == 0) {
653 if (!clp->cl_cred.cr_principal && 653 if (!clp->cl_cred.cr_principal &&
654 (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 654 (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
655 return -EINVAL; 655 return -EINVAL;
656 args.client_name = clp->cl_cred.cr_principal; 656 args.client_name = clp->cl_cred.cr_principal;
657 args.prognumber = conn->cb_prog, 657 args.prognumber = conn->cb_prog,
658 args.protocol = XPRT_TRANSPORT_TCP; 658 args.protocol = XPRT_TRANSPORT_TCP;
659 args.authflavor = clp->cl_flavor; 659 args.authflavor = clp->cl_cred.cr_flavor;
660 clp->cl_cb_ident = conn->cb_ident; 660 clp->cl_cb_ident = conn->cb_ident;
661 } else { 661 } else {
662 if (!conn->cb_xprt) 662 if (!conn->cb_xprt)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5ff0b7b9fc08..43295d45cc2b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
154 if (status < 0) 154 if (status < 0)
155 return; 155 return;
156 156
157 status = mnt_want_write_file(rec_file);
158 if (status)
159 return;
160
157 dir = rec_file->f_path.dentry; 161 dir = rec_file->f_path.dentry;
158 /* lock the parent */ 162 /* lock the parent */
159 mutex_lock(&dir->d_inode->i_mutex); 163 mutex_lock(&dir->d_inode->i_mutex);
@@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
173 * as well be forgiving and just succeed silently. 177 * as well be forgiving and just succeed silently.
174 */ 178 */
175 goto out_put; 179 goto out_put;
176 status = mnt_want_write_file(rec_file);
177 if (status)
178 goto out_put;
179 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); 180 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
180 mnt_drop_write_file(rec_file);
181out_put: 181out_put:
182 dput(dentry); 182 dput(dentry);
183out_unlock: 183out_unlock:
@@ -189,6 +189,7 @@ out_unlock:
189 " (err %d); please check that %s exists" 189 " (err %d); please check that %s exists"
190 " and is writeable", status, 190 " and is writeable", status,
191 user_recovery_dirname); 191 user_recovery_dirname);
192 mnt_drop_write_file(rec_file);
192 nfs4_reset_creds(original_cred); 193 nfs4_reset_creds(original_cred);
193} 194}
194 195
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index cc793005a87c..032af381b3aa 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp)
635 fhp->fh_post_saved = 0; 635 fhp->fh_post_saved = 0;
636#endif 636#endif
637 } 637 }
638 fh_drop_write(fhp);
638 if (exp) { 639 if (exp) {
639 exp_put(exp); 640 exp_put(exp);
640 fhp->fh_export = NULL; 641 fhp->fh_export = NULL;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e15dc45fc5ec..aad6d457b9e8 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
196 struct dentry *dchild; 196 struct dentry *dchild;
197 int type, mode; 197 int type, mode;
198 __be32 nfserr; 198 __be32 nfserr;
199 int hosterr;
199 dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); 200 dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size);
200 201
201 dprintk("nfsd: CREATE %s %.*s\n", 202 dprintk("nfsd: CREATE %s %.*s\n",
@@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
214 nfserr = nfserr_exist; 215 nfserr = nfserr_exist;
215 if (isdotent(argp->name, argp->len)) 216 if (isdotent(argp->name, argp->len))
216 goto done; 217 goto done;
218 hosterr = fh_want_write(dirfhp);
219 if (hosterr) {
220 nfserr = nfserrno(hosterr);
221 goto done;
222 }
223
217 fh_lock_nested(dirfhp, I_MUTEX_PARENT); 224 fh_lock_nested(dirfhp, I_MUTEX_PARENT);
218 dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); 225 dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
219 if (IS_ERR(dchild)) { 226 if (IS_ERR(dchild)) {
@@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
330out_unlock: 337out_unlock:
331 /* We don't really need to unlock, as fh_put does it. */ 338 /* We don't really need to unlock, as fh_put does it. */
332 fh_unlock(dirfhp); 339 fh_unlock(dirfhp);
333 340 fh_drop_write(dirfhp);
334done: 341done:
335 fh_put(dirfhp); 342 fh_put(dirfhp);
336 return nfsd_return_dirop(nfserr, resp); 343 return nfsd_return_dirop(nfserr, resp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e6173147f982..22bd0a66c356 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -231,7 +231,6 @@ struct nfs4_client {
231 nfs4_verifier cl_verifier; /* generated by client */ 231 nfs4_verifier cl_verifier; /* generated by client */
232 time_t cl_time; /* time of last lease renewal */ 232 time_t cl_time; /* time of last lease renewal */
233 struct sockaddr_storage cl_addr; /* client ipaddress */ 233 struct sockaddr_storage cl_addr; /* client ipaddress */
234 u32 cl_flavor; /* setclientid pseudoflavor */
235 struct svc_cred cl_cred; /* setclientid principal */ 234 struct svc_cred cl_cred; /* setclientid principal */
236 clientid_t cl_clientid; /* generated by server */ 235 clientid_t cl_clientid; /* generated by server */
237 nfs4_verifier cl_confirm; /* generated by server */ 236 nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 702f64e820c3..a9269f142cc4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1284,6 +1284,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1284 * If it has, the parent directory should already be locked. 1284 * If it has, the parent directory should already be locked.
1285 */ 1285 */
1286 if (!resfhp->fh_dentry) { 1286 if (!resfhp->fh_dentry) {
1287 host_err = fh_want_write(fhp);
1288 if (host_err)
1289 goto out_nfserr;
1290
1287 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1291 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
1288 fh_lock_nested(fhp, I_MUTEX_PARENT); 1292 fh_lock_nested(fhp, I_MUTEX_PARENT);
1289 dchild = lookup_one_len(fname, dentry, flen); 1293 dchild = lookup_one_len(fname, dentry, flen);
@@ -1327,14 +1331,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1327 goto out; 1331 goto out;
1328 } 1332 }
1329 1333
1330 host_err = fh_want_write(fhp);
1331 if (host_err)
1332 goto out_nfserr;
1333
1334 /* 1334 /*
1335 * Get the dir op function pointer. 1335 * Get the dir op function pointer.
1336 */ 1336 */
1337 err = 0; 1337 err = 0;
1338 host_err = 0;
1338 switch (type) { 1339 switch (type) {
1339 case S_IFREG: 1340 case S_IFREG:
1340 host_err = vfs_create(dirp, dchild, iap->ia_mode, true); 1341 host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
@@ -1351,10 +1352,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1351 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1352 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
1352 break; 1353 break;
1353 } 1354 }
1354 if (host_err < 0) { 1355 if (host_err < 0)
1355 fh_drop_write(fhp);
1356 goto out_nfserr; 1356 goto out_nfserr;
1357 }
1358 1357
1359 err = nfsd_create_setattr(rqstp, resfhp, iap); 1358 err = nfsd_create_setattr(rqstp, resfhp, iap);
1360 1359
@@ -1366,7 +1365,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1366 err2 = nfserrno(commit_metadata(fhp)); 1365 err2 = nfserrno(commit_metadata(fhp));
1367 if (err2) 1366 if (err2)
1368 err = err2; 1367 err = err2;
1369 fh_drop_write(fhp);
1370 /* 1368 /*
1371 * Update the file handle to get the new inode info. 1369 * Update the file handle to get the new inode info.
1372 */ 1370 */
@@ -1425,6 +1423,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1425 err = nfserr_notdir; 1423 err = nfserr_notdir;
1426 if (!dirp->i_op->lookup) 1424 if (!dirp->i_op->lookup)
1427 goto out; 1425 goto out;
1426
1427 host_err = fh_want_write(fhp);
1428 if (host_err)
1429 goto out_nfserr;
1430
1428 fh_lock_nested(fhp, I_MUTEX_PARENT); 1431 fh_lock_nested(fhp, I_MUTEX_PARENT);
1429 1432
1430 /* 1433 /*
@@ -1457,9 +1460,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1457 v_atime = verifier[1]&0x7fffffff; 1460 v_atime = verifier[1]&0x7fffffff;
1458 } 1461 }
1459 1462
1460 host_err = fh_want_write(fhp);
1461 if (host_err)
1462 goto out_nfserr;
1463 if (dchild->d_inode) { 1463 if (dchild->d_inode) {
1464 err = 0; 1464 err = 0;
1465 1465
@@ -1530,7 +1530,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1530 if (!err) 1530 if (!err)
1531 err = nfserrno(commit_metadata(fhp)); 1531 err = nfserrno(commit_metadata(fhp));
1532 1532
1533 fh_drop_write(fhp);
1534 /* 1533 /*
1535 * Update the filehandle to get the new inode info. 1534 * Update the filehandle to get the new inode info.
1536 */ 1535 */
@@ -1541,6 +1540,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1541 fh_unlock(fhp); 1540 fh_unlock(fhp);
1542 if (dchild && !IS_ERR(dchild)) 1541 if (dchild && !IS_ERR(dchild))
1543 dput(dchild); 1542 dput(dchild);
1543 fh_drop_write(fhp);
1544 return err; 1544 return err;
1545 1545
1546 out_nfserr: 1546 out_nfserr:
@@ -1621,6 +1621,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1621 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); 1621 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
1622 if (err) 1622 if (err)
1623 goto out; 1623 goto out;
1624
1625 host_err = fh_want_write(fhp);
1626 if (host_err)
1627 goto out_nfserr;
1628
1624 fh_lock(fhp); 1629 fh_lock(fhp);
1625 dentry = fhp->fh_dentry; 1630 dentry = fhp->fh_dentry;
1626 dnew = lookup_one_len(fname, dentry, flen); 1631 dnew = lookup_one_len(fname, dentry, flen);
@@ -1628,10 +1633,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1628 if (IS_ERR(dnew)) 1633 if (IS_ERR(dnew))
1629 goto out_nfserr; 1634 goto out_nfserr;
1630 1635
1631 host_err = fh_want_write(fhp);
1632 if (host_err)
1633 goto out_nfserr;
1634
1635 if (unlikely(path[plen] != 0)) { 1636 if (unlikely(path[plen] != 0)) {
1636 char *path_alloced = kmalloc(plen+1, GFP_KERNEL); 1637 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1637 if (path_alloced == NULL) 1638 if (path_alloced == NULL)
@@ -1691,6 +1692,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1691 if (isdotent(name, len)) 1692 if (isdotent(name, len))
1692 goto out; 1693 goto out;
1693 1694
1695 host_err = fh_want_write(tfhp);
1696 if (host_err) {
1697 err = nfserrno(host_err);
1698 goto out;
1699 }
1700
1694 fh_lock_nested(ffhp, I_MUTEX_PARENT); 1701 fh_lock_nested(ffhp, I_MUTEX_PARENT);
1695 ddir = ffhp->fh_dentry; 1702 ddir = ffhp->fh_dentry;
1696 dirp = ddir->d_inode; 1703 dirp = ddir->d_inode;
@@ -1702,18 +1709,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1702 1709
1703 dold = tfhp->fh_dentry; 1710 dold = tfhp->fh_dentry;
1704 1711
1705 host_err = fh_want_write(tfhp);
1706 if (host_err) {
1707 err = nfserrno(host_err);
1708 goto out_dput;
1709 }
1710 err = nfserr_noent; 1712 err = nfserr_noent;
1711 if (!dold->d_inode) 1713 if (!dold->d_inode)
1712 goto out_drop_write; 1714 goto out_dput;
1713 host_err = nfsd_break_lease(dold->d_inode); 1715 host_err = nfsd_break_lease(dold->d_inode);
1714 if (host_err) { 1716 if (host_err) {
1715 err = nfserrno(host_err); 1717 err = nfserrno(host_err);
1716 goto out_drop_write; 1718 goto out_dput;
1717 } 1719 }
1718 host_err = vfs_link(dold, dirp, dnew); 1720 host_err = vfs_link(dold, dirp, dnew);
1719 if (!host_err) { 1721 if (!host_err) {
@@ -1726,12 +1728,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1726 else 1728 else
1727 err = nfserrno(host_err); 1729 err = nfserrno(host_err);
1728 } 1730 }
1729out_drop_write:
1730 fh_drop_write(tfhp);
1731out_dput: 1731out_dput:
1732 dput(dnew); 1732 dput(dnew);
1733out_unlock: 1733out_unlock:
1734 fh_unlock(ffhp); 1734 fh_unlock(ffhp);
1735 fh_drop_write(tfhp);
1735out: 1736out:
1736 return err; 1737 return err;
1737 1738
@@ -1774,6 +1775,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1774 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) 1775 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
1775 goto out; 1776 goto out;
1776 1777
1778 host_err = fh_want_write(ffhp);
1779 if (host_err) {
1780 err = nfserrno(host_err);
1781 goto out;
1782 }
1783
1777 /* cannot use fh_lock as we need deadlock protective ordering 1784 /* cannot use fh_lock as we need deadlock protective ordering
1778 * so do it by hand */ 1785 * so do it by hand */
1779 trap = lock_rename(tdentry, fdentry); 1786 trap = lock_rename(tdentry, fdentry);
@@ -1804,17 +1811,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1804 host_err = -EXDEV; 1811 host_err = -EXDEV;
1805 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1812 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1806 goto out_dput_new; 1813 goto out_dput_new;
1807 host_err = fh_want_write(ffhp);
1808 if (host_err)
1809 goto out_dput_new;
1810 1814
1811 host_err = nfsd_break_lease(odentry->d_inode); 1815 host_err = nfsd_break_lease(odentry->d_inode);
1812 if (host_err) 1816 if (host_err)
1813 goto out_drop_write; 1817 goto out_dput_new;
1814 if (ndentry->d_inode) { 1818 if (ndentry->d_inode) {
1815 host_err = nfsd_break_lease(ndentry->d_inode); 1819 host_err = nfsd_break_lease(ndentry->d_inode);
1816 if (host_err) 1820 if (host_err)
1817 goto out_drop_write; 1821 goto out_dput_new;
1818 } 1822 }
1819 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1823 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1820 if (!host_err) { 1824 if (!host_err) {
@@ -1822,8 +1826,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1822 if (!host_err) 1826 if (!host_err)
1823 host_err = commit_metadata(ffhp); 1827 host_err = commit_metadata(ffhp);
1824 } 1828 }
1825out_drop_write:
1826 fh_drop_write(ffhp);
1827 out_dput_new: 1829 out_dput_new:
1828 dput(ndentry); 1830 dput(ndentry);
1829 out_dput_old: 1831 out_dput_old:
@@ -1839,6 +1841,7 @@ out_drop_write:
1839 fill_post_wcc(tfhp); 1841 fill_post_wcc(tfhp);
1840 unlock_rename(tdentry, fdentry); 1842 unlock_rename(tdentry, fdentry);
1841 ffhp->fh_locked = tfhp->fh_locked = 0; 1843 ffhp->fh_locked = tfhp->fh_locked = 0;
1844 fh_drop_write(ffhp);
1842 1845
1843out: 1846out:
1844 return err; 1847 return err;
@@ -1864,6 +1867,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1864 if (err) 1867 if (err)
1865 goto out; 1868 goto out;
1866 1869
1870 host_err = fh_want_write(fhp);
1871 if (host_err)
1872 goto out_nfserr;
1873
1867 fh_lock_nested(fhp, I_MUTEX_PARENT); 1874 fh_lock_nested(fhp, I_MUTEX_PARENT);
1868 dentry = fhp->fh_dentry; 1875 dentry = fhp->fh_dentry;
1869 dirp = dentry->d_inode; 1876 dirp = dentry->d_inode;
@@ -1882,21 +1889,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1882 if (!type) 1889 if (!type)
1883 type = rdentry->d_inode->i_mode & S_IFMT; 1890 type = rdentry->d_inode->i_mode & S_IFMT;
1884 1891
1885 host_err = fh_want_write(fhp);
1886 if (host_err)
1887 goto out_put;
1888
1889 host_err = nfsd_break_lease(rdentry->d_inode); 1892 host_err = nfsd_break_lease(rdentry->d_inode);
1890 if (host_err) 1893 if (host_err)
1891 goto out_drop_write; 1894 goto out_put;
1892 if (type != S_IFDIR) 1895 if (type != S_IFDIR)
1893 host_err = vfs_unlink(dirp, rdentry); 1896 host_err = vfs_unlink(dirp, rdentry);
1894 else 1897 else
1895 host_err = vfs_rmdir(dirp, rdentry); 1898 host_err = vfs_rmdir(dirp, rdentry);
1896 if (!host_err) 1899 if (!host_err)
1897 host_err = commit_metadata(fhp); 1900 host_err = commit_metadata(fhp);
1898out_drop_write:
1899 fh_drop_write(fhp);
1900out_put: 1901out_put:
1901 dput(rdentry); 1902 dput(rdentry);
1902 1903
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index ec0611b2b738..359594c393d2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
110 110
111static inline int fh_want_write(struct svc_fh *fh) 111static inline int fh_want_write(struct svc_fh *fh)
112{ 112{
113 return mnt_want_write(fh->fh_export->ex_path.mnt); 113 int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
114
115 if (!ret)
116 fh->fh_want_write = 1;
117 return ret;
114} 118}
115 119
116static inline void fh_drop_write(struct svc_fh *fh) 120static inline void fh_drop_write(struct svc_fh *fh)
117{ 121{
118 mnt_drop_write(fh->fh_export->ex_path.mnt); 122 if (fh->fh_want_write) {
123 fh->fh_want_write = 0;
124 mnt_drop_write(fh->fh_export->ex_path.mnt);
125 }
119} 126}
120 127
121#endif /* LINUX_NFSD_VFS_H */ 128#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 62cebc8e1a1f..a4d56ac02e6c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
69 struct page *page = vmf->page; 69 struct page *page = vmf->page;
70 struct inode *inode = vma->vm_file->f_dentry->d_inode; 70 struct inode *inode = vma->vm_file->f_dentry->d_inode;
71 struct nilfs_transaction_info ti; 71 struct nilfs_transaction_info ti;
72 int ret; 72 int ret = 0;
73 73
74 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) 74 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
75 return VM_FAULT_SIGBUS; /* -ENOSPC */ 75 return VM_FAULT_SIGBUS; /* -ENOSPC */
76 76
77 sb_start_pagefault(inode->i_sb);
77 lock_page(page); 78 lock_page(page);
78 if (page->mapping != inode->i_mapping || 79 if (page->mapping != inode->i_mapping ||
79 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { 80 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
80 unlock_page(page); 81 unlock_page(page);
81 return VM_FAULT_NOPAGE; /* make the VM retry the fault */ 82 ret = -EFAULT; /* make the VM retry the fault */
83 goto out;
82 } 84 }
83 85
84 /* 86 /*
@@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
112 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); 114 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
113 /* never returns -ENOMEM, but may return -ENOSPC */ 115 /* never returns -ENOMEM, but may return -ENOSPC */
114 if (unlikely(ret)) 116 if (unlikely(ret))
115 return VM_FAULT_SIGBUS; 117 goto out;
116 118
117 ret = block_page_mkwrite(vma, vmf, nilfs_get_block); 119 ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
118 if (ret != VM_FAULT_LOCKED) { 120 if (ret) {
119 nilfs_transaction_abort(inode->i_sb); 121 nilfs_transaction_abort(inode->i_sb);
120 return ret; 122 goto out;
121 } 123 }
122 nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); 124 nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
123 nilfs_transaction_commit(inode->i_sb); 125 nilfs_transaction_commit(inode->i_sb);
124 126
125 mapped: 127 mapped:
126 wait_on_page_writeback(page); 128 wait_on_page_writeback(page);
127 return VM_FAULT_LOCKED; 129 out:
130 sb_end_pagefault(inode->i_sb);
131 return block_page_mkwrite_return(ret);
128} 132}
129 133
130static const struct vm_operations_struct nilfs_file_vm_ops = { 134static const struct vm_operations_struct nilfs_file_vm_ops = {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 0b6387c67e6c..fdb180769485 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
660 goto out_free; 660 goto out_free;
661 } 661 }
662 662
663 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
664
665 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); 663 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
666 if (ret < 0) 664 if (ret < 0)
667 printk(KERN_ERR "NILFS: GC failed during preparation: " 665 printk(KERN_ERR "NILFS: GC failed during preparation: "
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 88e11fb346b6..a5752a589932 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb,
189 if (ret > 0) 189 if (ret > 0)
190 return 0; 190 return 0;
191 191
192 vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 sb_start_intwrite(sb);
193 193
194 nilfs = sb->s_fs_info; 194 nilfs = sb->s_fs_info;
195 down_read(&nilfs->ns_segctor_sem); 195 down_read(&nilfs->ns_segctor_sem);
@@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb,
205 current->journal_info = ti->ti_save; 205 current->journal_info = ti->ti_save;
206 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 206 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
207 kmem_cache_free(nilfs_transaction_cachep, ti); 207 kmem_cache_free(nilfs_transaction_cachep, ti);
208 sb_end_intwrite(sb);
208 return ret; 209 return ret;
209} 210}
210 211
@@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb)
246 err = nilfs_construct_segment(sb); 247 err = nilfs_construct_segment(sb);
247 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 248 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
248 kmem_cache_free(nilfs_transaction_cachep, ti); 249 kmem_cache_free(nilfs_transaction_cachep, ti);
250 sb_end_intwrite(sb);
249 return err; 251 return err;
250} 252}
251 253
@@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb)
264 current->journal_info = ti->ti_save; 266 current->journal_info = ti->ti_save;
265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 267 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
266 kmem_cache_free(nilfs_transaction_cachep, ti); 268 kmem_cache_free(nilfs_transaction_cachep, ti);
269 sb_end_intwrite(sb);
267} 270}
268 271
269void nilfs_relax_pressure_in_lock(struct super_block *sb) 272void nilfs_relax_pressure_in_lock(struct super_block *sb)
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6522cac6057c..6a10812711c1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -676,17 +676,13 @@ static const struct super_operations nilfs_sops = {
676 .alloc_inode = nilfs_alloc_inode, 676 .alloc_inode = nilfs_alloc_inode,
677 .destroy_inode = nilfs_destroy_inode, 677 .destroy_inode = nilfs_destroy_inode,
678 .dirty_inode = nilfs_dirty_inode, 678 .dirty_inode = nilfs_dirty_inode,
679 /* .write_inode = nilfs_write_inode, */
680 /* .drop_inode = nilfs_drop_inode, */
681 .evict_inode = nilfs_evict_inode, 679 .evict_inode = nilfs_evict_inode,
682 .put_super = nilfs_put_super, 680 .put_super = nilfs_put_super,
683 /* .write_super = nilfs_write_super, */
684 .sync_fs = nilfs_sync_fs, 681 .sync_fs = nilfs_sync_fs,
685 .freeze_fs = nilfs_freeze, 682 .freeze_fs = nilfs_freeze,
686 .unfreeze_fs = nilfs_unfreeze, 683 .unfreeze_fs = nilfs_unfreeze,
687 .statfs = nilfs_statfs, 684 .statfs = nilfs_statfs,
688 .remount_fs = nilfs_remount, 685 .remount_fs = nilfs_remount,
689 /* .umount_begin */
690 .show_options = nilfs_show_options 686 .show_options = nilfs_show_options
691}; 687};
692 688
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 6eee4177807b..be1267a34cea 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -107,8 +107,6 @@ struct the_nilfs {
107 * used for 107 * used for
108 * - loading the latest checkpoint exclusively. 108 * - loading the latest checkpoint exclusively.
109 * - allocating a new full segment. 109 * - allocating a new full segment.
110 * - protecting s_dirt in the super_block struct
111 * (see nilfs_write_super) and the following fields.
112 */ 110 */
113 struct buffer_head *ns_sbh[2]; 111 struct buffer_head *ns_sbh[2];
114 struct nilfs_super_block *ns_sbp[2]; 112 struct nilfs_super_block *ns_sbp[2];
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7389d2d5e51d..1ecf46448f85 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2084 if (err) 2084 if (err)
2085 return err; 2085 return err;
2086 pos = *ppos; 2086 pos = *ppos;
2087 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2088 /* We can write back this queue in page reclaim. */ 2087 /* We can write back this queue in page reclaim. */
2089 current->backing_dev_info = mapping->backing_dev_info; 2088 current->backing_dev_info = mapping->backing_dev_info;
2090 written = 0; 2089 written = 0;
@@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2119 2118
2120 BUG_ON(iocb->ki_pos != pos); 2119 BUG_ON(iocb->ki_pos != pos);
2121 2120
2121 sb_start_write(inode->i_sb);
2122 mutex_lock(&inode->i_mutex); 2122 mutex_lock(&inode->i_mutex);
2123 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2123 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2124 mutex_unlock(&inode->i_mutex); 2124 mutex_unlock(&inode->i_mutex);
@@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2127 if (err < 0) 2127 if (err < 0)
2128 ret = err; 2128 ret = err;
2129 } 2129 }
2130 sb_end_write(inode->i_sb);
2130 return ret; 2131 return ret;
2131} 2132}
2132 2133
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7602783d7f41..46a1f6d75104 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1971{ 1971{
1972 struct inode *inode = file->f_path.dentry->d_inode; 1972 struct inode *inode = file->f_path.dentry->d_inode;
1973 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1973 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1974 int ret;
1974 1975
1975 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1976 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1976 !ocfs2_writes_unwritten_extents(osb)) 1977 !ocfs2_writes_unwritten_extents(osb))
@@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1985 if (!(file->f_mode & FMODE_WRITE)) 1986 if (!(file->f_mode & FMODE_WRITE))
1986 return -EBADF; 1987 return -EBADF;
1987 1988
1988 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 ret = mnt_want_write_file(file);
1990 if (ret)
1991 return ret;
1992 ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1993 mnt_drop_write_file(file);
1994 return ret;
1989} 1995}
1990 1996
1991static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, 1997static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
@@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2261 if (iocb->ki_left == 0) 2267 if (iocb->ki_left == 0)
2262 return 0; 2268 return 0;
2263 2269
2264 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2270 sb_start_write(inode->i_sb);
2265 2271
2266 appending = file->f_flags & O_APPEND ? 1 : 0; 2272 appending = file->f_flags & O_APPEND ? 1 : 0;
2267 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2273 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
@@ -2436,6 +2442,7 @@ out_sems:
2436 ocfs2_iocb_clear_sem_locked(iocb); 2442 ocfs2_iocb_clear_sem_locked(iocb);
2437 2443
2438 mutex_unlock(&inode->i_mutex); 2444 mutex_unlock(&inode->i_mutex);
2445 sb_end_write(inode->i_sb);
2439 2446
2440 if (written) 2447 if (written)
2441 ret = written; 2448 ret = written;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d96f7f81d8dd..f20edcbfe700 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
928 if (get_user(new_clusters, (int __user *)arg)) 928 if (get_user(new_clusters, (int __user *)arg))
929 return -EFAULT; 929 return -EFAULT;
930 930
931 return ocfs2_group_extend(inode, new_clusters); 931 status = mnt_want_write_file(filp);
932 if (status)
933 return status;
934 status = ocfs2_group_extend(inode, new_clusters);
935 mnt_drop_write_file(filp);
936 return status;
932 case OCFS2_IOC_GROUP_ADD: 937 case OCFS2_IOC_GROUP_ADD:
933 case OCFS2_IOC_GROUP_ADD64: 938 case OCFS2_IOC_GROUP_ADD64:
934 if (!capable(CAP_SYS_RESOURCE)) 939 if (!capable(CAP_SYS_RESOURCE))
@@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
937 if (copy_from_user(&input, (int __user *) arg, sizeof(input))) 942 if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
938 return -EFAULT; 943 return -EFAULT;
939 944
940 return ocfs2_group_add(inode, &input); 945 status = mnt_want_write_file(filp);
946 if (status)
947 return status;
948 status = ocfs2_group_add(inode, &input);
949 mnt_drop_write_file(filp);
950 return status;
941 case OCFS2_IOC_REFLINK: 951 case OCFS2_IOC_REFLINK:
942 if (copy_from_user(&args, argp, sizeof(args))) 952 if (copy_from_user(&args, argp, sizeof(args)))
943 return -EFAULT; 953 return -EFAULT;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 0a42ae96dca7..2dd36af79e26 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
355 if (journal_current_handle()) 355 if (journal_current_handle())
356 return jbd2_journal_start(journal, max_buffs); 356 return jbd2_journal_start(journal, max_buffs);
357 357
358 sb_start_intwrite(osb->sb);
359
358 down_read(&osb->journal->j_trans_barrier); 360 down_read(&osb->journal->j_trans_barrier);
359 361
360 handle = jbd2_journal_start(journal, max_buffs); 362 handle = jbd2_journal_start(journal, max_buffs);
361 if (IS_ERR(handle)) { 363 if (IS_ERR(handle)) {
362 up_read(&osb->journal->j_trans_barrier); 364 up_read(&osb->journal->j_trans_barrier);
365 sb_end_intwrite(osb->sb);
363 366
364 mlog_errno(PTR_ERR(handle)); 367 mlog_errno(PTR_ERR(handle));
365 368
@@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
388 if (ret < 0) 391 if (ret < 0)
389 mlog_errno(ret); 392 mlog_errno(ret);
390 393
391 if (!nested) 394 if (!nested) {
392 up_read(&journal->j_trans_barrier); 395 up_read(&journal->j_trans_barrier);
396 sb_end_intwrite(osb->sb);
397 }
393 398
394 return ret; 399 return ret;
395} 400}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9cd41083e991..d150372fd81d 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
136 sigset_t oldset; 136 sigset_t oldset;
137 int ret; 137 int ret;
138 138
139 sb_start_pagefault(inode->i_sb);
139 ocfs2_block_signals(&oldset); 140 ocfs2_block_signals(&oldset);
140 141
141 /* 142 /*
@@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
165 166
166out: 167out:
167 ocfs2_unblock_signals(&oldset); 168 ocfs2_unblock_signals(&oldset);
169 sb_end_pagefault(inode->i_sb);
168 return ret; 170 return ret;
169} 171}
170 172
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9f32d7cbb7a3..30a055049e16 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4466,20 +4466,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
4466 goto out_dput; 4466 goto out_dput;
4467 } 4467 }
4468 4468
4469 error = mnt_want_write(new_path.mnt);
4470 if (error) {
4471 mlog_errno(error);
4472 goto out_dput;
4473 }
4474
4475 error = ocfs2_vfs_reflink(old_path.dentry, 4469 error = ocfs2_vfs_reflink(old_path.dentry,
4476 new_path.dentry->d_inode, 4470 new_path.dentry->d_inode,
4477 new_dentry, preserve); 4471 new_dentry, preserve);
4478 mnt_drop_write(new_path.mnt);
4479out_dput: 4472out_dput:
4480 dput(new_dentry); 4473 done_path_create(&new_path, new_dentry);
4481 mutex_unlock(&new_path.dentry->d_inode->i_mutex);
4482 path_put(&new_path);
4483out: 4474out:
4484 path_put(&old_path); 4475 path_put(&old_path);
4485 4476
diff --git a/fs/open.c b/fs/open.c
index 1e914b397e12..e1f2cdb91a4d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
164 if (IS_APPEND(inode)) 164 if (IS_APPEND(inode))
165 goto out_putf; 165 goto out_putf;
166 166
167 sb_start_write(inode->i_sb);
167 error = locks_verify_truncate(inode, file, length); 168 error = locks_verify_truncate(inode, file, length);
168 if (!error) 169 if (!error)
169 error = security_path_truncate(&file->f_path); 170 error = security_path_truncate(&file->f_path);
170 if (!error) 171 if (!error)
171 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 172 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
173 sb_end_write(inode->i_sb);
172out_putf: 174out_putf:
173 fput(file); 175 fput(file);
174out: 176out:
@@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
266 if (!file->f_op->fallocate) 268 if (!file->f_op->fallocate)
267 return -EOPNOTSUPP; 269 return -EOPNOTSUPP;
268 270
269 return file->f_op->fallocate(file, mode, offset, len); 271 sb_start_write(inode->i_sb);
272 ret = file->f_op->fallocate(file, mode, offset, len);
273 sb_end_write(inode->i_sb);
274 return ret;
270} 275}
271 276
272SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 277SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
@@ -620,7 +625,7 @@ static inline int __get_file_write_access(struct inode *inode,
620 /* 625 /*
621 * Balanced in __fput() 626 * Balanced in __fput()
622 */ 627 */
623 error = mnt_want_write(mnt); 628 error = __mnt_want_write(mnt);
624 if (error) 629 if (error)
625 put_write_access(inode); 630 put_write_access(inode);
626 } 631 }
@@ -654,6 +659,7 @@ static int do_dentry_open(struct file *f,
654 if (unlikely(f->f_flags & O_PATH)) 659 if (unlikely(f->f_flags & O_PATH))
655 f->f_mode = FMODE_PATH; 660 f->f_mode = FMODE_PATH;
656 661
662 path_get(&f->f_path);
657 inode = f->f_path.dentry->d_inode; 663 inode = f->f_path.dentry->d_inode;
658 if (f->f_mode & FMODE_WRITE) { 664 if (f->f_mode & FMODE_WRITE) {
659 error = __get_file_write_access(inode, f->f_path.mnt); 665 error = __get_file_write_access(inode, f->f_path.mnt);
@@ -711,7 +717,7 @@ cleanup_all:
711 * here, so just reset the state. 717 * here, so just reset the state.
712 */ 718 */
713 file_reset_write(f); 719 file_reset_write(f);
714 mnt_drop_write(f->f_path.mnt); 720 __mnt_drop_write(f->f_path.mnt);
715 } 721 }
716 } 722 }
717cleanup_file: 723cleanup_file:
@@ -739,9 +745,7 @@ int finish_open(struct file *file, struct dentry *dentry,
739 int error; 745 int error;
740 BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ 746 BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
741 747
742 mntget(file->f_path.mnt); 748 file->f_path.dentry = dentry;
743 file->f_path.dentry = dget(dentry);
744
745 error = do_dentry_open(file, open, current_cred()); 749 error = do_dentry_open(file, open, current_cred());
746 if (!error) 750 if (!error)
747 *opened |= FILE_OPENED; 751 *opened |= FILE_OPENED;
@@ -784,7 +788,6 @@ struct file *dentry_open(const struct path *path, int flags,
784 788
785 f->f_flags = flags; 789 f->f_flags = flags;
786 f->f_path = *path; 790 f->f_path = *path;
787 path_get(&f->f_path);
788 error = do_dentry_open(f, NULL, cred); 791 error = do_dentry_open(f, NULL, cred);
789 if (!error) { 792 if (!error) {
790 error = open_check_o_direct(f); 793 error = open_check_o_direct(f);
@@ -849,9 +852,10 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
849 int lookup_flags = 0; 852 int lookup_flags = 0;
850 int acc_mode; 853 int acc_mode;
851 854
852 if (!(flags & O_CREAT)) 855 if (flags & O_CREAT)
853 mode = 0; 856 op->mode = (mode & S_IALLUGO) | S_IFREG;
854 op->mode = mode; 857 else
858 op->mode = 0;
855 859
856 /* Must never be set by userspace */ 860 /* Must never be set by userspace */
857 flags &= ~FMODE_NONOTIFY; 861 flags &= ~FMODE_NONOTIFY;
diff --git a/fs/pipe.c b/fs/pipe.c
index 95cbd6b227e6..8d85d7068c1e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,18 +1016,16 @@ fail_inode:
1016 return NULL; 1016 return NULL;
1017} 1017}
1018 1018
1019struct file *create_write_pipe(int flags) 1019int create_pipe_files(struct file **res, int flags)
1020{ 1020{
1021 int err; 1021 int err;
1022 struct inode *inode; 1022 struct inode *inode = get_pipe_inode();
1023 struct file *f; 1023 struct file *f;
1024 struct path path; 1024 struct path path;
1025 struct qstr name = { .name = "" }; 1025 static struct qstr name = { .name = "" };
1026 1026
1027 err = -ENFILE;
1028 inode = get_pipe_inode();
1029 if (!inode) 1027 if (!inode)
1030 goto err; 1028 return -ENFILE;
1031 1029
1032 err = -ENOMEM; 1030 err = -ENOMEM;
1033 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); 1031 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
@@ -1041,62 +1039,43 @@ struct file *create_write_pipe(int flags)
1041 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); 1039 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
1042 if (!f) 1040 if (!f)
1043 goto err_dentry; 1041 goto err_dentry;
1044 f->f_mapping = inode->i_mapping;
1045 1042
1046 f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); 1043 f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
1047 f->f_version = 0;
1048 1044
1049 return f; 1045 res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
1046 if (!res[0])
1047 goto err_file;
1048
1049 path_get(&path);
1050 res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
1051 res[1] = f;
1052 return 0;
1050 1053
1051 err_dentry: 1054err_file:
1055 put_filp(f);
1056err_dentry:
1052 free_pipe_info(inode); 1057 free_pipe_info(inode);
1053 path_put(&path); 1058 path_put(&path);
1054 return ERR_PTR(err); 1059 return err;
1055 1060
1056 err_inode: 1061err_inode:
1057 free_pipe_info(inode); 1062 free_pipe_info(inode);
1058 iput(inode); 1063 iput(inode);
1059 err: 1064 return err;
1060 return ERR_PTR(err);
1061}
1062
1063void free_write_pipe(struct file *f)
1064{
1065 free_pipe_info(f->f_dentry->d_inode);
1066 path_put(&f->f_path);
1067 put_filp(f);
1068}
1069
1070struct file *create_read_pipe(struct file *wrf, int flags)
1071{
1072 /* Grab pipe from the writer */
1073 struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
1074 &read_pipefifo_fops);
1075 if (!f)
1076 return ERR_PTR(-ENFILE);
1077
1078 path_get(&wrf->f_path);
1079 f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
1080
1081 return f;
1082} 1065}
1083 1066
1084int do_pipe_flags(int *fd, int flags) 1067int do_pipe_flags(int *fd, int flags)
1085{ 1068{
1086 struct file *fw, *fr; 1069 struct file *files[2];
1087 int error; 1070 int error;
1088 int fdw, fdr; 1071 int fdw, fdr;
1089 1072
1090 if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) 1073 if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
1091 return -EINVAL; 1074 return -EINVAL;
1092 1075
1093 fw = create_write_pipe(flags); 1076 error = create_pipe_files(files, flags);
1094 if (IS_ERR(fw)) 1077 if (error)
1095 return PTR_ERR(fw); 1078 return error;
1096 fr = create_read_pipe(fw, flags);
1097 error = PTR_ERR(fr);
1098 if (IS_ERR(fr))
1099 goto err_write_pipe;
1100 1079
1101 error = get_unused_fd_flags(flags); 1080 error = get_unused_fd_flags(flags);
1102 if (error < 0) 1081 if (error < 0)
@@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags)
1109 fdw = error; 1088 fdw = error;
1110 1089
1111 audit_fd_pair(fdr, fdw); 1090 audit_fd_pair(fdr, fdw);
1112 fd_install(fdr, fr); 1091 fd_install(fdr, files[0]);
1113 fd_install(fdw, fw); 1092 fd_install(fdw, files[1]);
1114 fd[0] = fdr; 1093 fd[0] = fdr;
1115 fd[1] = fdw; 1094 fd[1] = fdw;
1116 1095
@@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags)
1119 err_fdr: 1098 err_fdr:
1120 put_unused_fd(fdr); 1099 put_unused_fd(fdr);
1121 err_read_pipe: 1100 err_read_pipe:
1122 path_put(&fr->f_path); 1101 fput(files[0]);
1123 put_filp(fr); 1102 fput(files[1]);
1124 err_write_pipe:
1125 free_write_pipe(fw);
1126 return error; 1103 return error;
1127} 1104}
1128 1105
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 36a29b753c79..c495a3055e2a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1589,10 +1589,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1589 goto out; 1589 goto out;
1590 } 1590 }
1591 1591
1592 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1593 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1592 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1594 warn[cnt].w_type = QUOTA_NL_NOWARN; 1593 warn[cnt].w_type = QUOTA_NL_NOWARN;
1595 1594
1595 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1596 spin_lock(&dq_data_lock); 1596 spin_lock(&dq_data_lock);
1597 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1597 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1598 if (!dquots[cnt]) 1598 if (!dquots[cnt])
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4c0c7d163d15..a98b7740a0fc 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1334,9 +1334,7 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1334 else if (bitmap == 0) 1334 else if (bitmap == 0)
1335 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; 1335 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
1336 1336
1337 reiserfs_write_unlock(sb);
1338 bh = sb_bread(sb, block); 1337 bh = sb_bread(sb, block);
1339 reiserfs_write_lock(sb);
1340 if (bh == NULL) 1338 if (bh == NULL)
1341 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " 1339 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
1342 "reading failed", __func__, block); 1340 "reading failed", __func__, block);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6d4268fb6c1..855da58db145 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -76,10 +76,10 @@ void reiserfs_evict_inode(struct inode *inode)
76 ; 76 ;
77 } 77 }
78 out: 78 out:
79 reiserfs_write_unlock_once(inode->i_sb, depth);
79 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
80 dquot_drop(inode); 81 dquot_drop(inode);
81 inode->i_blocks = 0; 82 inode->i_blocks = 0;
82 reiserfs_write_unlock_once(inode->i_sb, depth);
83 return; 83 return;
84 84
85no_delete: 85no_delete:
diff --git a/fs/splice.c b/fs/splice.c
index 7bf08fa22ec9..41514dd89462 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
996 }; 996 };
997 ssize_t ret; 997 ssize_t ret;
998 998
999 sb_start_write(inode->i_sb);
1000
999 pipe_lock(pipe); 1001 pipe_lock(pipe);
1000 1002
1001 splice_from_pipe_begin(&sd); 1003 splice_from_pipe_begin(&sd);
@@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1034 *ppos += ret; 1036 *ppos += ret;
1035 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1037 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
1036 } 1038 }
1039 sb_end_write(inode->i_sb);
1037 1040
1038 return ret; 1041 return ret;
1039} 1042}
diff --git a/fs/super.c b/fs/super.c
index 4bf714459a4b..0902cfa6a12e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h> 34#include <linux/cleancache.h>
35#include <linux/fsnotify.h> 35#include <linux/fsnotify.h>
36#include <linux/lockdep.h>
36#include "internal.h" 37#include "internal.h"
37 38
38 39
39LIST_HEAD(super_blocks); 40LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 41DEFINE_SPINLOCK(sb_lock);
41 42
43static char *sb_writers_name[SB_FREEZE_LEVELS] = {
44 "sb_writers",
45 "sb_pagefaults",
46 "sb_internal",
47};
48
42/* 49/*
43 * One thing we have to be careful of with a per-sb shrinker is that we don't 50 * One thing we have to be careful of with a per-sb shrinker is that we don't
44 * drop the last active reference to the superblock from within the shrinker. 51 * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
102 return total_objects; 109 return total_objects;
103} 110}
104 111
112static int init_sb_writers(struct super_block *s, struct file_system_type *type)
113{
114 int err;
115 int i;
116
117 for (i = 0; i < SB_FREEZE_LEVELS; i++) {
118 err = percpu_counter_init(&s->s_writers.counter[i], 0);
119 if (err < 0)
120 goto err_out;
121 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
122 &type->s_writers_key[i], 0);
123 }
124 init_waitqueue_head(&s->s_writers.wait);
125 init_waitqueue_head(&s->s_writers.wait_unfrozen);
126 return 0;
127err_out:
128 while (--i >= 0)
129 percpu_counter_destroy(&s->s_writers.counter[i]);
130 return err;
131}
132
133static void destroy_sb_writers(struct super_block *s)
134{
135 int i;
136
137 for (i = 0; i < SB_FREEZE_LEVELS; i++)
138 percpu_counter_destroy(&s->s_writers.counter[i]);
139}
140
105/** 141/**
106 * alloc_super - create new superblock 142 * alloc_super - create new superblock
107 * @type: filesystem type superblock should belong to 143 * @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
117 153
118 if (s) { 154 if (s) {
119 if (security_sb_alloc(s)) { 155 if (security_sb_alloc(s)) {
156 /*
157 * We cannot call security_sb_free() without
158 * security_sb_alloc() succeeding. So bail out manually
159 */
120 kfree(s); 160 kfree(s);
121 s = NULL; 161 s = NULL;
122 goto out; 162 goto out;
123 } 163 }
124#ifdef CONFIG_SMP 164#ifdef CONFIG_SMP
125 s->s_files = alloc_percpu(struct list_head); 165 s->s_files = alloc_percpu(struct list_head);
126 if (!s->s_files) { 166 if (!s->s_files)
127 security_sb_free(s); 167 goto err_out;
128 kfree(s); 168 else {
129 s = NULL;
130 goto out;
131 } else {
132 int i; 169 int i;
133 170
134 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
137#else 174#else
138 INIT_LIST_HEAD(&s->s_files); 175 INIT_LIST_HEAD(&s->s_files);
139#endif 176#endif
177 if (init_sb_writers(s, type))
178 goto err_out;
140 s->s_flags = flags; 179 s->s_flags = flags;
141 s->s_bdi = &default_backing_dev_info; 180 s->s_bdi = &default_backing_dev_info;
142 INIT_HLIST_NODE(&s->s_instances); 181 INIT_HLIST_NODE(&s->s_instances);
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
178 mutex_init(&s->s_dquot.dqio_mutex); 217 mutex_init(&s->s_dquot.dqio_mutex);
179 mutex_init(&s->s_dquot.dqonoff_mutex); 218 mutex_init(&s->s_dquot.dqonoff_mutex);
180 init_rwsem(&s->s_dquot.dqptr_sem); 219 init_rwsem(&s->s_dquot.dqptr_sem);
181 init_waitqueue_head(&s->s_wait_unfrozen);
182 s->s_maxbytes = MAX_NON_LFS; 220 s->s_maxbytes = MAX_NON_LFS;
183 s->s_op = &default_op; 221 s->s_op = &default_op;
184 s->s_time_gran = 1000000000; 222 s->s_time_gran = 1000000000;
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
190 } 228 }
191out: 229out:
192 return s; 230 return s;
231err_out:
232 security_sb_free(s);
233#ifdef CONFIG_SMP
234 if (s->s_files)
235 free_percpu(s->s_files);
236#endif
237 destroy_sb_writers(s);
238 kfree(s);
239 s = NULL;
240 goto out;
193} 241}
194 242
195/** 243/**
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)
203#ifdef CONFIG_SMP 251#ifdef CONFIG_SMP
204 free_percpu(s->s_files); 252 free_percpu(s->s_files);
205#endif 253#endif
254 destroy_sb_writers(s);
206 security_sb_free(s); 255 security_sb_free(s);
207 WARN_ON(!list_empty(&s->s_mounts)); 256 WARN_ON(!list_empty(&s->s_mounts));
208 kfree(s->s_subtype); 257 kfree(s->s_subtype);
@@ -488,46 +537,6 @@ void drop_super(struct super_block *sb)
488EXPORT_SYMBOL(drop_super); 537EXPORT_SYMBOL(drop_super);
489 538
490/** 539/**
491 * sync_supers - helper for periodic superblock writeback
492 *
493 * Call the write_super method if present on all dirty superblocks in
494 * the system. This is for the periodic writeback used by most older
495 * filesystems. For data integrity superblock writeback use
496 * sync_filesystems() instead.
497 *
498 * Note: check the dirty flag before waiting, so we don't
499 * hold up the sync while mounting a device. (The newly
500 * mounted device won't need syncing.)
501 */
502void sync_supers(void)
503{
504 struct super_block *sb, *p = NULL;
505
506 spin_lock(&sb_lock);
507 list_for_each_entry(sb, &super_blocks, s_list) {
508 if (hlist_unhashed(&sb->s_instances))
509 continue;
510 if (sb->s_op->write_super && sb->s_dirt) {
511 sb->s_count++;
512 spin_unlock(&sb_lock);
513
514 down_read(&sb->s_umount);
515 if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN))
516 sb->s_op->write_super(sb);
517 up_read(&sb->s_umount);
518
519 spin_lock(&sb_lock);
520 if (p)
521 __put_super(p);
522 p = sb;
523 }
524 }
525 if (p)
526 __put_super(p);
527 spin_unlock(&sb_lock);
528}
529
530/**
531 * iterate_supers - call function for all active superblocks 540 * iterate_supers - call function for all active superblocks
532 * @f: function to call 541 * @f: function to call
533 * @arg: argument to pass to it 542 * @arg: argument to pass to it
@@ -651,10 +660,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
651{ 660{
652 while (1) { 661 while (1) {
653 struct super_block *s = get_super(bdev); 662 struct super_block *s = get_super(bdev);
654 if (!s || s->s_frozen == SB_UNFROZEN) 663 if (!s || s->s_writers.frozen == SB_UNFROZEN)
655 return s; 664 return s;
656 up_read(&s->s_umount); 665 up_read(&s->s_umount);
657 vfs_check_frozen(s, SB_FREEZE_WRITE); 666 wait_event(s->s_writers.wait_unfrozen,
667 s->s_writers.frozen == SB_UNFROZEN);
658 put_super(s); 668 put_super(s);
659 } 669 }
660} 670}
@@ -732,7 +742,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
732 int retval; 742 int retval;
733 int remount_ro; 743 int remount_ro;
734 744
735 if (sb->s_frozen != SB_UNFROZEN) 745 if (sb->s_writers.frozen != SB_UNFROZEN)
736 return -EBUSY; 746 return -EBUSY;
737 747
738#ifdef CONFIG_BLOCK 748#ifdef CONFIG_BLOCK
@@ -1163,6 +1173,120 @@ out:
1163 return ERR_PTR(error); 1173 return ERR_PTR(error);
1164} 1174}
1165 1175
1176/*
1177 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
1178 * instead.
1179 */
1180void __sb_end_write(struct super_block *sb, int level)
1181{
1182 percpu_counter_dec(&sb->s_writers.counter[level-1]);
1183 /*
1184 * Make sure s_writers are updated before we wake up waiters in
1185 * freeze_super().
1186 */
1187 smp_mb();
1188 if (waitqueue_active(&sb->s_writers.wait))
1189 wake_up(&sb->s_writers.wait);
1190 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
1191}
1192EXPORT_SYMBOL(__sb_end_write);
1193
1194#ifdef CONFIG_LOCKDEP
1195/*
1196 * We want lockdep to tell us about possible deadlocks with freezing but
1197 * it's a bit tricky to properly instrument it. Getting a freeze protection
1198 * works as getting a read lock but there are subtle problems. XFS for example
1199 * gets freeze protection on internal level twice in some cases, which is OK
1200 * only because we already hold a freeze protection also on higher level. Due
1201 * to these cases we have to tell lockdep we are doing trylock when we
1202 * already hold a freeze protection for a higher freeze level.
1203 */
1204static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1205 unsigned long ip)
1206{
1207 int i;
1208
1209 if (!trylock) {
1210 for (i = 0; i < level - 1; i++)
1211 if (lock_is_held(&sb->s_writers.lock_map[i])) {
1212 trylock = true;
1213 break;
1214 }
1215 }
1216 rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1217}
1218#endif
1219
1220/*
1221 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
1222 * instead.
1223 */
1224int __sb_start_write(struct super_block *sb, int level, bool wait)
1225{
1226retry:
1227 if (unlikely(sb->s_writers.frozen >= level)) {
1228 if (!wait)
1229 return 0;
1230 wait_event(sb->s_writers.wait_unfrozen,
1231 sb->s_writers.frozen < level);
1232 }
1233
1234#ifdef CONFIG_LOCKDEP
1235 acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1236#endif
1237 percpu_counter_inc(&sb->s_writers.counter[level-1]);
1238 /*
1239 * Make sure counter is updated before we check for frozen.
1240 * freeze_super() first sets frozen and then checks the counter.
1241 */
1242 smp_mb();
1243 if (unlikely(sb->s_writers.frozen >= level)) {
1244 __sb_end_write(sb, level);
1245 goto retry;
1246 }
1247 return 1;
1248}
1249EXPORT_SYMBOL(__sb_start_write);
1250
1251/**
1252 * sb_wait_write - wait until all writers to given file system finish
1253 * @sb: the super for which we wait
1254 * @level: type of writers we wait for (normal vs page fault)
1255 *
1256 * This function waits until there are no writers of given type to given file
1257 * system. Caller of this function should make sure there can be no new writers
1258 * of type @level before calling this function. Otherwise this function can
1259 * livelock.
1260 */
1261static void sb_wait_write(struct super_block *sb, int level)
1262{
1263 s64 writers;
1264
1265 /*
1266 * We just cycle-through lockdep here so that it does not complain
1267 * about returning with lock to userspace
1268 */
1269 rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
1270 rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
1271
1272 do {
1273 DEFINE_WAIT(wait);
1274
1275 /*
1276 * We use a barrier in prepare_to_wait() to separate setting
1277 * of frozen and checking of the counter
1278 */
1279 prepare_to_wait(&sb->s_writers.wait, &wait,
1280 TASK_UNINTERRUPTIBLE);
1281
1282 writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
1283 if (writers)
1284 schedule();
1285
1286 finish_wait(&sb->s_writers.wait, &wait);
1287 } while (writers);
1288}
1289
1166/** 1290/**
1167 * freeze_super - lock the filesystem and force it into a consistent state 1291 * freeze_super - lock the filesystem and force it into a consistent state
1168 * @sb: the super to lock 1292 * @sb: the super to lock
@@ -1170,6 +1294,31 @@ out:
1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1294 * Syncs the super to make sure the filesystem is consistent and calls the fs's
1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1295 * freeze_fs. Subsequent calls to this without first thawing the fs will return
1172 * -EBUSY. 1296 * -EBUSY.
1297 *
1298 * During this function, sb->s_writers.frozen goes through these values:
1299 *
1300 * SB_UNFROZEN: File system is normal, all writes progress as usual.
1301 *
1302 * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
1303 * writes should be blocked, though page faults are still allowed. We wait for
1304 * all writes to complete and then proceed to the next stage.
1305 *
1306 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
1307 * but internal fs threads can still modify the filesystem (although they
1308 * should not dirty new pages or inodes), writeback can run etc. After waiting
1309 * for all running page faults we sync the filesystem which will clean all
1310 * dirty pages and inodes (no new dirty pages or inodes can be created when
1311 * sync is running).
1312 *
1313 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
1314 * modification are blocked (e.g. XFS preallocation truncation on inode
1315 * reclaim). This is usually implemented by blocking new transactions for
1316 * filesystems that have them and need this additional guard. After all
1317 * internal writers are finished we call ->freeze_fs() to finish filesystem
1318 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
1319 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
1320 *
1321 * sb->s_writers.frozen is protected by sb->s_umount.
1173 */ 1322 */
1174int freeze_super(struct super_block *sb) 1323int freeze_super(struct super_block *sb)
1175{ 1324{
@@ -1177,7 +1326,7 @@ int freeze_super(struct super_block *sb)
1177 1326
1178 atomic_inc(&sb->s_active); 1327 atomic_inc(&sb->s_active);
1179 down_write(&sb->s_umount); 1328 down_write(&sb->s_umount);
1180 if (sb->s_frozen) { 1329 if (sb->s_writers.frozen != SB_UNFROZEN) {
1181 deactivate_locked_super(sb); 1330 deactivate_locked_super(sb);
1182 return -EBUSY; 1331 return -EBUSY;
1183 } 1332 }
@@ -1188,33 +1337,53 @@ int freeze_super(struct super_block *sb)
1188 } 1337 }
1189 1338
1190 if (sb->s_flags & MS_RDONLY) { 1339 if (sb->s_flags & MS_RDONLY) {
1191 sb->s_frozen = SB_FREEZE_TRANS; 1340 /* Nothing to do really... */
1192 smp_wmb(); 1341 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1193 up_write(&sb->s_umount); 1342 up_write(&sb->s_umount);
1194 return 0; 1343 return 0;
1195 } 1344 }
1196 1345
1197 sb->s_frozen = SB_FREEZE_WRITE; 1346 /* From now on, no new normal writers can start */
1347 sb->s_writers.frozen = SB_FREEZE_WRITE;
1198 smp_wmb(); 1348 smp_wmb();
1199 1349
1350 /* Release s_umount to preserve sb_start_write -> s_umount ordering */
1351 up_write(&sb->s_umount);
1352
1353 sb_wait_write(sb, SB_FREEZE_WRITE);
1354
1355 /* Now we go and block page faults... */
1356 down_write(&sb->s_umount);
1357 sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
1358 smp_wmb();
1359
1360 sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
1361
1362 /* All writers are done so after syncing there won't be dirty data */
1200 sync_filesystem(sb); 1363 sync_filesystem(sb);
1201 1364
1202 sb->s_frozen = SB_FREEZE_TRANS; 1365 /* Now wait for internal filesystem counter */
1366 sb->s_writers.frozen = SB_FREEZE_FS;
1203 smp_wmb(); 1367 smp_wmb();
1368 sb_wait_write(sb, SB_FREEZE_FS);
1204 1369
1205 sync_blockdev(sb->s_bdev);
1206 if (sb->s_op->freeze_fs) { 1370 if (sb->s_op->freeze_fs) {
1207 ret = sb->s_op->freeze_fs(sb); 1371 ret = sb->s_op->freeze_fs(sb);
1208 if (ret) { 1372 if (ret) {
1209 printk(KERN_ERR 1373 printk(KERN_ERR
1210 "VFS:Filesystem freeze failed\n"); 1374 "VFS:Filesystem freeze failed\n");
1211 sb->s_frozen = SB_UNFROZEN; 1375 sb->s_writers.frozen = SB_UNFROZEN;
1212 smp_wmb(); 1376 smp_wmb();
1213 wake_up(&sb->s_wait_unfrozen); 1377 wake_up(&sb->s_writers.wait_unfrozen);
1214 deactivate_locked_super(sb); 1378 deactivate_locked_super(sb);
1215 return ret; 1379 return ret;
1216 } 1380 }
1217 } 1381 }
1382 /*
1383 * This is just for debugging purposes so that fs can warn if it
1384 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
1385 */
1386 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1218 up_write(&sb->s_umount); 1387 up_write(&sb->s_umount);
1219 return 0; 1388 return 0;
1220} 1389}
@@ -1231,7 +1400,7 @@ int thaw_super(struct super_block *sb)
1231 int error; 1400 int error;
1232 1401
1233 down_write(&sb->s_umount); 1402 down_write(&sb->s_umount);
1234 if (sb->s_frozen == SB_UNFROZEN) { 1403 if (sb->s_writers.frozen == SB_UNFROZEN) {
1235 up_write(&sb->s_umount); 1404 up_write(&sb->s_umount);
1236 return -EINVAL; 1405 return -EINVAL;
1237 } 1406 }
@@ -1244,16 +1413,15 @@ int thaw_super(struct super_block *sb)
1244 if (error) { 1413 if (error) {
1245 printk(KERN_ERR 1414 printk(KERN_ERR
1246 "VFS:Filesystem thaw failed\n"); 1415 "VFS:Filesystem thaw failed\n");
1247 sb->s_frozen = SB_FREEZE_TRANS;
1248 up_write(&sb->s_umount); 1416 up_write(&sb->s_umount);
1249 return error; 1417 return error;
1250 } 1418 }
1251 } 1419 }
1252 1420
1253out: 1421out:
1254 sb->s_frozen = SB_UNFROZEN; 1422 sb->s_writers.frozen = SB_UNFROZEN;
1255 smp_wmb(); 1423 smp_wmb();
1256 wake_up(&sb->s_wait_unfrozen); 1424 wake_up(&sb->s_writers.wait_unfrozen);
1257 deactivate_locked_super(sb); 1425 deactivate_locked_super(sb);
1258 1426
1259 return 0; 1427 return 0;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a4759833d62d..614b2b544880 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
228 ret = 0; 228 ret = 0;
229 if (bb->vm_ops->page_mkwrite) 229 if (bb->vm_ops->page_mkwrite)
230 ret = bb->vm_ops->page_mkwrite(vma, vmf); 230 ret = bb->vm_ops->page_mkwrite(vma, vmf);
231 else
232 file_update_time(file);
231 233
232 sysfs_put_active(attr_sd); 234 sysfs_put_active(attr_sd);
233 return ret; 235 return ret;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8b8cc4e945f4..760de723dadb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -167,7 +167,7 @@ struct ubifs_global_debug_info {
167#define ubifs_dbg_msg(type, fmt, ...) \ 167#define ubifs_dbg_msg(type, fmt, ...) \
168 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) 168 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
169 169
170#define DBG_KEY_BUF_LEN 32 170#define DBG_KEY_BUF_LEN 48
171#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ 171#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
172 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ 172 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
173 pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \ 173 pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 35389ca2d267..7bd6e72afd11 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -37,11 +37,11 @@
37 * 37 *
38 * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we 38 * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
39 * implement. However, this is not true for 'ubifs_writepage()', which may be 39 * implement. However, this is not true for 'ubifs_writepage()', which may be
40 * called with @i_mutex unlocked. For example, when pdflush is doing background 40 * called with @i_mutex unlocked. For example, when flusher thread is doing
41 * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal" 41 * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex.
42 * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the 42 * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g.
43 * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()' 43 * in the "sys_write -> alloc_pages -> direct reclaim path". So, in
44 * we are only guaranteed that the page is locked. 44 * 'ubifs_writepage()' we are only guaranteed that the page is locked.
45 * 45 *
46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the 46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> 47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ce33b2beb151..8640920766ed 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1749,7 +1749,10 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
1749 return 0; 1749 return 0;
1750 1750
1751out_err: 1751out_err:
1752 ubifs_lpt_free(c, 0); 1752 if (wr)
1753 ubifs_lpt_free(c, 1);
1754 if (rd)
1755 ubifs_lpt_free(c, 0);
1753 return err; 1756 return err;
1754} 1757}
1755 1758
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c30d976b4be8..edeec499c048 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -788,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
788 788
789corrupted_rescan: 789corrupted_rescan:
790 /* Re-scan the corrupted data with verbose messages */ 790 /* Re-scan the corrupted data with verbose messages */
791 ubifs_err("corruptio %d", ret); 791 ubifs_err("corruption %d", ret);
792 ubifs_scan_a_node(c, buf, len, lnum, offs, 1); 792 ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
793corrupted: 793corrupted:
794 ubifs_scanned_corruption(c, lnum, offs, buf); 794 ubifs_scanned_corruption(c, lnum, offs, buf);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index eba46d4a7619..94d78fc5d4e0 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1026,7 +1026,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
1026 c->replaying = 1; 1026 c->replaying = 1;
1027 lnum = c->ltail_lnum = c->lhead_lnum; 1027 lnum = c->ltail_lnum = c->lhead_lnum;
1028 1028
1029 lnum = UBIFS_LOG_LNUM;
1030 do { 1029 do {
1031 err = replay_log_leb(c, lnum, 0, c->sbuf); 1030 err = replay_log_leb(c, lnum, 0, c->sbuf);
1032 if (err == 1) 1031 if (err == 1)
@@ -1035,7 +1034,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
1035 if (err) 1034 if (err)
1036 goto out; 1035 goto out;
1037 lnum = ubifs_next_log_lnum(c, lnum); 1036 lnum = ubifs_next_log_lnum(c, lnum);
1038 } while (lnum != UBIFS_LOG_LNUM); 1037 } while (lnum != c->ltail_lnum);
1039 1038
1040 err = replay_buds(c); 1039 err = replay_buds(c);
1041 if (err) 1040 if (err)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1c766c39c038..71a197f0f93d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -303,7 +303,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
303 mutex_lock(&ui->ui_mutex); 303 mutex_lock(&ui->ui_mutex);
304 /* 304 /*
305 * Due to races between write-back forced by budgeting 305 * Due to races between write-back forced by budgeting
306 * (see 'sync_some_inodes()') and pdflush write-back, the inode may 306 * (see 'sync_some_inodes()') and background write-back, the inode may
307 * have already been synchronized, do not do this again. This might 307 * have already been synchronized, do not do this again. This might
308 * also happen if it was synchronized in an VFS operation, e.g. 308 * also happen if it was synchronized in an VFS operation, e.g.
309 * 'ubifs_link()'. 309 * 'ubifs_link()'.
@@ -1157,9 +1157,6 @@ static int check_free_space(struct ubifs_info *c)
1157 * 1157 *
1158 * This function mounts UBIFS file system. Returns zero in case of success and 1158 * This function mounts UBIFS file system. Returns zero in case of success and
1159 * a negative error code in case of failure. 1159 * a negative error code in case of failure.
1160 *
1161 * Note, the function does not de-allocate resources it it fails half way
1162 * through, and the caller has to do this instead.
1163 */ 1160 */
1164static int mount_ubifs(struct ubifs_info *c) 1161static int mount_ubifs(struct ubifs_info *c)
1165{ 1162{
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7f3f7ba3df6e..d1c6093fd3d3 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -39,20 +39,24 @@
39#include "udf_i.h" 39#include "udf_i.h"
40#include "udf_sb.h" 40#include "udf_sb.h"
41 41
42static int udf_adinicb_readpage(struct file *file, struct page *page) 42static void __udf_adinicb_readpage(struct page *page)
43{ 43{
44 struct inode *inode = page->mapping->host; 44 struct inode *inode = page->mapping->host;
45 char *kaddr; 45 char *kaddr;
46 struct udf_inode_info *iinfo = UDF_I(inode); 46 struct udf_inode_info *iinfo = UDF_I(inode);
47 47
48 BUG_ON(!PageLocked(page));
49
50 kaddr = kmap(page); 48 kaddr = kmap(page);
51 memset(kaddr, 0, PAGE_CACHE_SIZE);
52 memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); 49 memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
50 memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
53 flush_dcache_page(page); 51 flush_dcache_page(page);
54 SetPageUptodate(page); 52 SetPageUptodate(page);
55 kunmap(page); 53 kunmap(page);
54}
55
56static int udf_adinicb_readpage(struct file *file, struct page *page)
57{
58 BUG_ON(!PageLocked(page));
59 __udf_adinicb_readpage(page);
56 unlock_page(page); 60 unlock_page(page);
57 61
58 return 0; 62 return 0;
@@ -77,6 +81,25 @@ static int udf_adinicb_writepage(struct page *page,
77 return 0; 81 return 0;
78} 82}
79 83
84static int udf_adinicb_write_begin(struct file *file,
85 struct address_space *mapping, loff_t pos,
86 unsigned len, unsigned flags, struct page **pagep,
87 void **fsdata)
88{
89 struct page *page;
90
91 if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
92 return -EIO;
93 page = grab_cache_page_write_begin(mapping, 0, flags);
94 if (!page)
95 return -ENOMEM;
96 *pagep = page;
97
98 if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
99 __udf_adinicb_readpage(page);
100 return 0;
101}
102
80static int udf_adinicb_write_end(struct file *file, 103static int udf_adinicb_write_end(struct file *file,
81 struct address_space *mapping, 104 struct address_space *mapping,
82 loff_t pos, unsigned len, unsigned copied, 105 loff_t pos, unsigned len, unsigned copied,
@@ -98,8 +121,8 @@ static int udf_adinicb_write_end(struct file *file,
98const struct address_space_operations udf_adinicb_aops = { 121const struct address_space_operations udf_adinicb_aops = {
99 .readpage = udf_adinicb_readpage, 122 .readpage = udf_adinicb_readpage,
100 .writepage = udf_adinicb_writepage, 123 .writepage = udf_adinicb_writepage,
101 .write_begin = simple_write_begin, 124 .write_begin = udf_adinicb_write_begin,
102 .write_end = udf_adinicb_write_end, 125 .write_end = udf_adinicb_write_end,
103}; 126};
104 127
105static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 128static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fafaad795cd6..aa233469b3c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1124,14 +1124,17 @@ int udf_setsize(struct inode *inode, loff_t newsize)
1124 if (err) 1124 if (err)
1125 return err; 1125 return err;
1126 down_write(&iinfo->i_data_sem); 1126 down_write(&iinfo->i_data_sem);
1127 } else 1127 } else {
1128 iinfo->i_lenAlloc = newsize; 1128 iinfo->i_lenAlloc = newsize;
1129 goto set_size;
1130 }
1129 } 1131 }
1130 err = udf_extend_file(inode, newsize); 1132 err = udf_extend_file(inode, newsize);
1131 if (err) { 1133 if (err) {
1132 up_write(&iinfo->i_data_sem); 1134 up_write(&iinfo->i_data_sem);
1133 return err; 1135 return err;
1134 } 1136 }
1137set_size:
1135 truncate_setsize(inode, newsize); 1138 truncate_setsize(inode, newsize);
1136 up_write(&iinfo->i_data_sem); 1139 up_write(&iinfo->i_data_sem);
1137 } else { 1140 } else {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index dcbf98722afc..18fc038a438d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1344,6 +1344,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1344 udf_err(sb, "error loading logical volume descriptor: " 1344 udf_err(sb, "error loading logical volume descriptor: "
1345 "Partition table too long (%u > %lu)\n", table_len, 1345 "Partition table too long (%u > %lu)\n", table_len,
1346 sb->s_blocksize - sizeof(*lvd)); 1346 sb->s_blocksize - sizeof(*lvd));
1347 ret = 1;
1347 goto out_bh; 1348 goto out_bh;
1348 } 1349 }
1349 1350
@@ -1388,8 +1389,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1388 UDF_ID_SPARABLE, 1389 UDF_ID_SPARABLE,
1389 strlen(UDF_ID_SPARABLE))) { 1390 strlen(UDF_ID_SPARABLE))) {
1390 if (udf_load_sparable_map(sb, map, 1391 if (udf_load_sparable_map(sb, map,
1391 (struct sparablePartitionMap *)gpm) < 0) 1392 (struct sparablePartitionMap *)gpm) < 0) {
1393 ret = 1;
1392 goto out_bh; 1394 goto out_bh;
1395 }
1393 } else if (!strncmp(upm2->partIdent.ident, 1396 } else if (!strncmp(upm2->partIdent.ident,
1394 UDF_ID_METADATA, 1397 UDF_ID_METADATA,
1395 strlen(UDF_ID_METADATA))) { 1398 strlen(UDF_ID_METADATA))) {
@@ -2000,6 +2003,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2000 if (!silent) 2003 if (!silent)
2001 pr_notice("Rescanning with blocksize %d\n", 2004 pr_notice("Rescanning with blocksize %d\n",
2002 UDF_DEFAULT_BLOCKSIZE); 2005 UDF_DEFAULT_BLOCKSIZE);
2006 brelse(sbi->s_lvid_bh);
2007 sbi->s_lvid_bh = NULL;
2003 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; 2008 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
2004 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 2009 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
2005 } 2010 }
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 15052ff916ec..e562dd43f41f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc(
124 ioend->io_append_trans = tp; 124 ioend->io_append_trans = tp;
125 125
126 /* 126 /*
127 * We will pass freeze protection with a transaction. So tell lockdep
128 * we released it.
129 */
130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
131 1, _THIS_IP_);
132 /*
127 * We hand off the transaction to the completion thread now, so 133 * We hand off the transaction to the completion thread now, so
128 * clear the flag here. 134 * clear the flag here.
129 */ 135 */
@@ -199,6 +205,15 @@ xfs_end_io(
199 struct xfs_inode *ip = XFS_I(ioend->io_inode); 205 struct xfs_inode *ip = XFS_I(ioend->io_inode);
200 int error = 0; 206 int error = 0;
201 207
208 if (ioend->io_append_trans) {
209 /*
210 * We've got freeze protection passed with the transaction.
211 * Tell lockdep about it.
212 */
213 rwsem_acquire_read(
214 &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215 0, 1, _THIS_IP_);
216 }
202 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
203 ioend->io_error = -EIO; 218 ioend->io_error = -EIO;
204 goto done; 219 goto done;
@@ -1425,6 +1440,9 @@ out_trans_cancel:
1425 if (ioend->io_append_trans) { 1440 if (ioend->io_append_trans) {
1426 current_set_flags_nested(&ioend->io_append_trans->t_pflags, 1441 current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1427 PF_FSTRANS); 1442 PF_FSTRANS);
1443 rwsem_acquire_read(
1444 &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1445 0, 1, _THIS_IP_);
1428 xfs_trans_cancel(ioend->io_append_trans, 0); 1446 xfs_trans_cancel(ioend->io_append_trans, 0);
1429 } 1447 }
1430out_destroy_ioend: 1448out_destroy_ioend:
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index f9c3fe304a17..69cf4fcde03e 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -179,12 +179,14 @@ xfs_ioc_trim(
179 * used by the fstrim application. In the end it really doesn't 179 * used by the fstrim application. In the end it really doesn't
180 * matter as trimming blocks is an advisory interface. 180 * matter as trimming blocks is an advisory interface.
181 */ 181 */
182 if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
183 range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
184 return -XFS_ERROR(EINVAL);
185
182 start = BTOBB(range.start); 186 start = BTOBB(range.start);
183 end = start + BTOBBT(range.len) - 1; 187 end = start + BTOBBT(range.len) - 1;
184 minlen = BTOBB(max_t(u64, granularity, range.minlen)); 188 minlen = BTOBB(max_t(u64, granularity, range.minlen));
185 189
186 if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
187 return -XFS_ERROR(EINVAL);
188 if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) 190 if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
189 end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; 191 end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
190 192
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4559c6e6f2c..56afcdb2377d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -770,10 +770,12 @@ xfs_file_aio_write(
770 if (ocount == 0) 770 if (ocount == 0)
771 return 0; 771 return 0;
772 772
773 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); 773 sb_start_write(inode->i_sb);
774 774
775 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 775 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
776 return -EIO; 776 ret = -EIO;
777 goto out;
778 }
777 779
778 if (unlikely(file->f_flags & O_DIRECT)) 780 if (unlikely(file->f_flags & O_DIRECT))
779 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); 781 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
@@ -792,6 +794,8 @@ xfs_file_aio_write(
792 ret = err; 794 ret = err;
793 } 795 }
794 796
797out:
798 sb_end_write(inode->i_sb);
795 return ret; 799 return ret;
796} 800}
797 801
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 21e37b55f7e5..5aceb3f8ecd6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -962,23 +962,22 @@ xfs_dialloc(
962 if (!pag->pagi_freecount && !okalloc) 962 if (!pag->pagi_freecount && !okalloc)
963 goto nextag; 963 goto nextag;
964 964
965 /*
966 * Then read in the AGI buffer and recheck with the AGI buffer
967 * lock held.
968 */
965 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 969 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
966 if (error) 970 if (error)
967 goto out_error; 971 goto out_error;
968 972
969 /*
970 * Once the AGI has been read in we have to recheck
971 * pagi_freecount with the AGI buffer lock held.
972 */
973 if (pag->pagi_freecount) { 973 if (pag->pagi_freecount) {
974 xfs_perag_put(pag); 974 xfs_perag_put(pag);
975 goto out_alloc; 975 goto out_alloc;
976 } 976 }
977 977
978 if (!okalloc) { 978 if (!okalloc)
979 xfs_trans_brelse(tp, agbp); 979 goto nextag_relse_buffer;
980 goto nextag; 980
981 }
982 981
983 error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); 982 error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
984 if (error) { 983 if (error) {
@@ -1007,6 +1006,8 @@ xfs_dialloc(
1007 return 0; 1006 return 0;
1008 } 1007 }
1009 1008
1009nextag_relse_buffer:
1010 xfs_trans_brelse(tp, agbp);
1010nextag: 1011nextag:
1011 xfs_perag_put(pag); 1012 xfs_perag_put(pag);
1012 if (++agno == mp->m_sb.sb_agcount) 1013 if (++agno == mp->m_sb.sb_agcount)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1f1535d25a9b..0e0232c3b6d9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -364,9 +364,15 @@ xfs_fssetdm_by_handle(
364 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 364 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
365 return -XFS_ERROR(EFAULT); 365 return -XFS_ERROR(EFAULT);
366 366
367 error = mnt_want_write_file(parfilp);
368 if (error)
369 return error;
370
367 dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); 371 dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
368 if (IS_ERR(dentry)) 372 if (IS_ERR(dentry)) {
373 mnt_drop_write_file(parfilp);
369 return PTR_ERR(dentry); 374 return PTR_ERR(dentry);
375 }
370 376
371 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { 377 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
372 error = -XFS_ERROR(EPERM); 378 error = -XFS_ERROR(EPERM);
@@ -382,6 +388,7 @@ xfs_fssetdm_by_handle(
382 fsd.fsd_dmstate); 388 fsd.fsd_dmstate);
383 389
384 out: 390 out:
391 mnt_drop_write_file(parfilp);
385 dput(dentry); 392 dput(dentry);
386 return error; 393 return error;
387} 394}
@@ -634,7 +641,11 @@ xfs_ioc_space(
634 if (ioflags & IO_INVIS) 641 if (ioflags & IO_INVIS)
635 attr_flags |= XFS_ATTR_DMI; 642 attr_flags |= XFS_ATTR_DMI;
636 643
644 error = mnt_want_write_file(filp);
645 if (error)
646 return error;
637 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); 647 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
648 mnt_drop_write_file(filp);
638 return -error; 649 return -error;
639} 650}
640 651
@@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr(
1163{ 1174{
1164 struct fsxattr fa; 1175 struct fsxattr fa;
1165 unsigned int mask; 1176 unsigned int mask;
1177 int error;
1166 1178
1167 if (copy_from_user(&fa, arg, sizeof(fa))) 1179 if (copy_from_user(&fa, arg, sizeof(fa)))
1168 return -EFAULT; 1180 return -EFAULT;
@@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr(
1171 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 1183 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1172 mask |= FSX_NONBLOCK; 1184 mask |= FSX_NONBLOCK;
1173 1185
1174 return -xfs_ioctl_setattr(ip, &fa, mask); 1186 error = mnt_want_write_file(filp);
1187 if (error)
1188 return error;
1189 error = xfs_ioctl_setattr(ip, &fa, mask);
1190 mnt_drop_write_file(filp);
1191 return -error;
1175} 1192}
1176 1193
1177STATIC int 1194STATIC int
@@ -1196,6 +1213,7 @@ xfs_ioc_setxflags(
1196 struct fsxattr fa; 1213 struct fsxattr fa;
1197 unsigned int flags; 1214 unsigned int flags;
1198 unsigned int mask; 1215 unsigned int mask;
1216 int error;
1199 1217
1200 if (copy_from_user(&flags, arg, sizeof(flags))) 1218 if (copy_from_user(&flags, arg, sizeof(flags)))
1201 return -EFAULT; 1219 return -EFAULT;
@@ -1210,7 +1228,12 @@ xfs_ioc_setxflags(
1210 mask |= FSX_NONBLOCK; 1228 mask |= FSX_NONBLOCK;
1211 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); 1229 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
1212 1230
1213 return -xfs_ioctl_setattr(ip, &fa, mask); 1231 error = mnt_want_write_file(filp);
1232 if (error)
1233 return error;
1234 error = xfs_ioctl_setattr(ip, &fa, mask);
1235 mnt_drop_write_file(filp);
1236 return -error;
1214} 1237}
1215 1238
1216STATIC int 1239STATIC int
@@ -1385,8 +1408,13 @@ xfs_file_ioctl(
1385 if (copy_from_user(&dmi, arg, sizeof(dmi))) 1408 if (copy_from_user(&dmi, arg, sizeof(dmi)))
1386 return -XFS_ERROR(EFAULT); 1409 return -XFS_ERROR(EFAULT);
1387 1410
1411 error = mnt_want_write_file(filp);
1412 if (error)
1413 return error;
1414
1388 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, 1415 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
1389 dmi.fsd_dmstate); 1416 dmi.fsd_dmstate);
1417 mnt_drop_write_file(filp);
1390 return -error; 1418 return -error;
1391 } 1419 }
1392 1420
@@ -1434,7 +1462,11 @@ xfs_file_ioctl(
1434 1462
1435 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) 1463 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
1436 return -XFS_ERROR(EFAULT); 1464 return -XFS_ERROR(EFAULT);
1465 error = mnt_want_write_file(filp);
1466 if (error)
1467 return error;
1437 error = xfs_swapext(&sxp); 1468 error = xfs_swapext(&sxp);
1469 mnt_drop_write_file(filp);
1438 return -error; 1470 return -error;
1439 } 1471 }
1440 1472
@@ -1463,9 +1495,14 @@ xfs_file_ioctl(
1463 if (copy_from_user(&inout, arg, sizeof(inout))) 1495 if (copy_from_user(&inout, arg, sizeof(inout)))
1464 return -XFS_ERROR(EFAULT); 1496 return -XFS_ERROR(EFAULT);
1465 1497
1498 error = mnt_want_write_file(filp);
1499 if (error)
1500 return error;
1501
1466 /* input parameter is passed in resblks field of structure */ 1502 /* input parameter is passed in resblks field of structure */
1467 in = inout.resblks; 1503 in = inout.resblks;
1468 error = xfs_reserve_blocks(mp, &in, &inout); 1504 error = xfs_reserve_blocks(mp, &in, &inout);
1505 mnt_drop_write_file(filp);
1469 if (error) 1506 if (error)
1470 return -error; 1507 return -error;
1471 1508
@@ -1496,7 +1533,11 @@ xfs_file_ioctl(
1496 if (copy_from_user(&in, arg, sizeof(in))) 1533 if (copy_from_user(&in, arg, sizeof(in)))
1497 return -XFS_ERROR(EFAULT); 1534 return -XFS_ERROR(EFAULT);
1498 1535
1536 error = mnt_want_write_file(filp);
1537 if (error)
1538 return error;
1499 error = xfs_growfs_data(mp, &in); 1539 error = xfs_growfs_data(mp, &in);
1540 mnt_drop_write_file(filp);
1500 return -error; 1541 return -error;
1501 } 1542 }
1502 1543
@@ -1506,7 +1547,11 @@ xfs_file_ioctl(
1506 if (copy_from_user(&in, arg, sizeof(in))) 1547 if (copy_from_user(&in, arg, sizeof(in)))
1507 return -XFS_ERROR(EFAULT); 1548 return -XFS_ERROR(EFAULT);
1508 1549
1550 error = mnt_want_write_file(filp);
1551 if (error)
1552 return error;
1509 error = xfs_growfs_log(mp, &in); 1553 error = xfs_growfs_log(mp, &in);
1554 mnt_drop_write_file(filp);
1510 return -error; 1555 return -error;
1511 } 1556 }
1512 1557
@@ -1516,7 +1561,11 @@ xfs_file_ioctl(
1516 if (copy_from_user(&in, arg, sizeof(in))) 1561 if (copy_from_user(&in, arg, sizeof(in)))
1517 return -XFS_ERROR(EFAULT); 1562 return -XFS_ERROR(EFAULT);
1518 1563
1564 error = mnt_want_write_file(filp);
1565 if (error)
1566 return error;
1519 error = xfs_growfs_rt(mp, &in); 1567 error = xfs_growfs_rt(mp, &in);
1568 mnt_drop_write_file(filp);
1520 return -error; 1569 return -error;
1521 } 1570 }
1522 1571
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c4f2da0d2bf5..1244274a5674 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -600,7 +600,11 @@ xfs_file_compat_ioctl(
600 600
601 if (xfs_compat_growfs_data_copyin(&in, arg)) 601 if (xfs_compat_growfs_data_copyin(&in, arg))
602 return -XFS_ERROR(EFAULT); 602 return -XFS_ERROR(EFAULT);
603 error = mnt_want_write_file(filp);
604 if (error)
605 return error;
603 error = xfs_growfs_data(mp, &in); 606 error = xfs_growfs_data(mp, &in);
607 mnt_drop_write_file(filp);
604 return -error; 608 return -error;
605 } 609 }
606 case XFS_IOC_FSGROWFSRT_32: { 610 case XFS_IOC_FSGROWFSRT_32: {
@@ -608,7 +612,11 @@ xfs_file_compat_ioctl(
608 612
609 if (xfs_compat_growfs_rt_copyin(&in, arg)) 613 if (xfs_compat_growfs_rt_copyin(&in, arg))
610 return -XFS_ERROR(EFAULT); 614 return -XFS_ERROR(EFAULT);
615 error = mnt_want_write_file(filp);
616 if (error)
617 return error;
611 error = xfs_growfs_rt(mp, &in); 618 error = xfs_growfs_rt(mp, &in);
619 mnt_drop_write_file(filp);
612 return -error; 620 return -error;
613 } 621 }
614#endif 622#endif
@@ -627,7 +635,11 @@ xfs_file_compat_ioctl(
627 offsetof(struct xfs_swapext, sx_stat)) || 635 offsetof(struct xfs_swapext, sx_stat)) ||
628 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) 636 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
629 return -XFS_ERROR(EFAULT); 637 return -XFS_ERROR(EFAULT);
638 error = mnt_want_write_file(filp);
639 if (error)
640 return error;
630 error = xfs_swapext(&sxp); 641 error = xfs_swapext(&sxp);
642 mnt_drop_write_file(filp);
631 return -error; 643 return -error;
632 } 644 }
633 case XFS_IOC_FSBULKSTAT_32: 645 case XFS_IOC_FSBULKSTAT_32:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 915edf6639f0..973dff6ad935 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -680,9 +680,9 @@ xfs_iomap_write_unwritten(
680 * the same inode that we complete here and might deadlock 680 * the same inode that we complete here and might deadlock
681 * on the iolock. 681 * on the iolock.
682 */ 682 */
683 xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); 683 sb_start_intwrite(mp->m_super);
684 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); 684 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
685 tp->t_flags |= XFS_TRANS_RESERVE; 685 tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
686 error = xfs_trans_reserve(tp, resblks, 686 error = xfs_trans_reserve(tp, resblks,
687 XFS_WRITE_LOG_RES(mp), 0, 687 XFS_WRITE_LOG_RES(mp), 0,
688 XFS_TRANS_PERM_LOG_RES, 688 XFS_TRANS_PERM_LOG_RES,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 711ca51ca3d7..29c2f83d4147 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1551,7 +1551,7 @@ xfs_unmountfs(
1551int 1551int
1552xfs_fs_writable(xfs_mount_t *mp) 1552xfs_fs_writable(xfs_mount_t *mp)
1553{ 1553{
1554 return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || 1554 return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||
1555 (mp->m_flags & XFS_MOUNT_RDONLY)); 1555 (mp->m_flags & XFS_MOUNT_RDONLY));
1556} 1556}
1557 1557
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8724336a9a08..05a05a7b6119 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -311,9 +311,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
311#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ 311#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
312#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ 312#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
313 313
314#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
315#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
316
317/* 314/*
318 * Flags for xfs_mountfs 315 * Flags for xfs_mountfs
319 */ 316 */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 92d4331cd4f1..ca28a4ba4b54 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -857,7 +857,7 @@ xfs_rtbuf_get(
857 xfs_buf_t *bp; /* block buffer, result */ 857 xfs_buf_t *bp; /* block buffer, result */
858 xfs_inode_t *ip; /* bitmap or summary inode */ 858 xfs_inode_t *ip; /* bitmap or summary inode */
859 xfs_bmbt_irec_t map; 859 xfs_bmbt_irec_t map;
860 int nmap; 860 int nmap = 1;
861 int error; /* error value */ 861 int error; /* error value */
862 862
863 ip = issum ? mp->m_rsumip : mp->m_rbmip; 863 ip = issum ? mp->m_rsumip : mp->m_rbmip;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 97304f10e78a..96548176db80 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -403,7 +403,7 @@ xfs_sync_worker(
403 if (!(mp->m_super->s_flags & MS_ACTIVE) && 403 if (!(mp->m_super->s_flags & MS_ACTIVE) &&
404 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 404 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
405 /* dgc: errors ignored here */ 405 /* dgc: errors ignored here */
406 if (mp->m_super->s_frozen == SB_UNFROZEN && 406 if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
407 xfs_log_need_covered(mp)) 407 xfs_log_need_covered(mp))
408 error = xfs_fs_log_dummy(mp); 408 error = xfs_fs_log_dummy(mp);
409 else 409 else
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdf324508c5e..06ed520a767f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -576,8 +576,12 @@ xfs_trans_alloc(
576 xfs_mount_t *mp, 576 xfs_mount_t *mp,
577 uint type) 577 uint type)
578{ 578{
579 xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); 579 xfs_trans_t *tp;
580 return _xfs_trans_alloc(mp, type, KM_SLEEP); 580
581 sb_start_intwrite(mp->m_super);
582 tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
583 tp->t_flags |= XFS_TRANS_FREEZE_PROT;
584 return tp;
581} 585}
582 586
583xfs_trans_t * 587xfs_trans_t *
@@ -588,6 +592,7 @@ _xfs_trans_alloc(
588{ 592{
589 xfs_trans_t *tp; 593 xfs_trans_t *tp;
590 594
595 WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
591 atomic_inc(&mp->m_active_trans); 596 atomic_inc(&mp->m_active_trans);
592 597
593 tp = kmem_zone_zalloc(xfs_trans_zone, memflags); 598 tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
@@ -611,6 +616,8 @@ xfs_trans_free(
611 xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); 616 xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
612 617
613 atomic_dec(&tp->t_mountp->m_active_trans); 618 atomic_dec(&tp->t_mountp->m_active_trans);
619 if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
620 sb_end_intwrite(tp->t_mountp->m_super);
614 xfs_trans_free_dqinfo(tp); 621 xfs_trans_free_dqinfo(tp);
615 kmem_zone_free(xfs_trans_zone, tp); 622 kmem_zone_free(xfs_trans_zone, tp);
616} 623}
@@ -643,7 +650,11 @@ xfs_trans_dup(
643 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 650 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
644 ASSERT(tp->t_ticket != NULL); 651 ASSERT(tp->t_ticket != NULL);
645 652
646 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 653 ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
654 (tp->t_flags & XFS_TRANS_RESERVE) |
655 (tp->t_flags & XFS_TRANS_FREEZE_PROT);
656 /* We gave our writer reference to the new transaction */
657 tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
647 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); 658 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
648 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 659 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
649 tp->t_blk_res = tp->t_blk_res_used; 660 tp->t_blk_res = tp->t_blk_res_used;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index bc2afd52a0b7..db056544cbb5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -179,6 +179,8 @@ struct xfs_log_item_desc {
179#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ 179#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
180#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ 180#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
181#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ 181#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
182#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
183 count in superblock */
182 184
183/* 185/*
184 * Values for call flags parameter. 186 * Values for call flags parameter.