about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author: Jiri Kosina <jkosina@suse.cz>, 2010-12-22 12:57:02 -0500
committer: Jiri Kosina <jkosina@suse.cz>, 2010-12-22 12:57:02 -0500
commit: 4b7bd364700d9ac8372eff48832062b936d0793b (patch)
tree: 0dbf78c95456a0b02d07fcd473281f04a87e266d /fs
parent: c0d8768af260e2cbb4bf659ae6094a262c86b085 (diff)
parent: 90a8a73c06cc32b609a880d48449d7083327e11a (diff)
Merge branch 'master' into for-next
Conflicts: MAINTAINERS, arch/arm/mach-omap2/pm24xx.c, drivers/scsi/bfa/bfa_fcpim.c. The branch needed to be updated in order to apply fixes for which the old branch was too outdated.
Diffstat (limited to 'fs')
-rw-r--r-- fs/autofs4/root.c 12
-rw-r--r-- fs/bio.c 23
-rw-r--r-- fs/block_dev.c 1
-rw-r--r-- fs/btrfs/compression.c 15
-rw-r--r-- fs/btrfs/ctree.h 6
-rw-r--r-- fs/btrfs/disk-io.c 41
-rw-r--r-- fs/btrfs/export.c 78
-rw-r--r-- fs/btrfs/extent-tree.c 77
-rw-r--r-- fs/btrfs/extent_io.c 77
-rw-r--r-- fs/btrfs/extent_io.h 3
-rw-r--r-- fs/btrfs/file.c 99
-rw-r--r-- fs/btrfs/free-space-cache.c 12
-rw-r--r-- fs/btrfs/inode.c 299
-rw-r--r-- fs/btrfs/ioctl.c 87
-rw-r--r-- fs/btrfs/ioctl.h 14
-rw-r--r-- fs/btrfs/ordered-data.c 67
-rw-r--r-- fs/btrfs/ordered-data.h 3
-rw-r--r-- fs/btrfs/orphan.c 6
-rw-r--r-- fs/btrfs/super.c 43
-rw-r--r-- fs/btrfs/transaction.c 5
-rw-r--r-- fs/btrfs/tree-log.c 21
-rw-r--r-- fs/btrfs/volumes.c 20
-rw-r--r-- fs/btrfs/volumes.h 2
-rw-r--r-- fs/ceph/addr.c 6
-rw-r--r-- fs/ceph/caps.c 17
-rw-r--r-- fs/ceph/dir.c 23
-rw-r--r-- fs/ceph/file.c 65
-rw-r--r-- fs/ceph/inode.c 50
-rw-r--r-- fs/ceph/ioctl.h 2
-rw-r--r-- fs/ceph/locks.c 94
-rw-r--r-- fs/ceph/mds_client.c 49
-rw-r--r-- fs/ceph/mds_client.h 33
-rw-r--r-- fs/ceph/super.h 4
-rw-r--r-- fs/cifs/Kconfig 8
-rw-r--r-- fs/cifs/Makefile 4
-rw-r--r-- fs/cifs/README 9
-rw-r--r-- fs/cifs/TODO 2
-rw-r--r-- fs/cifs/cifs_fs_sb.h 7
-rw-r--r-- fs/cifs/cifsacl.c 51
-rw-r--r-- fs/cifs/cifsacl.h 4
-rw-r--r-- fs/cifs/cifsfs.c 10
-rw-r--r-- fs/cifs/cifsglob.h 15
-rw-r--r-- fs/cifs/cifsproto.h 12
-rw-r--r-- fs/cifs/cifssmb.c 183
-rw-r--r-- fs/cifs/connect.c 242
-rw-r--r-- fs/cifs/dns_resolve.c 2
-rw-r--r-- fs/cifs/file.c 78
-rw-r--r-- fs/cifs/fscache.c 12
-rw-r--r-- fs/cifs/inode.c 58
-rw-r--r-- fs/cifs/ioctl.c 16
-rw-r--r-- fs/cifs/misc.c 25
-rw-r--r-- fs/cifs/readdir.c 41
-rw-r--r-- fs/cifs/xattr.c 55
-rw-r--r-- fs/compat.c 28
-rw-r--r-- fs/compat_ioctl.c 1
-rw-r--r-- fs/ecryptfs/super.c 1
-rw-r--r-- fs/exec.c 41
-rw-r--r-- fs/ext3/super.c 1
-rw-r--r-- fs/ext4/ext4.h 5
-rw-r--r-- fs/ext4/inode.c 10
-rw-r--r-- fs/ext4/ioctl.c 24
-rw-r--r-- fs/ext4/mballoc.c 2
-rw-r--r-- fs/ext4/namei.c 2
-rw-r--r-- fs/ext4/page-io.c 99
-rw-r--r-- fs/ext4/super.c 125
-rw-r--r-- fs/fuse/file.c 82
-rw-r--r-- fs/gfs2/export.c 46
-rw-r--r-- fs/gfs2/glock.c 21
-rw-r--r-- fs/gfs2/inode.c 152
-rw-r--r-- fs/gfs2/inode.h 4
-rw-r--r-- fs/gfs2/quota.c 15
-rw-r--r-- fs/gfs2/rgrp.c 91
-rw-r--r-- fs/hpfs/buffer.c 4
-rw-r--r-- fs/hpfs/hpfs_fn.h 2
-rw-r--r-- fs/hpfs/super.c 2
-rw-r--r-- fs/hugetlbfs/inode.c 3
-rw-r--r-- fs/ioctl.c 40
-rw-r--r-- fs/ioprio.c 13
-rw-r--r-- fs/jbd2/journal.c 24
-rw-r--r-- fs/lockd/clntlock.c 1
-rw-r--r-- fs/lockd/clntproc.c 1
-rw-r--r-- fs/lockd/host.c 11
-rw-r--r-- fs/lockd/svc4proc.c 1
-rw-r--r-- fs/lockd/svclock.c 1
-rw-r--r-- fs/lockd/svcproc.c 1
-rw-r--r-- fs/locks.c 91
-rw-r--r-- fs/logfs/logfs.h 2
-rw-r--r-- fs/namei.c 3
-rw-r--r-- fs/namespace.c 1
-rw-r--r-- fs/ncpfs/dir.c 1
-rw-r--r-- fs/ncpfs/file.c 1
-rw-r--r-- fs/ncpfs/inode.c 1
-rw-r--r-- fs/ncpfs/ioctl.c 1
-rw-r--r-- fs/nfs/callback.c 1
-rw-r--r-- fs/nfs/delegation.c 1
-rw-r--r-- fs/nfs/dir.c 220
-rw-r--r-- fs/nfs/direct.c 2
-rw-r--r-- fs/nfs/file.c 3
-rw-r--r-- fs/nfs/inode.c 1
-rw-r--r-- fs/nfs/internal.h 9
-rw-r--r-- fs/nfs/mount_clnt.c 4
-rw-r--r-- fs/nfs/nfs2xdr.c 8
-rw-r--r-- fs/nfs/nfs3xdr.c 8
-rw-r--r-- fs/nfs/nfs4proc.c 13
-rw-r--r-- fs/nfs/nfs4xdr.c 8
-rw-r--r-- fs/nfs/pagelist.c 4
-rw-r--r-- fs/nfs/read.c 1
-rw-r--r-- fs/nfs/super.c 13
-rw-r--r-- fs/nfs/write.c 3
-rw-r--r-- fs/nfsd/nfs3xdr.c 6
-rw-r--r-- fs/nfsd/nfs4state.c 44
-rw-r--r-- fs/nfsd/xdr4.h 21
-rw-r--r-- fs/nilfs2/dat.c 2
-rw-r--r-- fs/nilfs2/gcinode.c 9
-rw-r--r-- fs/nilfs2/ioctl.c 16
-rw-r--r-- fs/notify/fanotify/fanotify.c 6
-rw-r--r-- fs/notify/fanotify/fanotify_user.c 81
-rw-r--r-- fs/notify/inotify/inotify_user.c 1
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 14
-rw-r--r-- fs/ocfs2/dcache.c 1
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c 2
-rw-r--r-- fs/ocfs2/ocfs2.h 6
-rw-r--r-- fs/ocfs2/stack_user.c 2
-rw-r--r-- fs/ocfs2/super.c 1
-rw-r--r-- fs/openpromfs/inode.c 2
-rw-r--r-- fs/pipe.c 14
-rw-r--r-- fs/proc/base.c 2
-rw-r--r-- fs/proc/inode.c 1
-rw-r--r-- fs/proc/task_mmu.c 3
-rw-r--r-- fs/read_write.c 1
-rw-r--r-- fs/reiserfs/inode.c 1
-rw-r--r-- fs/reiserfs/ioctl.c 8
-rw-r--r-- fs/reiserfs/journal.c 1
-rw-r--r-- fs/reiserfs/super.c 1
-rw-r--r-- fs/reiserfs/xattr_acl.c 6
-rw-r--r-- fs/splice.c 24
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c 101
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c 37
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c 1
-rw-r--r-- fs/xfs/xfs_bmap.c 85
-rw-r--r-- fs/xfs/xfs_bmap.h 5
-rw-r--r-- fs/xfs/xfs_dfrag.c 13
-rw-r--r-- fs/xfs/xfs_error.c 3
-rw-r--r-- fs/xfs/xfs_error.h 5
-rw-r--r-- fs/xfs/xfs_filestream.c 8
-rw-r--r-- fs/xfs/xfs_inode_item.c 31
-rw-r--r-- fs/xfs/xfs_mount.c 1
-rw-r--r-- fs/xfs/xfs_quota.h 20
-rw-r--r-- fs/xfs/xfs_rename.c 1
152 files changed, 2489 insertions, 1548 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f0031..d34896cfb19f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -980,19 +980,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
980 } 980 }
981} 981}
982 982
983static DEFINE_MUTEX(autofs4_ioctl_mutex);
984
985static long autofs4_root_ioctl(struct file *filp, 983static long autofs4_root_ioctl(struct file *filp,
986 unsigned int cmd, unsigned long arg) 984 unsigned int cmd, unsigned long arg)
987{ 985{
988 long ret;
989 struct inode *inode = filp->f_dentry->d_inode; 986 struct inode *inode = filp->f_dentry->d_inode;
990 987 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
991 mutex_lock(&autofs4_ioctl_mutex);
992 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
993 mutex_unlock(&autofs4_ioctl_mutex);
994
995 return ret;
996} 988}
997 989
998#ifdef CONFIG_COMPAT 990#ifdef CONFIG_COMPAT
@@ -1002,13 +994,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
1002 struct inode *inode = filp->f_path.dentry->d_inode; 994 struct inode *inode = filp->f_path.dentry->d_inode;
1003 int ret; 995 int ret;
1004 996
1005 mutex_lock(&autofs4_ioctl_mutex);
1006 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) 997 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1007 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 998 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1008 else 999 else
1009 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 1000 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1010 (unsigned long)compat_ptr(arg)); 1001 (unsigned long)compat_ptr(arg));
1011 mutex_unlock(&autofs4_ioctl_mutex);
1012 1002
1013 return ret; 1003 return ret;
1014} 1004}
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
370{ 370{
371 struct bio *bio; 371 struct bio *bio;
372 372
373 if (nr_iovecs > UIO_MAXIOV)
374 return NULL;
375
373 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), 376 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
374 gfp_mask); 377 gfp_mask);
375 if (unlikely(!bio)) 378 if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
697static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 700static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
698 gfp_t gfp_mask) 701 gfp_t gfp_mask)
699{ 702{
700 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); 703 struct bio_map_data *bmd;
701 704
705 if (iov_count > UIO_MAXIOV)
706 return NULL;
707
708 bmd = kmalloc(sizeof(*bmd), gfp_mask);
702 if (!bmd) 709 if (!bmd)
703 return NULL; 710 return NULL;
704 711
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
827 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 834 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
828 start = uaddr >> PAGE_SHIFT; 835 start = uaddr >> PAGE_SHIFT;
829 836
837 /*
838 * Overflow, abort
839 */
840 if (end < start)
841 return ERR_PTR(-EINVAL);
842
830 nr_pages += end - start; 843 nr_pages += end - start;
831 len += iov[i].iov_len; 844 len += iov[i].iov_len;
832 } 845 }
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
955 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 968 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
956 unsigned long start = uaddr >> PAGE_SHIFT; 969 unsigned long start = uaddr >> PAGE_SHIFT;
957 970
971 /*
972 * Overflow, abort
973 */
974 if (end < start)
975 return ERR_PTR(-EINVAL);
976
958 nr_pages += end - start; 977 nr_pages += end - start;
959 /* 978 /*
960 * buffer must be aligned to at least hardsector size for now 979 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
982 unsigned long start = uaddr >> PAGE_SHIFT; 1001 unsigned long start = uaddr >> PAGE_SHIFT;
983 const int local_nr_pages = end - start; 1002 const int local_nr_pages = end - start;
984 const int page_limit = cur_page + local_nr_pages; 1003 const int page_limit = cur_page + local_nr_pages;
985 1004
986 ret = get_user_pages_fast(uaddr, local_nr_pages, 1005 ret = get_user_pages_fast(uaddr, local_nr_pages,
987 write_to_vm, &pages[cur_page]); 1006 write_to_vm, &pages[cur_page]);
988 if (ret < local_nr_pages) { 1007 if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97c..4230252fd689 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/smp_lock.h>
15#include <linux/device_cgroup.h> 14#include <linux/device_cgroup.h>
16#include <linux/highmem.h> 15#include <linux/highmem.h>
17#include <linux/blkdev.h> 16#include <linux/blkdev.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d9..b50bc4bd5c56 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
91static struct bio *compressed_bio_alloc(struct block_device *bdev, 91static struct bio *compressed_bio_alloc(struct block_device *bdev,
92 u64 first_byte, gfp_t gfp_flags) 92 u64 first_byte, gfp_t gfp_flags)
93{ 93{
94 struct bio *bio;
95 int nr_vecs; 94 int nr_vecs;
96 95
97 nr_vecs = bio_get_nr_vecs(bdev); 96 nr_vecs = bio_get_nr_vecs(bdev);
98 bio = bio_alloc(gfp_flags, nr_vecs); 97 return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
99
100 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
101 while (!bio && (nr_vecs /= 2))
102 bio = bio_alloc(gfp_flags, nr_vecs);
103 }
104
105 if (bio) {
106 bio->bi_size = 0;
107 bio->bi_bdev = bdev;
108 bio->bi_sector = first_byte >> 9;
109 }
110 return bio;
111} 98}
112 99
113static int check_compressed_csum(struct inode *inode, 100static int check_compressed_csum(struct inode *inode,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b41..af52f6d7a4d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -808,9 +808,9 @@ struct btrfs_block_group_cache {
808 int extents_thresh; 808 int extents_thresh;
809 int free_extents; 809 int free_extents;
810 int total_bitmaps; 810 int total_bitmaps;
811 int ro:1; 811 unsigned int ro:1;
812 int dirty:1; 812 unsigned int dirty:1;
813 int iref:1; 813 unsigned int iref:1;
814 814
815 int disk_cache_state; 815 int disk_cache_state;
816 816
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d7181..51d2e4de34eb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h>
31#include "compat.h" 32#include "compat.h"
32#include "ctree.h" 33#include "ctree.h"
33#include "disk-io.h" 34#include "disk-io.h"
@@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
355 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 356 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
356 btrfs_header_generation(eb)); 357 btrfs_header_generation(eb));
357 BUG_ON(ret); 358 BUG_ON(ret);
359 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
360
358 found_start = btrfs_header_bytenr(eb); 361 found_start = btrfs_header_bytenr(eb);
359 if (found_start != start) { 362 if (found_start != start) {
360 WARN_ON(1); 363 WARN_ON(1);
@@ -693,6 +696,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
693 __btree_submit_bio_done); 696 __btree_submit_bio_done);
694} 697}
695 698
699#ifdef CONFIG_MIGRATION
700static int btree_migratepage(struct address_space *mapping,
701 struct page *newpage, struct page *page)
702{
703 /*
704 * we can't safely write a btree page from here,
705 * we haven't done the locking hook
706 */
707 if (PageDirty(page))
708 return -EAGAIN;
709 /*
710 * Buffers may be managed in a filesystem specific way.
711 * We must have no buffers or drop them.
712 */
713 if (page_has_private(page) &&
714 !try_to_release_page(page, GFP_KERNEL))
715 return -EAGAIN;
716 return migrate_page(mapping, newpage, page);
717}
718#endif
719
696static int btree_writepage(struct page *page, struct writeback_control *wbc) 720static int btree_writepage(struct page *page, struct writeback_control *wbc)
697{ 721{
698 struct extent_io_tree *tree; 722 struct extent_io_tree *tree;
@@ -707,8 +731,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
707 } 731 }
708 732
709 redirty_page_for_writepage(wbc, page); 733 redirty_page_for_writepage(wbc, page);
710 eb = btrfs_find_tree_block(root, page_offset(page), 734 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
711 PAGE_CACHE_SIZE);
712 WARN_ON(!eb); 735 WARN_ON(!eb);
713 736
714 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 737 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +822,9 @@ static const struct address_space_operations btree_aops = {
799 .releasepage = btree_releasepage, 822 .releasepage = btree_releasepage,
800 .invalidatepage = btree_invalidatepage, 823 .invalidatepage = btree_invalidatepage,
801 .sync_page = block_sync_page, 824 .sync_page = block_sync_page,
825#ifdef CONFIG_MIGRATION
826 .migratepage = btree_migratepage,
827#endif
802}; 828};
803 829
804int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 830int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
981 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1007 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
982 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1008 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
983 blocksize, generation); 1009 blocksize, generation);
984 BUG_ON(!root->node); 1010 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1011 free_extent_buffer(root->node);
1012 return -EIO;
1013 }
985 root->commit_root = btrfs_root_node(root); 1014 root->commit_root = btrfs_root_node(root);
986 return 0; 1015 return 0;
987} 1016}
@@ -1538,10 +1567,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1538 GFP_NOFS); 1567 GFP_NOFS);
1539 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1568 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1540 GFP_NOFS); 1569 GFP_NOFS);
1541 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1570 struct btrfs_root *tree_root = btrfs_sb(sb);
1542 GFP_NOFS); 1571 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1543 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1544 GFP_NOFS);
1545 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1572 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1546 GFP_NOFS); 1573 GFP_NOFS);
1547 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1574 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f4..659f532d26a0 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 166static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 167{
168 struct inode *dir = child->d_inode; 168 struct inode *dir = child->d_inode;
169 static struct dentry *dentry; 169 struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 170 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 171 struct btrfs_path *path;
172 struct extent_buffer *leaf; 172 struct extent_buffer *leaf;
@@ -232,9 +232,85 @@ fail:
232 return ERR_PTR(ret); 232 return ERR_PTR(ret);
233} 233}
234 234
235static int btrfs_get_name(struct dentry *parent, char *name,
236 struct dentry *child)
237{
238 struct inode *inode = child->d_inode;
239 struct inode *dir = parent->d_inode;
240 struct btrfs_path *path;
241 struct btrfs_root *root = BTRFS_I(dir)->root;
242 struct btrfs_inode_ref *iref;
243 struct btrfs_root_ref *rref;
244 struct extent_buffer *leaf;
245 unsigned long name_ptr;
246 struct btrfs_key key;
247 int name_len;
248 int ret;
249
250 if (!dir || !inode)
251 return -EINVAL;
252
253 if (!S_ISDIR(dir->i_mode))
254 return -EINVAL;
255
256 path = btrfs_alloc_path();
257 if (!path)
258 return -ENOMEM;
259 path->leave_spinning = 1;
260
261 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
262 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
263 key.type = BTRFS_ROOT_BACKREF_KEY;
264 key.offset = (u64)-1;
265 root = root->fs_info->tree_root;
266 } else {
267 key.objectid = inode->i_ino;
268 key.offset = dir->i_ino;
269 key.type = BTRFS_INODE_REF_KEY;
270 }
271
272 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
273 if (ret < 0) {
274 btrfs_free_path(path);
275 return ret;
276 } else if (ret > 0) {
277 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
278 path->slots[0]--;
279 } else {
280 btrfs_free_path(path);
281 return -ENOENT;
282 }
283 }
284 leaf = path->nodes[0];
285
286 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
287 rref = btrfs_item_ptr(leaf, path->slots[0],
288 struct btrfs_root_ref);
289 name_ptr = (unsigned long)(rref + 1);
290 name_len = btrfs_root_ref_name_len(leaf, rref);
291 } else {
292 iref = btrfs_item_ptr(leaf, path->slots[0],
293 struct btrfs_inode_ref);
294 name_ptr = (unsigned long)(iref + 1);
295 name_len = btrfs_inode_ref_name_len(leaf, iref);
296 }
297
298 read_extent_buffer(leaf, name, name_ptr, name_len);
299 btrfs_free_path(path);
300
301 /*
302 * have to add the null termination to make sure that reconnect_path
303 * gets the right len for strlen
304 */
305 name[name_len] = '\0';
306
307 return 0;
308}
309
235const struct export_operations btrfs_export_ops = { 310const struct export_operations btrfs_export_ops = {
236 .encode_fh = btrfs_encode_fh, 311 .encode_fh = btrfs_encode_fh,
237 .fh_to_dentry = btrfs_fh_to_dentry, 312 .fh_to_dentry = btrfs_fh_to_dentry,
238 .fh_to_parent = btrfs_fh_to_parent, 313 .fh_to_parent = btrfs_fh_to_parent,
239 .get_parent = btrfs_get_parent, 314 .get_parent = btrfs_get_parent,
315 .get_name = btrfs_get_name,
240}; 316};
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec41..227e5815d838 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
429 429
430static int cache_block_group(struct btrfs_block_group_cache *cache, 430static int cache_block_group(struct btrfs_block_group_cache *cache,
431 struct btrfs_trans_handle *trans, 431 struct btrfs_trans_handle *trans,
432 struct btrfs_root *root,
432 int load_cache_only) 433 int load_cache_only)
433{ 434{
434 struct btrfs_fs_info *fs_info = cache->fs_info; 435 struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
442 443
443 /* 444 /*
444 * We can't do the read from on-disk cache during a commit since we need 445 * We can't do the read from on-disk cache during a commit since we need
445 * to have the normal tree locking. 446 * to have the normal tree locking. Also if we are currently trying to
447 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks.
446 */ 449 */
447 if (!trans->transaction->in_commit) { 450 if (!trans->transaction->in_commit &&
451 (root && root != root->fs_info->tree_root)) {
448 spin_lock(&cache->lock); 452 spin_lock(&cache->lock);
449 if (cache->cached != BTRFS_CACHE_NO) { 453 if (cache->cached != BTRFS_CACHE_NO) {
450 spin_unlock(&cache->lock); 454 spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2741 struct btrfs_root *root = block_group->fs_info->tree_root; 2745 struct btrfs_root *root = block_group->fs_info->tree_root;
2742 struct inode *inode = NULL; 2746 struct inode *inode = NULL;
2743 u64 alloc_hint = 0; 2747 u64 alloc_hint = 0;
2748 int dcs = BTRFS_DC_ERROR;
2744 int num_pages = 0; 2749 int num_pages = 0;
2745 int retries = 0; 2750 int retries = 0;
2746 int ret = 0; 2751 int ret = 0;
@@ -2795,6 +2800,8 @@ again:
2795 2800
2796 spin_lock(&block_group->lock); 2801 spin_lock(&block_group->lock);
2797 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2802 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2803 /* We're not cached, don't bother trying to write stuff out */
2804 dcs = BTRFS_DC_WRITTEN;
2798 spin_unlock(&block_group->lock); 2805 spin_unlock(&block_group->lock);
2799 goto out_put; 2806 goto out_put;
2800 } 2807 }
@@ -2821,6 +2828,8 @@ again:
2821 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2828 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2822 num_pages, num_pages, 2829 num_pages, num_pages,
2823 &alloc_hint); 2830 &alloc_hint);
2831 if (!ret)
2832 dcs = BTRFS_DC_SETUP;
2824 btrfs_free_reserved_data_space(inode, num_pages); 2833 btrfs_free_reserved_data_space(inode, num_pages);
2825out_put: 2834out_put:
2826 iput(inode); 2835 iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
2828 btrfs_release_path(root, path); 2837 btrfs_release_path(root, path);
2829out: 2838out:
2830 spin_lock(&block_group->lock); 2839 spin_lock(&block_group->lock);
2831 if (ret) 2840 block_group->disk_cache_state = dcs;
2832 block_group->disk_cache_state = BTRFS_DC_ERROR;
2833 else
2834 block_group->disk_cache_state = BTRFS_DC_SETUP;
2835 spin_unlock(&block_group->lock); 2841 spin_unlock(&block_group->lock);
2836 2842
2837 return ret; 2843 return ret;
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3037 3043
3038u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3044u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3039{ 3045{
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3046 /*
3047 * we add in the count of missing devices because we want
3048 * to make sure that any RAID levels on a degraded FS
3049 * continue to be honored.
3050 */
3051 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3052 root->fs_info->fs_devices->missing_devices;
3041 3053
3042 if (num_devices == 1) 3054 if (num_devices == 1)
3043 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3055 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3412,7 +3424,7 @@ again:
3412 * our reservation. 3424 * our reservation.
3413 */ 3425 */
3414 if (unused <= space_info->total_bytes) { 3426 if (unused <= space_info->total_bytes) {
3415 unused -= space_info->total_bytes; 3427 unused = space_info->total_bytes - unused;
3416 if (unused >= num_bytes) { 3428 if (unused >= num_bytes) {
3417 if (!reserved) 3429 if (!reserved)
3418 space_info->bytes_reserved += orig_bytes; 3430 space_info->bytes_reserved += orig_bytes;
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4080 * space back to the block group, otherwise we will leak space. 4092 * space back to the block group, otherwise we will leak space.
4081 */ 4093 */
4082 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4094 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4083 cache_block_group(cache, trans, 1); 4095 cache_block_group(cache, trans, NULL, 1);
4084 4096
4085 byte_in_group = bytenr - cache->key.objectid; 4097 byte_in_group = bytenr - cache->key.objectid;
4086 WARN_ON(byte_in_group > cache->key.offset); 4098 WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
4930 btrfs_get_block_group(block_group); 4942 btrfs_get_block_group(block_group);
4931 search_start = block_group->key.objectid; 4943 search_start = block_group->key.objectid;
4932 4944
4945 /*
4946 * this can happen if we end up cycling through all the
4947 * raid types, but we want to make sure we only allocate
4948 * for the proper type.
4949 */
4950 if (!block_group_bits(block_group, data)) {
4951 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4952 BTRFS_BLOCK_GROUP_RAID1 |
4953 BTRFS_BLOCK_GROUP_RAID10;
4954
4955 /*
4956 * if they asked for extra copies and this block group
4957 * doesn't provide them, bail. This does allow us to
4958 * fill raid0 from raid1.
4959 */
4960 if ((data & extra) && !(block_group->flags & extra))
4961 goto loop;
4962 }
4963
4933have_block_group: 4964have_block_group:
4934 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4965 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4935 u64 free_percent; 4966 u64 free_percent;
4936 4967
4937 ret = cache_block_group(block_group, trans, 1); 4968 ret = cache_block_group(block_group, trans,
4969 orig_root, 1);
4938 if (block_group->cached == BTRFS_CACHE_FINISHED) 4970 if (block_group->cached == BTRFS_CACHE_FINISHED)
4939 goto have_block_group; 4971 goto have_block_group;
4940 4972
@@ -4958,7 +4990,8 @@ have_block_group:
4958 if (loop > LOOP_CACHING_NOWAIT || 4990 if (loop > LOOP_CACHING_NOWAIT ||
4959 (loop > LOOP_FIND_IDEAL && 4991 (loop > LOOP_FIND_IDEAL &&
4960 atomic_read(&space_info->caching_threads) < 2)) { 4992 atomic_read(&space_info->caching_threads) < 2)) {
4961 ret = cache_block_group(block_group, trans, 0); 4993 ret = cache_block_group(block_group, trans,
4994 orig_root, 0);
4962 BUG_ON(ret); 4995 BUG_ON(ret);
4963 } 4996 }
4964 found_uncached_bg = true; 4997 found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5515 u64 num_bytes = ins->offset; 5548 u64 num_bytes = ins->offset;
5516 5549
5517 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5550 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5518 cache_block_group(block_group, trans, 0); 5551 cache_block_group(block_group, trans, NULL, 0);
5519 caching_ctl = get_caching_control(block_group); 5552 caching_ctl = get_caching_control(block_group);
5520 5553
5521 if (!caching_ctl) { 5554 if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6300 NULL, NULL); 6333 NULL, NULL);
6301 BUG_ON(ret < 0); 6334 BUG_ON(ret < 0);
6302 if (ret > 0) { 6335 if (ret > 0) {
6303 ret = btrfs_del_orphan_item(trans, tree_root, 6336 /* if we fail to delete the orphan item this time
6304 root->root_key.objectid); 6337 * around, it'll get picked up the next time.
6305 BUG_ON(ret); 6338 *
6339 * The most common failure here is just -ENOENT.
6340 */
6341 btrfs_del_orphan_item(trans, tree_root,
6342 root->root_key.objectid);
6306 } 6343 }
6307 } 6344 }
6308 6345
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7878 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7915 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7879 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7916 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7880 7917
7881 num_devices = root->fs_info->fs_devices->rw_devices; 7918 /*
7919 * we add in the count of missing devices because we want
7920 * to make sure that any RAID levels on a degraded FS
7921 * continue to be honored.
7922 */
7923 num_devices = root->fs_info->fs_devices->rw_devices +
7924 root->fs_info->fs_devices->missing_devices;
7925
7882 if (num_devices == 1) { 7926 if (num_devices == 1) {
7883 stripped |= BTRFS_BLOCK_GROUP_DUP; 7927 stripped |= BTRFS_BLOCK_GROUP_DUP;
7884 stripped = flags & ~stripped; 7928 stripped = flags & ~stripped;
@@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8247 break; 8291 break;
8248 if (ret != 0) 8292 if (ret != 0)
8249 goto error; 8293 goto error;
8250
8251 leaf = path->nodes[0]; 8294 leaf = path->nodes[0];
8252 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8253 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8296 cache = kzalloc(sizeof(*cache), GFP_NOFS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a9..3e86b9f36507 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1828 bio_put(bio); 1828 bio_put(bio);
1829} 1829}
1830 1830
1831static struct bio * 1831struct bio *
1832extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1832btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1833 gfp_t gfp_flags) 1833 gfp_t gfp_flags)
1834{ 1834{
1835 struct bio *bio; 1835 struct bio *bio;
1836 1836
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1919 else 1919 else
1920 nr = bio_get_nr_vecs(bdev); 1920 nr = bio_get_nr_vecs(bdev);
1921 1921
1922 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1923 1923
1924 bio_add_page(bio, page, page_size, offset); 1924 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1925 bio->bi_end_io = end_io_func;
@@ -2901,21 +2901,53 @@ out:
2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2902 __u64 start, __u64 len, get_extent_t *get_extent) 2902 __u64 start, __u64 len, get_extent_t *get_extent)
2903{ 2903{
2904 int ret; 2904 int ret = 0;
2905 u64 off = start; 2905 u64 off = start;
2906 u64 max = start + len; 2906 u64 max = start + len;
2907 u32 flags = 0; 2907 u32 flags = 0;
2908 u32 found_type;
2909 u64 last;
2908 u64 disko = 0; 2910 u64 disko = 0;
2911 struct btrfs_key found_key;
2909 struct extent_map *em = NULL; 2912 struct extent_map *em = NULL;
2910 struct extent_state *cached_state = NULL; 2913 struct extent_state *cached_state = NULL;
2914 struct btrfs_path *path;
2915 struct btrfs_file_extent_item *item;
2911 int end = 0; 2916 int end = 0;
2912 u64 em_start = 0, em_len = 0; 2917 u64 em_start = 0, em_len = 0;
2913 unsigned long emflags; 2918 unsigned long emflags;
2914 ret = 0; 2919 int hole = 0;
2915 2920
2916 if (len == 0) 2921 if (len == 0)
2917 return -EINVAL; 2922 return -EINVAL;
2918 2923
2924 path = btrfs_alloc_path();
2925 if (!path)
2926 return -ENOMEM;
2927 path->leave_spinning = 1;
2928
2929 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2930 path, inode->i_ino, -1, 0);
2931 if (ret < 0) {
2932 btrfs_free_path(path);
2933 return ret;
2934 }
2935 WARN_ON(!ret);
2936 path->slots[0]--;
2937 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2938 struct btrfs_file_extent_item);
2939 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2940 found_type = btrfs_key_type(&found_key);
2941
2942 /* No extents, just return */
2943 if (found_key.objectid != inode->i_ino ||
2944 found_type != BTRFS_EXTENT_DATA_KEY) {
2945 btrfs_free_path(path);
2946 return 0;
2947 }
2948 last = found_key.offset;
2949 btrfs_free_path(path);
2950
2919 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2951 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2920 &cached_state, GFP_NOFS); 2952 &cached_state, GFP_NOFS);
2921 em = get_extent(inode, NULL, 0, off, max - off, 0); 2953 em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2925 ret = PTR_ERR(em); 2957 ret = PTR_ERR(em);
2926 goto out; 2958 goto out;
2927 } 2959 }
2960
2928 while (!end) { 2961 while (!end) {
2962 hole = 0;
2929 off = em->start + em->len; 2963 off = em->start + em->len;
2930 if (off >= max) 2964 if (off >= max)
2931 end = 1; 2965 end = 1;
2932 2966
2967 if (em->block_start == EXTENT_MAP_HOLE) {
2968 hole = 1;
2969 goto next;
2970 }
2971
2933 em_start = em->start; 2972 em_start = em->start;
2934 em_len = em->len; 2973 em_len = em->len;
2935 2974
@@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2939 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2978 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2940 end = 1; 2979 end = 1;
2941 flags |= FIEMAP_EXTENT_LAST; 2980 flags |= FIEMAP_EXTENT_LAST;
2942 } else if (em->block_start == EXTENT_MAP_HOLE) {
2943 flags |= FIEMAP_EXTENT_UNWRITTEN;
2944 } else if (em->block_start == EXTENT_MAP_INLINE) { 2981 } else if (em->block_start == EXTENT_MAP_INLINE) {
2945 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2982 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2946 FIEMAP_EXTENT_NOT_ALIGNED); 2983 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2990 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2954 flags |= FIEMAP_EXTENT_ENCODED; 2991 flags |= FIEMAP_EXTENT_ENCODED;
2955 2992
2993next:
2956 emflags = em->flags; 2994 emflags = em->flags;
2957 free_extent_map(em); 2995 free_extent_map(em);
2958 em = NULL; 2996 em = NULL;
2959
2960 if (!end) { 2997 if (!end) {
2961 em = get_extent(inode, NULL, 0, off, max - off, 0); 2998 em = get_extent(inode, NULL, 0, off, max - off, 0);
2962 if (!em) 2999 if (!em)
@@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2967 } 3004 }
2968 emflags = em->flags; 3005 emflags = em->flags;
2969 } 3006 }
3007
2970 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3008 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2971 flags |= FIEMAP_EXTENT_LAST; 3009 flags |= FIEMAP_EXTENT_LAST;
2972 end = 1; 3010 end = 1;
2973 } 3011 }
2974 3012
2975 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3013 if (em_start == last) {
2976 em_len, flags); 3014 flags |= FIEMAP_EXTENT_LAST;
2977 if (ret) 3015 end = 1;
2978 goto out_free; 3016 }
3017
3018 if (!hole) {
3019 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3020 em_len, flags);
3021 if (ret)
3022 goto out_free;
3023 }
2979 } 3024 }
2980out_free: 3025out_free:
2981 free_extent_map(em); 3026 free_extent_map(em);
@@ -3836,8 +3881,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3836 3881
3837 spin_lock(&tree->buffer_lock); 3882 spin_lock(&tree->buffer_lock);
3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3883 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3839 if (!eb) 3884 if (!eb) {
3840 goto out; 3885 spin_unlock(&tree->buffer_lock);
3886 return ret;
3887 }
3841 3888
3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3889 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3843 ret = 0; 3890 ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef7..4183c8178f01 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
310 struct extent_io_tree *tree, 310 struct extent_io_tree *tree,
311 u64 start, u64 end, struct page *locked_page, 311 u64 start, u64 end, struct page *locked_page,
312 unsigned long op); 312 unsigned long op);
313struct bio *
314btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
315 gfp_t gfp_flags);
313#endif 316#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df082..66836d85763b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 struct iov_iter *i) 49 struct iov_iter *i)
50{ 50{
51 size_t copied; 51 size_t copied = 0;
52 int pg = 0; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 int total_copied = 0;
54 55
55 while (write_bytes > 0) { 56 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 57 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 58 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[pg]; 59 struct page *page = prepared_pages[pg];
59again: 60 /*
60 if (unlikely(iov_iter_fault_in_readable(i, count))) 61 * Copy data from userspace to the current page
61 return -EFAULT; 62 *
62 63 * Disable pagefault to avoid recursive lock since
63 /* Copy data from userspace to the current page */ 64 * the pages are already locked
64 copied = iov_iter_copy_from_user(page, i, offset, count); 65 */
66 pagefault_disable();
67 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
68 pagefault_enable();
65 69
66 /* Flush processor's dcache for this page */ 70 /* Flush processor's dcache for this page */
67 flush_dcache_page(page); 71 flush_dcache_page(page);
68 iov_iter_advance(i, copied); 72 iov_iter_advance(i, copied);
69 write_bytes -= copied; 73 write_bytes -= copied;
74 total_copied += copied;
70 75
76 /* Return to btrfs_file_aio_write to fault page */
71 if (unlikely(copied == 0)) { 77 if (unlikely(copied == 0)) {
72 count = min_t(size_t, PAGE_CACHE_SIZE - offset, 78 break;
73 iov_iter_single_seg_count(i));
74 goto again;
75 } 79 }
76 80
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 81 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +85,7 @@ again:
81 offset = 0; 85 offset = 0;
82 } 86 }
83 } 87 }
84 return 0; 88 return total_copied;
85} 89}
86 90
87/* 91/*
@@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
854 unsigned long last_index; 858 unsigned long last_index;
855 int will_write; 859 int will_write;
856 int buffered = 0; 860 int buffered = 0;
861 int copied = 0;
862 int dirty_pages = 0;
857 863
858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 864 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
859 (file->f_flags & O_DIRECT)); 865 (file->f_flags & O_DIRECT));
@@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
970 WARN_ON(num_pages > nrptrs); 976 WARN_ON(num_pages > nrptrs);
971 memset(pages, 0, sizeof(struct page *) * nrptrs); 977 memset(pages, 0, sizeof(struct page *) * nrptrs);
972 978
973 ret = btrfs_delalloc_reserve_space(inode, write_bytes); 979 /*
980 * Fault pages before locking them in prepare_pages
981 * to avoid recursive lock
982 */
983 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
984 ret = -EFAULT;
985 goto out;
986 }
987
988 ret = btrfs_delalloc_reserve_space(inode,
989 num_pages << PAGE_CACHE_SHIFT);
974 if (ret) 990 if (ret)
975 goto out; 991 goto out;
976 992
@@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
978 pos, first_index, last_index, 994 pos, first_index, last_index,
979 write_bytes); 995 write_bytes);
980 if (ret) { 996 if (ret) {
981 btrfs_delalloc_release_space(inode, write_bytes); 997 btrfs_delalloc_release_space(inode,
998 num_pages << PAGE_CACHE_SHIFT);
982 goto out; 999 goto out;
983 } 1000 }
984 1001
985 ret = btrfs_copy_from_user(pos, num_pages, 1002 copied = btrfs_copy_from_user(pos, num_pages,
986 write_bytes, pages, &i); 1003 write_bytes, pages, &i);
987 if (ret == 0) { 1004 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
1005 PAGE_CACHE_SHIFT;
1006
1007 if (num_pages > dirty_pages) {
1008 if (copied > 0)
1009 atomic_inc(
1010 &BTRFS_I(inode)->outstanding_extents);
1011 btrfs_delalloc_release_space(inode,
1012 (num_pages - dirty_pages) <<
1013 PAGE_CACHE_SHIFT);
1014 }
1015
1016 if (copied > 0) {
988 dirty_and_release_pages(NULL, root, file, pages, 1017 dirty_and_release_pages(NULL, root, file, pages,
989 num_pages, pos, write_bytes); 1018 dirty_pages, pos, copied);
990 } 1019 }
991 1020
992 btrfs_drop_pages(pages, num_pages); 1021 btrfs_drop_pages(pages, num_pages);
993 if (ret) {
994 btrfs_delalloc_release_space(inode, write_bytes);
995 goto out;
996 }
997 1022
998 if (will_write) { 1023 if (copied > 0) {
999 filemap_fdatawrite_range(inode->i_mapping, pos, 1024 if (will_write) {
1000 pos + write_bytes - 1); 1025 filemap_fdatawrite_range(inode->i_mapping, pos,
1001 } else { 1026 pos + copied - 1);
1002 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1027 } else {
1003 num_pages); 1028 balance_dirty_pages_ratelimited_nr(
1004 if (num_pages < 1029 inode->i_mapping,
1005 (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1030 dirty_pages);
1006 btrfs_btree_balance_dirty(root, 1); 1031 if (dirty_pages <
1007 btrfs_throttle(root); 1032 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1033 btrfs_btree_balance_dirty(root, 1);
1034 btrfs_throttle(root);
1035 }
1008 } 1036 }
1009 1037
1010 pos += write_bytes; 1038 pos += copied;
1011 num_written += write_bytes; 1039 num_written += copied;
1012 1040
1013 cond_resched(); 1041 cond_resched();
1014 } 1042 }
@@ -1047,8 +1075,14 @@ out:
1047 1075
1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1076 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1049 trans = btrfs_start_transaction(root, 0); 1077 trans = btrfs_start_transaction(root, 0);
1078 if (IS_ERR(trans)) {
1079 num_written = PTR_ERR(trans);
1080 goto done;
1081 }
1082 mutex_lock(&inode->i_mutex);
1050 ret = btrfs_log_dentry_safe(trans, root, 1083 ret = btrfs_log_dentry_safe(trans, root,
1051 file->f_dentry); 1084 file->f_dentry);
1085 mutex_unlock(&inode->i_mutex);
1052 if (ret == 0) { 1086 if (ret == 0) {
1053 ret = btrfs_sync_log(trans, root); 1087 ret = btrfs_sync_log(trans, root);
1054 if (ret == 0) 1088 if (ret == 0)
@@ -1067,6 +1101,7 @@ out:
1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1101 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1068 } 1102 }
1069 } 1103 }
1104done:
1070 current->backing_dev_info = NULL; 1105 current->backing_dev_info = NULL;
1071 return num_written ? num_written : err; 1106 return num_written ? num_written : err;
1072} 1107}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b8..60d684266959 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
290 (unsigned long long)BTRFS_I(inode)->generation, 290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation, 291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid); 292 (unsigned long long)block_group->key.objectid);
293 goto out; 293 goto free_cache;
294 } 294 }
295 295
296 if (!num_entries) 296 if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
524 return 0; 524 return 0;
525 } 525 }
526 526
527 node = rb_first(&block_group->free_space_offset);
528 if (!node) {
529 iput(inode);
530 return 0;
531 }
532
527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
528 filemap_write_and_wait(inode->i_mapping); 534 filemap_write_and_wait(inode->i_mapping);
529 btrfs_wait_ordered_range(inode, inode->i_size & 535 btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
543 */ 549 */
544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
545 551
546 node = rb_first(&block_group->free_space_offset);
547 if (!node)
548 goto out_free;
549
550 /* 552 /*
551 * Lock all pages first so we can lock the extent safely. 553 * Lock all pages first so we can lock the extent safely.
552 * 554 *
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa54..72f31ecb5c90 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -495,7 +495,7 @@ again:
495 add_async_extent(async_cow, start, num_bytes, 495 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 496 total_compressed, pages, nr_pages_ret);
497 497
498 if (start + num_bytes < end && start + num_bytes < actual_end) { 498 if (start + num_bytes < end) {
499 start += num_bytes; 499 start += num_bytes;
500 pages = NULL; 500 pages = NULL;
501 cond_resched(); 501 cond_resched();
@@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4501 BTRFS_I(inode)->index_cnt = 2; 4501 BTRFS_I(inode)->index_cnt = 2;
4502 BTRFS_I(inode)->root = root; 4502 BTRFS_I(inode)->root = root;
4503 BTRFS_I(inode)->generation = trans->transid; 4503 BTRFS_I(inode)->generation = trans->transid;
4504 inode->i_generation = BTRFS_I(inode)->generation;
4504 btrfs_set_inode_space_info(root, inode); 4505 btrfs_set_inode_space_info(root, inode);
4505 4506
4506 if (mode & S_IFDIR) 4507 if (mode & S_IFDIR)
@@ -4622,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4622} 4623}
4623 4624
4624static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4625static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4625 struct dentry *dentry, struct inode *inode, 4626 struct inode *dir, struct dentry *dentry,
4626 int backref, u64 index) 4627 struct inode *inode, int backref, u64 index)
4627{ 4628{
4628 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4629 int err = btrfs_add_link(trans, dir, inode,
4629 inode, dentry->d_name.name, 4630 dentry->d_name.name, dentry->d_name.len,
4630 dentry->d_name.len, backref, index); 4631 backref, index);
4631 if (!err) { 4632 if (!err) {
4632 d_instantiate(dentry, inode); 4633 d_instantiate(dentry, inode);
4633 return 0; 4634 return 0;
@@ -4668,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4668 btrfs_set_trans_block_group(trans, dir); 4669 btrfs_set_trans_block_group(trans, dir);
4669 4670
4670 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4671 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4671 dentry->d_name.len, 4672 dentry->d_name.len, dir->i_ino, objectid,
4672 dentry->d_parent->d_inode->i_ino, objectid,
4673 BTRFS_I(dir)->block_group, mode, &index); 4673 BTRFS_I(dir)->block_group, mode, &index);
4674 err = PTR_ERR(inode); 4674 err = PTR_ERR(inode);
4675 if (IS_ERR(inode)) 4675 if (IS_ERR(inode))
@@ -4682,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4682 } 4682 }
4683 4683
4684 btrfs_set_trans_block_group(trans, inode); 4684 btrfs_set_trans_block_group(trans, inode);
4685 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4685 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4686 if (err) 4686 if (err)
4687 drop_inode = 1; 4687 drop_inode = 1;
4688 else { 4688 else {
@@ -4730,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4730 btrfs_set_trans_block_group(trans, dir); 4730 btrfs_set_trans_block_group(trans, dir);
4731 4731
4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4733 dentry->d_name.len, 4733 dentry->d_name.len, dir->i_ino, objectid,
4734 dentry->d_parent->d_inode->i_ino, 4734 BTRFS_I(dir)->block_group, mode, &index);
4735 objectid, BTRFS_I(dir)->block_group, mode,
4736 &index);
4737 err = PTR_ERR(inode); 4735 err = PTR_ERR(inode);
4738 if (IS_ERR(inode)) 4736 if (IS_ERR(inode))
4739 goto out_unlock; 4737 goto out_unlock;
@@ -4745,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4745 } 4743 }
4746 4744
4747 btrfs_set_trans_block_group(trans, inode); 4745 btrfs_set_trans_block_group(trans, inode);
4748 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4746 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4749 if (err) 4747 if (err)
4750 drop_inode = 1; 4748 drop_inode = 1;
4751 else { 4749 else {
@@ -4787,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4787 return -EPERM; 4785 return -EPERM;
4788 4786
4789 btrfs_inc_nlink(inode); 4787 btrfs_inc_nlink(inode);
4788 inode->i_ctime = CURRENT_TIME;
4790 4789
4791 err = btrfs_set_inode_index(dir, &index); 4790 err = btrfs_set_inode_index(dir, &index);
4792 if (err) 4791 if (err)
@@ -4805,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4805 btrfs_set_trans_block_group(trans, dir); 4804 btrfs_set_trans_block_group(trans, dir);
4806 ihold(inode); 4805 ihold(inode);
4807 4806
4808 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4807 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4809 4808
4810 if (err) { 4809 if (err) {
4811 drop_inode = 1; 4810 drop_inode = 1;
4812 } else { 4811 } else {
4812 struct dentry *parent = dget_parent(dentry);
4813 btrfs_update_inode_block_group(trans, dir); 4813 btrfs_update_inode_block_group(trans, dir);
4814 err = btrfs_update_inode(trans, root, inode); 4814 err = btrfs_update_inode(trans, root, inode);
4815 BUG_ON(err); 4815 BUG_ON(err);
4816 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4816 btrfs_log_new_name(trans, inode, NULL, parent);
4817 dput(parent);
4817 } 4818 }
4818 4819
4819 nr = trans->blocks_used; 4820 nr = trans->blocks_used;
@@ -4853,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4853 btrfs_set_trans_block_group(trans, dir); 4854 btrfs_set_trans_block_group(trans, dir);
4854 4855
4855 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4856 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4856 dentry->d_name.len, 4857 dentry->d_name.len, dir->i_ino, objectid,
4857 dentry->d_parent->d_inode->i_ino, objectid,
4858 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4858 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4859 &index); 4859 &index);
4860 if (IS_ERR(inode)) { 4860 if (IS_ERR(inode)) {
@@ -4877,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4877 if (err) 4877 if (err)
4878 goto out_fail; 4878 goto out_fail;
4879 4879
4880 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4880 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4881 inode, dentry->d_name.name, 4881 dentry->d_name.len, 0, index);
4882 dentry->d_name.len, 0, index);
4883 if (err) 4882 if (err)
4884 goto out_fail; 4883 goto out_fail;
4885 4884
@@ -5535,13 +5534,21 @@ struct btrfs_dio_private {
5535 u64 bytes; 5534 u64 bytes;
5536 u32 *csums; 5535 u32 *csums;
5537 void *private; 5536 void *private;
5537
5538 /* number of bios pending for this dio */
5539 atomic_t pending_bios;
5540
5541 /* IO errors */
5542 int errors;
5543
5544 struct bio *orig_bio;
5538}; 5545};
5539 5546
5540static void btrfs_endio_direct_read(struct bio *bio, int err) 5547static void btrfs_endio_direct_read(struct bio *bio, int err)
5541{ 5548{
5549 struct btrfs_dio_private *dip = bio->bi_private;
5542 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5550 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5543 struct bio_vec *bvec = bio->bi_io_vec; 5551 struct bio_vec *bvec = bio->bi_io_vec;
5544 struct btrfs_dio_private *dip = bio->bi_private;
5545 struct inode *inode = dip->inode; 5552 struct inode *inode = dip->inode;
5546 struct btrfs_root *root = BTRFS_I(inode)->root; 5553 struct btrfs_root *root = BTRFS_I(inode)->root;
5547 u64 start; 5554 u64 start;
@@ -5595,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5595 struct btrfs_trans_handle *trans; 5602 struct btrfs_trans_handle *trans;
5596 struct btrfs_ordered_extent *ordered = NULL; 5603 struct btrfs_ordered_extent *ordered = NULL;
5597 struct extent_state *cached_state = NULL; 5604 struct extent_state *cached_state = NULL;
5605 u64 ordered_offset = dip->logical_offset;
5606 u64 ordered_bytes = dip->bytes;
5598 int ret; 5607 int ret;
5599 5608
5600 if (err) 5609 if (err)
5601 goto out_done; 5610 goto out_done;
5602 5611again:
5603 ret = btrfs_dec_test_ordered_pending(inode, &ordered, 5612 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5604 dip->logical_offset, dip->bytes); 5613 &ordered_offset,
5614 ordered_bytes);
5605 if (!ret) 5615 if (!ret)
5606 goto out_done; 5616 goto out_test;
5607 5617
5608 BUG_ON(!ordered); 5618 BUG_ON(!ordered);
5609 5619
@@ -5663,8 +5673,20 @@ out_unlock:
5663out: 5673out:
5664 btrfs_delalloc_release_metadata(inode, ordered->len); 5674 btrfs_delalloc_release_metadata(inode, ordered->len);
5665 btrfs_end_transaction(trans, root); 5675 btrfs_end_transaction(trans, root);
5676 ordered_offset = ordered->file_offset + ordered->len;
5666 btrfs_put_ordered_extent(ordered); 5677 btrfs_put_ordered_extent(ordered);
5667 btrfs_put_ordered_extent(ordered); 5678 btrfs_put_ordered_extent(ordered);
5679
5680out_test:
5681 /*
5682 * our bio might span multiple ordered extents. If we haven't
5683 * completed the accounting for the whole dio, go back and try again
5684 */
5685 if (ordered_offset < dip->logical_offset + dip->bytes) {
5686 ordered_bytes = dip->logical_offset + dip->bytes -
5687 ordered_offset;
5688 goto again;
5689 }
5668out_done: 5690out_done:
5669 bio->bi_private = dip->private; 5691 bio->bi_private = dip->private;
5670 5692
@@ -5684,6 +5706,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5684 return 0; 5706 return 0;
5685} 5707}
5686 5708
5709static void btrfs_end_dio_bio(struct bio *bio, int err)
5710{
5711 struct btrfs_dio_private *dip = bio->bi_private;
5712
5713 if (err) {
5714 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
5715 "sector %#Lx len %u err no %d\n",
5716 dip->inode->i_ino, bio->bi_rw,
5717 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5718 dip->errors = 1;
5719
5720 /*
5721 * before atomic variable goto zero, we must make sure
5722 * dip->errors is perceived to be set.
5723 */
5724 smp_mb__before_atomic_dec();
5725 }
5726
5727 /* if there are more bios still pending for this dio, just exit */
5728 if (!atomic_dec_and_test(&dip->pending_bios))
5729 goto out;
5730
5731 if (dip->errors)
5732 bio_io_error(dip->orig_bio);
5733 else {
5734 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5735 bio_endio(dip->orig_bio, 0);
5736 }
5737out:
5738 bio_put(bio);
5739}
5740
5741static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5742 u64 first_sector, gfp_t gfp_flags)
5743{
5744 int nr_vecs = bio_get_nr_vecs(bdev);
5745 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5746}
5747
5748static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5749 int rw, u64 file_offset, int skip_sum,
5750 u32 *csums)
5751{
5752 int write = rw & REQ_WRITE;
5753 struct btrfs_root *root = BTRFS_I(inode)->root;
5754 int ret;
5755
5756 bio_get(bio);
5757 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5758 if (ret)
5759 goto err;
5760
5761 if (write && !skip_sum) {
5762 ret = btrfs_wq_submit_bio(root->fs_info,
5763 inode, rw, bio, 0, 0,
5764 file_offset,
5765 __btrfs_submit_bio_start_direct_io,
5766 __btrfs_submit_bio_done);
5767 goto err;
5768 } else if (!skip_sum)
5769 btrfs_lookup_bio_sums_dio(root, inode, bio,
5770 file_offset, csums);
5771
5772 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5773err:
5774 bio_put(bio);
5775 return ret;
5776}
5777
5778static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5779 int skip_sum)
5780{
5781 struct inode *inode = dip->inode;
5782 struct btrfs_root *root = BTRFS_I(inode)->root;
5783 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5784 struct bio *bio;
5785 struct bio *orig_bio = dip->orig_bio;
5786 struct bio_vec *bvec = orig_bio->bi_io_vec;
5787 u64 start_sector = orig_bio->bi_sector;
5788 u64 file_offset = dip->logical_offset;
5789 u64 submit_len = 0;
5790 u64 map_length;
5791 int nr_pages = 0;
5792 u32 *csums = dip->csums;
5793 int ret = 0;
5794
5795 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5796 if (!bio)
5797 return -ENOMEM;
5798 bio->bi_private = dip;
5799 bio->bi_end_io = btrfs_end_dio_bio;
5800 atomic_inc(&dip->pending_bios);
5801
5802 map_length = orig_bio->bi_size;
5803 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5804 &map_length, NULL, 0);
5805 if (ret) {
5806 bio_put(bio);
5807 return -EIO;
5808 }
5809
5810 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5811 if (unlikely(map_length < submit_len + bvec->bv_len ||
5812 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5813 bvec->bv_offset) < bvec->bv_len)) {
5814 /*
5815 * inc the count before we submit the bio so
5816 * we know the end IO handler won't happen before
5817 * we inc the count. Otherwise, the dip might get freed
5818 * before we're done setting it up
5819 */
5820 atomic_inc(&dip->pending_bios);
5821 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5822 file_offset, skip_sum,
5823 csums);
5824 if (ret) {
5825 bio_put(bio);
5826 atomic_dec(&dip->pending_bios);
5827 goto out_err;
5828 }
5829
5830 if (!skip_sum)
5831 csums = csums + nr_pages;
5832 start_sector += submit_len >> 9;
5833 file_offset += submit_len;
5834
5835 submit_len = 0;
5836 nr_pages = 0;
5837
5838 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5839 start_sector, GFP_NOFS);
5840 if (!bio)
5841 goto out_err;
5842 bio->bi_private = dip;
5843 bio->bi_end_io = btrfs_end_dio_bio;
5844
5845 map_length = orig_bio->bi_size;
5846 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5847 &map_length, NULL, 0);
5848 if (ret) {
5849 bio_put(bio);
5850 goto out_err;
5851 }
5852 } else {
5853 submit_len += bvec->bv_len;
5854 nr_pages ++;
5855 bvec++;
5856 }
5857 }
5858
5859 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5860 csums);
5861 if (!ret)
5862 return 0;
5863
5864 bio_put(bio);
5865out_err:
5866 dip->errors = 1;
5867 /*
5868 * before atomic variable goto zero, we must
5869 * make sure dip->errors is perceived to be set.
5870 */
5871 smp_mb__before_atomic_dec();
5872 if (atomic_dec_and_test(&dip->pending_bios))
5873 bio_io_error(dip->orig_bio);
5874
5875 /* bio_end_io() will handle error, so we needn't return it */
5876 return 0;
5877}
5878
5687static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 5879static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5688 loff_t file_offset) 5880 loff_t file_offset)
5689{ 5881{
@@ -5723,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5723 5915
5724 dip->disk_bytenr = (u64)bio->bi_sector << 9; 5916 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5725 bio->bi_private = dip; 5917 bio->bi_private = dip;
5918 dip->errors = 0;
5919 dip->orig_bio = bio;
5920 atomic_set(&dip->pending_bios, 0);
5726 5921
5727 if (write) 5922 if (write)
5728 bio->bi_end_io = btrfs_endio_direct_write; 5923 bio->bi_end_io = btrfs_endio_direct_write;
5729 else 5924 else
5730 bio->bi_end_io = btrfs_endio_direct_read; 5925 bio->bi_end_io = btrfs_endio_direct_read;
5731 5926
5732 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5927 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5733 if (ret) 5928 if (!ret)
5734 goto out_err;
5735
5736 if (write && !skip_sum) {
5737 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5738 inode, rw, bio, 0, 0,
5739 dip->logical_offset,
5740 __btrfs_submit_bio_start_direct_io,
5741 __btrfs_submit_bio_done);
5742 if (ret)
5743 goto out_err;
5744 return; 5929 return;
5745 } else if (!skip_sum)
5746 btrfs_lookup_bio_sums_dio(root, inode, bio,
5747 dip->logical_offset, dip->csums);
5748
5749 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5750 if (ret)
5751 goto out_err;
5752 return;
5753out_err:
5754 kfree(dip->csums);
5755 kfree(dip);
5756free_ordered: 5930free_ordered:
5757 /* 5931 /*
5758 * If this is a write, we need to clean up the reserved space and kill 5932 * If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5934,7 @@ free_ordered:
5760 */ 5934 */
5761 if (write) { 5935 if (write) {
5762 struct btrfs_ordered_extent *ordered; 5936 struct btrfs_ordered_extent *ordered;
5763 ordered = btrfs_lookup_ordered_extent(inode, 5937 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5764 dip->logical_offset);
5765 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 5938 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5766 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 5939 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5767 btrfs_free_reserved_extent(root, ordered->start, 5940 btrfs_free_reserved_extent(root, ordered->start,
@@ -6607,8 +6780,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6607 BUG_ON(ret); 6780 BUG_ON(ret);
6608 6781
6609 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 6782 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
6610 btrfs_log_new_name(trans, old_inode, old_dir, 6783 struct dentry *parent = dget_parent(new_dentry);
6611 new_dentry->d_parent); 6784 btrfs_log_new_name(trans, old_inode, old_dir, parent);
6785 dput(parent);
6612 btrfs_end_log_trans(root); 6786 btrfs_end_log_trans(root);
6613 } 6787 }
6614out_fail: 6788out_fail:
@@ -6758,8 +6932,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6758 btrfs_set_trans_block_group(trans, dir); 6932 btrfs_set_trans_block_group(trans, dir);
6759 6933
6760 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6934 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6761 dentry->d_name.len, 6935 dentry->d_name.len, dir->i_ino, objectid,
6762 dentry->d_parent->d_inode->i_ino, objectid,
6763 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 6936 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
6764 &index); 6937 &index);
6765 err = PTR_ERR(inode); 6938 err = PTR_ERR(inode);
@@ -6773,7 +6946,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6773 } 6946 }
6774 6947
6775 btrfs_set_trans_block_group(trans, inode); 6948 btrfs_set_trans_block_group(trans, inode);
6776 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 6949 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6777 if (err) 6950 if (err)
6778 drop_inode = 1; 6951 drop_inode = 1;
6779 else { 6952 else {
@@ -6844,6 +7017,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6844 struct btrfs_root *root = BTRFS_I(inode)->root; 7017 struct btrfs_root *root = BTRFS_I(inode)->root;
6845 struct btrfs_key ins; 7018 struct btrfs_key ins;
6846 u64 cur_offset = start; 7019 u64 cur_offset = start;
7020 u64 i_size;
6847 int ret = 0; 7021 int ret = 0;
6848 bool own_trans = true; 7022 bool own_trans = true;
6849 7023
@@ -6885,11 +7059,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6885 (actual_len > inode->i_size) && 7059 (actual_len > inode->i_size) &&
6886 (cur_offset > inode->i_size)) { 7060 (cur_offset > inode->i_size)) {
6887 if (cur_offset > actual_len) 7061 if (cur_offset > actual_len)
6888 i_size_write(inode, actual_len); 7062 i_size = actual_len;
6889 else 7063 else
6890 i_size_write(inode, cur_offset); 7064 i_size = cur_offset;
6891 i_size_write(inode, cur_offset); 7065 i_size_write(inode, i_size);
6892 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 7066 btrfs_ordered_update_i_size(inode, i_size, NULL);
6893 } 7067 }
6894 7068
6895 ret = btrfs_update_inode(trans, root, inode); 7069 ret = btrfs_update_inode(trans, root, inode);
@@ -6943,6 +7117,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
6943 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 7117 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
6944 7118
6945 mutex_lock(&inode->i_mutex); 7119 mutex_lock(&inode->i_mutex);
7120 ret = inode_newsize_ok(inode, alloc_end);
7121 if (ret)
7122 goto out;
7123
6946 if (alloc_start > inode->i_size) { 7124 if (alloc_start > inode->i_size) {
6947 ret = btrfs_cont_expand(inode, alloc_start); 7125 ret = btrfs_cont_expand(inode, alloc_start);
6948 if (ret) 7126 if (ret)
@@ -7139,6 +7317,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7139 .readlink = generic_readlink, 7317 .readlink = generic_readlink,
7140 .follow_link = page_follow_link_light, 7318 .follow_link = page_follow_link_light,
7141 .put_link = page_put_link, 7319 .put_link = page_put_link,
7320 .getattr = btrfs_getattr,
7142 .permission = btrfs_permission, 7321 .permission = btrfs_permission,
7143 .setxattr = btrfs_setxattr, 7322 .setxattr = btrfs_setxattr,
7144 .getxattr = btrfs_getxattr, 7323 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3a..f87552a1d7ea 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root,
233 struct btrfs_inode_item *inode_item; 233 struct btrfs_inode_item *inode_item;
234 struct extent_buffer *leaf; 234 struct extent_buffer *leaf;
235 struct btrfs_root *new_root; 235 struct btrfs_root *new_root;
236 struct inode *dir = dentry->d_parent->d_inode; 236 struct dentry *parent = dget_parent(dentry);
237 struct inode *dir;
237 int ret; 238 int ret;
238 int err; 239 int err;
239 u64 objectid; 240 u64 objectid;
@@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root,
242 243
243 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 244 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
244 0, &objectid); 245 0, &objectid);
245 if (ret) 246 if (ret) {
247 dput(parent);
246 return ret; 248 return ret;
249 }
250
251 dir = parent->d_inode;
252
247 /* 253 /*
248 * 1 - inode item 254 * 1 - inode item
249 * 2 - refs 255 * 2 - refs
@@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root,
251 * 2 - dir items 257 * 2 - dir items
252 */ 258 */
253 trans = btrfs_start_transaction(root, 6); 259 trans = btrfs_start_transaction(root, 6);
254 if (IS_ERR(trans)) 260 if (IS_ERR(trans)) {
261 dput(parent);
255 return PTR_ERR(trans); 262 return PTR_ERR(trans);
263 }
256 264
257 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 265 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
258 0, objectid, NULL, 0, 0, 0); 266 0, objectid, NULL, 0, 0, 0);
@@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
339 347
340 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 348 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
341fail: 349fail:
350 dput(parent);
342 if (async_transid) { 351 if (async_transid) {
343 *async_transid = trans->transid; 352 *async_transid = trans->transid;
344 err = btrfs_commit_transaction_async(trans, root, 1); 353 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354 char *name, int namelen, u64 *async_transid) 363 char *name, int namelen, u64 *async_transid)
355{ 364{
356 struct inode *inode; 365 struct inode *inode;
366 struct dentry *parent;
357 struct btrfs_pending_snapshot *pending_snapshot; 367 struct btrfs_pending_snapshot *pending_snapshot;
358 struct btrfs_trans_handle *trans; 368 struct btrfs_trans_handle *trans;
359 int ret; 369 int ret;
@@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
396 406
397 btrfs_orphan_cleanup(pending_snapshot->snap); 407 btrfs_orphan_cleanup(pending_snapshot->snap);
398 408
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 409 parent = dget_parent(dentry);
410 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
411 dput(parent);
400 if (IS_ERR(inode)) { 412 if (IS_ERR(inode)) {
401 ret = PTR_ERR(inode); 413 ret = PTR_ERR(inode);
402 goto fail; 414 goto fail;
@@ -935,23 +947,42 @@ out:
935 947
936static noinline int btrfs_ioctl_snap_create(struct file *file, 948static noinline int btrfs_ioctl_snap_create(struct file *file,
937 void __user *arg, int subvol, 949 void __user *arg, int subvol,
938 int async) 950 int v2)
939{ 951{
940 struct btrfs_ioctl_vol_args *vol_args = NULL; 952 struct btrfs_ioctl_vol_args *vol_args = NULL;
941 struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; 953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
942 char *name; 954 char *name;
943 u64 fd; 955 u64 fd;
944 u64 transid = 0;
945 int ret; 956 int ret;
946 957
947 if (async) { 958 if (v2) {
948 async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); 959 u64 transid = 0;
949 if (IS_ERR(async_vol_args)) 960 u64 *ptr = NULL;
950 return PTR_ERR(async_vol_args); 961
962 vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
963 if (IS_ERR(vol_args_v2))
964 return PTR_ERR(vol_args_v2);
965
966 if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
967 ret = -EINVAL;
968 goto out;
969 }
970
971 name = vol_args_v2->name;
972 fd = vol_args_v2->fd;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
974
975 if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
976 ptr = &transid;
977
978 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
979 subvol, ptr);
951 980
952 name = async_vol_args->name; 981 if (ret == 0 && ptr &&
953 fd = async_vol_args->fd; 982 copy_to_user(arg +
954 async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; 983 offsetof(struct btrfs_ioctl_vol_args_v2,
984 transid), ptr, sizeof(*ptr)))
985 ret = -EFAULT;
955 } else { 986 } else {
956 vol_args = memdup_user(arg, sizeof(*vol_args)); 987 vol_args = memdup_user(arg, sizeof(*vol_args));
957 if (IS_ERR(vol_args)) 988 if (IS_ERR(vol_args))
@@ -959,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
959 name = vol_args->name; 990 name = vol_args->name;
960 fd = vol_args->fd; 991 fd = vol_args->fd;
961 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 }
963
964 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
965 subvol, &transid);
966 993
967 if (!ret && async) { 994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
968 if (copy_to_user(arg + 995 subvol, NULL);
969 offsetof(struct btrfs_ioctl_async_vol_args,
970 transid), &transid, sizeof(transid)))
971 return -EFAULT;
972 } 996 }
973 997out:
974 kfree(vol_args); 998 kfree(vol_args);
975 kfree(async_vol_args); 999 kfree(vol_args_v2);
976 1000
977 return ret; 1001 return ret;
978} 1002}
@@ -1669,12 +1693,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1669 olen = len = src->i_size - off; 1693 olen = len = src->i_size - off;
1670 /* if we extend to eof, continue to block boundary */ 1694 /* if we extend to eof, continue to block boundary */
1671 if (off + len == src->i_size) 1695 if (off + len == src->i_size)
1672 len = ((src->i_size + bs-1) & ~(bs-1)) 1696 len = ALIGN(src->i_size, bs) - off;
1673 - off;
1674 1697
1675 /* verify the end result is block aligned */ 1698 /* verify the end result is block aligned */
1676 if ((off & (bs-1)) || 1699 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1677 ((off + len) & (bs-1))) 1700 !IS_ALIGNED(destoff, bs))
1678 goto out_unlock; 1701 goto out_unlock;
1679 1702
1680 /* do any pending delalloc/csum calc on src, one way or 1703 /* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +1897,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1874 * but shouldn't round up the file size 1897 * but shouldn't round up the file size
1875 */ 1898 */
1876 endoff = new_key.offset + datal; 1899 endoff = new_key.offset + datal;
1877 if (endoff > off+olen) 1900 if (endoff > destoff+olen)
1878 endoff = off+olen; 1901 endoff = destoff+olen;
1879 if (endoff > inode->i_size) 1902 if (endoff > inode->i_size)
1880 btrfs_i_size_write(inode, endoff); 1903 btrfs_i_size_write(inode, endoff);
1881 1904
@@ -2235,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int
2235 return btrfs_ioctl_getversion(file, argp); 2258 return btrfs_ioctl_getversion(file, argp);
2236 case BTRFS_IOC_SNAP_CREATE: 2259 case BTRFS_IOC_SNAP_CREATE:
2237 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2260 return btrfs_ioctl_snap_create(file, argp, 0, 0);
2238 case BTRFS_IOC_SNAP_CREATE_ASYNC: 2261 case BTRFS_IOC_SNAP_CREATE_V2:
2239 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2262 return btrfs_ioctl_snap_create(file, argp, 0, 1);
2240 case BTRFS_IOC_SUBVOL_CREATE: 2263 case BTRFS_IOC_SUBVOL_CREATE:
2241 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2264 return btrfs_ioctl_snap_create(file, argp, 1, 0);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf960..c344d12c646b 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SNAPSHOT_NAME_MAX 4079 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34struct btrfs_ioctl_async_vol_args { 34
35#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 {
35 __s64 fd; 37 __s64 fd;
36 __u64 transid; 38 __u64 transid;
37 char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; 39 __u64 flags;
40 __u64 unused[4];
41 char name[BTRFS_SUBVOL_NAME_MAX + 1];
38}; 42};
39 43
40#define BTRFS_INO_LOOKUP_PATH_MAX 4080 44#define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args {
187 struct btrfs_ioctl_space_args) 191 struct btrfs_ioctl_space_args)
188#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) 192#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
189#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
190#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ 194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
191 struct btrfs_ioctl_async_vol_args) 195 struct btrfs_ioctl_vol_args_v2)
192#endif 196#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca1..ae7737e352c9 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
250 250
251/* 251/*
252 * this is used to account for finished IO across a given range 252 * this is used to account for finished IO across a given range
253 * of the file. The IO may span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0.
256 *
257 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
258 * to make sure this function only returns 1 once for a given ordered extent.
259 *
260 * file_offset is updated to one byte past the range that is recorded as
261 * complete. This allows you to walk forward in the file.
262 */
263int btrfs_dec_test_first_ordered_pending(struct inode *inode,
264 struct btrfs_ordered_extent **cached,
265 u64 *file_offset, u64 io_size)
266{
267 struct btrfs_ordered_inode_tree *tree;
268 struct rb_node *node;
269 struct btrfs_ordered_extent *entry = NULL;
270 int ret;
271 u64 dec_end;
272 u64 dec_start;
273 u64 to_dec;
274
275 tree = &BTRFS_I(inode)->ordered_tree;
276 spin_lock(&tree->lock);
277 node = tree_search(tree, *file_offset);
278 if (!node) {
279 ret = 1;
280 goto out;
281 }
282
283 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
284 if (!offset_in_entry(entry, *file_offset)) {
285 ret = 1;
286 goto out;
287 }
288
289 dec_start = max(*file_offset, entry->file_offset);
290 dec_end = min(*file_offset + io_size, entry->file_offset +
291 entry->len);
292 *file_offset = dec_end;
293 if (dec_start > dec_end) {
294 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
295 (unsigned long long)dec_start,
296 (unsigned long long)dec_end);
297 }
298 to_dec = dec_end - dec_start;
299 if (to_dec > entry->bytes_left) {
300 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
301 (unsigned long long)entry->bytes_left,
302 (unsigned long long)to_dec);
303 }
304 entry->bytes_left -= to_dec;
305 if (entry->bytes_left == 0)
306 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
307 else
308 ret = 1;
309out:
310 if (!ret && cached && entry) {
311 *cached = entry;
312 atomic_inc(&entry->refs);
313 }
314 spin_unlock(&tree->lock);
315 return ret == 0;
316}
317
318/*
319 * this is used to account for finished IO across a given range
253 * of the file. The IO should not span ordered extents. If 320 * of the file. The IO should not span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise 321 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0. 322 * 0.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3f..61dca83119dd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
141int btrfs_dec_test_ordered_pending(struct inode *inode, 141int btrfs_dec_test_ordered_pending(struct inode *inode,
142 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
144int btrfs_dec_test_first_ordered_pending(struct inode *inode,
145 struct btrfs_ordered_extent **cached,
146 u64 *file_offset, u64 io_size);
144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 147int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
145 u64 start, u64 len, u64 disk_len, int type); 148 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8f..883c6fa1367e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
244 case Opt_space_cache: 244 case Opt_space_cache:
245 printk(KERN_INFO "btrfs: enabling disk space caching\n"); 245 printk(KERN_INFO "btrfs: enabling disk space caching\n");
246 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 246 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
247 break;
247 case Opt_clear_cache: 248 case Opt_clear_cache:
248 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 249 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
249 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 250 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -562,12 +563,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
562 563
563static int btrfs_test_super(struct super_block *s, void *data) 564static int btrfs_test_super(struct super_block *s, void *data)
564{ 565{
565 struct btrfs_fs_devices *test_fs_devices = data; 566 struct btrfs_root *test_root = data;
566 struct btrfs_root *root = btrfs_sb(s); 567 struct btrfs_root *root = btrfs_sb(s);
567 568
568 return root->fs_info->fs_devices == test_fs_devices; 569 /*
570 * If this super block is going away, return false as it
571 * can't match as an existing super block.
572 */
573 if (!atomic_read(&s->s_active))
574 return 0;
575 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
576}
577
578static int btrfs_set_super(struct super_block *s, void *data)
579{
580 s->s_fs_info = data;
581
582 return set_anon_super(s, data);
569} 583}
570 584
585
571/* 586/*
572 * Find a superblock for the given device / mount point. 587 * Find a superblock for the given device / mount point.
573 * 588 *
@@ -581,6 +596,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
581 struct super_block *s; 596 struct super_block *s;
582 struct dentry *root; 597 struct dentry *root;
583 struct btrfs_fs_devices *fs_devices = NULL; 598 struct btrfs_fs_devices *fs_devices = NULL;
599 struct btrfs_root *tree_root = NULL;
600 struct btrfs_fs_info *fs_info = NULL;
584 fmode_t mode = FMODE_READ; 601 fmode_t mode = FMODE_READ;
585 char *subvol_name = NULL; 602 char *subvol_name = NULL;
586 u64 subvol_objectid = 0; 603 u64 subvol_objectid = 0;
@@ -608,8 +625,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
608 goto error_close_devices; 625 goto error_close_devices;
609 } 626 }
610 627
628 /*
629 * Setup a dummy root and fs_info for test/set super. This is because
630 * we don't actually fill this stuff out until open_ctree, but we need
631 * it for searching for existing supers, so this lets us do that and
632 * then open_ctree will properly initialize everything later.
633 */
634 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
635 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
636 if (!fs_info || !tree_root) {
637 error = -ENOMEM;
638 goto error_close_devices;
639 }
640 fs_info->tree_root = tree_root;
641 fs_info->fs_devices = fs_devices;
642 tree_root->fs_info = fs_info;
643
611 bdev = fs_devices->latest_bdev; 644 bdev = fs_devices->latest_bdev;
612 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 645 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
613 if (IS_ERR(s)) 646 if (IS_ERR(s))
614 goto error_s; 647 goto error_s;
615 648
@@ -652,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
652 mutex_unlock(&root->d_inode->i_mutex); 685 mutex_unlock(&root->d_inode->i_mutex);
653 686
654 if (IS_ERR(new_root)) { 687 if (IS_ERR(new_root)) {
688 dput(root);
655 deactivate_locked_super(s); 689 deactivate_locked_super(s);
656 error = PTR_ERR(new_root); 690 error = PTR_ERR(new_root);
657 dput(root);
658 goto error_free_subvol_name; 691 goto error_free_subvol_name;
659 } 692 }
660 if (!new_root->d_inode) { 693 if (!new_root->d_inode) {
@@ -675,6 +708,8 @@ error_s:
675 error = PTR_ERR(s); 708 error = PTR_ERR(s);
676error_close_devices: 709error_close_devices:
677 btrfs_close_devices(fs_devices); 710 btrfs_close_devices(fs_devices);
711 kfree(fs_info);
712 kfree(tree_root);
678error_free_subvol_name: 713error_free_subvol_name:
679 kfree(subvol_name); 714 kfree(subvol_name);
680 return ERR_PTR(error); 715 return ERR_PTR(error);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bdf..f50e931fc217 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
902 struct btrfs_root *root = pending->root; 902 struct btrfs_root *root = pending->root;
903 struct btrfs_root *parent_root; 903 struct btrfs_root *parent_root;
904 struct inode *parent_inode; 904 struct inode *parent_inode;
905 struct dentry *parent;
905 struct dentry *dentry; 906 struct dentry *dentry;
906 struct extent_buffer *tmp; 907 struct extent_buffer *tmp;
907 struct extent_buffer *old; 908 struct extent_buffer *old;
@@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
941 trans->block_rsv = &pending->block_rsv; 942 trans->block_rsv = &pending->block_rsv;
942 943
943 dentry = pending->dentry; 944 dentry = pending->dentry;
944 parent_inode = dentry->d_parent->d_inode; 945 parent = dget_parent(dentry);
946 parent_inode = parent->d_inode;
945 parent_root = BTRFS_I(parent_inode)->root; 947 parent_root = BTRFS_I(parent_inode)->root;
946 record_root_in_trans(trans, parent_root); 948 record_root_in_trans(trans, parent_root);
947 949
@@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
989 parent_inode->i_ino, index, 991 parent_inode->i_ino, index,
990 dentry->d_name.name, dentry->d_name.len); 992 dentry->d_name.name, dentry->d_name.len);
991 BUG_ON(ret); 993 BUG_ON(ret);
994 dput(parent);
992 995
993 key.offset = (u64)-1; 996 key.offset = (u64)-1;
994 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 997 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a27..054744ac5719 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2869{ 2869{
2870 int ret = 0; 2870 int ret = 0;
2871 struct btrfs_root *root; 2871 struct btrfs_root *root;
2872 struct dentry *old_parent = NULL;
2872 2873
2873 /* 2874 /*
2874 * for regular files, if its inode is already on disk, we don't 2875 * for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2910 if (IS_ROOT(parent)) 2911 if (IS_ROOT(parent))
2911 break; 2912 break;
2912 2913
2913 parent = parent->d_parent; 2914 parent = dget_parent(parent);
2915 dput(old_parent);
2916 old_parent = parent;
2914 inode = parent->d_inode; 2917 inode = parent->d_inode;
2915 2918
2916 } 2919 }
2920 dput(old_parent);
2917out: 2921out:
2918 return ret; 2922 return ret;
2919} 2923}
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2945{ 2949{
2946 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2950 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2947 struct super_block *sb; 2951 struct super_block *sb;
2952 struct dentry *old_parent = NULL;
2948 int ret = 0; 2953 int ret = 0;
2949 u64 last_committed = root->fs_info->last_trans_committed; 2954 u64 last_committed = root->fs_info->last_trans_committed;
2950 2955
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3016 if (IS_ROOT(parent)) 3021 if (IS_ROOT(parent))
3017 break; 3022 break;
3018 3023
3019 parent = parent->d_parent; 3024 parent = dget_parent(parent);
3025 dput(old_parent);
3026 old_parent = parent;
3020 } 3027 }
3021 ret = 0; 3028 ret = 0;
3022end_trans: 3029end_trans:
3030 dput(old_parent);
3023 if (ret < 0) { 3031 if (ret < 0) {
3024 BUG_ON(ret != -ENOSPC); 3032 BUG_ON(ret != -ENOSPC);
3025 root->fs_info->last_trans_log_full_commit = trans->transid; 3033 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
3039int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3047int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root, struct dentry *dentry) 3048 struct btrfs_root *root, struct dentry *dentry)
3041{ 3049{
3042 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3050 struct dentry *parent = dget_parent(dentry);
3043 dentry->d_parent, 0); 3051 int ret;
3052
3053 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3054 dput(parent);
3055
3056 return ret;
3044} 3057}
3045 3058
3046/* 3059/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d6..6b9884507837 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path,
412 412
413 device->fs_devices = fs_devices; 413 device->fs_devices = fs_devices;
414 fs_devices->num_devices++; 414 fs_devices->num_devices++;
415 } else if (strcmp(device->name, path)) { 415 } else if (!device->name || strcmp(device->name, path)) {
416 name = kstrdup(path, GFP_NOFS); 416 name = kstrdup(path, GFP_NOFS);
417 if (!name) 417 if (!name)
418 return -ENOMEM; 418 return -ENOMEM;
419 kfree(device->name); 419 kfree(device->name);
420 device->name = name; 420 device->name = name;
421 if (device->missing) {
422 fs_devices->missing_devices--;
423 device->missing = 0;
424 }
421 } 425 }
422 426
423 if (found_transid > fs_devices->latest_trans) { 427 if (found_transid > fs_devices->latest_trans) {
@@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1236 1240
1237 device->fs_devices->num_devices--; 1241 device->fs_devices->num_devices--;
1238 1242
1243 if (device->missing)
1244 root->fs_info->fs_devices->missing_devices--;
1245
1239 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1246 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1240 struct btrfs_device, dev_list); 1247 struct btrfs_device, dev_list);
1241 if (device->bdev == root->fs_info->sb->s_bdev) 1248 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3080 device->devid = devid; 3087 device->devid = devid;
3081 device->work.func = pending_bios_fn; 3088 device->work.func = pending_bios_fn;
3082 device->fs_devices = fs_devices; 3089 device->fs_devices = fs_devices;
3090 device->missing = 1;
3083 fs_devices->num_devices++; 3091 fs_devices->num_devices++;
3092 fs_devices->missing_devices++;
3084 spin_lock_init(&device->io_lock); 3093 spin_lock_init(&device->io_lock);
3085 INIT_LIST_HEAD(&device->dev_alloc_list); 3094 INIT_LIST_HEAD(&device->dev_alloc_list);
3086 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3095 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root,
3278 device = add_missing_dev(root, devid, dev_uuid); 3287 device = add_missing_dev(root, devid, dev_uuid);
3279 if (!device) 3288 if (!device)
3280 return -ENOMEM; 3289 return -ENOMEM;
3290 } else if (!device->missing) {
3291 /*
3292 * this happens when a device that was properly setup
3293 * in the device info lists suddenly goes bad.
3294 * device->bdev is NULL, and so we have to set
3295 * device->missing to one here
3296 */
3297 root->fs_info->fs_devices->missing_devices++;
3298 device->missing = 1;
3281 } 3299 }
3282 } 3300 }
3283 3301
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4eea..2740db49eb04 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -44,6 +44,7 @@ struct btrfs_device {
44 44
45 int writeable; 45 int writeable;
46 int in_fs_metadata; 46 int in_fs_metadata;
47 int missing;
47 48
48 spinlock_t io_lock; 49 spinlock_t io_lock;
49 50
@@ -93,6 +94,7 @@ struct btrfs_fs_devices {
93 u64 num_devices; 94 u64 num_devices;
94 u64 open_devices; 95 u64 open_devices;
95 u64 rw_devices; 96 u64 rw_devices;
97 u64 missing_devices;
96 u64 total_rw_bytes; 98 u64 total_rw_bytes;
97 struct block_device *latest_bdev; 99 struct block_device *latest_bdev;
98 100
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e1..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
205 page->index << PAGE_CACHE_SHIFT, &len, 205 page->index << PAGE_CACHE_SHIFT, &len,
206 ci->i_truncate_seq, ci->i_truncate_size, 206 ci->i_truncate_seq, ci->i_truncate_size,
207 &page, 1); 207 &page, 1, 0);
208 if (err == -ENOENT) 208 if (err == -ENOENT)
209 err = 0; 209 err = 0;
210 if (err < 0) { 210 if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
288 offset, &len, 288 offset, &len,
289 ci->i_truncate_seq, ci->i_truncate_size, 289 ci->i_truncate_seq, ci->i_truncate_size,
290 pages, nr_pages); 290 pages, nr_pages, 0);
291 if (rc == -ENOENT) 291 if (rc == -ENOENT)
292 rc = 0; 292 rc = 0;
293 if (rc < 0) 293 if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
774 snapc, do_sync, 774 snapc, do_sync,
775 ci->i_truncate_seq, 775 ci->i_truncate_seq,
776 ci->i_truncate_size, 776 ci->i_truncate_size,
777 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1, 0);
778 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
779 779
780 alloc_page_vec(fsc, req); 780 alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71d..60d27bc9eb83 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1430 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1431 /* success. */ 1431 /* success. */
1432 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
1433 ci->i_rdcache_gen = 0; 1433 /* save any racing async invalidate some trouble */
1434 ci->i_rdcache_revoking = 0; 1434 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1435 return 0; 1435 return 0;
1436 } 1436 }
1437 dout("try_nonblocking_invalidate %p failed\n", inode); 1437 dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2273{ 2273{
2274 struct ceph_inode_info *ci = ceph_inode(inode); 2274 struct ceph_inode_info *ci = ceph_inode(inode);
2275 int mds = session->s_mds; 2275 int mds = session->s_mds;
2276 unsigned seq = le32_to_cpu(grant->seq); 2276 int seq = le32_to_cpu(grant->seq);
2277 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2278 int newcaps = le32_to_cpu(grant->caps); 2277 int newcaps = le32_to_cpu(grant->caps);
2279 int issued, implemented, used, wanted, dirty; 2278 int issued, implemented, used, wanted, dirty;
2280 u64 size = le64_to_cpu(grant->size); 2279 u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2286 int revoked_rdcache = 0; 2285 int revoked_rdcache = 0;
2287 int queue_invalidate = 0; 2286 int queue_invalidate = 0;
2288 2287
2289 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", 2288 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2290 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); 2289 inode, cap, mds, seq, ceph_cap_string(newcaps));
2291 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2290 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2292 inode->i_size); 2291 inode->i_size);
2293 2292
@@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2383 } 2382 }
2384 2383
2385 cap->seq = seq; 2384 cap->seq = seq;
2386 cap->issue_seq = issue_seq;
2387 2385
2388 /* file layout may have changed */ 2386 /* file layout may have changed */
2389 ci->i_layout = grant->layout; 2387 ci->i_layout = grant->layout;
@@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2691 NULL /* no caps context */); 2689 NULL /* no caps context */);
2692 try_flush_caps(inode, session, NULL); 2690 try_flush_caps(inode, session, NULL);
2693 up_read(&mdsc->snap_rwsem); 2691 up_read(&mdsc->snap_rwsem);
2692
2693 /* make sure we re-request max_size, if necessary */
2694 spin_lock(&inode->i_lock);
2695 ci->i_requested_max_size = 0;
2696 spin_unlock(&inode->i_lock);
2694} 2697}
2695 2698
2696/* 2699/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcafc..d902948a90d8 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,7 +40,8 @@ int ceph_init_dentry(struct dentry *dentry)
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 43 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
44 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
44 dentry->d_op = &ceph_dentry_ops; 45 dentry->d_op = &ceph_dentry_ops;
45 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 46 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
46 dentry->d_op = &ceph_snapdir_dentry_ops; 47 dentry->d_op = &ceph_snapdir_dentry_ops;
@@ -114,8 +115,8 @@ static int __dcache_readdir(struct file *filp,
114 spin_lock(&dcache_lock); 115 spin_lock(&dcache_lock);
115 116
116 /* start at beginning? */ 117 /* start at beginning? */
117 if (filp->f_pos == 2 || (last && 118 if (filp->f_pos == 2 || last == NULL ||
118 filp->f_pos < ceph_dentry(last)->offset)) { 119 filp->f_pos < ceph_dentry(last)->offset) {
119 if (list_empty(&parent->d_subdirs)) 120 if (list_empty(&parent->d_subdirs))
120 goto out_unlock; 121 goto out_unlock;
121 p = parent->d_subdirs.prev; 122 p = parent->d_subdirs.prev;
@@ -336,7 +337,10 @@ more:
336 if (req->r_reply_info.dir_end) { 337 if (req->r_reply_info.dir_end) {
337 kfree(fi->last_name); 338 kfree(fi->last_name);
338 fi->last_name = NULL; 339 fi->last_name = NULL;
339 fi->next_offset = 2; 340 if (ceph_frag_is_rightmost(frag))
341 fi->next_offset = 2;
342 else
343 fi->next_offset = 0;
340 } else { 344 } else {
341 rinfo = &req->r_reply_info; 345 rinfo = &req->r_reply_info;
342 err = note_last_dentry(fi, 346 err = note_last_dentry(fi,
@@ -355,18 +359,22 @@ more:
355 u64 pos = ceph_make_fpos(frag, off); 359 u64 pos = ceph_make_fpos(frag, off);
356 struct ceph_mds_reply_inode *in = 360 struct ceph_mds_reply_inode *in =
357 rinfo->dir_in[off - fi->offset].in; 361 rinfo->dir_in[off - fi->offset].in;
362 struct ceph_vino vino;
363 ino_t ino;
364
358 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 365 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
359 off, off - fi->offset, rinfo->dir_nr, pos, 366 off, off - fi->offset, rinfo->dir_nr, pos,
360 rinfo->dir_dname_len[off - fi->offset], 367 rinfo->dir_dname_len[off - fi->offset],
361 rinfo->dir_dname[off - fi->offset], in); 368 rinfo->dir_dname[off - fi->offset], in);
362 BUG_ON(!in); 369 BUG_ON(!in);
363 ftype = le32_to_cpu(in->mode) >> 12; 370 ftype = le32_to_cpu(in->mode) >> 12;
371 vino.ino = le64_to_cpu(in->ino);
372 vino.snap = le64_to_cpu(in->snapid);
373 ino = ceph_vino_to_ino(vino);
364 if (filldir(dirent, 374 if (filldir(dirent,
365 rinfo->dir_dname[off - fi->offset], 375 rinfo->dir_dname[off - fi->offset],
366 rinfo->dir_dname_len[off - fi->offset], 376 rinfo->dir_dname_len[off - fi->offset],
367 pos, 377 pos, ino, ftype) < 0) {
368 le64_to_cpu(in->ino),
369 ftype) < 0) {
370 dout("filldir stopping us...\n"); 378 dout("filldir stopping us...\n");
371 return 0; 379 return 0;
372 } 380 }
@@ -414,6 +422,7 @@ static void reset_readdir(struct ceph_file_info *fi)
414 fi->last_readdir = NULL; 422 fi->last_readdir = NULL;
415 } 423 }
416 kfree(fi->last_name); 424 kfree(fi->last_name);
425 fi->last_name = NULL;
417 fi->next_offset = 2; /* compensate for . and .. */ 426 fi->next_offset = 2; /* compensate for . and .. */
418 if (fi->dentry) { 427 if (fi->dentry) {
419 dput(fi->dentry); 428 dput(fi->dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf3690..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
154 } 154 }
155 155
156 /* 156 /*
157 * No need to block if we have any caps. Update wanted set 157 * No need to block if we have caps on the auth MDS (for
158 * write) or any MDS (for read). Update wanted set
158 * asynchronously. 159 * asynchronously.
159 */ 160 */
160 spin_lock(&inode->i_lock); 161 spin_lock(&inode->i_lock);
161 if (__ceph_is_any_real_caps(ci)) { 162 if (__ceph_is_any_real_caps(ci) &&
163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
162 int mds_wanted = __ceph_caps_mds_wanted(ci); 164 int mds_wanted = __ceph_caps_mds_wanted(ci);
163 int issued = __ceph_caps_issued(ci, NULL); 165 int issued = __ceph_caps_issued(ci, NULL);
164 166
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
280static int striped_read(struct inode *inode, 282static int striped_read(struct inode *inode,
281 u64 off, u64 len, 283 u64 off, u64 len,
282 struct page **pages, int num_pages, 284 struct page **pages, int num_pages,
283 int *checkeof) 285 int *checkeof, bool align_to_pages,
286 unsigned long buf_align)
284{ 287{
285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 288 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
286 struct ceph_inode_info *ci = ceph_inode(inode); 289 struct ceph_inode_info *ci = ceph_inode(inode);
287 u64 pos, this_len; 290 u64 pos, this_len;
291 int io_align, page_align;
288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 292 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
289 int left, pages_left; 293 int left, pages_left;
290 int read; 294 int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
300 page_pos = pages; 304 page_pos = pages;
301 pages_left = num_pages; 305 pages_left = num_pages;
302 read = 0; 306 read = 0;
307 io_align = off & ~PAGE_MASK;
303 308
304more: 309more:
310 if (align_to_pages)
311 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
312 else
313 page_align = pos & ~PAGE_MASK;
305 this_len = left; 314 this_len = left;
306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 315 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
307 &ci->i_layout, pos, &this_len, 316 &ci->i_layout, pos, &this_len,
308 ci->i_truncate_seq, 317 ci->i_truncate_seq,
309 ci->i_truncate_size, 318 ci->i_truncate_size,
310 page_pos, pages_left); 319 page_pos, pages_left, page_align);
311 hit_stripe = this_len < left; 320 hit_stripe = this_len < left;
312 was_short = ret >= 0 && ret < this_len; 321 was_short = ret >= 0 && ret < this_len;
313 if (ret == -ENOENT) 322 if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
368 struct inode *inode = file->f_dentry->d_inode; 377 struct inode *inode = file->f_dentry->d_inode;
369 struct page **pages; 378 struct page **pages;
370 u64 off = *poff; 379 u64 off = *poff;
371 int num_pages = calc_pages_for(off, len); 380 int num_pages, ret;
372 int ret;
373 381
374 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 382 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 383 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
376 384
377 if (file->f_flags & O_DIRECT) { 385 if (file->f_flags & O_DIRECT) {
378 pages = ceph_get_direct_page_vector(data, num_pages, off, len); 386 num_pages = calc_pages_for((unsigned long)data, len);
379 387 pages = ceph_get_direct_page_vector(data, num_pages, true);
380 /*
381 * flush any page cache pages in this range. this
382 * will make concurrent normal and O_DIRECT io slow,
383 * but it will at least behave sensibly when they are
384 * in sequence.
385 */
386 } else { 388 } else {
389 num_pages = calc_pages_for(off, len);
387 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 390 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
388 } 391 }
389 if (IS_ERR(pages)) 392 if (IS_ERR(pages))
390 return PTR_ERR(pages); 393 return PTR_ERR(pages);
391 394
395 /*
396 * flush any page cache pages in this range. this
397 * will make concurrent normal and sync io slow,
398 * but it will at least behave sensibly when they are
399 * in sequence.
400 */
392 ret = filemap_write_and_wait(inode->i_mapping); 401 ret = filemap_write_and_wait(inode->i_mapping);
393 if (ret < 0) 402 if (ret < 0)
394 goto done; 403 goto done;
395 404
396 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 405 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
406 file->f_flags & O_DIRECT,
407 (unsigned long)data & ~PAGE_MASK);
397 408
398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 409 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 410 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
402 413
403done: 414done:
404 if (file->f_flags & O_DIRECT) 415 if (file->f_flags & O_DIRECT)
405 ceph_put_page_vector(pages, num_pages); 416 ceph_put_page_vector(pages, num_pages, true);
406 else 417 else
407 ceph_release_page_vector(pages, num_pages); 418 ceph_release_page_vector(pages, num_pages);
408 dout("sync_read result %d\n", ret); 419 dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
448 int flags; 459 int flags;
449 int do_sync = 0; 460 int do_sync = 0;
450 int check_caps = 0; 461 int check_caps = 0;
462 int page_align, io_align;
463 unsigned long buf_align;
451 int ret; 464 int ret;
452 struct timespec mtime = CURRENT_TIME; 465 struct timespec mtime = CURRENT_TIME;
453 466
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
462 else 475 else
463 pos = *offset; 476 pos = *offset;
464 477
478 io_align = pos & ~PAGE_MASK;
479 buf_align = (unsigned long)data & ~PAGE_MASK;
480
465 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 481 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
466 if (ret < 0) 482 if (ret < 0)
467 return ret; 483 return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
486 */ 502 */
487more: 503more:
488 len = left; 504 len = left;
505 if (file->f_flags & O_DIRECT) {
506 /* write from beginning of first page, regardless of
507 io alignment */
508 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
509 num_pages = calc_pages_for((unsigned long)data, len);
510 } else {
511 page_align = pos & ~PAGE_MASK;
512 num_pages = calc_pages_for(pos, len);
513 }
489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 514 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
490 ceph_vino(inode), pos, &len, 515 ceph_vino(inode), pos, &len,
491 CEPH_OSD_OP_WRITE, flags, 516 CEPH_OSD_OP_WRITE, flags,
492 ci->i_snap_realm->cached_context, 517 ci->i_snap_realm->cached_context,
493 do_sync, 518 do_sync,
494 ci->i_truncate_seq, ci->i_truncate_size, 519 ci->i_truncate_seq, ci->i_truncate_size,
495 &mtime, false, 2); 520 &mtime, false, 2, page_align);
496 if (!req) 521 if (!req)
497 return -ENOMEM; 522 return -ENOMEM;
498 523
499 num_pages = calc_pages_for(pos, len);
500
501 if (file->f_flags & O_DIRECT) { 524 if (file->f_flags & O_DIRECT) {
502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len); 525 pages = ceph_get_direct_page_vector(data, num_pages, false);
503 if (IS_ERR(pages)) { 526 if (IS_ERR(pages)) {
504 ret = PTR_ERR(pages); 527 ret = PTR_ERR(pages);
505 goto out; 528 goto out;
@@ -549,7 +572,7 @@ more:
549 } 572 }
550 573
551 if (file->f_flags & O_DIRECT) 574 if (file->f_flags & O_DIRECT)
552 ceph_put_page_vector(pages, num_pages); 575 ceph_put_page_vector(pages, num_pages, false);
553 else if (file->f_flags & O_SYNC) 576 else if (file->f_flags & O_SYNC)
554 ceph_release_page_vector(pages, num_pages); 577 ceph_release_page_vector(pages, num_pages);
555 578
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04c..bf1286588f26 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/uaccess.h> 7#include <linux/uaccess.h>
@@ -471,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
471 470
472 if (issued & (CEPH_CAP_FILE_EXCL| 471 if (issued & (CEPH_CAP_FILE_EXCL|
473 CEPH_CAP_FILE_WR| 472 CEPH_CAP_FILE_WR|
474 CEPH_CAP_FILE_BUFFER)) { 473 CEPH_CAP_FILE_BUFFER|
474 CEPH_CAP_AUTH_EXCL|
475 CEPH_CAP_XATTR_EXCL)) {
475 if (timespec_compare(ctime, &inode->i_ctime) > 0) { 476 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
476 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", 477 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
477 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 478 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
511 warn = 1; 512 warn = 1;
512 } 513 }
513 } else { 514 } else {
514 /* we have no write caps; whatever the MDS says is true */ 515 /* we have no write|excl caps; whatever the MDS says is true */
515 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 516 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
516 inode->i_ctime = *ctime; 517 inode->i_ctime = *ctime;
517 inode->i_mtime = *mtime; 518 inode->i_mtime = *mtime;
@@ -567,12 +568,17 @@ static int fill_inode(struct inode *inode,
567 568
568 /* 569 /*
569 * provided version will be odd if inode value is projected, 570 * provided version will be odd if inode value is projected,
570 * even if stable. skip the update if we have a newer info 571 * even if stable. skip the update if we have newer stable
571 * (e.g., due to inode info racing form multiple MDSs), or if 572 * info (ours>=theirs, e.g. due to racing mds replies), unless
572 * we are getting projected (unstable) inode info. 573 * we are getting projected (unstable) info (in which case the
574 * version is odd, and we want ours>theirs).
575 * us them
576 * 2 2 skip
577 * 3 2 skip
578 * 3 3 update
573 */ 579 */
574 if (le64_to_cpu(info->version) > 0 && 580 if (le64_to_cpu(info->version) > 0 &&
575 (ci->i_version & ~1) > le64_to_cpu(info->version)) 581 (ci->i_version & ~1) >= le64_to_cpu(info->version))
576 goto no_change; 582 goto no_change;
577 583
578 issued = __ceph_caps_issued(ci, &implemented); 584 issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +612,14 @@ static int fill_inode(struct inode *inode,
606 le32_to_cpu(info->time_warp_seq), 612 le32_to_cpu(info->time_warp_seq),
607 &ctime, &mtime, &atime); 613 &ctime, &mtime, &atime);
608 614
609 ci->i_max_size = le64_to_cpu(info->max_size); 615 /* only update max_size on auth cap */
616 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
617 ci->i_max_size != le64_to_cpu(info->max_size)) {
618 dout("max_size %lld -> %llu\n", ci->i_max_size,
619 le64_to_cpu(info->max_size));
620 ci->i_max_size = le64_to_cpu(info->max_size);
621 }
622
610 ci->i_layout = info->layout; 623 ci->i_layout = info->layout;
611 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 624 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
612 625
@@ -1055,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1055 ininfo = rinfo->targeti.in; 1068 ininfo = rinfo->targeti.in;
1056 vino.ino = le64_to_cpu(ininfo->ino); 1069 vino.ino = le64_to_cpu(ininfo->ino);
1057 vino.snap = le64_to_cpu(ininfo->snapid); 1070 vino.snap = le64_to_cpu(ininfo->snapid);
1058 if (!dn->d_inode) { 1071 in = dn->d_inode;
1072 if (!in) {
1059 in = ceph_get_inode(sb, vino); 1073 in = ceph_get_inode(sb, vino);
1060 if (IS_ERR(in)) { 1074 if (IS_ERR(in)) {
1061 pr_err("fill_trace bad get_inode " 1075 pr_err("fill_trace bad get_inode "
@@ -1386,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1386 spin_lock(&inode->i_lock); 1400 spin_lock(&inode->i_lock);
1387 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1401 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1388 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1402 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1389 if (ci->i_rdcache_gen == 0 || 1403 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1390 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1391 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1392 /* nevermind! */ 1404 /* nevermind! */
1393 ci->i_rdcache_revoking = 0;
1394 spin_unlock(&inode->i_lock); 1405 spin_unlock(&inode->i_lock);
1395 goto out; 1406 goto out;
1396 } 1407 }
@@ -1400,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work)
1400 ceph_invalidate_nondirty_pages(inode->i_mapping); 1411 ceph_invalidate_nondirty_pages(inode->i_mapping);
1401 1412
1402 spin_lock(&inode->i_lock); 1413 spin_lock(&inode->i_lock);
1403 if (orig_gen == ci->i_rdcache_gen) { 1414 if (orig_gen == ci->i_rdcache_gen &&
1415 orig_gen == ci->i_rdcache_revoking) {
1404 dout("invalidate_pages %p gen %d successful\n", inode, 1416 dout("invalidate_pages %p gen %d successful\n", inode,
1405 ci->i_rdcache_gen); 1417 ci->i_rdcache_gen);
1406 ci->i_rdcache_gen = 0; 1418 ci->i_rdcache_revoking--;
1407 ci->i_rdcache_revoking = 0;
1408 check = 1; 1419 check = 1;
1409 } else { 1420 } else {
1410 dout("invalidate_pages %p gen %d raced, gen now %d\n", 1421 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1411 inode, orig_gen, ci->i_rdcache_gen); 1422 inode, orig_gen, ci->i_rdcache_gen,
1423 ci->i_rdcache_revoking);
1412 } 1424 }
1413 spin_unlock(&inode->i_lock); 1425 spin_unlock(&inode->i_lock);
1414 1426
@@ -1739,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1739 return 0; 1751 return 0;
1740 } 1752 }
1741 1753
1742 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1754 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1743 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1755 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1744 return 0; 1756 return 0;
1745 1757
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb5..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x98 7#define CEPH_IOCTL_MAGIC 0x97
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c345..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
12 */ 12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, 13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns, 14 int cmd, u8 wait, struct file_lock *fl)
15 int cmd, u64 start, u64 length, u8 wait)
16{ 15{
17 struct inode *inode = file->f_dentry->d_inode; 16 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 17 struct ceph_mds_client *mdsc =
19 ceph_sb_to_client(inode->i_sb)->mdsc; 18 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 19 struct ceph_mds_request *req;
21 int err; 20 int err;
21 u64 length = 0;
22 22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 24 if (IS_ERR(req))
25 return PTR_ERR(req); 25 return PTR_ERR(req);
26 req->r_inode = igrab(inode); 26 req->r_inode = igrab(inode);
27 27
28 /* mds requires start and length rather than start and end */
29 if (LLONG_MAX == fl->fl_end)
30 length = 0;
31 else
32 length = fl->fl_end - fl->fl_start + 1;
33
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 34 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type, 35 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd); 36 (int)operation, (u64)fl->fl_pid, fl->fl_start,
37 length, wait, fl->fl_type);
38
31 39
32 req->r_args.filelock_change.rule = lock_type; 40 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd; 41 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid); 42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
35 /* This should be adjusted, but I'm not sure if 43 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers*/ 44 namespaces actually get id numbers*/
37 req->r_args.filelock_change.pid_namespace = 45 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns); 46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
39 req->r_args.filelock_change.start = cpu_to_le64(start); 47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
40 req->r_args.filelock_change.length = cpu_to_le64(length); 48 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait; 49 req->r_args.filelock_change.wait = wait;
42 50
43 err = ceph_mdsc_do_request(mdsc, inode, req); 51 err = ceph_mdsc_do_request(mdsc, inode, req);
52
53 if ( operation == CEPH_MDS_OP_GETFILELOCK){
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK;
57 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
58 fl->fl_type = F_WRLCK;
59 else
60 fl->fl_type = F_UNLCK;
61
62 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
63 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
64 le64_to_cpu(req->r_reply_info.filelock_reply->length);
65 if (length >= 1)
66 fl->fl_end = length -1;
67 else
68 fl->fl_end = 0;
69
70 }
44 ceph_mdsc_put_request(req); 71 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 72 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, 73 "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err); 74 (int)operation, (u64)fl->fl_pid, fl->fl_start,
75 length, wait, fl->fl_type, err);
48 return err; 76 return err;
49} 77}
50 78
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
54 */ 82 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl) 83int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{ 84{
57 u64 length;
58 u8 lock_cmd; 85 u8 lock_cmd;
59 int err; 86 int err;
60 u8 wait = 0; 87 u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
76 else 103 else
77 lock_cmd = CEPH_LOCK_UNLOCK; 104 lock_cmd = CEPH_LOCK_UNLOCK;
78 105
79 if (LLONG_MAX == fl->fl_end) 106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) { 107 if (!err) {
90 dout("mds locked, locking locally"); 108 if ( op != CEPH_MDS_OP_GETFILELOCK ){
91 err = posix_lock_file(file, fl, NULL); 109 dout("mds locked, locking locally");
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 110 err = posix_lock_file(file, fl, NULL);
93 /* undo! This should only happen if the kernel detects 111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
94 * local deadlock. */ 112 /* undo! This should only happen if the kernel detects
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 113 * local deadlock. */
96 (u64)fl->fl_pid, 114 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
97 (u64)(unsigned long)fl->fl_nspid, 115 CEPH_LOCK_UNLOCK, 0, fl);
98 CEPH_LOCK_UNLOCK, fl->fl_start, 116 dout("got %d on posix_lock_file, undid lock", err);
99 length, 0); 117 }
100 dout("got %d on posix_lock_file, undid lock", err);
101 } 118 }
119
102 } else { 120 } else {
103 dout("mds returned error code %d", err); 121 dout("mds returned error code %d", err);
104 } 122 }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
107 125
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl) 126int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{ 127{
110 u64 length;
111 u8 lock_cmd; 128 u8 lock_cmd;
112 int err; 129 int err;
113 u8 wait = 1; 130 u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
127 lock_cmd = CEPH_LOCK_EXCL; 144 lock_cmd = CEPH_LOCK_EXCL;
128 else 145 else
129 lock_cmd = CEPH_LOCK_UNLOCK; 146 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135 147
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 148 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid, 149 file, lock_cmd, wait, fl);
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) { 150 if (!err) {
142 err = flock_lock_file_wait(file, fl); 151 err = flock_lock_file_wait(file, fl);
143 if (err) { 152 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK, 153 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK, 154 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid, 155 file, CEPH_LOCK_UNLOCK, 0, fl);
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err); 156 dout("got %d on flock_lock_file_wait, undid lock", err);
151 } 157 }
152 } else { 158 } else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c2..38800eaa81d0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/smp_lock.h>
10 9
11#include "super.h" 10#include "super.h"
12#include "mds_client.h" 11#include "mds_client.h"
@@ -203,6 +202,38 @@ out_bad:
203} 202}
204 203
205/* 204/*
205 * parse fcntl F_GETLK results
206 */
207static int parse_reply_info_filelock(void **p, void *end,
208 struct ceph_mds_reply_info_parsed *info)
209{
210 if (*p + sizeof(*info->filelock_reply) > end)
211 goto bad;
212
213 info->filelock_reply = *p;
214 *p += sizeof(*info->filelock_reply);
215
216 if (unlikely(*p != end))
217 goto bad;
218 return 0;
219
220bad:
221 return -EIO;
222}
223
224/*
225 * parse extra results
226 */
227static int parse_reply_info_extra(void **p, void *end,
228 struct ceph_mds_reply_info_parsed *info)
229{
230 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
231 return parse_reply_info_filelock(p, end, info);
232 else
233 return parse_reply_info_dir(p, end, info);
234}
235
236/*
206 * parse entire mds reply 237 * parse entire mds reply
207 */ 238 */
208static int parse_reply_info(struct ceph_msg *msg, 239static int parse_reply_info(struct ceph_msg *msg,
@@ -224,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg,
224 goto out_bad; 255 goto out_bad;
225 } 256 }
226 257
227 /* dir content */ 258 /* extra */
228 ceph_decode_32_safe(&p, end, len, bad); 259 ceph_decode_32_safe(&p, end, len, bad);
229 if (len > 0) { 260 if (len > 0) {
230 err = parse_reply_info_dir(&p, p+len, info); 261 err = parse_reply_info_extra(&p, p+len, info);
231 if (err < 0) 262 if (err < 0)
232 goto out_bad; 263 goto out_bad;
233 } 264 }
@@ -529,6 +560,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
529 ceph_mdsc_get_request(req); 560 ceph_mdsc_get_request(req);
530 __insert_request(mdsc, req); 561 __insert_request(mdsc, req);
531 562
563 req->r_uid = current_fsuid();
564 req->r_gid = current_fsgid();
565
532 if (dir) { 566 if (dir) {
533 struct ceph_inode_info *ci = ceph_inode(dir); 567 struct ceph_inode_info *ci = ceph_inode(dir);
534 568
@@ -1588,8 +1622,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1588 1622
1589 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); 1623 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1590 head->op = cpu_to_le32(req->r_op); 1624 head->op = cpu_to_le32(req->r_op);
1591 head->caller_uid = cpu_to_le32(current_fsuid()); 1625 head->caller_uid = cpu_to_le32(req->r_uid);
1592 head->caller_gid = cpu_to_le32(current_fsgid()); 1626 head->caller_gid = cpu_to_le32(req->r_gid);
1593 head->args = req->r_args; 1627 head->args = req->r_args;
1594 1628
1595 ceph_encode_filepath(&p, end, ino1, path1); 1629 ceph_encode_filepath(&p, end, ino1, path1);
@@ -2072,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2072 2106
2073 mutex_lock(&session->s_mutex); 2107 mutex_lock(&session->s_mutex);
2074 if (err < 0) { 2108 if (err < 0) {
2075 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); 2109 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2076 ceph_msg_dump(msg); 2110 ceph_msg_dump(msg);
2077 goto out_err; 2111 goto out_err;
2078 } 2112 }
@@ -2092,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2092 mutex_lock(&req->r_fill_mutex); 2126 mutex_lock(&req->r_fill_mutex);
2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2127 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2094 if (err == 0) { 2128 if (err == 0) {
2095 if (result == 0 && rinfo->dir_nr) 2129 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2130 rinfo->dir_nr)
2096 ceph_readdir_prepopulate(req, req->r_session); 2131 ceph_readdir_prepopulate(req, req->r_session);
2097 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2132 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2098 } 2133 }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c72355..aabe563b54db 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in {
42}; 42};
43 43
44/* 44/*
45 * parsed info about an mds reply, including information about the 45 * parsed info about an mds reply, including information about
46 * target inode and/or its parent directory and dentry, and directory 46 * either: 1) the target inode and/or its parent directory and dentry,
47 * contents (for readdir results). 47 * and directory contents (for readdir results), or
48 * 2) the file range lock info (for fcntl F_GETLK results).
48 */ 49 */
49struct ceph_mds_reply_info_parsed { 50struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head; 51 struct ceph_mds_reply_head *head;
51 52
53 /* trace */
52 struct ceph_mds_reply_info_in diri, targeti; 54 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag; 55 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname; 56 char *dname;
55 u32 dname_len; 57 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease; 58 struct ceph_mds_reply_lease *dlease;
57 59
58 struct ceph_mds_reply_dirfrag *dir_dir; 60 /* extra */
59 int dir_nr; 61 union {
60 char **dir_dname; 62 /* for fcntl F_GETLK results */
61 u32 *dir_dname_len; 63 struct ceph_filelock *filelock_reply;
62 struct ceph_mds_reply_lease **dir_dlease; 64
63 struct ceph_mds_reply_info_in *dir_in; 65 /* for readdir results */
64 u8 dir_complete, dir_end; 66 struct {
67 struct ceph_mds_reply_dirfrag *dir_dir;
68 int dir_nr;
69 char **dir_dname;
70 u32 *dir_dname_len;
71 struct ceph_mds_reply_lease **dir_dlease;
72 struct ceph_mds_reply_info_in *dir_in;
73 u8 dir_complete, dir_end;
74 };
75 };
65 76
66 /* encoded blob describing snapshot contexts for certain 77 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */ 78 operations (e.g., open) */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
170 181
171 union ceph_mds_request_args r_args; 182 union ceph_mds_request_args r_args;
172 int r_fmode; /* file mode, if expecting cap */ 183 int r_fmode; /* file mode, if expecting cap */
184 uid_t r_uid;
185 gid_t r_gid;
173 186
174 /* for choosing which mds to send this request to */ 187 /* for choosing which mds to send this request to */
175 int r_direct_mode; 188 int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f7..7f01728a4657 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -293,9 +293,7 @@ struct ceph_inode_info {
293 int i_rd_ref, i_rdcache_ref, i_wr_ref; 293 int i_rd_ref, i_rdcache_ref, i_wr_ref;
294 int i_wrbuffer_ref, i_wrbuffer_ref_head; 294 int i_wrbuffer_ref, i_wrbuffer_ref_head;
295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
296 u32 i_rdcache_gen; /* we increment this each time we get 296 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
297 FILE_CACHE. If it's non-zero, we
298 _may_ have cached pages. */
299 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ 297 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
300 298
301 struct list_head i_unsafe_writes; /* uncommitted sync writes */ 299 struct list_head i_unsafe_writes; /* uncommitted sync writes */
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0ed213970ced..ee45648b0d1a 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,6 +4,7 @@ config CIFS
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD5 6 select CRYPTO_MD5
7 select CRYPTO_HMAC
7 select CRYPTO_ARC4 8 select CRYPTO_ARC4
8 help 9 help
9 This is the client VFS module for the Common Internet File System 10 This is the client VFS module for the Common Internet File System
@@ -143,6 +144,13 @@ config CIFS_FSCACHE
143 to be cached locally on disk through the general filesystem cache 144 to be cached locally on disk through the general filesystem cache
144 manager. If unsure, say N. 145 manager. If unsure, say N.
145 146
147config CIFS_ACL
148 bool "Provide CIFS ACL support (EXPERIMENTAL)"
149 depends on EXPERIMENTAL && CIFS_XATTR
150 help
151 Allows fetching the CIFS/NTFS ACL from the server. The DACL blob
152 is handed over to the application/caller.
153
146config CIFS_EXPERIMENTAL 154config CIFS_EXPERIMENTAL
147 bool "CIFS Experimental Features (EXPERIMENTAL)" 155 bool "CIFS Experimental Features (EXPERIMENTAL)"
148 depends on CIFS && EXPERIMENTAL 156 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bdc..43b19dd39191 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o
10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
10 12
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 13cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 14
diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d1036544..46af99ab3614 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
337 wsize default write size (default 57344) 337 wsize default write size (default 57344)
338 maximum wsize currently allowed by CIFS is 57344 (fourteen 338 maximum wsize currently allowed by CIFS is 57344 (fourteen
339 4096 byte pages) 339 4096 byte pages)
340 actimeo=n attribute cache timeout in seconds (default 1 second).
341 After this timeout, the cifs client requests fresh attribute
342 information from the server. This option allows tuning the
343 attribute cache timeout to suit the workload's needs. Shorter
344 timeouts mean better cache coherency, but an increased number
345 of calls to the server. Longer timeouts mean a reduced number
346 of calls to the server at the expense of less strict cache
347 coherency checks (i.e. incorrect attribute cache for a short
348 period of time).
340 rw mount the network share read-write (note that the 349 rw mount the network share read-write (note that the
341 server may still consider the share read-only) 350 server may still consider the share read-only)
342 ro mount network share read-only 351 ro mount network share read-only
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
81 81
82v) mount check for unmatched uids 82v) mount check for unmatched uids
83 83
84w) Add support for new vfs entry points for setlease and fallocate 84w) Add support for new vfs entry point for fallocate
85 85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server) 87processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 525ba59a4105..7852cd677051 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,7 +15,7 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/radix-tree.h> 18#include <linux/rbtree.h>
19 19
20#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
21#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
@@ -42,12 +42,13 @@
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ 42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43 43
44struct cifs_sb_info { 44struct cifs_sb_info {
45 struct radix_tree_root tlink_tree; 45 struct rb_root tlink_tree;
46#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */
47 spinlock_t tlink_tree_lock; 46 spinlock_t tlink_tree_lock;
47 struct tcon_link *master_tlink;
48 struct nls_table *local_nls; 48 struct nls_table *local_nls;
49 unsigned int rsize; 49 unsigned int rsize;
50 unsigned int wsize; 50 unsigned int wsize;
51 unsigned long actimeo; /* attribute cache timeout (jiffies) */
51 atomic_t active; 52 atomic_t active;
52 uid_t mnt_uid; 53 uid_t mnt_uid;
53 gid_t mnt_gid; 54 gid_t mnt_gid;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c9b4792ae825..a437ec391a01 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32 32
33#ifdef CONFIG_CIFS_EXPERIMENTAL
34
35static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 33static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
36 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 34 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
37 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 35 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -560,7 +558,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
560 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 558 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
561 559
562 if (IS_ERR(tlink)) 560 if (IS_ERR(tlink))
563 return NULL; 561 return ERR_CAST(tlink);
564 562
565 xid = GetXid(); 563 xid = GetXid();
566 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); 564 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
@@ -568,7 +566,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
568 566
569 cifs_put_tlink(tlink); 567 cifs_put_tlink(tlink);
570 568
571 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 569 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
570 if (rc)
571 return ERR_PTR(rc);
572 return pntsd; 572 return pntsd;
573} 573}
574 574
@@ -583,7 +583,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
584 584
585 if (IS_ERR(tlink)) 585 if (IS_ERR(tlink))
586 return NULL; 586 return ERR_CAST(tlink);
587 587
588 tcon = tlink_tcon(tlink); 588 tcon = tlink_tcon(tlink);
589 xid = GetXid(); 589 xid = GetXid();
@@ -591,23 +591,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, 591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
592 &fid, &oplock, NULL, cifs_sb->local_nls, 592 &fid, &oplock, NULL, cifs_sb->local_nls,
593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
594 if (rc) { 594 if (!rc) {
595 cERROR(1, "Unable to open file to get ACL"); 595 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
596 goto out; 596 CIFSSMBClose(xid, tcon, fid);
597 } 597 }
598 598
599 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
600 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
601
602 CIFSSMBClose(xid, tcon, fid);
603 out:
604 cifs_put_tlink(tlink); 599 cifs_put_tlink(tlink);
605 FreeXid(xid); 600 FreeXid(xid);
601
602 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
603 if (rc)
604 return ERR_PTR(rc);
606 return pntsd; 605 return pntsd;
607} 606}
608 607
609/* Retrieve an ACL from the server */ 608/* Retrieve an ACL from the server */
610static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, 609struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
611 struct inode *inode, const char *path, 610 struct inode *inode, const char *path,
612 u32 *pacllen) 611 u32 *pacllen)
613{ 612{
@@ -695,7 +694,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
695} 694}
696 695
697/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 696/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */
698void 697int
699cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 698cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
700 struct inode *inode, const char *path, const __u16 *pfid) 699 struct inode *inode, const char *path, const __u16 *pfid)
701{ 700{
@@ -711,17 +710,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
711 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); 710 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
712 711
713 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 712 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
714 if (pntsd) 713 if (IS_ERR(pntsd)) {
714 rc = PTR_ERR(pntsd);
715 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
716 } else {
715 rc = parse_sec_desc(pntsd, acllen, fattr); 717 rc = parse_sec_desc(pntsd, acllen, fattr);
716 if (rc) 718 kfree(pntsd);
717 cFYI(1, "parse sec desc failed rc = %d", rc); 719 if (rc)
720 cERROR(1, "parse sec desc failed rc = %d", rc);
721 }
718 722
719 kfree(pntsd); 723 return rc;
720 return;
721} 724}
722 725
723/* Convert mode bits to an ACL so we can update the ACL on the server */ 726/* Convert mode bits to an ACL so we can update the ACL on the server */
724int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) 727int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
725{ 728{
726 int rc = 0; 729 int rc = 0;
727 __u32 secdesclen = 0; 730 __u32 secdesclen = 0;
@@ -736,7 +739,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
736 /* Add three ACEs for owner, group, everyone getting rid of 739 /* Add three ACEs for owner, group, everyone getting rid of
737 other ACEs as chmod disables ACEs and set the security descriptor */ 740 other ACEs as chmod disables ACEs and set the security descriptor */
738 741
739 if (pntsd) { 742 if (IS_ERR(pntsd)) {
743 rc = PTR_ERR(pntsd);
744 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
745 } else {
740 /* allocate memory for the smb header, 746 /* allocate memory for the smb header,
741 set security descriptor request security descriptor 747 set security descriptor request security descriptor
742 parameters, and secuirty descriptor itself */ 748 parameters, and secuirty descriptor itself */
@@ -766,4 +772,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
766 772
767 return rc; 773 return rc;
768} 774}
769#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf5155..c4ae7d036563 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
74 char sidname[SIDNAMELENGTH]; 74 char sidname[SIDNAMELENGTH];
75} __attribute__((packed)); 75} __attribute__((packed));
76 76
77#ifdef CONFIG_CIFS_EXPERIMENTAL
78
79extern int match_sid(struct cifs_sid *); 77extern int match_sid(struct cifs_sid *);
80extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); 78extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
81 79
82#endif /* CONFIG_CIFS_EXPERIMENTAL */
83
84#endif /* _CIFSACL_H */ 80#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa79588..3936aa7f2c22 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data,
116 return -ENOMEM; 116 return -ENOMEM;
117 117
118 spin_lock_init(&cifs_sb->tlink_tree_lock); 118 spin_lock_init(&cifs_sb->tlink_tree_lock);
119 INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); 119 cifs_sb->tlink_tree = RB_ROOT;
120 120
121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
122 if (rc) { 122 if (rc) {
@@ -321,8 +321,7 @@ cifs_alloc_inode(struct super_block *sb)
321 /* Until the file is open and we have gotten oplock 321 /* Until the file is open and we have gotten oplock
322 info back from the server, can not assume caching of 322 info back from the server, can not assume caching of
323 file data or metadata */ 323 file data or metadata */
324 cifs_inode->clientCanCacheRead = false; 324 cifs_set_oplock_level(cifs_inode, 0);
325 cifs_inode->clientCanCacheAll = false;
326 cifs_inode->delete_pending = false; 325 cifs_inode->delete_pending = false;
327 cifs_inode->invalid_mapping = false; 326 cifs_inode->invalid_mapping = false;
328 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 327 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
@@ -459,9 +458,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
459 seq_printf(s, ",acl"); 458 seq_printf(s, ",acl");
460 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 459 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
461 seq_printf(s, ",mfsymlinks"); 460 seq_printf(s, ",mfsymlinks");
461 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
462 seq_printf(s, ",fsc");
462 463
463 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 464 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
464 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 465 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
466 /* convert actimeo and display it in seconds */
467 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
465 468
466 return 0; 469 return 0;
467} 470}
@@ -934,7 +937,6 @@ init_cifs(void)
934 GlobalCurrentXid = 0; 937 GlobalCurrentXid = 0;
935 GlobalTotalActiveXid = 0; 938 GlobalTotalActiveXid = 0;
936 GlobalMaxActiveXid = 0; 939 GlobalMaxActiveXid = 0;
937 memset(Local_System_Name, 0, 15);
938 spin_lock_init(&cifs_tcp_ses_lock); 940 spin_lock_init(&cifs_tcp_ses_lock);
939 spin_lock_init(&cifs_file_list_lock); 941 spin_lock_init(&cifs_file_list_lock);
940 spin_lock_init(&GlobalMid_Lock); 942 spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f259e4d7612d..7136c0c3e2f9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -45,6 +45,16 @@
45#define CIFS_MIN_RCV_POOL 4 45#define CIFS_MIN_RCV_POOL 4
46 46
47/* 47/*
48 * default attribute cache timeout (jiffies)
49 */
50#define CIFS_DEF_ACTIMEO (1 * HZ)
51
52/*
53 * max attribute cache timeout (jiffies) - 2^30
54 */
55#define CIFS_MAX_ACTIMEO (1 << 30)
56
57/*
48 * MAX_REQ is the maximum number of requests that WE will send 58 * MAX_REQ is the maximum number of requests that WE will send
49 * on one socket concurrently. It also matches the most common 59 * on one socket concurrently. It also matches the most common
50 * value of max multiplex returned by servers. We may 60 * value of max multiplex returned by servers. We may
@@ -336,7 +346,8 @@ struct cifsTconInfo {
336 * "get" on the container. 346 * "get" on the container.
337 */ 347 */
338struct tcon_link { 348struct tcon_link {
339 unsigned long tl_index; 349 struct rb_node tl_rbnode;
350 uid_t tl_uid;
340 unsigned long tl_flags; 351 unsigned long tl_flags;
341#define TCON_LINK_MASTER 0 352#define TCON_LINK_MASTER 0
342#define TCON_LINK_PENDING 1 353#define TCON_LINK_PENDING 1
@@ -745,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
745GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ 756GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
746GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ 757GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
747 /* on midQ entries */ 758 /* on midQ entries */
748GLOBAL_EXTERN char Local_System_Name[15];
749
750/* 759/*
751 * Global counters, updated atomically 760 * Global counters, updated atomically
752 */ 761 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index edb6d90efdf2..e6d1481b16c1 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,7 +54,8 @@ do { \
54 __func__, curr_xid, (int)rc); \ 54 __func__, curr_xid, (int)rc); \
55} while (0) 55} while (0)
56extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
58 struct cifsTconInfo *tcon);
58extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 59extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
59extern char *cifs_compose_mount_options(const char *sb_mountdata, 60extern char *cifs_compose_mount_options(const char *sb_mountdata,
60 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
@@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 80 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 81extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 82extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
82#ifdef CONFIG_CIFS_EXPERIMENTAL
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84#endif
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 84extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 86extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,6 +103,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
104extern u64 cifs_UnixTimeToNT(struct timespec); 103extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 104extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 105 int offset);
106extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
107 107
108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, 108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 struct file *file, struct tcon_link *tlink, 109 struct file *file, struct tcon_link *tlink,
@@ -129,10 +129,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
129extern int cifs_get_inode_info_unix(struct inode **pinode, 129extern int cifs_get_inode_info_unix(struct inode **pinode,
130 const unsigned char *search_path, 130 const unsigned char *search_path,
131 struct super_block *sb, int xid); 131 struct super_block *sb, int xid);
132extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 132extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
133 struct cifs_fattr *fattr, struct inode *inode, 133 struct cifs_fattr *fattr, struct inode *inode,
134 const char *path, const __u16 *pfid); 134 const char *path, const __u16 *pfid);
135extern int mode_to_acl(struct inode *inode, const char *path, __u64); 135extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
136extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
137 const char *, u32 *);
136 138
137extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 139extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
138 const char *); 140 const char *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5a..67acfb3acad2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2478,95 +2478,6 @@ querySymLinkRetry:
2478} 2478}
2479 2479
2480#ifdef CONFIG_CIFS_EXPERIMENTAL 2480#ifdef CONFIG_CIFS_EXPERIMENTAL
2481/* Initialize NT TRANSACT SMB into small smb request buffer.
2482 This assumes that all NT TRANSACTS that we init here have
2483 total parm and data under about 400 bytes (to fit in small cifs
2484 buffer size), which is the case so far, it easily fits. NB:
2485 Setup words themselves and ByteCount
2486 MaxSetupCount (size of returned setup area) and
2487 MaxParameterCount (returned parms size) must be set by caller */
2488static int
2489smb_init_nttransact(const __u16 sub_command, const int setup_count,
2490 const int parm_len, struct cifsTconInfo *tcon,
2491 void **ret_buf)
2492{
2493 int rc;
2494 __u32 temp_offset;
2495 struct smb_com_ntransact_req *pSMB;
2496
2497 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2498 (void **)&pSMB);
2499 if (rc)
2500 return rc;
2501 *ret_buf = (void *)pSMB;
2502 pSMB->Reserved = 0;
2503 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2504 pSMB->TotalDataCount = 0;
2505 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2506 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2507 pSMB->ParameterCount = pSMB->TotalParameterCount;
2508 pSMB->DataCount = pSMB->TotalDataCount;
2509 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
2510 (setup_count * 2) - 4 /* for rfc1001 length itself */;
2511 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
2512 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
2513 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
2514 pSMB->SubCommand = cpu_to_le16(sub_command);
2515 return 0;
2516}
2517
2518static int
2519validate_ntransact(char *buf, char **ppparm, char **ppdata,
2520 __u32 *pparmlen, __u32 *pdatalen)
2521{
2522 char *end_of_smb;
2523 __u32 data_count, data_offset, parm_count, parm_offset;
2524 struct smb_com_ntransact_rsp *pSMBr;
2525
2526 *pdatalen = 0;
2527 *pparmlen = 0;
2528
2529 if (buf == NULL)
2530 return -EINVAL;
2531
2532 pSMBr = (struct smb_com_ntransact_rsp *)buf;
2533
2534 /* ByteCount was converted from little endian in SendReceive */
2535 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
2536 (char *)&pSMBr->ByteCount;
2537
2538 data_offset = le32_to_cpu(pSMBr->DataOffset);
2539 data_count = le32_to_cpu(pSMBr->DataCount);
2540 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
2541 parm_count = le32_to_cpu(pSMBr->ParameterCount);
2542
2543 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
2544 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
2545
2546 /* should we also check that parm and data areas do not overlap? */
2547 if (*ppparm > end_of_smb) {
2548 cFYI(1, "parms start after end of smb");
2549 return -EINVAL;
2550 } else if (parm_count + *ppparm > end_of_smb) {
2551 cFYI(1, "parm end after end of smb");
2552 return -EINVAL;
2553 } else if (*ppdata > end_of_smb) {
2554 cFYI(1, "data starts after end of smb");
2555 return -EINVAL;
2556 } else if (data_count + *ppdata > end_of_smb) {
2557 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2558 *ppdata, data_count, (data_count + *ppdata),
2559 end_of_smb, pSMBr);
2560 return -EINVAL;
2561 } else if (parm_count + data_count > pSMBr->ByteCount) {
2562 cFYI(1, "parm count and data count larger than SMB");
2563 return -EINVAL;
2564 }
2565 *pdatalen = data_count;
2566 *pparmlen = parm_count;
2567 return 0;
2568}
2569
2570int 2481int
2571CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2482CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2572 const unsigned char *searchName, 2483 const unsigned char *searchName,
@@ -3056,7 +2967,97 @@ GetExtAttrOut:
3056 2967
3057#endif /* CONFIG_POSIX */ 2968#endif /* CONFIG_POSIX */
3058 2969
3059#ifdef CONFIG_CIFS_EXPERIMENTAL 2970#ifdef CONFIG_CIFS_ACL
2971/*
2972 * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
2973 * all NT TRANSACTS that we init here have total parm and data under about 400
2974 * bytes (to fit in small cifs buffer size), which is the case so far, it
2975 * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of
2976 * returned setup area) and MaxParameterCount (returned parms size) must be set
2977 * by caller
2978 */
2979static int
2980smb_init_nttransact(const __u16 sub_command, const int setup_count,
2981 const int parm_len, struct cifsTconInfo *tcon,
2982 void **ret_buf)
2983{
2984 int rc;
2985 __u32 temp_offset;
2986 struct smb_com_ntransact_req *pSMB;
2987
2988 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2989 (void **)&pSMB);
2990 if (rc)
2991 return rc;
2992 *ret_buf = (void *)pSMB;
2993 pSMB->Reserved = 0;
2994 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2995 pSMB->TotalDataCount = 0;
2996 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2997 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2998 pSMB->ParameterCount = pSMB->TotalParameterCount;
2999 pSMB->DataCount = pSMB->TotalDataCount;
3000 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
3001 (setup_count * 2) - 4 /* for rfc1001 length itself */;
3002 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
3003 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
3004 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
3005 pSMB->SubCommand = cpu_to_le16(sub_command);
3006 return 0;
3007}
3008
3009static int
3010validate_ntransact(char *buf, char **ppparm, char **ppdata,
3011 __u32 *pparmlen, __u32 *pdatalen)
3012{
3013 char *end_of_smb;
3014 __u32 data_count, data_offset, parm_count, parm_offset;
3015 struct smb_com_ntransact_rsp *pSMBr;
3016
3017 *pdatalen = 0;
3018 *pparmlen = 0;
3019
3020 if (buf == NULL)
3021 return -EINVAL;
3022
3023 pSMBr = (struct smb_com_ntransact_rsp *)buf;
3024
3025 /* ByteCount was converted from little endian in SendReceive */
3026 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
3027 (char *)&pSMBr->ByteCount;
3028
3029 data_offset = le32_to_cpu(pSMBr->DataOffset);
3030 data_count = le32_to_cpu(pSMBr->DataCount);
3031 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
3032 parm_count = le32_to_cpu(pSMBr->ParameterCount);
3033
3034 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
3035 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
3036
3037 /* should we also check that parm and data areas do not overlap? */
3038 if (*ppparm > end_of_smb) {
3039 cFYI(1, "parms start after end of smb");
3040 return -EINVAL;
3041 } else if (parm_count + *ppparm > end_of_smb) {
3042 cFYI(1, "parm end after end of smb");
3043 return -EINVAL;
3044 } else if (*ppdata > end_of_smb) {
3045 cFYI(1, "data starts after end of smb");
3046 return -EINVAL;
3047 } else if (data_count + *ppdata > end_of_smb) {
3048 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
3049 *ppdata, data_count, (data_count + *ppdata),
3050 end_of_smb, pSMBr);
3051 return -EINVAL;
3052 } else if (parm_count + data_count > pSMBr->ByteCount) {
3053 cFYI(1, "parm count and data count larger than SMB");
3054 return -EINVAL;
3055 }
3056 *pdatalen = data_count;
3057 *pparmlen = parm_count;
3058 return 0;
3059}
3060
3060/* Get Security Descriptor (by handle) from remote server for a file or dir */ 3061/* Get Security Descriptor (by handle) from remote server for a file or dir */
3061int 3062int
3062CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, 3063CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3214,7 +3215,7 @@ setCifsAclRetry:
3214 return (rc); 3215 return (rc);
3215} 3216}
3216 3217
3217#endif /* CONFIG_CIFS_EXPERIMENTAL */ 3218#endif /* CONFIG_CIFS_ACL */
3218 3219
3219/* Legacy Query Path Information call for lookup to old servers such 3220/* Legacy Query Path Information call for lookup to old servers such
3220 as Win9x/WinME */ 3221 as Win9x/WinME */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9eb327defa1d..cc1a8604a790 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -105,6 +105,7 @@ struct smb_vol {
105 unsigned int wsize; 105 unsigned int wsize;
106 bool sockopt_tcp_nodelay:1; 106 bool sockopt_tcp_nodelay:1;
107 unsigned short int port; 107 unsigned short int port;
108 unsigned long actimeo; /* attribute cache timeout (jiffies) */
108 char *prepath; 109 char *prepath;
109 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 110 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
110 struct nls_table *local_nls; 111 struct nls_table *local_nls;
@@ -116,6 +117,7 @@ struct smb_vol {
116 117
117static int ipv4_connect(struct TCP_Server_Info *server); 118static int ipv4_connect(struct TCP_Server_Info *server);
118static int ipv6_connect(struct TCP_Server_Info *server); 119static int ipv6_connect(struct TCP_Server_Info *server);
120static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
119static void cifs_prune_tlinks(struct work_struct *work); 121static void cifs_prune_tlinks(struct work_struct *work);
120 122
121/* 123/*
@@ -805,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname,
805 short int override_gid = -1; 807 short int override_gid = -1;
806 bool uid_specified = false; 808 bool uid_specified = false;
807 bool gid_specified = false; 809 bool gid_specified = false;
810 char *nodename = utsname()->nodename;
808 811
809 separator[0] = ','; 812 separator[0] = ',';
810 separator[1] = 0; 813 separator[1] = 0;
811 814
812 if (Local_System_Name[0] != 0) 815 /*
813 memcpy(vol->source_rfc1001_name, Local_System_Name, 15); 816 * does not have to be perfect mapping since field is
814 else { 817 * informational, only used for servers that do not support
815 char *nodename = utsname()->nodename; 818 * port 445 and it can be overridden at mount time
816 int n = strnlen(nodename, 15); 819 */
817 memset(vol->source_rfc1001_name, 0x20, 15); 820 memset(vol->source_rfc1001_name, 0x20, 15);
818 for (i = 0; i < n; i++) { 821 for (i = 0; i < strnlen(nodename, 15); i++)
819 /* does not have to be perfect mapping since field is 822 vol->source_rfc1001_name[i] = toupper(nodename[i]);
820 informational, only used for servers that do not support 823
821 port 445 and it can be overridden at mount time */
822 vol->source_rfc1001_name[i] = toupper(nodename[i]);
823 }
824 }
825 vol->source_rfc1001_name[15] = 0; 824 vol->source_rfc1001_name[15] = 0;
826 /* null target name indicates to use *SMBSERVR default called name 825 /* null target name indicates to use *SMBSERVR default called name
827 if we end up sending RFC1001 session initialize */ 826 if we end up sending RFC1001 session initialize */
@@ -839,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname,
839 /* default to using server inode numbers where available */ 838 /* default to using server inode numbers where available */
840 vol->server_ino = 1; 839 vol->server_ino = 1;
841 840
841 vol->actimeo = CIFS_DEF_ACTIMEO;
842
842 if (!options) 843 if (!options)
843 return 1; 844 return 1;
844 845
@@ -1213,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname,
1213 printk(KERN_WARNING "CIFS: server net" 1214 printk(KERN_WARNING "CIFS: server net"
1214 "biosname longer than 15 truncated.\n"); 1215 "biosname longer than 15 truncated.\n");
1215 } 1216 }
1217 } else if (strnicmp(data, "actimeo", 7) == 0) {
1218 if (value && *value) {
1219 vol->actimeo = HZ * simple_strtoul(value,
1220 &value, 0);
1221 if (vol->actimeo > CIFS_MAX_ACTIMEO) {
1222 cERROR(1, "CIFS: attribute cache"
1223 "timeout too large");
1224 return 1;
1225 }
1226 }
1216 } else if (strnicmp(data, "credentials", 4) == 0) { 1227 } else if (strnicmp(data, "credentials", 4) == 0) {
1217 /* ignore */ 1228 /* ignore */
1218 } else if (strnicmp(data, "version", 3) == 0) { 1229 } else if (strnicmp(data, "version", 3) == 0) {
@@ -1351,6 +1362,11 @@ cifs_parse_mount_options(char *options, const char *devname,
1351 "supported. Instead set " 1362 "supported. Instead set "
1352 "/proc/fs/cifs/LookupCacheEnabled to 0\n"); 1363 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
1353 } else if (strnicmp(data, "fsc", 3) == 0) { 1364 } else if (strnicmp(data, "fsc", 3) == 0) {
1365#ifndef CONFIG_CIFS_FSCACHE
1366 cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
1367 "kernel config option set");
1368 return 1;
1369#endif
1354 vol->fsc = true; 1370 vol->fsc = true;
1355 } else if (strnicmp(data, "mfsymlinks", 10) == 0) { 1371 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1356 vol->mfsymlinks = true; 1372 vol->mfsymlinks = true;
@@ -2565,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2565 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2581 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2566 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); 2582 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2567 2583
2584 cifs_sb->actimeo = pvolume_info->actimeo;
2585
2568 if (pvolume_info->noperm) 2586 if (pvolume_info->noperm)
2569 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2587 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
2570 if (pvolume_info->setuids) 2588 if (pvolume_info->setuids)
@@ -2815,13 +2833,13 @@ remote_path_check:
2815 /* check if a whole path (including prepath) is not remote */ 2833 /* check if a whole path (including prepath) is not remote */
2816 if (!rc && cifs_sb->prepathlen && tcon) { 2834 if (!rc && cifs_sb->prepathlen && tcon) {
2817 /* build_path_to_root works only when we have a valid tcon */ 2835 /* build_path_to_root works only when we have a valid tcon */
2818 full_path = cifs_build_path_to_root(cifs_sb); 2836 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2819 if (full_path == NULL) { 2837 if (full_path == NULL) {
2820 rc = -ENOMEM; 2838 rc = -ENOMEM;
2821 goto mount_fail_check; 2839 goto mount_fail_check;
2822 } 2840 }
2823 rc = is_path_accessible(xid, tcon, cifs_sb, full_path); 2841 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2824 if (rc != -EREMOTE) { 2842 if (rc != 0 && rc != -EREMOTE) {
2825 kfree(full_path); 2843 kfree(full_path);
2826 goto mount_fail_check; 2844 goto mount_fail_check;
2827 } 2845 }
@@ -2900,24 +2918,16 @@ remote_path_check:
2900 goto mount_fail_check; 2918 goto mount_fail_check;
2901 } 2919 }
2902 2920
2903 tlink->tl_index = pSesInfo->linux_uid; 2921 tlink->tl_uid = pSesInfo->linux_uid;
2904 tlink->tl_tcon = tcon; 2922 tlink->tl_tcon = tcon;
2905 tlink->tl_time = jiffies; 2923 tlink->tl_time = jiffies;
2906 set_bit(TCON_LINK_MASTER, &tlink->tl_flags); 2924 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2907 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); 2925 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2908 2926
2909 rc = radix_tree_preload(GFP_KERNEL); 2927 cifs_sb->master_tlink = tlink;
2910 if (rc == -ENOMEM) {
2911 kfree(tlink);
2912 goto mount_fail_check;
2913 }
2914
2915 spin_lock(&cifs_sb->tlink_tree_lock); 2928 spin_lock(&cifs_sb->tlink_tree_lock);
2916 radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); 2929 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
2917 radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
2918 CIFS_TLINK_MASTER_TAG);
2919 spin_unlock(&cifs_sb->tlink_tree_lock); 2930 spin_unlock(&cifs_sb->tlink_tree_lock);
2920 radix_tree_preload_end();
2921 2931
2922 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, 2932 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2923 TLINK_IDLE_EXPIRE); 2933 TLINK_IDLE_EXPIRE);
@@ -3107,32 +3117,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3107int 3117int
3108cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3118cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3109{ 3119{
3110 int i, ret; 3120 struct rb_root *root = &cifs_sb->tlink_tree;
3121 struct rb_node *node;
3122 struct tcon_link *tlink;
3111 char *tmp; 3123 char *tmp;
3112 struct tcon_link *tlink[8];
3113 unsigned long index = 0;
3114 3124
3115 cancel_delayed_work_sync(&cifs_sb->prune_tlinks); 3125 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
3116 3126
3117 do { 3127 spin_lock(&cifs_sb->tlink_tree_lock);
3118 spin_lock(&cifs_sb->tlink_tree_lock); 3128 while ((node = rb_first(root))) {
3119 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, 3129 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3120 (void **)tlink, index, 3130 cifs_get_tlink(tlink);
3121 ARRAY_SIZE(tlink)); 3131 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3122 /* increment index for next pass */ 3132 rb_erase(node, root);
3123 if (ret > 0)
3124 index = tlink[ret - 1]->tl_index + 1;
3125 for (i = 0; i < ret; i++) {
3126 cifs_get_tlink(tlink[i]);
3127 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3128 radix_tree_delete(&cifs_sb->tlink_tree,
3129 tlink[i]->tl_index);
3130 }
3131 spin_unlock(&cifs_sb->tlink_tree_lock);
3132 3133
3133 for (i = 0; i < ret; i++) 3134 spin_unlock(&cifs_sb->tlink_tree_lock);
3134 cifs_put_tlink(tlink[i]); 3135 cifs_put_tlink(tlink);
3135 } while (ret != 0); 3136 spin_lock(&cifs_sb->tlink_tree_lock);
3137 }
3138 spin_unlock(&cifs_sb->tlink_tree_lock);
3136 3139
3137 tmp = cifs_sb->prepath; 3140 tmp = cifs_sb->prepath;
3138 cifs_sb->prepathlen = 0; 3141 cifs_sb->prepathlen = 0;
@@ -3271,22 +3274,10 @@ out:
3271 return tcon; 3274 return tcon;
3272} 3275}
3273 3276
3274static struct tcon_link * 3277static inline struct tcon_link *
3275cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) 3278cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3276{ 3279{
3277 struct tcon_link *tlink; 3280 return cifs_sb->master_tlink;
3278 unsigned int ret;
3279
3280 spin_lock(&cifs_sb->tlink_tree_lock);
3281 ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
3282 0, 1, CIFS_TLINK_MASTER_TAG);
3283 spin_unlock(&cifs_sb->tlink_tree_lock);
3284
3285 /* the master tcon should always be present */
3286 if (ret == 0)
3287 BUG();
3288
3289 return tlink;
3290} 3281}
3291 3282
3292struct cifsTconInfo * 3283struct cifsTconInfo *
@@ -3302,6 +3293,47 @@ cifs_sb_tcon_pending_wait(void *unused)
3302 return signal_pending(current) ? -ERESTARTSYS : 0; 3293 return signal_pending(current) ? -ERESTARTSYS : 0;
3303} 3294}
3304 3295
3296/* find and return a tlink with given uid */
3297static struct tcon_link *
3298tlink_rb_search(struct rb_root *root, uid_t uid)
3299{
3300 struct rb_node *node = root->rb_node;
3301 struct tcon_link *tlink;
3302
3303 while (node) {
3304 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3305
3306 if (tlink->tl_uid > uid)
3307 node = node->rb_left;
3308 else if (tlink->tl_uid < uid)
3309 node = node->rb_right;
3310 else
3311 return tlink;
3312 }
3313 return NULL;
3314}
3315
3316/* insert a tcon_link into the tree */
3317static void
3318tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
3319{
3320 struct rb_node **new = &(root->rb_node), *parent = NULL;
3321 struct tcon_link *tlink;
3322
3323 while (*new) {
3324 tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
3325 parent = *new;
3326
3327 if (tlink->tl_uid > new_tlink->tl_uid)
3328 new = &((*new)->rb_left);
3329 else
3330 new = &((*new)->rb_right);
3331 }
3332
3333 rb_link_node(&new_tlink->tl_rbnode, parent, new);
3334 rb_insert_color(&new_tlink->tl_rbnode, root);
3335}
3336
3305/* 3337/*
3306 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the 3338 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3307 * current task. 3339 * current task.
@@ -3309,7 +3341,7 @@ cifs_sb_tcon_pending_wait(void *unused)
3309 * If the superblock doesn't refer to a multiuser mount, then just return 3341 * If the superblock doesn't refer to a multiuser mount, then just return
3310 * the master tcon for the mount. 3342 * the master tcon for the mount.
3311 * 3343 *
3312 * First, search the radix tree for an existing tcon for this fsuid. If one 3344 * First, search the rbtree for an existing tcon for this fsuid. If one
3313 * exists, then check to see if it's pending construction. If it is then wait 3345 * exists, then check to see if it's pending construction. If it is then wait
3314 * for construction to complete. Once it's no longer pending, check to see if 3346 * for construction to complete. Once it's no longer pending, check to see if
3315 * it failed and either return an error or retry construction, depending on 3347 * it failed and either return an error or retry construction, depending on
@@ -3322,14 +3354,14 @@ struct tcon_link *
3322cifs_sb_tlink(struct cifs_sb_info *cifs_sb) 3354cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3323{ 3355{
3324 int ret; 3356 int ret;
3325 unsigned long fsuid = (unsigned long) current_fsuid(); 3357 uid_t fsuid = current_fsuid();
3326 struct tcon_link *tlink, *newtlink; 3358 struct tcon_link *tlink, *newtlink;
3327 3359
3328 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) 3360 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3329 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); 3361 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3330 3362
3331 spin_lock(&cifs_sb->tlink_tree_lock); 3363 spin_lock(&cifs_sb->tlink_tree_lock);
3332 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); 3364 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3333 if (tlink) 3365 if (tlink)
3334 cifs_get_tlink(tlink); 3366 cifs_get_tlink(tlink);
3335 spin_unlock(&cifs_sb->tlink_tree_lock); 3367 spin_unlock(&cifs_sb->tlink_tree_lock);
@@ -3338,36 +3370,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3338 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); 3370 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3339 if (newtlink == NULL) 3371 if (newtlink == NULL)
3340 return ERR_PTR(-ENOMEM); 3372 return ERR_PTR(-ENOMEM);
3341 newtlink->tl_index = fsuid; 3373 newtlink->tl_uid = fsuid;
3342 newtlink->tl_tcon = ERR_PTR(-EACCES); 3374 newtlink->tl_tcon = ERR_PTR(-EACCES);
3343 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); 3375 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3344 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); 3376 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3345 cifs_get_tlink(newtlink); 3377 cifs_get_tlink(newtlink);
3346 3378
3347 ret = radix_tree_preload(GFP_KERNEL);
3348 if (ret != 0) {
3349 kfree(newtlink);
3350 return ERR_PTR(ret);
3351 }
3352
3353 spin_lock(&cifs_sb->tlink_tree_lock); 3379 spin_lock(&cifs_sb->tlink_tree_lock);
3354 /* was one inserted after previous search? */ 3380 /* was one inserted after previous search? */
3355 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); 3381 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3356 if (tlink) { 3382 if (tlink) {
3357 cifs_get_tlink(tlink); 3383 cifs_get_tlink(tlink);
3358 spin_unlock(&cifs_sb->tlink_tree_lock); 3384 spin_unlock(&cifs_sb->tlink_tree_lock);
3359 radix_tree_preload_end();
3360 kfree(newtlink); 3385 kfree(newtlink);
3361 goto wait_for_construction; 3386 goto wait_for_construction;
3362 } 3387 }
3363 ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
3364 spin_unlock(&cifs_sb->tlink_tree_lock);
3365 radix_tree_preload_end();
3366 if (ret) {
3367 kfree(newtlink);
3368 return ERR_PTR(ret);
3369 }
3370 tlink = newtlink; 3388 tlink = newtlink;
3389 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
3390 spin_unlock(&cifs_sb->tlink_tree_lock);
3371 } else { 3391 } else {
3372wait_for_construction: 3392wait_for_construction:
3373 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, 3393 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
@@ -3413,39 +3433,39 @@ cifs_prune_tlinks(struct work_struct *work)
3413{ 3433{
3414 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, 3434 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3415 prune_tlinks.work); 3435 prune_tlinks.work);
3416 struct tcon_link *tlink[8]; 3436 struct rb_root *root = &cifs_sb->tlink_tree;
3417 unsigned long now = jiffies; 3437 struct rb_node *node = rb_first(root);
3418 unsigned long index = 0; 3438 struct rb_node *tmp;
3419 int i, ret; 3439 struct tcon_link *tlink;
3420 3440
3421 do { 3441 /*
3422 spin_lock(&cifs_sb->tlink_tree_lock); 3442 * Because we drop the spinlock in the loop in order to put the tlink
3423 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, 3443 * it's not guarded against removal of links from the tree. The only
3424 (void **)tlink, index, 3444 * places that remove entries from the tree are this function and
3425 ARRAY_SIZE(tlink)); 3445 * umounts. Because this function is non-reentrant and is canceled
3426 /* increment index for next pass */ 3446 * before umount can proceed, this is safe.
3427 if (ret > 0) 3447 */
3428 index = tlink[ret - 1]->tl_index + 1; 3448 spin_lock(&cifs_sb->tlink_tree_lock);
3429 for (i = 0; i < ret; i++) { 3449 node = rb_first(root);
3430 if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || 3450 while (node != NULL) {
3431 atomic_read(&tlink[i]->tl_count) != 0 || 3451 tmp = node;
3432 time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, 3452 node = rb_next(tmp);
3433 now)) { 3453 tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
3434 tlink[i] = NULL; 3454
3435 continue; 3455 if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
3436 } 3456 atomic_read(&tlink->tl_count) != 0 ||
3437 cifs_get_tlink(tlink[i]); 3457 time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
3438 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); 3458 continue;
3439 radix_tree_delete(&cifs_sb->tlink_tree,
3440 tlink[i]->tl_index);
3441 }
3442 spin_unlock(&cifs_sb->tlink_tree_lock);
3443 3459
3444 for (i = 0; i < ret; i++) { 3460 cifs_get_tlink(tlink);
3445 if (tlink[i] != NULL) 3461 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3446 cifs_put_tlink(tlink[i]); 3462 rb_erase(tmp, root);
3447 } 3463
3448 } while (ret != 0); 3464 spin_unlock(&cifs_sb->tlink_tree_lock);
3465 cifs_put_tlink(tlink);
3466 spin_lock(&cifs_sb->tlink_tree_lock);
3467 }
3468 spin_unlock(&cifs_sb->tlink_tree_lock);
3449 3469
3450 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, 3470 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3451 TLINK_IDLE_EXPIRE); 3471 TLINK_IDLE_EXPIRE);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad3..548f06230a6d 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
66 /* Search for server name delimiter */ 66 /* Search for server name delimiter */
67 sep = memchr(hostname, '\\', len); 67 sep = memchr(hostname, '\\', len);
68 if (sep) 68 if (sep)
69 len = sep - unc; 69 len = sep - hostname;
70 else 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s", 71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc); 72 __func__, unc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ae82159cf7fa..5a28660ca2b5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -146,12 +146,7 @@ client_can_cache:
146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
147 xid, NULL); 147 xid, NULL);
148 148
149 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 149 cifs_set_oplock_level(pCifsInode, oplock);
150 pCifsInode->clientCanCacheAll = true;
151 pCifsInode->clientCanCacheRead = true;
152 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
153 } else if ((oplock & 0xF) == OPLOCK_READ)
154 pCifsInode->clientCanCacheRead = true;
155 150
156 return rc; 151 return rc;
157} 152}
@@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
253 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); 248 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
254 spin_unlock(&cifs_file_list_lock); 249 spin_unlock(&cifs_file_list_lock);
255 250
256 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 251 cifs_set_oplock_level(pCifsInode, oplock);
257 pCifsInode->clientCanCacheAll = true;
258 pCifsInode->clientCanCacheRead = true;
259 cFYI(1, "Exclusive Oplock inode %p", inode);
260 } else if ((oplock & 0xF) == OPLOCK_READ)
261 pCifsInode->clientCanCacheRead = true;
262 252
263 file->private_data = pCifsFile; 253 file->private_data = pCifsFile;
264 return pCifsFile; 254 return pCifsFile;
@@ -271,8 +261,9 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
271 */ 261 */
272void cifsFileInfo_put(struct cifsFileInfo *cifs_file) 262void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
273{ 263{
264 struct inode *inode = cifs_file->dentry->d_inode;
274 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); 265 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
275 struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); 266 struct cifsInodeInfo *cifsi = CIFS_I(inode);
276 struct cifsLockInfo *li, *tmp; 267 struct cifsLockInfo *li, *tmp;
277 268
278 spin_lock(&cifs_file_list_lock); 269 spin_lock(&cifs_file_list_lock);
@@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
288 if (list_empty(&cifsi->openFileList)) { 279 if (list_empty(&cifsi->openFileList)) {
289 cFYI(1, "closing last open instance for inode %p", 280 cFYI(1, "closing last open instance for inode %p",
290 cifs_file->dentry->d_inode); 281 cifs_file->dentry->d_inode);
291 cifsi->clientCanCacheRead = false; 282 cifs_set_oplock_level(cifsi, 0);
292 cifsi->clientCanCacheAll = false;
293 } 283 }
294 spin_unlock(&cifs_file_list_lock); 284 spin_unlock(&cifs_file_list_lock);
295 285
@@ -607,8 +597,6 @@ reopen_success:
607 rc = filemap_write_and_wait(inode->i_mapping); 597 rc = filemap_write_and_wait(inode->i_mapping);
608 mapping_set_error(inode->i_mapping, rc); 598 mapping_set_error(inode->i_mapping, rc);
609 599
610 pCifsInode->clientCanCacheAll = false;
611 pCifsInode->clientCanCacheRead = false;
612 if (tcon->unix_ext) 600 if (tcon->unix_ext)
613 rc = cifs_get_inode_info_unix(&inode, 601 rc = cifs_get_inode_info_unix(&inode,
614 full_path, inode->i_sb, xid); 602 full_path, inode->i_sb, xid);
@@ -622,18 +610,9 @@ reopen_success:
622 invalidate the current end of file on the server 610 invalidate the current end of file on the server
623 we can not go to the server to get the new inod 611 we can not go to the server to get the new inod
624 info */ 612 info */
625 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 613
626 pCifsInode->clientCanCacheAll = true; 614 cifs_set_oplock_level(pCifsInode, oplock);
627 pCifsInode->clientCanCacheRead = true; 615
628 cFYI(1, "Exclusive Oplock granted on inode %p",
629 pCifsFile->dentry->d_inode);
630 } else if ((oplock & 0xF) == OPLOCK_READ) {
631 pCifsInode->clientCanCacheRead = true;
632 pCifsInode->clientCanCacheAll = false;
633 } else {
634 pCifsInode->clientCanCacheRead = false;
635 pCifsInode->clientCanCacheAll = false;
636 }
637 cifs_relock_file(pCifsFile); 616 cifs_relock_file(pCifsFile);
638 617
639reopen_error_exit: 618reopen_error_exit:
@@ -775,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
775 754
776 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 755 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
777 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); 756 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
778
779 if (file->private_data == NULL) {
780 rc = -EBADF;
781 FreeXid(xid);
782 return rc;
783 }
784 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 757 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
785 758
786 if ((tcon->ses->capabilities & CAP_UNIX) && 759 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -956,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
956ssize_t cifs_user_write(struct file *file, const char __user *write_data, 929ssize_t cifs_user_write(struct file *file, const char __user *write_data,
957 size_t write_size, loff_t *poffset) 930 size_t write_size, loff_t *poffset)
958{ 931{
932 struct inode *inode = file->f_path.dentry->d_inode;
959 int rc = 0; 933 int rc = 0;
960 unsigned int bytes_written = 0; 934 unsigned int bytes_written = 0;
961 unsigned int total_written; 935 unsigned int total_written;
@@ -963,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
963 struct cifsTconInfo *pTcon; 937 struct cifsTconInfo *pTcon;
964 int xid, long_op; 938 int xid, long_op;
965 struct cifsFileInfo *open_file; 939 struct cifsFileInfo *open_file;
966 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 940 struct cifsInodeInfo *cifsi = CIFS_I(inode);
967 941
968 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 942 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
969 943
@@ -1029,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1029 1003
1030 cifs_stats_bytes_written(pTcon, total_written); 1004 cifs_stats_bytes_written(pTcon, total_written);
1031 1005
1032 /* since the write may have blocked check these pointers again */
1033 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
1034 struct inode *inode = file->f_path.dentry->d_inode;
1035/* Do not update local mtime - server will set its actual value on write 1006/* Do not update local mtime - server will set its actual value on write
1036 * inode->i_ctime = inode->i_mtime = 1007 * inode->i_ctime = inode->i_mtime =
1037 * current_fs_time(inode->i_sb);*/ 1008 * current_fs_time(inode->i_sb);*/
1038 if (total_written > 0) { 1009 if (total_written > 0) {
1039 spin_lock(&inode->i_lock); 1010 spin_lock(&inode->i_lock);
1040 if (*poffset > file->f_path.dentry->d_inode->i_size) 1011 if (*poffset > inode->i_size)
1041 i_size_write(file->f_path.dentry->d_inode, 1012 i_size_write(inode, *poffset);
1042 *poffset); 1013 spin_unlock(&inode->i_lock);
1043 spin_unlock(&inode->i_lock);
1044 }
1045 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1046 } 1014 }
1015 mark_inode_dirty_sync(inode);
1016
1047 FreeXid(xid); 1017 FreeXid(xid);
1048 return total_written; 1018 return total_written;
1049} 1019}
@@ -1138,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1138 return total_written; 1108 return total_written;
1139} 1109}
1140 1110
1141#ifdef CONFIG_CIFS_EXPERIMENTAL
1142struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, 1111struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1143 bool fsuid_only) 1112 bool fsuid_only)
1144{ 1113{
@@ -1172,13 +1141,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1172 spin_unlock(&cifs_file_list_lock); 1141 spin_unlock(&cifs_file_list_lock);
1173 return NULL; 1142 return NULL;
1174} 1143}
1175#endif
1176 1144
1177struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, 1145struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1178 bool fsuid_only) 1146 bool fsuid_only)
1179{ 1147{
1180 struct cifsFileInfo *open_file; 1148 struct cifsFileInfo *open_file;
1181 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); 1149 struct cifs_sb_info *cifs_sb;
1182 bool any_available = false; 1150 bool any_available = false;
1183 int rc; 1151 int rc;
1184 1152
@@ -1192,6 +1160,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1192 return NULL; 1160 return NULL;
1193 } 1161 }
1194 1162
1163 cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1164
1195 /* only filter by fsuid on multiuser mounts */ 1165 /* only filter by fsuid on multiuser mounts */
1196 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) 1166 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1197 fsuid_only = false; 1167 fsuid_only = false;
@@ -2299,8 +2269,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2299 2269
2300void cifs_oplock_break_put(struct cifsFileInfo *cfile) 2270void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2301{ 2271{
2272 struct super_block *sb = cfile->dentry->d_sb;
2273
2302 cifsFileInfo_put(cfile); 2274 cifsFileInfo_put(cfile);
2303 cifs_sb_deactive(cfile->dentry->d_sb); 2275 cifs_sb_deactive(sb);
2304} 2276}
2305 2277
2306const struct address_space_operations cifs_addr_ops = { 2278const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a2ad94efcfe6..297a43d0ff7f 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
2 * fs/cifs/fscache.c - CIFS filesystem cache interface 2 * fs/cifs/fscache.c - CIFS filesystem cache interface
3 * 3 *
4 * Copyright (c) 2010 Novell, Inc. 4 * Copyright (c) 2010 Novell, Inc.
5 * Author(s): Suresh Jayaraman (sjayaraman@suse.de> 5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published 8 * it under the terms of the GNU Lesser General Public License as published
@@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
67 if (cifsi->fscache) 67 if (cifsi->fscache)
68 return; 68 return;
69 69
70 cifsi->fscache = fscache_acquire_cookie(tcon->fscache, 70 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
71 cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
71 &cifs_fscache_inode_object_def, cifsi); 72 &cifs_fscache_inode_object_def, cifsi);
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, 73 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
73 cifsi->fscache); 74 cifsi->fscache);
75 }
74} 76}
75 77
76void cifs_fscache_release_inode_cookie(struct inode *inode) 78void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
101{ 103{
102 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 104 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
103 cifs_fscache_disable_inode_cookie(inode); 105 cifs_fscache_disable_inode_cookie(inode);
104 else { 106 else
105 cifs_fscache_enable_inode_cookie(inode); 107 cifs_fscache_enable_inode_cookie(inode);
106 cFYI(1, "CIFS: fscache inode cookie set");
107 }
108} 108}
109 109
110void cifs_fscache_reset_inode_cookie(struct inode *inode) 110void cifs_fscache_reset_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 39869c3c3efb..589f3e3f6e00 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -686,13 +686,18 @@ int cifs_get_inode_info(struct inode **pinode,
686 cFYI(1, "cifs_sfu_type failed: %d", tmprc); 686 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
687 } 687 }
688 688
689#ifdef CONFIG_CIFS_EXPERIMENTAL 689#ifdef CONFIG_CIFS_ACL
690 /* fill in 0777 bits from ACL */ 690 /* fill in 0777 bits from ACL */
691 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 691 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
692 cFYI(1, "Getting mode bits from ACL"); 692 rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
693 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 693 pfid);
694 if (rc) {
695 cFYI(1, "%s: Getting ACL failed with error: %d",
696 __func__, rc);
697 goto cgii_exit;
698 }
694 } 699 }
695#endif 700#endif /* CONFIG_CIFS_ACL */
696 701
697 /* fill in remaining high mode bits e.g. SUID, VTX */ 702 /* fill in remaining high mode bits e.g. SUID, VTX */
698 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 703 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -723,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
723 .lookup = cifs_lookup, 728 .lookup = cifs_lookup,
724}; 729};
725 730
726char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) 731char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
732 struct cifsTconInfo *tcon)
727{ 733{
728 int pplen = cifs_sb->prepathlen; 734 int pplen = cifs_sb->prepathlen;
729 int dfsplen; 735 int dfsplen;
730 char *full_path = NULL; 736 char *full_path = NULL;
731 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
732 737
733 /* if no prefix path, simply set path to the root of share to "" */ 738 /* if no prefix path, simply set path to the root of share to "" */
734 if (pplen == 0) { 739 if (pplen == 0) {
@@ -870,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
870 char *full_path; 875 char *full_path;
871 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); 876 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
872 877
873 full_path = cifs_build_path_to_root(cifs_sb); 878 full_path = cifs_build_path_to_root(cifs_sb, tcon);
874 if (full_path == NULL) 879 if (full_path == NULL)
875 return ERR_PTR(-ENOMEM); 880 return ERR_PTR(-ENOMEM);
876 881
@@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
881 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 886 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
882 xid, NULL); 887 xid, NULL);
883 888
884 if (!inode) 889 if (!inode) {
885 return ERR_PTR(rc); 890 inode = ERR_PTR(rc);
891 goto out;
892 }
886 893
887#ifdef CONFIG_CIFS_FSCACHE 894#ifdef CONFIG_CIFS_FSCACHE
888 /* populate tcon->resource_id */ 895 /* populate tcon->resource_id */
@@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
898 inode->i_uid = cifs_sb->mnt_uid; 905 inode->i_uid = cifs_sb->mnt_uid;
899 inode->i_gid = cifs_sb->mnt_gid; 906 inode->i_gid = cifs_sb->mnt_gid;
900 } else if (rc) { 907 } else if (rc) {
901 kfree(full_path);
902 _FreeXid(xid);
903 iget_failed(inode); 908 iget_failed(inode);
904 return ERR_PTR(rc); 909 inode = ERR_PTR(rc);
905 } 910 }
906 911
907 912out:
908 kfree(full_path); 913 kfree(full_path);
909 /* can not call macro FreeXid here since in a void func 914 /* can not call macro FreeXid here since in a void func
910 * TODO: This is no longer true 915 * TODO: This is no longer true
@@ -1648,6 +1653,7 @@ static bool
1648cifs_inode_needs_reval(struct inode *inode) 1653cifs_inode_needs_reval(struct inode *inode)
1649{ 1654{
1650 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 1655 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1656 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1651 1657
1652 if (cifs_i->clientCanCacheRead) 1658 if (cifs_i->clientCanCacheRead)
1653 return false; 1659 return false;
@@ -1658,19 +1664,21 @@ cifs_inode_needs_reval(struct inode *inode)
1658 if (cifs_i->time == 0) 1664 if (cifs_i->time == 0)
1659 return true; 1665 return true;
1660 1666
1661 /* FIXME: the actimeo should be tunable */ 1667 if (!time_in_range(jiffies, cifs_i->time,
1662 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1668 cifs_i->time + cifs_sb->actimeo))
1663 return true; 1669 return true;
1664 1670
1665 /* hardlinked files w/ noserverino get "special" treatment */ 1671 /* hardlinked files w/ noserverino get "special" treatment */
1666 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && 1672 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1667 S_ISREG(inode->i_mode) && inode->i_nlink != 1) 1673 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1668 return true; 1674 return true;
1669 1675
1670 return false; 1676 return false;
1671} 1677}
1672 1678
1673/* check invalid_mapping flag and zap the cache if it's set */ 1679/*
1680 * Zap the cache. Called when invalid_mapping flag is set.
1681 */
1674static void 1682static void
1675cifs_invalidate_mapping(struct inode *inode) 1683cifs_invalidate_mapping(struct inode *inode)
1676{ 1684{
@@ -2114,11 +2122,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2114 2122
2115 if (attrs->ia_valid & ATTR_MODE) { 2123 if (attrs->ia_valid & ATTR_MODE) {
2116 rc = 0; 2124 rc = 0;
2117#ifdef CONFIG_CIFS_EXPERIMENTAL 2125#ifdef CONFIG_CIFS_ACL
2118 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 2126 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2119 rc = mode_to_acl(inode, full_path, mode); 2127 rc = mode_to_cifs_acl(inode, full_path, mode);
2120 else 2128 if (rc) {
2121#endif 2129 cFYI(1, "%s: Setting ACL failed with error: %d",
2130 __func__, rc);
2131 goto cifs_setattr_exit;
2132 }
2133 } else
2134#endif /* CONFIG_CIFS_ACL */
2122 if (((mode & S_IWUGO) == 0) && 2135 if (((mode & S_IWUGO) == 0) &&
2123 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { 2136 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
2124 2137
@@ -2177,7 +2190,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2177 2190
2178 setattr_copy(inode, attrs); 2191 setattr_copy(inode, attrs);
2179 mark_inode_dirty(inode); 2192 mark_inode_dirty(inode);
2180 return 0;
2181 2193
2182cifs_setattr_exit: 2194cifs_setattr_exit:
2183 kfree(full_path); 2195 kfree(full_path);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 077bf756f342..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data; 40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); 41 struct cifsTconInfo *tcon;
42 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
43 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
44 __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); 44 __u64 caps;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
62 break; 62 break;
63#ifdef CONFIG_CIFS_POSIX 63#ifdef CONFIG_CIFS_POSIX
64 case FS_IOC_GETFLAGS: 64 case FS_IOC_GETFLAGS:
65 if (pSMBFile == NULL)
66 break;
67 tcon = tlink_tcon(pSMBFile->tlink);
68 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
65 if (CIFS_UNIX_EXTATTR_CAP & caps) { 69 if (CIFS_UNIX_EXTATTR_CAP & caps) {
66 if (pSMBFile == NULL)
67 break;
68 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 70 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
69 &ExtAttrBits, &ExtAttrMask); 71 &ExtAttrBits, &ExtAttrMask);
70 if (rc == 0) 72 if (rc == 0)
@@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
75 break; 77 break;
76 78
77 case FS_IOC_SETFLAGS: 79 case FS_IOC_SETFLAGS:
80 if (pSMBFile == NULL)
81 break;
82 tcon = tlink_tcon(pSMBFile->tlink);
83 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
78 if (CIFS_UNIX_EXTATTR_CAP & caps) { 84 if (CIFS_UNIX_EXTATTR_CAP & caps) {
79 if (get_user(ExtAttrBits, (int __user *)arg)) { 85 if (get_user(ExtAttrBits, (int __user *)arg)) {
80 rc = -EFAULT; 86 rc = -EFAULT;
81 break; 87 break;
82 } 88 }
83 if (pSMBFile == NULL)
84 break;
85 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 89 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
86 extAttrBits, &ExtAttrMask);*/ 90 extAttrBits, &ExtAttrMask);*/
87 } 91 }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c4e296fe3518..43f10281bc19 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
569 569
570 cFYI(1, "file id match, oplock break"); 570 cFYI(1, "file id match, oplock break");
571 pCifsInode = CIFS_I(netfile->dentry->d_inode); 571 pCifsInode = CIFS_I(netfile->dentry->d_inode);
572 pCifsInode->clientCanCacheAll = false;
573 if (pSMB->OplockLevel == 0)
574 pCifsInode->clientCanCacheRead = false;
575 572
573 cifs_set_oplock_level(pCifsInode,
574 pSMB->OplockLevel);
576 /* 575 /*
577 * cifs_oplock_break_put() can't be called 576 * cifs_oplock_break_put() can't be called
578 * from here. Get reference after queueing 577 * from here. Get reference after queueing
@@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
722 cifs_sb_master_tcon(cifs_sb)->treeName); 721 cifs_sb_master_tcon(cifs_sb)->treeName);
723 } 722 }
724} 723}
724
725void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
726{
727 oplock &= 0xF;
728
729 if (oplock == OPLOCK_EXCLUSIVE) {
730 cinode->clientCanCacheAll = true;
731 cinode->clientCanCacheRead = true;
732 cFYI(1, "Exclusive Oplock granted on inode %p",
733 &cinode->vfs_inode);
734 } else if (oplock == OPLOCK_READ) {
735 cinode->clientCanCacheAll = false;
736 cinode->clientCanCacheRead = true;
737 cFYI(1, "Level II Oplock granted on inode %p",
738 &cinode->vfs_inode);
739 } else {
740 cinode->clientCanCacheAll = false;
741 cinode->clientCanCacheRead = false;
742 }
743}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ef7bb7b50f58..a73eb9f4bdaf 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file)
226 char *full_path = NULL; 226 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 227 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
229 struct tcon_link *tlink; 229 struct tcon_link *tlink = NULL;
230 struct cifsTconInfo *pTcon; 230 struct cifsTconInfo *pTcon;
231 231
232 tlink = cifs_sb_tlink(cifs_sb);
233 if (IS_ERR(tlink))
234 return PTR_ERR(tlink);
235 pTcon = tlink_tcon(tlink);
236
237 if (file->private_data == NULL)
238 file->private_data =
239 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
240 if (file->private_data == NULL) { 232 if (file->private_data == NULL) {
241 rc = -ENOMEM; 233 tlink = cifs_sb_tlink(cifs_sb);
242 goto error_exit; 234 if (IS_ERR(tlink))
235 return PTR_ERR(tlink);
236
237 cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
238 if (cifsFile == NULL) {
239 rc = -ENOMEM;
240 goto error_exit;
241 }
242 file->private_data = cifsFile;
243 cifsFile->tlink = cifs_get_tlink(tlink);
244 pTcon = tlink_tcon(tlink);
245 } else {
246 cifsFile = file->private_data;
247 pTcon = tlink_tcon(cifsFile->tlink);
243 } 248 }
244 249
245 cifsFile = file->private_data;
246 cifsFile->invalidHandle = true; 250 cifsFile->invalidHandle = true;
247 cifsFile->srch_inf.endOfSearch = false; 251 cifsFile->srch_inf.endOfSearch = false;
248 cifsFile->tlink = cifs_get_tlink(tlink);
249 252
250 full_path = build_path_from_dentry(file->f_path.dentry); 253 full_path = build_path_from_dentry(file->f_path.dentry);
251 if (full_path == NULL) { 254 if (full_path == NULL) {
@@ -756,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
756 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 759 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
757 ino, fattr.cf_dtype); 760 ino, fattr.cf_dtype);
758 761
759 /*
760 * we can not return filldir errors to the caller since they are
761 * "normal" when the stat blocksize is too small - we return remapped
762 * error instead
763 *
764 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
765 * case already. Why should we be clobbering other errors from it?
766 */
767 if (rc) {
768 cFYI(1, "filldir rc = %d", rc);
769 rc = -EOVERFLOW;
770 }
771 dput(tmp_dentry); 762 dput(tmp_dentry);
772 return rc; 763 return rc;
773} 764}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a264b744bb41..eae2a1491608 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
30 30
31#define MAX_EA_VALUE_SIZE 65535 31#define MAX_EA_VALUE_SIZE 65535
32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" 32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
33#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
33#define CIFS_XATTR_USER_PREFIX "user." 34#define CIFS_XATTR_USER_PREFIX "user."
34#define CIFS_XATTR_SYSTEM_PREFIX "system." 35#define CIFS_XATTR_SYSTEM_PREFIX "system."
35#define CIFS_XATTR_OS2_PREFIX "os2." 36#define CIFS_XATTR_OS2_PREFIX "os2."
36#define CIFS_XATTR_SECURITY_PREFIX ".security" 37#define CIFS_XATTR_SECURITY_PREFIX "security."
37#define CIFS_XATTR_TRUSTED_PREFIX "trusted." 38#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
38#define XATTR_TRUSTED_PREFIX_LEN 8 39#define XATTR_TRUSTED_PREFIX_LEN 8
39#define XATTR_SECURITY_PREFIX_LEN 9 40#define XATTR_SECURITY_PREFIX_LEN 9
@@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
277 cifs_sb->local_nls, 278 cifs_sb->local_nls,
278 cifs_sb->mnt_cifs_flags & 279 cifs_sb->mnt_cifs_flags &
279 CIFS_MOUNT_MAP_SPECIAL_CHR); 280 CIFS_MOUNT_MAP_SPECIAL_CHR);
280#ifdef CONFIG_CIFS_EXPERIMENTAL
281 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
282 __u16 fid;
283 int oplock = 0;
284 struct cifs_ntsd *pacl = NULL;
285 __u32 buflen = 0;
286 if (experimEnabled)
287 rc = CIFSSMBOpen(xid, pTcon, full_path,
288 FILE_OPEN, GENERIC_READ, 0, &fid,
289 &oplock, NULL, cifs_sb->local_nls,
290 cifs_sb->mnt_cifs_flags &
291 CIFS_MOUNT_MAP_SPECIAL_CHR);
292 /* else rc is EOPNOTSUPP from above */
293
294 if (rc == 0) {
295 rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
296 &buflen);
297 CIFSSMBClose(xid, pTcon, fid);
298 }
299 }
300#endif /* EXPERIMENTAL */
301#else 281#else
302 cFYI(1, "query POSIX ACL not supported yet"); 282 cFYI(1, "Query POSIX ACL not supported yet");
303#endif /* CONFIG_CIFS_POSIX */ 283#endif /* CONFIG_CIFS_POSIX */
304 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 284 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
305 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 285 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
311 cifs_sb->mnt_cifs_flags & 291 cifs_sb->mnt_cifs_flags &
312 CIFS_MOUNT_MAP_SPECIAL_CHR); 292 CIFS_MOUNT_MAP_SPECIAL_CHR);
313#else 293#else
314 cFYI(1, "query POSIX default ACL not supported yet"); 294 cFYI(1, "Query POSIX default ACL not supported yet");
315#endif 295#endif /* CONFIG_CIFS_POSIX */
296 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
297 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
298#ifdef CONFIG_CIFS_ACL
299 u32 acllen;
300 struct cifs_ntsd *pacl;
301
302 pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
303 full_path, &acllen);
304 if (IS_ERR(pacl)) {
305 rc = PTR_ERR(pacl);
306 cERROR(1, "%s: error %zd getting sec desc",
307 __func__, rc);
308 } else {
309 if (ea_value) {
310 if (acllen > buf_size)
311 acllen = -ERANGE;
312 else
313 memcpy(ea_value, pacl, acllen);
314 }
315 rc = acllen;
316 kfree(pacl);
317 }
318#else
319 cFYI(1, "Query CIFS ACL not supported yet");
320#endif /* CONFIG_CIFS_ACL */
316 } else if (strncmp(ea_name, 321 } else if (strncmp(ea_name,
317 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 322 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
318 cFYI(1, "Trusted xattr namespace not supported yet"); 323 cFYI(1, "Trusted xattr namespace not supported yet");
diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6b..eb1740ac8c0a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
1350 argv++; 1350 argv++;
1351 if (i++ >= max) 1351 if (i++ >= max)
1352 return -E2BIG; 1352 return -E2BIG;
1353
1354 if (fatal_signal_pending(current))
1355 return -ERESTARTNOHAND;
1356 cond_resched();
1353 } 1357 }
1354 } 1358 }
1355 return i; 1359 return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1391 while (len > 0) { 1395 while (len > 0) {
1392 int offset, bytes_to_copy; 1396 int offset, bytes_to_copy;
1393 1397
1398 if (fatal_signal_pending(current)) {
1399 ret = -ERESTARTNOHAND;
1400 goto out;
1401 }
1402 cond_resched();
1403
1394 offset = pos % PAGE_SIZE; 1404 offset = pos % PAGE_SIZE;
1395 if (offset == 0) 1405 if (offset == 0)
1396 offset = PAGE_SIZE; 1406 offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1407 if (!kmapped_page || kpos != (pos & PAGE_MASK)) { 1417 if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
1408 struct page *page; 1418 struct page *page;
1409 1419
1410#ifdef CONFIG_STACK_GROWSUP 1420 page = get_arg_page(bprm, pos, 1);
1411 ret = expand_stack_downwards(bprm->vma, pos); 1421 if (!page) {
1412 if (ret < 0) {
1413 /* We've exceed the stack rlimit. */
1414 ret = -E2BIG;
1415 goto out;
1416 }
1417#endif
1418 ret = get_user_pages(current, bprm->mm, pos,
1419 1, 1, 1, &page, NULL);
1420 if (ret <= 0) {
1421 /* We've exceed the stack rlimit. */
1422 ret = -E2BIG; 1422 ret = -E2BIG;
1423 goto out; 1423 goto out;
1424 } 1424 }
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
1539 return retval; 1539 return retval;
1540 1540
1541out: 1541out:
1542 if (bprm->mm) 1542 if (bprm->mm) {
1543 acct_arg_size(bprm, 0);
1543 mmput(bprm->mm); 1544 mmput(bprm->mm);
1545 }
1544 1546
1545out_file: 1547out_file:
1546 if (bprm->file) { 1548 if (bprm->file) {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa1..a60579b007b0 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
23#include <linux/ioctl.h> 22#include <linux/ioctl.h>
24#include <linux/if.h> 23#include <linux/if.h>
25#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 253732382d37..2720178b7718 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/smp_lock.h>
32#include <linux/file.h> 31#include <linux/file.h>
33#include <linux/crypto.h> 32#include <linux/crypto.h>
34#include "ecryptfs_kernel.h" 33#include "ecryptfs_kernel.h"
diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e9..c62efcb959c7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,26 @@ out:
164 164
165#ifdef CONFIG_MMU 165#ifdef CONFIG_MMU
166 166
167static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 167void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
168{
169 struct mm_struct *mm = current->mm;
170 long diff = (long)(pages - bprm->vma_pages);
171
172 if (!mm || !diff)
173 return;
174
175 bprm->vma_pages = pages;
176
177#ifdef SPLIT_RSS_COUNTING
178 add_mm_counter(mm, MM_ANONPAGES, diff);
179#else
180 spin_lock(&mm->page_table_lock);
181 add_mm_counter(mm, MM_ANONPAGES, diff);
182 spin_unlock(&mm->page_table_lock);
183#endif
184}
185
186struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
168 int write) 187 int write)
169{ 188{
170 struct page *page; 189 struct page *page;
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
186 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; 205 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
187 struct rlimit *rlim; 206 struct rlimit *rlim;
188 207
208 acct_arg_size(bprm, size / PAGE_SIZE);
209
189 /* 210 /*
190 * We've historically supported up to 32 pages (ARG_MAX) 211 * We've historically supported up to 32 pages (ARG_MAX)
191 * of argument strings even with small stacks 212 * of argument strings even with small stacks
@@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
254 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 275 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
255 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 276 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
256 INIT_LIST_HEAD(&vma->anon_vma_chain); 277 INIT_LIST_HEAD(&vma->anon_vma_chain);
278
279 err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
280 if (err)
281 goto err;
282
257 err = insert_vm_struct(mm, vma); 283 err = insert_vm_struct(mm, vma);
258 if (err) 284 if (err)
259 goto err; 285 goto err;
@@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
276 302
277#else 303#else
278 304
279static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 305void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
306{
307}
308
309struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
280 int write) 310 int write)
281{ 311{
282 struct page *page; 312 struct page *page;
@@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm)
1003 /* 1033 /*
1004 * Release all of the old mmap stuff 1034 * Release all of the old mmap stuff
1005 */ 1035 */
1036 acct_arg_size(bprm, 0);
1006 retval = exec_mmap(bprm->mm); 1037 retval = exec_mmap(bprm->mm);
1007 if (retval) 1038 if (retval)
1008 goto out; 1039 goto out;
@@ -1426,8 +1457,10 @@ int do_execve(const char * filename,
1426 return retval; 1457 return retval;
1427 1458
1428out: 1459out:
1429 if (bprm->mm) 1460 if (bprm->mm) {
1430 mmput (bprm->mm); 1461 acct_arg_size(bprm, 0);
1462 mmput(bprm->mm);
1463 }
1431 1464
1432out_file: 1465out_file:
1433 if (bprm->file) { 1466 if (bprm->file) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b5012..acf8695fa8f0 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/exportfs.h> 31#include <linux/exportfs.h>
33#include <linux/vfs.h> 32#include <linux/vfs.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 47162de0b957..1f253a9a141d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,7 +177,7 @@ struct mpage_da_data {
177 177
178struct ext4_io_page { 178struct ext4_io_page {
179 struct page *p_page; 179 struct page *p_page;
180 int p_count; 180 atomic_t p_count;
181}; 181};
182 182
183#define MAX_IO_PAGES 128 183#define MAX_IO_PAGES 128
@@ -858,6 +858,7 @@ struct ext4_inode_info {
858 spinlock_t i_completed_io_lock; 858 spinlock_t i_completed_io_lock;
859 /* current io_end structure for async DIO write*/ 859 /* current io_end structure for async DIO write*/
860 ext4_io_end_t *cur_aio_dio; 860 ext4_io_end_t *cur_aio_dio;
861 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
861 862
862 /* 863 /*
863 * Transactions that contain inode's metadata needed to complete 864 * Transactions that contain inode's metadata needed to complete
@@ -909,6 +910,7 @@ struct ext4_inode_info {
909#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 910#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
910#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 911#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
911#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 912#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
913#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
912#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 914#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
913#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 915#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
914#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 916#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -2060,6 +2062,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2060/* page-io.c */ 2062/* page-io.c */
2061extern int __init ext4_init_pageio(void); 2063extern int __init ext4_init_pageio(void);
2062extern void ext4_exit_pageio(void); 2064extern void ext4_exit_pageio(void);
2065extern void ext4_ioend_wait(struct inode *);
2063extern void ext4_free_io_end(ext4_io_end_t *io); 2066extern void ext4_free_io_end(ext4_io_end_t *io);
2064extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2067extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2065extern int ext4_end_io_nolock(ext4_io_end_t *io); 2068extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b6a4b41d7e14..ef9d5be0b2a8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 53static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 54 loff_t new_size)
55{ 55{
56 trace_ext4_begin_ordered_truncate(inode, new_size);
56 return jbd2_journal_begin_ordered_truncate( 57 return jbd2_journal_begin_ordered_truncate(
57 EXT4_SB(inode->i_sb)->s_journal, 58 EXT4_SB(inode->i_sb)->s_journal,
58 &EXT4_I(inode)->jinode, 59 &EXT4_I(inode)->jinode,
@@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
178 handle_t *handle; 179 handle_t *handle;
179 int err; 180 int err;
180 181
182 trace_ext4_evict_inode(inode);
181 if (inode->i_nlink) { 183 if (inode->i_nlink) {
182 truncate_inode_pages(&inode->i_data, 0); 184 truncate_inode_pages(&inode->i_data, 0);
183 goto no_delete; 185 goto no_delete;
@@ -2123,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2123 */ 2125 */
2124 if (unlikely(journal_data && PageChecked(page))) 2126 if (unlikely(journal_data && PageChecked(page)))
2125 err = __ext4_journalled_writepage(page, len); 2127 err = __ext4_journalled_writepage(page, len);
2126 else 2128 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2127 err = ext4_bio_write_page(&io_submit, page, 2129 err = ext4_bio_write_page(&io_submit, page,
2128 len, mpd->wbc); 2130 len, mpd->wbc);
2131 else
2132 err = block_write_full_page(page,
2133 noalloc_get_block_write, mpd->wbc);
2129 2134
2130 if (!err) 2135 if (!err)
2131 mpd->pages_written++; 2136 mpd->pages_written++;
@@ -5410,9 +5415,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5410 * will return the blocks that include the delayed allocation 5415 * will return the blocks that include the delayed allocation
5411 * blocks for this file. 5416 * blocks for this file.
5412 */ 5417 */
5413 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5414 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5418 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5415 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5416 5419
5417 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5420 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5418 return 0; 5421 return 0;
@@ -5649,6 +5652,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5649 int err, ret; 5652 int err, ret;
5650 5653
5651 might_sleep(); 5654 might_sleep();
5655 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5652 err = ext4_reserve_inode_write(handle, inode, &iloc); 5656 err = ext4_reserve_inode_write(handle, inode, &iloc);
5653 if (ext4_handle_valid(handle) && 5657 if (ext4_handle_valid(handle) &&
5654 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5658 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1bd..eb3bc2fe647e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct fstrim_range range;
338 int ret = 0;
339
340 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM;
342
343 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range)))
345 return -EFAULT;
346
347 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0)
349 return ret;
350
351 if (copy_to_user((struct fstrim_range *)arg, &range,
352 sizeof(range)))
353 return -EFAULT;
354
355 return 0;
356 }
357
334 default: 358 default:
335 return -ENOTTY; 359 return -ENOTTY;
336 } 360 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724a..5b4d4e3a4d58 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,8 +4640,6 @@ do_more:
4640 * with group lock held. generate_buddy look at 4640 * with group lock held. generate_buddy look at
4641 * them with group lock_held 4641 * them with group lock_held
4642 */ 4642 */
4643 if (test_opt(sb, DISCARD))
4644 ext4_issue_discard(sb, block_group, bit, count);
4645 ext4_lock_group(sb, block_group); 4643 ext4_lock_group(sb, block_group);
4646 mb_clear_bits(bitmap_bh->b_data, bit, count); 4644 mb_clear_bits(bitmap_bh->b_data, bit, count);
4647 mb_free_blocks(inode, &e4b, bit, count); 4645 mb_free_blocks(inode, &e4b, bit, count);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099f..dc40e75cba88 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
872 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
873 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') && 874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '0')) { 875 (name[1] == '.' || name[1] == '\0')) {
876 /* 876 /*
877 * "." or ".." will only be in the first block 877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS 878 * NFS may look up ".."; "." should be handled by the VFS
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..beacce11ac50 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
35int __init ext4_init_pageio(void) 39int __init ext4_init_pageio(void)
36{ 40{
41 int i;
42
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL) 44 if (io_page_cachep == NULL)
39 return -ENOMEM; 45 return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
42 kmem_cache_destroy(io_page_cachep); 48 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM; 49 return -ENOMEM;
44 } 50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
45 53
46 return 0; 54 return 0;
47} 55}
@@ -52,24 +60,37 @@ void ext4_exit_pageio(void)
52 kmem_cache_destroy(io_page_cachep); 60 kmem_cache_destroy(io_page_cachep);
53} 61}
54 62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
69
70static void put_io_page(struct ext4_io_page *io_page)
71{
72 if (atomic_dec_and_test(&io_page->p_count)) {
73 end_page_writeback(io_page->p_page);
74 put_page(io_page->p_page);
75 kmem_cache_free(io_page_cachep, io_page);
76 }
77}
78
55void ext4_free_io_end(ext4_io_end_t *io) 79void ext4_free_io_end(ext4_io_end_t *io)
56{ 80{
57 int i; 81 int i;
82 wait_queue_head_t *wq;
58 83
59 BUG_ON(!io); 84 BUG_ON(!io);
60 if (io->page) 85 if (io->page)
61 put_page(io->page); 86 put_page(io->page);
62 for (i = 0; i < io->num_io_pages; i++) { 87 for (i = 0; i < io->num_io_pages; i++)
63 if (--io->pages[i]->p_count == 0) { 88 put_io_page(io->pages[i]);
64 struct page *page = io->pages[i]->p_page;
65
66 end_page_writeback(page);
67 put_page(page);
68 kmem_cache_free(io_page_cachep, io->pages[i]);
69 }
70 }
71 io->num_io_pages = 0; 89 io->num_io_pages = 0;
72 iput(io->inode); 90 wq = to_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq))
93 wake_up_all(wq);
73 kmem_cache_free(io_end_cachep, io); 94 kmem_cache_free(io_end_cachep, io);
74} 95}
75 96
@@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
142 io = kmem_cache_alloc(io_end_cachep, flags); 163 io = kmem_cache_alloc(io_end_cachep, flags);
143 if (io) { 164 if (io) {
144 memset(io, 0, sizeof(*io)); 165 memset(io, 0, sizeof(*io));
145 io->inode = igrab(inode); 166 atomic_inc(&EXT4_I(inode)->i_ioend_count);
146 BUG_ON(!io->inode); 167 io->inode = inode;
147 INIT_WORK(&io->work, ext4_end_io_work); 168 INIT_WORK(&io->work, ext4_end_io_work);
148 INIT_LIST_HEAD(&io->list); 169 INIT_LIST_HEAD(&io->list);
149 } 170 }
@@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error)
171 struct workqueue_struct *wq; 192 struct workqueue_struct *wq;
172 struct inode *inode; 193 struct inode *inode;
173 unsigned long flags; 194 unsigned long flags;
174 ext4_fsblk_t err_block;
175 int i; 195 int i;
176 196
177 BUG_ON(!io_end); 197 BUG_ON(!io_end);
178 inode = io_end->inode;
179 bio->bi_private = NULL; 198 bio->bi_private = NULL;
180 bio->bi_end_io = NULL; 199 bio->bi_end_io = NULL;
181 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 200 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
182 error = 0; 201 error = 0;
183 err_block = bio->bi_sector >> (inode->i_blkbits - 9);
184 bio_put(bio); 202 bio_put(bio);
185 203
186 if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
187 pr_err("sb umounted, discard end_io request for inode %lu\n",
188 io_end->inode->i_ino);
189 ext4_free_io_end(io_end);
190 return;
191 }
192
193 if (error) {
194 io_end->flag |= EXT4_IO_END_ERROR;
195 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
196 "(offset %llu size %ld starting block %llu)",
197 inode->i_ino,
198 (unsigned long long) io_end->offset,
199 (long) io_end->size,
200 (unsigned long long) err_block);
201 }
202
203 for (i = 0; i < io_end->num_io_pages; i++) { 204 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 205 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 206 struct buffer_head *bh, *head;
@@ -236,14 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error)
236 } while (bh != head); 237 } while (bh != head);
237 } 238 }
238 239
239 if (--io_end->pages[i]->p_count == 0) {
240 struct page *page = io_end->pages[i]->p_page;
241
242 end_page_writeback(page);
243 put_page(page);
244 kmem_cache_free(io_page_cachep, io_end->pages[i]);
245 }
246
247 /* 240 /*
248 * If this is a partial write which happened to make 241 * If this is a partial write which happened to make
249 * all buffers uptodate then we can optimize away a 242 * all buffers uptodate then we can optimize away a
@@ -253,9 +246,22 @@ static void ext4_end_bio(struct bio *bio, int error)
253 */ 246 */
254 if (!partial_write) 247 if (!partial_write)
255 SetPageUptodate(page); 248 SetPageUptodate(page);
256 }
257 249
250 put_io_page(io_end->pages[i]);
251 }
258 io_end->num_io_pages = 0; 252 io_end->num_io_pages = 0;
253 inode = io_end->inode;
254
255 if (error) {
256 io_end->flag |= EXT4_IO_END_ERROR;
257 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
258 "(offset %llu size %ld starting block %llu)",
259 inode->i_ino,
260 (unsigned long long) io_end->offset,
261 (long) io_end->size,
262 (unsigned long long)
263 bio->bi_sector >> (inode->i_blkbits - 9));
264 }
259 265
260 /* Add the io_end to per-inode completed io list*/ 266 /* Add the io_end to per-inode completed io list*/
261 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 267 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io,
305 bio->bi_private = io->io_end = io_end; 311 bio->bi_private = io->io_end = io_end;
306 bio->bi_end_io = ext4_end_bio; 312 bio->bi_end_io = ext4_end_bio;
307 313
308 io_end->inode = inode;
309 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 314 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
310 315
311 io->io_bio = bio; 316 io->io_bio = bio;
@@ -360,7 +365,7 @@ submit_and_retry:
360 if ((io_end->num_io_pages == 0) || 365 if ((io_end->num_io_pages == 0) ||
361 (io_end->pages[io_end->num_io_pages-1] != io_page)) { 366 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
362 io_end->pages[io_end->num_io_pages++] = io_page; 367 io_end->pages[io_end->num_io_pages++] = io_page;
363 io_page->p_count++; 368 atomic_inc(&io_page->p_count);
364 } 369 }
365 return 0; 370 return 0;
366} 371}
@@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
389 return -ENOMEM; 394 return -ENOMEM;
390 } 395 }
391 io_page->p_page = page; 396 io_page->p_page = page;
392 io_page->p_count = 0; 397 atomic_set(&io_page->p_count, 1);
393 get_page(page); 398 get_page(page);
394 399
395 for (bh = head = page_buffers(page), block_start = 0; 400 for (bh = head = page_buffers(page), block_start = 0;
@@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 426 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 427 * wedging later on.
423 */ 428 */
424 if (io_page->p_count == 0) { 429 put_io_page(io_page);
425 put_page(page);
426 end_page_writeback(page);
427 kmem_cache_free(io_page_cachep, io_page);
428 }
429 return ret; 430 return ret;
430} 431}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af6..fb15c9c0be74 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
828 ei->cur_aio_dio = NULL; 828 ei->cur_aio_dio = NULL;
829 ei->i_sync_tid = 0; 829 ei->i_sync_tid = 0;
830 ei->i_datasync_tid = 0; 830 ei->i_datasync_tid = 0;
831 atomic_set(&ei->i_ioend_count, 0);
831 832
832 return &ei->vfs_inode; 833 return &ei->vfs_inode;
833} 834}
834 835
/*
 * ext4 ->drop_inode hook: takes the decision from generic_drop_inode(),
 * hands the inode and that decision to trace_ext4_drop_inode(), and
 * returns the decision unchanged.
 */
836static int ext4_drop_inode(struct inode *inode)
837{
838 int drop = generic_drop_inode(inode); /* the decision itself is not altered here */
839
840 trace_ext4_drop_inode(inode, drop); /* tracepoint sees the inode and the result */
841 return drop;
842}
843
835static void ext4_destroy_inode(struct inode *inode) 844static void ext4_destroy_inode(struct inode *inode)
836{ 845{
846 ext4_ioend_wait(inode);
837 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 847 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
838 ext4_msg(inode->i_sb, KERN_ERR, 848 ext4_msg(inode->i_sb, KERN_ERR,
839 "Inode %lu (%p): orphan list check failed!", 849 "Inode %lu (%p): orphan list check failed!",
@@ -1016,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1016 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1026 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1017 seq_puts(seq, ",nodelalloc"); 1027 seq_puts(seq, ",nodelalloc");
1018 1028
1029 if (test_opt(sb, MBLK_IO_SUBMIT))
1030 seq_puts(seq, ",mblk_io_submit");
1019 if (sbi->s_stripe) 1031 if (sbi->s_stripe)
1020 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1032 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1021 /* 1033 /*
@@ -1173,6 +1185,7 @@ static const struct super_operations ext4_sops = {
1173 .destroy_inode = ext4_destroy_inode, 1185 .destroy_inode = ext4_destroy_inode,
1174 .write_inode = ext4_write_inode, 1186 .write_inode = ext4_write_inode,
1175 .dirty_inode = ext4_dirty_inode, 1187 .dirty_inode = ext4_dirty_inode,
1188 .drop_inode = ext4_drop_inode,
1176 .evict_inode = ext4_evict_inode, 1189 .evict_inode = ext4_evict_inode,
1177 .put_super = ext4_put_super, 1190 .put_super = ext4_put_super,
1178 .sync_fs = ext4_sync_fs, 1191 .sync_fs = ext4_sync_fs,
@@ -1186,7 +1199,6 @@ static const struct super_operations ext4_sops = {
1186 .quota_write = ext4_quota_write, 1199 .quota_write = ext4_quota_write,
1187#endif 1200#endif
1188 .bdev_try_to_free_page = bdev_try_to_free_page, 1201 .bdev_try_to_free_page = bdev_try_to_free_page,
1189 .trim_fs = ext4_trim_fs
1190}; 1202};
1191 1203
1192static const struct super_operations ext4_nojournal_sops = { 1204static const struct super_operations ext4_nojournal_sops = {
@@ -1194,6 +1206,7 @@ static const struct super_operations ext4_nojournal_sops = {
1194 .destroy_inode = ext4_destroy_inode, 1206 .destroy_inode = ext4_destroy_inode,
1195 .write_inode = ext4_write_inode, 1207 .write_inode = ext4_write_inode,
1196 .dirty_inode = ext4_dirty_inode, 1208 .dirty_inode = ext4_dirty_inode,
1209 .drop_inode = ext4_drop_inode,
1197 .evict_inode = ext4_evict_inode, 1210 .evict_inode = ext4_evict_inode,
1198 .write_super = ext4_write_super, 1211 .write_super = ext4_write_super,
1199 .put_super = ext4_put_super, 1212 .put_super = ext4_put_super,
@@ -1228,8 +1241,8 @@ enum {
1228 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1241 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1229 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1242 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1230 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1243 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1231 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1244 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1232 Opt_block_validity, Opt_noblock_validity, 1245 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1233 Opt_inode_readahead_blks, Opt_journal_ioprio, 1246 Opt_inode_readahead_blks, Opt_journal_ioprio,
1234 Opt_dioread_nolock, Opt_dioread_lock, 1247 Opt_dioread_nolock, Opt_dioread_lock,
1235 Opt_discard, Opt_nodiscard, 1248 Opt_discard, Opt_nodiscard,
@@ -1293,6 +1306,8 @@ static const match_table_t tokens = {
1293 {Opt_resize, "resize"}, 1306 {Opt_resize, "resize"},
1294 {Opt_delalloc, "delalloc"}, 1307 {Opt_delalloc, "delalloc"},
1295 {Opt_nodelalloc, "nodelalloc"}, 1308 {Opt_nodelalloc, "nodelalloc"},
1309 {Opt_mblk_io_submit, "mblk_io_submit"},
1310 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1296 {Opt_block_validity, "block_validity"}, 1311 {Opt_block_validity, "block_validity"},
1297 {Opt_noblock_validity, "noblock_validity"}, 1312 {Opt_noblock_validity, "noblock_validity"},
1298 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1313 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1714,6 +1729,12 @@ set_qf_format:
1714 case Opt_nodelalloc: 1729 case Opt_nodelalloc:
1715 clear_opt(sbi->s_mount_opt, DELALLOC); 1730 clear_opt(sbi->s_mount_opt, DELALLOC);
1716 break; 1731 break;
1732 case Opt_mblk_io_submit:
1733 set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
1734 break;
1735 case Opt_nomblk_io_submit:
1736 clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
1737 break;
1717 case Opt_stripe: 1738 case Opt_stripe:
1718 if (match_int(&args[0], &option)) 1739 if (match_int(&args[0], &option))
1719 return 0; 1740 return 0;
@@ -2699,7 +2720,6 @@ static int ext4_lazyinit_thread(void *arg)
2699 struct ext4_li_request *elr; 2720 struct ext4_li_request *elr;
2700 unsigned long next_wakeup; 2721 unsigned long next_wakeup;
2701 DEFINE_WAIT(wait); 2722 DEFINE_WAIT(wait);
2702 int ret;
2703 2723
2704 BUG_ON(NULL == eli); 2724 BUG_ON(NULL == eli);
2705 2725
@@ -2723,13 +2743,12 @@ cont_thread:
2723 elr = list_entry(pos, struct ext4_li_request, 2743 elr = list_entry(pos, struct ext4_li_request,
2724 lr_request); 2744 lr_request);
2725 2745
2726 if (time_after_eq(jiffies, elr->lr_next_sched)) 2746 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2727 ret = ext4_run_li_request(elr); 2747 if (ext4_run_li_request(elr) != 0) {
2728 2748 /* error, remove the lazy_init job */
2729 if (ret) { 2749 ext4_remove_li_request(elr);
2730 ret = 0; 2750 continue;
2731 ext4_remove_li_request(elr); 2751 }
2732 continue;
2733 } 2752 }
2734 2753
2735 if (time_before(elr->lr_next_sched, next_wakeup)) 2754 if (time_before(elr->lr_next_sched, next_wakeup))
@@ -2740,7 +2759,8 @@ cont_thread:
2740 if (freezing(current)) 2759 if (freezing(current))
2741 refrigerator(); 2760 refrigerator();
2742 2761
2743 if (time_after_eq(jiffies, next_wakeup)) { 2762 if ((time_after_eq(jiffies, next_wakeup)) ||
2763 (MAX_JIFFY_OFFSET == next_wakeup)) {
2744 cond_resched(); 2764 cond_resched();
2745 continue; 2765 continue;
2746 } 2766 }
@@ -2788,9 +2808,6 @@ static void ext4_clear_request_list(void)
2788 struct ext4_li_request *elr; 2808 struct ext4_li_request *elr;
2789 2809
2790 mutex_lock(&ext4_li_info->li_list_mtx); 2810 mutex_lock(&ext4_li_info->li_list_mtx);
2791 if (list_empty(&ext4_li_info->li_request_list))
2792 return;
2793
2794 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { 2811 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2795 elr = list_entry(pos, struct ext4_li_request, 2812 elr = list_entry(pos, struct ext4_li_request,
2796 lr_request); 2813 lr_request);
@@ -3257,13 +3274,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3257 * Test whether we have more sectors than will fit in sector_t, 3274 * Test whether we have more sectors than will fit in sector_t,
3258 * and whether the max offset is addressable by the page cache. 3275 * and whether the max offset is addressable by the page cache.
3259 */ 3276 */
3260 ret = generic_check_addressable(sb->s_blocksize_bits, 3277 err = generic_check_addressable(sb->s_blocksize_bits,
3261 ext4_blocks_count(es)); 3278 ext4_blocks_count(es));
3262 if (ret) { 3279 if (err) {
3263 ext4_msg(sb, KERN_ERR, "filesystem" 3280 ext4_msg(sb, KERN_ERR, "filesystem"
3264 " too large to mount safely on this system"); 3281 " too large to mount safely on this system");
3265 if (sizeof(sector_t) < 8) 3282 if (sizeof(sector_t) < 8)
3266 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3283 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3284 ret = err;
3267 goto failed_mount; 3285 goto failed_mount;
3268 } 3286 }
3269 3287
@@ -3348,6 +3366,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3348 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3366 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3349 spin_lock_init(&sbi->s_next_gen_lock); 3367 spin_lock_init(&sbi->s_next_gen_lock);
3350 3368
3369 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3370 ext4_count_free_blocks(sb));
3371 if (!err) {
3372 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3373 ext4_count_free_inodes(sb));
3374 }
3375 if (!err) {
3376 err = percpu_counter_init(&sbi->s_dirs_counter,
3377 ext4_count_dirs(sb));
3378 }
3379 if (!err) {
3380 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3381 }
3382 if (err) {
3383 ext4_msg(sb, KERN_ERR, "insufficient memory");
3384 goto failed_mount3;
3385 }
3386
3351 sbi->s_stripe = ext4_get_stripe_size(sbi); 3387 sbi->s_stripe = ext4_get_stripe_size(sbi);
3352 sbi->s_max_writeback_mb_bump = 128; 3388 sbi->s_max_writeback_mb_bump = 128;
3353 3389
@@ -3446,22 +3482,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 } 3482 }
3447 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3483 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3448 3484
3449no_journal: 3485 /*
3450 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3486 * The journal may have updated the bg summary counts, so we
3451 ext4_count_free_blocks(sb)); 3487 * need to update the global counters.
3452 if (!err) 3488 */
3453 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3489 percpu_counter_set(&sbi->s_freeblocks_counter,
3454 ext4_count_free_inodes(sb)); 3490 ext4_count_free_blocks(sb));
3455 if (!err) 3491 percpu_counter_set(&sbi->s_freeinodes_counter,
3456 err = percpu_counter_init(&sbi->s_dirs_counter, 3492 ext4_count_free_inodes(sb));
3457 ext4_count_dirs(sb)); 3493 percpu_counter_set(&sbi->s_dirs_counter,
3458 if (!err) 3494 ext4_count_dirs(sb));
3459 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3495 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3460 if (err) {
3461 ext4_msg(sb, KERN_ERR, "insufficient memory");
3462 goto failed_mount_wq;
3463 }
3464 3496
3497no_journal:
3465 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3498 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
3466 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3499 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3467 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3500 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3611,10 +3644,6 @@ failed_mount_wq:
3611 jbd2_journal_destroy(sbi->s_journal); 3644 jbd2_journal_destroy(sbi->s_journal);
3612 sbi->s_journal = NULL; 3645 sbi->s_journal = NULL;
3613 } 3646 }
3614 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3615 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3616 percpu_counter_destroy(&sbi->s_dirs_counter);
3617 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3618failed_mount3: 3647failed_mount3:
3619 if (sbi->s_flex_groups) { 3648 if (sbi->s_flex_groups) {
3620 if (is_vmalloc_addr(sbi->s_flex_groups)) 3649 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3622,6 +3651,10 @@ failed_mount3:
3622 else 3651 else
3623 kfree(sbi->s_flex_groups); 3652 kfree(sbi->s_flex_groups);
3624 } 3653 }
3654 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3655 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3656 percpu_counter_destroy(&sbi->s_dirs_counter);
3657 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3625failed_mount2: 3658failed_mount2:
3626 for (i = 0; i < db_count; i++) 3659 for (i = 0; i < db_count; i++)
3627 brelse(sbi->s_group_desc[i]); 3660 brelse(sbi->s_group_desc[i]);
@@ -3949,13 +3982,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3949 else 3982 else
3950 es->s_kbytes_written = 3983 es->s_kbytes_written =
3951 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 3984 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3952 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) 3985 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3953 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3986 &EXT4_SB(sb)->s_freeblocks_counter));
3954 &EXT4_SB(sb)->s_freeblocks_counter)); 3987 es->s_free_inodes_count =
3955 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) 3988 cpu_to_le32(percpu_counter_sum_positive(
3956 es->s_free_inodes_count = 3989 &EXT4_SB(sb)->s_freeinodes_counter));
3957 cpu_to_le32(percpu_counter_sum_positive(
3958 &EXT4_SB(sb)->s_freeinodes_counter));
3959 sb->s_dirt = 0; 3990 sb->s_dirt = 0;
3960 BUFFER_TRACE(sbh, "marking dirty"); 3991 BUFFER_TRACE(sbh, "marking dirty");
3961 mark_buffer_dirty(sbh); 3992 mark_buffer_dirty(sbh);
@@ -4556,12 +4587,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4556 4587
4557static int ext4_quota_off(struct super_block *sb, int type) 4588static int ext4_quota_off(struct super_block *sb, int type)
4558{ 4589{
4559 /* Force all delayed allocation blocks to be allocated */ 4590 /* Force all delayed allocation blocks to be allocated.
4560 if (test_opt(sb, DELALLOC)) { 4591 * Caller already holds s_umount sem */
4561 down_read(&sb->s_umount); 4592 if (test_opt(sb, DELALLOC))
4562 sync_filesystem(sb); 4593 sync_filesystem(sb);
4563 up_read(&sb->s_umount);
4564 }
4565 4594
4566 return dquot_quota_off(sb, type); 4595 return dquot_quota_off(sb, type);
4567} 4596}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123f..8b984a2cebbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h>
16 17
17static const struct file_operations fuse_direct_io_file_operations; 18static const struct file_operations fuse_direct_io_file_operations;
18 19
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
134void fuse_finish_open(struct inode *inode, struct file *file) 135void fuse_finish_open(struct inode *inode, struct file *file)
135{ 136{
136 struct fuse_file *ff = file->private_data; 137 struct fuse_file *ff = file->private_data;
138 struct fuse_conn *fc = get_fuse_conn(inode);
137 139
138 if (ff->open_flags & FOPEN_DIRECT_IO) 140 if (ff->open_flags & FOPEN_DIRECT_IO)
139 file->f_op = &fuse_direct_io_file_operations; 141 file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
141 invalidate_inode_pages2(inode->i_mapping); 143 invalidate_inode_pages2(inode->i_mapping);
142 if (ff->open_flags & FOPEN_NONSEEKABLE) 144 if (ff->open_flags & FOPEN_NONSEEKABLE)
143 nonseekable_open(inode, file); 145 nonseekable_open(inode, file);
146 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
147 struct fuse_inode *fi = get_fuse_inode(inode);
148
149 spin_lock(&fc->lock);
150 fi->attr_version = ++fc->attr_version;
151 i_size_write(inode, 0);
152 spin_unlock(&fc->lock);
153 fuse_invalidate_attr(inode);
154 }
144} 155}
145 156
146int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 157int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -1618,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1618} 1629}
1619 1630
1620/* 1631/*
1632 * CUSE servers compiled on 32bit broke on 64bit kernels because the
1633 * ABI was defined to be 'struct iovec' which is different on 32bit
1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply.
 *
 * @dst:         array receiving @count native struct iovec entries
 * @src:         reply buffer, read either as struct compat_iovec entries
 *               (CONFIG_COMPAT only) or as struct iovec entries
 * @transferred: size of the reply in bytes; compared against
 *               @count * sizeof(entry) to pick the layout
 * @count:       number of iovec entries expected in the reply
 * @is_compat:   true when a compat-sized reply is acceptable for this
 *               request
 *
 * Returns 0 on success, -EINVAL when the reply has the compat size but
 * @is_compat is false, and -EIO when the size matches neither layout.
1636 */
1637static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count,
1639 bool is_compat)
1640{
1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) {
1643 struct compat_iovec *ciov = src;
1644 unsigned i;
1645
1646 /*
1647 * With this interface a 32bit server cannot support
1648 * non-compat (i.e. ones coming from 64bit apps) ioctl
1649 * requests
1650 */
1651 if (!is_compat)
1652 return -EINVAL;
1653
1654 for (i = 0; i < count; i++) { /* widen each compat entry into a native iovec */
1655 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1656 dst[i].iov_len = ciov[i].iov_len;
1657 }
1658 return 0;
1659 }
1660#endif
1661
1662 if (count * sizeof(struct iovec) != transferred)
1663 return -EIO;
1664
1665 memcpy(dst, src, transferred); /* size matches the native layout: copy verbatim */
1666 return 0;
1667}
1668
1669 /* Make sure iov_length() won't overflow */
/*
 * Walks all @count entries of @iov against a byte budget that starts at
 * FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT and shrinks by each iov_len.
 * Returns 0 when every length fits in what is left of the budget, or
 * -ENOMEM as soon as one entry exceeds it.
 */
1670static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1671{
1672 size_t n;
1673 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1674
1675 for (n = 0; n < count; n++, iov++) { /* advance iov so every entry is checked, not just the first */
1676 if (iov->iov_len > (size_t) max)
1677 return -ENOMEM;
1678 max -= iov->iov_len;
1679 }
1680 return 0;
1681}
1682
1683/*
1621 * For ioctls, there is no generic way to determine how much memory 1684 * For ioctls, there is no generic way to determine how much memory
1622 * needs to be read and/or written. Furthermore, ioctls are allowed 1685 * needs to be read and/or written. Furthermore, ioctls are allowed
1623 * to dereference the passed pointer, so the parameter requires deep 1686 * to dereference the passed pointer, so the parameter requires deep
@@ -1798,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1861 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1799 goto out; 1862 goto out;
1800 1863
1801 err = -EIO;
1802 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1803 goto out;
1804
1805 /* okay, copy in iovs and retry */
1806 vaddr = kmap_atomic(pages[0], KM_USER0); 1864 vaddr = kmap_atomic(pages[0], KM_USER0);
1807 memcpy(page_address(iov_page), vaddr, transferred); 1865 err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
1866 transferred, in_iovs + out_iovs,
1867 (flags & FUSE_IOCTL_COMPAT) != 0);
1808 kunmap_atomic(vaddr, KM_USER0); 1868 kunmap_atomic(vaddr, KM_USER0);
1869 if (err)
1870 goto out;
1809 1871
1810 in_iov = page_address(iov_page); 1872 in_iov = page_address(iov_page);
1811 out_iov = in_iov + in_iovs; 1873 out_iov = in_iov + in_iovs;
1812 1874
1875 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1876 if (err)
1877 goto out;
1878
1879 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1880 if (err)
1881 goto out;
1882
1813 goto retry; 1883 goto retry;
1814 } 1884 }
1815 1885
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d34..5ab3839dfcb9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
138 struct gfs2_inum_host *inum) 138 struct gfs2_inum_host *inum)
139{ 139{
140 struct gfs2_sbd *sdp = sb->s_fs_info; 140 struct gfs2_sbd *sdp = sb->s_fs_info;
141 struct gfs2_holder i_gh;
142 struct inode *inode; 141 struct inode *inode;
143 struct dentry *dentry; 142 struct dentry *dentry;
144 int error;
145 143
146 inode = gfs2_ilookup(sb, inum->no_addr); 144 inode = gfs2_ilookup(sb, inum->no_addr);
147 if (inode) { 145 if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
152 goto out_inode; 150 goto out_inode;
153 } 151 }
154 152
155 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 153 inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
156 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 154 GFS2_BLKST_DINODE);
157 if (error) 155 if (IS_ERR(inode))
158 return ERR_PTR(error); 156 return ERR_CAST(inode);
159
160 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
161 if (error)
162 goto fail;
163
164 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
165 if (IS_ERR(inode)) {
166 error = PTR_ERR(inode);
167 goto fail;
168 }
169
170 error = gfs2_inode_refresh(GFS2_I(inode));
171 if (error) {
172 iput(inode);
173 goto fail;
174 }
175
176 /* Pick up the works we bypass in gfs2_inode_lookup */
177 if (inode->i_state & I_NEW)
178 gfs2_set_iop(inode);
179
180 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
181 iput(inode);
182 goto fail;
183 }
184
185 error = -EIO;
186 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
187 iput(inode);
188 goto fail;
189 }
190
191 gfs2_glock_dq_uninit(&i_gh);
192 157
193out_inode: 158out_inode:
194 dentry = d_obtain_alias(inode); 159 dentry = d_obtain_alias(inode);
195 if (!IS_ERR(dentry)) 160 if (!IS_ERR(dentry))
196 dentry->d_op = &gfs2_dops; 161 dentry->d_op = &gfs2_dops;
197 return dentry; 162 return dentry;
198fail:
199 gfs2_glock_dq_uninit(&i_gh);
200 return ERR_PTR(error);
201} 163}
202 164
203static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 165static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f099..f92c17704169 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work)
686{ 686{
687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); 687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
688 struct gfs2_sbd *sdp = gl->gl_sbd; 688 struct gfs2_sbd *sdp = gl->gl_sbd;
689 struct gfs2_inode *ip = NULL; 689 struct gfs2_inode *ip;
690 struct inode *inode; 690 struct inode *inode;
691 u64 no_addr = 0; 691 u64 no_addr = gl->gl_name.ln_number;
692
693 ip = gl->gl_object;
694 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
692 695
693 spin_lock(&gl->gl_spin);
694 ip = (struct gfs2_inode *)gl->gl_object;
695 if (ip) 696 if (ip)
696 no_addr = ip->i_no_addr;
697 spin_unlock(&gl->gl_spin);
698 if (ip) {
699 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
700 if (inode) { 698 else
701 d_prune_aliases(inode); 699 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
702 iput(inode); 700 if (inode && !IS_ERR(inode)) {
703 } 701 d_prune_aliases(inode);
702 iput(inode);
704 } 703 }
705 gfs2_glock_put(gl); 704 gfs2_glock_put(gl);
706} 705}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8cf..e1213f7f9217 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74} 74}
75 75
76struct gfs2_skip_data {
77 u64 no_addr;
78 int skipped;
79};
80
81static int iget_skip_test(struct inode *inode, void *opaque)
82{
83 struct gfs2_inode *ip = GFS2_I(inode);
84 struct gfs2_skip_data *data = opaque;
85
86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1;
89 return 0;
90 }
91 return 1;
92 }
93 return 0;
94}
95
96static int iget_skip_set(struct inode *inode, void *opaque)
97{
98 struct gfs2_inode *ip = GFS2_I(inode);
99 struct gfs2_skip_data *data = opaque;
100
101 if (data->skipped)
102 return 1;
103 inode->i_ino = (unsigned long)(data->no_addr);
104 ip->i_no_addr = data->no_addr;
105 return 0;
106}
107
108static struct inode *gfs2_iget_skip(struct super_block *sb,
109 u64 no_addr)
110{
111 struct gfs2_skip_data data;
112 unsigned long hash = (unsigned long)no_addr;
113
114 data.no_addr = no_addr;
115 data.skipped = 0;
116 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117}
118
119/** 76/**
120 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * GFS2 lookup code fills in vfs inode contents based on info obtained
121 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
243 return ERR_PTR(error); 200 return ERR_PTR(error);
244} 201}
245 202
246/** 203struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation 204 u64 *no_formal_ino, unsigned int blktype)
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
258{ 205{
259 struct gfs2_sbd *sdp; 206 struct super_block *sb = sdp->sd_vfs;
260 struct gfs2_inode *ip; 207 struct gfs2_holder i_gh;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode; 208 struct inode *inode;
209 int error;
265 210
266 inode = gfs2_iget_skip(sb, no_addr); 211 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
267 212 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
268 if (!inode) 213 if (error)
269 return; 214 return ERR_PTR(error);
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
280 215
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 216 error = gfs2_check_blk_type(sdp, no_addr, blktype);
282 if (unlikely(error)) 217 if (error)
283 goto fail; 218 goto fail;
284 ip->i_gl->gl_object = ip;
285 219
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); 220 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
287 if (unlikely(error)) 221 if (IS_ERR(inode))
288 goto fail_put; 222 goto fail;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295 223
296 ip->i_iopen_gh.gh_gl->gl_object = ip; 224 error = gfs2_inode_refresh(GFS2_I(inode));
297 gfs2_glock_put(io_gl); 225 if (error)
298 io_gl = NULL; 226 goto fail_iput;
299 227
300 inode->i_mode = DT2IF(DT_UNKNOWN); 228 /* Pick up the works we bypass in gfs2_inode_lookup */
229 if (inode->i_state & I_NEW)
230 gfs2_set_iop(inode);
301 231
302 /* 232 /* Two extra checks for NFS only */
303 * We must read the inode in order to work out its type in 233 if (no_formal_ino) {
304 * this case. Note that this doesn't happen often as we normally 234 error = -ESTALE;
305 * know the type beforehand. This code path only occurs during 235 if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
306 * unlinked inode recovery (where it is safe to do this glock, 236 goto fail_iput;
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
313 237
314 /* Inode is now uptodate */ 238 error = -EIO;
315 gfs2_glock_dq_uninit(&gh); 239 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
316 gfs2_set_iop(inode); 240 goto fail_iput;
317 241
318 /* The iput will cause it to be deleted. */ 242 error = 0;
319 iput(inode); 243 }
320 return;
321 244
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
327fail_put:
328 ip->i_gl->gl_object = NULL;
329 gfs2_glock_put(ip->i_gl);
330fail: 245fail:
331 iget_failed(inode); 246 gfs2_glock_dq_uninit(&i_gh);
332 return; 247 return error ? ERR_PTR(error) : inode;
248fail_iput:
249 iput(inode);
250 goto fail;
333} 251}
334 252
335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 253static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc6..d8499fadcc53 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
99extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 101 u64 no_addr, u64 no_formal_ino);
102extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); 102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
103 u64 *no_formal_ino,
104 unsigned int blktype);
103extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 105extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
104 106
105extern int gfs2_inode_refresh(struct gfs2_inode *ip); 107extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58a9b9998b42..f606baf9ba72 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
631 struct fs_disk_quota *fdq) 631 struct fs_disk_quota *fdq)
632{ 632{
633 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
634 struct gfs2_sbd *sdp = GFS2_SB(inode);
634 struct address_space *mapping = inode->i_mapping; 635 struct address_space *mapping = inode->i_mapping;
635 unsigned long index = loc >> PAGE_CACHE_SHIFT; 636 unsigned long index = loc >> PAGE_CACHE_SHIFT;
636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 637 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,11 +659,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
658 qd->qd_qb.qb_value = qp->qu_value; 659 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) { 660 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 661 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); 662 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
662 qd->qd_qb.qb_warn = qp->qu_warn; 663 qd->qd_qb.qb_warn = qp->qu_warn;
663 } 664 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) { 665 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
666 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
667 } 668 }
668 } 669 }
@@ -1497,9 +1498,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1497 fdq->d_version = FS_DQUOT_VERSION; 1498 fdq->d_version = FS_DQUOT_VERSION;
1498 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1499 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1499 fdq->d_id = id; 1500 fdq->d_id = id;
1500 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1501 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1501 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1502 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1502 fdq->d_bcount = be64_to_cpu(qlvb->qb_value); 1503 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1503 1504
1504 gfs2_glock_dq_uninit(&q_gh); 1505 gfs2_glock_dq_uninit(&q_gh);
1505out: 1506out:
@@ -1566,10 +1567,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1566 1567
1567 /* If nothing has changed, this is a no-op */ 1568 /* If nothing has changed, this is a no-op */
1568 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1569 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1569 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) 1570 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1570 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1571 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1571 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1572 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1572 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) 1573 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1573 fdq->d_fieldmask ^= FS_DQ_BHARD; 1574 fdq->d_fieldmask ^= FS_DQ_BHARD;
1574 if (fdq->d_fieldmask == 0) 1575 if (fdq->d_fieldmask == 0)
1575 goto out_i; 1576 goto out_i;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c1..33c8407b876f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
963 * The inode, if one has been found, in inode. 963 * The inode, if one has been found, in inode.
964 */ 964 */
965 965
966static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 966static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
967 u64 skip)
968{ 967{
969 u32 goal = 0, block; 968 u32 goal = 0, block;
970 u64 no_addr; 969 u64 no_addr;
971 struct gfs2_sbd *sdp = rgd->rd_sbd; 970 struct gfs2_sbd *sdp = rgd->rd_sbd;
972 unsigned int n; 971 unsigned int n;
972 struct gfs2_glock *gl;
973 struct gfs2_inode *ip;
974 int error;
975 int found = 0;
973 976
974 for(;;) { 977 while (goal < rgd->rd_data) {
975 if (goal >= rgd->rd_data)
976 break;
977 down_write(&sdp->sd_log_flush_lock); 978 down_write(&sdp->sd_log_flush_lock);
978 n = 1; 979 n = 1;
979 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 980 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
990 if (no_addr == skip) 991 if (no_addr == skip)
991 continue; 992 continue;
992 *last_unlinked = no_addr; 993 *last_unlinked = no_addr;
993 return no_addr; 994
995 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
996 if (error)
997 continue;
998
999 /* If the inode is already in cache, we can ignore it here
1000 * because the existing inode disposal code will deal with
1001 * it when all refs have gone away. Accessing gl_object like
1002 * this is not safe in general. Here it is ok because we do
1003 * not dereference the pointer, and we only need an approx
1004 * answer to whether it is NULL or not.
1005 */
1006 ip = gl->gl_object;
1007
1008 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1009 gfs2_glock_put(gl);
1010 else
1011 found++;
1012
1013 /* Limit reclaim to sensible number of tasks */
1014 if (found > 2*NR_CPUS)
1015 return;
994 } 1016 }
995 1017
996 rgd->rd_flags &= ~GFS2_RDF_CHECK; 1018 rgd->rd_flags &= ~GFS2_RDF_CHECK;
997 return 0; 1019 return;
998} 1020}
999 1021
1000/** 1022/**
@@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1075 * Try to acquire rgrp in way which avoids contending with others. 1097 * Try to acquire rgrp in way which avoids contending with others.
1076 * 1098 *
1077 * Returns: errno 1099 * Returns: errno
1078 * unlinked: the block address of an unlinked block to be reclaimed
1079 */ 1100 */
1080 1101
1081static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, 1102static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1082 u64 *last_unlinked)
1083{ 1103{
1084 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1104 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1085 struct gfs2_rgrpd *rgd, *begin = NULL; 1105 struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1089 int loops = 0; 1109 int loops = 0;
1090 int error, rg_locked; 1110 int error, rg_locked;
1091 1111
1092 *unlinked = 0;
1093 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1112 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1094 1113
1095 while (rgd) { 1114 while (rgd) {
@@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1106 case 0: 1125 case 0:
1107 if (try_rgrp_fit(rgd, al)) 1126 if (try_rgrp_fit(rgd, al))
1108 goto out; 1127 goto out;
1109 /* If the rg came in already locked, there's no 1128 if (rgd->rd_flags & GFS2_RDF_CHECK)
1110 way we can recover from a failed try_rgrp_unlink 1129 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1111 because that would require an iput which can only
1112 happen after the rgrp is unlocked. */
1113 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1114 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1115 ip->i_no_addr);
1116 if (!rg_locked) 1130 if (!rg_locked)
1117 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1131 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1118 if (*unlinked)
1119 return -EAGAIN;
1120 /* fall through */ 1132 /* fall through */
1121 case GLR_TRYFAILED: 1133 case GLR_TRYFAILED:
1122 rgd = recent_rgrp_next(rgd); 1134 rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1145 case 0: 1157 case 0:
1146 if (try_rgrp_fit(rgd, al)) 1158 if (try_rgrp_fit(rgd, al))
1147 goto out; 1159 goto out;
1148 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) 1160 if (rgd->rd_flags & GFS2_RDF_CHECK)
1149 *unlinked = try_rgrp_unlink(rgd, last_unlinked, 1161 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1150 ip->i_no_addr);
1151 if (!rg_locked) 1162 if (!rg_locked)
1152 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1163 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1153 if (*unlinked)
1154 return -EAGAIN;
1155 break; 1164 break;
1156 1165
1157 case GLR_TRYFAILED: 1166 case GLR_TRYFAILED:
@@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1204 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1213 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1205 struct gfs2_alloc *al = ip->i_alloc; 1214 struct gfs2_alloc *al = ip->i_alloc;
1206 int error = 0; 1215 int error = 0;
1207 u64 last_unlinked = NO_BLOCK, unlinked; 1216 u64 last_unlinked = NO_BLOCK;
1217 int tries = 0;
1208 1218
1209 if (gfs2_assert_warn(sdp, al->al_requested)) 1219 if (gfs2_assert_warn(sdp, al->al_requested))
1210 return -EINVAL; 1220 return -EINVAL;
1211 1221
1212try_again:
1213 if (hold_rindex) { 1222 if (hold_rindex) {
1214 /* We need to hold the rindex unless the inode we're using is 1223 /* We need to hold the rindex unless the inode we're using is
1215 the rindex itself, in which case it's already held. */ 1224 the rindex itself, in which case it's already held. */
@@ -1218,31 +1227,23 @@ try_again:
1218 else if (!sdp->sd_rgrps) /* We may not have the rindex read 1227 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1219 in, so: */ 1228 in, so: */
1220 error = gfs2_ri_update_special(ip); 1229 error = gfs2_ri_update_special(ip);
1230 if (error)
1231 return error;
1221 } 1232 }
1222 1233
1223 if (error) 1234 do {
1224 return error; 1235 error = get_local_rgrp(ip, &last_unlinked);
1236 /* If there is no space, flushing the log may release some */
1237 if (error)
1238 gfs2_log_flush(sdp, NULL);
1239 } while (error && tries++ < 3);
1225 1240
1226 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1227 dinodes along the way, error will equal -EAGAIN and unlinked will
1228 contains it block address. We then need to look up that inode and
1229 try to free it, and try the allocation again. */
1230 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1231 if (error) { 1241 if (error) {
1232 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) 1242 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1233 gfs2_glock_dq_uninit(&al->al_ri_gh); 1243 gfs2_glock_dq_uninit(&al->al_ri_gh);
1234 if (error != -EAGAIN) 1244 return error;
1235 return error;
1236
1237 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1238 /* regardless of whether or not gfs2_process_unlinked_inode
1239 was successful, we don't want to repeat it again. */
1240 last_unlinked = unlinked;
1241 gfs2_log_flush(sdp, NULL);
1242 error = 0;
1243
1244 goto try_again;
1245 } 1245 }
1246
1246 /* no error, so we have the rgrp set in the inode's allocation. */ 1247 /* no error, so we have the rgrp set in the inode's allocation. */
1247 al->al_file = file; 1248 al->al_file = file;
1248 al->al_line = line; 1249 al->al_line = line;
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e3..793cb9d943d2 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
14#ifdef DEBUG_LOCKS 14#ifdef DEBUG_LOCKS
15 printk("lock creation\n"); 15 printk("lock creation\n");
16#endif 16#endif
17 down(&hpfs_sb(s)->hpfs_creation_de); 17 mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
18} 18}
19 19
20void hpfs_unlock_creation(struct super_block *s) 20void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
22#ifdef DEBUG_LOCKS 22#ifdef DEBUG_LOCKS
23 printk("unlock creation\n"); 23 printk("unlock creation\n");
24#endif 24#endif
25 up(&hpfs_sb(s)->hpfs_creation_de); 25 mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
26} 26}
27 27
28/* Map a sector into a buffer and return pointers to it and to the buffer. */ 28/* Map a sector into a buffer and return pointers to it and to the buffer. */
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b59eac0232a0..2fee17d0d9ab 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
87 unsigned *sb_bmp_dir; /* main bitmap directory */ 87 unsigned *sb_bmp_dir; /* main bitmap directory */
88 unsigned sb_c_bitmap; /* current bitmap */ 88 unsigned sb_c_bitmap; /* current bitmap */
89 unsigned sb_max_fwd_alloc; /* max forwad allocation */ 89 unsigned sb_max_fwd_alloc; /* max forwad allocation */
90 struct semaphore hpfs_creation_de; /* when creating dirents, nobody else 90 struct mutex hpfs_creation_de; /* when creating dirents, nobody else
91 can alloc blocks */ 91 can alloc blocks */
92 /*unsigned sb_mounting : 1;*/ 92 /*unsigned sb_mounting : 1;*/
93 int sb_timeshift; 93 int sb_timeshift;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index bb69389972eb..6c5f01597c3a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -491,7 +491,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
491 sbi->sb_bmp_dir = NULL; 491 sbi->sb_bmp_dir = NULL;
492 sbi->sb_cp_table = NULL; 492 sbi->sb_cp_table = NULL;
493 493
494 init_MUTEX(&sbi->hpfs_creation_de); 494 mutex_init(&sbi->hpfs_creation_de);
495 495
496 uid = current_uid(); 496 uid = current_uid();
497 gid = current_gid(); 497 gid = current_gid();
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d6cfac1f0a40..a5fe68189eed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
933 *user = current_user(); 933 *user = current_user();
934 if (user_shm_lock(size, *user)) { 934 if (user_shm_lock(size, *user)) {
935 WARN_ONCE(1, 935 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
936 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
937 } else { 936 } else {
938 *user = NULL; 937 *user = NULL;
939 return ERR_PTR(-EPERM); 938 return ERR_PTR(-EPERM);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb3bc3a..d6cc16476620 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/syscalls.h> 7#include <linux/syscalls.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/smp_lock.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/file.h> 10#include <linux/file.h>
12#include <linux/fs.h> 11#include <linux/fs.h>
@@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp)
530 return thaw_super(sb); 529 return thaw_super(sb);
531} 530}
532 531
533static int ioctl_fstrim(struct file *filp, void __user *argp)
534{
535 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
536 struct fstrim_range range;
537 int ret = 0;
538
539 if (!capable(CAP_SYS_ADMIN))
540 return -EPERM;
541
542 /* If filesystem doesn't support trim feature, return. */
543 if (sb->s_op->trim_fs == NULL)
544 return -EOPNOTSUPP;
545
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 if (argp == NULL) {
551 range.start = 0;
552 range.len = ULLONG_MAX;
553 range.minlen = 0;
554 } else if (copy_from_user(&range, argp, sizeof(range)))
555 return -EFAULT;
556
557 ret = sb->s_op->trim_fs(sb, &range);
558 if (ret < 0)
559 return ret;
560
561 if ((argp != NULL) &&
562 (copy_to_user(argp, &range, sizeof(range))))
563 return -EFAULT;
564
565 return 0;
566}
567
568/* 532/*
569 * When you add any new common ioctls to the switches above and below 533 * When you add any new common ioctls to the switches above and below
570 * please update compat_sys_ioctl() too. 534 * please update compat_sys_ioctl() too.
@@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
615 error = ioctl_fsthaw(filp); 579 error = ioctl_fsthaw(filp);
616 break; 580 break;
617 581
618 case FITRIM:
619 error = ioctl_fstrim(filp, argp);
620 break;
621
622 case FS_IOC_FIEMAP: 582 case FS_IOC_FIEMAP:
623 return ioctl_fiemap(filp, arg); 583 return ioctl_fiemap(filp, arg);
624 584
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..7da2a06508e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
103 } 103 }
104 104
105 ret = -ESRCH; 105 ret = -ESRCH;
106 /* 106 rcu_read_lock();
107 * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
108 * so we can't use rcu_read_lock(). See re-copy of ->ioprio
109 * in copy_process().
110 */
111 read_lock(&tasklist_lock);
112 switch (which) { 107 switch (which) {
113 case IOPRIO_WHO_PROCESS: 108 case IOPRIO_WHO_PROCESS:
114 if (!who) 109 if (!who)
@@ -153,7 +148,7 @@ free_uid:
153 ret = -EINVAL; 148 ret = -EINVAL;
154 } 149 }
155 150
156 read_unlock(&tasklist_lock); 151 rcu_read_unlock();
157 return ret; 152 return ret;
158} 153}
159 154
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
197 int ret = -ESRCH; 192 int ret = -ESRCH;
198 int tmpio; 193 int tmpio;
199 194
200 read_lock(&tasklist_lock); 195 rcu_read_lock();
201 switch (which) { 196 switch (which) {
202 case IOPRIO_WHO_PROCESS: 197 case IOPRIO_WHO_PROCESS:
203 if (!who) 198 if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
250 ret = -EINVAL; 245 ret = -EINVAL;
251 } 246 }
252 247
253 read_unlock(&tasklist_lock); 248 rcu_read_unlock();
254 return ret; 249 return ret;
255} 250}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 538417c1fdbb..f837ba953529 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
899 899
900 /* journal descriptor can store up to n blocks -bzzz */ 900 /* journal descriptor can store up to n blocks -bzzz */
901 journal->j_blocksize = blocksize; 901 journal->j_blocksize = blocksize;
902 journal->j_dev = bdev;
903 journal->j_fs_dev = fs_dev;
904 journal->j_blk_offset = start;
905 journal->j_maxlen = len;
906 bdevname(journal->j_dev, journal->j_devname);
907 p = journal->j_devname;
908 while ((p = strchr(p, '/')))
909 *p = '!';
902 jbd2_stats_proc_init(journal); 910 jbd2_stats_proc_init(journal);
903 n = journal->j_blocksize / sizeof(journal_block_tag_t); 911 n = journal->j_blocksize / sizeof(journal_block_tag_t);
904 journal->j_wbufsize = n; 912 journal->j_wbufsize = n;
@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
908 __func__); 916 __func__);
909 goto out_err; 917 goto out_err;
910 } 918 }
911 journal->j_dev = bdev;
912 journal->j_fs_dev = fs_dev;
913 journal->j_blk_offset = start;
914 journal->j_maxlen = len;
915 bdevname(journal->j_dev, journal->j_devname);
916 p = journal->j_devname;
917 while ((p = strchr(p, '/')))
918 *p = '!';
919 919
920 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 920 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
921 if (!bh) { 921 if (!bh) {
@@ -1838,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
1838 */ 1838 */
1839#define JBD2_MAX_SLABS 8 1839#define JBD2_MAX_SLABS 8
1840static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; 1840static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1841static DECLARE_MUTEX(jbd2_slab_create_sem);
1842 1841
1843static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { 1842static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1844 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", 1843 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1859,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
1859 1858
1860static int jbd2_journal_create_slab(size_t size) 1859static int jbd2_journal_create_slab(size_t size)
1861{ 1860{
1861 static DEFINE_MUTEX(jbd2_slab_create_mutex);
1862 int i = order_base_2(size) - 10; 1862 int i = order_base_2(size) - 10;
1863 size_t slab_size; 1863 size_t slab_size;
1864 1864
@@ -1870,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
1870 1870
1871 if (unlikely(i < 0)) 1871 if (unlikely(i < 0))
1872 i = 0; 1872 i = 0;
1873 down(&jbd2_slab_create_sem); 1873 mutex_lock(&jbd2_slab_create_mutex);
1874 if (jbd2_slab[i]) { 1874 if (jbd2_slab[i]) {
1875 up(&jbd2_slab_create_sem); 1875 mutex_unlock(&jbd2_slab_create_mutex);
1876 return 0; /* Already created */ 1876 return 0; /* Already created */
1877 } 1877 }
1878 1878
1879 slab_size = 1 << (i+10); 1879 slab_size = 1 << (i+10);
1880 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, 1880 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1881 slab_size, 0, NULL); 1881 slab_size, 0, NULL);
1882 up(&jbd2_slab_create_sem); 1882 mutex_unlock(&jbd2_slab_create_mutex);
1883 if (!jbd2_slab[i]) { 1883 if (!jbd2_slab[i]) {
1884 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); 1884 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1885 return -ENOMEM; 1885 return -ENOMEM;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d5bb86866e6c..25509eb28fd7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 16#include <linux/lockd/lockd.h>
17#include <linux/smp_lock.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19 18
20#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 47ea1e1925b8..332c54cf75e0 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/types.h> 11#include <linux/types.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 25e21e4023b2..ed0c59fe23ce 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
124 continue; 124 continue;
125 if (host->h_server != ni->server) 125 if (host->h_server != ni->server)
126 continue; 126 continue;
127 if (ni->server && 127 if (ni->server && ni->src_len != 0 &&
128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) 128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue; 129 continue;
130 130
@@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
167 host->h_addrlen = ni->salen; 167 host->h_addrlen = ni->salen;
168 rpc_set_port(nlm_addr(host), 0); 168 rpc_set_port(nlm_addr(host), 0);
169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
170 host->h_srcaddrlen = ni->src_len;
170 host->h_version = ni->version; 171 host->h_version = ni->version;
171 host->h_proto = ni->protocol; 172 host->h_proto = ni->protocol;
172 host->h_rpcclnt = NULL; 173 host->h_rpcclnt = NULL;
@@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
238 const char *hostname, 239 const char *hostname,
239 int noresvport) 240 int noresvport)
240{ 241{
241 const struct sockaddr source = {
242 .sa_family = AF_UNSPEC,
243 };
244 struct nlm_lookup_host_info ni = { 242 struct nlm_lookup_host_info ni = {
245 .server = 0, 243 .server = 0,
246 .sap = sap, 244 .sap = sap,
@@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
249 .version = version, 247 .version = version,
250 .hostname = hostname, 248 .hostname = hostname,
251 .hostname_len = strlen(hostname), 249 .hostname_len = strlen(hostname),
252 .src_sap = &source,
253 .src_len = sizeof(source),
254 .noresvport = noresvport, 250 .noresvport = noresvport,
255 }; 251 };
256 252
@@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host)
357 .protocol = host->h_proto, 353 .protocol = host->h_proto,
358 .address = nlm_addr(host), 354 .address = nlm_addr(host),
359 .addrsize = host->h_addrlen, 355 .addrsize = host->h_addrlen,
360 .saddress = nlm_srcaddr(host),
361 .timeout = &timeparms, 356 .timeout = &timeparms,
362 .servername = host->h_name, 357 .servername = host->h_name,
363 .program = &nlm_program, 358 .program = &nlm_program,
@@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host)
376 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 371 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
377 if (host->h_noresvport) 372 if (host->h_noresvport)
378 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 373 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
374 if (host->h_srcaddrlen)
375 args.saddress = nlm_srcaddr(host);
379 376
380 clnt = rpc_create(&args); 377 clnt = rpc_create(&args);
381 if (!IS_ERR(clnt)) 378 if (!IS_ERR(clnt))
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a336e832475d..38d261192453 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c462d346acbd..ef5659b211e9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/smp_lock.h>
29#include <linux/sunrpc/clnt.h> 28#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/svc.h> 29#include <linux/sunrpc/svc.h>
31#include <linux/lockd/nlm.h> 30#include <linux/lockd/nlm.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index c3069f38d602..0caea5310ac3 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
diff --git a/fs/locks.c b/fs/locks.c
index 50ec15927aab..8729347bcd1a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
122#include <linux/module.h> 122#include <linux/module.h>
123#include <linux/security.h> 123#include <linux/security.h>
124#include <linux/slab.h> 124#include <linux/slab.h>
125#include <linux/smp_lock.h>
126#include <linux/syscalls.h> 125#include <linux/syscalls.h>
127#include <linux/time.h> 126#include <linux/time.h>
128#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
@@ -186,7 +185,7 @@ void locks_release_private(struct file_lock *fl)
186EXPORT_SYMBOL_GPL(locks_release_private); 185EXPORT_SYMBOL_GPL(locks_release_private);
187 186
188/* Free a lock which is not in use. */ 187/* Free a lock which is not in use. */
189static void locks_free_lock(struct file_lock *fl) 188void locks_free_lock(struct file_lock *fl)
190{ 189{
191 BUG_ON(waitqueue_active(&fl->fl_wait)); 190 BUG_ON(waitqueue_active(&fl->fl_wait));
192 BUG_ON(!list_empty(&fl->fl_block)); 191 BUG_ON(!list_empty(&fl->fl_block));
@@ -195,6 +194,7 @@ static void locks_free_lock(struct file_lock *fl)
195 locks_release_private(fl); 194 locks_release_private(fl);
196 kmem_cache_free(filelock_cache, fl); 195 kmem_cache_free(filelock_cache, fl);
197} 196}
197EXPORT_SYMBOL(locks_free_lock);
198 198
199void locks_init_lock(struct file_lock *fl) 199void locks_init_lock(struct file_lock *fl)
200{ 200{
@@ -234,11 +234,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
234 fl->fl_ops->fl_copy_lock(new, fl); 234 fl->fl_ops->fl_copy_lock(new, fl);
235 new->fl_ops = fl->fl_ops; 235 new->fl_ops = fl->fl_ops;
236 } 236 }
237 if (fl->fl_lmops) { 237 if (fl->fl_lmops)
238 if (fl->fl_lmops->fl_copy_lock)
239 fl->fl_lmops->fl_copy_lock(new, fl);
240 new->fl_lmops = fl->fl_lmops; 238 new->fl_lmops = fl->fl_lmops;
241 }
242} 239}
243 240
244/* 241/*
@@ -1371,20 +1368,22 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1371 struct inode *inode = dentry->d_inode; 1368 struct inode *inode = dentry->d_inode;
1372 int error, rdlease_count = 0, wrlease_count = 0; 1369 int error, rdlease_count = 0, wrlease_count = 0;
1373 1370
1371 lease = *flp;
1372
1373 error = -EACCES;
1374 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) 1374 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
1375 return -EACCES; 1375 goto out;
1376 error = -EINVAL;
1376 if (!S_ISREG(inode->i_mode)) 1377 if (!S_ISREG(inode->i_mode))
1377 return -EINVAL; 1378 goto out;
1378 error = security_file_lock(filp, arg); 1379 error = security_file_lock(filp, arg);
1379 if (error) 1380 if (error)
1380 return error; 1381 goto out;
1381 1382
1382 time_out_leases(inode); 1383 time_out_leases(inode);
1383 1384
1384 BUG_ON(!(*flp)->fl_lmops->fl_break); 1385 BUG_ON(!(*flp)->fl_lmops->fl_break);
1385 1386
1386 lease = *flp;
1387
1388 if (arg != F_UNLCK) { 1387 if (arg != F_UNLCK) {
1389 error = -EAGAIN; 1388 error = -EAGAIN;
1390 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1389 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
@@ -1425,8 +1424,9 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1425 goto out; 1424 goto out;
1426 1425
1427 if (my_before != NULL) { 1426 if (my_before != NULL) {
1428 *flp = *my_before;
1429 error = lease->fl_lmops->fl_change(my_before, arg); 1427 error = lease->fl_lmops->fl_change(my_before, arg);
1428 if (!error)
1429 *flp = *my_before;
1430 goto out; 1430 goto out;
1431 } 1431 }
1432 1432
@@ -1441,7 +1441,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1441 return 0; 1441 return 0;
1442 1442
1443out: 1443out:
1444 locks_free_lock(lease);
1445 return error; 1444 return error;
1446} 1445}
1447EXPORT_SYMBOL(generic_setlease); 1446EXPORT_SYMBOL(generic_setlease);
@@ -1493,21 +1492,19 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1493} 1492}
1494EXPORT_SYMBOL_GPL(vfs_setlease); 1493EXPORT_SYMBOL_GPL(vfs_setlease);
1495 1494
1496/** 1495static int do_fcntl_delete_lease(struct file *filp)
1497 * fcntl_setlease - sets a lease on an open file
1498 * @fd: open file descriptor
1499 * @filp: file pointer
1500 * @arg: type of lease to obtain
1501 *
1502 * Call this fcntl to establish a lease on the file.
1503 * Note that you also need to call %F_SETSIG to
1504 * receive a signal when the lease is broken.
1505 */
1506int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1507{ 1496{
1508 struct file_lock *fl; 1497 struct file_lock fl, *flp = &fl;
1498
1499 lease_init(filp, F_UNLCK, flp);
1500
1501 return vfs_setlease(filp, F_UNLCK, &flp);
1502}
1503
1504static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1505{
1506 struct file_lock *fl, *ret;
1509 struct fasync_struct *new; 1507 struct fasync_struct *new;
1510 struct inode *inode = filp->f_path.dentry->d_inode;
1511 int error; 1508 int error;
1512 1509
1513 fl = lease_alloc(filp, arg); 1510 fl = lease_alloc(filp, arg);
@@ -1519,10 +1516,16 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1519 locks_free_lock(fl); 1516 locks_free_lock(fl);
1520 return -ENOMEM; 1517 return -ENOMEM;
1521 } 1518 }
1519 ret = fl;
1522 lock_flocks(); 1520 lock_flocks();
1523 error = __vfs_setlease(filp, arg, &fl); 1521 error = __vfs_setlease(filp, arg, &ret);
1524 if (error || arg == F_UNLCK) 1522 if (error) {
1525 goto out_unlock; 1523 unlock_flocks();
1524 locks_free_lock(fl);
1525 goto out_free_fasync;
1526 }
1527 if (ret != fl)
1528 locks_free_lock(fl);
1526 1529
1527 /* 1530 /*
1528 * fasync_insert_entry() returns the old entry if any. 1531 * fasync_insert_entry() returns the old entry if any.
@@ -1530,26 +1533,36 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1530 * inserted it into the fasync list. Clear new so that 1533 * inserted it into the fasync list. Clear new so that
1531 * we don't release it here. 1534 * we don't release it here.
1532 */ 1535 */
1533 if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) 1536 if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
1534 new = NULL; 1537 new = NULL;
1535 1538
1536 if (error < 0) {
1537 /* remove lease just inserted by setlease */
1538 fl->fl_type = F_UNLCK | F_INPROGRESS;
1539 fl->fl_break_time = jiffies - 10;
1540 time_out_leases(inode);
1541 goto out_unlock;
1542 }
1543
1544 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 1539 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1545out_unlock:
1546 unlock_flocks(); 1540 unlock_flocks();
1541
1542out_free_fasync:
1547 if (new) 1543 if (new)
1548 fasync_free(new); 1544 fasync_free(new);
1549 return error; 1545 return error;
1550} 1546}
1551 1547
1552/** 1548/**
1549 * fcntl_setlease - sets a lease on an open file
1550 * @fd: open file descriptor
1551 * @filp: file pointer
1552 * @arg: type of lease to obtain
1553 *
1554 * Call this fcntl to establish a lease on the file.
1555 * Note that you also need to call %F_SETSIG to
1556 * receive a signal when the lease is broken.
1557 */
1558int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1559{
1560 if (arg == F_UNLCK)
1561 return do_fcntl_delete_lease(filp);
1562 return do_fcntl_add_lease(fd, filp, arg);
1563}
1564
1565/**
1553 * flock_lock_file_wait - Apply a FLOCK-style lock to a file 1566 * flock_lock_file_wait - Apply a FLOCK-style lock to a file
1554 * @filp: The file to apply the lock to 1567 * @filp: The file to apply the lock to
1555 * @fl: The lock to be applied 1568 * @fl: The lock to be applied
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index cd51a36b37f0..57afd4a6fabb 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -486,7 +486,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
486 486
487/* dev_mtd.c */ 487/* dev_mtd.c */
488#ifdef CONFIG_MTD 488#ifdef CONFIG_MTD
489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) 489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
490#else 490#else
491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) 491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
492{ 492{
diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b7372..4ff7ca530533 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname,
1748 if (!(open_flag & O_CREAT)) 1748 if (!(open_flag & O_CREAT))
1749 mode = 0; 1749 mode = 0;
1750 1750
1751 /* Must never be set by userspace */
1752 open_flag &= ~FMODE_NONOTIFY;
1753
1751 /* 1754 /*
1752 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 1755 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1753 * check for O_DSYNC if the need any syncing at all we enforce it's 1756 * check for O_DSYNC if the need any syncing at all we enforce it's
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a415c9c5e55..3dbfc072ec70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/smp_lock.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
19#include <linux/acct.h> 18#include <linux/acct.h>
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aac8832e919e..f22b12e7d337 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -19,7 +19,6 @@
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21#include <asm/byteorder.h> 21#include <asm/byteorder.h>
22#include <linux/smp_lock.h>
23 22
24#include <linux/ncp_fs.h> 23#include <linux/ncp_fs.h>
25 24
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6c754f70c529..cb50aaf981df 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,7 +17,6 @@
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/smp_lock.h>
21 20
22#include <linux/ncp_fs.h> 21#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 22#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d290545aa0c4..8fb93b604e73 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/smp_lock.h>
30#include <linux/vfs.h> 29#include <linux/vfs.h>
31#include <linux/mount.h> 30#include <linux/mount.h>
32#include <linux/seq_file.h> 31#include <linux/seq_file.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c2a1f9a155c3..d40a547e3377 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,7 +17,6 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/highuid.h> 19#include <linux/highuid.h>
20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
23 22
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index aeec017fe814..93a8b3bd69e3 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/ip.h> 10#include <linux/ip.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/smp_lock.h>
13#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
14#include <linux/sunrpc/svcsock.h> 13#include <linux/sunrpc/svcsock.h>
15#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 232a7eead33a..1fd62fc49be3 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/smp_lock.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16 15
17#include <linux/nfs4.h> 16#include <linux/nfs4.h>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07ac3847e562..996dd8989a91 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -34,6 +34,7 @@
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
37#include <linux/kmemleak.h>
37 38
38#include "delegation.h" 39#include "delegation.h"
39#include "iostat.h" 40#include "iostat.h"
@@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
56 struct inode *, struct dentry *); 57 struct inode *, struct dentry *);
57static int nfs_fsync_dir(struct file *, int); 58static int nfs_fsync_dir(struct file *, int);
58static loff_t nfs_llseek_dir(struct file *, loff_t, int); 59static loff_t nfs_llseek_dir(struct file *, loff_t, int);
59static int nfs_readdir_clear_array(struct page*, gfp_t); 60static void nfs_readdir_clear_array(struct page*);
60 61
61const struct file_operations nfs_dir_operations = { 62const struct file_operations nfs_dir_operations = {
62 .llseek = nfs_llseek_dir, 63 .llseek = nfs_llseek_dir,
@@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
82 .setattr = nfs_setattr, 83 .setattr = nfs_setattr,
83}; 84};
84 85
85const struct address_space_operations nfs_dir_addr_space_ops = { 86const struct address_space_operations nfs_dir_aops = {
86 .releasepage = nfs_readdir_clear_array, 87 .freepage = nfs_readdir_clear_array,
87}; 88};
88 89
89#ifdef CONFIG_NFS_V3 90#ifdef CONFIG_NFS_V3
@@ -161,6 +162,7 @@ struct nfs_cache_array_entry {
161 u64 cookie; 162 u64 cookie;
162 u64 ino; 163 u64 ino;
163 struct qstr string; 164 struct qstr string;
165 unsigned char d_type;
164}; 166};
165 167
166struct nfs_cache_array { 168struct nfs_cache_array {
@@ -170,14 +172,13 @@ struct nfs_cache_array {
170 struct nfs_cache_array_entry array[0]; 172 struct nfs_cache_array_entry array[0];
171}; 173};
172 174
173#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
174
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
176typedef struct { 176typedef struct {
177 struct file *file; 177 struct file *file;
178 struct page *page; 178 struct page *page;
179 unsigned long page_index; 179 unsigned long page_index;
180 u64 *dir_cookie; 180 u64 *dir_cookie;
181 u64 last_cookie;
181 loff_t current_index; 182 loff_t current_index;
182 decode_dirent_t decode; 183 decode_dirent_t decode;
183 184
@@ -194,9 +195,13 @@ typedef struct {
194static 195static
195struct nfs_cache_array *nfs_readdir_get_array(struct page *page) 196struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
196{ 197{
198 void *ptr;
197 if (page == NULL) 199 if (page == NULL)
198 return ERR_PTR(-EIO); 200 return ERR_PTR(-EIO);
199 return (struct nfs_cache_array *)kmap(page); 201 ptr = kmap(page);
202 if (ptr == NULL)
203 return ERR_PTR(-ENOMEM);
204 return ptr;
200} 205}
201 206
202static 207static
@@ -209,14 +214,15 @@ void nfs_readdir_release_array(struct page *page)
209 * we are freeing strings created by nfs_add_to_readdir_array() 214 * we are freeing strings created by nfs_add_to_readdir_array()
210 */ 215 */
211static 216static
212int nfs_readdir_clear_array(struct page *page, gfp_t mask) 217void nfs_readdir_clear_array(struct page *page)
213{ 218{
214 struct nfs_cache_array *array = nfs_readdir_get_array(page); 219 struct nfs_cache_array *array;
215 int i; 220 int i;
221
222 array = kmap_atomic(page, KM_USER0);
216 for (i = 0; i < array->size; i++) 223 for (i = 0; i < array->size; i++)
217 kfree(array->array[i].string.name); 224 kfree(array->array[i].string.name);
218 nfs_readdir_release_array(page); 225 kunmap_atomic(array, KM_USER0);
219 return 0;
220} 226}
221 227
222/* 228/*
@@ -231,6 +237,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
231 string->name = kmemdup(name, len, GFP_KERNEL); 237 string->name = kmemdup(name, len, GFP_KERNEL);
232 if (string->name == NULL) 238 if (string->name == NULL)
233 return -ENOMEM; 239 return -ENOMEM;
240 /*
241 * Avoid a kmemleak false positive. The pointer to the name is stored
242 * in a page cache page which kmemleak does not scan.
243 */
244 kmemleak_not_leak(string->name);
234 string->hash = full_name_hash(name, len); 245 string->hash = full_name_hash(name, len);
235 return 0; 246 return 0;
236} 247}
@@ -244,20 +255,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
244 255
245 if (IS_ERR(array)) 256 if (IS_ERR(array))
246 return PTR_ERR(array); 257 return PTR_ERR(array);
247 ret = -EIO;
248 if (array->size >= MAX_READDIR_ARRAY)
249 goto out;
250 258
251 cache_entry = &array->array[array->size]; 259 cache_entry = &array->array[array->size];
260
261 /* Check that this entry lies within the page bounds */
262 ret = -ENOSPC;
263 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
264 goto out;
265
252 cache_entry->cookie = entry->prev_cookie; 266 cache_entry->cookie = entry->prev_cookie;
253 cache_entry->ino = entry->ino; 267 cache_entry->ino = entry->ino;
268 cache_entry->d_type = entry->d_type;
254 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); 269 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
255 if (ret) 270 if (ret)
256 goto out; 271 goto out;
257 array->last_cookie = entry->cookie; 272 array->last_cookie = entry->cookie;
258 if (entry->eof == 1)
259 array->eof_index = array->size;
260 array->size++; 273 array->size++;
274 if (entry->eof != 0)
275 array->eof_index = array->size;
261out: 276out:
262 nfs_readdir_release_array(page); 277 nfs_readdir_release_array(page);
263 return ret; 278 return ret;
@@ -272,7 +287,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
272 if (diff < 0) 287 if (diff < 0)
273 goto out_eof; 288 goto out_eof;
274 if (diff >= array->size) { 289 if (diff >= array->size) {
275 if (array->eof_index > 0) 290 if (array->eof_index >= 0)
276 goto out_eof; 291 goto out_eof;
277 desc->current_index += array->size; 292 desc->current_index += array->size;
278 return -EAGAIN; 293 return -EAGAIN;
@@ -281,8 +296,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
281 index = (unsigned int)diff; 296 index = (unsigned int)diff;
282 *desc->dir_cookie = array->array[index].cookie; 297 *desc->dir_cookie = array->array[index].cookie;
283 desc->cache_entry_index = index; 298 desc->cache_entry_index = index;
284 if (index == array->eof_index)
285 desc->eof = 1;
286 return 0; 299 return 0;
287out_eof: 300out_eof:
288 desc->eof = 1; 301 desc->eof = 1;
@@ -296,17 +309,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
296 int status = -EAGAIN; 309 int status = -EAGAIN;
297 310
298 for (i = 0; i < array->size; i++) { 311 for (i = 0; i < array->size; i++) {
299 if (i == array->eof_index) {
300 desc->eof = 1;
301 status = -EBADCOOKIE;
302 }
303 if (array->array[i].cookie == *desc->dir_cookie) { 312 if (array->array[i].cookie == *desc->dir_cookie) {
304 desc->cache_entry_index = i; 313 desc->cache_entry_index = i;
305 status = 0; 314 return 0;
306 break;
307 } 315 }
308 } 316 }
309 317 if (array->eof_index >= 0) {
318 status = -EBADCOOKIE;
319 if (*desc->dir_cookie == array->last_cookie)
320 desc->eof = 1;
321 }
310 return status; 322 return status;
311} 323}
312 324
@@ -314,10 +326,7 @@ static
314int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) 326int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
315{ 327{
316 struct nfs_cache_array *array; 328 struct nfs_cache_array *array;
317 int status = -EBADCOOKIE; 329 int status;
318
319 if (desc->dir_cookie == NULL)
320 goto out;
321 330
322 array = nfs_readdir_get_array(desc->page); 331 array = nfs_readdir_get_array(desc->page);
323 if (IS_ERR(array)) { 332 if (IS_ERR(array)) {
@@ -330,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
330 else 339 else
331 status = nfs_readdir_search_for_cookie(array, desc); 340 status = nfs_readdir_search_for_cookie(array, desc);
332 341
342 if (status == -EAGAIN) {
343 desc->last_cookie = array->last_cookie;
344 desc->page_index++;
345 }
333 nfs_readdir_release_array(desc->page); 346 nfs_readdir_release_array(desc->page);
334out: 347out:
335 return status; 348 return status;
@@ -381,13 +394,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x
381static 394static
382int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) 395int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
383{ 396{
384 struct nfs_inode *node;
385 if (dentry->d_inode == NULL) 397 if (dentry->d_inode == NULL)
386 goto different; 398 goto different;
387 node = NFS_I(dentry->d_inode); 399 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
388 if (node->fh.size != entry->fh->size)
389 goto different;
390 if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
391 goto different; 400 goto different;
392 return 1; 401 return 1;
393different: 402different:
@@ -449,14 +458,15 @@ out:
449 458
450/* Perform conversion from xdr to cache array */ 459/* Perform conversion from xdr to cache array */
451static 460static
452void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, 461int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
453 void *xdr_page, struct page *page, unsigned int buflen) 462 void *xdr_page, struct page *page, unsigned int buflen)
454{ 463{
455 struct xdr_stream stream; 464 struct xdr_stream stream;
456 struct xdr_buf buf; 465 struct xdr_buf buf;
457 __be32 *ptr = xdr_page; 466 __be32 *ptr = xdr_page;
458 int status;
459 struct nfs_cache_array *array; 467 struct nfs_cache_array *array;
468 unsigned int count = 0;
469 int status;
460 470
461 buf.head->iov_base = xdr_page; 471 buf.head->iov_base = xdr_page;
462 buf.head->iov_len = buflen; 472 buf.head->iov_len = buflen;
@@ -471,21 +481,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
471 481
472 do { 482 do {
473 status = xdr_decode(desc, entry, &stream); 483 status = xdr_decode(desc, entry, &stream);
474 if (status != 0) 484 if (status != 0) {
485 if (status == -EAGAIN)
486 status = 0;
475 break; 487 break;
488 }
476 489
477 if (nfs_readdir_add_to_array(entry, page) == -1) 490 count++;
478 break; 491
479 if (desc->plus == 1) 492 if (desc->plus != 0)
480 nfs_prime_dcache(desc->file->f_path.dentry, entry); 493 nfs_prime_dcache(desc->file->f_path.dentry, entry);
494
495 status = nfs_readdir_add_to_array(entry, page);
496 if (status != 0)
497 break;
481 } while (!entry->eof); 498 } while (!entry->eof);
482 499
483 if (status == -EBADCOOKIE && entry->eof) { 500 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
484 array = nfs_readdir_get_array(page); 501 array = nfs_readdir_get_array(page);
485 array->eof_index = array->size - 1; 502 if (!IS_ERR(array)) {
486 status = 0; 503 array->eof_index = array->size;
487 nfs_readdir_release_array(page); 504 status = 0;
505 nfs_readdir_release_array(page);
506 } else
507 status = PTR_ERR(array);
488 } 508 }
509 return status;
489} 510}
490 511
491static 512static
@@ -537,11 +558,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
537 struct nfs_entry entry; 558 struct nfs_entry entry;
538 struct file *file = desc->file; 559 struct file *file = desc->file;
539 struct nfs_cache_array *array; 560 struct nfs_cache_array *array;
540 int status = 0; 561 int status = -ENOMEM;
541 unsigned int array_size = ARRAY_SIZE(pages); 562 unsigned int array_size = ARRAY_SIZE(pages);
542 563
543 entry.prev_cookie = 0; 564 entry.prev_cookie = 0;
544 entry.cookie = *desc->dir_cookie; 565 entry.cookie = desc->last_cookie;
545 entry.eof = 0; 566 entry.eof = 0;
546 entry.fh = nfs_alloc_fhandle(); 567 entry.fh = nfs_alloc_fhandle();
547 entry.fattr = nfs_alloc_fattr(); 568 entry.fattr = nfs_alloc_fattr();
@@ -549,6 +570,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
549 goto out; 570 goto out;
550 571
551 array = nfs_readdir_get_array(page); 572 array = nfs_readdir_get_array(page);
573 if (IS_ERR(array)) {
574 status = PTR_ERR(array);
575 goto out;
576 }
552 memset(array, 0, sizeof(struct nfs_cache_array)); 577 memset(array, 0, sizeof(struct nfs_cache_array));
553 array->eof_index = -1; 578 array->eof_index = -1;
554 579
@@ -556,12 +581,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
556 if (!pages_ptr) 581 if (!pages_ptr)
557 goto out_release_array; 582 goto out_release_array;
558 do { 583 do {
584 unsigned int pglen;
559 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); 585 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
560 586
561 if (status < 0) 587 if (status < 0)
562 break; 588 break;
563 nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); 589 pglen = status;
564 } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); 590 status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
591 if (status < 0) {
592 if (status == -ENOSPC)
593 status = 0;
594 break;
595 }
596 } while (array->eof_index < 0);
565 597
566 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 598 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
567out_release_array: 599out_release_array:
@@ -582,8 +614,10 @@ static
582int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) 614int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
583{ 615{
584 struct inode *inode = desc->file->f_path.dentry->d_inode; 616 struct inode *inode = desc->file->f_path.dentry->d_inode;
617 int ret;
585 618
586 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) 619 ret = nfs_readdir_xdr_to_array(desc, page, inode);
620 if (ret < 0)
587 goto error; 621 goto error;
588 SetPageUptodate(page); 622 SetPageUptodate(page);
589 623
@@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
595 return 0; 629 return 0;
596 error: 630 error:
597 unlock_page(page); 631 unlock_page(page);
598 return -EIO; 632 return ret;
599} 633}
600 634
601static 635static
602void cache_page_release(nfs_readdir_descriptor_t *desc) 636void cache_page_release(nfs_readdir_descriptor_t *desc)
603{ 637{
638 if (!desc->page->mapping)
639 nfs_readdir_clear_array(desc->page);
604 page_cache_release(desc->page); 640 page_cache_release(desc->page);
605 desc->page = NULL; 641 desc->page = NULL;
606} 642}
@@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
608static 644static
609struct page *get_cache_page(nfs_readdir_descriptor_t *desc) 645struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
610{ 646{
611 struct page *page; 647 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
612 page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
613 desc->page_index, (filler_t *)nfs_readdir_filler, desc); 648 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
614 if (IS_ERR(page))
615 desc->eof = 1;
616 return page;
617} 649}
618 650
619/* 651/*
@@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
629 return PTR_ERR(desc->page); 661 return PTR_ERR(desc->page);
630 662
631 res = nfs_readdir_search_array(desc); 663 res = nfs_readdir_search_array(desc);
632 if (res == 0) 664 if (res != 0)
633 return 0; 665 cache_page_release(desc);
634 cache_page_release(desc);
635 return res; 666 return res;
636} 667}
637 668
@@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
639static inline 670static inline
640int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 671int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
641{ 672{
642 int res = -EAGAIN; 673 int res;
643 674
644 while (1) { 675 if (desc->page_index == 0) {
645 res = find_cache_page(desc); 676 desc->current_index = 0;
646 if (res != -EAGAIN) 677 desc->last_cookie = 0;
647 break;
648 desc->page_index++;
649 } 678 }
679 do {
680 res = find_cache_page(desc);
681 } while (res == -EAGAIN);
650 return res; 682 return res;
651} 683}
652 684
653static inline unsigned int dt_type(struct inode *inode)
654{
655 return (inode->i_mode >> 12) & 15;
656}
657
658/* 685/*
659 * Once we've found the start of the dirent within a page: fill 'er up... 686 * Once we've found the start of the dirent within a page: fill 'er up...
660 */ 687 */
@@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
666 int i = 0; 693 int i = 0;
667 int res = 0; 694 int res = 0;
668 struct nfs_cache_array *array = NULL; 695 struct nfs_cache_array *array = NULL;
669 unsigned int d_type = DT_UNKNOWN;
670 struct dentry *dentry = NULL;
671 696
672 array = nfs_readdir_get_array(desc->page); 697 array = nfs_readdir_get_array(desc->page);
698 if (IS_ERR(array)) {
699 res = PTR_ERR(array);
700 goto out;
701 }
673 702
674 for (i = desc->cache_entry_index; i < array->size; i++) { 703 for (i = desc->cache_entry_index; i < array->size; i++) {
675 d_type = DT_UNKNOWN; 704 struct nfs_cache_array_entry *ent;
676 705
677 res = filldir(dirent, array->array[i].string.name, 706 ent = &array->array[i];
678 array->array[i].string.len, file->f_pos, 707 if (filldir(dirent, ent->string.name, ent->string.len,
679 nfs_compat_user_ino64(array->array[i].ino), d_type); 708 file->f_pos, nfs_compat_user_ino64(ent->ino),
680 if (res < 0) 709 ent->d_type) < 0) {
710 desc->eof = 1;
681 break; 711 break;
712 }
682 file->f_pos++; 713 file->f_pos++;
683 desc->cache_entry_index = i;
684 if (i < (array->size-1)) 714 if (i < (array->size-1))
685 *desc->dir_cookie = array->array[i+1].cookie; 715 *desc->dir_cookie = array->array[i+1].cookie;
686 else 716 else
687 *desc->dir_cookie = array->last_cookie; 717 *desc->dir_cookie = array->last_cookie;
688 if (i == array->eof_index) {
689 desc->eof = 1;
690 break;
691 }
692 } 718 }
719 if (array->eof_index >= 0)
720 desc->eof = 1;
693 721
694 nfs_readdir_release_array(desc->page); 722 nfs_readdir_release_array(desc->page);
723out:
695 cache_page_release(desc); 724 cache_page_release(desc);
696 if (dentry != NULL)
697 dput(dentry);
698 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 725 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
699 (unsigned long long)*desc->dir_cookie, res); 726 (unsigned long long)*desc->dir_cookie, res);
700 return res; 727 return res;
@@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
729 goto out; 756 goto out;
730 } 757 }
731 758
732 if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
733 status = -EIO;
734 goto out_release;
735 }
736
737 desc->page_index = 0; 759 desc->page_index = 0;
760 desc->last_cookie = *desc->dir_cookie;
738 desc->page = page; 761 desc->page = page;
762
763 status = nfs_readdir_xdr_to_array(desc, page, inode);
764 if (status < 0)
765 goto out_release;
766
739 status = nfs_do_filldir(desc, dirent, filldir); 767 status = nfs_do_filldir(desc, dirent, filldir);
740 768
741 out: 769 out:
@@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
757 struct inode *inode = dentry->d_inode; 785 struct inode *inode = dentry->d_inode;
758 nfs_readdir_descriptor_t my_desc, 786 nfs_readdir_descriptor_t my_desc,
759 *desc = &my_desc; 787 *desc = &my_desc;
760 int res = -ENOMEM; 788 int res;
761 789
762 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
763 dentry->d_parent->d_name.name, dentry->d_name.name, 791 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
782 if (res < 0) 810 if (res < 0)
783 goto out; 811 goto out;
784 812
785 while (desc->eof != 1) { 813 do {
786 res = readdir_search_pagecache(desc); 814 res = readdir_search_pagecache(desc);
787 815
788 if (res == -EBADCOOKIE) { 816 if (res == -EBADCOOKIE) {
817 res = 0;
789 /* This means either end of directory */ 818 /* This means either end of directory */
790 if (*desc->dir_cookie && desc->eof == 0) { 819 if (*desc->dir_cookie && desc->eof == 0) {
791 /* Or that the server has 'lost' a cookie */ 820 /* Or that the server has 'lost' a cookie */
792 res = uncached_readdir(desc, dirent, filldir); 821 res = uncached_readdir(desc, dirent, filldir);
793 if (res >= 0) 822 if (res == 0)
794 continue; 823 continue;
795 } 824 }
796 res = 0;
797 break; 825 break;
798 } 826 }
799 if (res == -ETOOSMALL && desc->plus) { 827 if (res == -ETOOSMALL && desc->plus) {
@@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
808 break; 836 break;
809 837
810 res = nfs_do_filldir(desc, dirent, filldir); 838 res = nfs_do_filldir(desc, dirent, filldir);
811 if (res < 0) { 839 if (res < 0)
812 res = 0;
813 break; 840 break;
814 } 841 } while (!desc->eof);
815 }
816out: 842out:
817 nfs_unblock_sillyrename(dentry); 843 nfs_unblock_sillyrename(dentry);
818 if (res > 0) 844 if (res > 0)
@@ -1345,12 +1371,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1345 res = NULL; 1371 res = NULL;
1346 goto out; 1372 goto out;
1347 /* This turned out not to be a regular file */ 1373 /* This turned out not to be a regular file */
1348 case -EISDIR:
1349 case -ENOTDIR: 1374 case -ENOTDIR:
1350 goto no_open; 1375 goto no_open;
1351 case -ELOOP: 1376 case -ELOOP:
1352 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1377 if (!(nd->intent.open.flags & O_NOFOLLOW))
1353 goto no_open; 1378 goto no_open;
1379 /* case -EISDIR: */
1354 /* case -EINVAL: */ 1380 /* case -EINVAL: */
1355 default: 1381 default:
1356 res = ERR_CAST(inode); 1382 res = ERR_CAST(inode);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 84d3c8b90206..e6ace0d93c71 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
867 goto out; 867 goto out;
868 nfs_alloc_commit_data(dreq); 868 nfs_alloc_commit_data(dreq);
869 869
870 if (dreq->commit_data == NULL || count < wsize) 870 if (dreq->commit_data == NULL || count <= wsize)
871 sync = NFS_FILE_SYNC; 871 sync = NFS_FILE_SYNC;
872 872
873 dreq->inode = inode; 873 dreq->inode = inode;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e756075637b0..7bf029ef4084 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
693{ 693{
694 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
695 int status = 0; 695 int status = 0;
696 unsigned int saved_type = fl->fl_type;
696 697
697 /* Try local locking first */ 698 /* Try local locking first */
698 posix_test_lock(filp, fl); 699 posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
700 /* found a conflict */ 701 /* found a conflict */
701 goto out; 702 goto out;
702 } 703 }
704 fl->fl_type = saved_type;
703 705
704 if (nfs_have_delegation(inode, FMODE_READ)) 706 if (nfs_have_delegation(inode, FMODE_READ))
705 goto out_noconflict; 707 goto out_noconflict;
@@ -884,6 +886,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
884 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 886 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
885 file->f_path.dentry->d_parent->d_name.name, 887 file->f_path.dentry->d_parent->d_name.name,
886 file->f_path.dentry->d_name.name, arg); 888 file->f_path.dentry->d_name.name, arg);
887
888 return -EINVAL; 889 return -EINVAL;
889} 890}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f57164602..e67e31c73416 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
289 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
291 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
292 inode->i_data.a_ops = &nfs_dir_aops;
292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) 293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
293 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
294 /* Deal with crossing mountpoints */ 295 /* Deal with crossing mountpoints */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index db08ff3ff454..e6356b750b77 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page)
362} 362}
363 363
364/* 364/*
365 * Convert a umode to a dirent->d_type
366 */
367static inline
368unsigned char nfs_umode_to_dtype(umode_t mode)
369{
370 return (mode >> 12) & 15;
371}
372
373/*
365 * Determine the number of pages in an array of length 'len' and 374 * Determine the number of pages in an array of length 'len' and
366 * with a base offset of 'base' 375 * with a base offset of 'base'
367 */ 376 */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f473..4f981f1f6689 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
505 505
506static struct rpc_version mnt_version1 = { 506static struct rpc_version mnt_version1 = {
507 .number = 1, 507 .number = 1,
508 .nrprocs = 2, 508 .nrprocs = ARRAY_SIZE(mnt_procedures),
509 .procs = mnt_procedures, 509 .procs = mnt_procedures,
510}; 510};
511 511
512static struct rpc_version mnt_version3 = { 512static struct rpc_version mnt_version3 = {
513 .number = 3, 513 .number = 3,
514 .nrprocs = 2, 514 .nrprocs = ARRAY_SIZE(mnt3_procedures),
515 .procs = mnt3_procedures, 515 .procs = mnt3_procedures,
516}; 516};
517 517
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e6bf45710cc7..5914a1911c95 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
423 struct page **page; 423 struct page **page;
424 size_t hdrlen; 424 size_t hdrlen;
425 unsigned int pglen, recvd; 425 unsigned int pglen, recvd;
426 int status, nr = 0; 426 int status;
427 427
428 if ((status = ntohl(*p++))) 428 if ((status = ntohl(*p++)))
429 return nfs_stat_to_errno(status); 429 return nfs_stat_to_errno(status);
@@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
443 if (pglen > recvd) 443 if (pglen > recvd)
444 pglen = recvd; 444 pglen = recvd;
445 page = rcvbuf->pages; 445 page = rcvbuf->pages;
446 return nr; 446 return pglen;
447} 447}
448 448
449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
485 entry->prev_cookie = entry->cookie; 485 entry->prev_cookie = entry->cookie;
486 entry->cookie = ntohl(*p++); 486 entry->cookie = ntohl(*p++);
487 487
488 entry->d_type = DT_UNKNOWN;
489
488 p = xdr_inline_peek(xdr, 8); 490 p = xdr_inline_peek(xdr, 8);
489 if (p != NULL) 491 if (p != NULL)
490 entry->eof = !p[0] && p[1]; 492 entry->eof = !p[0] && p[1];
@@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
495 497
496out_overflow: 498out_overflow:
497 print_overflow_msg(__func__, xdr); 499 print_overflow_msg(__func__, xdr);
498 return ERR_PTR(-EIO); 500 return ERR_PTR(-EAGAIN);
499} 501}
500 502
501/* 503/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9a5e832c257..f6cc60f06dac 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
555 struct page **page; 555 struct page **page;
556 size_t hdrlen; 556 size_t hdrlen;
557 u32 recvd, pglen; 557 u32 recvd, pglen;
558 int status, nr = 0; 558 int status;
559 559
560 status = ntohl(*p++); 560 status = ntohl(*p++);
561 /* Decode post_op_attrs */ 561 /* Decode post_op_attrs */
@@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
586 pglen = recvd; 586 pglen = recvd;
587 page = rcvbuf->pages; 587 page = rcvbuf->pages;
588 588
589 return nr; 589 return pglen;
590} 590}
591 591
592__be32 * 592__be32 *
@@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
622 entry->prev_cookie = entry->cookie; 622 entry->prev_cookie = entry->cookie;
623 p = xdr_decode_hyper(p, &entry->cookie); 623 p = xdr_decode_hyper(p, &entry->cookie);
624 624
625 entry->d_type = DT_UNKNOWN;
625 if (plus) { 626 if (plus) {
626 entry->fattr->valid = 0; 627 entry->fattr->valid = 0;
627 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); 628 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
628 if (IS_ERR(p)) 629 if (IS_ERR(p))
629 goto out_overflow_exit; 630 goto out_overflow_exit;
631 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
630 /* In fact, a post_op_fh3: */ 632 /* In fact, a post_op_fh3: */
631 p = xdr_inline_decode(xdr, 4); 633 p = xdr_inline_decode(xdr, 4);
632 if (unlikely(!p)) 634 if (unlikely(!p))
@@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
656out_overflow: 658out_overflow:
657 print_overflow_msg(__func__, xdr); 659 print_overflow_msg(__func__, xdr);
658out_overflow_exit: 660out_overflow_exit:
659 return ERR_PTR(-EIO); 661 return ERR_PTR(-EAGAIN);
660} 662}
661 663
662/* 664/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f24cdf2cb13..4435e5e1f904 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2852 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2852 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2853 res.pgbase = args.pgbase; 2853 res.pgbase = args.pgbase;
2854 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2854 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2855 if (status == 0) 2855 if (status >= 0) {
2856 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2856 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2857 status += args.pgbase;
2858 }
2857 2859
2858 nfs_invalidate_atime(dir); 2860 nfs_invalidate_atime(dir);
2859 2861
@@ -3359,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3359 ret = nfs_revalidate_inode(server, inode); 3361 ret = nfs_revalidate_inode(server, inode);
3360 if (ret < 0) 3362 if (ret < 0)
3361 return ret; 3363 return ret;
3364 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
3365 nfs_zap_acl_cache(inode);
3362 ret = nfs4_read_cached_acl(inode, buf, buflen); 3366 ret = nfs4_read_cached_acl(inode, buf, buflen);
3363 if (ret != -ENOENT) 3367 if (ret != -ENOENT)
3364 return ret; 3368 return ret;
@@ -3387,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3387 nfs_inode_return_delegation(inode); 3391 nfs_inode_return_delegation(inode);
3388 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3392 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3389 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3393 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3394 /*
3395 * Acl update can result in inode attribute update.
3396 * so mark the attribute cache invalid.
3397 */
3398 spin_lock(&inode->i_lock);
3399 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
3400 spin_unlock(&inode->i_lock);
3390 nfs_access_zap_cache(inode); 3401 nfs_access_zap_cache(inode);
3391 nfs_zap_acl_cache(inode); 3402 nfs_zap_acl_cache(inode);
3392 return ret; 3403 return ret;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f313c4cce7e4..9f1826b012e6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4518 xdr_read_pages(xdr, pglen); 4518 xdr_read_pages(xdr, pglen);
4519 4519
4520 4520
4521 return 0; 4521 return pglen;
4522} 4522}
4523 4523
4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) 4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) 6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6209 entry->ino = entry->fattr->fileid; 6209 entry->ino = entry->fattr->fileid;
6210 6210
6211 entry->d_type = DT_UNKNOWN;
6212 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6213 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6214
6211 if (verify_attr_len(xdr, p, len) < 0) 6215 if (verify_attr_len(xdr, p, len) < 0)
6212 goto out_overflow; 6216 goto out_overflow;
6213 6217
@@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6221 6225
6222out_overflow: 6226out_overflow:
6223 print_overflow_msg(__func__, xdr); 6227 print_overflow_msg(__func__, xdr);
6224 return ERR_PTR(-EIO); 6228 return ERR_PTR(-EAGAIN);
6225} 6229}
6226 6230
6227/* 6231/*
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63db..b68536cc9046 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
115{ 115{
116 if (!nfs_lock_request_dontget(req)) 116 if (!nfs_lock_request_dontget(req))
117 return 0; 117 return 0;
118 if (req->wb_page != NULL) 118 if (test_bit(PG_MAPPED, &req->wb_flags))
119 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 119 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
120 return 1; 120 return 1;
121} 121}
@@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
125 */ 125 */
126void nfs_clear_page_tag_locked(struct nfs_page *req) 126void nfs_clear_page_tag_locked(struct nfs_page *req)
127{ 127{
128 if (req->wb_page != NULL) { 128 if (test_bit(PG_MAPPED, &req->wb_flags)) {
129 struct inode *inode = req->wb_context->path.dentry->d_inode; 129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode); 130 struct nfs_inode *nfsi = NFS_I(inode);
131 131
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6e..aedcaa7f291f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
153 req->wb_bytes, 153 req->wb_bytes,
154 (long long)req_offset(req)); 154 (long long)req_offset(req));
155 nfs_clear_request(req);
156 nfs_release_request(req); 155 nfs_release_request(req);
157} 156}
158 157
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0a42e8f4adcb..4100630c9a5b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
39#include <linux/nfs_mount.h> 39#include <linux/nfs_mount.h>
40#include <linux/nfs4_mount.h> 40#include <linux/nfs4_mount.h>
41#include <linux/lockd/bind.h> 41#include <linux/lockd/bind.h>
42#include <linux/smp_lock.h>
43#include <linux/seq_file.h> 42#include <linux/seq_file.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
45#include <linux/mnt_namespace.h> 44#include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@
67 66
68#define NFSDBG_FACILITY NFSDBG_VFS 67#define NFSDBG_FACILITY NFSDBG_VFS
69 68
69#ifdef CONFIG_NFS_V3
70#define NFS_DEFAULT_VERSION 3
71#else
72#define NFS_DEFAULT_VERSION 2
73#endif
74
70enum { 75enum {
71 /* Mount options that take no arguments */ 76 /* Mount options that take no arguments */
72 Opt_soft, Opt_hard, 77 Opt_soft, Opt_hard,
@@ -1064,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw,
1064 mnt->flags |= NFS_MOUNT_VER3; 1069 mnt->flags |= NFS_MOUNT_VER3;
1065 mnt->version = 3; 1070 mnt->version = 3;
1066 break; 1071 break;
1067#ifdef CONFIG_NFS_V4
1068 case Opt_v4: 1072 case Opt_v4:
1069 mnt->flags &= ~NFS_MOUNT_VER3; 1073 mnt->flags &= ~NFS_MOUNT_VER3;
1070 mnt->version = 4; 1074 mnt->version = 4;
1071 break; 1075 break;
1072#endif
1073 case Opt_udp: 1076 case Opt_udp:
1074 mnt->flags &= ~NFS_MOUNT_TCP; 1077 mnt->flags &= ~NFS_MOUNT_TCP;
1075 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1078 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1281,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw,
1281 mnt->flags |= NFS_MOUNT_VER3; 1284 mnt->flags |= NFS_MOUNT_VER3;
1282 mnt->version = 3; 1285 mnt->version = 3;
1283 break; 1286 break;
1284#ifdef CONFIG_NFS_V4
1285 case NFS4_VERSION: 1287 case NFS4_VERSION:
1286 mnt->flags &= ~NFS_MOUNT_VER3; 1288 mnt->flags &= ~NFS_MOUNT_VER3;
1287 mnt->version = 4; 1289 mnt->version = 4;
1288 break; 1290 break;
1289#endif
1290 default: 1291 default:
1291 goto out_invalid_value; 1292 goto out_invalid_value;
1292 } 1293 }
@@ -2277,7 +2278,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2277 }; 2278 };
2278 int error = -ENOMEM; 2279 int error = -ENOMEM;
2279 2280
2280 data = nfs_alloc_parsed_mount_data(3); 2281 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2281 mntfh = nfs_alloc_fhandle(); 2282 mntfh = nfs_alloc_fhandle();
2282 if (data == NULL || mntfh == NULL) 2283 if (data == NULL || mntfh == NULL)
2283 goto out_free_fh; 2284 goto out_free_fh;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a5276..10d648ea128b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
390 if (nfs_have_delegation(inode, FMODE_WRITE)) 390 if (nfs_have_delegation(inode, FMODE_WRITE))
391 nfsi->change_attr++; 391 nfsi->change_attr++;
392 } 392 }
393 set_bit(PG_MAPPED, &req->wb_flags);
393 SetPagePrivate(req->wb_page); 394 SetPagePrivate(req->wb_page);
394 set_page_private(req->wb_page, (unsigned long)req); 395 set_page_private(req->wb_page, (unsigned long)req);
395 nfsi->npages++; 396 nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
415 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
416 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
417 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
419 clear_bit(PG_MAPPED, &req->wb_flags);
418 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
419 nfsi->npages--; 421 nfsi->npages--;
420 if (!nfsi->npages) { 422 if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
422 iput(inode); 424 iput(inode);
423 } else 425 } else
424 spin_unlock(&inode->i_lock); 426 spin_unlock(&inode->i_lock);
425 nfs_clear_request(req);
426 nfs_release_request(req); 427 nfs_release_request(req);
427} 428}
428 429
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a9..7e84a852cdae 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
261 &fhp->fh_post_attr); 261 &fhp->fh_post_attr);
262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; 262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
263 if (err) 263 if (err) {
264 fhp->fh_post_saved = 0; 264 fhp->fh_post_saved = 0;
265 else 265 /* Grab the ctime anyway - set_change_info might use it */
266 fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
267 } else
266 fhp->fh_post_saved = 1; 268 fhp->fh_post_saved = 1;
267} 269}
268 270
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 56347e0ac88d..116cab970e0f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
673 spin_unlock(&clp->cl_lock); 673 spin_unlock(&clp->cl_lock);
674} 674}
675 675
676static void nfsd4_register_conn(struct nfsd4_conn *conn) 676static int nfsd4_register_conn(struct nfsd4_conn *conn)
677{ 677{
678 conn->cn_xpt_user.callback = nfsd4_conn_lost; 678 conn->cn_xpt_user.callback = nfsd4_conn_lost;
679 register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 679 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
680} 680}
681 681
682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) 682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
683{ 683{
684 struct nfsd4_conn *conn; 684 struct nfsd4_conn *conn;
685 u32 flags = NFS4_CDFC4_FORE; 685 u32 flags = NFS4_CDFC4_FORE;
686 int ret;
686 687
687 if (ses->se_flags & SESSION4_BACK_CHAN) 688 if (ses->se_flags & SESSION4_BACK_CHAN)
688 flags |= NFS4_CDFC4_BACK; 689 flags |= NFS4_CDFC4_BACK;
@@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
690 if (!conn) 691 if (!conn)
691 return nfserr_jukebox; 692 return nfserr_jukebox;
692 nfsd4_hash_conn(conn, ses); 693 nfsd4_hash_conn(conn, ses);
693 nfsd4_register_conn(conn); 694 ret = nfsd4_register_conn(conn);
695 if (ret)
696 /* oops; xprt is already down: */
697 nfsd4_conn_lost(&conn->cn_xpt_user);
694 return nfs_ok; 698 return nfs_ok;
695} 699}
696 700
@@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1644{ 1648{
1645 struct nfs4_client *clp = ses->se_client; 1649 struct nfs4_client *clp = ses->se_client;
1646 struct nfsd4_conn *c; 1650 struct nfsd4_conn *c;
1651 int ret;
1647 1652
1648 spin_lock(&clp->cl_lock); 1653 spin_lock(&clp->cl_lock);
1649 c = __nfsd4_find_conn(new->cn_xprt, ses); 1654 c = __nfsd4_find_conn(new->cn_xprt, ses);
@@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1654 } 1659 }
1655 __nfsd4_hash_conn(new, ses); 1660 __nfsd4_hash_conn(new, ses);
1656 spin_unlock(&clp->cl_lock); 1661 spin_unlock(&clp->cl_lock);
1657 nfsd4_register_conn(new); 1662 ret = nfsd4_register_conn(new);
1663 if (ret)
1664 /* oops; xprt is already down: */
1665 nfsd4_conn_lost(&new->cn_xpt_user);
1658 return; 1666 return;
1659} 1667}
1660 1668
@@ -2254,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2254 * Spawn a thread to perform a recall on the delegation represented 2262 * Spawn a thread to perform a recall on the delegation represented
2255 * by the lease (file_lock) 2263 * by the lease (file_lock)
2256 * 2264 *
2257 * Called from break_lease() with lock_kernel() held. 2265 * Called from break_lease() with lock_flocks() held.
2258 * Note: we assume break_lease will only call this *once* for any given 2266 * Note: we assume break_lease will only call this *once* for any given
2259 * lease. 2267 * lease.
2260 */ 2268 */
@@ -2278,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2278 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2286 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2279 spin_unlock(&recall_lock); 2287 spin_unlock(&recall_lock);
2280 2288
2281 /* only place dl_time is set. protected by lock_kernel*/ 2289 /* only place dl_time is set. protected by lock_flocks*/
2282 dp->dl_time = get_seconds(); 2290 dp->dl_time = get_seconds();
2283 2291
2284 /* 2292 /*
@@ -2295,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2295/* 2303/*
2296 * The file_lock is being reapd. 2304 * The file_lock is being reapd.
2297 * 2305 *
2298 * Called by locks_free_lock() with lock_kernel() held. 2306 * Called by locks_free_lock() with lock_flocks() held.
2299 */ 2307 */
2300static 2308static
2301void nfsd_release_deleg_cb(struct file_lock *fl) 2309void nfsd_release_deleg_cb(struct file_lock *fl)
@@ -2310,23 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
2310} 2318}
2311 2319
2312/* 2320/*
2313 * Set the delegation file_lock back pointer. 2321 * Called from setlease() with lock_flocks() held
2314 *
2315 * Called from setlease() with lock_kernel() held.
2316 */
2317static
2318void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
2319{
2320 struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
2321
2322 dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
2323 if (!dp)
2324 return;
2325 dp->dl_flock = new;
2326}
2327
2328/*
2329 * Called from setlease() with lock_kernel() held
2330 */ 2322 */
2331static 2323static
2332int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) 2324int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
@@ -2355,7 +2347,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2355static const struct lock_manager_operations nfsd_lease_mng_ops = { 2347static const struct lock_manager_operations nfsd_lease_mng_ops = {
2356 .fl_break = nfsd_break_deleg_cb, 2348 .fl_break = nfsd_break_deleg_cb,
2357 .fl_release_private = nfsd_release_deleg_cb, 2349 .fl_release_private = nfsd_release_deleg_cb,
2358 .fl_copy_lock = nfsd_copy_lock_deleg_cb,
2359 .fl_mylease = nfsd_same_client_deleg_cb, 2350 .fl_mylease = nfsd_same_client_deleg_cb,
2360 .fl_change = nfsd_change_deleg_cb, 2351 .fl_change = nfsd_change_deleg_cb,
2361}; 2352};
@@ -2661,12 +2652,15 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2661 fl->fl_file = find_readable_file(stp->st_file); 2652 fl->fl_file = find_readable_file(stp->st_file);
2662 BUG_ON(!fl->fl_file); 2653 BUG_ON(!fl->fl_file);
2663 fl->fl_pid = current->tgid; 2654 fl->fl_pid = current->tgid;
2655 dp->dl_flock = fl;
2664 2656
2665 /* vfs_setlease checks to see if delegation should be handed out. 2657 /* vfs_setlease checks to see if delegation should be handed out.
2666 * the lock_manager callbacks fl_mylease and fl_change are used 2658 * the lock_manager callbacks fl_mylease and fl_change are used
2667 */ 2659 */
2668 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { 2660 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2669 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2661 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2662 dp->dl_flock = NULL;
2663 locks_free_lock(fl);
2670 unhash_delegation(dp); 2664 unhash_delegation(dp);
2671 flag = NFS4_OPEN_DELEGATE_NONE; 2665 flag = NFS4_OPEN_DELEGATE_NONE;
2672 goto out; 2666 goto out;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae6..60fce3dc5cb5 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
484static inline void 484static inline void
485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) 485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
486{ 486{
487 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); 487 BUG_ON(!fhp->fh_pre_saved);
488 cinfo->atomic = 1; 488 cinfo->atomic = fhp->fh_post_saved;
489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); 489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
490 if (cinfo->change_supported) { 490
491 cinfo->before_change = fhp->fh_pre_change; 491 cinfo->before_change = fhp->fh_pre_change;
492 cinfo->after_change = fhp->fh_post_change; 492 cinfo->after_change = fhp->fh_post_change;
493 } else { 493 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
494 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; 494 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
495 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; 495 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
496 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; 496 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
497 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; 497
498 }
499} 498}
500 499
501int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); 500int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 49c844dab33a..59e5fe742f7b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
335 * the device at this point. 335 * the device at this point.
336 * 336 *
337 * To prevent nilfs_dat_translate() from returning the 337 * To prevent nilfs_dat_translate() from returning the
338 * uncommited block number, this makes a copy of the entry 338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy. 339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */ 340 */
341 if (!buffer_nilfs_redirected(entry_bh)) { 341 if (!buffer_nilfs_redirected(entry_bh)) {
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c4..caf9a6a3fb54 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
176int nilfs_init_gcinode(struct inode *inode) 176int nilfs_init_gcinode(struct inode *inode)
177{ 177{
178 struct nilfs_inode_info *ii = NILFS_I(inode); 178 struct nilfs_inode_info *ii = NILFS_I(inode);
179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
180 179
181 inode->i_mode = S_IFREG; 180 inode->i_mode = S_IFREG;
182 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 181 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
186 ii->i_flags = 0; 185 ii->i_flags = 0;
187 nilfs_bmap_init_gc(ii->i_bmap); 186 nilfs_bmap_init_gc(ii->i_bmap);
188 187
189 /*
190 * Add the inode to GC inode list. Garbage Collection
191 * is serialized and no two processes manipulate the
192 * list simultaneously.
193 */
194 igrab(inode);
195 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
196
197 return 0; 188 return 0;
198} 189}
199 190
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3e90f86d5bfe..b185e937a335 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
337 struct nilfs_argv *argv, void *buf) 337 struct nilfs_argv *argv, void *buf)
338{ 338{
339 size_t nmembs = argv->v_nmembs; 339 size_t nmembs = argv->v_nmembs;
340 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
340 struct inode *inode; 341 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 342 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 343 struct buffer_head *bh, *n;
@@ -349,10 +350,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
349 ino = vdesc->vd_ino; 350 ino = vdesc->vd_ino;
350 cno = vdesc->vd_cno; 351 cno = vdesc->vd_cno;
351 inode = nilfs_iget_for_gc(sb, ino, cno); 352 inode = nilfs_iget_for_gc(sb, ino, cno);
352 if (unlikely(inode == NULL)) { 353 if (IS_ERR(inode)) {
353 ret = -ENOMEM; 354 ret = PTR_ERR(inode);
354 goto failed; 355 goto failed;
355 } 356 }
357 if (list_empty(&NILFS_I(inode)->i_dirty)) {
358 /*
359 * Add the inode to GC inode list. Garbage Collection
360 * is serialized and no two processes manipulate the
361 * list simultaneously.
362 */
363 igrab(inode);
364 list_add(&NILFS_I(inode)->i_dirty,
365 &nilfs->ns_gc_inodes);
366 }
367
356 do { 368 do {
357 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 369 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
358 &buffers); 370 &buffers);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09e..f35794b97e8e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
92 92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94 94
95 wait_event(group->fanotify_data.access_waitq, event->response); 95 wait_event(group->fanotify_data.access_waitq, event->response ||
96 atomic_read(&group->fanotify_data.bypass_perm));
97
98 if (!event->response) /* bypass_perm set */
99 return 0;
96 100
97 /* userspace responded, convert to something usable */ 101 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock); 102 spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7e..8b61220cffc5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
106 return client_fd; 106 return client_fd;
107} 107}
108 108
109static ssize_t fill_event_metadata(struct fsnotify_group *group, 109static int fill_event_metadata(struct fsnotify_group *group,
110 struct fanotify_event_metadata *metadata, 110 struct fanotify_event_metadata *metadata,
111 struct fsnotify_event *event) 111 struct fsnotify_event *event)
112{ 112{
113 int ret = 0;
114
113 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
114 group, metadata, event); 116 group, metadata, event);
115 117
116 metadata->event_len = FAN_EVENT_METADATA_LEN; 118 metadata->event_len = FAN_EVENT_METADATA_LEN;
119 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
117 metadata->vers = FANOTIFY_METADATA_VERSION; 120 metadata->vers = FANOTIFY_METADATA_VERSION;
118 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 121 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
119 metadata->pid = pid_vnr(event->tgid); 122 metadata->pid = pid_vnr(event->tgid);
120 metadata->fd = create_fd(group, event); 123 if (unlikely(event->mask & FAN_Q_OVERFLOW))
124 metadata->fd = FAN_NOFD;
125 else {
126 metadata->fd = create_fd(group, event);
127 if (metadata->fd < 0)
128 ret = metadata->fd;
129 }
121 130
122 return metadata->fd; 131 return ret;
123} 132}
124 133
125#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 134#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
200 209
201 mutex_lock(&group->fanotify_data.access_mutex); 210 mutex_lock(&group->fanotify_data.access_mutex);
202 211
203 if (group->fanotify_data.bypass_perm) { 212 if (atomic_read(&group->fanotify_data.bypass_perm)) {
204 mutex_unlock(&group->fanotify_data.access_mutex); 213 mutex_unlock(&group->fanotify_data.access_mutex);
205 kmem_cache_free(fanotify_response_event_cache, re); 214 kmem_cache_free(fanotify_response_event_cache, re);
206 event->response = FAN_ALLOW; 215 event->response = FAN_ALLOW;
@@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
257 266
258 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 267 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
259 268
260 fd = fill_event_metadata(group, &fanotify_event_metadata, event); 269 ret = fill_event_metadata(group, &fanotify_event_metadata, event);
261 if (fd < 0) 270 if (ret < 0)
262 return fd; 271 goto out;
263 272
273 fd = fanotify_event_metadata.fd;
264 ret = prepare_for_access_response(group, event, fd); 274 ret = prepare_for_access_response(group, event, fd);
265 if (ret) 275 if (ret)
266 goto out_close_fd; 276 goto out_close_fd;
267 277
268 ret = -EFAULT; 278 ret = -EFAULT;
269 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) 279 if (copy_to_user(buf, &fanotify_event_metadata,
280 fanotify_event_metadata.event_len))
270 goto out_kill_access_response; 281 goto out_kill_access_response;
271 282
272 return FAN_EVENT_METADATA_LEN; 283 return fanotify_event_metadata.event_len;
273 284
274out_kill_access_response: 285out_kill_access_response:
275 remove_access_response(group, event, fd); 286 remove_access_response(group, event, fd);
276out_close_fd: 287out_close_fd:
277 sys_close(fd); 288 if (fd != FAN_NOFD)
289 sys_close(fd);
290out:
291#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
292 if (event->mask & FAN_ALL_PERM_EVENTS) {
293 event->response = FAN_DENY;
294 wake_up(&group->fanotify_data.access_waitq);
295 }
296#endif
278 return ret; 297 return ret;
279} 298}
280 299
@@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
382 401
383 mutex_lock(&group->fanotify_data.access_mutex); 402 mutex_lock(&group->fanotify_data.access_mutex);
384 403
385 group->fanotify_data.bypass_perm = true; 404 atomic_inc(&group->fanotify_data.bypass_perm);
386 405
387 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 406 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
388 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 407 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
586{ 605{
587 struct fsnotify_mark *fsn_mark; 606 struct fsnotify_mark *fsn_mark;
588 __u32 added; 607 __u32 added;
608 int ret = 0;
589 609
590 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 610 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
591 if (!fsn_mark) { 611 if (!fsn_mark) {
592 int ret;
593
594 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 612 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
595 return -ENOSPC; 613 return -ENOSPC;
596 614
@@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
600 618
601 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 619 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
602 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); 620 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
603 if (ret) { 621 if (ret)
604 fanotify_free_mark(fsn_mark); 622 goto err;
605 return ret;
606 }
607 } 623 }
608 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 624 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
609 fsnotify_put_mark(fsn_mark); 625
610 if (added & ~mnt->mnt_fsnotify_mask) 626 if (added & ~mnt->mnt_fsnotify_mask)
611 fsnotify_recalc_vfsmount_mask(mnt); 627 fsnotify_recalc_vfsmount_mask(mnt);
612 628err:
613 return 0; 629 fsnotify_put_mark(fsn_mark);
630 return ret;
614} 631}
615 632
616static int fanotify_add_inode_mark(struct fsnotify_group *group, 633static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
619{ 636{
620 struct fsnotify_mark *fsn_mark; 637 struct fsnotify_mark *fsn_mark;
621 __u32 added; 638 __u32 added;
639 int ret = 0;
622 640
623 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 641 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
624 642
@@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
634 652
635 fsn_mark = fsnotify_find_inode_mark(group, inode); 653 fsn_mark = fsnotify_find_inode_mark(group, inode);
636 if (!fsn_mark) { 654 if (!fsn_mark) {
637 int ret;
638
639 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 655 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
640 return -ENOSPC; 656 return -ENOSPC;
641 657
@@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
645 661
646 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 662 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
647 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); 663 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
648 if (ret) { 664 if (ret)
649 fanotify_free_mark(fsn_mark); 665 goto err;
650 return ret;
651 }
652 } 666 }
653 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 667 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
654 fsnotify_put_mark(fsn_mark); 668
655 if (added & ~inode->i_fsnotify_mask) 669 if (added & ~inode->i_fsnotify_mask)
656 fsnotify_recalc_inode_mask(inode); 670 fsnotify_recalc_inode_mask(inode);
657 return 0; 671err:
672 fsnotify_put_mark(fsn_mark);
673 return ret;
658} 674}
659 675
660/* fanotify syscalls */ 676/* fanotify syscalls */
@@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
687 703
688 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 704 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
689 group = fsnotify_alloc_group(&fanotify_fsnotify_ops); 705 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
690 if (IS_ERR(group)) 706 if (IS_ERR(group)) {
707 free_uid(user);
691 return PTR_ERR(group); 708 return PTR_ERR(group);
709 }
692 710
693 group->fanotify_data.user = user; 711 group->fanotify_data.user = user;
694 atomic_inc(&user->fanotify_listeners); 712 atomic_inc(&user->fanotify_listeners);
@@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
698 mutex_init(&group->fanotify_data.access_mutex); 716 mutex_init(&group->fanotify_data.access_mutex);
699 init_waitqueue_head(&group->fanotify_data.access_waitq); 717 init_waitqueue_head(&group->fanotify_data.access_waitq);
700 INIT_LIST_HEAD(&group->fanotify_data.access_list); 718 INIT_LIST_HEAD(&group->fanotify_data.access_list);
719 atomic_set(&group->fanotify_data.bypass_perm, 0);
701#endif 720#endif
702 switch (flags & FAN_ALL_CLASS_BITS) { 721 switch (flags & FAN_ALL_CLASS_BITS) {
703 case FAN_CLASS_NOTIF: 722 case FAN_CLASS_NOTIF:
@@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
764 if (flags & ~FAN_ALL_MARK_FLAGS) 783 if (flags & ~FAN_ALL_MARK_FLAGS)
765 return -EINVAL; 784 return -EINVAL;
766 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 785 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
767 case FAN_MARK_ADD: 786 case FAN_MARK_ADD: /* fallthrough */
768 case FAN_MARK_REMOVE: 787 case FAN_MARK_REMOVE:
788 if (!mask)
789 return -EINVAL;
769 case FAN_MARK_FLUSH: 790 case FAN_MARK_FLUSH:
770 break; 791 break;
771 default: 792 default:
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468c..4cd5d5d78f9f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
752 if (ret >= 0) 752 if (ret >= 0)
753 return ret; 753 return ret;
754 754
755 fsnotify_put_group(group);
755 atomic_dec(&user->inotify_devs); 756 atomic_dec(&user->inotify_devs);
756out_free_uid: 757out_free_uid:
757 free_uid(user); 758 free_uid(user);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e25..9f26ac9be2a4 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1964,8 +1964,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1964 if (reg == NULL) 1964 if (reg == NULL)
1965 return ERR_PTR(-ENOMEM); 1965 return ERR_PTR(-ENOMEM);
1966 1966
1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) 1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
1968 return ERR_PTR(-ENAMETOOLONG); 1968 ret = -ENAMETOOLONG;
1969 goto free;
1970 }
1969 1971
1970 spin_lock(&o2hb_live_lock); 1972 spin_lock(&o2hb_live_lock);
1971 reg->hr_region_num = 0; 1973 reg->hr_region_num = 0;
@@ -1974,7 +1976,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1974 O2NM_MAX_REGIONS); 1976 O2NM_MAX_REGIONS);
1975 if (reg->hr_region_num >= O2NM_MAX_REGIONS) { 1977 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1976 spin_unlock(&o2hb_live_lock); 1978 spin_unlock(&o2hb_live_lock);
1977 return ERR_PTR(-EFBIG); 1979 ret = -EFBIG;
1980 goto free;
1978 } 1981 }
1979 set_bit(reg->hr_region_num, o2hb_region_bitmap); 1982 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1980 } 1983 }
@@ -1986,10 +1989,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1986 ret = o2hb_debug_region_init(reg, o2hb_debug_dir); 1989 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
1987 if (ret) { 1990 if (ret) {
1988 config_item_put(&reg->hr_item); 1991 config_item_put(&reg->hr_item);
1989 return ERR_PTR(ret); 1992 goto free;
1990 } 1993 }
1991 1994
1992 return &reg->hr_item; 1995 return &reg->hr_item;
1996free:
1997 kfree(reg);
1998 return ERR_PTR(ret);
1993} 1999}
1994 2000
1995static void o2hb_heartbeat_group_drop_item(struct config_group *group, 2001static void o2hb_heartbeat_group_drop_item(struct config_group *group,
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index edaded48e7e9..895532ac4d98 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -476,7 +476,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
476 476
477out: 477out:
478 iput(inode); 478 iput(inode);
479 ocfs2_dentry_attach_gen(dentry);
480} 479}
481 480
482/* 481/*
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 58a93b953735..cc2aaa96cfe5 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -959,7 +959,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
959 r += O2HB_MAX_REGION_NAME_LEN; 959 r += O2HB_MAX_REGION_NAME_LEN;
960 } 960 }
961 961
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); 962 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
963 if (!local) { 963 if (!local) {
964 status = -ENOMEM; 964 status = -ENOMEM;
965 goto bail; 965 goto bail;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3bd..70dd3b1798f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
159 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
160 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
162 unsigned char l_level; 162 signed char l_level;
163 signed char l_requested;
164 signed char l_blocking;
163 165
164 /* Data packed - type enum ocfs2_lock_type */ 166 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type; 167 unsigned char l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
169 unsigned char l_action; 171 unsigned char l_action;
170 /* Data packed - enum type ocfs2_unlock_action */ 172 /* Data packed - enum type ocfs2_unlock_action */
171 unsigned char l_unlock_action; 173 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
174 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
175 175
176 spinlock_t l_lock; 176 spinlock_t l_lock;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 252e7c82f929..a5ebe421195f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
190 return c; 190 return c;
191 } 191 }
192 192
193 return c; 193 return NULL;
194} 194}
195 195
196/* 196/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f02c0ef31578..cfeab7ce3697 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/smp_lock.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ddb1f41376e5..911e61f348fc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -418,7 +418,7 @@ out_no_root:
418static struct dentry *openprom_mount(struct file_system_type *fs_type, 418static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data) 419 int flags, const char *dev_name, void *data)
420{ 420{
421 return mount_single(fs_type, flags, data, openprom_fill_super) 421 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 422}
423 423
424static struct file_system_type openprom_fs_type = { 424static struct file_system_type openprom_fs_type = {
diff --git a/fs/pipe.c b/fs/pipe.c
index a8012a955720..04629f36e397 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1199 return ret; 1199 return ret;
1200} 1200}
1201 1201
1202/*
1203 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1204 * location, so checking ->i_pipe is not enough to verify that this is a
1205 * pipe.
1206 */
1207struct pipe_inode_info *get_pipe_info(struct file *file)
1208{
1209 struct inode *i = file->f_path.dentry->d_inode;
1210
1211 return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
1212}
1213
1202long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1214long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1203{ 1215{
1204 struct pipe_inode_info *pipe; 1216 struct pipe_inode_info *pipe;
1205 long ret; 1217 long ret;
1206 1218
1207 pipe = file->f_path.dentry->d_inode->i_pipe; 1219 pipe = get_pipe_info(file);
1208 if (!pipe) 1220 if (!pipe)
1209 return -EBADF; 1221 return -EBADF;
1210 1222
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461ec..182845147fe4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1574,7 +1574,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1574 if (!tmp) 1574 if (!tmp)
1575 return -ENOMEM; 1575 return -ENOMEM;
1576 1576
1577 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); 1577 pathname = d_path(path, tmp, PAGE_SIZE);
1578 len = PTR_ERR(pathname); 1578 len = PTR_ERR(pathname);
1579 if (IS_ERR(pathname)) 1579 if (IS_ERR(pathname))
1580 goto out; 1580 goto out;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f484879..3ddb6068177c 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
16#include <linux/limits.h> 16#include <linux/limits.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 19#include <linux/sysctl.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22 21
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index da6b01d70f01..c126c83b9a45 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
706 * skip over unmapped regions. 706 * skip over unmapped regions.
707 */ 707 */
708#define PAGEMAP_WALK_SIZE (PMD_SIZE) 708#define PAGEMAP_WALK_SIZE (PMD_SIZE)
709#define PAGEMAP_WALK_MASK (PMD_MASK)
709static ssize_t pagemap_read(struct file *file, char __user *buf, 710static ssize_t pagemap_read(struct file *file, char __user *buf,
710 size_t count, loff_t *ppos) 711 size_t count, loff_t *ppos)
711{ 712{
@@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
776 unsigned long end; 777 unsigned long end;
777 778
778 pm.pos = 0; 779 pm.pos = 0;
779 end = start_vaddr + PAGEMAP_WALK_SIZE; 780 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
780 /* overflow ? */ 781 /* overflow ? */
781 if (end < start_vaddr || end > end_vaddr) 782 if (end < start_vaddr || end > end_vaddr)
782 end = end_vaddr; 783 end = end_vaddr;
diff --git a/fs/read_write.c b/fs/read_write.c
index 431a0ed610c8..5d431bacbea9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
9#include <linux/fcntl.h> 9#include <linux/fcntl.h>
10#include <linux/file.h> 10#include <linux/file.h>
11#include <linux/uio.h> 11#include <linux/uio.h>
12#include <linux/smp_lock.h>
13#include <linux/fsnotify.h> 12#include <linux/fsnotify.h>
14#include <linux/security.h> 13#include <linux/security.h>
15#include <linux/module.h> 14#include <linux/module.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 41656d40dc5c..0bae036831e2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
8#include <linux/reiserfs_acl.h> 8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h> 9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h> 10#include <linux/exportfs.h>
11#include <linux/smp_lock.h>
12#include <linux/pagemap.h> 11#include <linux/pagemap.h>
13#include <linux/highmem.h> 12#include <linux/highmem.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adf22b485cea..79265fdc317a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
9#include <linux/time.h> 9#include <linux/time.h>
10#include <asm/uaccess.h> 10#include <asm/uaccess.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/smp_lock.h>
13#include <linux/compat.h> 12#include <linux/compat.h>
14 13
15/* 14/*
@@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
184 return 0; 183 return 0;
185 } 184 }
186 185
187 /* we need to make sure nobody is changing the file size beneath
188 ** us
189 */
190 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
191 depth = reiserfs_write_lock_once(inode->i_sb); 186 depth = reiserfs_write_lock_once(inode->i_sb);
192 187
188 /* we need to make sure nobody is changing the file size beneath us */
189 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
190
193 write_from = inode->i_size & (blocksize - 1); 191 write_from = inode->i_size & (blocksize - 1);
194 /* if we are on a block boundary, we are already unpacked. */ 192 /* if we are on a block boundary, we are already unpacked. */
195 if (write_from == 0) { 193 if (write_from == 0) {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b194682..d31bce1a9f90 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
43#include <linux/fcntl.h> 43#include <linux/fcntl.h>
44#include <linux/stat.h> 44#include <linux/stat.h>
45#include <linux/string.h> 45#include <linux/string.h>
46#include <linux/smp_lock.h>
47#include <linux/buffer_head.h> 46#include <linux/buffer_head.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3bf7a6457f4d..b243117b8752 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h> 30#include <linux/crc32.h>
31#include <linux/smp_lock.h>
32 31
33struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
34 33
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a28..90d2fcb67a31 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
472 struct reiserfs_transaction_handle th; 472 struct reiserfs_transaction_handle th;
473 size_t size = reiserfs_xattr_nblocks(inode, 473 size_t size = reiserfs_xattr_nblocks(inode,
474 reiserfs_acl_size(clone->a_count)); 474 reiserfs_acl_size(clone->a_count));
475 reiserfs_write_lock(inode->i_sb); 475 int depth;
476
477 depth = reiserfs_write_lock_once(inode->i_sb);
476 error = journal_begin(&th, inode->i_sb, size * 2); 478 error = journal_begin(&th, inode->i_sb, size * 2);
477 if (!error) { 479 if (!error) {
478 int error2; 480 int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
482 if (error2) 484 if (error2)
483 error = error2; 485 error = error2;
484 } 486 }
485 reiserfs_write_unlock(inode->i_sb); 487 reiserfs_write_unlock_once(inode->i_sb, depth);
486 } 488 }
487 posix_acl_release(clone); 489 posix_acl_release(clone);
488 return error; 490 return error;
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f0..ce2f02579e35 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1312 struct pipe_inode_info *opipe, 1312 struct pipe_inode_info *opipe,
1313 size_t len, unsigned int flags); 1313 size_t len, unsigned int flags);
1314/*
1315 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1316 * location, so checking ->i_pipe is not enough to verify that this is a
1317 * pipe.
1318 */
1319static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1320{
1321 if (S_ISFIFO(inode->i_mode))
1322 return inode->i_pipe;
1323
1324 return NULL;
1325}
1326 1314
1327/* 1315/*
1328 * Determine where to splice to/from. 1316 * Determine where to splice to/from.
@@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1336 loff_t offset, *off; 1324 loff_t offset, *off;
1337 long ret; 1325 long ret;
1338 1326
1339 ipipe = pipe_info(in->f_path.dentry->d_inode); 1327 ipipe = get_pipe_info(in);
1340 opipe = pipe_info(out->f_path.dentry->d_inode); 1328 opipe = get_pipe_info(out);
1341 1329
1342 if (ipipe && opipe) { 1330 if (ipipe && opipe) {
1343 if (off_in || off_out) 1331 if (off_in || off_out)
@@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1555 int error; 1543 int error;
1556 long ret; 1544 long ret;
1557 1545
1558 pipe = pipe_info(file->f_path.dentry->d_inode); 1546 pipe = get_pipe_info(file);
1559 if (!pipe) 1547 if (!pipe)
1560 return -EBADF; 1548 return -EBADF;
1561 1549
@@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1642 }; 1630 };
1643 long ret; 1631 long ret;
1644 1632
1645 pipe = pipe_info(file->f_path.dentry->d_inode); 1633 pipe = get_pipe_info(file);
1646 if (!pipe) 1634 if (!pipe)
1647 return -EBADF; 1635 return -EBADF;
1648 1636
@@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
2022static long do_tee(struct file *in, struct file *out, size_t len, 2010static long do_tee(struct file *in, struct file *out, size_t len,
2023 unsigned int flags) 2011 unsigned int flags)
2024{ 2012{
2025 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 2013 struct pipe_inode_info *ipipe = get_pipe_info(in);
2026 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 2014 struct pipe_inode_info *opipe = get_pipe_info(out);
2027 int ret = -EINVAL; 2015 int ret = -EINVAL;
2028 2016
2029 /* 2017 /*
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd7..691f61223ed6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -934,7 +934,6 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 934 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 935 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 936 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 937
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 938 if (!xfs_is_delayed_page(page, IO_DELAY))
940 goto out_invalidate; 939 goto out_invalidate;
@@ -949,58 +948,14 @@ xfs_aops_discard_page(
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 948 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 949 bh = head = page_buffers(page);
951 do { 950 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 951 int error;
957 xfs_fsblock_t firstblock; 952 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 953
960 if (!buffer_delay(bh)) 954 if (!buffer_delay(bh))
961 goto next_buffer; 955 goto next_buffer;
962 956
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 957 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 958 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 959 if (error) {
1005 /* something screwed, just bail */ 960 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 961 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
1010 break; 965 break;
1011 } 966 }
1012next_buffer: 967next_buffer:
1013 offset += len; 968 offset += 1 << inode->i_blkbits;
1014 969
1015 } while ((bh = bh->b_this_page) != head); 970 } while ((bh = bh->b_this_page) != head);
1016 971
@@ -1111,11 +1066,12 @@ xfs_vm_writepage(
1111 uptodate = 0; 1066 uptodate = 0;
1112 1067
1113 /* 1068 /*
1114 * A hole may still be marked uptodate because discard_buffer 1069 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 1070 * of their state. The dirty state however is entirely
1071 * meaningless for holes (!mapped && uptodate), so skip
1072 * buffers covering holes here.
1116 */ 1073 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 1074 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 1075 imap_valid = 0;
1120 continue; 1076 continue;
1121 } 1077 }
@@ -1504,11 +1460,42 @@ xfs_vm_write_failed(
1504 struct inode *inode = mapping->host; 1460 struct inode *inode = mapping->host;
1505 1461
1506 if (to > inode->i_size) { 1462 if (to > inode->i_size) {
1507 struct iattr ia = { 1463 /*
1508 .ia_valid = ATTR_SIZE | ATTR_FORCE, 1464 * punch out the delalloc blocks we have already allocated. We
1509 .ia_size = inode->i_size, 1465 * don't call xfs_setattr() to do this as we may be in the
1510 }; 1466 * middle of a multi-iovec write and so the vfs inode->i_size
1511 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); 1467 * will not match the xfs ip->i_size and so it will zero too
1468 * much. Hence we jus truncate the page cache to zero what is
1469 * necessary and punch the delalloc blocks directly.
1470 */
1471 struct xfs_inode *ip = XFS_I(inode);
1472 xfs_fileoff_t start_fsb;
1473 xfs_fileoff_t end_fsb;
1474 int error;
1475
1476 truncate_pagecache(inode, to, inode->i_size);
1477
1478 /*
1479 * Check if there are any blocks that are outside of i_size
1480 * that need to be trimmed back.
1481 */
1482 start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
1483 end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
1484 if (end_fsb <= start_fsb)
1485 return;
1486
1487 xfs_ilock(ip, XFS_ILOCK_EXCL);
1488 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1489 end_fsb - start_fsb);
1490 if (error) {
1491 /* something screwed, just bail */
1492 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1493 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1494 "xfs_vm_write_failed: unable to clean up ino %lld",
1495 ip->i_ino);
1496 }
1497 }
1498 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1512 } 1499 }
1513} 1500}
1514 1501
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb57..4c5deb6e9e31 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -488,29 +488,16 @@ found:
488 spin_unlock(&pag->pag_buf_lock); 488 spin_unlock(&pag->pag_buf_lock);
489 xfs_perag_put(pag); 489 xfs_perag_put(pag);
490 490
491 /* Attempt to get the semaphore without sleeping, 491 if (xfs_buf_cond_lock(bp)) {
492 * if this does not work then we need to drop the 492 /* failed, so wait for the lock if requested. */
493 * spinlock and do a hard attempt on the semaphore.
494 */
495 if (down_trylock(&bp->b_sema)) {
496 if (!(flags & XBF_TRYLOCK)) { 493 if (!(flags & XBF_TRYLOCK)) {
497 /* wait for buffer ownership */
498 xfs_buf_lock(bp); 494 xfs_buf_lock(bp);
499 XFS_STATS_INC(xb_get_locked_waited); 495 XFS_STATS_INC(xb_get_locked_waited);
500 } else { 496 } else {
501 /* We asked for a trylock and failed, no need
502 * to look at file offset and length here, we
503 * know that this buffer at least overlaps our
504 * buffer and is locked, therefore our buffer
505 * either does not exist, or is this buffer.
506 */
507 xfs_buf_rele(bp); 497 xfs_buf_rele(bp);
508 XFS_STATS_INC(xb_busy_locked); 498 XFS_STATS_INC(xb_busy_locked);
509 return NULL; 499 return NULL;
510 } 500 }
511 } else {
512 /* trylock worked */
513 XB_SET_OWNER(bp);
514 } 501 }
515 502
516 if (bp->b_flags & XBF_STALE) { 503 if (bp->b_flags & XBF_STALE) {
@@ -876,10 +863,18 @@ xfs_buf_rele(
876 */ 863 */
877 864
878/* 865/*
879 * Locks a buffer object, if it is not already locked. 866 * Locks a buffer object, if it is not already locked. Note that this in
880 * Note that this in no way locks the underlying pages, so it is only 867 * no way locks the underlying pages, so it is only useful for
881 * useful for synchronizing concurrent use of buffer objects, not for 868 * synchronizing concurrent use of buffer objects, not for synchronizing
882 * synchronizing independent access to the underlying pages. 869 * independent access to the underlying pages.
870 *
871 * If we come across a stale, pinned, locked buffer, we know that we are
872 * being asked to lock a buffer that has been reallocated. Because it is
873 * pinned, we know that the log has not been pushed to disk and hence it
874 * will still be locked. Rather than continuing to have trylock attempts
875 * fail until someone else pushes the log, push it ourselves before
876 * returning. This means that the xfsaild will not get stuck trying
877 * to push on stale inode buffers.
883 */ 878 */
884int 879int
885xfs_buf_cond_lock( 880xfs_buf_cond_lock(
@@ -890,6 +885,8 @@ xfs_buf_cond_lock(
890 locked = down_trylock(&bp->b_sema) == 0; 885 locked = down_trylock(&bp->b_sema) == 0;
891 if (locked) 886 if (locked)
892 XB_SET_OWNER(bp); 887 XB_SET_OWNER(bp);
888 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
889 xfs_log_force(bp->b_target->bt_mount, 0);
893 890
894 trace_xfs_buf_cond_lock(bp, _RET_IP_); 891 trace_xfs_buf_cond_lock(bp, _RET_IP_);
895 return locked ? 0 : -EBUSY; 892 return locked ? 0 : -EBUSY;
@@ -1781,7 +1778,6 @@ xfs_buf_delwri_split(
1781 INIT_LIST_HEAD(list); 1778 INIT_LIST_HEAD(list);
1782 spin_lock(dwlk); 1779 spin_lock(dwlk);
1783 list_for_each_entry_safe(bp, n, dwq, b_list) { 1780 list_for_each_entry_safe(bp, n, dwq, b_list) {
1784 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1785 ASSERT(bp->b_flags & XBF_DELWRI); 1781 ASSERT(bp->b_flags & XBF_DELWRI);
1786 1782
1787 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1783 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1791,7 @@ xfs_buf_delwri_split(
1795 _XBF_RUN_QUEUES); 1791 _XBF_RUN_QUEUES);
1796 bp->b_flags |= XBF_WRITE; 1792 bp->b_flags |= XBF_WRITE;
1797 list_move_tail(&bp->b_list, list); 1793 list_move_tail(&bp->b_list, list);
1794 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1798 } else 1795 } else
1799 skipped++; 1796 skipped++;
1800 } 1797 }
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38e..ad442d9e392e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 416 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 417 return PTR_ERR(dentry);
418 418
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 419 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 420 if (!kbuf)
421 goto out_dput; 421 goto out_dput;
422 422
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c61..94d5fd6a2973 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -762,7 +762,8 @@ xfs_setup_inode(
762 inode->i_state = I_NEW; 762 inode->i_state = I_NEW;
763 763
764 inode_sb_list_add(inode); 764 inode_sb_list_add(inode);
765 insert_inode_hash(inode); 765 /* make the inode look hashed for the writeback code */
766 hlist_add_fake(&inode->i_hash);
766 767
767 inode->i_mode = ip->i_d.di_mode; 768 inode->i_mode = ip->i_d.di_mode;
768 inode->i_nlink = ip->i_d.di_nlink; 769 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 7465a7ffc4fd..c115dd5e95a8 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
353 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 353 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 355 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 cmn_err(CE_WARN,
357 "Enabling EXPERIMENTAL delayed logging feature "
358 "- use at your own risk.\n");
359 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
360 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
361 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981d..afb0d7cfad1c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -853,6 +853,7 @@ restart:
853 if (trylock) { 853 if (trylock) {
854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
855 skipped++; 855 skipped++;
856 xfs_perag_put(pag);
856 continue; 857 continue;
857 } 858 }
858 first_index = pag->pag_ici_reclaim_cursor; 859 first_index = pag->pag_ici_reclaim_cursor;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e13..4111cd3966c7 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5471,8 +5471,13 @@ xfs_getbmap(
5471 if (error) 5471 if (error)
5472 goto out_unlock_iolock; 5472 goto out_unlock_iolock;
5473 } 5473 }
5474 5474 /*
5475 ASSERT(ip->i_delayed_blks == 0); 5475 * even after flushing the inode, there can still be delalloc
5476 * blocks on the inode beyond EOF due to speculative
5477 * preallocation. These are not removed until the release
5478 * function is called or the inode is inactivated. Hence we
5479 * cannot assert here that ip->i_delayed_blks == 0.
5480 */
5476 } 5481 }
5477 5482
5478 lock = xfs_ilock_map_shared(ip); 5483 lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
6070 *count += xfs_bmbt_disk_get_blockcount(frp); 6075 *count += xfs_bmbt_disk_get_blockcount(frp);
6071 } 6076 }
6072} 6077}
6078
6079/*
6080 * dead simple method of punching delayed allocation blocks from a range in
6081 * the inode. Walks a block at a time so will be slow, but is only executed in
6082 * rare error cases so the overhead is not critical. This will always punch out
6083 * both the start and end blocks, even if the ranges only partially overlap
6084 * them, so it is up to the caller to ensure that partial blocks are not
6085 * passed in.
6086 */
6087int
6088xfs_bmap_punch_delalloc_range(
6089 struct xfs_inode *ip,
6090 xfs_fileoff_t start_fsb,
6091 xfs_fileoff_t length)
6092{
6093 xfs_fileoff_t remaining = length;
6094 int error = 0;
6095
6096 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6097
6098 do {
6099 int done;
6100 xfs_bmbt_irec_t imap;
6101 int nimaps = 1;
6102 xfs_fsblock_t firstblock;
6103 xfs_bmap_free_t flist;
6104
6105 /*
6106 * Map the range first and check that it is a delalloc extent
6107 * before trying to unmap the range. Otherwise we will be
6108 * trying to remove a real extent (which requires a
6109 * transaction) or a hole, which is probably a bad idea...
6110 */
6111 error = xfs_bmapi(NULL, ip, start_fsb, 1,
6112 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
6113 &nimaps, NULL);
6114
6115 if (error) {
6116 /* something screwed, just bail */
6117 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6118 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
6119 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6120 ip->i_ino, start_fsb);
6121 }
6122 break;
6123 }
6124 if (!nimaps) {
6125 /* nothing there */
6126 goto next_block;
6127 }
6128 if (imap.br_startblock != DELAYSTARTBLOCK) {
6129 /* been converted, ignore */
6130 goto next_block;
6131 }
6132 WARN_ON(imap.br_blockcount == 0);
6133
6134 /*
6135 * Note: while we initialise the firstblock/flist pair, they
6136 * should never be used because blocks should never be
6137 * allocated or freed for a delalloc extent and hence we don't need
6138 * to cancel or finish them after the xfs_bunmapi() call.
6139 */
6140 xfs_bmap_init(&flist, &firstblock);
6141 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6142 &flist, &done);
6143 if (error)
6144 break;
6145
6146 ASSERT(!flist.xbf_count && !flist.xbf_first);
6147next_block:
6148 start_fsb++;
6149 remaining--;
6150 } while(remaining > 0);
6151
6152 return error;
6153}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdfc..3651191daea1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
394 int whichfork, 394 int whichfork,
395 int *count); 395 int *count);
396 396
397int
398xfs_bmap_punch_delalloc_range(
399 struct xfs_inode *ip,
400 xfs_fileoff_t start_fsb,
401 xfs_fileoff_t length);
397#endif /* __KERNEL__ */ 402#endif /* __KERNEL__ */
398 403
399#endif /* __XFS_BMAP_H__ */ 404#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a22..e60490bc00a6 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
377 ip->i_d.di_format = tip->i_d.di_format; 377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp; 378 tip->i_d.di_format = tmp;
379 379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
385 * tracking beyond EOF so that when the fork is truncated away when the
386 * temporary inode is unlinked we don't underrun the i_delayed_blks
387 * counter on that inode.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
380 ilf_fields = XFS_ILOG_CORE; 393 ilf_fields = XFS_ILOG_CORE;
381 394
382 switch(ip->i_d.di_format) { 395 switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed9990267661..c78cc6a3d87c 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
58int xfs_etest[XFS_NUM_INJECT_ERROR]; 58int xfs_etest[XFS_NUM_INJECT_ERROR];
59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
61int xfs_error_test_active;
61 62
62int 63int
63xfs_error_test(int error_tag, int *fsidp, char *expression, 64xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
108 len = strlen(mp->m_fsname); 109 len = strlen(mp->m_fsname);
109 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); 110 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
110 strcpy(xfs_etest_fsname[i], mp->m_fsname); 111 strcpy(xfs_etest_fsname[i], mp->m_fsname);
112 xfs_error_test_active++;
111 return 0; 113 return 0;
112 } 114 }
113 } 115 }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
137 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
138 kmem_free(xfs_etest_fsname[i]); 140 kmem_free(xfs_etest_fsname[i]);
139 xfs_etest_fsname[i] = NULL; 141 xfs_etest_fsname[i] = NULL;
142 xfs_error_test_active--;
140 } 143 }
141 } 144 }
142 145
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb82..f338847f80b8 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
128 128
129#ifdef DEBUG 129#ifdef DEBUG
130extern int xfs_error_test_active;
130extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 131extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
131 132
132#define XFS_NUM_INJECT_ERROR 10 133#define XFS_NUM_INJECT_ERROR 10
133#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
134 ((expr) || \ 135 ((expr) || (xfs_error_test_active && \
135 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
136 (rf))) 137 (rf))))
137 138
138extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
139extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705df..7c8d30c453c3 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
657} 657}
658 658
659/* 659/*
660 * This is called to find out where the oldest active copy of the 660 * This is called to find out where the oldest active copy of the inode log
661 * inode log item in the on disk log resides now that the last log 661 * item in the on disk log resides now that the last log write of it completed
662 * write of it completed at the given lsn. Since we always re-log 662 * at the given lsn. Since we always re-log all dirty data in an inode, the
663 * all dirty data in an inode, the latest copy in the on disk log 663 * latest copy in the on disk log is the only one that matters. Therefore,
664 * is the only one that matters. Therefore, simply return the 664 * simply return the given lsn.
665 * given lsn. 665 *
666 * If the inode has been marked stale because the cluster is being freed, we
667 * don't want to (re-)insert this inode into the AIL. There is a race condition
668 * where the cluster buffer may be unpinned before the inode is inserted into
669 * the AIL during transaction committed processing. If the buffer is unpinned
670 * before the inode item has been committed and inserted, then it is possible
671 * for the buffer to be written and IO completed before the inode is inserted
672 * into the AIL. In that case, we'd be inserting a clean, stale inode into the
673 * AIL which will never get removed. It will, however, get reclaimed which
674 * triggers an assert in xfs_inode_free() complaining about freeing an inode
675 * still in the AIL.
676 *
677 * To avoid this, return a lower LSN than the one passed in so that the
678 * transaction committed code will not move the inode forward in the AIL but
679 * will still unpin it properly.
666 */ 680 */
667STATIC xfs_lsn_t 681STATIC xfs_lsn_t
668xfs_inode_item_committed( 682xfs_inode_item_committed(
669 struct xfs_log_item *lip, 683 struct xfs_log_item *lip,
670 xfs_lsn_t lsn) 684 xfs_lsn_t lsn)
671{ 685{
686 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
687 struct xfs_inode *ip = iip->ili_inode;
688
689 if (xfs_iflags_test(ip, XFS_ISTALE))
690 return lsn - 1;
672 return lsn; 691 return lsn;
673} 692}
674 693
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a399..19e9dfa1c254 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
275 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
276 spin_unlock(&mp->m_perag_lock); 276 spin_unlock(&mp->m_perag_lock);
277 ASSERT(pag); 277 ASSERT(pag);
278 ASSERT(atomic_read(&pag->pag_ref) == 0);
278 call_rcu(&pag->rcu_head, __xfs_free_perag); 279 call_rcu(&pag->rcu_head, __xfs_free_perag);
279 } 280 }
280} 281}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
356 struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a6..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
297 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
298 */ 298 */
299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
300 301
301 /* 302 /*
302 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when