From d9d318d39dd5cb686660504a3565aac453709ccc Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 30 Nov 2010 16:39:27 +0100
Subject: fuse: fix ioctl when server is 32bit

If a 32bit CUSE server is run on 64bit this results in EIO being
returned to the caller.

The reason is that FUSE_IOCTL_RETRY reply was defined to use 'struct
iovec', which is different on 32bit and 64bit archs.

Work around this by looking at the size of the reply to determine
which struct was used.  This is only needed if CONFIG_COMPAT is
defined.

A more permanent fix for the interface will be to use the same struct
on both 32bit and 64bit.

Reported-by: "ccmail111" <ccmail111@yahoo.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: Tejun Heo <tj@kernel.org>
CC: <stable@kernel.org>         [2.6.31+]
---
 fs/fuse/file.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9242d294fe90..0e2e25b114a6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 
 static const struct file_operations fuse_direct_io_file_operations;
 
@@ -1627,6 +1628,44 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 	return 0;
 }
 
+/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit.  Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
+				 size_t transferred, unsigned count,
+				 bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+	if (count * sizeof(struct compat_iovec) == transferred) {
+		struct compat_iovec *ciov = src;
+		unsigned i;
+
+		/*
+		 * With this interface a 32bit server cannot support
+		 * non-compat (i.e. ones coming from 64bit apps) ioctl
+		 * requests
+		 */
+		if (!is_compat)
+			return -EINVAL;
+
+		for (i = 0; i < count; i++) {
+			dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+			dst[i].iov_len = ciov[i].iov_len;
+		}
+		return 0;
+	}
+#endif
+
+	if (count * sizeof(struct iovec) != transferred)
+		return -EIO;
+
+	memcpy(dst, src, transferred);
+	return 0;
+}
+
 /*
  * For ioctls, there is no generic way to determine how much memory
  * needs to be read and/or written.  Furthermore, ioctls are allowed
@@ -1808,14 +1847,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
 			goto out;
 
-		err = -EIO;
-		if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
-			goto out;
-
-		/* okay, copy in iovs and retry */
 		vaddr = kmap_atomic(pages[0], KM_USER0);
-		memcpy(page_address(iov_page), vaddr, transferred);
+		err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
+					    transferred, in_iovs + out_iovs,
+					    (flags & FUSE_IOCTL_COMPAT) != 0);
 		kunmap_atomic(vaddr, KM_USER0);
+		if (err)
+			goto out;
 
 		in_iov = page_address(iov_page);
 		out_iov = in_iov + in_iovs;
-- 
cgit v1.2.2


From 7572777eef78ebdee1ecb7c258c0ef94d35bad16 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 30 Nov 2010 16:39:27 +0100
Subject: fuse: verify ioctl retries

Verify that the total length of the iovec returned in FUSE_IOCTL_RETRY
doesn't overflow iov_length().

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: Tejun Heo <tj@kernel.org>
CC: <stable@kernel.org>         [2.6.31+]
---
 fs/fuse/file.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0e2e25b114a6..8b984a2cebbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1666,6 +1666,20 @@ static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
 	return 0;
 }
 
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+{
+	size_t n;
+	u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+	for (n = 0; n < count; n++) {
+		if (iov->iov_len > (size_t) max)
+			return -ENOMEM;
+		max -= iov->iov_len;
+	}
+	return 0;
+}
+
 /*
  * For ioctls, there is no generic way to determine how much memory
  * needs to be read and/or written.  Furthermore, ioctls are allowed
@@ -1858,6 +1872,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		in_iov = page_address(iov_page);
 		out_iov = in_iov + in_iovs;
 
+		err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+		if (err)
+			goto out;
+
+		err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+		if (err)
+			goto out;
+
 		goto retry;
 	}
 
-- 
cgit v1.2.2


From 0aded708d125a3ff7e5abaea9c2d9c6d7ebbfdcd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 30 Nov 2010 21:56:32 -0500
Subject: NFS: Ensure we use the correct cookie in nfs_readdir_xdr_filler

We need to use the cookie from the previous array entry, not the
actual cookie that we are searching for (except for the case of
uncached_readdir).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f0a384e2ae63..e03537ff9350 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -178,6 +178,7 @@ typedef struct {
 	struct page	*page;
 	unsigned long	page_index;
 	u64		*dir_cookie;
+	u64		last_cookie;
 	loff_t		current_index;
 	decode_dirent_t	decode;
 
@@ -344,6 +345,8 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 	else
 		status = nfs_readdir_search_for_cookie(array, desc);
 
+	if (status == -EAGAIN)
+		desc->last_cookie = array->last_cookie;
 	nfs_readdir_release_array(desc->page);
 out:
 	return status;
@@ -563,7 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	unsigned int array_size = ARRAY_SIZE(pages);
 
 	entry.prev_cookie = 0;
-	entry.cookie = *desc->dir_cookie;
+	entry.cookie = desc->last_cookie;
 	entry.eof = 0;
 	entry.fh = nfs_alloc_fhandle();
 	entry.fattr = nfs_alloc_fattr();
@@ -672,8 +675,10 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
 	int res;
 
-	if (desc->page_index == 0)
+	if (desc->page_index == 0) {
 		desc->current_index = 0;
+		desc->last_cookie = 0;
+	}
 	while (1) {
 		res = find_cache_page(desc);
 		if (res != -EAGAIN)
@@ -764,6 +769,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	}
 
 	desc->page_index = 0;
+	desc->last_cookie = *desc->dir_cookie;
 	desc->page = page;
 
 	status = nfs_readdir_xdr_to_array(desc, page, inode);
-- 
cgit v1.2.2


From 884ea892763d4dfba509743f65961c782c0442db Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 22 Nov 2010 22:58:06 -0800
Subject: ceph: avoid possible null deref in readdir after dir llseek

last may be NULL, but we dereference it in the else branch without
checking.  Normally it doesn't trigger because last == NULL when fpos == 2,
but it could happen on a newly opened dir if the user seeks forward.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 7d447af84ec4..158c700fdca5 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -114,8 +114,8 @@ static int __dcache_readdir(struct file *filp,
 	spin_lock(&dcache_lock);
 
 	/* start at beginning? */
-	if (filp->f_pos == 2 || (last &&
-				 filp->f_pos < ceph_dentry(last)->offset)) {
+	if (filp->f_pos == 2 || last == NULL ||
+	    filp->f_pos < ceph_dentry(last)->offset) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
-- 
cgit v1.2.2


From 25933abdd8c562182ca6dc9f8c4c2cc8265c3a80 Mon Sep 17 00:00:00 2001
From: Herb Shiu <herb_shiu@tcloudcomputing.com>
Date: Wed, 1 Dec 2010 14:14:38 -0800
Subject: ceph: Handle file locks in replies from the MDS.

Previously the kernel client incorrectly assumed everything was a directory.

Signed-off-by: Herb Shiu <herb_shiu@tcloudcomputing.com>
Acked-by: Greg Farnum <gregf@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 41 +++++++++++++++++++++++++++++++++++++----
 fs/ceph/mds_client.h | 31 +++++++++++++++++++++----------
 2 files changed, 58 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 098b18508479..38800eaa81d0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -201,6 +201,38 @@ out_bad:
 	return err;
 }
 
+/*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+                struct ceph_mds_reply_info_parsed *info)
+{
+	if (*p + sizeof(*info->filelock_reply) > end)
+		goto bad;
+
+	info->filelock_reply = *p;
+	*p += sizeof(*info->filelock_reply);
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+                struct ceph_mds_reply_info_parsed *info)
+{
+	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+		return parse_reply_info_filelock(p, end, info);
+	else
+		return parse_reply_info_dir(p, end, info);
+}
+
 /*
  * parse entire mds reply
  */
@@ -223,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg,
 			goto out_bad;
 	}
 
-	/* dir content */
+	/* extra */
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
-		err = parse_reply_info_dir(&p, p+len, info);
+		err = parse_reply_info_extra(&p, p+len, info);
 		if (err < 0)
 			goto out_bad;
 	}
@@ -2074,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 	mutex_lock(&session->s_mutex);
 	if (err < 0) {
-		pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
 		ceph_msg_dump(msg);
 		goto out_err;
 	}
@@ -2094,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
-		if (result == 0 && rinfo->dir_nr)
+		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+		    rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 9341fd4f1432..aabe563b54db 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in {
 };
 
 /*
- * parsed info about an mds reply, including information about the
- * target inode and/or its parent directory and dentry, and directory
- * contents (for readdir results).
+ * parsed info about an mds reply, including information about
+ * either: 1) the target inode and/or its parent directory and dentry,
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
  */
 struct ceph_mds_reply_info_parsed {
 	struct ceph_mds_reply_head    *head;
 
+	/* trace */
 	struct ceph_mds_reply_info_in diri, targeti;
 	struct ceph_mds_reply_dirfrag *dirfrag;
 	char                          *dname;
 	u32                           dname_len;
 	struct ceph_mds_reply_lease   *dlease;
 
-	struct ceph_mds_reply_dirfrag *dir_dir;
-	int                           dir_nr;
-	char                          **dir_dname;
-	u32                           *dir_dname_len;
-	struct ceph_mds_reply_lease   **dir_dlease;
-	struct ceph_mds_reply_info_in *dir_in;
-	u8                            dir_complete, dir_end;
+	/* extra */
+	union {
+		/* for fcntl F_GETLK results */
+		struct ceph_filelock *filelock_reply;
+
+		/* for readdir results */
+		struct {
+			struct ceph_mds_reply_dirfrag *dir_dir;
+			int                           dir_nr;
+			char                          **dir_dname;
+			u32                           *dir_dname_len;
+			struct ceph_mds_reply_lease   **dir_dlease;
+			struct ceph_mds_reply_info_in *dir_in;
+			u8                            dir_complete, dir_end;
+		};
+	};
 
 	/* encoded blob describing snapshot contexts for certain
 	   operations (e.g., open) */
-- 
cgit v1.2.2


From 637ae8d547390df75bad42a7e9cb65e625119767 Mon Sep 17 00:00:00 2001
From: Herb Shiu <herb_shiu@tcloudcomputing.com>
Date: Tue, 23 Nov 2010 13:42:23 -0800
Subject: ceph: pass lock information by struct file_lock instead of as
 individual params.

Signed-off-by: Herb Shiu <herb_shiu@tcloudcomputing.com>
Acked-by: Greg Farnum <gregf@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/locks.c | 58 +++++++++++++++++++++------------------------------------
 1 file changed, 21 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c345..7b7ac310f052 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,49 @@
  * Implement fcntl and flock locking functions.
  */
 static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
-			     u64 pid, u64 pid_ns,
-			     int cmd, u64 start, u64 length, u8 wait)
+			     int cmd, u8 wait, struct file_lock *fl)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_mds_client *mdsc =
 		ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	int err;
+	u64 length = 0;
 
 	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	req->r_inode = igrab(inode);
 
+	/* mds requires start and length rather than start and end */
+	if (LLONG_MAX == fl->fl_end)
+		length = 0;
+	else
+		length = fl->fl_end - fl->fl_start + 1;
+
 	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
 	     "length: %llu, wait: %d, type`: %d", (int)lock_type,
-	     (int)operation, pid, start, length, wait, cmd);
+	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
+	     length, wait, fl->fl_type);
+
 
 	req->r_args.filelock_change.rule = lock_type;
 	req->r_args.filelock_change.type = cmd;
-	req->r_args.filelock_change.pid = cpu_to_le64(pid);
+	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
 	/* This should be adjusted, but I'm not sure if
 	   namespaces actually get id numbers*/
 	req->r_args.filelock_change.pid_namespace =
-		cpu_to_le64((u64)pid_ns);
-	req->r_args.filelock_change.start = cpu_to_le64(start);
+		cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
+	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
 	req->r_args.filelock_change.length = cpu_to_le64(length);
 	req->r_args.filelock_change.wait = wait;
 
 	err = ceph_mdsc_do_request(mdsc, inode, req);
 	ceph_mdsc_put_request(req);
 	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
-	     "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
-	     (int)operation, pid, start, length, wait, cmd, err);
+	     "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
+	     (int)operation, (u64)fl->fl_pid, fl->fl_start,
+	     length, wait, fl->fl_type, err);
 	return err;
 }
 
@@ -54,7 +63,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
  */
 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-	u64 length;
 	u8 lock_cmd;
 	int err;
 	u8 wait = 0;
@@ -76,16 +84,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 	else
 		lock_cmd = CEPH_LOCK_UNLOCK;
 
-	if (LLONG_MAX == fl->fl_end)
-		length = 0;
-	else
-		length = fl->fl_end - fl->fl_start + 1;
-
-	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-				(u64)fl->fl_pid,
-				(u64)(unsigned long)fl->fl_nspid,
-				lock_cmd, fl->fl_start,
-				length, wait);
+	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
 	if (!err) {
 		dout("mds locked, locking locally");
 		err = posix_lock_file(file, fl, NULL);
@@ -93,10 +92,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 			/* undo! This should only happen if the kernel detects
 			 * local deadlock. */
 			ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-					  (u64)fl->fl_pid,
-					  (u64)(unsigned long)fl->fl_nspid,
-					  CEPH_LOCK_UNLOCK, fl->fl_start,
-					  length, 0);
+					  CEPH_LOCK_UNLOCK, 0, fl);
 			dout("got %d on posix_lock_file, undid lock", err);
 		}
 	} else {
@@ -107,7 +103,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 
 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	u64 length;
 	u8 lock_cmd;
 	int err;
 	u8 wait = 1;
@@ -127,26 +122,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 		lock_cmd = CEPH_LOCK_EXCL;
 	else
 		lock_cmd = CEPH_LOCK_UNLOCK;
-	/* mds requires start and length rather than start and end */
-	if (LLONG_MAX == fl->fl_end)
-		length = 0;
-	else
-		length = fl->fl_end - fl->fl_start + 1;
 
 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-				file, (u64)fl->fl_pid,
-				(u64)(unsigned long)fl->fl_nspid,
-				lock_cmd, fl->fl_start,
-				length, wait);
+				file, lock_cmd, wait, fl);
 	if (!err) {
 		err = flock_lock_file_wait(file, fl);
 		if (err) {
 			ceph_lock_message(CEPH_LOCK_FLOCK,
 					  CEPH_MDS_OP_SETFILELOCK,
-					  file, (u64)fl->fl_pid,
-					  (u64)(unsigned long)fl->fl_nspid,
-					  CEPH_LOCK_UNLOCK, fl->fl_start,
-					  length, 0);
+					  file, CEPH_LOCK_UNLOCK, 0, fl);
 			dout("got %d on flock_lock_file_wait, undid lock", err);
 		}
 	} else {
-- 
cgit v1.2.2


From a5b10629edfa521071ccdb3b1e0e7fb350a044db Mon Sep 17 00:00:00 2001
From: Herb Shiu <herb_shiu@tcloudcomputing.com>
Date: Tue, 23 Nov 2010 13:58:29 -0800
Subject: ceph: Behave better when handling file lock replies.

Fill in the local lock with response data if appropriate,
and don't call posix_lock_file when reading locks.

Signed-off-by: Herb Shiu <herb_shiu@tcloudcomputing.com>
Acked-by: Greg Farnum <gregf@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/locks.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 7b7ac310f052..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -49,6 +49,25 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 	req->r_args.filelock_change.wait = wait;
 
 	err = ceph_mdsc_do_request(mdsc, inode, req);
+
+	if ( operation == CEPH_MDS_OP_GETFILELOCK){
+		fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+			fl->fl_type = F_RDLCK;
+		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+			fl->fl_type = F_WRLCK;
+		else
+			fl->fl_type = F_UNLCK;
+
+		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
+		if (length >= 1)
+			fl->fl_end = length -1;
+		else
+			fl->fl_end = 0;
+
+	}
 	ceph_mdsc_put_request(req);
 	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
 	     "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
@@ -86,15 +105,18 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
 	if (!err) {
-		dout("mds locked, locking locally");
-		err = posix_lock_file(file, fl, NULL);
-		if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
-			/* undo! This should only happen if the kernel detects
-			 * local deadlock. */
-			ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-					  CEPH_LOCK_UNLOCK, 0, fl);
-			dout("got %d on posix_lock_file, undid lock", err);
+		if ( op != CEPH_MDS_OP_GETFILELOCK ){
+			dout("mds locked, locking locally");
+			err = posix_lock_file(file, fl, NULL);
+			if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+				/* undo! This should only happen if the kernel detects
+				 * local deadlock. */
+				ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+						  CEPH_LOCK_UNLOCK, 0, fl);
+				dout("got %d on posix_lock_file, undid lock", err);
+			}
 		}
+
 	} else {
 		dout("mds returned error code %d", err);
 	}
-- 
cgit v1.2.2


From 11de3b11e08cac26d59e88efaf4e316701883552 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 1 Dec 2010 14:17:06 -0500
Subject: NFS: Fix a memory leak in nfs_readdir

We need to ensure that the entries in the nfs_cache_array get cleared
when the page is removed from the page cache. To do so, we use the
freepage address_space operation.

Change nfs_readdir_clear_array to use kmap_atomic(), so that the
function can be safely called from all contexts.

Finally, modify the cache_page_release helper to call
nfs_readdir_clear_array directly, when dealing with an anonymous
page from 'uncached_readdir'.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c   | 18 +++++++++---------
 fs/nfs/inode.c |  1 +
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e03537ff9350..d529e0e99efa 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -57,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
 		      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_clear_array(struct page*);
 
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
@@ -83,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
 	.setattr	= nfs_setattr,
 };
 
-const struct address_space_operations nfs_dir_addr_space_ops = {
-	.releasepage = nfs_readdir_clear_array,
+const struct address_space_operations nfs_dir_aops = {
+	.freepage = nfs_readdir_clear_array,
 };
 
 #ifdef CONFIG_NFS_V3
@@ -214,17 +214,15 @@ void nfs_readdir_release_array(struct page *page)
  * we are freeing strings created by nfs_add_to_readdir_array()
  */
 static
-int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+void nfs_readdir_clear_array(struct page *page)
 {
-	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	struct nfs_cache_array *array;
 	int i;
 
-	if (IS_ERR(array))
-		return PTR_ERR(array);
+	array = kmap_atomic(page, KM_USER0);
 	for (i = 0; i < array->size; i++)
 		kfree(array->array[i].string.name);
-	nfs_readdir_release_array(page);
-	return 0;
+	kunmap_atomic(array, KM_USER0);
 }
 
 /*
@@ -639,6 +637,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 static
 void cache_page_release(nfs_readdir_descriptor_t *desc)
 {
+	if (!desc->page->mapping)
+		nfs_readdir_clear_array(desc->page);
 	page_cache_release(desc->page);
 	desc->page = NULL;
 }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f57164602..e67e31c73416 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		} else if (S_ISDIR(inode->i_mode)) {
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
+			inode->i_data.a_ops = &nfs_dir_aops;
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
-- 
cgit v1.2.2


From 6d20e8406f0942228a73000663c2b33f488103ea Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Wed, 1 Dec 2010 14:42:28 +0530
Subject: cifs: add attribute cache timeout (actimeo) tunable

Currently, the attribute cache timeout for CIFS is hardcoded to 1 second. This
means that the client might have to issue a QPATHINFO/QFILEINFO call every 1
second to verify if something has changes, which seems too expensive. On the
other hand, if the timeout is hardcoded to a higher value, workloads that
expect strict cache coherency might see unexpected results.

Making attribute cache timeout as a tunable will allow us to make a tradeoff
between performance and cache metadata correctness depending on the
application/workload needs.

Add 'actimeo' tunable that can be used to tune the attribute cache timeout.
The default timeout is set to 1 second. Also, display actimeo option value in
/proc/mounts.

It appears to me that 'actimeo' and the proposed (but not yet merged)
'strictcache' option cannot coexist, so care must be taken that we reset the
other option if one of them is set.

Changes since last post:
   - fix option parsing and handle possible values correcly

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README       |  9 +++++++++
 fs/cifs/cifs_fs_sb.h |  1 +
 fs/cifs/cifsfs.c     |  2 ++
 fs/cifs/cifsglob.h   | 10 ++++++++++
 fs/cifs/connect.c    | 15 +++++++++++++++
 fs/cifs/inode.c      |  7 ++++---
 6 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d1036544..46af99ab3614 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
   wsize		default write size (default 57344)
 		maximum wsize currently allowed by CIFS is 57344 (fourteen
 		4096 byte pages)
+  actimeo=n	attribute cache timeout in seconds (default 1 second).
+		After this timeout, the cifs client requests fresh attribute
+		information from the server. This option allows to tune the
+		attribute cache timeout to suit the workload needs. Shorter
+		timeouts mean better the cache coherency, but increased number
+		of calls to the server. Longer timeouts mean reduced number
+		of calls to the server at the expense of less stricter cache
+		coherency checks (i.e. incorrect attribute cache for a short
+		period of time).
   rw		mount the network share read-write (note that the
 		server may still consider the share read-only)
   ro		mount network share read-only
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index e9a393c9c2ca..7852cd677051 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -48,6 +48,7 @@ struct cifs_sb_info {
 	struct nls_table *local_nls;
 	unsigned int rsize;
 	unsigned int wsize;
+	unsigned long actimeo; /* attribute cache timeout (jiffies) */
 	atomic_t active;
 	uid_t	mnt_uid;
 	gid_t	mnt_gid;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 76c8a906a63e..56a4b7544c3c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -463,6 +463,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 
 	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
+	/* convert actimeo and display it in seconds */
+		seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
 
 	return 0;
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b577bf0a1bb3..94ccfacaed8a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -44,6 +44,16 @@
 
 #define CIFS_MIN_RCV_POOL 4
 
+/*
+ * default attribute cache timeout (jiffies)
+ */
+#define CIFS_DEF_ACTIMEO (1 * HZ)
+
+/*
+ * max attribute cache timeout (jiffies) - 2^30
+ */
+#define CIFS_MAX_ACTIMEO (1 << 30)
+
 /*
  * MAX_REQ is the maximum number of requests that WE will send
  * on one socket concurrently. It also matches the most common
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 32fa4d9b5dbc..bb17ee2ba782 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -105,6 +105,7 @@ struct smb_vol {
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
+	unsigned long actimeo; /* attribute cache timeout (jiffies) */
 	char *prepath;
 	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
 	struct nls_table *local_nls;
@@ -840,6 +841,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 	/* default to using server inode numbers where available */
 	vol->server_ino = 1;
 
+	vol->actimeo = CIFS_DEF_ACTIMEO;
+
 	if (!options)
 		return 1;
 
@@ -1214,6 +1217,16 @@ cifs_parse_mount_options(char *options, const char *devname,
 					printk(KERN_WARNING "CIFS: server net"
 					"biosname longer than 15 truncated.\n");
 			}
+		} else if (strnicmp(data, "actimeo", 7) == 0) {
+			if (value && *value) {
+				vol->actimeo = HZ * simple_strtoul(value,
+								   &value, 0);
+				if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+					cERROR(1, "CIFS: attribute cache"
+							"timeout too large");
+					return 1;
+				}
+			}
 		} else if (strnicmp(data, "credentials", 4) == 0) {
 			/* ignore */
 		} else if (strnicmp(data, "version", 3) == 0) {
@@ -2571,6 +2584,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	cFYI(1, "file mode: 0x%x  dir mode: 0x%x",
 		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
 
+	cifs_sb->actimeo = pvolume_info->actimeo;
+
 	if (pvolume_info->noperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
 	if (pvolume_info->setuids)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 28cb6e735943..bb5ca4848e81 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1653,6 +1653,7 @@ static bool
 cifs_inode_needs_reval(struct inode *inode)
 {
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	if (cifs_i->clientCanCacheRead)
 		return false;
@@ -1663,12 +1664,12 @@ cifs_inode_needs_reval(struct inode *inode)
 	if (cifs_i->time == 0)
 		return true;
 
-	/* FIXME: the actimeo should be tunable */
-	if (time_after_eq(jiffies, cifs_i->time + HZ))
+	if (!time_in_range(jiffies, cifs_i->time,
+				cifs_i->time + cifs_sb->actimeo))
 		return true;
 
 	/* hardlinked files w/ noserverino get "special" treatment */
-	if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
 	    S_ISREG(inode->i_mode) && inode->i_nlink != 1)
 		return true;
 
-- 
cgit v1.2.2


From 1cd275f609ba46c8cae3ee77e499c54a0d13a983 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 6 Dec 2010 09:45:22 -0800
Subject: ceph: fix ioctl magic

The ioctl magic was inadvertently changed in 571dba52.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ioctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb5..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
-#define CEPH_IOCTL_MAGIC 0x98
+#define CEPH_IOCTL_MAGIC 0x97
 
 /* just use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
-- 
cgit v1.2.2


From 79df1baeec29022e4181f2964187b88661ef5517 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 6 Dec 2010 12:52:08 -0500
Subject: cifs: fix use of CONFIG_CIFS_ACL

Some of the code under CONFIG_CIFS_ACL is dependent upon code under
CONFIG_CIFS_EXPERIMENTAL, but the Kconfig options don't reflect that
dependency. Move more of the ACL code out from under
CONFIG_CIFS_EXPERIMENTAL and under CONFIG_CIFS_ACL.

Also move find_readable_file out from other any sort of Kconfig
option and make it a function normally compiled in.

Reported-and-Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Makefile    |   4 +-
 fs/cifs/cifsacl.c   |   3 -
 fs/cifs/cifsacl.h   |   4 --
 fs/cifs/cifsproto.h |   2 -
 fs/cifs/cifssmb.c   | 183 ++++++++++++++++++++++++++--------------------------
 fs/cifs/file.c      |   2 -
 fs/cifs/inode.c     |   8 +--
 7 files changed, 99 insertions(+), 107 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bdc..43b19dd39191 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
 	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
-	  readdir.o ioctl.o sess.o export.o cifsacl.o
+	  readdir.o ioctl.o sess.o export.o
+
+cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c6ebea088ac7..a437ec391a01 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
 #include "cifs_debug.h"
 
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-
 static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
 	{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
 	{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -774,4 +772,3 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
 
 	return rc;
 }
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf5155..c4ae7d036563 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
 	char sidname[SIDNAMELENGTH];
 } __attribute__((packed));
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-
 extern int match_sid(struct cifs_sid *);
 extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
 
-#endif /*  CONFIG_CIFS_EXPERIMENTAL */
-
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index db961dc4fd3d..5523047b1ebd 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -79,9 +79,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-#endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5a..67acfb3acad2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2478,95 +2478,6 @@ querySymLinkRetry:
 }
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-/* Initialize NT TRANSACT SMB into small smb request buffer.
-   This assumes that all NT TRANSACTS that we init here have
-   total parm and data under about 400 bytes (to fit in small cifs
-   buffer size), which is the case so far, it easily fits. NB:
-	Setup words themselves and ByteCount
-	MaxSetupCount (size of returned setup area) and
-	MaxParameterCount (returned parms size) must be set by caller */
-static int
-smb_init_nttransact(const __u16 sub_command, const int setup_count,
-		   const int parm_len, struct cifsTconInfo *tcon,
-		   void **ret_buf)
-{
-	int rc;
-	__u32 temp_offset;
-	struct smb_com_ntransact_req *pSMB;
-
-	rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
-				(void **)&pSMB);
-	if (rc)
-		return rc;
-	*ret_buf = (void *)pSMB;
-	pSMB->Reserved = 0;
-	pSMB->TotalParameterCount = cpu_to_le32(parm_len);
-	pSMB->TotalDataCount  = 0;
-	pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
-					  MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->DataCount  = pSMB->TotalDataCount;
-	temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
-			(setup_count * 2) - 4 /* for rfc1001 length itself */;
-	pSMB->ParameterOffset = cpu_to_le32(temp_offset);
-	pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
-	pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
-	pSMB->SubCommand = cpu_to_le16(sub_command);
-	return 0;
-}
-
-static int
-validate_ntransact(char *buf, char **ppparm, char **ppdata,
-		   __u32 *pparmlen, __u32 *pdatalen)
-{
-	char *end_of_smb;
-	__u32 data_count, data_offset, parm_count, parm_offset;
-	struct smb_com_ntransact_rsp *pSMBr;
-
-	*pdatalen = 0;
-	*pparmlen = 0;
-
-	if (buf == NULL)
-		return -EINVAL;
-
-	pSMBr = (struct smb_com_ntransact_rsp *)buf;
-
-	/* ByteCount was converted from little endian in SendReceive */
-	end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
-			(char *)&pSMBr->ByteCount;
-
-	data_offset = le32_to_cpu(pSMBr->DataOffset);
-	data_count = le32_to_cpu(pSMBr->DataCount);
-	parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
-	parm_count = le32_to_cpu(pSMBr->ParameterCount);
-
-	*ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
-	*ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
-
-	/* should we also check that parm and data areas do not overlap? */
-	if (*ppparm > end_of_smb) {
-		cFYI(1, "parms start after end of smb");
-		return -EINVAL;
-	} else if (parm_count + *ppparm > end_of_smb) {
-		cFYI(1, "parm end after end of smb");
-		return -EINVAL;
-	} else if (*ppdata > end_of_smb) {
-		cFYI(1, "data starts after end of smb");
-		return -EINVAL;
-	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
-			*ppdata, data_count, (data_count + *ppdata),
-			end_of_smb, pSMBr);
-		return -EINVAL;
-	} else if (parm_count + data_count > pSMBr->ByteCount) {
-		cFYI(1, "parm count and data count larger than SMB");
-		return -EINVAL;
-	}
-	*pdatalen = data_count;
-	*pparmlen = parm_count;
-	return 0;
-}
-
 int
 CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
@@ -3056,7 +2967,97 @@ GetExtAttrOut:
 
 #endif /* CONFIG_POSIX */
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
+/*
+ * Initialize NT TRANSACT SMB into small smb request buffer.  This assumes that
+ * all NT TRANSACTS that we init here have total parm and data under about 400
+ * bytes (to fit in small cifs buffer size), which is the case so far, it
+ * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of
+ * returned setup area) and MaxParameterCount (returned parms size) must be set
+ * by caller
+ */
+static int
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
+		   const int parm_len, struct cifsTconInfo *tcon,
+		   void **ret_buf)
+{
+	int rc;
+	__u32 temp_offset;
+	struct smb_com_ntransact_req *pSMB;
+
+	rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
+				(void **)&pSMB);
+	if (rc)
+		return rc;
+	*ret_buf = (void *)pSMB;
+	pSMB->Reserved = 0;
+	pSMB->TotalParameterCount = cpu_to_le32(parm_len);
+	pSMB->TotalDataCount  = 0;
+	pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
+					  MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	pSMB->DataCount  = pSMB->TotalDataCount;
+	temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
+			(setup_count * 2) - 4 /* for rfc1001 length itself */;
+	pSMB->ParameterOffset = cpu_to_le32(temp_offset);
+	pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
+	pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
+	pSMB->SubCommand = cpu_to_le16(sub_command);
+	return 0;
+}
+
+static int
+validate_ntransact(char *buf, char **ppparm, char **ppdata,
+		   __u32 *pparmlen, __u32 *pdatalen)
+{
+	char *end_of_smb;
+	__u32 data_count, data_offset, parm_count, parm_offset;
+	struct smb_com_ntransact_rsp *pSMBr;
+
+	*pdatalen = 0;
+	*pparmlen = 0;
+
+	if (buf == NULL)
+		return -EINVAL;
+
+	pSMBr = (struct smb_com_ntransact_rsp *)buf;
+
+	/* ByteCount was converted from little endian in SendReceive */
+	end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+			(char *)&pSMBr->ByteCount;
+
+	data_offset = le32_to_cpu(pSMBr->DataOffset);
+	data_count = le32_to_cpu(pSMBr->DataCount);
+	parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
+	parm_count = le32_to_cpu(pSMBr->ParameterCount);
+
+	*ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
+	*ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
+
+	/* should we also check that parm and data areas do not overlap? */
+	if (*ppparm > end_of_smb) {
+		cFYI(1, "parms start after end of smb");
+		return -EINVAL;
+	} else if (parm_count + *ppparm > end_of_smb) {
+		cFYI(1, "parm end after end of smb");
+		return -EINVAL;
+	} else if (*ppdata > end_of_smb) {
+		cFYI(1, "data starts after end of smb");
+		return -EINVAL;
+	} else if (data_count + *ppdata > end_of_smb) {
+		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
+			*ppdata, data_count, (data_count + *ppdata),
+			end_of_smb, pSMBr);
+		return -EINVAL;
+	} else if (parm_count + data_count > pSMBr->ByteCount) {
+		cFYI(1, "parm count and data count larger than SMB");
+		return -EINVAL;
+	}
+	*pdatalen = data_count;
+	*pparmlen = parm_count;
+	return 0;
+}
+
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
 CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3214,7 +3215,7 @@ setCifsAclRetry:
 	return (rc);
 }
 
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
+#endif /* CONFIG_CIFS_ACL */
 
 /* Legacy Query Path Information call for lookup to old servers such
    as Win9x/WinME */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b857ce5db775..5a28660ca2b5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1108,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 	return total_written;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 					bool fsuid_only)
 {
@@ -1142,7 +1141,6 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 	spin_unlock(&cifs_file_list_lock);
 	return NULL;
 }
-#endif
 
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 					bool fsuid_only)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bb5ca4848e81..aa48521a78c1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -686,7 +686,7 @@ int cifs_get_inode_info(struct inode **pinode,
 			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
@@ -697,7 +697,7 @@ int cifs_get_inode_info(struct inode **pinode,
 			goto cgii_exit;
 		}
 	}
-#endif
+#endif /* CONFIG_CIFS_ACL */
 
 	/* fill in remaining high mode bits e.g. SUID, VTX */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -2122,7 +2122,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	if (attrs->ia_valid & ATTR_MODE) {
 		rc = 0;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 			rc = mode_to_cifs_acl(inode, full_path, mode);
 			if (rc) {
@@ -2131,7 +2131,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 				goto cifs_setattr_exit;
 			}
 		} else
-#endif
+#endif /* CONFIG_CIFS_ACL */
 		if (((mode & S_IWUGO) == 0) &&
 		    (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
 
-- 
cgit v1.2.2


From 884639996814585ef7079daa9e03a1eb562e235c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Mon, 22 Nov 2010 15:31:03 -0500
Subject: cifs: remove Local_System_Name

...this string is zeroed out and nothing ever changes it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |  1 -
 fs/cifs/cifsglob.h |  2 --
 fs/cifs/connect.c  | 23 ++++++++++-------------
 3 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 56a4b7544c3c..3936aa7f2c22 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -937,7 +937,6 @@ init_cifs(void)
 	GlobalCurrentXid = 0;
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
-	memset(Local_System_Name, 0, 15);
 	spin_lock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&cifs_file_list_lock);
 	spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94ccfacaed8a..7136c0c3e2f9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -756,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN unsigned int GlobalMaxActiveXid;	/* prot by GlobalMid_Sem */
 GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above & list operations */
 					  /* on midQ entries */
-GLOBAL_EXTERN char Local_System_Name[15];
-
 /*
  *  Global counters, updated atomically
  */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index bb17ee2ba782..53f9c31a52b2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -807,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname,
 	short int override_gid = -1;
 	bool uid_specified = false;
 	bool gid_specified = false;
+	char *nodename = utsname()->nodename;
 
 	separator[0] = ',';
 	separator[1] = 0;
 
-	if (Local_System_Name[0] != 0)
-		memcpy(vol->source_rfc1001_name, Local_System_Name, 15);
-	else {
-		char *nodename = utsname()->nodename;
-		int n = strnlen(nodename, 15);
-		memset(vol->source_rfc1001_name, 0x20, 15);
-		for (i = 0; i < n; i++) {
-			/* does not have to be perfect mapping since field is
-			informational, only used for servers that do not support
-			port 445 and it can be overridden at mount time */
-			vol->source_rfc1001_name[i] = toupper(nodename[i]);
-		}
-	}
+	/*
+	 * does not have to be perfect mapping since field is
+	 * informational, only used for servers that do not support
+	 * port 445 and it can be overridden at mount time
+	 */
+	memset(vol->source_rfc1001_name, 0x20, 15);
+	for (i = 0; i < strnlen(nodename, 15); i++)
+		vol->source_rfc1001_name[i] = toupper(nodename[i]);
+
 	vol->source_rfc1001_name[15] = 0;
 	/* null target name indicates to use *SMBSERVR default called name
 	   if we end up sending RFC1001 session initialize */
-- 
cgit v1.2.2


From de47de7404e29df8de82f5822b4fde1a6ed97b54 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 7 Dec 2010 13:04:00 +0800
Subject: autofs4 - remove ioctl mutex (bz23142)

With the recent changes to remove the BKL a mutex was added to the
ioctl entry point for calls to the old ioctl interface. This mutex
needs to be removed because of the need for the expire ioctl to call
back to the daemon to perform a umount and receive a completion
status (via another ioctl).

This should be fine as the new ioctl interface uses much of the same
code and it has been used without a mutex for around a year without
issue, as was the original intention.

Ref: Bugzilla bug 23142

Signed-off-by: Ian Kent <raven@themaw.net>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/root.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f0031..d34896cfb19f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -980,19 +980,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	}
 }
 
-static DEFINE_MUTEX(autofs4_ioctl_mutex);
-
 static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
-	long ret;
 	struct inode *inode = filp->f_dentry->d_inode;
-
-	mutex_lock(&autofs4_ioctl_mutex);
-	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-	mutex_unlock(&autofs4_ioctl_mutex);
-
-	return ret;
+	return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1002,13 +994,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret;
 
-	mutex_lock(&autofs4_ioctl_mutex);
 	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 	else
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
 			(unsigned long)compat_ptr(arg));
-	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
-- 
cgit v1.2.2


From 18fb5fe40ce7f789b5cfc3aa81ff1e6175b0a5be Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 7 Dec 2010 12:41:58 -0500
Subject: NFS: nfs_readdir_search_for_cookie() don't mark as eof if cookie not
 found

If we're searching for a specific cookie, and it isn't found in the page
cache, we should try an uncached_readdir(). To do so, we return EBADCOOKIE,
but we don't set desc->eof.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d529e0e99efa..ad2fde23446d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -316,8 +316,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 		}
 	}
 	if (i == array->eof_index) {
-		desc->eof = 1;
 		status = -EBADCOOKIE;
+		if (*desc->dir_cookie == array->last_cookie)
+			desc->eof = 1;
 	}
 out:
 	return status;
-- 
cgit v1.2.2


From 47c716cbf638a16583441d78be3fc24345eab636 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 7 Dec 2010 12:44:56 -0500
Subject: NFS: Readdir cleanups

No functional changes, but clarify the code.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 47 +++++++++++++++++------------------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ad2fde23446d..996dd8989a91 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -271,7 +271,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 		goto out;
 	array->last_cookie = entry->cookie;
 	array->size++;
-	if (entry->eof == 1)
+	if (entry->eof != 0)
 		array->eof_index = array->size;
 out:
 	nfs_readdir_release_array(page);
@@ -311,16 +311,14 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 	for (i = 0; i < array->size; i++) {
 		if (array->array[i].cookie == *desc->dir_cookie) {
 			desc->cache_entry_index = i;
-			status = 0;
-			goto out;
+			return 0;
 		}
 	}
-	if (i == array->eof_index) {
+	if (array->eof_index >= 0) {
 		status = -EBADCOOKIE;
 		if (*desc->dir_cookie == array->last_cookie)
 			desc->eof = 1;
 	}
-out:
 	return status;
 }
 
@@ -328,10 +326,7 @@ static
 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 {
 	struct nfs_cache_array *array;
-	int status = -EBADCOOKIE;
-
-	if (desc->dir_cookie == NULL)
-		goto out;
+	int status;
 
 	array = nfs_readdir_get_array(desc->page);
 	if (IS_ERR(array)) {
@@ -344,8 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 	else
 		status = nfs_readdir_search_for_cookie(array, desc);
 
-	if (status == -EAGAIN)
+	if (status == -EAGAIN) {
 		desc->last_cookie = array->last_cookie;
+		desc->page_index++;
+	}
 	nfs_readdir_release_array(desc->page);
 out:
 	return status;
@@ -492,7 +489,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 
 		count++;
 
-		if (desc->plus == 1)
+		if (desc->plus != 0)
 			nfs_prime_dcache(desc->file->f_path.dentry, entry);
 
 		status = nfs_readdir_add_to_array(entry, page);
@@ -500,7 +497,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 			break;
 	} while (!entry->eof);
 
-	if (count == 0 || (status == -EBADCOOKIE && entry->eof == 1)) {
+	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
 		array = nfs_readdir_get_array(page);
 		if (!IS_ERR(array)) {
 			array->eof_index = array->size;
@@ -664,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
 		return PTR_ERR(desc->page);
 
 	res = nfs_readdir_search_array(desc);
-	if (res == 0)
-		return 0;
-	cache_page_release(desc);
+	if (res != 0)
+		cache_page_release(desc);
 	return res;
 }
 
@@ -680,20 +676,12 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 		desc->current_index = 0;
 		desc->last_cookie = 0;
 	}
-	while (1) {
+	do {
 		res = find_cache_page(desc);
-		if (res != -EAGAIN)
-			break;
-		desc->page_index++;
-	}
+	} while (res == -EAGAIN);
 	return res;
 }
 
-static inline unsigned int dt_type(struct inode *inode)
-{
-	return (inode->i_mode >> 12) & 15;
-}
-
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
@@ -723,13 +711,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 			break;
 		}
 		file->f_pos++;
-		desc->cache_entry_index = i;
 		if (i < (array->size-1))
 			*desc->dir_cookie = array->array[i+1].cookie;
 		else
 			*desc->dir_cookie = array->last_cookie;
 	}
-	if (i == array->eof_index)
+	if (array->eof_index >= 0)
 		desc->eof = 1;
 
 	nfs_readdir_release_array(desc->page);
@@ -798,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct inode	*inode = dentry->d_inode;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
-	int res = -ENOMEM;
+	int res;
 
 	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -823,7 +810,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (res < 0)
 		goto out;
 
-	while (desc->eof != 1) {
+	do {
 		res = readdir_search_pagecache(desc);
 
 		if (res == -EBADCOOKIE) {
@@ -851,7 +838,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		res = nfs_do_filldir(desc, dirent, filldir);
 		if (res < 0)
 			break;
-	}
+	} while (!desc->eof);
 out:
 	nfs_unblock_sillyrename(dentry);
 	if (res > 0)
-- 
cgit v1.2.2


From 03ceace5c6923ffbcf2b4d2e37afbffbdb5d4a67 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 6 Dec 2010 21:07:33 -0500
Subject: cifs: fix check of error return from is_path_accessable

This function will return 0 if everything went ok. Commit 9d002df4
however added a block of code after the following check for
rc == -EREMOTE. With that change and when rc == 0, doing the
"goto mount_fail_check" here skips that code, leaving the tlink_tree
and master_tlink pointer unpopulated. That causes an oops later
in cifs_root_iget.

Reported-and-Tested-by: Robbert Kouprie <robbert@exx.nl>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 53f9c31a52b2..387b91ae7400 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2839,7 +2839,7 @@ remote_path_check:
 			goto mount_fail_check;
 		}
 		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
-		if (rc != -EREMOTE) {
+		if (rc != 0 && rc != -EREMOTE) {
 			kfree(full_path);
 			goto mount_fail_check;
 		}
-- 
cgit v1.2.2


From 7d161b7f41e24203b54b0f18ae0d26c18a6c6fab Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 7 Dec 2010 02:10:35 -0500
Subject: cifs: allow calling cifs_build_path_to_root on incomplete cifs_sb

It's possible that cifs_mount will call cifs_build_path_to_root on a
newly instantiated cifs_sb. In that case, it's likely that the
master_tlink pointer has not yet been instantiated.

Fix this by having cifs_build_path_to_root take a cifsTconInfo pointer
as well, and have the caller pass that in.

Reported-and-Tested-by: Robbert Kouprie <robbert@exx.nl>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h | 3 ++-
 fs/cifs/connect.c   | 2 +-
 fs/cifs/inode.c     | 6 +++---
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5523047b1ebd..e6d1481b16c1 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,7 +54,8 @@ do {								\
 	     __func__, curr_xid, (int)rc);			\
 } while (0)
 extern char *build_path_from_dentry(struct dentry *);
-extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+					struct cifsTconInfo *tcon);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 extern char *cifs_compose_mount_options(const char *sb_mountdata,
 		const char *fullpath, const struct dfs_info3_param *ref,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 387b91ae7400..cc1a8604a790 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2833,7 +2833,7 @@ remote_path_check:
 	/* check if a whole path (including prepath) is not remote */
 	if (!rc && cifs_sb->prepathlen && tcon) {
 		/* build_path_to_root works only when we have a valid tcon */
-		full_path = cifs_build_path_to_root(cifs_sb);
+		full_path = cifs_build_path_to_root(cifs_sb, tcon);
 		if (full_path == NULL) {
 			rc = -ENOMEM;
 			goto mount_fail_check;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aa48521a78c1..589f3e3f6e00 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -728,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
 	.lookup = cifs_lookup,
 };
 
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+				struct cifsTconInfo *tcon)
 {
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
 	char *full_path = NULL;
-	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	/* if no prefix path, simply set path to the root of share to "" */
 	if (pplen == 0) {
@@ -875,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 	char *full_path;
 	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	full_path = cifs_build_path_to_root(cifs_sb);
+	full_path = cifs_build_path_to_root(cifs_sb, tcon);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.2.2


From ecf6f5e7d68471b08603f7c20143ac236602364f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 8 Nov 2010 18:08:14 -0500
Subject: fanotify: deny permissions when no event was sent

If no event was sent to userspace we cannot expect userspace to respond to
permissions requests.  Today such requests just hang forever. This patch will
deny any permissions event which was unable to be sent to userspace.

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7e..045c0794d435 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,7 +106,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 	return client_fd;
 }
 
-static ssize_t fill_event_metadata(struct fsnotify_group *group,
+static int fill_event_metadata(struct fsnotify_group *group,
 				   struct fanotify_event_metadata *metadata,
 				   struct fsnotify_event *event)
 {
@@ -257,10 +257,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	fd = fill_event_metadata(group, &fanotify_event_metadata, event);
-	if (fd < 0)
-		return fd;
+	ret = fill_event_metadata(group, &fanotify_event_metadata, event);
+	if (ret < 0)
+		goto out;
 
+	fd = ret;
 	ret = prepare_for_access_response(group, event, fd);
 	if (ret)
 		goto out_close_fd;
@@ -275,6 +276,13 @@ out_kill_access_response:
 	remove_access_response(group, event, fd);
 out_close_fd:
 	sys_close(fd);
+out:
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (event->mask & FAN_ALL_PERM_EVENTS) {
+		event->response = FAN_DENY;
+		wake_up(&group->fanotify_data.access_waitq);
+	}
+#endif
 	return ret;
 }
 
-- 
cgit v1.2.2


From b1085ba80cd2784400a7beec3fda5099198ed01c Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Fri, 5 Nov 2010 17:05:27 +0100
Subject: fanotify: if set by user unset FMODE_NONOTIFY before fsnotify_perm()
 is called

Unsetting FMODE_NONOTIFY in fsnotify_open() is too late, since fsnotify_perm()
is called before. If FMODE_NONOTIFY is set fsnotify_perm() will skip permission
checks, so a user can still disable permission checks by setting this flag
in an open() call.
This patch corrects this by unsetting the flag before fsnotify_perm is called.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namei.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b7372..4ff7ca530533 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (!(open_flag & O_CREAT))
 		mode = 0;
 
+	/* Must never be set by userspace */
+	open_flag &= ~FMODE_NONOTIFY;
+
 	/*
 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
 	 * check for O_DSYNC if the need any syncing at all we enforce it's
-- 
cgit v1.2.2


From fa218ab98c31eeacd12b89501e6b99d146ea56cc Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Tue, 9 Nov 2010 18:18:16 +0100
Subject: fanotify: correct broken ref counting in case adding a mark failed

If adding a mount or inode mark failed fanotify_free_mark() is called explicitly.
But at this time the mark has already been put into the destroy list of the
fsnotify_mark kernel thread. If the thread is too slow it will try to decrease
the reference of a mark, that has already been freed by fanotify_free_mark().
(If its fast enough it will only decrease the marks ref counter from 2 to 1 - note
that the counter has been increased to 2 in add_mark() - which has practically no
effect.)

This patch fixes the ref counting by not calling free_mark() explicitly, but
decreasing the ref counter and rely on the fsnotify_mark thread to cleanup in
case adding the mark has failed.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 045c0794d435..c0ca1fa1550c 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -594,11 +594,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
+	int ret = 0;
 
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
-		int ret;
-
 		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
 			return -ENOSPC;
 
@@ -608,17 +607,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 
 		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
 		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-		if (ret) {
-			fanotify_free_mark(fsn_mark);
-			return ret;
-		}
+		if (ret)
+			goto err;
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-	fsnotify_put_mark(fsn_mark);
+
 	if (added & ~mnt->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
-
-	return 0;
+err:
+	fsnotify_put_mark(fsn_mark);
+	return ret;
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -627,6 +625,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
+	int ret = 0;
 
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -642,8 +641,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
-		int ret;
-
 		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
 			return -ENOSPC;
 
@@ -653,16 +650,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 
 		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
 		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-		if (ret) {
-			fanotify_free_mark(fsn_mark);
-			return ret;
-		}
+		if (ret)
+			goto err;
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-	fsnotify_put_mark(fsn_mark);
+
 	if (added & ~inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
-	return 0;
+err:
+	fsnotify_put_mark(fsn_mark);
+	return ret;
 }
 
 /* fanotify syscalls */
-- 
cgit v1.2.2


From 1734dee4e3a296cb72b4819fc2e7ef2440737dff Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Mon, 22 Nov 2010 18:46:33 +0100
Subject: fanotify: Dont allow a mask of 0 if setting or removing a mark

In mark_remove_from_mask() we destroy marks that have their event mask cleared.
Thus we should not allow the creation of those marks in the first place.
With this patch we check if the mask given from user is 0 in case of FAN_MARK_ADD.
If so we return an error. Same for FAN_MARK_REMOVE since this does not have any
effect.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c0ca1fa1550c..480434c5ee5f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -769,8 +769,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	if (flags & ~FAN_ALL_MARK_FLAGS)
 		return -EINVAL;
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
-	case FAN_MARK_ADD:
+	case FAN_MARK_ADD:		/* fallthrough */
 	case FAN_MARK_REMOVE:
+		if (!mask)
+			return -EINVAL;
 	case FAN_MARK_FLUSH:
 		break;
 	default:
-- 
cgit v1.2.2


From 09e5f14e57c70f9d357862bb56e57026c51092a1 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Fri, 19 Nov 2010 10:58:07 +0100
Subject: fanotify: on group destroy allow all waiters to bypass permission
 check

When fanotify_release() is called, there may still be processes waiting for
access permission. Currently only processes for which an event has already been
queued into the groups access list will be woken up.  Processes for which no
event has been queued will continue to sleep and thus cause a deadlock when
fsnotify_put_group() is called.
Furthermore there is a race allowing further processes to be waiting on the
access wait queue after wake_up (if they arrive before clear_marks_by_group()
is called).
This patch corrects this by setting a flag to inform processes that the group
is about to be destroyed and thus not to wait for access permission.

[additional changelog from eparis]
Lets think about the 4 relevant code paths from the PoV of the
'operator' 'listener' 'responder' and 'closer'.  Where operator is the
process doing an action (like open/read) which could require permission.
Listener is the task (or in this case thread) slated with reading from
the fanotify file descriptor.  The 'responder' is the thread responsible
for responding to access requests.  'Closer' is the thread attempting to
close the fanotify file descriptor.

The 'operator' is going to end up in:
fanotify_handle_event()
  get_response_from_access()
    (THIS BLOCKS WAITING ON USERSPACE)

The 'listener' interesting code path
fanotify_read()
  copy_event_to_user()
    prepare_for_access_response()
      (THIS CREATES AN fanotify_response_event)

The 'responder' code path:
fanotify_write()
  process_access_response()
    (REMOVE A fanotify_response_event, SET RESPONSE, WAKE UP 'operator')

The 'closer':
fanotify_release()
  (SUPPOSED TO CLEAN UP THE REST OF THIS MESS)

What we have today is that in the closer we remove all of the
fanotify_response_events and set a bit so no more response events are
ever created in prepare_for_access_response().

The bug is that we never wake all of the operators up and tell them to
move along.  You fix that in fanotify_get_response_from_access().  You
also fix other operators which haven't gotten there yet.  So I agree
that's a good fix.
[/additional changelog from eparis]

[remove additional changes to minimize patch size]
[move initialization so it was inside CONFIG_FANOTIFY_PERMISSION]

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c      | 6 +++++-
 fs/notify/fanotify/fanotify_user.c | 5 +++--
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09e..f35794b97e8e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	wait_event(group->fanotify_data.access_waitq, event->response);
+	wait_event(group->fanotify_data.access_waitq, event->response ||
+				atomic_read(&group->fanotify_data.bypass_perm));
+
+	if (!event->response) /* bypass_perm set */
+		return 0;
 
 	/* userspace responded, convert to something usable */
 	spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 480434c5ee5f..01fffe62a2d4 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -200,7 +200,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 
 	mutex_lock(&group->fanotify_data.access_mutex);
 
-	if (group->fanotify_data.bypass_perm) {
+	if (atomic_read(&group->fanotify_data.bypass_perm)) {
 		mutex_unlock(&group->fanotify_data.access_mutex);
 		kmem_cache_free(fanotify_response_event_cache, re);
 		event->response = FAN_ALLOW;
@@ -390,7 +390,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 
 	mutex_lock(&group->fanotify_data.access_mutex);
 
-	group->fanotify_data.bypass_perm = true;
+	atomic_inc(&group->fanotify_data.bypass_perm);
 
 	list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
 		pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -703,6 +703,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	mutex_init(&group->fanotify_data.access_mutex);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
+	atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
 	switch (flags & FAN_ALL_CLASS_BITS) {
 	case FAN_CLASS_NOTIF:
-- 
cgit v1.2.2


From a2ae4cc9a16e211c8a128ba10d22a85431f093ab Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 23 Nov 2010 18:18:37 -0500
Subject: inotify: stop kernel memory leak on file creation failure

If inotify_init is unable to allocate a new file for the new inotify
group we leak the new group.  This patch drops the reference on the
group on file allocation failure.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
cc: stable@kernel.org
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468c..4cd5d5d78f9f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 	if (ret >= 0)
 		return ret;
 
+	fsnotify_put_group(group);
 	atomic_dec(&user->inotify_devs);
 out_free_uid:
 	free_uid(user);
-- 
cgit v1.2.2


From 26379198937fcc9bbe7be76be695d06df8334eaa Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 23 Nov 2010 23:48:26 -0500
Subject: fanotify: do not leak user reference on allocation failure

If fanotify_init is unable to allocate a new fsnotify group it will
return but will not drop its reference on the associated user struct.
Drop that reference on error.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 01fffe62a2d4..ca54957b1f61 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -692,8 +692,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 
 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
 	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
-	if (IS_ERR(group))
+	if (IS_ERR(group)) {
+		free_uid(user);
 		return PTR_ERR(group);
+	}
 
 	group->fanotify_data.user = user;
 	atomic_inc(&user->fanotify_listeners);
-- 
cgit v1.2.2


From fdbf3ceeb659f0b3c0e8dd79b331b7ac05910f1e Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Wed, 24 Nov 2010 18:26:04 +0100
Subject: fanotify: Dont try to open a file descriptor for the overflow event

We should not try to open a file descriptor for the overflow event since this
will always fail.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ca54957b1f61..dccd7985e65a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -110,6 +110,8 @@ static int fill_event_metadata(struct fsnotify_group *group,
 				   struct fanotify_event_metadata *metadata,
 				   struct fsnotify_event *event)
 {
+	int ret = 0;
+
 	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
 		 group, metadata, event);
 
@@ -117,9 +119,15 @@ static int fill_event_metadata(struct fsnotify_group *group,
 	metadata->vers = FANOTIFY_METADATA_VERSION;
 	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
-	metadata->fd = create_fd(group, event);
+	if (unlikely(event->mask & FAN_Q_OVERFLOW))
+		metadata->fd = FAN_NOFD;
+	else {
+		metadata->fd = create_fd(group, event);
+		if (metadata->fd < 0)
+			ret = metadata->fd;
+	}
 
-	return metadata->fd;
+	return ret;
 }
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -261,7 +269,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (ret < 0)
 		goto out;
 
-	fd = ret;
+	fd = fanotify_event_metadata.fd;
 	ret = prepare_for_access_response(group, event, fd);
 	if (ret)
 		goto out_close_fd;
@@ -275,7 +283,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 out_kill_access_response:
 	remove_access_response(group, event, fd);
 out_close_fd:
-	sys_close(fd);
+	if (fd != FAN_NOFD)
+		sys_close(fd);
 out:
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (event->mask & FAN_ALL_PERM_EVENTS) {
-- 
cgit v1.2.2


From 08a22b392a141c201d7ed4d435de942aa853acd3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 1 Dec 2010 10:42:16 +0000
Subject: nfs: Discard ACL cache on mode update

An update of mode bits can result in ACL value being changed. We need
to mark the acl cache invalid when we update mode. Similarly we need
to update file attribute when we change ACL value

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6a653ffd8e4e..4435e5e1f904 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3361,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
 	ret = nfs_revalidate_inode(server, inode);
 	if (ret < 0)
 		return ret;
+	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
 	ret = nfs4_read_cached_acl(inode, buf, buflen);
 	if (ret != -ENOENT)
 		return ret;
@@ -3389,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 	nfs_inode_return_delegation(inode);
 	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+	/*
+	 * Acl update can result in inode attribute update.
+	 * so mark the attribute cache invalid.
+	 */
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_unlock(&inode->i_lock);
 	nfs_access_zap_cache(inode);
 	nfs_zap_acl_cache(inode);
 	return ret;
-- 
cgit v1.2.2


From 21ac19d484a8ffb66f64487846c8d53afef04d2b Mon Sep 17 00:00:00 2001
From: Sergey Vlasov <vsu@altlinux.ru>
Date: Sun, 28 Nov 2010 21:04:05 +0000
Subject: NFS: Fix fcntl F_GETLK not reporting some conflicts

The commit 129a84de2347002f09721cda3155ccfd19fade40 (locks: fix F_GETLK
regression (failure to find conflicts)) fixed the posix_test_lock()
function by itself, however, its usage in NFS changed by the commit
9d6a8c5c213e34c475e72b245a8eb709258e968c (locks: give posix_test_lock
same interface as ->lock) remained broken - subsequent NFS-specific
locking code received F_UNLCK instead of the user-specified lock type.
To fix the problem, fl->fl_type needs to be saved before the
posix_test_lock() call and restored if no local conflicts were reported.

Reference: https://bugzilla.kernel.org/show_bug.cgi?id=23892
Tested-by: Alexander Morozov <amorozov@etersoft.ru>
Signed-off-by: Sergey Vlasov <vsu@altlinux.ru>
Cc: <stable@kernel.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 60677f9f1311..7bf029ef4084 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
+	unsigned int saved_type = fl->fl_type;
 
 	/* Try local locking first */
 	posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 		/* found a conflict */
 		goto out;
 	}
+	fl->fl_type = saved_type;
 
 	if (nfs_have_delegation(inode, FMODE_READ))
 		goto out_noconflict;
-- 
cgit v1.2.2


From 0de1b7e800188782973598158e0acbb9e08e6c99 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Sat, 30 Oct 2010 10:19:33 +0800
Subject: nfs: kernel should return EPROTONOSUPPORT when not support NFSv4

  When nfs client(kernel) don't support NFSv4, maybe user build
  kernel without NFSv4, there is a problem.

  Using command "mount SERVER-IP:/nfsv3 /mnt/" to mount NFSv3
  filesystem, mount should should success, but fail and get error:

    "mount.nfs: an incorrect mount option was specified"

  System call mount "nfs"(not "nfs4") with "vers=4",
  if CONFIG_NFS_V4 is not defined, the "vers=4" will be parsed
  as invalid argument and kernel return EINVAL to nfs-utils.

  About that, we really want get EPROTONOSUPPORT rather than
  EINVAL. This path make sure kernel parses argument success,
  and return EPROTONOSUPPORT at nfs_validate_mount_data().

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3c045044fca2..4100630c9a5b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1069,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->flags |= NFS_MOUNT_VER3;
 			mnt->version = 3;
 			break;
-#ifdef CONFIG_NFS_V4
 		case Opt_v4:
 			mnt->flags &= ~NFS_MOUNT_VER3;
 			mnt->version = 4;
 			break;
-#endif
 		case Opt_udp:
 			mnt->flags &= ~NFS_MOUNT_TCP;
 			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1286,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw,
 				mnt->flags |= NFS_MOUNT_VER3;
 				mnt->version = 3;
 				break;
-#ifdef CONFIG_NFS_V4
 			case NFS4_VERSION:
 				mnt->flags &= ~NFS_MOUNT_VER3;
 				mnt->version = 4;
 				break;
-#endif
 			default:
 				goto out_invalid_value;
 			}
-- 
cgit v1.2.2


From 2df485a774ba59c3f43bfe84107672c1d9b731a0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 7 Dec 2010 22:39:17 -0500
Subject: nfs: remove extraneous and problematic calls to nfs_clear_request

When a nfs_page is freed, nfs_free_request is called which also calls
nfs_clear_request to clean out the lock and open contexts and free the
pagecache page.

However, a couple of places in the nfs code call nfs_clear_request
themselves. What happens here if the refcount on the request is still high?
We'll be releasing contexts and freeing pointers while the request is
possibly still in use.

Remove those bare calls to nfs_clear_context. That should only be done when
the request is being freed.

Note that when doing this, we need to watch out for tests of req->wb_page.
Previously, nfs_set_page_tag_locked() and nfs_clear_page_tag_locked()
would check the value of req->wb_page to figure out if the page is mapped
into the nfsi->nfs_page_tree. We now indicate the page is mapped using
the new bit PG_MAPPED in req->wb_flags .

Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c | 4 ++--
 fs/nfs/read.c     | 1 -
 fs/nfs/write.c    | 3 ++-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63db..b68536cc9046 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 {
 	if (!nfs_lock_request_dontget(req))
 		return 0;
-	if (req->wb_page != NULL)
+	if (test_bit(PG_MAPPED, &req->wb_flags))
 		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
@@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
  */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-	if (req->wb_page != NULL) {
+	if (test_bit(PG_MAPPED, &req->wb_flags)) {
 		struct inode *inode = req->wb_context->path.dentry->d_inode;
 		struct nfs_inode *nfsi = NFS_I(inode);
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6e..aedcaa7f291f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
-	nfs_clear_request(req);
 	nfs_release_request(req);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a5276..10d648ea128b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 		if (nfs_have_delegation(inode, FMODE_WRITE))
 			nfsi->change_attr++;
 	}
+	set_bit(PG_MAPPED, &req->wb_flags);
 	SetPagePrivate(req->wb_page);
 	set_page_private(req->wb_page, (unsigned long)req);
 	nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	spin_lock(&inode->i_lock);
 	set_page_private(req->wb_page, 0);
 	ClearPagePrivate(req->wb_page);
+	clear_bit(PG_MAPPED, &req->wb_flags);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
 	nfsi->npages--;
 	if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		iput(inode);
 	} else
 		spin_unlock(&inode->i_lock);
-	nfs_clear_request(req);
 	nfs_release_request(req);
 }
 
-- 
cgit v1.2.2


From c1ac3ffcd0bc7e9617f62be8c7043d53ab84deac Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Thu, 2 Dec 2010 11:14:30 +1100
Subject: nfsd: Fix possible BUG_ON firing in set_change_info

If vfs_getattr in fill_post_wcc returns an error, we don't
set fh_post_change.
For NFSv4, this can result in set_change_info triggering a BUG_ON.
i.e. fh_post_saved being zero isn't really a bug.

So:
 - instead of BUGging when fh_post_saved is zero, just clear ->atomic.
 - if vfs_getattr fails in fill_post_wcc, take a copy of i_ctime anyway.
   This will be used i seg_change_info, but not overly trusted.
 - While we are there, remove the pointless 'if' statements in set_change_info.
   There is no harm setting all the values.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3xdr.c |  6 ++++--
 fs/nfsd/xdr4.h    | 21 ++++++++++-----------
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a9..7e84a852cdae 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
 	err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
 			&fhp->fh_post_attr);
 	fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
-	if (err)
+	if (err) {
 		fhp->fh_post_saved = 0;
-	else
+		/* Grab the ctime anyway - set_change_info might use it */
+		fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
+	} else
 		fhp->fh_post_saved = 1;
 }
 
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae6..60fce3dc5cb5 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
 static inline void
 set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
 {
-	BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
-	cinfo->atomic = 1;
+	BUG_ON(!fhp->fh_pre_saved);
+	cinfo->atomic = fhp->fh_post_saved;
 	cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
-	if (cinfo->change_supported) {
-		cinfo->before_change = fhp->fh_pre_change;
-		cinfo->after_change = fhp->fh_post_change;
-	} else {
-		cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
-		cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
-		cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
-		cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
-	}
+
+	cinfo->before_change = fhp->fh_pre_change;
+	cinfo->after_change = fhp->fh_post_change;
+	cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+	cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+	cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+	cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+
 }
 
 int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
-- 
cgit v1.2.2


From 545c988b2018a593f24d291b66776a0d08525acd Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Mon, 6 Dec 2010 13:45:50 +0530
Subject: cifs: remove bogus remapping of error in cifs_filldir()

As the FIXME points out correctly, now filldir() itself returns -EOVERFLOW if
it not possible to represent the inode number supplied by the filesystem in
the field provided by userspace.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 32d300e8f20e..a73eb9f4bdaf 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -759,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 	rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
 		     ino, fattr.cf_dtype);
 
-	/*
-	 * we can not return filldir errors to the caller since they are
-	 * "normal" when the stat blocksize is too small - we return remapped
-	 * error instead
-	 *
-	 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
-	 * case already. Why should we be clobbering other errors from it?
-	 */
-	if (rc) {
-		cFYI(1, "filldir rc = %d", rc);
-		rc = -EOVERFLOW;
-	}
 	dput(tmp_dentry);
 	return rc;
 }
-- 
cgit v1.2.2


From 955256f2c3e25c94ad373c43fbc38d2ac8af2a71 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Nov 2010 09:41:10 -0500
Subject: Btrfs: fix use after free in O_DIRECT

This fixes a bug where we use dip after we have freed it.  Instead just use the
file_offset that was passed to the function.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0f34cae0a633..ae6c0d190bc1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5934,8 +5934,7 @@ free_ordered:
 	 */
 	if (write) {
 		struct btrfs_ordered_extent *ordered;
-		ordered = btrfs_lookup_ordered_extent(inode,
-						      dip->logical_offset);
+		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
 			btrfs_free_reserved_extent(root, ordered->start,
-- 
cgit v1.2.2


From 2b20982e3154266106573beac2a4d4ba57a2789a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 3 Dec 2010 13:17:53 -0500
Subject: Btrfs: deal with space cache errors better

Currently if the space cache inode generation number doesn't match the
generation number in the space cache header we will just fail to load the space
cache, but we won't mark the space cache as an error, so we'll keep getting that
error each time somebody tries to cache that block group until we actually clear
the thing.  Fix this by marking the space cache as having an error so we only
get the message once.  This patch also makes it so that we don't try and setup
space cache for a block group that isn't cached, since we won't be able to write
it out anyway.  None of these problems are actual problems, they are just
annoying and sub-optimal.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c      | 10 ++++++----
 fs/btrfs/free-space-cache.c | 12 +++++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ddaf6340fe7f..8c56f5b38948 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2742,6 +2742,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	struct btrfs_root *root = block_group->fs_info->tree_root;
 	struct inode *inode = NULL;
 	u64 alloc_hint = 0;
+	int dcs = BTRFS_DC_ERROR;
 	int num_pages = 0;
 	int retries = 0;
 	int ret = 0;
@@ -2796,6 +2797,8 @@ again:
 
 	spin_lock(&block_group->lock);
 	if (block_group->cached != BTRFS_CACHE_FINISHED) {
+		/* We're not cached, don't bother trying to write stuff out */
+		dcs = BTRFS_DC_WRITTEN;
 		spin_unlock(&block_group->lock);
 		goto out_put;
 	}
@@ -2822,6 +2825,8 @@ again:
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
 					      num_pages, num_pages,
 					      &alloc_hint);
+	if (!ret)
+		dcs = BTRFS_DC_SETUP;
 	btrfs_free_reserved_data_space(inode, num_pages);
 out_put:
 	iput(inode);
@@ -2829,10 +2834,7 @@ out_free:
 	btrfs_release_path(root, path);
 out:
 	spin_lock(&block_group->lock);
-	if (ret)
-		block_group->disk_cache_state = BTRFS_DC_ERROR;
-	else
-		block_group->disk_cache_state = BTRFS_DC_SETUP;
+	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
 
 	return ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b8..60d684266959 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 		       (unsigned long long)BTRFS_I(inode)->generation,
 		       (unsigned long long)generation,
 		       (unsigned long long)block_group->key.objectid);
-		goto out;
+		goto free_cache;
 	}
 
 	if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		return 0;
 	}
 
+	node = rb_first(&block_group->free_space_offset);
+	if (!node) {
+		iput(inode);
+		return 0;
+	}
+
 	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 	filemap_write_and_wait(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	 */
 	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
 
-	node = rb_first(&block_group->free_space_offset);
-	if (!node)
-		goto out_free;
-
 	/*
 	 * Lock all pages first so we can lock the extent safely.
 	 *
-- 
cgit v1.2.2


From b8399dee478db7939cd0d6fda8ecacddf2facd03 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 8 Dec 2010 09:15:11 -0500
Subject: Btrfs: do not do fast caching if we are allocating blocks for
 tree_root

Since the fast caching uses normal tree locking, we can possibly deadlock if we
get to the caching via a btrfs_search_slot() on the tree_root.  So just check to
see if the root we are on is the tree root, and just don't do the fast caching.

Reported-by: Sage Weil <sage@newdream.net>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8c56f5b38948..cec05e100142 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
 			     struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
 			     int load_cache_only)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 
 	/*
 	 * We can't do the read from on-disk cache during a commit since we need
-	 * to have the normal tree locking.
+	 * to have the normal tree locking.  Also if we are currently trying to
+	 * allocate blocks for the tree root we can't do the fast caching since
+	 * we likely hold important locks.
 	 */
-	if (!trans->transaction->in_commit) {
+	if (!trans->transaction->in_commit &&
+	    (root && root != root->fs_info->tree_root)) {
 		spin_lock(&cache->lock);
 		if (cache->cached != BTRFS_CACHE_NO) {
 			spin_unlock(&cache->lock);
@@ -4083,7 +4087,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		 * space back to the block group, otherwise we will leak space.
 		 */
 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
-			cache_block_group(cache, trans, 1);
+			cache_block_group(cache, trans, NULL, 1);
 
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
@@ -4937,7 +4941,8 @@ have_block_group:
 		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
 			u64 free_percent;
 
-			ret = cache_block_group(block_group, trans, 1);
+			ret = cache_block_group(block_group, trans,
+						orig_root, 1);
 			if (block_group->cached == BTRFS_CACHE_FINISHED)
 				goto have_block_group;
 
@@ -4961,7 +4966,8 @@ have_block_group:
 			if (loop > LOOP_CACHING_NOWAIT ||
 			    (loop > LOOP_FIND_IDEAL &&
 			     atomic_read(&space_info->caching_threads) < 2)) {
-				ret = cache_block_group(block_group, trans, 0);
+				ret = cache_block_group(block_group, trans,
+							orig_root, 0);
 				BUG_ON(ret);
 			}
 			found_uncached_bg = true;
@@ -5518,7 +5524,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	u64 num_bytes = ins->offset;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	cache_block_group(block_group, trans, 0);
+	cache_block_group(block_group, trans, NULL, 0);
 	caching_ctl = get_caching_control(block_group);
 
 	if (!caching_ctl) {
-- 
cgit v1.2.2


From 7e1fea731da8c1b5fcf5d8e157befd389b030760 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 8 Dec 2010 12:22:34 -0500
Subject: Btrfs: fixup return code for btrfs_del_orphan_item

If the orphan item doesn't exist, we return 1, which doesn't make any sense to
the callers.  Instead return -ENOENT if we didn't find the item.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/orphan.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret)
+	if (ret < 0)
 		goto out;
+	if (ret) {
+		ret = -ENOENT;
+		goto out;
+	}
 
 	ret = btrfs_del_item(trans, root, path);
 
-- 
cgit v1.2.2


From 05340d4ab2ec2b6b4962c1c41c6ea8fb550f947b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 7 Dec 2010 10:16:41 +0000
Subject: xfs: log timestamp changes to the source inode in rename

Now that we don't mark VFS inodes dirty anymore for internal
timestamp changes, but rely on the transaction subsystem to push
them out, we need to explicitly log the source inode in rename after
updating it's timestamps to make sure the changes actually get
forced out by sync/fsync or an AIL push.

We already account for the fourth inode in the log reservation, as a
rename of directories needs to update the nlink field, so just
adding the xfs_trans_log_inode call is enough.

This fixes the xfsqa 065 regression introduced by:

	"xfs: don't use vfs writeback for pure metadata modifications"

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_rename.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a6..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
 	 * it and some incremental backup programs won't work without it.
 	 */
 	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
 
 	/*
 	 * Adjust the link count on src_dp.  This is necessary when
-- 
cgit v1.2.2


From 5b362ac3799ff4225c40935500f520cad4d7ed66 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 10 Dec 2010 12:31:14 -0500
Subject: NFS: Fix panic after nfs_umount()

After a few unsuccessful NFS mount attempts in which the client and
server cannot agree on an authentication flavor both support, the
client panics.  nfs_umount() is invoked in the kernel in this case.

Turns out nfs_umount()'s UMNT RPC invocation causes the RPC client to
write off the end of the rpc_clnt's iostat array.  This is because the
mount client's nrprocs field is initialized with the count of defined
procedures (two: MNT and UMNT), rather than the size of the client's
proc array (four).

The fix is to use the same initialization technique used by most other
upper layer clients in the kernel.

Introduced by commit 0b524123, which failed to update nrprocs when
support was added for UMNT in the kernel.

BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=24302
BugLink: http://bugs.launchpad.net/bugs/683938

Reported-by: Stefan Bader <stefan.bader@canonical.com>
Tested-by: Stefan Bader <stefan.bader@canonical.com>
Cc: stable@kernel.org # >= 2.6.32
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/mount_clnt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f473..4f981f1f6689 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
 
 static struct rpc_version mnt_version1 = {
 	.number		= 1,
-	.nrprocs	= 2,
+	.nrprocs	= ARRAY_SIZE(mnt_procedures),
 	.procs		= mnt_procedures,
 };
 
 static struct rpc_version mnt_version3 = {
 	.number		= 3,
-	.nrprocs	= 2,
+	.nrprocs	= ARRAY_SIZE(mnt3_procedures),
 	.procs		= mnt3_procedures,
 };
 
-- 
cgit v1.2.2


From 84cd948cb11041f205242de457e680b9bb872a36 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 8 Dec 2010 12:24:01 -0500
Subject: Btrfs: do not BUG if we fail to remove the orphan item for dead
 snapshots

Not being able to delete an orphan item isn't a horrible thing.  The worst that
happens is the next time around we try and do the orphan cleanup and we can't
find the referenced object and just delete the item and move on.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cec05e100142..41133b064d72 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6309,9 +6309,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 					   NULL, NULL);
 		BUG_ON(ret < 0);
 		if (ret > 0) {
-			ret = btrfs_del_orphan_item(trans, tree_root,
-						    root->root_key.objectid);
-			BUG_ON(ret);
+			/* if we fail to delete the orphan item this time
+			 * around, it'll get picked up the next time.
+			 *
+			 * The most common failure here is just -ENOENT.
+			 */
+			btrfs_del_orphan_item(trans, tree_root,
+					      root->root_key.objectid);
 		}
 	}
 
-- 
cgit v1.2.2


From 24ae63656a165c870c0d69fcc8aac1dc35e25e34 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@linux.intel.com>
Date: Mon, 6 Dec 2010 07:02:36 +0000
Subject: Btrfs: Fix page leak in compressed writeback path

"start + num_bytes >= actual_end" can happen when compressed page writeback races
with file truncation. In that case we need unlock and release pages past the end
of file.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ae6c0d190bc1..4875d69871b5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -495,7 +495,7 @@ again:
 		add_async_extent(async_cow, start, num_bytes,
 				 total_compressed, pages, nr_pages_ret);
 
-		if (start + num_bytes < end && start + num_bytes < actual_end) {
+		if (start + num_bytes < end) {
 			start += num_bytes;
 			pages = NULL;
 			cond_resched();
-- 
cgit v1.2.2


From 75eaa0e22c055e38982df267d0f84cc510ba38bf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 10 Dec 2010 00:36:28 +0000
Subject: Btrfs: fix sync subvol/snapshot creation

We were incorrectly taking the async path even for the sync ioctls by
passing in &transid unconditionally.

There's ample room for further cleanup here, but this keeps the fix simple.

Signed-off-by: Sage Weil <sage@newdream.net>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f1c9bb4079ed..7cc2e8e075b4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -964,6 +964,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		name = async_vol_args->name;
 		fd = async_vol_args->fd;
 		async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+
+		ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+						      subvol, &transid);
+
+		if (ret == 0 &&
+		    copy_to_user(arg +
+				 offsetof(struct btrfs_ioctl_async_vol_args,
+					  transid), &transid, sizeof(transid)))
+			ret = -EFAULT;
 	} else {
 		vol_args = memdup_user(arg, sizeof(*vol_args));
 		if (IS_ERR(vol_args))
@@ -971,16 +980,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		name = vol_args->name;
 		fd = vol_args->fd;
 		vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	}
 
-	ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-					      subvol, &transid);
-
-	if (!ret && async) {
-		if (copy_to_user(arg +
-				offsetof(struct btrfs_ioctl_async_vol_args,
-				transid), &transid, sizeof(transid)))
-			return -EFAULT;
+		ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+						      subvol, NULL);
 	}
 
 	kfree(vol_args);
-- 
cgit v1.2.2


From f106e82caaa0d943e47cacc184f5b40d538e0044 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 7 Dec 2010 01:51:26 +0000
Subject: Btrfs: Fix a crash when mounting a subvolume

We should drop dentry before deactivating the superblock, otherwise
we can hit this bug:

BUG: Dentry f349a690{i=100,n=/} still in use (1) [unmount of btrfs loop1]
...

Steps to reproduce the bug:

  # mount /dev/loop1 /mnt
  # mkdir save
  # btrfs subvolume snapshot /mnt save/snap1
  # umount /mnt
  # mount -o subvol=save/snap1 /dev/loop1 /mnt
  (crash)

Reported-by: Michael Niederle <mniederle@gmx.at>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 47bf67cbe6bf..61bd79abb805 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -685,9 +685,9 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		mutex_unlock(&root->d_inode->i_mutex);
 
 		if (IS_ERR(new_root)) {
+			dput(root);
 			deactivate_locked_super(s);
 			error = PTR_ERR(new_root);
-			dput(root);
 			goto error_free_subvol_name;
 		}
 		if (!new_root->d_inode) {
-- 
cgit v1.2.2


From 914ee295af418e936ec20a08c1663eaabe4cd07a Mon Sep 17 00:00:00 2001
From: Xin Zhong <xin.zhong@intel.com>
Date: Thu, 9 Dec 2010 09:30:14 +0000
Subject: Btrfs: pwrite blocked when writing from the mmaped buffer of the same
 page

This problem is found in meego testing:
http://bugs.meego.com/show_bug.cgi?id=6672
A file in btrfs is mmaped and the mmaped buffer is passed to pwrite to write to the same page
of the same file. In btrfs_file_aio_write(), the pages is locked by prepare_pages(). So when
btrfs_copy_from_user() is called, page fault happens and the same page needs to be locked again
in filemap_fault(). The fix is to move iov_iter_fault_in_readable() before prepage_pages() to make page
fault happen before pages are locked. And also disable page fault in critical region in
btrfs_copy_from_user().

Reviewed-by: Yan, Zheng<zheng.z.yan@intel.com>
Signed-off-by: Zhong, Xin <xin.zhong@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 92 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c1faded5fca0..66836d85763b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 struct page **prepared_pages,
 					 struct iov_iter *i)
 {
-	size_t copied;
+	size_t copied = 0;
 	int pg = 0;
 	int offset = pos & (PAGE_CACHE_SIZE - 1);
+	int total_copied = 0;
 
 	while (write_bytes > 0) {
 		size_t count = min_t(size_t,
 				     PAGE_CACHE_SIZE - offset, write_bytes);
 		struct page *page = prepared_pages[pg];
-again:
-		if (unlikely(iov_iter_fault_in_readable(i, count)))
-			return -EFAULT;
-
-		/* Copy data from userspace to the current page */
-		copied = iov_iter_copy_from_user(page, i, offset, count);
+		/*
+		 * Copy data from userspace to the current page
+		 *
+		 * Disable pagefault to avoid recursive lock since
+		 * the pages are already locked
+		 */
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
 		iov_iter_advance(i, copied);
 		write_bytes -= copied;
+		total_copied += copied;
 
+		/* Return to btrfs_file_aio_write to fault page */
 		if (unlikely(copied == 0)) {
-			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
-				      iov_iter_single_seg_count(i));
-			goto again;
+			break;
 		}
 
 		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +85,7 @@ again:
 			offset = 0;
 		}
 	}
-	return 0;
+	return total_copied;
 }
 
 /*
@@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	unsigned long last_index;
 	int will_write;
 	int buffered = 0;
+	int copied = 0;
+	int dirty_pages = 0;
 
 	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));
@@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
-		ret = btrfs_delalloc_reserve_space(inode, write_bytes);
+		/*
+		 * Fault pages before locking them in prepare_pages
+		 * to avoid recursive lock
+		 */
+		if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret = btrfs_delalloc_reserve_space(inode,
+					num_pages << PAGE_CACHE_SHIFT);
 		if (ret)
 			goto out;
 
@@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 				    pos, first_index, last_index,
 				    write_bytes);
 		if (ret) {
-			btrfs_delalloc_release_space(inode, write_bytes);
+			btrfs_delalloc_release_space(inode,
+					num_pages << PAGE_CACHE_SHIFT);
 			goto out;
 		}
 
-		ret = btrfs_copy_from_user(pos, num_pages,
+		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, &i);
-		if (ret == 0) {
+		dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT;
+
+		if (num_pages > dirty_pages) {
+			if (copied > 0)
+				atomic_inc(
+					&BTRFS_I(inode)->outstanding_extents);
+			btrfs_delalloc_release_space(inode,
+					(num_pages - dirty_pages) <<
+					PAGE_CACHE_SHIFT);
+		}
+
+		if (copied > 0) {
 			dirty_and_release_pages(NULL, root, file, pages,
-						num_pages, pos, write_bytes);
+						dirty_pages, pos, copied);
 		}
 
 		btrfs_drop_pages(pages, num_pages);
-		if (ret) {
-			btrfs_delalloc_release_space(inode, write_bytes);
-			goto out;
-		}
 
-		if (will_write) {
-			filemap_fdatawrite_range(inode->i_mapping, pos,
-						 pos + write_bytes - 1);
-		} else {
-			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-							   num_pages);
-			if (num_pages <
-			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-				btrfs_btree_balance_dirty(root, 1);
-			btrfs_throttle(root);
+		if (copied > 0) {
+			if (will_write) {
+				filemap_fdatawrite_range(inode->i_mapping, pos,
+							 pos + copied - 1);
+			} else {
+				balance_dirty_pages_ratelimited_nr(
+							inode->i_mapping,
+							dirty_pages);
+				if (dirty_pages <
+				(root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+					btrfs_btree_balance_dirty(root, 1);
+				btrfs_throttle(root);
+			}
 		}
 
-		pos += write_bytes;
-		num_written += write_bytes;
+		pos += copied;
+		num_written += copied;
 
 		cond_resched();
 	}
-- 
cgit v1.2.2


From fdfb1e4f6c61477a61890b64974d65cdc3a98702 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 10 Dec 2010 06:41:56 +0000
Subject: Btrfs: Make async snapshot ioctl more generic

If we had reserved some bytes in struct btrfs_ioctl_vol_args, we
wouldn't have to create a new structure for async snapshot creation.

Here we convert async snapshot ioctl to use a more generic ABI, as
we'll add more ioctls for snapshots/subvolumes in the future, readonly
snapshots for example.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 44 +++++++++++++++++++++++++++-----------------
 fs/btrfs/ioctl.h | 14 +++++++++-----
 2 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7cc2e8e075b4..f87552a1d7ea 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -947,31 +947,41 @@ out:
 
 static noinline int btrfs_ioctl_snap_create(struct file *file,
 					    void __user *arg, int subvol,
-					    int async)
+					    int v2)
 {
 	struct btrfs_ioctl_vol_args *vol_args = NULL;
-	struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+	struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
 	char *name;
 	u64 fd;
-	u64 transid = 0;
 	int ret;
 
-	if (async) {
-		async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
-		if (IS_ERR(async_vol_args))
-			return PTR_ERR(async_vol_args);
+	if (v2) {
+		u64 transid = 0;
+		u64 *ptr = NULL;
 
-		name = async_vol_args->name;
-		fd = async_vol_args->fd;
-		async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+		vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
+		if (IS_ERR(vol_args_v2))
+			return PTR_ERR(vol_args_v2);
+
+		if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		name = vol_args_v2->name;
+		fd = vol_args_v2->fd;
+		vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+
+		if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+			ptr = &transid;
 
 		ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-						      subvol, &transid);
+						      subvol, ptr);
 
-		if (ret == 0 &&
+		if (ret == 0 && ptr &&
 		    copy_to_user(arg +
-				 offsetof(struct btrfs_ioctl_async_vol_args,
-					  transid), &transid, sizeof(transid)))
+				 offsetof(struct btrfs_ioctl_vol_args_v2,
+					  transid), ptr, sizeof(*ptr)))
 			ret = -EFAULT;
 	} else {
 		vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -984,9 +994,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		ret = btrfs_ioctl_snap_create_transid(file, name, fd,
 						      subvol, NULL);
 	}
-
+out:
 	kfree(vol_args);
-	kfree(async_vol_args);
+	kfree(vol_args_v2);
 
 	return ret;
 }
@@ -2248,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_getversion(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 0, 0);
-	case BTRFS_IOC_SNAP_CREATE_ASYNC:
+	case BTRFS_IOC_SNAP_CREATE_V2:
 		return btrfs_ioctl_snap_create(file, argp, 0, 1);
 	case BTRFS_IOC_SUBVOL_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 1, 0);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf960..c344d12c646b 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args {
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
-#define BTRFS_SNAPSHOT_NAME_MAX 4079
-struct btrfs_ioctl_async_vol_args {
+#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
+
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
 	__s64 fd;
 	__u64 transid;
-	char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+	__u64 flags;
+	__u64 unused[4];
+	char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args {
 				    struct btrfs_ioctl_space_args)
 #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
-				   struct btrfs_ioctl_async_vol_args)
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
+				   struct btrfs_ioctl_vol_args_v2)
 #endif
-- 
cgit v1.2.2


From 3dd1462e82bcab7625cec129952f26dae7a8b742 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 7 Dec 2010 14:54:09 +0000
Subject: Btrfs: fix compiler warnings

... regarding an unused function when !MIGRATION, and regarding a
printk() format string vs argument mismatch.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 6 ++----
 fs/btrfs/inode.c   | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 33b6d459494c..b803c2667673 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -696,6 +696,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				   __btree_submit_bio_done);
 }
 
+#ifdef CONFIG_MIGRATION
 static int btree_migratepage(struct address_space *mapping,
 			struct page *newpage, struct page *page)
 {
@@ -712,12 +713,9 @@ static int btree_migratepage(struct address_space *mapping,
 	if (page_has_private(page) &&
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
-#ifdef CONFIG_MIGRATION
 	return migrate_page(mapping, newpage, page);
-#else
-	return -ENOSYS;
-#endif
 }
+#endif
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4875d69871b5..5f9194438f7c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5712,9 +5712,9 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 
 	if (err) {
 		printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
-		      "disk_bytenr %lu len %u err no %d\n",
-		      dip->inode->i_ino, bio->bi_rw, bio->bi_sector,
-		      bio->bi_size, err);
+		      "sector %#Lx len %u err no %d\n",
+		      dip->inode->i_ino, bio->bi_rw,
+		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
 		dip->errors = 1;
 
 		/*
-- 
cgit v1.2.2


From 68433b73b104bff388aac376631d32abbbd872b0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 13 Dec 2010 14:47:58 -0500
Subject: Btrfs: EIO when we fail to read tree roots

If we just get a plain IO error when we read tree roots, the code
wasn't properly sending that error up the chain.  This allowed mounts to
continue when they should failed, and allowed operations
on partially setup root structs.  The end result was usually oopsen
on spinlocks that hadn't been spun up correctly.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b803c2667673..a5d2249e6da5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1007,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	BUG_ON(!root->node);
+	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+		free_extent_buffer(root->node);
+		return -EIO;
+	}
 	root->commit_root = btrfs_root_node(root);
 	return 0;
 }
-- 
cgit v1.2.2


From cd02dca56442e1504fd6bc5b96f7f1870162b266 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 13 Dec 2010 14:56:23 -0500
Subject: Btrfs: account for missing devices in RAID allocation profiles

When we mount in RAID degraded mode without adding a new device to
replace the failed one, we can end up using the wrong RAID flags for
allocations.

This results in strange combinations of block groups (raid1 in a raid10
filesystem) and corruptions when we try to allocate blocks from single
spindle chunks on drives that are actually missing.

The first device has two small 4MB chunks in it that mkfs creates and
these are usually unused in a raid1 or raid10 setup.  But, in -o degraded,
the allocator will fall back to these because the mask of desired raid groups
isn't correct.

The fix here is to count the missing devices as we build up the list
of devices in the system.  This count is used when picking the
raid level to make sure we continue using the same levels that were
in place before we lost a drive.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 17 +++++++++++++++--
 fs/btrfs/volumes.c     | 20 +++++++++++++++++++-
 fs/btrfs/volumes.h     |  2 ++
 3 files changed, 36 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41133b064d72..4be231e0d2bd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3044,7 +3044,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	u64 num_devices = root->fs_info->fs_devices->rw_devices;
+	/*
+	 * we add in the count of missing devices because we want
+	 * to make sure that any RAID levels on a degraded FS
+	 * continue to be honored.
+	 */
+	u64 num_devices = root->fs_info->fs_devices->rw_devices +
+		root->fs_info->fs_devices->missing_devices;
 
 	if (num_devices == 1)
 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -7891,7 +7897,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = root->fs_info->fs_devices->rw_devices;
+	/*
+	 * we add in the count of missing devices because we want
+	 * to make sure that any RAID levels on a degraded FS
+	 * continue to be honored.
+	 */
+	num_devices = root->fs_info->fs_devices->rw_devices +
+		root->fs_info->fs_devices->missing_devices;
+
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 91851b555e2e..177b73179590 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -413,12 +413,16 @@ static noinline int device_list_add(const char *path,
 
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
-	} else if (strcmp(device->name, path)) {
+	} else if (!device->name || strcmp(device->name, path)) {
 		name = kstrdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
 		kfree(device->name);
 		device->name = name;
+		if (device->missing) {
+			fs_devices->missing_devices--;
+			device->missing = 0;
+		}
 	}
 
 	if (found_transid > fs_devices->latest_trans) {
@@ -1238,6 +1242,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	device->fs_devices->num_devices--;
 
+	if (device->missing)
+		root->fs_info->fs_devices->missing_devices--;
+
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
 				 struct btrfs_device, dev_list);
 	if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3084,7 +3091,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
 	device->fs_devices = fs_devices;
+	device->missing = 1;
 	fs_devices->num_devices++;
+	fs_devices->missing_devices++;
 	spin_lock_init(&device->io_lock);
 	INIT_LIST_HEAD(&device->dev_alloc_list);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3282,6 +3291,15 @@ static int read_one_dev(struct btrfs_root *root,
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
+		} else if (!device->missing) {
+			/*
+			 * this happens when a device that was properly setup
+			 * in the device info lists suddenly goes bad.
+			 * device->bdev is NULL, and so we have to set
+			 * device->missing to one here
+			 */
+			root->fs_info->fs_devices->missing_devices++;
+			device->missing = 1;
 		}
 	}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..a668c0116982 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -45,6 +45,7 @@ struct btrfs_device {
 	int barriers;
 	int writeable;
 	int in_fs_metadata;
+	int missing;
 
 	spinlock_t io_lock;
 
@@ -94,6 +95,7 @@ struct btrfs_fs_devices {
 	u64 num_devices;
 	u64 open_devices;
 	u64 rw_devices;
+	u64 missing_devices;
 	u64 total_rw_bytes;
 	struct block_device *latest_bdev;
 
-- 
cgit v1.2.2


From 83a50de97fe96aca82389e061862ed760ece2283 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 13 Dec 2010 15:06:46 -0500
Subject: Btrfs: prevent RAID level downgrades when space is low

The extent allocator has code that allows us to fill
allocations from any available block group, even if it doesn't
match the raid level we've requested.

This was put in because adding a new drive to a filesystem
made with the default mkfs options actually upgrades the metadata from
single spindle dup to full RAID1.

But, the code also allows us to allocate from a raid0 chunk when we
really want a raid1 or raid10 chunk.  This can cause big trouble because
mkfs creates a small (4MB) raid0 chunk for data and metadata which then
goes unused for raid1/raid10 installs.

The allocator will happily wander in and allocate from that chunk when
things get tight, which is not correct.

The fix here is to make sure that we provide duplication when the
caller has asked for it.  It does all the dups to be any raid level,
which preserves the dup->raid1 upgrade abilities.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4be231e0d2bd..7e5162e5c411 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4943,6 +4943,25 @@ search:
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
+		/*
+		 * this can happen if we end up cycling through all the
+		 * raid types, but we want to make sure we only allocate
+		 * for the proper type.
+		 */
+		if (!block_group_bits(block_group, data)) {
+		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
+				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID10;
+
+			/*
+			 * if they asked for extra copies and this block group
+			 * doesn't provide them, bail.  This does allow us to
+			 * fill raid0 from raid1.
+			 */
+			if ((data & extra) && !(block_group->flags & extra))
+				goto loop;
+		}
+
 have_block_group:
 		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
 			u64 free_percent;
@@ -8273,7 +8292,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		if (ret != 0)
 			goto error;
-
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
-- 
cgit v1.2.2


From 1449032be17abb69116dbc393f67ceb8bd034f92 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 14 Dec 2010 15:27:50 -0500
Subject: ext4: Turn off multiple page-io submission by default

Jon Nelson has found a test case which causes postgresql to fail with
the error:

psql:t.sql:4: ERROR: invalid page header in block 38269 of relation base/16384/16581

Under memory pressure, it looks like part of a file can end up getting
replaced by zero's.  Until we can figure out the cause, we'll roll
back the change and use block_write_full_page() instead of
ext4_bio_write_page().  The new, more efficient writing function can
be used via the mount option mblk_io_submit, so we can test and fix
the new page I/O code.

To reproduce the problem, install postgres 8.4 or 9.0, and pin enough
memory such that the system just at the end of triggering writeback
before running the following sql script:

begin;
create temporary table foo as select x as a, ARRAY[x] as b FROM
generate_series(1, 10000000 ) AS x;
create index foo_a_idx on foo (a);
create index foo_b_idx on foo USING GIN (b);
rollback;

If the temporary table is created on a hard drive partition which is
encrypted using dm_crypt, then under memory pressure, approximately
30-40% of the time, pgsql will issue the above failure.

This patch should fix this problem, and the problem will come back if
the file system is mounted with the mblk_io_submit mount option.

Reported-by: Jon Nelson <jnelson@jamponi.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/inode.c |  5 ++++-
 fs/ext4/super.c | 14 ++++++++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6a5edea2d70b..94ce3d7a1c4b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -910,6 +910,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bdbe69902207..e659597b690b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2125,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			 */
 			if (unlikely(journal_data && PageChecked(page)))
 				err = __ext4_journalled_writepage(page, len);
-			else
+			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
 				err = ext4_bio_write_page(&io_submit, page,
 							  len, mpd->wbc);
+			else
+				err = block_write_full_page(page,
+					noalloc_get_block_write, mpd->wbc);
 
 			if (!err)
 				mpd->pages_written++;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e32195d6aac3..fb15c9c0be74 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1026,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	    !(def_mount_opts & EXT4_DEFM_NODELALLOC))
 		seq_puts(seq, ",nodelalloc");
 
+	if (test_opt(sb, MBLK_IO_SUBMIT))
+		seq_puts(seq, ",mblk_io_submit");
 	if (sbi->s_stripe)
 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
 	/*
@@ -1239,8 +1241,8 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
 	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
 	Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
-	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
-	Opt_block_validity, Opt_noblock_validity,
+	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
@@ -1304,6 +1306,8 @@ static const match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_mblk_io_submit, "mblk_io_submit"},
+	{Opt_nomblk_io_submit, "nomblk_io_submit"},
 	{Opt_block_validity, "block_validity"},
 	{Opt_noblock_validity, "noblock_validity"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1725,6 +1729,12 @@ set_qf_format:
 		case Opt_nodelalloc:
 			clear_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_mblk_io_submit:
+			set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+			break;
+		case Opt_nomblk_io_submit:
+			clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+			break;
 		case Opt_stripe:
 			if (match_int(&args[0], &option))
 				return 0;
-- 
cgit v1.2.2


From 6d5c3aa84b3e431f2d0fc39c73c867d1a4dd8cff Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Tue, 14 Dec 2010 21:45:31 -0500
Subject: ext4: fix typo which broke '..' detection in ext4_find_entry()

There should be a check for the NUL character instead of '0'.

Fortunately the only thing that cares about this is NFS serving, which
is why we didn't notice this in the merge window testing.

Reported-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099f..dc40e75cba88 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 	if ((namelen <= 2) && (name[0] == '.') &&
-	    (name[1] == '.' || name[1] == '0')) {
+	    (name[1] == '.' || name[1] == '\0')) {
 		/*
 		 * "." or ".." will only be in the first block
 		 * NFS may look up ".."; "." should be handled by the VFS
-- 
cgit v1.2.2


From 7d13162332f2b67a941d18cee20f1c0413e020de Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Dec 2010 15:27:57 -0500
Subject: fanotify: fill in the metadata_len field on struct
 fanotify_event_metadata

The fanotify_event_metadata now has a field which is supposed to
indicate the length of the metadata portion of the event.  Fill in that
field as well.

Based-in-part-on-patch-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index dccd7985e65a..8b61220cffc5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -116,6 +116,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
 		 group, metadata, event);
 
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
+	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
 	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
@@ -275,10 +276,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		goto out_close_fd;
 
 	ret = -EFAULT;
-	if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+	if (copy_to_user(buf, &fanotify_event_metadata,
+			 fanotify_event_metadata.event_len))
 		goto out_kill_access_response;
 
-	return FAN_EVENT_METADATA_LEN;
+	return fanotify_event_metadata.event_len;
 
 out_kill_access_response:
 	remove_access_response(group, event, fd);
-- 
cgit v1.2.2


From 462e635e5b73ba9a4c03913b77138cd57ce4b050 Mon Sep 17 00:00:00 2001
From: Tavis Ormandy <taviso@cmpxchg8b.com>
Date: Thu, 9 Dec 2010 15:29:42 +0100
Subject: install_special_mapping skips security_file_mmap check.

The install_special_mapping routine (used, for example, to setup the
vdso) skips the security check before insert_vm_struct, allowing a local
attacker to bypass the mmap_min_addr security restriction by limiting
the available pages for special mappings.

bprm_mm_init() also skips the check, and although I don't think this can
be used to bypass any restrictions, I don't see any reason not to have
the security check.

  $ uname -m
  x86_64
  $ cat /proc/sys/vm/mmap_min_addr
  65536
  $ cat install_special_mapping.s
  section .bss
      resb BSS_SIZE
  section .text
      global _start
      _start:
          mov     eax, __NR_pause
          int     0x80
  $ nasm -D__NR_pause=29 -DBSS_SIZE=0xfffed000 -f elf -o install_special_mapping.o install_special_mapping.s
  $ ld -m elf_i386 -Ttext=0x10000 -Tbss=0x11000 -o install_special_mapping install_special_mapping.o
  $ ./install_special_mapping &
  [1] 14303
  $ cat /proc/14303/maps
  0000f000-00010000 r-xp 00000000 00:00 0                                  [vdso]
  00010000-00011000 r-xp 00001000 00:19 2453665                            /home/taviso/install_special_mapping
  00011000-ffffe000 rwxp 00000000 00:00 0                                  [stack]

It's worth noting that Red Hat are shipping with mmap_min_addr set to
4096.

Signed-off-by: Tavis Ormandy <taviso@google.com>
Acked-by: Kees Cook <kees@ubuntu.com>
Acked-by: Robert Swiecki <swiecki@google.com>
[ Changed to not drop the error code - akpm ]
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index d68c378a3137..c62efcb959c7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -275,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+	if (err)
+		goto err;
+
 	err = insert_vm_struct(mm, vma);
 	if (err)
 		goto err;
-- 
cgit v1.2.2


From 947b10ae0aeda89fc066a7470fdba55f72b0b8fc Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 16 Dec 2010 09:57:57 +0900
Subject: nilfs2: fix regression of garbage collection ioctl

On 2.6.37-rc1, garbage collection ioctl of nilfs was broken due to the
commit 263d90cefc7d82a0 ("nilfs2: remove own inode hash used for GC"),
and leading to filesystem corruption.

The patch doesn't queue gc-inodes for log writer if they are reused
through the vfs inode cache.  Here, gc-inode is the inode which
buffers blocks to be relocated on GC.  That patch queues gc-inodes in
nilfs_init_gcinode() function, but this function is not called when
they don't have I_NEW flag.  Thus, some of live blocks are wrongly
overrode without being moved to new logs.

This resolves the problem by moving the gc-inode queueing to an outer
function to ensure it's done right.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/gcinode.c |  9 ---------
 fs/nilfs2/ioctl.c   | 12 ++++++++++++
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c4..caf9a6a3fb54 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 int nilfs_init_gcinode(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 
 	inode->i_mode = S_IFREG;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
 	ii->i_flags = 0;
 	nilfs_bmap_init_gc(ii->i_bmap);
 
-	/*
-	 * Add the inode to GC inode list. Garbage Collection
-	 * is serialized and no two processes manipulate the
-	 * list simultaneously.
-	 */
-	igrab(inode);
-	list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
-
 	return 0;
 }
 
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index e00d9457c256..b185e937a335 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
 				   struct nilfs_argv *argv, void *buf)
 {
 	size_t nmembs = argv->v_nmembs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct inode *inode;
 	struct nilfs_vdesc *vdesc;
 	struct buffer_head *bh, *n;
@@ -353,6 +354,17 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
 			ret = PTR_ERR(inode);
 			goto failed;
 		}
+		if (list_empty(&NILFS_I(inode)->i_dirty)) {
+			/*
+			 * Add the inode to GC inode list. Garbage Collection
+			 * is serialized and no two processes manipulate the
+			 * list simultaneously.
+			 */
+			igrab(inode);
+			list_add(&NILFS_I(inode)->i_dirty,
+				 &nilfs->ns_gc_inodes);
+		}
+
 		do {
 			ret = nilfs_ioctl_move_inode_block(inode, vdesc,
 							   &buffers);
-- 
cgit v1.2.2