From 43b0178eda1e7e5d1e205bbfd076ab5d6ecacc02 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 27 Oct 2010 23:19:04 +0200
Subject: nfsd: fix NULL dereference in setattr()

The original code would oops if this were called from nfsd4_setattr()
because "filpp" is NULL.

(Note this case is currently impossible, as long as we only give out
read delegations.)

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ad2bfa68d534..2d191293e6aa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3081,9 +3081,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		renew_client(dp->dl_client);
-		if (filpp)
+		if (filpp) {
 			*filpp = find_readable_file(dp->dl_file);
-		BUG_ON(!*filpp);
+			BUG_ON(!*filpp);
+		}
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
 		if (!stp)
-- 
cgit v1.2.2


From 5afa040b307952bb804eba34b21646da2842e14d Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 09:39:23 +0800
Subject: NFSv4.1: Make sure nfsd can decode SP4_SSV correctly at exchange_id

According to RFC, the argument of ssv_sp_parms4 is:

   struct ssv_sp_parms4 {
           state_protect_ops4      ssp_ops;
           sec_oid4                ssp_hash_algs<>;
           sec_oid4                ssp_encr_algs<>;
           uint32_t                ssp_window;
           uint32_t                ssp_num_gss_handles;
   };

If client send a exchange_id with SP4_SSV, server cann't decode
the SP4_SSV's ssp_hash_algs and ssp_encr_algs arguments correctly.

Because the kernel treat the two arguments as a signal
sec_oid4 struct, but should be a set of sec_oid4 struct.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a04026..71d7d339e44a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1005,7 +1005,7 @@ static __be32
 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
 			 struct nfsd4_exchange_id *exid)
 {
-	int dummy;
+	int dummy, tmp;
 	DECODE_HEAD;
 
 	READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1053,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
 
 		/* ssp_hash_algs<> */
 		READ_BUF(4);
-		READ32(dummy);
-		READ_BUF(dummy);
-		p += XDR_QUADLEN(dummy);
+		READ32(tmp);
+		while (tmp--) {
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+		}
 
 		/* ssp_encr_algs<> */
 		READ_BUF(4);
-		READ32(dummy);
-		READ_BUF(dummy);
-		p += XDR_QUADLEN(dummy);
+		READ32(tmp);
+		while (tmp--) {
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+		}
 
 		/* ssp_window and ssp_num_gss_handles */
 		READ_BUF(8);
-- 
cgit v1.2.2


From 044bc1d4324bfb34761cb361e404cb8d39c68777 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 12 Nov 2010 14:36:06 -0500
Subject: nfsd4: return serverfault on request for ssv

We're refusing to support a mandatory features of 4.1, so serverfault
seems the better error; see e.g.:

	http://www.ietf.org/mail-archive/web/nfsv4/current/msg07638.html

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2d191293e6aa..9e7f8af12f8f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1344,7 +1344,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	case SP4_NONE:
 		break;
 	case SP4_SSV:
-		return nfserr_encr_alg_unsupp;
+		return nfserr_serverfault;
 	default:
 		BUG();				/* checked by xdr code */
 	case SP4_MACH_CRED:
-- 
cgit v1.2.2


From ced6dfe9fc7128995c6d60627938944b430d82c8 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Thu, 11 Nov 2010 18:03:50 +0800
Subject: NFS4.1: server gets drc mem fail should reply error at create_session

When server gets drc mem fail, it should reply error to client.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9e7f8af12f8f..5d0ee0f0cb0e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -749,6 +749,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 	 */
 	slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
 	numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+	if (numslots < 1)
+		return NULL;
 
 	new = alloc_session(slotsize, numslots);
 	if (!new) {
-- 
cgit v1.2.2


From 1205065764f2eda3216ebe213143f69891ee3460 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Thu, 11 Nov 2010 18:03:40 +0800
Subject: NFS4.1: Fix bug server don't reply the right fore_channel to client
 at create_session

At the latest kernel(2.6.37-rc1), server just initialize the forechannel
at init_forechannel_attrs, but don't reflect it to reply.

After initialize the session success, we should copy the forechannel info
to nfsd4_create_session struct.

Reviewed-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5d0ee0f0cb0e..afa7525a86be 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1562,6 +1562,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	status = nfs_ok;
 	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
+	memcpy(&cr_ses->fore_channel, &new->se_fchannel,
+		sizeof(struct nfsd4_channel_attrs));
 	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
-- 
cgit v1.2.2


From 7ddbead6e6d3c730570a215ab9a6b1d126c54d34 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 4 Nov 2010 20:08:02 -0700
Subject: jffs2: use vzalloc

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/build.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02f..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
 	size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
 #ifndef __ECOS
 	if (jffs2_blocks_use_vmalloc(c))
-		c->blocks = vmalloc(size);
+		c->blocks = vzalloc(size);
 	else
 #endif
-		c->blocks = kmalloc(size, GFP_KERNEL);
+		c->blocks = kzalloc(size, GFP_KERNEL);
 	if (!c->blocks)
 		return -ENOMEM;
 
-	memset(c->blocks, 0, size);
 	for (i=0; i<c->nr_blocks; i++) {
 		INIT_LIST_HEAD(&c->blocks[i].list);
 		c->blocks[i].offset = i * c->sector_size;
-- 
cgit v1.2.2


From f326966b3df47f4fa7e90425f60efdd30c31fe19 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segoon@openwall.com>
Date: Sun, 14 Nov 2010 23:08:39 +0300
Subject: jffs2: fix error value sign

do_verify_xattr_datum(), do_load_xattr_datum(), load_xattr_datum()
and verify_xattr_ref() should return negative value on error.
Sometimes they return EIO that is positive.  Change this to -EIO.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/xattr.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
 			    offset, je32_to_cpu(rx.hdr_crc), crc);
 		xd->flags |= JFFS2_XFLAGS_INVALID;
-		return EIO;
+		return -EIO;
 	}
 	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
 	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 			    je32_to_cpu(rx.xid), xd->xid,
 			    je32_to_cpu(rx.version), xd->version);
 		xd->flags |= JFFS2_XFLAGS_INVALID;
-		return EIO;
+		return -EIO;
 	}
 	xd->xprefix = rx.xprefix;
 	xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
 			      ref_offset(xd->node), xd->data_crc, crc);
 		kfree(data);
 		xd->flags |= JFFS2_XFLAGS_INVALID;
-		return EIO;
+		return -EIO;
 	}
 
 	xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	if (xd->xname)
 		return 0;
 	if (xd->flags & JFFS2_XFLAGS_INVALID)
-		return EIO;
+		return -EIO;
 	if (unlikely(is_xattr_datum_unchecked(c, xd)))
 		rc = do_verify_xattr_datum(c, xd);
 	if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 	if (crc != je32_to_cpu(rr.node_crc)) {
 		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
 			    offset, je32_to_cpu(rr.node_crc), crc);
-		return EIO;
+		return -EIO;
 	}
 	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
 	    || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 			    offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
 			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
 			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
-		return EIO;
+		return -EIO;
 	}
 	ref->ino = je32_to_cpu(rr.ino);
 	ref->xid = je32_to_cpu(rr.xid);
-- 
cgit v1.2.2


From 027d9ac2c8de9f70b7319e08dee121b8b85c8d88 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 15 Nov 2010 21:20:05 +0300
Subject: jffs2: typo in comment

It says FB instead of FS (file system).

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/jffs2_fs_sb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64c..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
 	void *os_priv;
 };
 
-#endif /* _JFFS2_FB_SB */
+#endif /* _JFFS2_FS_SB */
-- 
cgit v1.2.2


From 6e5f15c93dc745d46c2bb9e4597b44463203844b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 24 Nov 2010 17:17:34 -0500
Subject: nfsd4: replace unintuitive match_clientid_establishment

Reviewed-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4d542cfd6960..0c61cc1eadca 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1135,19 +1135,13 @@ find_unconfirmed_client(clientid_t *clid)
 }
 
 /*
- * Return 1 iff clp's clientid establishment method matches the use_exchange_id
- * parameter. Matching is based on the fact the at least one of the
- * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
- *
  * FIXME: we need to unify the clientid namespaces for nfsv4.x
  * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
  * and SET_CLIENTID{,_CONFIRM}
  */
-static inline int
-match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+static bool clp_used_exchangeid(struct nfs4_client *clp)
 {
-	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
-	return use_exchange_id == has_exchange_flags;
+	return clp->cl_exchange_flags != 0;
 }
 
 static struct nfs4_client *
@@ -1158,7 +1152,7 @@ find_confirmed_client_by_str(const char *dname, unsigned int hashval,
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
 		if (same_name(clp->cl_recdir, dname) &&
-		    match_clientid_establishment(clp, use_exchange_id))
+		    clp_used_exchangeid(clp) == use_exchange_id)
 			return clp;
 	}
 	return NULL;
@@ -1172,7 +1166,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
 		if (same_name(clp->cl_recdir, dname) &&
-		    match_clientid_establishment(clp, use_exchange_id))
+		    clp_used_exchangeid(clp) == use_exchange_id)
 			return clp;
 	}
 	return NULL;
-- 
cgit v1.2.2


From e203d506bd221bfa5b3acbb7336ae7b7646636a4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 24 Nov 2010 17:30:54 -0500
Subject: nfsd4: fix mixed 4.0/4.1 handling, 4.1 reboot

Instead of failing to find client entries which don't match the
minorversion, we should be finding them, then either erroring out or
expiring them as appropriate.

This also fixes a problem which would cause the 4.1 server to fail to
recognize clients after a second reboot.

Reported-by: Casey Bodley <cbodley@citi.umich.edu>
Reviewed-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4recover.c |  1 -
 fs/nfsd/nfs4state.c   | 37 +++++++++++++++++--------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
 {
 	int status;
 
-	/* note: we currently use this path only for minorversion 0 */
 	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0c61cc1eadca..73adcfb2dc17 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1134,39 +1134,30 @@ find_unconfirmed_client(clientid_t *clid)
 	return NULL;
 }
 
-/*
- * FIXME: we need to unify the clientid namespaces for nfsv4.x
- * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
- * and SET_CLIENTID{,_CONFIRM}
- */
 static bool clp_used_exchangeid(struct nfs4_client *clp)
 {
 	return clp->cl_exchange_flags != 0;
-}
+} 
 
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval,
-			     bool use_exchange_id)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname) &&
-		    clp_used_exchangeid(clp) == use_exchange_id)
+		if (same_name(clp->cl_recdir, dname))
 			return clp;
 	}
 	return NULL;
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
-			       bool use_exchange_id)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname) &&
-		    clp_used_exchangeid(clp) == use_exchange_id)
+		if (same_name(clp->cl_recdir, dname))
 			return clp;
 	}
 	return NULL;
@@ -1357,8 +1348,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	nfs4_lock_state();
 	status = nfs_ok;
 
-	conf = find_confirmed_client_by_str(dname, strhashval, true);
+	conf = find_confirmed_client_by_str(dname, strhashval);
 	if (conf) {
+		if (!clp_used_exchangeid(conf)) {
+			status = nfserr_clid_inuse; /* XXX: ? */
+			goto out;
+		}
 		if (!same_verf(&verf, &conf->cl_verifier)) {
 			/* 18.35.4 case 8 */
 			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1399,7 +1394,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		goto out;
 	}
 
-	unconf  = find_unconfirmed_client_by_str(dname, strhashval, true);
+	unconf  = find_unconfirmed_client_by_str(dname, strhashval);
 	if (unconf) {
 		/*
 		 * Possible retry or client restart.  Per 18.35.4 case 4,
@@ -1799,10 +1794,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	strhashval = clientstr_hashval(dname);
 
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval, false);
+	conf = find_confirmed_client_by_str(dname, strhashval);
 	if (conf) {
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
+		if (clp_used_exchangeid(conf))
+			goto out;
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
 			char addr_str[INET6_ADDRSTRLEN];
 			rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1817,7 +1814,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * has a description of SETCLIENTID request processing consisting
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
-	unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
+	unconf = find_unconfirmed_client_by_str(dname, strhashval);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
@@ -1962,7 +1959,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			unsigned int hash =
 				clientstr_hashval(unconf->cl_recdir);
 			conf = find_confirmed_client_by_str(unconf->cl_recdir,
-							    hash, false);
+							    hash);
 			if (conf) {
 				nfsd4_remove_clid_dir(conf);
 				expire_client(conf);
@@ -4106,7 +4103,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
 	unsigned int strhashval = clientstr_hashval(name);
 	struct nfs4_client *clp;
 
-	clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
+	clp = find_confirmed_client_by_str(name, strhashval);
 	return clp ? 1 : 0;
 }
 
-- 
cgit v1.2.2


From 18b631f83810e95eeb2e1839889b27142bd8d6d8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 29 Nov 2010 15:28:10 -0500
Subject: nfsd: fix offset printk's in nfsd3 read/write

Thanks to dysbr01@ca.com for noticing that the debugging printk in
the v3 write procedure can print >2GB offsets as negative numbers:
	https://bugzilla.kernel.org/show_bug.cgi?id=23342

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3proc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	__be32	nfserr;
 	u32	max_blocksize = svc_max_payload(rqstp);
 
-	dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
+	dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
 				SVCFH_fmt(&argp->fh),
 				(unsigned long) argp->count,
-				(unsigned long) argp->offset);
+				(unsigned long long) argp->offset);
 
 	/* Obtain buffer pointer for payload.
 	 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	__be32	nfserr;
 	unsigned long cnt = argp->len;
 
-	dprintk("nfsd: WRITE(3)    %s %d bytes at %ld%s\n",
+	dprintk("nfsd: WRITE(3)    %s %d bytes at %Lu%s\n",
 				SVCFH_fmt(&argp->fh),
 				argp->len,
-				(unsigned long) argp->offset,
+				(unsigned long long) argp->offset,
 				argp->stable? " stable" : "");
 
 	fh_copy(&resp->fh, &argp->fh);
-- 
cgit v1.2.2


From 5b6a599f0da3722dea9ecc01d97f54061662ce49 Mon Sep 17 00:00:00 2001
From: "bookjovi@gmail.com" <bookjovi@gmail.com>
Date: Sat, 11 Dec 2010 00:21:17 -0500
Subject: nfs: add missed CONFIG_NFSD_DEPRECATED

these pieces of code only make sense when CONFIG_NFSD_DEPRECATED enabled

Signed-off-by: Jovi Zhang <bookjovi@gmail.com>

 fs/nfsd/nfsctl.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d6..6840ec3ceecf 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+#ifdef CONFIG_NFSD_DEPRECATED
 	static int warned;
 	if (file->f_dentry->d_name.name[0] == '.' && !warned) {
 		printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
 		       current->comm, file->f_dentry->d_name.name);
 		warned = 1;
 	}
+#endif
 	if (! file->private_data) {
 		/* An attempt to read a transaction file without writing
 		 * causes a 0-byte write so that the file can return
-- 
cgit v1.2.2


From 56560b9ae0c2d07bb5bbcec16f799d7bf756d3de Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 16 Dec 2010 09:57:15 -0500
Subject: nfsd4: 4.1 SECINFO should consume filehandle

See the referenced spec language; an attempt by a 4.1 client to use the
current filehandle after a secinfo call should result in a NOFILEHANDLE
error.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7b..bad4bf8e4bbc 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -769,6 +769,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	} else
 		secinfo->si_exp = exp;
 	dput(dentry);
+	if (cstate->minorversion)
+		/* See rfc 5661 section 2.6.3.1.1.8 */
+		fh_put(&cstate->current_fh);
 	return err;
 }
 
-- 
cgit v1.2.2


From 0ff7ab46719a9c1e264b8d8e85416d59737ff13c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 16 Dec 2010 10:06:27 -0500
Subject: nfsd4: move guts of nfsd4_lookupp into helper

We'll reuse this code in secinfo_no_name.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index bad4bf8e4bbc..095431a35722 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status;
 }
 
-static __be32
-nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
-	      void *arg)
+static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
 {
 	struct svc_fh tmp_fh;
 	__be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	ret = exp_pseudoroot(rqstp, &tmp_fh);
 	if (ret)
 		return ret;
-	if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
+	if (tmp_fh.fh_dentry == fh->fh_dentry) {
 		fh_put(&tmp_fh);
 		return nfserr_noent;
 	}
 	fh_put(&tmp_fh);
-	return nfsd_lookup(rqstp, &cstate->current_fh,
-			   "..", 2, &cstate->current_fh);
+	return nfsd_lookup(rqstp, fh, "..", 2, fh);
+}
+
+static __be32
+nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      void *arg)
+{
+	return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
 }
 
 static __be32
-- 
cgit v1.2.2


From 04f4ad16b231abbfde34c762697ad035a3af0b5f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 16 Dec 2010 09:51:13 -0500
Subject: nfsd4: implement secinfo_no_name

Implementation of this operation is mandatory for NFSv4.1.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 27 +++++++++++++++++++++++++++
 fs/nfsd/nfs4xdr.c  | 15 +++++++++++++--
 fs/nfsd/xdr4.h     |  5 +++++
 3 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 095431a35722..f80c3997d24c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -779,6 +779,29 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return err;
 }
 
+static __be32
+nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_secinfo_no_name *sin)
+{
+	__be32 err;
+
+	switch (sin->sin_style) {
+	case NFS4_SECINFO_STYLE4_CURRENT_FH:
+		break;
+	case NFS4_SECINFO_STYLE4_PARENT:
+		err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
+		if (err)
+			return err;
+		break;
+	default:
+		return nfserr_inval;
+	}
+	exp_get(cstate->current_fh.fh_export);
+	sin->sin_exp = cstate->current_fh.fh_export;
+	fh_put(&cstate->current_fh);
+	return nfs_ok;
+}
+
 static __be32
 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	      struct nfsd4_setattr *setattr)
@@ -1327,6 +1350,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_flags = ALLOWED_WITHOUT_FH,
 		.op_name = "OP_RECLAIM_COMPLETE",
 	},
+	[OP_SECINFO_NO_NAME] = {
+		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+		.op_name = "OP_SECINFO_NO_NAME",
+	},
 };
 
 static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 71d7d339e44a..b543b2410b54 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -846,6 +846,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
 	DECODE_TAIL;
 }
 
+static __be32
+nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+		     struct nfsd4_secinfo_no_name *sin)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(sin->sin_style);
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
 {
@@ -1358,7 +1369,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_secinfo_no_name,
 	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
 	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -3162,7 +3173,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo,
 	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
 	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_TEST_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 60fce3dc5cb5..799c30c3b495 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
 	struct svc_export *si_exp;			/* response */
 };
 
+struct nfsd4_secinfo_no_name {
+	u32 sin_style;					/* request */
+	struct svc_export *sin_exp;			/* response */
+};
+
 struct nfsd4_setattr {
 	stateid_t	sa_stateid;         /* request */
 	u32		sa_bmval[3];        /* request */
-- 
cgit v1.2.2


From 8844355df7f4e091b03cc131e1549631238b397b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 25 Oct 2010 15:11:43 +0800
Subject: btrfs: Fix bugs in zlib workspace

- Fix a race that can result in alloc_workspace > cpus.
- Fix to check num_workspace after wakeup.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/zlib.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..e5b8b22e07d6 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -75,16 +75,19 @@ again:
 		return workspace;
 
 	}
-	spin_unlock(&workspace_lock);
 	if (atomic_read(&alloc_workspace) > cpus) {
 		DEFINE_WAIT(wait);
+
+		spin_unlock(&workspace_lock);
 		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&alloc_workspace) > cpus)
+		if (atomic_read(&alloc_workspace) > cpus && !num_workspace)
 			schedule();
 		finish_wait(&workspace_wait, &wait);
 		goto again;
 	}
 	atomic_inc(&alloc_workspace);
+	spin_unlock(&workspace_lock);
+
 	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
 	if (!workspace) {
 		ret = -ENOMEM;
-- 
cgit v1.2.2


From 4b72029dc3fd6ba7dc45ccd1cf0aa0ebfa209bd3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 08:27:27 +0800
Subject: btrfs: Fix error handling in zlib

Return failure if alloc_page() fails to allocate memory,
and the upper code will just give up compression.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/zlib.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index e5b8b22e07d6..b01558661e3b 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -225,6 +225,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	data_in = kmap(in_page);
 
 	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (out_page == NULL) {
+		ret = -1;
+		goto out;
+	}
 	cpage_out = kmap(out_page);
 	pages[0] = out_page;
 	nr_pages = 1;
@@ -263,6 +267,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 				goto out;
 			}
 			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (out_page == NULL) {
+				ret = -1;
+				goto out;
+			}
 			cpage_out = kmap(out_page);
 			pages[nr_pages] = out_page;
 			nr_pages++;
-- 
cgit v1.2.2


From 261507a02ccba9afda919852263b6bc1581ce1ef Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 17 Dec 2010 14:21:50 +0800
Subject: btrfs: Allow to add new compression algorithm

Make the code aware of compression type, instead of always assuming
zlib compression.

Also make the zlib workspace function as common code for all
compression types.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/btrfs_inode.h  |   2 +-
 fs/btrfs/compression.c  | 236 +++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/compression.h  |  66 +++++++++----
 fs/btrfs/ctree.h        |  10 +-
 fs/btrfs/extent_io.c    |   5 +-
 fs/btrfs/extent_io.h    |  17 +++-
 fs/btrfs/extent_map.c   |   2 +
 fs/btrfs/extent_map.h   |   3 +-
 fs/btrfs/file.c         |   2 +
 fs/btrfs/inode.c        |  82 ++++++++++------
 fs/btrfs/ioctl.c        |   4 +-
 fs/btrfs/ordered-data.c |  18 +++-
 fs/btrfs/ordered-data.h |   8 +-
 fs/btrfs/super.c        |  47 ++++++---
 fs/btrfs/zlib.c         | 253 ++++++++++--------------------------------------
 15 files changed, 473 insertions(+), 282 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
 	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:1;
+	unsigned force_compress:4;
 
 	struct inode vfs_inode;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..6638c9877720 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
 	/* number of bytes on disk */
 	unsigned long compressed_len;
 
+	/* the compression algorithm for this bio */
+	int compress_type;
+
 	/* number of compressed pages in the array */
 	unsigned long nr_pages;
 
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-					cb->start,
-					cb->orig_bio->bi_io_vec,
-					cb->orig_bio->bi_vcnt,
-					cb->compressed_len);
+	ret = btrfs_decompress_biovec(cb->compress_type,
+				      cb->compressed_pages,
+				      cb->start,
+				      cb->orig_bio->bi_io_vec,
+				      cb->orig_bio->bi_vcnt,
+				      cb->compressed_len);
 csum_failed:
 	if (ret)
 		cb->errors = 1;
@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->len = uncompressed_len;
 	cb->compressed_len = compressed_len;
+	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
 	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -677,3 +682,224 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	bio_put(comp_bio);
 	return 0;
 }
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+	&btrfs_zlib_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		INIT_LIST_HEAD(&comp_idle_workspace[i]);
+		spin_lock_init(&comp_workspace_lock[i]);
+		atomic_set(&comp_alloc_workspace[i], 0);
+		init_waitqueue_head(&comp_workspace_wait[i]);
+	}
+	return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+	struct list_head *workspace;
+	int cpus = num_online_cpus();
+	int idx = type - 1;
+
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+again:
+	spin_lock(workspace_lock);
+	if (!list_empty(idle_workspace)) {
+		workspace = idle_workspace->next;
+		list_del(workspace);
+		(*num_workspace)--;
+		spin_unlock(workspace_lock);
+		return workspace;
+
+	}
+	if (atomic_read(alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+
+		spin_unlock(workspace_lock);
+		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+			schedule();
+		finish_wait(workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(alloc_workspace);
+	spin_unlock(workspace_lock);
+
+	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (IS_ERR(workspace)) {
+		atomic_dec(alloc_workspace);
+		wake_up(workspace_wait);
+	}
+	return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+	int idx = type - 1;
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+
+	spin_lock(workspace_lock);
+	if (*num_workspace < num_online_cpus()) {
+		list_add_tail(workspace, idle_workspace);
+		(*num_workspace)++;
+		spin_unlock(workspace_lock);
+		goto wake;
+	}
+	spin_unlock(workspace_lock);
+
+	btrfs_compress_op[idx]->free_workspace(workspace);
+	atomic_dec(alloc_workspace);
+wake:
+	if (waitqueue_active(workspace_wait))
+		wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+	struct list_head *workspace;
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		while (!list_empty(&comp_idle_workspace[i])) {
+			workspace = comp_idle_workspace[i].next;
+			list_del(workspace);
+			btrfs_compress_op[i]->free_workspace(workspace);
+			atomic_dec(&comp_alloc_workspace[i]);
+		}
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -1;
+
+	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+						      start, len, pages,
+						      nr_dest_pages, out_pages,
+						      total_in, total_out,
+						      max_out);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+							 disk_start,
+							 bvec, vcnt, srclen);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+						  dest_page, start_byte,
+						  srclen, destlen);
+
+	free_workspace(type, workspace);
+	return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+	free_workspaces();
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..9b5f2f365b79 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,22 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_zlib_decompress(unsigned char *data_in,
-			  struct page *dest_page,
-			  unsigned long start_byte,
-			  size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-			      u64 start, unsigned long len,
-			      struct page **pages,
-			      unsigned long nr_dest_pages,
-			      unsigned long *out_pages,
-			      unsigned long *total_in,
-			      unsigned long *total_out,
-			      unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-			      u64 disk_start,
-			      struct bio_vec *bvec,
-			      int vcnt,
-			      size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long len, u64 disk_start,
 				  unsigned long compressed_len,
@@ -44,4 +42,36 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+	struct list_head *(*alloc_workspace)(void);
+
+	void (*free_workspace)(struct list_head *workspace);
+
+	int (*compress_pages)(struct list_head *workspace,
+			      struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+
+	int (*decompress_biovec)(struct list_head *workspace,
+				 struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen);
+
+	int (*decompress)(struct list_head *workspace,
+			  unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+
 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index af52f6d7a4d8..e06534438592 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -551,9 +551,10 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 
 enum btrfs_compression_type {
-	BTRFS_COMPRESS_NONE = 0,
-	BTRFS_COMPRESS_ZLIB = 1,
-	BTRFS_COMPRESS_LAST = 2,
+	BTRFS_COMPRESS_NONE  = 0,
+	BTRFS_COMPRESS_ZLIB  = 1,
+	BTRFS_COMPRESS_TYPES = 1,
+	BTRFS_COMPRESS_LAST  = 2,
 };
 
 struct btrfs_inode_item {
@@ -895,7 +896,8 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
-	unsigned long mount_opt;
+	unsigned long mount_opt:20;
+	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5e7a94d7da89..f1d198128959 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		BUG_ON(extent_map_end(em) <= cur);
 		BUG_ON(end < cur);
 
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 			this_bio_flag = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&this_bio_flag,
+						 em->compress_type);
+		}
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
-/* flags for bio submission */
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
 	wait_queue_head_t lock_wq;
 };
 
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+					    int compress_type)
+{
+	*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+	return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
 struct extent_map_tree;
 
 static inline struct extent_state *extent_state_next(struct extent_state *state)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include "ctree.h"
 #include "extent_map.h"
 
 
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 		return em;
 	em->in_tree = 0;
 	em->flags = 0;
+	em->compress_type = BTRFS_COMPRESS_NONE;
 	atomic_set(&em->refs, 1);
 	return em;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-	int in_tree;
+	unsigned int in_tree:1;
+	unsigned int compress_type:4;
 };
 
 struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..05df688c96f4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -224,6 +224,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			split->bdev = em->bdev;
 			split->flags = flags;
+			split->compress_type = em->compress_type;
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret);
 			free_extent_map(split);
@@ -238,6 +239,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->len = em->start + em->len - (start + len);
 			split->bdev = em->bdev;
 			split->flags = flags;
+			split->compress_type = em->compress_type;
 
 			if (compressed) {
 				split->block_len = em->block_len;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f9194438f7c..ba563b2a5d6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	size_t cur_size = size;
 	size_t datasize;
 	unsigned long offset;
-	int use_compress = 0;
+	int compress_type = BTRFS_COMPRESS_NONE;
 
 	if (compressed_size && compressed_pages) {
-		use_compress = 1;
+		compress_type = root->fs_info->compress_type;
 		cur_size = compressed_size;
 	}
 
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 	ptr = btrfs_file_extent_inline_start(ei);
 
-	if (use_compress) {
+	if (compress_type != BTRFS_COMPRESS_NONE) {
 		struct page *cpage;
 		int i = 0;
 		while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 			compressed_size -= cur_size;
 		}
 		btrfs_set_file_extent_compression(leaf, ei,
-						  BTRFS_COMPRESS_ZLIB);
+						  compress_type);
 	} else {
 		page = find_get_page(inode->i_mapping,
 				     start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
 	u64 compressed_size;
 	struct page **pages;
 	unsigned long nr_pages;
+	int compress_type;
 	struct list_head list;
 };
 
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
 				     u64 start, u64 ram_size,
 				     u64 compressed_size,
 				     struct page **pages,
-				     unsigned long nr_pages)
+				     unsigned long nr_pages,
+				     int compress_type)
 {
 	struct async_extent *async_extent;
 
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
 	async_extent->compressed_size = compressed_size;
 	async_extent->pages = pages;
 	async_extent->nr_pages = nr_pages;
+	async_extent->compress_type = compress_type;
 	list_add_tail(&async_extent->list, &cow->extents);
 	return 0;
 }
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
 	unsigned long max_uncompressed = 128 * 1024;
 	int i;
 	int will_compress;
+	int compress_type = root->fs_info->compress_type;
 
 	actual_end = min_t(u64, isize, end + 1);
 again:
@@ -381,12 +385,16 @@ again:
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
-		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
-						total_compressed, pages,
-						nr_pages, &nr_pages_ret,
-						&total_in,
-						&total_compressed,
-						max_compressed);
+		if (BTRFS_I(inode)->force_compress)
+			compress_type = BTRFS_I(inode)->force_compress;
+
+		ret = btrfs_compress_pages(compress_type,
+					   inode->i_mapping, start,
+					   total_compressed, pages,
+					   nr_pages, &nr_pages_ret,
+					   &total_in,
+					   &total_compressed,
+					   max_compressed);
 
 		if (!ret) {
 			unsigned long offset = total_compressed &
@@ -493,7 +501,8 @@ again:
 		 * and will submit them to the elevator.
 		 */
 		add_async_extent(async_cow, start, num_bytes,
-				 total_compressed, pages, nr_pages_ret);
+				 total_compressed, pages, nr_pages_ret,
+				 compress_type);
 
 		if (start + num_bytes < end) {
 			start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
 			__set_page_dirty_nobuffers(locked_page);
 			/* unlocked later on in the async handlers */
 		}
-		add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+		add_async_extent(async_cow, start, end - start + 1,
+				 0, NULL, 0, BTRFS_COMPRESS_NONE);
 		*num_added += 1;
 	}
 
@@ -640,6 +650,7 @@ retry:
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		em->compress_type = async_extent->compress_type;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
@@ -656,11 +667,13 @@ retry:
 						async_extent->ram_size - 1, 0);
 		}
 
-		ret = btrfs_add_ordered_extent(inode, async_extent->start,
-					       ins.objectid,
-					       async_extent->ram_size,
-					       ins.offset,
-					       BTRFS_ORDERED_COMPRESSED);
+		ret = btrfs_add_ordered_extent_compress(inode,
+						async_extent->start,
+						ins.objectid,
+						async_extent->ram_size,
+						ins.offset,
+						BTRFS_ORDERED_COMPRESSED,
+						async_extent->compress_type);
 		BUG_ON(ret);
 
 		/*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
-	int compressed = 0;
+	int compress_type = 0;
 	int ret;
 	bool nolock = false;
 
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
-		compressed = 1;
+		compress_type = ordered_extent->compress_type;
 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		BUG_ON(compressed);
+		BUG_ON(compress_type);
 		ret = btrfs_mark_extent_written(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->disk_len,
 						ordered_extent->len,
 						ordered_extent->len,
-						compressed, 0, 0,
+						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
 		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
 				   ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 			logical = em->block_start;
 			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
 		}
 		failrec->logical = logical;
 		free_extent_map(em);
@@ -4930,8 +4945,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	size_t max_size;
 	unsigned long inline_size;
 	unsigned long ptr;
+	int compress_type;
 
 	WARN_ON(pg_offset != 0);
+	compress_type = btrfs_file_extent_compression(leaf, item);
 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
 	inline_size = btrfs_file_extent_inline_item_len(leaf,
 					btrfs_item_nr(leaf, path->slots[0]));
@@ -4941,8 +4958,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	read_extent_buffer(leaf, tmp, ptr, inline_size);
 
 	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
-	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
-				    inline_size, max_size);
+	ret = btrfs_decompress(compress_type, tmp, page,
+			       extent_offset, inline_size, max_size);
 	if (ret) {
 		char *kaddr = kmap_atomic(page, KM_USER0);
 		unsigned long copy_size = min_t(u64,
@@ -4984,7 +5001,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
-	int compressed;
+	int compress_type;
 
 again:
 	read_lock(&em_tree->lock);
@@ -5043,7 +5060,7 @@ again:
 
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
-	compressed = btrfs_file_extent_compression(leaf, item);
+	compress_type = btrfs_file_extent_compression(leaf, item);
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		extent_end = extent_start +
@@ -5089,8 +5106,9 @@ again:
 			em->block_start = EXTENT_MAP_HOLE;
 			goto insert;
 		}
-		if (compressed) {
+		if (compress_type != BTRFS_COMPRESS_NONE) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->compress_type = compress_type;
 			em->block_start = bytenr;
 			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
 									 item);
@@ -5124,12 +5142,14 @@ again:
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
 		em->orig_start = EXTENT_MAP_INLINE;
-		if (compressed)
+		if (compress_type) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->compress_type = compress_type;
+		}
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
-			if (btrfs_file_extent_compression(leaf, item) ==
-			    BTRFS_COMPRESS_ZLIB) {
+			if (btrfs_file_extent_compression(leaf, item) !=
+			    BTRFS_COMPRESS_NONE) {
 				ret = uncompress_inline(path, inode, page,
 							pg_offset,
 							extent_offset, item);
@@ -6479,7 +6499,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->ordered_data_close = 0;
 	ei->orphan_meta_reserved = 0;
 	ei->dummy_inode = 0;
-	ei->force_compress = 0;
+	ei->force_compress = BTRFS_COMPRESS_NONE;
 
 	inode = &ei->vfs_inode;
 	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..8cb86d4d763c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -683,7 +683,7 @@ static int btrfs_defrag_file(struct file *file,
 		total_read++;
 		mutex_lock(&inode->i_mutex);
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-			BTRFS_I(inode)->force_compress = 1;
+			BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_ZLIB;
 
 		ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 		if (ret)
@@ -781,7 +781,7 @@ loop_unlock:
 		atomic_dec(&root->fs_info->async_submit_draining);
 
 		mutex_lock(&inode->i_mutex);
-		BTRFS_I(inode)->force_compress = 0;
+		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
 		mutex_unlock(&inode->i_mutex);
 	}
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 				      u64 start, u64 len, u64 disk_len,
-				      int type, int dio)
+				      int type, int dio, int compress_type)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->disk_len = disk_len;
 	entry->bytes_left = len;
 	entry->inode = inode;
+	entry->compress_type = compress_type;
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len, u64 disk_len, int type)
 {
 	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
-					  disk_len, type, 0);
+					  disk_len, type, 0,
+					  BTRFS_COMPRESS_NONE);
 }
 
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 				 u64 start, u64 len, u64 disk_len, int type)
 {
 	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
-					  disk_len, type, 1);
+					  disk_len, type, 1,
+					  BTRFS_COMPRESS_NONE);
+}
+
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+				      u64 start, u64 len, u64 disk_len,
+				      int type, int compress_type)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 0,
+					  compress_type);
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
 
 #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
 
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
 	/* flags (described above) */
 	unsigned long flags;
 
+	/* compression algorithm */
+	int compress_type;
+
 	/* reference count */
 	atomic_t refs;
 
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 				 u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+				      u64 start, u64 len, u64 disk_len,
+				      int type, int compress_type);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 61bd79abb805..f348f2b93164 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -69,9 +69,9 @@ enum {
 	Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
 	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
-	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-	Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
-	Opt_user_subvol_rm_allowed,
+	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -86,7 +86,9 @@ static match_table_t tokens = {
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_compress, "compress"},
+	{Opt_compress_type, "compress=%s"},
 	{Opt_compress_force, "compress-force"},
+	{Opt_compress_force_type, "compress-force=%s"},
 	{Opt_ssd, "ssd"},
 	{Opt_ssd_spread, "ssd_spread"},
 	{Opt_nossd, "nossd"},
@@ -112,6 +114,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	char *p, *num, *orig;
 	int intarg;
 	int ret = 0;
+	char *compress_type;
+	bool compress_force = false;
 
 	if (!options)
 		return 0;
@@ -154,14 +158,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
-		case Opt_compress:
-			printk(KERN_INFO "btrfs: use compression\n");
-			btrfs_set_opt(info->mount_opt, COMPRESS);
-			break;
 		case Opt_compress_force:
-			printk(KERN_INFO "btrfs: forcing compression\n");
-			btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+		case Opt_compress_force_type:
+			compress_force = true;
+		case Opt_compress:
+		case Opt_compress_type:
+			if (token == Opt_compress ||
+			    token == Opt_compress_force ||
+			    strcmp(args[0].from, "zlib") == 0) {
+				compress_type = "zlib";
+				info->compress_type = BTRFS_COMPRESS_ZLIB;
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+
 			btrfs_set_opt(info->mount_opt, COMPRESS);
+			if (compress_force) {
+				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+				pr_info("btrfs: force %s compression\n",
+					compress_type);
+			} else
+				pr_info("btrfs: use %s compression\n",
+					compress_type);
 			break;
 		case Opt_ssd:
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -898,10 +917,14 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		return err;
 
-	err = btrfs_init_cachep();
+	err = btrfs_init_compress();
 	if (err)
 		goto free_sysfs;
 
+	err = btrfs_init_cachep();
+	if (err)
+		goto free_compress;
+
 	err = extent_io_init();
 	if (err)
 		goto free_cachep;
@@ -929,6 +952,8 @@ free_extent_io:
 	extent_io_exit();
 free_cachep:
 	btrfs_destroy_cachep();
+free_compress:
+	btrfs_exit_compress();
 free_sysfs:
 	btrfs_exit_sysfs();
 	return err;
@@ -943,7 +968,7 @@ static void __exit exit_btrfs_fs(void)
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
-	btrfs_zlib_exit();
+	btrfs_exit_compress();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b01558661e3b..9a3e693917f2 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
 #include <linux/bio.h>
 #include "compression.h"
 
-/* Plan: call deflate() with avail_in == *sourcelen,
-	avail_out = *dstlen - 12 and flush == Z_FINISH.
-	If it doesn't manage to finish,	call it again with
-	avail_in == 0 and avail_out set to the remaining 12
-	bytes for it to clean up.
-   Q: Is 12 bytes sufficient?
-*/
-#define STREAM_END_SPACE 12
-
 struct workspace {
 	z_stream inf_strm;
 	z_stream def_strm;
@@ -48,155 +39,51 @@ struct workspace {
 	struct list_head list;
 };
 
-static LIST_HEAD(idle_workspace);
-static DEFINE_SPINLOCK(workspace_lock);
-static unsigned long num_workspace;
-static atomic_t alloc_workspace = ATOMIC_INIT(0);
-static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+static void zlib_free_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
 
-/*
- * this finds an available zlib workspace or allocates a new one
- * NULL or an ERR_PTR is returned if things go bad.
- */
-static struct workspace *find_zlib_workspace(void)
+	vfree(workspace->def_strm.workspace);
+	vfree(workspace->inf_strm.workspace);
+	kfree(workspace->buf);
+	kfree(workspace);
+}
+
+static struct list_head *zlib_alloc_workspace(void)
 {
 	struct workspace *workspace;
-	int ret;
-	int cpus = num_online_cpus();
-
-again:
-	spin_lock(&workspace_lock);
-	if (!list_empty(&idle_workspace)) {
-		workspace = list_entry(idle_workspace.next, struct workspace,
-				       list);
-		list_del(&workspace->list);
-		num_workspace--;
-		spin_unlock(&workspace_lock);
-		return workspace;
-
-	}
-	if (atomic_read(&alloc_workspace) > cpus) {
-		DEFINE_WAIT(wait);
-
-		spin_unlock(&workspace_lock);
-		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&alloc_workspace) > cpus && !num_workspace)
-			schedule();
-		finish_wait(&workspace_wait, &wait);
-		goto again;
-	}
-	atomic_inc(&alloc_workspace);
-	spin_unlock(&workspace_lock);
 
 	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
-	if (!workspace) {
-		ret = -ENOMEM;
-		goto fail;
-	}
+	if (!workspace)
+		return ERR_PTR(-ENOMEM);
 
 	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
-	if (!workspace->def_strm.workspace) {
-		ret = -ENOMEM;
-		goto fail;
-	}
 	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
-	if (!workspace->inf_strm.workspace) {
-		ret = -ENOMEM;
-		goto fail_inflate;
-	}
 	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
-	if (!workspace->buf) {
-		ret = -ENOMEM;
-		goto fail_kmalloc;
-	}
-	return workspace;
-
-fail_kmalloc:
-	vfree(workspace->inf_strm.workspace);
-fail_inflate:
-	vfree(workspace->def_strm.workspace);
-fail:
-	kfree(workspace);
-	atomic_dec(&alloc_workspace);
-	wake_up(&workspace_wait);
-	return ERR_PTR(ret);
-}
-
-/*
- * put a workspace struct back on the list or free it if we have enough
- * idle ones sitting around
- */
-static int free_workspace(struct workspace *workspace)
-{
-	spin_lock(&workspace_lock);
-	if (num_workspace < num_online_cpus()) {
-		list_add_tail(&workspace->list, &idle_workspace);
-		num_workspace++;
-		spin_unlock(&workspace_lock);
-		if (waitqueue_active(&workspace_wait))
-			wake_up(&workspace_wait);
-		return 0;
-	}
-	spin_unlock(&workspace_lock);
-	vfree(workspace->def_strm.workspace);
-	vfree(workspace->inf_strm.workspace);
-	kfree(workspace->buf);
-	kfree(workspace);
+	if (!workspace->def_strm.workspace ||
+	    !workspace->inf_strm.workspace || !workspace->buf)
+		goto fail;
 
-	atomic_dec(&alloc_workspace);
-	if (waitqueue_active(&workspace_wait))
-		wake_up(&workspace_wait);
-	return 0;
-}
+	INIT_LIST_HEAD(&workspace->list);
 
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
-	struct workspace *workspace;
-	while (!list_empty(&idle_workspace)) {
-		workspace = list_entry(idle_workspace.next, struct workspace,
-				       list);
-		list_del(&workspace->list);
-		vfree(workspace->def_strm.workspace);
-		vfree(workspace->inf_strm.workspace);
-		kfree(workspace->buf);
-		kfree(workspace);
-		atomic_dec(&alloc_workspace);
-	}
+	return &workspace->list;
+fail:
+	zlib_free_workspace(&workspace->list);
+	return ERR_PTR(-ENOMEM);
 }
 
-/*
- * given an address space and start/len, compress the bytes.
- *
- * pages are allocated to hold the compressed result and stored
- * in 'pages'
- *
- * out_pages is used to return the number of pages allocated.  There
- * may be pages allocated even if we return an error
- *
- * total_in is used to return the number of bytes actually read.  It
- * may be smaller then len if we had to exit early because we
- * ran out of room in the pages array or because we cross the
- * max_out threshold.
- *
- * total_out is used to return the total number of compressed bytes
- *
- * max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
- */
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-			      u64 start, unsigned long len,
-			      struct page **pages,
-			      unsigned long nr_dest_pages,
-			      unsigned long *out_pages,
-			      unsigned long *total_in,
-			      unsigned long *total_out,
-			      unsigned long max_out)
+static int zlib_compress_pages(struct list_head *ws,
+			       struct address_space *mapping,
+			       u64 start, unsigned long len,
+			       struct page **pages,
+			       unsigned long nr_dest_pages,
+			       unsigned long *out_pages,
+			       unsigned long *total_in,
+			       unsigned long *total_out,
+			       unsigned long max_out)
 {
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
 	int ret;
-	struct workspace *workspace;
 	char *data_in;
 	char *cpage_out;
 	int nr_pages = 0;
@@ -208,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	*total_out = 0;
 	*total_in = 0;
 
-	workspace = find_zlib_workspace();
-	if (IS_ERR(workspace))
-		return -1;
-
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
 		printk(KERN_WARNING "deflateInit failed\n");
 		ret = -1;
@@ -325,35 +208,18 @@ out:
 		kunmap(in_page);
 		page_cache_release(in_page);
 	}
-	free_workspace(workspace);
 	return ret;
 }
 
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * bvec is a bio_vec of pages from the file that we want to decompress into
- *
- * vcnt is the count of pages in the biovec
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous.  They all correspond to the range of bytes covered by
- * the compressed extent.
- */
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-			      u64 disk_start,
-			      struct bio_vec *bvec,
-			      int vcnt,
-			      size_t srclen)
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+				  u64 disk_start,
+				  struct bio_vec *bvec,
+				  int vcnt,
+				  size_t srclen)
 {
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
 	int ret = 0;
 	int wbits = MAX_WBITS;
-	struct workspace *workspace;
 	char *data_in;
 	size_t total_out = 0;
 	unsigned long page_bytes_left;
@@ -371,10 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 	unsigned long current_buf_start;
 	char *kaddr;
 
-	workspace = find_zlib_workspace();
-	if (IS_ERR(workspace))
-		return -ENOMEM;
-
 	data_in = kmap(pages_in[page_in_index]);
 	workspace->inf_strm.next_in = data_in;
 	workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
@@ -400,8 +262,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
 		printk(KERN_WARNING "inflateInit failed\n");
-		ret = -1;
-		goto out;
+		return -1;
 	}
 	while (workspace->inf_strm.total_in < srclen) {
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -527,35 +388,21 @@ done:
 	zlib_inflateEnd(&workspace->inf_strm);
 	if (data_in)
 		kunmap(pages_in[page_in_index]);
-out:
-	free_workspace(workspace);
 	return ret;
 }
 
-/*
- * a less complex decompression routine.  Our compressed data fits in a
- * single page, and we want to read a single page out of it.
- * start_byte tells us the offset into the compressed data we're interested in
- */
-int btrfs_zlib_decompress(unsigned char *data_in,
-			  struct page *dest_page,
-			  unsigned long start_byte,
-			  size_t srclen, size_t destlen)
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+			   struct page *dest_page,
+			   unsigned long start_byte,
+			   size_t srclen, size_t destlen)
 {
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
 	int ret = 0;
 	int wbits = MAX_WBITS;
-	struct workspace *workspace;
 	unsigned long bytes_left = destlen;
 	unsigned long total_out = 0;
 	char *kaddr;
 
-	if (destlen > PAGE_CACHE_SIZE)
-		return -ENOMEM;
-
-	workspace = find_zlib_workspace();
-	if (IS_ERR(workspace))
-		return -ENOMEM;
-
 	workspace->inf_strm.next_in = data_in;
 	workspace->inf_strm.avail_in = srclen;
 	workspace->inf_strm.total_in = 0;
@@ -576,8 +423,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
 		printk(KERN_WARNING "inflateInit failed\n");
-		ret = -1;
-		goto out;
+		return -1;
 	}
 
 	while (bytes_left > 0) {
@@ -627,12 +473,13 @@ next:
 		ret = 0;
 
 	zlib_inflateEnd(&workspace->inf_strm);
-out:
-	free_workspace(workspace);
 	return ret;
 }
 
-void btrfs_zlib_exit(void)
-{
-    free_workspaces();
-}
+struct btrfs_compress_op btrfs_zlib_compress = {
+	.alloc_workspace	= zlib_alloc_workspace,
+	.free_workspace		= zlib_free_workspace,
+	.compress_pages		= zlib_compress_pages,
+	.decompress_biovec	= zlib_decompress_biovec,
+	.decompress		= zlib_decompress,
+};
-- 
cgit v1.2.2


From a6fa6fae40ec336c7df6155255ae64ebef43a8bc Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 25 Oct 2010 15:12:26 +0800
Subject: btrfs: Add lzo compression support

Lzo is a much faster compression algorithm than gzib, so would allow
more users to enable transparent compression, and some users can
choose from compression ratio and speed for different applications

Usage:

 # mount -t btrfs -o compress[=<zlib,lzo>] dev /mnt
or
 # mount -t btrfs -o compress-force[=<zlib,lzo>] dev /mnt

"-o compress" without argument is still allowed for compatability.

Compatibility:

If we mount a filesystem with lzo compression, it will not be able be
mounted in old kernels. One reason is, otherwise btrfs will directly
dump compressed data, which sits in inline extent, to user.

Performance:

The test copied a linux source tarball (~400M) from an ext4 partition
to the btrfs partition, and then extracted it.

(time in second)
           lzo        zlib        nocompress
copy:      10.6       21.7        14.9
extract:   70.1       94.4        66.6

(data size in MB)
           lzo        zlib        nocompress
copy:      185.87     108.69      394.49
extract:   193.80     132.36      381.21

Changelog:

v1 -> v2:
- Select LZO_COMPRESS and LZO_DECOMPRESS in btrfs Kconfig.
- Add incompability flag.
- Fix error handling in compress code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/Kconfig       |   2 +
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/compression.c |   1 +
 fs/btrfs/compression.h |   1 +
 fs/btrfs/ctree.h       |   9 +-
 fs/btrfs/disk-io.c     |   8 +-
 fs/btrfs/lzo.c         | 509 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/super.c       |   3 +
 8 files changed, 527 insertions(+), 8 deletions(-)
 create mode 100644 fs/btrfs/lzo.c

(limited to 'fs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6638c9877720..8faa2df9e719 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -691,6 +691,7 @@ static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
 struct btrfs_compress_op *btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
+	&btrfs_lzo_compress,
 };
 
 int __init btrfs_init_compress(void)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9b5f2f365b79..f7ce217113fa 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -73,5 +73,6 @@ struct btrfs_compress_op {
 };
 
 extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
 
 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e06534438592..53b984623983 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -398,13 +398,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -553,8 +555,9 @@ struct btrfs_timespec {
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_TYPES = 1,
-	BTRFS_COMPRESS_LAST  = 2,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_inode_item {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5d2249e6da5..f88eb2ce7919 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1744,10 +1744,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
+	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	btrfs_set_super_incompat_flags(disk_super, features);
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..523b144e2aec
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include "compression.h"
+
+#define LZO_LEN	4
+
+struct workspace {
+	void *mem;
+	void *buf;	/* where compressed data goes */
+	void *cbuf;	/* where decompressed data goes */
+	struct list_head list;
+};
+
+static void lzo_free_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+	vfree(workspace->buf);
+	vfree(workspace->cbuf);
+	vfree(workspace->mem);
+	kfree(workspace);
+}
+
+static struct list_head *lzo_alloc_workspace(void)
+{
+	struct workspace *workspace;
+
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace)
+		return ERR_PTR(-ENOMEM);
+
+	workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
+	workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+	workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+	if (!workspace->mem || !workspace->buf || !workspace->cbuf)
+		goto fail;
+
+	INIT_LIST_HEAD(&workspace->list);
+
+	return &workspace->list;
+fail:
+	lzo_free_workspace(&workspace->list);
+	return ERR_PTR(-ENOMEM);
+}
+
+static inline void write_compress_length(char *buf, size_t len)
+{
+	__le32 dlen;
+
+	dlen = cpu_to_le32(len);
+	memcpy(buf, &dlen, LZO_LEN);
+}
+
+static inline size_t read_compress_length(char *buf)
+{
+	__le32 dlen;
+
+	memcpy(&dlen, buf, LZO_LEN);
+	return le32_to_cpu(dlen);
+}
+
+static int lzo_compress_pages(struct list_head *ws,
+			      struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret = 0;
+	char *data_in;
+	char *cpage_out;
+	int nr_pages = 0;
+	struct page *in_page = NULL;
+	struct page *out_page = NULL;
+	unsigned long bytes_left;
+
+	size_t in_len;
+	size_t out_len;
+	char *buf;
+	unsigned long tot_in = 0;
+	unsigned long tot_out = 0;
+	unsigned long pg_bytes_left;
+	unsigned long out_offset;
+	unsigned long bytes;
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	data_in = kmap(in_page);
+
+	/*
+	 * store the size of all chunks of compressed data in
+	 * the first 4 bytes
+	 */
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (out_page == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	cpage_out = kmap(out_page);
+	out_offset = LZO_LEN;
+	tot_out = LZO_LEN;
+	pages[0] = out_page;
+	nr_pages = 1;
+	pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+	/* compress at most one page of data each time */
+	in_len = min(len, PAGE_CACHE_SIZE);
+	while (tot_in < len) {
+		ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
+				       &out_len, workspace->mem);
+		if (ret != LZO_E_OK) {
+			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			       ret);
+			ret = -1;
+			goto out;
+		}
+
+		/* store the size of this chunk of compressed data */
+		write_compress_length(cpage_out + out_offset, out_len);
+		tot_out += LZO_LEN;
+		out_offset += LZO_LEN;
+		pg_bytes_left -= LZO_LEN;
+
+		tot_in += in_len;
+		tot_out += out_len;
+
+		/* copy bytes from the working buffer into the pages */
+		buf = workspace->cbuf;
+		while (out_len) {
+			bytes = min_t(unsigned long, pg_bytes_left, out_len);
+
+			memcpy(cpage_out + out_offset, buf, bytes);
+
+			out_len -= bytes;
+			pg_bytes_left -= bytes;
+			buf += bytes;
+			out_offset += bytes;
+
+			/*
+			 * we need another page for writing out.
+			 *
+			 * Note if there's less than 4 bytes left, we just
+			 * skip to a new page.
+			 */
+			if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
+			    pg_bytes_left == 0) {
+				if (pg_bytes_left) {
+					memset(cpage_out + out_offset, 0,
+					       pg_bytes_left);
+					tot_out += pg_bytes_left;
+				}
+
+				/* we're done, don't allocate new page */
+				if (out_len == 0 && tot_in >= len)
+					break;
+
+				kunmap(out_page);
+				if (nr_pages == nr_dest_pages) {
+					out_page = NULL;
+					ret = -1;
+					goto out;
+				}
+
+				out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+				if (out_page == NULL) {
+					ret = -ENOMEM;
+					goto out;
+				}
+				cpage_out = kmap(out_page);
+				pages[nr_pages++] = out_page;
+
+				pg_bytes_left = PAGE_CACHE_SIZE;
+				out_offset = 0;
+			}
+		}
+
+		/* we're making it bigger, give up */
+		if (tot_in > 8192 && tot_in < tot_out)
+			goto out;
+
+		/* we're all done */
+		if (tot_in >= len)
+			break;
+
+		if (tot_out > max_out)
+			break;
+
+		bytes_left = len - tot_in;
+		kunmap(in_page);
+		page_cache_release(in_page);
+
+		start += PAGE_CACHE_SIZE;
+		in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		data_in = kmap(in_page);
+		in_len = min(bytes_left, PAGE_CACHE_SIZE);
+	}
+
+	if (tot_out > tot_in)
+		goto out;
+
+	/* store the size of all chunks of compressed data */
+	cpage_out = kmap(pages[0]);
+	write_compress_length(cpage_out, tot_out);
+
+	kunmap(pages[0]);
+
+	ret = 0;
+	*total_out = tot_out;
+	*total_in = tot_in;
+out:
+	*out_pages = nr_pages;
+	if (out_page)
+		kunmap(out_page);
+
+	if (in_page) {
+		kunmap(in_page);
+		page_cache_release(in_page);
+	}
+
+	return ret;
+}
+
+static int lzo_decompress_biovec(struct list_head *ws,
+				 struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	int ret = 0;
+	char *data_in;
+	unsigned long page_bytes_left;
+	unsigned long page_in_index = 0;
+	unsigned long page_out_index = 0;
+	struct page *page_out;
+	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+					PAGE_CACHE_SIZE;
+	unsigned long buf_start;
+	unsigned long buf_offset = 0;
+	unsigned long bytes;
+	unsigned long working_bytes;
+	unsigned long pg_offset;
+	unsigned long start_byte;
+	unsigned long current_buf_start;
+	char *kaddr;
+
+	size_t in_len;
+	size_t out_len;
+	unsigned long in_offset;
+	unsigned long in_page_bytes_left;
+	unsigned long tot_in;
+	unsigned long tot_out;
+	unsigned long tot_len;
+	char *buf;
+
+	data_in = kmap(pages_in[0]);
+	tot_len = read_compress_length(data_in);
+
+	tot_in = LZO_LEN;
+	in_offset = LZO_LEN;
+	tot_len = min_t(size_t, srclen, tot_len);
+	in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+	tot_out = 0;
+	page_out = bvec[0].bv_page;
+	page_bytes_left = PAGE_CACHE_SIZE;
+	pg_offset = 0;
+
+	while (tot_in < tot_len) {
+		in_len = read_compress_length(data_in + in_offset);
+		in_page_bytes_left -= LZO_LEN;
+		in_offset += LZO_LEN;
+		tot_in += LZO_LEN;
+
+		tot_in += in_len;
+		working_bytes = in_len;
+
+		/* fast path: avoid using the working buffer */
+		if (in_page_bytes_left >= in_len) {
+			buf = data_in + in_offset;
+			bytes = in_len;
+			goto cont;
+		}
+
+		/* copy bytes from the pages into the working buffer */
+		buf = workspace->cbuf;
+		buf_offset = 0;
+		while (working_bytes) {
+			bytes = min(working_bytes, in_page_bytes_left);
+
+			memcpy(buf + buf_offset, data_in + in_offset, bytes);
+			buf_offset += bytes;
+cont:
+			working_bytes -= bytes;
+			in_page_bytes_left -= bytes;
+			in_offset += bytes;
+
+			/* check if we need to pick another page */
+			if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
+			    || in_page_bytes_left == 0) {
+				tot_in += in_page_bytes_left;
+
+				if (working_bytes == 0 && tot_in >= tot_len)
+					break;
+
+				kunmap(pages_in[page_in_index]);
+				page_in_index++;
+				if (page_in_index >= total_pages_in) {
+					ret = -1;
+					data_in = NULL;
+					goto done;
+				}
+				data_in = kmap(pages_in[page_in_index]);
+
+				in_page_bytes_left = PAGE_CACHE_SIZE;
+				in_offset = 0;
+			}
+		}
+
+		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
+					    &out_len);
+		if (ret != LZO_E_OK) {
+			printk(KERN_WARNING "btrfs decompress failed\n");
+			ret = -1;
+			break;
+		}
+
+		/*
+		 * buf start is the byte offset we're of the start of
+		 * our workspace buffer
+		 */
+		buf_start = tot_out;
+
+		/* tot_out is the last byte of the workspace buffer */
+		tot_out += out_len;
+
+		working_bytes = tot_out - buf_start;
+
+		/*
+		 * start_byte is the first byte of the page we're currently
+		 * copying into relative to the start of the compressed data.
+		 */
+		start_byte = page_offset(page_out) - disk_start;
+
+		if (working_bytes == 0) {
+			/* we didn't make progress in this inflate
+			 * call, we're done
+			 */
+			break;
+		}
+
+		/* we haven't yet hit data corresponding to this page */
+		if (tot_out <= start_byte)
+			continue;
+
+		/*
+		 * the start of the data we care about is offset into
+		 * the middle of our working buffer
+		 */
+		if (tot_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+			working_bytes -= buf_offset;
+		} else {
+			buf_offset = 0;
+		}
+		current_buf_start = buf_start;
+
+		/* copy bytes from the working buffer into the pages */
+		while (working_bytes > 0) {
+			bytes = min(PAGE_CACHE_SIZE - pg_offset,
+				    PAGE_CACHE_SIZE - buf_offset);
+			bytes = min(bytes, working_bytes);
+			kaddr = kmap_atomic(page_out, KM_USER0);
+			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+			       bytes);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page_out);
+
+			pg_offset += bytes;
+			page_bytes_left -= bytes;
+			buf_offset += bytes;
+			working_bytes -= bytes;
+			current_buf_start += bytes;
+
+			/* check if we need to pick another page */
+			if (page_bytes_left == 0) {
+				page_out_index++;
+				if (page_out_index >= vcnt) {
+					ret = 0;
+					goto done;
+				}
+
+				page_out = bvec[page_out_index].bv_page;
+				pg_offset = 0;
+				page_bytes_left = PAGE_CACHE_SIZE;
+				start_byte = page_offset(page_out) - disk_start;
+
+				/*
+				 * make sure our new page is covered by this
+				 * working buffer
+				 */
+				if (tot_out <= start_byte)
+					break;
+
+				/* the next page in the biovec might not
+				 * be adjacent to the last page, but it
+				 * might still be found inside this working
+				 * buffer.  bump our offset pointer
+				 */
+				if (tot_out > start_byte &&
+				    current_buf_start < start_byte) {
+					buf_offset = start_byte - buf_start;
+					working_bytes = tot_out - start_byte;
+					current_buf_start = buf_start +
+						buf_offset;
+				}
+			}
+		}
+	}
+done:
+	if (data_in)
+		kunmap(pages_in[page_in_index]);
+	return ret;
+}
+
+static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	size_t in_len;
+	size_t out_len;
+	size_t tot_len;
+	int ret = 0;
+	char *kaddr;
+	unsigned long bytes;
+
+	BUG_ON(srclen < LZO_LEN);
+
+	tot_len = read_compress_length(data_in);
+	data_in += LZO_LEN;
+
+	in_len = read_compress_length(data_in);
+	data_in += LZO_LEN;
+
+	out_len = PAGE_CACHE_SIZE;
+	ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+	if (ret != LZO_E_OK) {
+		printk(KERN_WARNING "btrfs decompress failed!\n");
+		ret = -1;
+		goto out;
+	}
+
+	if (out_len < start_byte) {
+		ret = -1;
+		goto out;
+	}
+
+	bytes = min_t(unsigned long, destlen, out_len - start_byte);
+
+	kaddr = kmap_atomic(dest_page, KM_USER0);
+	memcpy(kaddr, workspace->buf + start_byte, bytes);
+	kunmap_atomic(kaddr, KM_USER0);
+out:
+	return ret;
+}
+
+struct btrfs_compress_op btrfs_lzo_compress = {
+	.alloc_workspace	= lzo_alloc_workspace,
+	.free_workspace		= lzo_free_workspace,
+	.compress_pages		= lzo_compress_pages,
+	.decompress_biovec	= lzo_decompress_biovec,
+	.decompress		= lzo_decompress,
+};
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f348f2b93164..a1a76b2a61f9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -168,6 +168,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			    strcmp(args[0].from, "zlib") == 0) {
 				compress_type = "zlib";
 				info->compress_type = BTRFS_COMPRESS_ZLIB;
+			} else if (strcmp(args[0].from, "lzo") == 0) {
+				compress_type = "lzo";
+				info->compress_type = BTRFS_COMPRESS_LZO;
 			} else {
 				ret = -EINVAL;
 				goto out;
-- 
cgit v1.2.2


From 1a419d85a76853d7d04e9b6280a80e96770bf3e3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 25 Oct 2010 15:12:50 +0800
Subject: btrfs: Allow to specify compress method when defrag

Update defrag ioctl, so one can choose lzo or zlib when turning
on compression in defrag operation.

Changelog:

v1 -> v2
- Add incompability flag.
- Fix to check invalid compress type.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 19 ++++++++++++++++++-
 fs/btrfs/ioctl.h |  9 ++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8cb86d4d763c..b6985d33eede 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -638,9 +638,11 @@ static int btrfs_defrag_file(struct file *file,
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
 	struct page *page;
+	struct btrfs_super_block *disk_super;
 	unsigned long last_index;
 	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
 	unsigned long total_read = 0;
+	u64 features;
 	u64 page_start;
 	u64 page_end;
 	u64 last_len = 0;
@@ -648,6 +650,14 @@ static int btrfs_defrag_file(struct file *file,
 	u64 defrag_end = 0;
 	unsigned long i;
 	int ret;
+	int compress_type = BTRFS_COMPRESS_ZLIB;
+
+	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+		if (range->compress_type > BTRFS_COMPRESS_TYPES)
+			return -EINVAL;
+		if (range->compress_type)
+			compress_type = range->compress_type;
+	}
 
 	if (inode->i_size == 0)
 		return 0;
@@ -683,7 +693,7 @@ static int btrfs_defrag_file(struct file *file,
 		total_read++;
 		mutex_lock(&inode->i_mutex);
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-			BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_ZLIB;
+			BTRFS_I(inode)->force_compress = compress_type;
 
 		ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 		if (ret)
@@ -785,6 +795,13 @@ loop_unlock:
 		mutex_unlock(&inode->i_mutex);
 	}
 
+	disk_super = &root->fs_info->super_copy;
+	features = btrfs_super_incompat_flags(disk_super);
+	if (range->compress_type == BTRFS_COMPRESS_LZO) {
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+		btrfs_set_super_incompat_flags(disk_super, features);
+	}
+
 	return 0;
 
 err_reservations:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..24d0f4628240 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -133,8 +133,15 @@ struct btrfs_ioctl_defrag_range_args {
 	 */
 	__u32 extent_thresh;
 
+	/*
+	 * which compression method to use if turning on compression
+	 * for this defrag operation.  If unspecified, zlib will
+	 * be used
+	 */
+	__u32 compress_type;
+
 	/* spare for later */
-	__u32 unused[5];
+	__u32 unused[4];
 };
 
 struct btrfs_ioctl_space_info {
-- 
cgit v1.2.2


From 3a39c18d63fec35f49df577d4b2a4e29c2212f22 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 8 Nov 2010 15:22:19 +0800
Subject: btrfs: Extract duplicate decompress code

Add a common function to copy decompressed data from working buffer
to bio pages.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/compression.c |  92 ++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/compression.h |   5 +++
 fs/btrfs/lzo.c         | 101 +++-----------------------------------------
 fs/btrfs/zlib.c        | 111 ++++++-------------------------------------------
 4 files changed, 115 insertions(+), 194 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8faa2df9e719..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -904,3 +904,95 @@ void __exit btrfs_exit_compress(void)
 {
 	free_workspaces();
 }
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset we're of the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset)
+{
+	unsigned long buf_offset;
+	unsigned long current_buf_start;
+	unsigned long start_byte;
+	unsigned long working_bytes = total_out - buf_start;
+	unsigned long bytes;
+	char *kaddr;
+	struct page *page_out = bvec[*page_index].bv_page;
+
+	/*
+	 * start byte is the first byte of the page we're currently
+	 * copying into relative to the start of the compressed data.
+	 */
+	start_byte = page_offset(page_out) - disk_start;
+
+	/* we haven't yet hit data corresponding to this page */
+	if (total_out <= start_byte)
+		return 1;
+
+	/*
+	 * the start of the data we care about is offset into
+	 * the middle of our working buffer
+	 */
+	if (total_out > start_byte && buf_start < start_byte) {
+		buf_offset = start_byte - buf_start;
+		working_bytes -= buf_offset;
+	} else {
+		buf_offset = 0;
+	}
+	current_buf_start = buf_start;
+
+	/* copy bytes from the working buffer into the pages */
+	while (working_bytes > 0) {
+		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, working_bytes);
+		kaddr = kmap_atomic(page_out, KM_USER0);
+		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page_out);
+
+		*pg_offset += bytes;
+		buf_offset += bytes;
+		working_bytes -= bytes;
+		current_buf_start += bytes;
+
+		/* check if we need to pick another page */
+		if (*pg_offset == PAGE_CACHE_SIZE) {
+			(*page_index)++;
+			if (*page_index >= vcnt)
+				return 0;
+
+			page_out = bvec[*page_index].bv_page;
+			*pg_offset = 0;
+			start_byte = page_offset(page_out) - disk_start;
+
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
+
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer. bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
+		}
+	}
+
+	return 1;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index f7ce217113fa..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -34,6 +34,11 @@ int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
 			    struct bio_vec *bvec, int vcnt, size_t srclen);
 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 		     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset);
 
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long len, u64 disk_start,
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 523b144e2aec..cc9b450399df 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -260,12 +260,10 @@ static int lzo_decompress_biovec(struct list_head *ws,
 				 size_t srclen)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
-	int ret = 0;
+	int ret = 0, ret2;
 	char *data_in;
-	unsigned long page_bytes_left;
 	unsigned long page_in_index = 0;
 	unsigned long page_out_index = 0;
-	struct page *page_out;
 	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
 					PAGE_CACHE_SIZE;
 	unsigned long buf_start;
@@ -273,9 +271,6 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	unsigned long bytes;
 	unsigned long working_bytes;
 	unsigned long pg_offset;
-	unsigned long start_byte;
-	unsigned long current_buf_start;
-	char *kaddr;
 
 	size_t in_len;
 	size_t out_len;
@@ -295,8 +290,6 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
 
 	tot_out = 0;
-	page_out = bvec[0].bv_page;
-	page_bytes_left = PAGE_CACHE_SIZE;
 	pg_offset = 0;
 
 	while (tot_in < tot_len) {
@@ -359,97 +352,15 @@ cont:
 			break;
 		}
 
-		/*
-		 * buf start is the byte offset we're of the start of
-		 * our workspace buffer
-		 */
 		buf_start = tot_out;
-
-		/* tot_out is the last byte of the workspace buffer */
 		tot_out += out_len;
 
-		working_bytes = tot_out - buf_start;
-
-		/*
-		 * start_byte is the first byte of the page we're currently
-		 * copying into relative to the start of the compressed data.
-		 */
-		start_byte = page_offset(page_out) - disk_start;
-
-		if (working_bytes == 0) {
-			/* we didn't make progress in this inflate
-			 * call, we're done
-			 */
+		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+						 tot_out, disk_start,
+						 bvec, vcnt,
+						 &page_out_index, &pg_offset);
+		if (ret2 == 0)
 			break;
-		}
-
-		/* we haven't yet hit data corresponding to this page */
-		if (tot_out <= start_byte)
-			continue;
-
-		/*
-		 * the start of the data we care about is offset into
-		 * the middle of our working buffer
-		 */
-		if (tot_out > start_byte && buf_start < start_byte) {
-			buf_offset = start_byte - buf_start;
-			working_bytes -= buf_offset;
-		} else {
-			buf_offset = 0;
-		}
-		current_buf_start = buf_start;
-
-		/* copy bytes from the working buffer into the pages */
-		while (working_bytes > 0) {
-			bytes = min(PAGE_CACHE_SIZE - pg_offset,
-				    PAGE_CACHE_SIZE - buf_offset);
-			bytes = min(bytes, working_bytes);
-			kaddr = kmap_atomic(page_out, KM_USER0);
-			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
-			       bytes);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(page_out);
-
-			pg_offset += bytes;
-			page_bytes_left -= bytes;
-			buf_offset += bytes;
-			working_bytes -= bytes;
-			current_buf_start += bytes;
-
-			/* check if we need to pick another page */
-			if (page_bytes_left == 0) {
-				page_out_index++;
-				if (page_out_index >= vcnt) {
-					ret = 0;
-					goto done;
-				}
-
-				page_out = bvec[page_out_index].bv_page;
-				pg_offset = 0;
-				page_bytes_left = PAGE_CACHE_SIZE;
-				start_byte = page_offset(page_out) - disk_start;
-
-				/*
-				 * make sure our new page is covered by this
-				 * working buffer
-				 */
-				if (tot_out <= start_byte)
-					break;
-
-				/* the next page in the biovec might not
-				 * be adjacent to the last page, but it
-				 * might still be found inside this working
-				 * buffer.  bump our offset pointer
-				 */
-				if (tot_out > start_byte &&
-				    current_buf_start < start_byte) {
-					buf_offset = start_byte - buf_start;
-					working_bytes = tot_out - start_byte;
-					current_buf_start = buf_start +
-						buf_offset;
-				}
-			}
-		}
 	}
 done:
 	if (data_in)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 9a3e693917f2..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -218,24 +218,16 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 				  size_t srclen)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
-	int ret = 0;
+	int ret = 0, ret2;
 	int wbits = MAX_WBITS;
 	char *data_in;
 	size_t total_out = 0;
-	unsigned long page_bytes_left;
 	unsigned long page_in_index = 0;
 	unsigned long page_out_index = 0;
-	struct page *page_out;
 	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
 					PAGE_CACHE_SIZE;
 	unsigned long buf_start;
-	unsigned long buf_offset;
-	unsigned long bytes;
-	unsigned long working_bytes;
 	unsigned long pg_offset;
-	unsigned long start_byte;
-	unsigned long current_buf_start;
-	char *kaddr;
 
 	data_in = kmap(pages_in[page_in_index]);
 	workspace->inf_strm.next_in = data_in;
@@ -245,8 +237,6 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 	workspace->inf_strm.total_out = 0;
 	workspace->inf_strm.next_out = workspace->buf;
 	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
-	page_out = bvec[page_out_index].bv_page;
-	page_bytes_left = PAGE_CACHE_SIZE;
 	pg_offset = 0;
 
 	/* If it's deflate, and it's got no preset dictionary, then
@@ -268,100 +258,23 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
 		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		/*
-		 * buf start is the byte offset we're of the start of
-		 * our workspace buffer
-		 */
-		buf_start = total_out;
 
-		/* total_out is the last byte of the workspace buffer */
+		buf_start = total_out;
 		total_out = workspace->inf_strm.total_out;
 
-		working_bytes = total_out - buf_start;
-
-		/*
-		 * start byte is the first byte of the page we're currently
-		 * copying into relative to the start of the compressed data.
-		 */
-		start_byte = page_offset(page_out) - disk_start;
-
-		if (working_bytes == 0) {
-			/* we didn't make progress in this inflate
-			 * call, we're done
-			 */
-			if (ret != Z_STREAM_END)
-				ret = -1;
+		/* we didn't make progress in this inflate call, we're done */
+		if (buf_start == total_out)
 			break;
-		}
 
-		/* we haven't yet hit data corresponding to this page */
-		if (total_out <= start_byte)
-			goto next;
-
-		/*
-		 * the start of the data we care about is offset into
-		 * the middle of our working buffer
-		 */
-		if (total_out > start_byte && buf_start < start_byte) {
-			buf_offset = start_byte - buf_start;
-			working_bytes -= buf_offset;
-		} else {
-			buf_offset = 0;
-		}
-		current_buf_start = buf_start;
-
-		/* copy bytes from the working buffer into the pages */
-		while (working_bytes > 0) {
-			bytes = min(PAGE_CACHE_SIZE - pg_offset,
-				    PAGE_CACHE_SIZE - buf_offset);
-			bytes = min(bytes, working_bytes);
-			kaddr = kmap_atomic(page_out, KM_USER0);
-			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
-			       bytes);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(page_out);
-
-			pg_offset += bytes;
-			page_bytes_left -= bytes;
-			buf_offset += bytes;
-			working_bytes -= bytes;
-			current_buf_start += bytes;
-
-			/* check if we need to pick another page */
-			if (page_bytes_left == 0) {
-				page_out_index++;
-				if (page_out_index >= vcnt) {
-					ret = 0;
-					goto done;
-				}
-
-				page_out = bvec[page_out_index].bv_page;
-				pg_offset = 0;
-				page_bytes_left = PAGE_CACHE_SIZE;
-				start_byte = page_offset(page_out) - disk_start;
-
-				/*
-				 * make sure our new page is covered by this
-				 * working buffer
-				 */
-				if (total_out <= start_byte)
-					goto next;
-
-				/* the next page in the biovec might not
-				 * be adjacent to the last page, but it
-				 * might still be found inside this working
-				 * buffer.  bump our offset pointer
-				 */
-				if (total_out > start_byte &&
-				    current_buf_start < start_byte) {
-					buf_offset = start_byte - buf_start;
-					working_bytes = total_out - start_byte;
-					current_buf_start = buf_start +
-						buf_offset;
-				}
-			}
+		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+						 total_out, disk_start,
+						 bvec, vcnt,
+						 &page_out_index, &pg_offset);
+		if (ret2 == 0) {
+			ret = 0;
+			goto done;
 		}
-next:
+
 		workspace->inf_strm.next_out = workspace->buf;
 		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 
-- 
cgit v1.2.2


From fa0d2b9bd717340e0bc4850a80ac0eb344e9a7fb Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 20 Dec 2010 15:53:28 +0800
Subject: Btrfs: Refactor btrfs_ioctl_snap_create()

Split it into two functions for two different ioctls, since they
share no common code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 84 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 40 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..02554e19d974 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -946,58 +946,54 @@ out:
 }
 
 static noinline int btrfs_ioctl_snap_create(struct file *file,
-					    void __user *arg, int subvol,
-					    int v2)
+					    void __user *arg, int subvol)
 {
-	struct btrfs_ioctl_vol_args *vol_args = NULL;
-	struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
-	char *name;
-	u64 fd;
+	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
-	if (v2) {
-		u64 transid = 0;
-		u64 *ptr = NULL;
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-		vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
-		if (IS_ERR(vol_args_v2))
-			return PTR_ERR(vol_args_v2);
+	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+					      vol_args->fd, subvol, NULL);
 
-		if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		name = vol_args_v2->name;
-		fd = vol_args_v2->fd;
-		vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+	kfree(vol_args);
+	return ret;
+}
 
-		if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
-			ptr = &transid;
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
+					       void __user *arg, int subvol)
+{
+	struct btrfs_ioctl_vol_args_v2 *vol_args;
+	int ret;
+	u64 transid = 0;
+	u64 *ptr = NULL;
 
-		ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-						      subvol, ptr);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
+	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 
-		if (ret == 0 && ptr &&
-		    copy_to_user(arg +
-				 offsetof(struct btrfs_ioctl_vol_args_v2,
-					  transid), ptr, sizeof(*ptr)))
-			ret = -EFAULT;
-	} else {
-		vol_args = memdup_user(arg, sizeof(*vol_args));
-		if (IS_ERR(vol_args))
-			return PTR_ERR(vol_args);
-		name = vol_args->name;
-		fd = vol_args->fd;
-		vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-
-		ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-						      subvol, NULL);
+	if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
+		ret = -EINVAL;
+		goto out;
 	}
+
+	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+		ptr = &transid;
+
+	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+					      vol_args->fd, subvol, ptr);
+
+	if (ret == 0 && ptr &&
+	    copy_to_user(arg +
+			 offsetof(struct btrfs_ioctl_vol_args_v2,
+				  transid), ptr, sizeof(*ptr)))
+		ret = -EFAULT;
 out:
 	kfree(vol_args);
-	kfree(vol_args_v2);
-
 	return ret;
 }
 
@@ -2257,11 +2253,11 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case FS_IOC_GETVERSION:
 		return btrfs_ioctl_getversion(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(file, argp, 0, 0);
+		return btrfs_ioctl_snap_create(file, argp, 0);
 	case BTRFS_IOC_SNAP_CREATE_V2:
-		return btrfs_ioctl_snap_create(file, argp, 0, 1);
+		return btrfs_ioctl_snap_create_v2(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
-		return btrfs_ioctl_snap_create(file, argp, 1, 0);
+		return btrfs_ioctl_snap_create(file, argp, 1);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
 	case BTRFS_IOC_DEFAULT_SUBVOL:
-- 
cgit v1.2.2


From b83cc9693f39689490970c19f6c5b866f6719a70 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 20 Dec 2010 16:04:08 +0800
Subject: Btrfs: Add readonly snapshots support

Usage:

Set BTRFS_SUBVOL_RDONLY of btrfs_ioctl_vol_arg_v2->flags, and call
ioctl(BTRFS_I0CTL_SNAP_CREATE_V2).

Implementation:

- Set readonly bit of btrfs_root_item->flags.
- Add readonly checks in btrfs_permission (inode_permission),
btrfs_setattr, btrfs_set/remove_xattr and some ioctls.

Changelog for v3:

- Eliminate btrfs_root->readonly, but check btrfs_root->root_item.flags.
- Rename BTRFS_ROOT_SNAP_RDONLY to BTRFS_ROOT_SUBVOL_RDONLY.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |  7 +++++++
 fs/btrfs/inode.c       |  8 ++++++++
 fs/btrfs/ioctl.c       | 42 ++++++++++++++++++++++++++++++++----------
 fs/btrfs/ioctl.h       |  1 +
 fs/btrfs/transaction.c |  8 ++++++++
 fs/btrfs/transaction.h |  1 +
 fs/btrfs/xattr.c       | 18 ++++++++++++++++++
 7 files changed, 75 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index af52f6d7a4d8..4403e5643d43 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -597,6 +597,8 @@ struct btrfs_dir_item {
 	u8 type;
 } __attribute__ ((__packed__));
 
+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -1893,6 +1895,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f9194438f7c..956f1eb913b1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3671,8 +3671,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int err;
 
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	err = inode_change_ok(inode, attr);
 	if (err)
 		return err;
@@ -7206,6 +7210,10 @@ static int btrfs_set_page_dirty(struct page *page)
 
 static int btrfs_permission(struct inode *inode, int mask)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+		return -EROFS;
 	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
 	return generic_permission(inode, mask, btrfs_check_acl);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 02554e19d974..f066ccb5dddf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	unsigned int flags, oldflags;
 	int ret;
 
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
 
@@ -360,7 +363,8 @@ fail:
 }
 
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen, u64 *async_transid)
+			   char *name, int namelen, u64 *async_transid,
+			   bool readonly)
 {
 	struct inode *inode;
 	struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
+	pending_snapshot->readonly = readonly;
 
 	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
 	if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 static noinline int btrfs_mksubvol(struct path *parent,
 				   char *name, int namelen,
 				   struct btrfs_root *snap_src,
-				   u64 *async_transid)
+				   u64 *async_transid, bool readonly)
 {
 	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 
 	if (snap_src) {
 		error = create_snapshot(snap_src, dentry,
-					name, namelen, async_transid);
+					name, namelen, async_transid, readonly);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
 				      name, namelen, async_transid);
@@ -901,7 +906,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 						    char *name,
 						    unsigned long fd,
 						    int subvol,
-						    u64 *transid)
+						    u64 *transid,
+						    bool readonly)
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct file *src_file;
@@ -919,7 +925,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 
 	if (subvol) {
 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
-				     NULL, transid);
+				     NULL, transid, readonly);
 	} else {
 		struct inode *src_inode;
 		src_file = fget(fd);
@@ -938,7 +944,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 		}
 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
 				     BTRFS_I(src_inode)->root,
-				     transid);
+				     transid, readonly);
 		fput(src_file);
 	}
 out:
@@ -957,7 +963,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-					      vol_args->fd, subvol, NULL);
+					      vol_args->fd, subvol,
+					      NULL, false);
 
 	kfree(vol_args);
 	return ret;
@@ -970,22 +977,27 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	int ret;
 	u64 transid = 0;
 	u64 *ptr = NULL;
+	bool readonly = false;
 
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
 		return PTR_ERR(vol_args);
 	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 
-	if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
-		ret = -EINVAL;
+	if (vol_args->flags &
+	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+		ret = -EOPNOTSUPP;
 		goto out;
 	}
 
 	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
 		ptr = &transid;
+	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+		readonly = true;
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-					      vol_args->fd, subvol, ptr);
+					      vol_args->fd, subvol,
+					      ptr, readonly);
 
 	if (ret == 0 && ptr &&
 	    copy_to_user(arg +
@@ -1505,6 +1517,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 	struct btrfs_ioctl_defrag_range_args *range;
 	int ret;
 
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	ret = mnt_want_write(file->f_path.mnt);
 	if (ret)
 		return ret;
@@ -1633,6 +1648,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
 		return -EINVAL;
 
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	ret = mnt_want_write(file->f_path.mnt);
 	if (ret)
 		return ret;
@@ -1954,6 +1972,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	if (file->private_data)
 		goto out;
 
+	ret = -EROFS;
+	if (btrfs_root_readonly(root))
+		goto out;
+
 	ret = mnt_want_write(file->f_path.mnt);
 	if (ret)
 		goto out;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..52ae489974be 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
 };
 
 #define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
+#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
 
 #define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..29e30d832ec9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -910,6 +910,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
+	u64 root_flags;
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
@@ -967,6 +968,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
+	root_flags = btrfs_root_flags(new_root_item);
+	if (pending->readonly)
+		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+	else
+		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+	btrfs_set_root_flags(new_root_item, root_flags);
+
 	old = btrfs_lock_root_node(root);
 	btrfs_cow_block(trans, root, old, NULL, 0, &old);
 	btrfs_set_lock_blocking(old);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
 	struct btrfs_block_rsv block_rsv;
 	/* extra metadata reseration for relocation */
 	int error;
+	bool readonly;
 	struct list_head list;
 };
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags)
 {
+	struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+
+	/*
+	 * The permission on security.* and system.* is not checked
+	 * in permission().
+	 */
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	/*
 	 * If this is a request for a synthetic attribute in the system.*
 	 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 int btrfs_removexattr(struct dentry *dentry, const char *name)
 {
+	struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+
+	/*
+	 * The permission on security.* and system.* is not checked
+	 * in permission().
+	 */
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	/*
 	 * If this is a request for a synthetic attribute in the system.*
 	 * namespace use the generic infrastructure to resolve a handler
-- 
cgit v1.2.2


From 0caa102da82799efaba88e234484786a9591c797 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 20 Dec 2010 16:30:25 +0800
Subject: Btrfs: Add BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS ioctls

This allows us to set a snapshot or a subvolume readonly or writable
on the fly.

Usage:

Set BTRFS_SUBVOL_RDONLY of btrfs_ioctl_vol_arg_v2->flags, and then
call ioctl(BTRFS_IOCTL_SUBVOL_SETFLAGS);

Changelog for v3:

- Change to pass __u64 as ioctl parameter.

Changelog for v2:

- Add _GETFLAGS ioctl.
- Check if the passed fd is the root of a subvolume.
- Change the name from _SNAP_SETFLAGS to _SUBVOL_SETFLAGS.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h |  2 ++
 2 files changed, 85 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f066ccb5dddf..ad1983524f97 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1009,6 +1009,85 @@ out:
 	return ret;
 }
 
+static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+						void __user *arg)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+	u64 flags = 0;
+
+	if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+		return -EINVAL;
+
+	down_read(&root->fs_info->subvol_sem);
+	if (btrfs_root_readonly(root))
+		flags |= BTRFS_SUBVOL_RDONLY;
+	up_read(&root->fs_info->subvol_sem);
+
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+					      void __user *arg)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	u64 root_flags;
+	u64 flags;
+	int ret = 0;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+		return -EINVAL;
+
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
+
+	if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
+		return -EINVAL;
+
+	if (flags & ~BTRFS_SUBVOL_RDONLY)
+		return -EOPNOTSUPP;
+
+	down_write(&root->fs_info->subvol_sem);
+
+	/* nothing to do */
+	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+		goto out;
+
+	root_flags = btrfs_root_flags(&root->root_item);
+	if (flags & BTRFS_SUBVOL_RDONLY)
+		btrfs_set_root_flags(&root->root_item,
+				     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+	else
+		btrfs_set_root_flags(&root->root_item,
+				     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_reset;
+	}
+
+	ret = btrfs_update_root(trans, root,
+				&root->root_key, &root->root_item);
+
+	btrfs_commit_transaction(trans, root);
+out_reset:
+	if (ret)
+		btrfs_set_root_flags(&root->root_item, root_flags);
+out:
+	up_write(&root->fs_info->subvol_sem);
+	return ret;
+}
+
 /*
  * helper to check if the subvolume references other subvolumes
  */
@@ -2282,6 +2361,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_snap_create(file, argp, 1);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
+	case BTRFS_IOC_SUBVOL_GETFLAGS:
+		return btrfs_ioctl_subvol_getflags(file, argp);
+	case BTRFS_IOC_SUBVOL_SETFLAGS:
+		return btrfs_ioctl_subvol_setflags(file, argp);
 	case BTRFS_IOC_DEFAULT_SUBVOL:
 		return btrfs_ioctl_default_subvol(file, argp);
 	case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 52ae489974be..1223223351fa 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -194,4 +194,6 @@ struct btrfs_ioctl_space_args {
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
 #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
 				   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #endif
-- 
cgit v1.2.2


From 22b6dee842c6341b49bc09cc5728eb2f8f2b3766 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Mon, 27 Dec 2010 14:29:57 +0800
Subject: nfsd4: fix oops on secinfo_no_name result encoding

The secinfo_no_name code oopses on encoding with

	BUG: unable to handle kernel NULL pointer dereference at 00000044
	IP: [<e2bd239a>] nfsd4_encode_secinfo+0x1c/0x1c1 [nfsd]

We should implement a nfsd4_encode_secinfo_no_name() instead using
nfsd4_encode_secinfo().

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b543b2410b54..437b4623cb02 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2845,11 +2845,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 }
 
 static __be32
-nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
-		     struct nfsd4_secinfo *secinfo)
+nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
+			 __be32 nfserr,struct svc_export *exp)
 {
 	int i = 0;
-	struct svc_export *exp = secinfo->si_exp;
 	u32 nflavs;
 	struct exp_flavor_info *flavs;
 	struct exp_flavor_info def_flavs[2];
@@ -2911,6 +2910,20 @@ out:
 	return nfserr;
 }
 
+static __be32
+nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		     struct nfsd4_secinfo *secinfo)
+{
+	return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
+}
+
+static __be32
+nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+		     struct nfsd4_secinfo_no_name *secinfo)
+{
+	return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
+}
+
 /*
  * The SETATTR encode routine is special -- it always encodes a bitmap,
  * regardless of the error status.
@@ -3173,7 +3186,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo,
+	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo_no_name,
 	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
 	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_TEST_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
-- 
cgit v1.2.2


From 65e5341b9a0c39767ae1fecc727d70eda0dd6d83 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 24 Dec 2010 06:41:52 -0500
Subject: Btrfs: fix off by one while setting block groups readonly

When we read in block groups, we'll set non-redundant groups
readonly if we find a raid1, DUP or raid10 group.  But the
ro code has an off by one bug in the math around testing to
make sure out accounting doesn't go wrong.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7e5162e5c411..b180efdc8b68 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7971,13 +7971,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
 	    sinfo->bytes_may_use + sinfo->bytes_readonly +
-	    cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+	    cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
 		sinfo->bytes_reserved += cache->reserved_pinned;
 		cache->reserved_pinned = 0;
 		cache->ro = 1;
 		ret = 0;
 	}
+
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 	return ret;
-- 
cgit v1.2.2


From 65e4c8945575abca4e368e05ca3e9f77df030290 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Thu, 16 Dec 2010 15:25:54 +0200
Subject: nfsd: declare several functions of nfs4callback as static

setup_callback_client(), nfsd4_release_cb() and nfsd4_process_cb_update()
do not have users outside the translation unit. Let's declare it as
static.

Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7b..a08580553fda 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -473,7 +473,8 @@ static int max_cb_time(void)
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cl_cb_set an atomic? */
 
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+static int setup_callback_client(struct nfs4_client *clp,
+		struct nfs4_cb_conn *conn)
 {
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
@@ -748,13 +749,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
 	flush_workqueue(callback_wq);
 }
 
-void nfsd4_release_cb(struct nfsd4_callback *cb)
+static void nfsd4_release_cb(struct nfsd4_callback *cb)
 {
 	if (cb->cb_ops->rpc_release)
 		cb->cb_ops->rpc_release(cb);
 }
 
-void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 {
 	struct nfs4_cb_conn conn;
 	struct nfs4_client *clp = cb->cb_clp;
-- 
cgit v1.2.2


From 3beb6cd1d448e7ded938bbd676493e6a08e9a6cd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 1 Jan 2011 15:43:50 -0500
Subject: nfsd: don't drop requests on -ENOMEM

We never want to drop a request if we could return a JUKEBOX/DELAY error
instead; so, convert to nfserr_jukebox and let nfsd_dispatch() convert
that to a dropit error as a last resort if JUKEBOX/DELAY is unavailable
(as in the NFSv2 case).

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsproc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..dc9c2e3fd1b8 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -736,7 +736,7 @@ nfserrno (int errno)
 		{ nfserr_jukebox, -ETIMEDOUT },
 		{ nfserr_jukebox, -ERESTARTSYS },
 		{ nfserr_dropit, -EAGAIN },
-		{ nfserr_dropit, -ENOMEM },
+		{ nfserr_jukebox, -ENOMEM },
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
-- 
cgit v1.2.2


From 9e701c610923aaeac8b38b9202a686d1cc9ee35d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 2 Jan 2011 21:56:36 -0500
Subject: svcrpc: simpler request dropping

Currently we use -EAGAIN returns to determine when to drop a deferred
request.  On its own, that is error-prone, as it makes us treat -EAGAIN
returns from other functions specially to prevent inadvertent dropping.

So, use a flag on the request instead.

Returning an error on request deferral is still required, to prevent
further processing, but we no longer need worry that an error return on
its own could result in a drop.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfssvc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f2..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
-	if (nfserr == nfserr_dropit) {
+	if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
 		dprintk("nfsd: Dropping request; may be revisited later\n");
 		nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
 		return 0;
-- 
cgit v1.2.2


From 062304a815fe10068c478a4a3f28cf091c55cb82 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 2 Jan 2011 22:05:33 -0500
Subject: nfsd: stop translating EAGAIN to nfserr_dropit

We no longer need this.

Also, EWOULDBLOCK is generally a synonym for EAGAIN, but that may not be
true on all architectures, so map it as well.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsproc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index dc9c2e3fd1b8..fd608a27a8d5 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,7 +735,8 @@ nfserrno (int errno)
 		{ nfserr_stale, -ESTALE },
 		{ nfserr_jukebox, -ETIMEDOUT },
 		{ nfserr_jukebox, -ERESTARTSYS },
-		{ nfserr_dropit, -EAGAIN },
+		{ nfserr_jukebox, -EAGAIN },
+		{ nfserr_jukebox, -EWOULDBLOCK },
 		{ nfserr_jukebox, -ENOMEM },
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
-- 
cgit v1.2.2


From da165dd60e136d0609e0a2c0c2a9b9a5372200d6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 2 Jan 2011 22:13:18 -0500
Subject: nfsd: remove some unnecessary dropit handling

We no longer need a few of these special cases.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c  | 4 ----
 fs/nfsd/nfs4state.c | 2 --
 fs/nfsd/nfs4xdr.c   | 2 --
 fs/nfsd/vfs.c       | 4 ----
 4 files changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f80c3997d24c..fd6694b49e1c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1156,10 +1156,6 @@ encode_op:
 
 		nfsd4_increment_op_stats(op->opnum);
 	}
-	if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
-		dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
-		status = nfserr_jukebox;
-	}
 
 	resp->cstate.status = status;
 	fh_put(&resp->cstate.current_fh);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 73adcfb2dc17..b82e36862044 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2509,8 +2509,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
 	if (!fp->fi_fds[oflag]) {
 		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
 			&fp->fi_fds[oflag]);
-		if (status == nfserr_dropit)
-			status = nfserr_jukebox;
 		if (status)
 			return status;
 	}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 437b4623cb02..364aae7d5998 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2328,8 +2328,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 	case nfserr_resource:
 		nfserr = nfserr_toosmall;
 		goto fail;
-	case nfserr_dropit:
-		goto fail;
 	case nfserr_noent:
 		goto skip_entry;
 	default:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff04..6a3af2ff3afe 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -380,8 +380,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 * we need to break all leases.
 		 */
 		host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
-		if (host_err == -EWOULDBLOCK)
-			host_err = -ETIMEDOUT;
 		if (host_err) /* ENOMEM or EWOULDBLOCK */
 			goto out_nfserr;
 
@@ -752,8 +750,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 */
 	if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
 		host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
-	if (host_err == -EWOULDBLOCK)
-		host_err = -ETIMEDOUT;
 	if (host_err) /* NOMEM or WOULDBLOCK */
 		goto out_nfserr;
 
-- 
cgit v1.2.2


From e63eb9375089f9d2041305d04c3f33a194e0e014 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 30 Oct 2010 17:41:26 -0400
Subject: nfsd4: eliminate lease delete callback

nfsd controls the lifetime of the lease, not the lock code, so there's
no need for this callback on lease destruction.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b82e36862044..2e44ad2539ab 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2295,23 +2295,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	nfsd4_cb_recall(dp);
 }
 
-/*
- * The file_lock is being reapd.
- *
- * Called by locks_free_lock() with lock_flocks() held.
- */
-static
-void nfsd_release_deleg_cb(struct file_lock *fl)
-{
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
-
-	dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
-
-	if (!(fl->fl_flags & FL_LEASE) || !dp)
-		return;
-	dp->dl_flock = NULL;
-}
-
 /*
  * Called from setlease() with lock_flocks() held
  */
@@ -2341,7 +2324,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
-	.fl_release_private = nfsd_release_deleg_cb,
 	.fl_mylease = nfsd_same_client_deleg_cb,
 	.fl_change = nfsd_change_deleg_cb,
 };
-- 
cgit v1.2.2


From c84d500bc41658165ceb0dd04dc6a75249940fba Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 30 Oct 2010 23:35:04 -0400
Subject: nfsd4: use a single struct file for delegations

When we converted to sharing struct filess between nfs4 opens I went too
far and also used the same mechanism for delegations.  But keeping
a reference to the struct file ensures it will outlast the lease, and
allows us to remove the lease with the same file as we added it.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 10 +++++-----
 fs/nfsd/state.h     |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e44ad2539ab..cbe1b81c147d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_client = clp;
 	get_nfs4_file(fp);
 	dp->dl_file = fp;
-	nfs4_file_get_access(fp, O_RDONLY);
+	dp->dl_vfs_file = find_readable_file(fp);
+	get_file(dp->dl_vfs_file);
 	dp->dl_flock = NULL;
 	dp->dl_type = type;
 	dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 	if (atomic_dec_and_test(&dp->dl_count)) {
 		dprintk("NFSD: freeing dp %p\n",dp);
 		put_nfs4_file(dp->dl_file);
+		fput(dp->dl_vfs_file);
 		kmem_cache_free(deleg_slab, dp);
 		num_delegations--;
 	}
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 static void
 nfs4_close_delegation(struct nfs4_delegation *dp)
 {
-	struct file *filp = find_readable_file(dp->dl_file);
-
 	dprintk("NFSD: close_delegation dp %p\n",dp);
+	/* XXX: do we even need this check?: */
 	if (dp->dl_flock)
-		vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
-	nfs4_file_put_access(dp->dl_file, O_RDONLY);
+		vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
 }
 
 /* Called under the state lock. */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b0685..84b230217b1b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -81,6 +81,7 @@ struct nfs4_delegation {
 	atomic_t		dl_count;       /* ref count */
 	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
+	struct file		*dl_vfs_file;
 	struct file_lock	*dl_flock;
 	u32			dl_type;
 	time_t			dl_time;
-- 
cgit v1.2.2


From c45821d263a8a5109d69a9e8942b8d65bcd5f31a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 31 Oct 2010 00:04:44 -0400
Subject: locks: eliminate fl_mylease callback

The nfs server only supports read delegations for now, so we don't care
how conflicts are determined.  All we care is that unlocks are
recognized as matching the leases they are meant to remove.  After the
last patch, a comparison of struct files will work for that purpose.  So
we no longer need this callback.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c          |  8 +-------
 fs/nfsd/nfs4state.c | 21 +--------------------
 2 files changed, 2 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 8729347bcd1a..5cb65062281a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
 	fl->fl_file->f_owner.signum = 0;
 }
 
-static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
-{
-	return fl->fl_file == try->fl_file;
-}
-
 static const struct lock_manager_operations lease_manager_ops = {
 	.fl_break = lease_break_callback,
 	.fl_release_private = lease_release_private_callback,
-	.fl_mylease = lease_mylease_callback,
 	.fl_change = lease_modify,
 };
 
@@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	for (before = &inode->i_flock;
 			((fl = *before) != NULL) && IS_LEASE(fl);
 			before = &fl->fl_next) {
-		if (lease->fl_lmops->fl_mylease(fl, lease))
+		if (fl->fl_file == lease->fl_file)
 			my_before = before;
 		else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
 			/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cbe1b81c147d..87d4c48b6069 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2295,24 +2295,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	nfsd4_cb_recall(dp);
 }
 
-/*
- * Called from setlease() with lock_flocks() held
- */
-static
-int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
-{
-	struct nfs4_delegation *onlistd =
-		(struct nfs4_delegation *)onlist->fl_owner;
-	struct nfs4_delegation *tryd =
-		(struct nfs4_delegation *)try->fl_owner;
-
-	if (onlist->fl_lmops != try->fl_lmops)
-		return 0;
-
-	return onlistd->dl_client == tryd->dl_client;
-}
-
-
 static
 int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 {
@@ -2324,7 +2306,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
-	.fl_mylease = nfsd_same_client_deleg_cb,
 	.fl_change = nfsd_change_deleg_cb,
 };
 
@@ -2630,7 +2611,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	dp->dl_flock = fl;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
-	 * the lock_manager callbacks fl_mylease and fl_change are used
+	 * the lock_manager callback fl_change is used
 	 */
 	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
-- 
cgit v1.2.2


From 255c7cf810e4776ae8f1023332060459f30d8a2a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 31 Oct 2010 12:35:48 -0400
Subject: locks: minor setlease cleanup

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 5cb65062281a..feaac634d0da 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1399,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	for (before = &inode->i_flock;
 			((fl = *before) != NULL) && IS_LEASE(fl);
 			before = &fl->fl_next) {
-		if (fl->fl_file == lease->fl_file)
+		if (fl->fl_file == filp)
 			my_before = before;
 		else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
 			/*
-- 
cgit v1.2.2


From f6af99ec1b261e21219d5eba99e3af48fc6c32d4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 4 Jan 2011 18:02:15 -0500
Subject: nfsd4: name->id mapping should fail with BADOWNER not BADNAME

According to rfc 3530 BADNAME is for strings that represent paths;
BADOWNER is for user/group names that don't map.

And the too-long name should probably be BADOWNER as well; it's
effectively the same as if we couldn't map it.

Cc: stable@kernel.org
Reported-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Reported-by: Simon Kirby <sim@hostway.ca>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4idmap.c | 4 ++--
 fs/nfsd/nfsd.h      | 1 +
 fs/nfsd/nfsproc.c   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0e..844960fd0395 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -524,13 +524,13 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 	int ret;
 
 	if (namelen + 1 > sizeof(key.name))
-		return -EINVAL;
+		return -ESRCH; /* nfserr_badowner */
 	memcpy(key.name, name, namelen);
 	key.name[namelen] = '\0';
 	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
 	if (ret == -ENOENT)
-		ret = -ESRCH; /* nfserr_badname */
+		ret = -ESRCH; /* nfserr_badowner */
 	if (ret)
 		return ret;
 	*id = item->id;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19a..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void		nfsd_lockd_shutdown(void);
 #define	nfserr_attrnotsupp	cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define	nfserr_bad_xdr		cpu_to_be32(NFSERR_BAD_XDR)
 #define	nfserr_openmode		cpu_to_be32(NFSERR_OPENMODE)
+#define	nfserr_badowner		cpu_to_be32(NFSERR_BADOWNER)
 #define	nfserr_locks_held	cpu_to_be32(NFSERR_LOCKS_HELD)
 #define	nfserr_op_illegal	cpu_to_be32(NFSERR_OP_ILLEGAL)
 #define	nfserr_grace		cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index fd608a27a8d5..8f05dcd0bf85 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -738,7 +738,7 @@ nfserrno (int errno)
 		{ nfserr_jukebox, -EAGAIN },
 		{ nfserr_jukebox, -EWOULDBLOCK },
 		{ nfserr_jukebox, -ENOMEM },
-		{ nfserr_badname, -ESRCH },
+		{ nfserr_badowner, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
 		{ nfserr_toosmall, -ETOOSMALL },
-- 
cgit v1.2.2


From 2ca72e17e5acb1052c35c9faba609c2289ce7a92 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 4 Jan 2011 17:37:15 -0500
Subject: nfsd4: move idmap and acl header files into fs/nfsd

These are internal nfsd interfaces.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/acl.h       | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/idmap.h     | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4acl.c   |  2 +-
 fs/nfsd/nfs4idmap.c |  2 +-
 fs/nfsd/nfs4xdr.c   |  5 +++--
 fs/nfsd/nfsctl.c    |  2 +-
 fs/nfsd/vfs.c       |  4 ++--
 7 files changed, 133 insertions(+), 7 deletions(-)
 create mode 100644 fs/nfsd/acl.h
 create mode 100644 fs/nfsd/idmap.h

(limited to 'fs')

diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..c9c05a78e9bb
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,61 @@
+/*
+ *  include/linux/nfs4_acl.c
+ *
+ *  Common NFSv4 ACL handling definitions.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFS4_ACL_H
+#define LINUX_NFS4_ACL_H
+
+#include <linux/posix_acl.h>
+
+/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
+ * fit in a page: */
+#define NFS4_ACL_MAX 170
+
+struct nfs4_acl *nfs4_acl_new(int);
+int nfs4_acl_get_whotype(char *, u32);
+int nfs4_acl_write_who(int who, char *p);
+int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
+		                        uid_t who, u32 mask);
+
+#define NFS4_ACL_TYPE_DEFAULT	0x01
+#define NFS4_ACL_DIR		0x02
+#define NFS4_ACL_OWNER		0x04
+
+struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
+				struct posix_acl *, unsigned int flags);
+int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
+				struct posix_acl **, unsigned int flags);
+
+#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..d4a2ac18bd4c
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,64 @@
+/*
+ *  include/linux/nfsd_idmap.h
+ *
+ *  Mapping of UID to name and vice versa.
+ *
+ *  Copyright (c) 2002, 2003 The Regents of the University of
+ *  Michigan.  All rights reserved.
+> *
+ *  Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+
+/* XXX from linux/nfs_idmap.h */
+#define IDMAP_NAMESZ 128
+
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(void);
+void nfsd_idmap_shutdown(void);
+#else
+static inline int nfsd_idmap_init(void)
+{
+	return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
+#endif
+
+int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+int nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
+int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+
+#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
 
 #include <linux/slab.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs4_acl.h>
+#include "acl.h"
 
 
 /* mode bit translations: */
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 844960fd0395..cbd599732765 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,10 @@
  */
 
 #include <linux/module.h>
-#include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include "idmap.h"
 
 /*
  * Cache entry
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 364aae7d5998..2a0814d0ab1a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
-#include <linux/nfsd_idmap.h>
-#include <linux/nfs4_acl.h>
 #include <linux/sunrpc/svcauth_gss.h>
 
+#include "idmap.h"
+#include "acl.h"
 #include "xdr4.h"
 #include "vfs.h"
 
+
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
 /*
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6840ec3ceecf..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
 #include <linux/namei.h>
 #include <linux/ctype.h>
 
-#include <linux/nfsd_idmap.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
 
+#include "idmap.h"
 #include "nfsd.h"
 #include "cache.h"
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6a3af2ff3afe..b991125ce4a5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -35,8 +35,8 @@
 #endif /* CONFIG_NFSD_V3 */
 
 #ifdef CONFIG_NFSD_V4
-#include <linux/nfs4_acl.h>
-#include <linux/nfsd_idmap.h>
+#include "acl.h"
+#include "idmap.h"
 #endif /* CONFIG_NFSD_V4 */
 
 #include "nfsd.h"
-- 
cgit v1.2.2


From 775a1905e1e042e830eae31e70efec9387eb3e1d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 4 Jan 2011 17:38:41 -0500
Subject: nfsd4: remove outdated pathname-comments

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/acl.h   | 2 --
 fs/nfsd/idmap.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index c9c05a78e9bb..34e5c40af5ef 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -1,6 +1,4 @@
 /*
- *  include/linux/nfs4_acl.c
- *
  *  Common NFSv4 ACL handling definitions.
  *
  *  Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index d4a2ac18bd4c..514758994763 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -1,6 +1,4 @@
 /*
- *  include/linux/nfsd_idmap.h
- *
  *  Mapping of UID to name and vice versa.
  *
  *  Copyright (c) 2002, 2003 The Regents of the University of
-- 
cgit v1.2.2


From 3c726023402a2f3b28f49b9d90ebf9e71151157d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 4 Jan 2011 17:53:52 -0500
Subject: nfsd4: return nfs errno from name_to_id functions

This avoids the need for the confusing ESRCH mapping.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/idmap.h     |  4 ++--
 fs/nfsd/nfs4idmap.c | 13 +++++++------
 fs/nfsd/nfs4xdr.c   | 10 +++++-----
 fs/nfsd/nfsproc.c   |  1 -
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index 514758994763..2f3be1321534 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -54,8 +54,8 @@ static inline void nfsd_idmap_shutdown(void)
 }
 #endif
 
-int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
-int nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
 int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
 int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index cbd599732765..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -37,6 +37,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include "idmap.h"
+#include "nfsd.h"
 
 /*
  * Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
 	return clp->name;
 }
 
-static int
+static __be32
 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
 		uid_t *id)
 {
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 	int ret;
 
 	if (namelen + 1 > sizeof(key.name))
-		return -ESRCH; /* nfserr_badowner */
+		return nfserr_badowner;
 	memcpy(key.name, name, namelen);
 	key.name[namelen] = '\0';
 	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
 	if (ret == -ENOENT)
-		ret = -ESRCH; /* nfserr_badowner */
+		return nfserr_badowner;
 	if (ret)
-		return ret;
+		return nfserrno(ret);
 	*id = item->id;
 	cache_put(&item->h, &nametoid_cache);
 	return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 	return ret;
 }
 
-int
+__be32
 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 		__u32 *id)
 {
 	return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
 }
 
-int
+__be32
 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 		__u32 *id)
 {
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2a0814d0ab1a..ca3786905dec 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -289,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			len += XDR_QUADLEN(dummy32) << 2;
 			READMEM(buf, dummy32);
 			ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
-			host_err = 0;
+			status = nfs_ok;
 			if (ace->whotype != NFS4_ACL_WHO_NAMED)
 				ace->who = 0;
 			else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
-				host_err = nfsd_map_name_to_gid(argp->rqstp,
+				status = nfsd_map_name_to_gid(argp->rqstp,
 						buf, dummy32, &ace->who);
 			else
-				host_err = nfsd_map_name_to_uid(argp->rqstp,
+				status = nfsd_map_name_to_uid(argp->rqstp,
 						buf, dummy32, &ace->who);
-			if (host_err)
-				goto out_nfserr;
+			if (status)
+				return status;
 		}
 	} else
 		*acl = NULL;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 8f05dcd0bf85..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -738,7 +738,6 @@ nfserrno (int errno)
 		{ nfserr_jukebox, -EAGAIN },
 		{ nfserr_jukebox, -EWOULDBLOCK },
 		{ nfserr_jukebox, -ENOMEM },
-		{ nfserr_badowner, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
 		{ nfserr_toosmall, -ETOOSMALL },
-- 
cgit v1.2.2


From 6f3d772fb8a039de8f21d725f5e38c252b4c0efd Mon Sep 17 00:00:00 2001
From: Takuma Umeya <tumeya@redhat.com>
Date: Wed, 15 Dec 2010 14:09:01 +0900
Subject: nfs4: set source address when callback is generated

when callback is generated in NFSv4 server, it doesn't set the source
address. When an alias IP is utilized on NFSv4 server and suppose the
client is accessing via that alias IP (e.g. eth0:0), the client invokes
the callback to the IP address that is set on the original device (e.g.
eth0). This behavior results in timeout of xprt.
The patch sets the IP address that the client should invoke callback to.

Signed-off-by: Takuma Umeya <tumeya@redhat.com>
[bfields@redhat.com: Simplify gen_callback arguments, use helper function]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c |  1 +
 fs/nfsd/nfs4state.c    | 22 +++++++++++++++++++---
 fs/nfsd/state.h        |  1 +
 3 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a08580553fda..dd183af24fe6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -484,6 +484,7 @@ static int setup_callback_client(struct nfs4_client *clp,
 		.net		= &init_net,
 		.address	= (struct sockaddr *) &conn->cb_addr,
 		.addrsize	= conn->cb_addrlen,
+		.saddress	= (struct sockaddr *) &conn->cb_saddr,
 		.timeout	= &timeparms,
 		.program	= &cb_program,
 		.version	= 0,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 87d4c48b6069..b583e4e800ab 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1163,10 +1163,26 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
 	return NULL;
 }
 
+static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
+{
+	switch (family) {
+	case AF_INET:
+		((struct sockaddr_in *)sa)->sin_family = AF_INET;
+		((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
+		return;
+	case AF_INET6:
+		((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
+		((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
+		return;
+	}
+}
+
 static void
-gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
 {
 	struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
+	struct sockaddr	*sa = svc_addr(rqstp);
+	u32 scopeid = rpc_get_scope_id(sa);
 	unsigned short expected_family;
 
 	/* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1192,6 +1208,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 
 	conn->cb_prog = se->se_callback_prog;
 	conn->cb_ident = se->se_callback_ident;
+	rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
 	return;
 out_err:
 	conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1768,7 +1785,6 @@ __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
-	struct sockaddr		*sa = svc_addr(rqstp);
 	struct xdr_netobj 	clname = { 
 		.len = setclid->se_namelen,
 		.data = setclid->se_name,
@@ -1871,7 +1887,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * for consistent minorversion use throughout:
 	 */
 	new->cl_minorversion = 0;
-	gen_callback(new, setclid, rpc_get_scope_id(sa));
+	gen_callback(new, setclid, rqstp);
 	add_to_unconfirmed(new, strhashval);
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
 	setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 84b230217b1b..cf6dc83fd545 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -96,6 +96,7 @@ struct nfs4_delegation {
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
 	struct sockaddr_storage	cb_addr;
+	struct sockaddr_storage	cb_saddr;
 	size_t			cb_addrlen;
 	u32                     cb_prog; /* used only in 4.0 case;
 					    per-session otherwise */
-- 
cgit v1.2.2


From a363f0c2030cb9781e7e458f4a9e354b6c43d7ce Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:13:53 +1100
Subject: xfs: ensure sync write errors are returned

xfs_file_aio_write() only returns the error from synchronous
flushing of the data and inode if error == 0. At the point where
error is being checked, it is guaranteed to be > 0. Therefore any
errors returned by the data or fsync flush will never be returned.
Fix the checks so we overwrite the current error once and only if an
error really occurred.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 49 +++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..10b7fb4807a6 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -574,7 +574,7 @@ xfs_file_aio_write(
 	struct inode		*inode = mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0, error = 0;
+	ssize_t			ret = 0;
 	int			ioflags = 0;
 	xfs_fsize_t		isize, new_size;
 	int			iolock;
@@ -590,9 +590,9 @@ xfs_file_aio_write(
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-	if (error)
-		return error;
+	ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+	if (ret)
+		return ret;
 
 	count = ocount;
 	if (count == 0)
@@ -616,9 +616,9 @@ relock:
 	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
 
 start:
-	error = -generic_write_checks(file, &pos, &count,
+	ret = generic_write_checks(file, &pos, &count,
 					S_ISBLK(inode->i_mode));
-	if (error) {
+	if (ret) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
 		goto out_unlock_mutex;
 	}
@@ -660,8 +660,8 @@ start:
 	 */
 
 	if (pos > ip->i_size) {
-		error = xfs_zero_eof(ip, pos, ip->i_size);
-		if (error) {
+		ret = -xfs_zero_eof(ip, pos, ip->i_size);
+		if (ret) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			goto out_unlock_internal;
 		}
@@ -674,8 +674,8 @@ start:
 	 * by root.  This keeps people from modifying setuid and
 	 * setgid binaries.
 	 */
-	error = -file_remove_suid(file);
-	if (unlikely(error))
+	ret = file_remove_suid(file);
+	if (unlikely(ret))
 		goto out_unlock_internal;
 
 	/* We can write back this queue in page reclaim */
@@ -684,10 +684,10 @@ start:
 	if ((ioflags & IO_ISDIRECT)) {
 		if (mapping->nrpages) {
 			WARN_ON(need_i_mutex == 0);
-			error = xfs_flushinval_pages(ip,
+			ret = -xfs_flushinval_pages(ip,
 					(pos & PAGE_CACHE_MASK),
 					-1, FI_REMAPF_LOCKED);
-			if (error)
+			if (ret)
 				goto out_unlock_internal;
 		}
 
@@ -720,24 +720,22 @@ start:
 		}
 	} else {
 		int enospc = 0;
-		ssize_t ret2 = 0;
 
 write_retry:
 		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
-		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
+		ret = generic_file_buffered_write(iocb, iovp, nr_segs,
 				pos, &iocb->ki_pos, count, ret);
 		/*
 		 * if we just got an ENOSPC, flush the inode now we
 		 * aren't holding any page locks and retry *once*
 		 */
-		if (ret2 == -ENOSPC && !enospc) {
-			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-			if (error)
+		if (ret == -ENOSPC && !enospc) {
+			ret = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+			if (ret)
 				goto out_unlock_internal;
 			enospc = 1;
 			goto write_retry;
 		}
-		ret = ret2;
 	}
 
 	current->backing_dev_info = NULL;
@@ -753,7 +751,6 @@ write_retry:
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
-	error = -ret;
 	if (ret <= 0)
 		goto out_unlock_internal;
 
@@ -762,23 +759,23 @@ write_retry:
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 		loff_t end = pos + ret - 1;
-		int error2;
+		int error, error2;
 
 		xfs_iunlock(ip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
 
-		error2 = filemap_write_and_wait_range(mapping, pos, end);
-		if (!error)
-			error = error2;
+		error = filemap_write_and_wait_range(mapping, pos, end);
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_ilock(ip, iolock);
 
 		error2 = -xfs_file_fsync(file,
 					 (file->f_flags & __O_SYNC) ? 0 : 1);
-		if (!error)
-			error = error2;
+		if (error)
+			ret = error;
+		else if (error2)
+			ret = error2;
 	}
 
  out_unlock_internal:
@@ -800,7 +797,7 @@ write_retry:
  out_unlock_mutex:
 	if (need_i_mutex)
 		mutex_unlock(&inode->i_mutex);
-	return -error;
+	return ret;
 }
 
 STATIC int
-- 
cgit v1.2.2


From edafb6da9aa725e4de5fe758fe81644b6167f9a2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:14:06 +1100
Subject: xfs: factor common post-write isize handling code

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 54 +++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 10b7fb4807a6..b3915bf25770 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -321,6 +321,30 @@ xfs_file_splice_read(
 	return ret;
 }
 
+STATIC void
+xfs_aio_write_isize_update(
+	struct inode	*inode,
+	loff_t		*ppos,
+	ssize_t		bytes_written)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	xfs_fsize_t		isize = i_size_read(inode);
+
+	if (bytes_written > 0)
+		XFS_STATS_ADD(xs_write_bytes, bytes_written);
+
+	if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+					*ppos > isize))
+		*ppos = isize;
+
+	if (*ppos > ip->i_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (*ppos > ip->i_size)
+			ip->i_size = *ppos;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+}
+
 STATIC ssize_t
 xfs_file_splice_write(
 	struct pipe_inode_info	*pipe,
@@ -331,7 +355,7 @@ xfs_file_splice_write(
 {
 	struct inode		*inode = outfilp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fsize_t		isize, new_size;
+	xfs_fsize_t		new_size;
 	int			ioflags = 0;
 	ssize_t			ret;
 
@@ -355,19 +379,8 @@ xfs_file_splice_write(
 	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_write_bytes, ret);
-
-	isize = i_size_read(inode);
-	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
-		*ppos = isize;
 
-	if (*ppos > ip->i_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (*ppos > ip->i_size)
-			ip->i_size = *ppos;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
+	xfs_aio_write_isize_update(inode, ppos, ret);
 
 	if (ip->i_new_size) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -576,7 +589,7 @@ xfs_file_aio_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
 	int			ioflags = 0;
-	xfs_fsize_t		isize, new_size;
+	xfs_fsize_t		new_size;
 	int			iolock;
 	size_t			ocount = 0, count;
 	int			need_i_mutex;
@@ -740,22 +753,11 @@ write_retry:
 
 	current->backing_dev_info = NULL;
 
-	isize = i_size_read(inode);
-	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
-		iocb->ki_pos = isize;
-
-	if (iocb->ki_pos > ip->i_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (iocb->ki_pos > ip->i_size)
-			ip->i_size = iocb->ki_pos;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
+	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
 	if (ret <= 0)
 		goto out_unlock_internal;
 
-	XFS_STATS_ADD(xs_write_bytes, ret);
-
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 		loff_t end = pos + ret - 1;
-- 
cgit v1.2.2


From 4c5cfd1b4157fb75d43b44a147c2feba6422fc4f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:14:16 +1100
Subject: xfs: factor post-write newsize updates

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index b3915bf25770..c47d7dc0a307 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -345,6 +345,25 @@ xfs_aio_write_isize_update(
 	}
 }
 
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occured.  In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+	struct xfs_inode	*ip)
+{
+	if (ip->i_new_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		ip->i_new_size = 0;
+		if (ip->i_d.di_size > ip->i_size)
+			ip->i_d.di_size = ip->i_size;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+}
+
 STATIC ssize_t
 xfs_file_splice_write(
 	struct pipe_inode_info	*pipe,
@@ -381,14 +400,7 @@ xfs_file_splice_write(
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 
 	xfs_aio_write_isize_update(inode, ppos, ret);
-
-	if (ip->i_new_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		ip->i_new_size = 0;
-		if (ip->i_d.di_size > ip->i_size)
-			ip->i_d.di_size = ip->i_size;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
+	xfs_aio_write_newsize_update(ip);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
 }
@@ -781,20 +793,7 @@ write_retry:
 	}
 
  out_unlock_internal:
-	if (ip->i_new_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		ip->i_new_size = 0;
-		/*
-		 * If this was a direct or synchronous I/O that failed (such
-		 * as ENOSPC) then part of the I/O may have been written to
-		 * disk before the error occured.  In this case the on-disk
-		 * file size may have been adjusted beyond the in-memory file
-		 * size and now needs to be truncated back.
-		 */
-		if (ip->i_d.di_size > ip->i_size)
-			ip->i_d.di_size = ip->i_size;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
+	xfs_aio_write_newsize_update(ip);
 	xfs_iunlock(ip, iolock);
  out_unlock_mutex:
 	if (need_i_mutex)
-- 
cgit v1.2.2


From 487f84f3f80bc6f00c59725e822653d3ec174b85 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 12 Jan 2011 11:37:10 +1100
Subject: xfs: introduce xfs_rw_lock() helpers for locking the inode

We need to obtain the i_mutex, i_iolock and i_ilock during the read
and write paths. Add a set of wrapper functions to neatly
encapsulate the lock ordering and shared/exclusive semantics to make
the locking easier to follow and get right.

Note that this changes some of the exclusive locking serialisation in
that serialisation will occur against the i_mutex instead of the
XFS_IOLOCK_EXCL. This does not change any behaviour, and it is
arguably more efficient to use the mutex for such serialisation than
the rw_sem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 131 +++++++++++++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index c47d7dc0a307..b5e13fbb7386 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -40,6 +40,40 @@
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
+/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	if (type & XFS_IOLOCK_EXCL)
+		mutex_lock(&VFS_I(ip)->i_mutex);
+	xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	xfs_iunlock(ip, type);
+	if (type & XFS_IOLOCK_EXCL)
+		mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+	struct xfs_inode	*ip,
+	int			type)
+{
+	xfs_ilock_demote(ip, type);
+	if (type & XFS_IOLOCK_EXCL)
+		mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
 /*
  *	xfs_iozero
  *
@@ -262,22 +296,21 @@ xfs_file_aio_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_lock(&inode->i_mutex);
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
 	if (unlikely(ioflags & IO_ISDIRECT)) {
+		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
 		if (inode->i_mapping->nrpages) {
 			ret = -xfs_flushinval_pages(ip,
 					(iocb->ki_pos & PAGE_CACHE_MASK),
 					-1, FI_REMAPF_LOCKED);
+			if (ret) {
+				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+				return ret;
+			}
 		}
-		mutex_unlock(&inode->i_mutex);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return ret;
-		}
-	}
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+	} else
+		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 
 	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
@@ -285,7 +318,7 @@ xfs_file_aio_read(
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -309,7 +342,7 @@ xfs_file_splice_read(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
@@ -317,7 +350,7 @@ xfs_file_splice_read(
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -338,10 +371,10 @@ xfs_aio_write_isize_update(
 		*ppos = isize;
 
 	if (*ppos > ip->i_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
 		if (*ppos > ip->i_size)
 			ip->i_size = *ppos;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 }
 
@@ -356,14 +389,22 @@ xfs_aio_write_newsize_update(
 	struct xfs_inode	*ip)
 {
 	if (ip->i_new_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
 		ip->i_new_size = 0;
 		if (ip->i_d.di_size > ip->i_size)
 			ip->i_d.di_size = ip->i_size;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 }
 
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * couuld cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
 STATIC ssize_t
 xfs_file_splice_write(
 	struct pipe_inode_info	*pipe,
@@ -604,7 +645,6 @@ xfs_file_aio_write(
 	xfs_fsize_t		new_size;
 	int			iolock;
 	size_t			ocount = 0, count;
-	int			need_i_mutex;
 
 	XFS_STATS_INC(xs_write_calls);
 
@@ -631,21 +671,17 @@ xfs_file_aio_write(
 relock:
 	if (ioflags & IO_ISDIRECT) {
 		iolock = XFS_IOLOCK_SHARED;
-		need_i_mutex = 0;
 	} else {
 		iolock = XFS_IOLOCK_EXCL;
-		need_i_mutex = 1;
-		mutex_lock(&inode->i_mutex);
 	}
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
-
 start:
+	xfs_rw_ilock(ip, XFS_ILOCK_EXCL|iolock);
 	ret = generic_write_checks(file, &pos, &count,
 					S_ISBLK(inode->i_mode));
 	if (ret) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-		goto out_unlock_mutex;
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+		return ret;
 	}
 
 	if (ioflags & IO_ISDIRECT) {
@@ -654,16 +690,20 @@ start:
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 
 		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock);
 			return XFS_ERROR(-EINVAL);
 		}
 
-		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+		/*
+		 * For direct I/O, if there are cached pages or we're extending
+		 * the file, we need IOLOCK_EXCL until we're sure the bytes at
+		 * the new EOF have been zeroed and/or the cached pages are
+		 * flushed out.  Upgrade the I/O lock and start again.
+		 */
+		if (iolock != XFS_IOLOCK_EXCL &&
+		    (mapping->nrpages || pos > ip->i_size)) {
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
-			need_i_mutex = 1;
-			mutex_lock(&inode->i_mutex);
-			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
 			goto start;
 		}
 	}
@@ -687,11 +727,11 @@ start:
 	if (pos > ip->i_size) {
 		ret = -xfs_zero_eof(ip, pos, ip->i_size);
 		if (ret) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 			goto out_unlock_internal;
 		}
 	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're writing the file then make sure to clear the
@@ -708,7 +748,7 @@ start:
 
 	if ((ioflags & IO_ISDIRECT)) {
 		if (mapping->nrpages) {
-			WARN_ON(need_i_mutex == 0);
+			WARN_ON(iolock != XFS_IOLOCK_EXCL);
 			ret = -xfs_flushinval_pages(ip,
 					(pos & PAGE_CACHE_MASK),
 					-1, FI_REMAPF_LOCKED);
@@ -716,13 +756,10 @@ start:
 				goto out_unlock_internal;
 		}
 
-		if (need_i_mutex) {
+		if (iolock == XFS_IOLOCK_EXCL) {
 			/* demote the lock now the cached pages are gone */
-			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-			mutex_unlock(&inode->i_mutex);
-
+			xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 			iolock = XFS_IOLOCK_SHARED;
-			need_i_mutex = 0;
 		}
 
 		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
@@ -740,7 +777,7 @@ start:
 			count -= ret;
 
 			ioflags &= ~IO_ISDIRECT;
-			xfs_iunlock(ip, iolock);
+			xfs_rw_iunlock(ip, iolock);
 			goto relock;
 		}
 	} else {
@@ -775,14 +812,9 @@ write_retry:
 		loff_t end = pos + ret - 1;
 		int error, error2;
 
-		xfs_iunlock(ip, iolock);
-		if (need_i_mutex)
-			mutex_unlock(&inode->i_mutex);
-
+		xfs_rw_iunlock(ip, iolock);
 		error = filemap_write_and_wait_range(mapping, pos, end);
-		if (need_i_mutex)
-			mutex_lock(&inode->i_mutex);
-		xfs_ilock(ip, iolock);
+		xfs_rw_ilock(ip, iolock);
 
 		error2 = -xfs_file_fsync(file,
 					 (file->f_flags & __O_SYNC) ? 0 : 1);
@@ -794,10 +826,7 @@ write_retry:
 
  out_unlock_internal:
 	xfs_aio_write_newsize_update(ip);
-	xfs_iunlock(ip, iolock);
- out_unlock_mutex:
-	if (need_i_mutex)
-		mutex_unlock(&inode->i_mutex);
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
-- 
cgit v1.2.2


From f0d26e860b6c496464c5c8165d7df08dabde01fa Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:15:36 +1100
Subject: xfs: split direct IO write path from xfs_file_aio_write

The current xfs_file_aio_write code is a mess of locking shenanigans
to handle the different locking requirements of buffered and direct
IO. Start to clean this up by disentangling the direct IO path from
the mess.

This also removes the failed direct IO fallback path to buffered IO.
XFS handles all direct IO cases without needing to fall back to
buffered IO, so we can safely remove this unused path. This greatly
simplifies the logic and locking needed in the write path.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 179 ++++++++++++++++++++++++++++----------------
 1 file changed, 116 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index b5e13fbb7386..00661fd21fc0 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -628,6 +628,116 @@ out_lock:
 	return error;
 }
 
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By spearating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+	struct kiocb		*iocb,
+	const struct iovec	*iovp,
+	unsigned long		nr_segs,
+	loff_t			pos,
+	size_t			ocount,
+	int			*iolock)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	xfs_fsize_t		new_size;
+	size_t			count = ocount;
+	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp;
+
+	*iolock = 0;
+	if ((pos & target->bt_smask) || (count & target->bt_smask))
+		return -XFS_ERROR(EINVAL);
+
+	/*
+	 * For direct I/O, if there are cached pages or we're extending
+	 * the file, we need IOLOCK_EXCL until we're sure the bytes at
+	 * the new EOF have been zeroed and/or the cached pages are
+	 * flushed out.
+	 */
+	if (mapping->nrpages || pos > ip->i_size)
+		*iolock = XFS_IOLOCK_EXCL;
+	else
+		*iolock = XFS_IOLOCK_SHARED;
+	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+
+	ret = generic_write_checks(file, &pos, &count,
+					S_ISBLK(inode->i_mode));
+	if (ret) {
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+		*iolock = 0;
+		return ret;
+	}
+
+	new_size = pos + count;
+	if (new_size > ip->i_size)
+		ip->i_new_size = new_size;
+
+	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+		file_update_time(file);
+
+	/*
+	 * If the offset is beyond the size of the file, we have a couple of
+	 * things to do. First, if there is already space allocated we need to
+	 * either create holes or zero the disk or ...
+	 *
+	 * If there is a page where the previous size lands, we need to zero it
+	 * out up to the new size.
+	 */
+	if (pos > ip->i_size) {
+		ret = -xfs_zero_eof(ip, pos, ip->i_size);
+		if (ret) {
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+			return ret;
+		}
+	}
+	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we're writing the file then make sure to clear the setuid and
+	 * setgid bits if the process is not being run by root.  This keeps
+	 * people from modifying setuid and setgid binaries.
+	 */
+	ret = file_remove_suid(file);
+	if (unlikely(ret))
+		return ret;
+
+	if (mapping->nrpages) {
+		WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+							FI_REMAPF_LOCKED);
+		if (ret)
+			return ret;
+	}
+
+	if (*iolock == XFS_IOLOCK_EXCL) {
+		/* demote the lock now the cached pages are gone */
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		*iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+	ret = generic_file_direct_write(iocb, iovp,
+			&nr_segs, pos, &iocb->ki_pos, count, ocount);
+
+	/* No fallback to buffered IO on errors for XFS. */
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
 STATIC ssize_t
 xfs_file_aio_write(
 	struct kiocb		*iocb,
@@ -670,12 +780,12 @@ xfs_file_aio_write(
 
 relock:
 	if (ioflags & IO_ISDIRECT) {
-		iolock = XFS_IOLOCK_SHARED;
-	} else {
-		iolock = XFS_IOLOCK_EXCL;
+		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+						ocount, &iolock);
+		goto done_io;
 	}
+	iolock = XFS_IOLOCK_EXCL;
 
-start:
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL|iolock);
 	ret = generic_write_checks(file, &pos, &count,
 					S_ISBLK(inode->i_mode));
@@ -684,30 +794,6 @@ start:
 		return ret;
 	}
 
-	if (ioflags & IO_ISDIRECT) {
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
-			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-			return XFS_ERROR(-EINVAL);
-		}
-
-		/*
-		 * For direct I/O, if there are cached pages or we're extending
-		 * the file, we need IOLOCK_EXCL until we're sure the bytes at
-		 * the new EOF have been zeroed and/or the cached pages are
-		 * flushed out.  Upgrade the I/O lock and start again.
-		 */
-		if (iolock != XFS_IOLOCK_EXCL &&
-		    (mapping->nrpages || pos > ip->i_size)) {
-			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-			iolock = XFS_IOLOCK_EXCL;
-			goto start;
-		}
-	}
-
 	new_size = pos + count;
 	if (new_size > ip->i_size)
 		ip->i_new_size = new_size;
@@ -746,41 +832,7 @@ start:
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
 
-	if ((ioflags & IO_ISDIRECT)) {
-		if (mapping->nrpages) {
-			WARN_ON(iolock != XFS_IOLOCK_EXCL);
-			ret = -xfs_flushinval_pages(ip,
-					(pos & PAGE_CACHE_MASK),
-					-1, FI_REMAPF_LOCKED);
-			if (ret)
-				goto out_unlock_internal;
-		}
-
-		if (iolock == XFS_IOLOCK_EXCL) {
-			/* demote the lock now the cached pages are gone */
-			xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-			iolock = XFS_IOLOCK_SHARED;
-		}
-
-		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
-		ret = generic_file_direct_write(iocb, iovp,
-				&nr_segs, pos, &iocb->ki_pos, count, ocount);
-
-		/*
-		 * direct-io write to a hole: fall through to buffered I/O
-		 * for completing the rest of the request.
-		 */
-		if (ret >= 0 && ret != count) {
-			XFS_STATS_ADD(xs_write_bytes, ret);
-
-			pos += ret;
-			count -= ret;
-
-			ioflags &= ~IO_ISDIRECT;
-			xfs_rw_iunlock(ip, iolock);
-			goto relock;
-		}
-	} else {
+	if (!(ioflags & IO_ISDIRECT)) {
 		int enospc = 0;
 
 write_retry:
@@ -802,6 +854,7 @@ write_retry:
 
 	current->backing_dev_info = NULL;
 
+done_io:
 	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
 	if (ret <= 0)
-- 
cgit v1.2.2


From 637bbc75d9fda57c7bc77ce5ee37e29a77a0520d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:17:30 +1100
Subject: xfs: split buffered IO write path from xfs_file_aio_write

Complete the split of the different write IO paths by splitting the
buffered IO write path out of xfs_file_aio_write(). This makes the
different mechanisms of the write patchs easier to follow.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 146 +++++++++++++++++++++-----------------------
 1 file changed, 69 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 00661fd21fc0..e2bcf51d292e 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -739,58 +739,31 @@ xfs_file_dio_aio_write(
 }
 
 STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_buffered_aio_write(
 	struct kiocb		*iocb,
 	const struct iovec	*iovp,
 	unsigned long		nr_segs,
-	loff_t			pos)
+	loff_t			pos,
+	size_t			ocount,
+	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
 	struct address_space	*mapping = file->f_mapping;
 	struct inode		*inode = mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			ioflags = 0;
+	ssize_t			ret;
+	int			enospc = 0;
 	xfs_fsize_t		new_size;
-	int			iolock;
-	size_t			ocount = 0, count;
-
-	XFS_STATS_INC(xs_write_calls);
-
-	BUG_ON(iocb->ki_pos != pos);
-
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-	if (file->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
-
-	ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-	if (ret)
-		return ret;
-
-	count = ocount;
-	if (count == 0)
-		return 0;
-
-	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
+	size_t			count = ocount;
 
-relock:
-	if (ioflags & IO_ISDIRECT) {
-		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
-		goto done_io;
-	}
-	iolock = XFS_IOLOCK_EXCL;
+	*iolock = XFS_IOLOCK_EXCL;
+	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	xfs_rw_ilock(ip, XFS_ILOCK_EXCL|iolock);
 	ret = generic_write_checks(file, &pos, &count,
 					S_ISBLK(inode->i_mode));
 	if (ret) {
-		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+		*iolock = 0;
 		return ret;
 	}
 
@@ -798,67 +771,86 @@ relock:
 	if (new_size > ip->i_size)
 		ip->i_new_size = new_size;
 
-	if (likely(!(ioflags & IO_INVIS)))
+	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
 		file_update_time(file);
 
-	/*
-	 * If the offset is beyond the size of the file, we have a couple
-	 * of things to do. First, if there is already space allocated
-	 * we need to either create holes or zero the disk or ...
-	 *
-	 * If there is a page where the previous size lands, we need
-	 * to zero it out up to the new size.
-	 */
-
 	if (pos > ip->i_size) {
 		ret = -xfs_zero_eof(ip, pos, ip->i_size);
 		if (ret) {
 			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-			goto out_unlock_internal;
+			return ret;
 		}
 	}
 	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 
-	/*
-	 * If we're writing the file then make sure to clear the
-	 * setuid and setgid bits if the process is not being run
-	 * by root.  This keeps people from modifying setuid and
-	 * setgid binaries.
-	 */
 	ret = file_remove_suid(file);
 	if (unlikely(ret))
-		goto out_unlock_internal;
+		return ret;
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
 
-	if (!(ioflags & IO_ISDIRECT)) {
-		int enospc = 0;
-
 write_retry:
-		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
-		ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-				pos, &iocb->ki_pos, count, ret);
-		/*
-		 * if we just got an ENOSPC, flush the inode now we
-		 * aren't holding any page locks and retry *once*
-		 */
-		if (ret == -ENOSPC && !enospc) {
-			ret = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-			if (ret)
-				goto out_unlock_internal;
-			enospc = 1;
-			goto write_retry;
-		}
+	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+			pos, &iocb->ki_pos, count, ret);
+	/*
+	 * if we just got an ENOSPC, flush the inode now we aren't holding any
+	 * page locks and retry *once*
+	 */
+	if (ret == -ENOSPC && !enospc) {
+		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+		if (ret)
+			return ret;
+		enospc = 1;
+		goto write_retry;
 	}
-
 	current->backing_dev_info = NULL;
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_aio_write(
+	struct kiocb		*iocb,
+	const struct iovec	*iovp,
+	unsigned long		nr_segs,
+	loff_t			pos)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			ret;
+	int			iolock;
+	size_t			ocount = 0;
+
+	XFS_STATS_INC(xs_write_calls);
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+	if (ret)
+		return ret;
+
+	if (ocount == 0)
+		return 0;
+
+	xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	if (unlikely(file->f_flags & O_DIRECT))
+		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+						ocount, &iolock);
+	else
+		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+						ocount, &iolock);
 
-done_io:
 	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
 	if (ret <= 0)
-		goto out_unlock_internal;
+		goto out_unlock;
 
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
@@ -877,7 +869,7 @@ done_io:
 			ret = error2;
 	}
 
- out_unlock_internal:
+out_unlock:
 	xfs_aio_write_newsize_update(ip);
 	xfs_rw_iunlock(ip, iolock);
 	return ret;
-- 
cgit v1.2.2


From 4d8d15812fd9bc96d0da11467d23e0373feae933 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:23:42 +1100
Subject: xfs: factor common write setup code

The buffered IO and direct IO write paths share a common set of
checks and limiting code prior to issuing the write. Factor that
into a common helper function.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 123 ++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e2bcf51d292e..5863dd8f448c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -628,6 +628,58 @@ out_lock:
 	return error;
 }
 
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
+STATIC ssize_t
+xfs_file_aio_write_checks(
+	struct file		*file,
+	loff_t			*pos,
+	size_t			*count,
+	int			*iolock)
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	xfs_fsize_t		new_size;
+	int			error = 0;
+
+	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+	if (error) {
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+		*iolock = 0;
+		return error;
+	}
+
+	new_size = *pos + *count;
+	if (new_size > ip->i_size)
+		ip->i_new_size = new_size;
+
+	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+		file_update_time(file);
+
+	/*
+	 * If the offset is beyond the size of the file, we need to zero any
+	 * blocks that fall between the existing EOF and the start of this
+	 * write.
+	 */
+	if (*pos > ip->i_size)
+		error = -xfs_zero_eof(ip, *pos, ip->i_size);
+
+	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		return error;
+
+	/*
+	 * If we're writing the file then make sure to clear the setuid and
+	 * setgid bits if the process is not being run by root.  This keeps
+	 * people from modifying setuid and setgid binaries.
+	 */
+	return file_remove_suid(file);
+
+}
+
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
@@ -653,7 +705,6 @@ xfs_file_dio_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
-	xfs_fsize_t		new_size;
 	size_t			count = ocount;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -674,45 +725,8 @@ xfs_file_dio_aio_write(
 		*iolock = XFS_IOLOCK_SHARED;
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	ret = generic_write_checks(file, &pos, &count,
-					S_ISBLK(inode->i_mode));
-	if (ret) {
-		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
-		*iolock = 0;
-		return ret;
-	}
-
-	new_size = pos + count;
-	if (new_size > ip->i_size)
-		ip->i_new_size = new_size;
-
-	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-		file_update_time(file);
-
-	/*
-	 * If the offset is beyond the size of the file, we have a couple of
-	 * things to do. First, if there is already space allocated we need to
-	 * either create holes or zero the disk or ...
-	 *
-	 * If there is a page where the previous size lands, we need to zero it
-	 * out up to the new size.
-	 */
-	if (pos > ip->i_size) {
-		ret = -xfs_zero_eof(ip, pos, ip->i_size);
-		if (ret) {
-			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-			return ret;
-		}
-	}
-	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-
-	/*
-	 * If we're writing the file then make sure to clear the setuid and
-	 * setgid bits if the process is not being run by root.  This keeps
-	 * people from modifying setuid and setgid binaries.
-	 */
-	ret = file_remove_suid(file);
-	if (unlikely(ret))
+	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	if (ret)
 		return ret;
 
 	if (mapping->nrpages) {
@@ -753,38 +767,13 @@ xfs_file_buffered_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
 	int			enospc = 0;
-	xfs_fsize_t		new_size;
 	size_t			count = ocount;
 
 	*iolock = XFS_IOLOCK_EXCL;
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	ret = generic_write_checks(file, &pos, &count,
-					S_ISBLK(inode->i_mode));
-	if (ret) {
-		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
-		*iolock = 0;
-		return ret;
-	}
-
-	new_size = pos + count;
-	if (new_size > ip->i_size)
-		ip->i_new_size = new_size;
-
-	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-		file_update_time(file);
-
-	if (pos > ip->i_size) {
-		ret = -xfs_zero_eof(ip, pos, ip->i_size);
-		if (ret) {
-			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-			return ret;
-		}
-	}
-	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-
-	ret = file_remove_suid(file);
-	if (unlikely(ret))
+	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	if (ret)
 		return ret;
 
 	/* We can write back this queue in page reclaim */
-- 
cgit v1.2.2


From eda77982729b7170bdc9e8855f0682edf322d277 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:22:40 +1100
Subject: xfs: serialise unaligned direct IOs

When two concurrent unaligned, non-overlapping direct IOs are issued
to the same block, the direct Io layer will race to zero the block.
The result is that one of the concurrent IOs will overwrite data
written by the other IO with zeros. This is demonstrated by the
xfsqa test 240.

To avoid this problem, serialise all unaligned direct IOs to an
inode with a big hammer. We need a big hammer approach as we need to
serialise AIO as well, so we can't just block writes on locks.
Hence, the big hammer is calling xfs_ioend_wait() while holding out
other unaligned direct IOs from starting.

We don't bother trying to serialised aligned vs unaligned IOs as
they are overlapping IO and the result of concurrent overlapping IOs
is undefined - the result of either IO is a valid result so we let
them race. Hence we only penalise unaligned IO, which already has a
major overhead compared to aligned IO so this isn't a major problem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_file.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5863dd8f448c..ef51eb43e137 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -684,9 +684,24 @@ xfs_file_aio_write_checks(
  * xfs_file_dio_aio_write - handle direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By spearating it from the buffered write path we remove all the tricky to
+ * By separating it from the buffered write path we remove all the tricky to
  * follow locking changes and looping.
  *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
  */
@@ -706,6 +721,7 @@ xfs_file_dio_aio_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
 	size_t			count = ocount;
+	int			unaligned_io = 0;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -713,13 +729,10 @@ xfs_file_dio_aio_write(
 	if ((pos & target->bt_smask) || (count & target->bt_smask))
 		return -XFS_ERROR(EINVAL);
 
-	/*
-	 * For direct I/O, if there are cached pages or we're extending
-	 * the file, we need IOLOCK_EXCL until we're sure the bytes at
-	 * the new EOF have been zeroed and/or the cached pages are
-	 * flushed out.
-	 */
-	if (mapping->nrpages || pos > ip->i_size)
+	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+		unaligned_io = 1;
+
+	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
@@ -737,8 +750,13 @@ xfs_file_dio_aio_write(
 			return ret;
 	}
 
-	if (*iolock == XFS_IOLOCK_EXCL) {
-		/* demote the lock now the cached pages are gone */
+	/*
+	 * If we are doing unaligned IO, wait for all other IO to drain,
+	 * otherwise demote the lock if we had to flush cached pages
+	 */
+	if (unaligned_io)
+		xfs_ioend_wait(ip);
+	else if (*iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		*iolock = XFS_IOLOCK_SHARED;
 	}
-- 
cgit v1.2.2


From 4c6493785a1ea9c3b3522f199760a90a30e1626c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 15 Jun 2010 14:22:37 -0400
Subject: nfsd4: modify session list under cl_lock

We want to traverse this from the callback code.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b583e4e800ab..3cf9900d5f32 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -771,7 +771,9 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	spin_lock(&clp->cl_lock);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&clp->cl_lock);
 	spin_unlock(&client_lock);
 
 	status = nfsd4_new_conn(rqstp, new);
@@ -819,7 +821,9 @@ static void
 unhash_session(struct nfsd4_session *ses)
 {
 	list_del(&ses->se_hash);
+	spin_lock(&ses->se_client->cl_lock);
 	list_del(&ses->se_perclnt);
+	spin_unlock(&ses->se_client->cl_lock);
 }
 
 /* must be called under the client_lock */
@@ -925,8 +929,10 @@ unhash_client_locked(struct nfs4_client *clp)
 
 	mark_client_expired(clp);
 	list_del(&clp->cl_lru);
+	spin_lock(&clp->cl_lock);
 	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
 		list_del_init(&ses->se_hash);
+	spin_unlock(&clp->cl_lock);
 }
 
 static void
-- 
cgit v1.2.2


From 1d1bc8f2074f0b728dfca2a3c16f2f5a3f298ffc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 4 Oct 2010 23:12:59 -0400
Subject: nfsd4: support BIND_CONN_TO_SESSION

Basic xdr and processing for BIND_CONN_TO_SESSION.  This adds a
connection to the list of connections associated with a session.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c  |  9 ++++++--
 fs/nfsd/nfs4state.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++------
 fs/nfsd/nfs4xdr.c   | 34 ++++++++++++++++++++++++++++--
 fs/nfsd/state.h     |  5 +++++
 fs/nfsd/xdr4.h      |  2 ++
 5 files changed, 99 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index fd6694b49e1c..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1004,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
  * Also note, enforced elsewhere:
  *	- SEQUENCE other than as first op results in
  *	  NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
- *	- BIND_CONN_TO_SESSION must be the only op in its compound
- *	  (Will be enforced in nfsd4_bind_conn_to_session().)
+ *	- BIND_CONN_TO_SESSION must be the only op in its compound.
+ *	  (Enforced in nfsd4_bind_conn_to_session().)
  *	- DESTROY_SESSION must be the final operation in a compound, if
  *	  sessionid's in SEQUENCE and DESTROY_SESSION are the same.
  *	  (Enforced in nfsd4_destroy_session().)
@@ -1326,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
 		.op_name = "OP_EXCHANGE_ID",
 	},
+	[OP_BIND_CONN_TO_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_BIND_CONN_TO_SESSION",
+	},
 	[OP_CREATE_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_create_session,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3cf9900d5f32..956174f488a7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -679,15 +679,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
 	return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
 }
 
-static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
 {
 	struct nfsd4_conn *conn;
-	u32 flags = NFS4_CDFC4_FORE;
 	int ret;
 
-	if (ses->se_flags & SESSION4_BACK_CHAN)
-		flags |= NFS4_CDFC4_BACK;
-	conn = alloc_conn(rqstp, flags);
+	conn = alloc_conn(rqstp, dir);
 	if (!conn)
 		return nfserr_jukebox;
 	nfsd4_hash_conn(conn, ses);
@@ -698,6 +695,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 	return nfs_ok;
 }
 
+static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+{
+	u32 dir = NFS4_CDFC4_FORE;
+
+	if (ses->se_flags & SESSION4_BACK_CHAN)
+		dir |= NFS4_CDFC4_BACK;
+
+	return nfsd4_new_conn(rqstp, ses, dir);
+}
+
+/* must be called under client_lock */
 static void nfsd4_del_conns(struct nfsd4_session *s)
 {
 	struct nfs4_client *clp = s->se_client;
@@ -776,7 +784,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 	spin_unlock(&clp->cl_lock);
 	spin_unlock(&client_lock);
 
-	status = nfsd4_new_conn(rqstp, new);
+	status = nfsd4_new_conn_from_crses(rqstp, new);
 	/* whoops: benny points out, status is ignored! (err, or bogus) */
 	if (status) {
 		free_session(&new->se_ref);
@@ -1597,6 +1605,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
 	return argp->opcnt == resp->opcnt;
 }
 
+static __be32 nfsd4_map_bcts_dir(u32 *dir)
+{
+	switch (*dir) {
+	case NFS4_CDFC4_FORE:
+	case NFS4_CDFC4_BACK:
+		return nfs_ok;
+	case NFS4_CDFC4_FORE_OR_BOTH:
+	case NFS4_CDFC4_BACK_OR_BOTH:
+		*dir = NFS4_CDFC4_BOTH;
+		return nfs_ok;
+	};
+	return nfserr_inval;
+}
+
+__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_bind_conn_to_session *bcts)
+{
+	__be32 status;
+
+	if (!nfsd4_last_compound_op(rqstp))
+		return nfserr_not_only_op;
+	spin_lock(&client_lock);
+	cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+	/* Sorta weird: we only need the refcnt'ing because new_conn acquires
+	 * client_lock iself: */
+	if (cstate->session) {
+		nfsd4_get_session(cstate->session);
+		atomic_inc(&cstate->session->se_client->cl_refcount);
+	}
+	spin_unlock(&client_lock);
+	if (!cstate->session)
+		return nfserr_badsession;
+
+	status = nfsd4_map_bcts_dir(&bcts->dir);
+	nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+	return nfs_ok;
+}
+
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
 {
 	if (!session)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ca3786905dec..4ff2c9e0b276 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -421,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+{
+	DECODE_HEAD;
+	u32 dummy;
+
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
+	COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	READ32(bcts->dir);
+	/* XXX: Perhaps Tom Tucker could help us figure out how we
+	 * should be using ctsa_use_conn_in_rdma_mode: */
+	READ32(dummy);
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 {
@@ -1359,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 
 	/* new operations for NFSv4.1 */
 	[OP_BACKCHANNEL_CTL]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
 	[OP_EXCHANGE_ID]	= (nfsd4_dec)nfsd4_decode_exchange_id,
 	[OP_CREATE_SESSION]	= (nfsd4_dec)nfsd4_decode_create_session,
 	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -2383,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 	return nfserr;
 }
 
+static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
+{
+	__be32 *p;
+
+	if (!nfserr) {
+		RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
+		WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+		WRITE32(bcts->dir);
+		/* XXX: ? */
+		WRITE32(0);
+		ADJUST_ARGS();
+	}
+	return nfserr;
+}
+
 static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
@@ -3174,7 +3204,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 
 	/* NFSv4.1 operations */
 	[OP_BACKCHANNEL_CTL]	= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+	[OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
 	[OP_EXCHANGE_ID]	= (nfsd4_enc)nfsd4_encode_exchange_id,
 	[OP_CREATE_SESSION]	= (nfsd4_enc)nfsd4_encode_create_session,
 	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_destroy_session,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cf6dc83fd545..442f6d8e024c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -148,6 +148,11 @@ struct nfsd4_create_session {
 	u32				gid;
 };
 
+struct nfsd4_bind_conn_to_session {
+	struct nfs4_sessionid		sessionid;
+	u32				dir;
+};
+
 /* The single slot clientid cache structure */
 struct nfsd4_clid_slot {
 	u32				sl_seqid;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 799c30c3b495..3a7aa4d98c1f 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -427,6 +427,7 @@ struct nfsd4_op {
 
 		/* NFSv4.1 */
 		struct nfsd4_exchange_id	exchange_id;
+		struct nfsd4_bind_conn_to_session bind_conn_to_session;
 		struct nfsd4_create_session	create_session;
 		struct nfsd4_destroy_session	destroy_session;
 		struct nfsd4_sequence		sequence;
@@ -523,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 		struct nfsd4_sequence *seq);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
 extern __be32 nfsd4_create_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_create_session *);
-- 
cgit v1.2.2


From dcbeaa68dbbdacbbb330a86c7fc95a28473fc209 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 15 Jun 2010 17:25:45 -0400
Subject: nfsd4: allow backchannel recovery

Now that we have a list of connections to choose from, we can teach the
callback code to just pick a suitable connection and use that, instead
of insisting on forever using the connection that the first
create_session was sent with.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 36 +++++++++++++++++++++++++++++++++---
 fs/nfsd/nfs4state.c    | 16 ++++++++++------
 2 files changed, 43 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index dd183af24fe6..18b740bd29ac 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -473,8 +473,7 @@ static int max_cb_time(void)
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cl_cb_set an atomic? */
 
-static int setup_callback_client(struct nfs4_client *clp,
-		struct nfs4_cb_conn *conn)
+static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
 {
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
@@ -501,6 +500,10 @@ static int setup_callback_client(struct nfs4_client *clp,
 		args.protocol = XPRT_TRANSPORT_TCP;
 		clp->cl_cb_ident = conn->cb_ident;
 	} else {
+		if (!conn->cb_xprt)
+			return -EINVAL;
+		clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
+		clp->cl_cb_session = ses;
 		args.bc_xprt = conn->cb_xprt;
 		args.prognumber = clp->cl_cb_session->se_cb_prog;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -756,10 +759,27 @@ static void nfsd4_release_cb(struct nfsd4_callback *cb)
 		cb->cb_ops->rpc_release(cb);
 }
 
+/* requires cl_lock: */
+static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
+{
+	struct nfsd4_session *s;
+	struct nfsd4_conn *c;
+
+	list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
+		list_for_each_entry(c, &s->se_conns, cn_persession) {
+			if (c->cn_flags & NFS4_CDFC4_BACK)
+				return c;
+		}
+	}
+	return NULL;
+}
+
 static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 {
 	struct nfs4_cb_conn conn;
 	struct nfs4_client *clp = cb->cb_clp;
+	struct nfsd4_session *ses = NULL;
+	struct nfsd4_conn *c;
 	int err;
 
 	/*
@@ -770,6 +790,10 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 		rpc_shutdown_client(clp->cl_cb_client);
 		clp->cl_cb_client = NULL;
 	}
+	if (clp->cl_cb_conn.cb_xprt) {
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+		clp->cl_cb_conn.cb_xprt = NULL;
+	}
 	if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
 		return;
 	spin_lock(&clp->cl_lock);
@@ -780,9 +804,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	BUG_ON(!clp->cl_cb_flags);
 	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
 	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+	c = __nfsd4_find_backchannel(clp);
+	if (c) {
+		svc_xprt_get(c->cn_xprt);
+		conn.cb_xprt = c->cn_xprt;
+		ses = c->cn_session;
+	}
 	spin_unlock(&clp->cl_lock);
 
-	err = setup_callback_client(clp, &conn);
+	err = setup_callback_client(clp, &conn, ses);
 	if (err)
 		warn_no_callback_path(clp, err);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 956174f488a7..290370bc9ae7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
 		free_conn(c);
 	}
 	spin_unlock(&clp->cl_lock);
+	/* XXX: mark callback for update, probe callback */
 }
 
 static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -790,16 +791,19 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 		free_session(&new->se_ref);
 		return NULL;
 	}
-	if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+	if (cses->flags & SESSION4_BACK_CHAN) {
 		struct sockaddr *sa = svc_addr(rqstp);
-
-		clp->cl_cb_session = new;
-		clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-		svc_xprt_get(rqstp->rq_xprt);
+		/*
+		 * This is a little silly; with sessions there's no real
+		 * use for the callback address.  Use the peer address
+		 * as a reasonable default for now, but consider fixing
+		 * the rpc client not to require an address in the
+		 * future:
+		 */
 		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
 		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-		nfsd4_probe_callback(clp);
 	}
+	nfsd4_probe_callback(clp);
 	return new;
 }
 
-- 
cgit v1.2.2


From 77a3569d6c4e14e89fa628df383b6dccc0cce6be Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Apr 2010 18:51:44 -0400
Subject: nfsd4: keep finer-grained callback status

Distinguish between when the callback channel is known to be down, and
when it is not yet confirmed.  This will be useful in the 4.1 case.

Also, we don't seem to be using the fact that this field is atomic.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 26 ++++++++++++++------------
 fs/nfsd/nfs4state.c    |  8 ++++----
 fs/nfsd/state.h        |  5 ++++-
 3 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 18b740bd29ac..d32f49d6ca2c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -470,8 +470,6 @@ static int max_cb_time(void)
 	return max(nfsd4_lease/10, (time_t)1) * HZ;
 }
 
-/* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cl_cb_set an atomic? */
 
 static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
 {
@@ -526,14 +524,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 		(int)clp->cl_name.len, clp->cl_name.data, reason);
 }
 
+static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+{
+	clp->cl_cb_state = NFSD4_CB_DOWN;
+	warn_no_callback_path(clp, reason);
+}
+
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
 
 	if (task->tk_status)
-		warn_no_callback_path(clp, task->tk_status);
+		nfsd4_mark_cb_down(clp, task->tk_status);
 	else
-		atomic_set(&clp->cl_cb_set, 1);
+		clp->cl_cb_state = NFSD4_CB_UP;
 }
 
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -579,14 +583,15 @@ static void do_probe_callback(struct nfs4_client *clp)
  */
 void nfsd4_probe_callback(struct nfs4_client *clp)
 {
+	/* XXX: atomicity?  Also, should we be using cl_cb_flags? */
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
 	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
 	do_probe_callback(clp);
 }
 
 void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
-	BUG_ON(atomic_read(&clp->cl_cb_set));
-
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
 	spin_lock(&clp->cl_lock);
 	memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
 	spin_unlock(&clp->cl_lock);
@@ -693,8 +698,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		break;
 	default:
 		/* Network partition? */
-		atomic_set(&clp->cl_cb_set, 0);
-		warn_no_callback_path(clp, task->tk_status);
+		nfsd4_mark_cb_down(clp, task->tk_status);
 		if (current_rpc_client != task->tk_client) {
 			/* queue a callback on the new connection: */
 			atomic_inc(&dp->dl_count);
@@ -707,10 +711,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		task->tk_status = 0;
 		rpc_restart_call_prepare(task);
 		return;
-	} else {
-		atomic_set(&clp->cl_cb_set, 0);
-		warn_no_callback_path(clp, task->tk_status);
-	}
+	} else
+		nfsd4_mark_cb_down(clp, task->tk_status);
 }
 
 static void nfsd4_cb_recall_release(void *calldata)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 290370bc9ae7..919ad25660d6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1071,7 +1071,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_refcount, 0);
-	atomic_set(&clp->cl_cb_set, 0);
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
@@ -2003,7 +2003,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
-			atomic_set(&conf->cl_cb_set, 0);
 			nfsd4_change_callback(conf, &unconf->cl_cb_conn);
 			nfsd4_probe_callback(conf);
 			expire_client(unconf);
@@ -2633,7 +2632,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
+	/* XXX: or unknown and nfsv4.1: */
+	int cb_up = (sop->so_client->cl_cb_state == NFSD4_CB_UP);
 	struct file_lock *fl;
 	int status, flag = 0;
 
@@ -2823,7 +2823,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_cb_set))
+			&& clp->cl_cb_state != NFSD4_CB_UP)
 		goto out;
 	status = nfs_ok;
 out:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 442f6d8e024c..32ff615c36f4 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -242,7 +242,10 @@ struct nfs4_client {
 	unsigned long		cl_cb_flags;
 	struct rpc_clnt		*cl_cb_client;
 	u32			cl_cb_ident;
-	atomic_t		cl_cb_set;
+#define NFSD4_CB_UP		0
+#define NFSD4_CB_UNKNOWN	1
+#define NFSD4_CB_DOWN		2
+	int			cl_cb_state;
 	struct nfsd4_callback	cl_cb_null;
 	struct nfsd4_session	*cl_cb_session;
 
-- 
cgit v1.2.2


From 0d7bb71907546b2baf15d78edd3e508e12963dbf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 18 Nov 2010 08:30:33 -0500
Subject: nfsd4: set sequence flag when backchannel is down

Implement the SEQ4_STATUS_CB_PATH_DOWN flag.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 6 +++++-
 fs/nfsd/nfs4xdr.c   | 8 ++------
 fs/nfsd/xdr4.h      | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 919ad25660d6..15bd1cc77de7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1800,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 out:
 	/* Hold a session reference until done processing the compound. */
 	if (cstate->session) {
+		struct nfs4_client *clp = session->se_client;
+
 		nfsd4_get_session(cstate->session);
-		atomic_inc(&session->se_client->cl_refcount);
+		atomic_inc(&clp->cl_refcount);
+		if (clp->cl_cb_state == NFSD4_CB_DOWN)
+			seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
 	}
 	kfree(conn);
 	spin_unlock(&client_lock);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 4ff2c9e0b276..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3137,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
 	WRITE32(seq->seqid);
 	WRITE32(seq->slotid);
 	WRITE32(seq->maxslots);
-	/*
-	 * FIXME: for now:
-	 *   target_maxslots = maxslots
-	 *   status_flags = 0
-	 */
+	/* For now: target_maxslots = maxslots */
 	WRITE32(seq->maxslots);
-	WRITE32(0);
+	WRITE32(seq->status_flags);
 
 	ADJUST_ARGS();
 	resp->cstate.datap = p; /* DRC cache data pointer */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3a7aa4d98c1f..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -378,8 +378,8 @@ struct nfsd4_sequence {
 	u32			cachethis;		/* request */
 #if 0
 	u32			target_maxslots;	/* response */
-	u32			status_flags;		/* response */
 #endif /* not yet */
+	u32			status_flags;		/* response */
 };
 
 struct nfsd4_destroy_session {
-- 
cgit v1.2.2


From eea4980660bc204bb9d11bb3bf2b1bde5fd5175f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 18 Nov 2010 08:34:12 -0500
Subject: nfsd4: re-probe callback on connection loss

This makes sure we set the sequence flag when necessary.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 15bd1cc77de7..b24f19d4187a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -642,7 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
 		free_conn(c);
 	}
 	spin_unlock(&clp->cl_lock);
-	/* XXX: mark callback for update, probe callback */
+	nfsd4_probe_callback(clp);
 }
 
 static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
-- 
cgit v1.2.2


From 84f5f7ccc59e628fc8754c0a837fd7e9559711ac Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 9 Dec 2010 15:52:19 -0500
Subject: nfsd4: make sure sequence flags are set after destroy_session

If this loses any backchannel, make sure we have a chance to notice that
and set the sequence flags.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 6 ++++++
 fs/nfsd/nfs4state.c    | 3 +--
 fs/nfsd/state.h        | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index d32f49d6ca2c..cb002dce5630 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -589,6 +589,12 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
 	do_probe_callback(clp);
 }
 
+void nfsd4_probe_callback_sync(struct nfs4_client *clp)
+{
+	nfsd4_probe_callback(clp);
+	flush_workqueue(callback_wq);
+}
+
 void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
 	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b24f19d4187a..00a50b8ac878 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1686,8 +1686,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	spin_unlock(&client_lock);
 
 	nfs4_lock_state();
-	/* wait for callbacks */
-	nfsd4_shutdown_callback(ses->se_client);
+	nfsd4_probe_callback_sync(ses->se_client);
 	nfs4_unlock_state();
 
 	nfsd4_del_conns(ses);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 32ff615c36f4..4e5bdfd9169c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -464,6 +464,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
-- 
cgit v1.2.2


From 229b2a0839870d0d4f91ad3b24ec13c57bbd50a0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 10 Dec 2010 17:37:44 -0500
Subject: nfsd4: add helper function to run callbacks

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cb002dce5630..fff96dc7704e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -560,6 +560,11 @@ int set_callback_cred(void)
 
 static struct workqueue_struct *callback_wq;
 
+static void run_nfsd4_cb(struct nfsd4_callback *cb)
+{
+	queue_work(callback_wq, &cb->cb_work);
+}
+
 static void do_probe_callback(struct nfs4_client *clp)
 {
 	struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -574,7 +579,7 @@ static void do_probe_callback(struct nfs4_client *clp)
 
 	cb->cb_ops = &nfsd4_cb_probe_ops;
 
-	queue_work(callback_wq, &cb->cb_work);
+	run_nfsd4_cb(cb);
 }
 
 /*
@@ -859,5 +864,5 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
 	cb->cb_ops = &nfsd4_cb_recall_ops;
 	dp->dl_retries = 1;
 
-	queue_work(callback_wq, &dp->dl_recall.cb_work);
+	run_nfsd4_cb(&dp->dl_recall);
 }
-- 
cgit v1.2.2


From 14a24e99f4f506265b634c1cd04eca6394f49dbc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 10 Dec 2010 19:02:49 -0500
Subject: nfsd4: give out delegations more quickly in 4.1 case

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 00a50b8ac878..408957cf6016 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2627,6 +2627,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
 	open->op_stateowner->so_client->cl_firststate = 1;
 }
 
+/* Should we give out recallable state?: */
+static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
+{
+	if (clp->cl_cb_state == NFSD4_CB_UP)
+		return true;
+	/*
+	 * In the sessions case, since we don't have to establish a
+	 * separate connection for callbacks, we assume it's OK
+	 * until we hear otherwise:
+	 */
+	return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
+}
+
 /*
  * Attempt to hand out a delegation.
  */
@@ -2635,11 +2648,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	/* XXX: or unknown and nfsv4.1: */
-	int cb_up = (sop->so_client->cl_cb_state == NFSD4_CB_UP);
+	int cb_up;
 	struct file_lock *fl;
 	int status, flag = 0;
 
+	cb_up = nfsd4_cb_channel_good(sop->so_client);
 	flag = NFS4_OPEN_DELEGATE_NONE;
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
-- 
cgit v1.2.2


From 3ff3600e7eab16301e824293e8f49b9990bd4641 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 10 Jan 2011 16:37:51 -0500
Subject: nfsd4: simplify nfsd4_cb_prepare

Remove handling for a nonexistant case (status && !-EAGAIN).

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index fff96dc7704e..69955e98e086 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -613,24 +613,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
  * If the slot is available, then mark it busy.  Otherwise, set the
  * thread for sleeping on the callback RPC wait queue.
  */
-static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
-		struct rpc_task *task)
+static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
 {
-	u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
-	int status = 0;
-
-	dprintk("%s: %u:%u:%u:%u\n", __func__,
-		ptr[0], ptr[1], ptr[2], ptr[3]);
-
 	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
 		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
 		dprintk("%s slot is busy\n", __func__);
-		status = -EAGAIN;
-		goto out;
+		return false;
 	}
-out:
-	dprintk("%s status=%d\n", __func__, status);
-	return status;
+	return true;
 }
 
 /*
@@ -643,19 +633,11 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
 	u32 minorversion = clp->cl_minorversion;
-	int status = 0;
 
 	cb->cb_minorversion = minorversion;
 	if (minorversion) {
-		status = nfsd41_cb_setup_sequence(clp, task);
-		if (status) {
-			if (status != -EAGAIN) {
-				/* terminate rpc task */
-				task->tk_status = status;
-				task->tk_action = NULL;
-			}
+		if (!nfsd41_cb_get_slot(clp, task))
 			return;
-		}
 	}
 	rpc_call_start(task);
 }
-- 
cgit v1.2.2


From 5ce8ba25d657a71d6d8cdb05a2b90c5ae7debfda Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 10 Jan 2011 16:44:41 -0500
Subject: nfsd4: allow restarting callbacks

If we lose the backchannel and then the client repairs the problem,
resend any callbacks.

We use a new cb_done flag to track whether there is still work to be
done for the callback or whether it can be destroyed with the rpc.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 34 ++++++++++++++++++++++++++++------
 fs/nfsd/nfs4state.c    |  1 +
 fs/nfsd/state.h        |  3 +++
 3 files changed, 32 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 69955e98e086..f1d9dd45553a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -639,6 +639,10 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 		if (!nfsd41_cb_get_slot(clp, task))
 			return;
 	}
+	cb->cb_done = false;
+	spin_lock(&clp->cl_lock);
+	list_add(&cb->cb_per_client, &clp->cl_callbacks);
+	spin_unlock(&clp->cl_lock);
 	rpc_call_start(task);
 }
 
@@ -681,8 +685,11 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		return;
 	}
 
+	if (cb->cb_done)
+		return;
 	switch (task->tk_status) {
 	case 0:
+		cb->cb_done = true;
 		return;
 	case -EBADHANDLE:
 	case -NFS4ERR_BAD_STATEID:
@@ -695,7 +702,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		if (current_rpc_client != task->tk_client) {
 			/* queue a callback on the new connection: */
 			atomic_inc(&dp->dl_count);
-			nfsd4_cb_recall(dp);
+			run_nfsd4_cb(&dp->dl_recall);
 			return;
 		}
 	}
@@ -704,16 +711,23 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		task->tk_status = 0;
 		rpc_restart_call_prepare(task);
 		return;
-	} else
-		nfsd4_mark_cb_down(clp, task->tk_status);
+	}
+	nfsd4_mark_cb_down(clp, task->tk_status);
+	cb->cb_done = true;
 }
 
 static void nfsd4_cb_recall_release(void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
+	struct nfs4_client *clp = cb->cb_clp;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 
-	nfs4_put_delegation(dp);
+	if (cb->cb_done) {
+		spin_lock(&clp->cl_lock);
+		list_del(&cb->cb_per_client);
+		spin_unlock(&clp->cl_lock);
+		nfs4_put_delegation(dp);
+	}
 }
 
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -808,8 +822,13 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	spin_unlock(&clp->cl_lock);
 
 	err = setup_callback_client(clp, &conn, ses);
-	if (err)
+	if (err) {
 		warn_no_callback_path(clp, err);
+		return;
+	}
+	/* Yay, the callback channel's back! Restart any callbacks: */
+	list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
+		run_nfsd4_cb(cb);
 }
 
 void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -834,10 +853,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfsd4_callback *cb = &dp->dl_recall;
+	struct nfs4_client *clp = dp->dl_client;
 
 	dp->dl_retries = 1;
 	cb->cb_op = dp;
-	cb->cb_clp = dp->dl_client;
+	cb->cb_clp = clp;
 	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
 	cb->cb_msg.rpc_argp = cb;
 	cb->cb_msg.rpc_resp = cb;
@@ -846,5 +866,7 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
 	cb->cb_ops = &nfsd4_cb_recall_ops;
 	dp->dl_retries = 1;
 
+	cb->cb_done = true;
+
 	run_nfsd4_cb(&dp->dl_recall);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 408957cf6016..6e1f9aadd439 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1077,6 +1077,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	INIT_LIST_HEAD(&clp->cl_callbacks);
 	spin_lock_init(&clp->cl_lock);
 	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
 	clp->cl_time = get_seconds();
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4e5bdfd9169c..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
 struct nfsd4_callback {
 	void *cb_op;
 	struct nfs4_client *cb_clp;
+	struct list_head cb_per_client;
 	u32 cb_minorversion;
 	struct rpc_message cb_msg;
 	const struct rpc_call_ops *cb_ops;
 	struct work_struct cb_work;
+	bool cb_done;
 };
 
 struct nfs4_delegation {
@@ -248,6 +250,7 @@ struct nfs4_client {
 	int			cl_cb_state;
 	struct nfsd4_callback	cl_cb_null;
 	struct nfsd4_session	*cl_cb_session;
+	struct list_head	cl_callbacks; /* list of in-progress callbacks */
 
 	/* for all client information that callback code might need: */
 	spinlock_t		cl_lock;
-- 
cgit v1.2.2


From c58efdb442bb49dea1d148f207560c41918c1bf4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 4 Jan 2011 04:49:29 +0000
Subject: xfs: ensure log covering transactions are synchronous

To ensure the log is covered and the filesystem idles correctly, we
need to ensure that dummy transactions hit the disk and do not stay
pinned in memory.  If the superblock is pinned in memory, it can't
be flushed so the log covering cannot make progress. The result is
dependent on timing - more oftent han not we continue to issues a
log covering transaction every 36s rather than idling after ~90s.

Fix this by making the log covering transaction synchronous. To
avoid additional log force from xfssyncd, make the log covering
transaction take the place of the existing log force in the xfssyncd
background sync process.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 fs/xfs/linux-2.6/xfs_sync.c  | 11 ++++++-----
 fs/xfs/xfs_fsops.c           | 10 +++++-----
 fs/xfs/xfs_fsops.h           |  2 +-
 4 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c51faaa5e291..af32f375ca96 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1413,7 +1413,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return -xfs_fs_log_dummy(mp, SYNC_WAIT);
+	return -xfs_fs_log_dummy(mp);
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a02480de9759..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -362,7 +362,7 @@ xfs_quiesce_data(
 
 	/* mark the log as covered if needed */
 	if (xfs_log_need_covered(mp))
-		error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+		error2 = xfs_fs_log_dummy(mp);
 
 	/* flush data-only devices */
 	if (mp->m_rtdev_targp)
@@ -503,13 +503,14 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
-		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		if (mp->m_super->s_frozen == SB_UNFROZEN &&
 		    xfs_log_need_covered(mp))
-			error = xfs_fs_log_dummy(mp, 0);
+			error = xfs_fs_log_dummy(mp);
+		else
+			xfs_log_force(mp, 0);
+		xfs_reclaim_inodes(mp, 0);
+		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 	}
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f56d30e8040c..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -612,12 +612,13 @@ out:
  *
  * We cannot use an inode here for this - that will push dirty state back up
  * into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead.
+ * making progress. Hence we log a field in the superblock instead and use a
+ * synchronous transaction to ensure the superblock is immediately unpinned
+ * and can be written back.
  */
 int
 xfs_fs_log_dummy(
-	xfs_mount_t	*mp,
-	int		flags)
+	xfs_mount_t	*mp)
 {
 	xfs_trans_t	*tp;
 	int		error;
@@ -632,8 +633,7 @@ xfs_fs_log_dummy(
 
 	/* log the UUID because it is an unchanging field */
 	xfs_mod_sb(tp, XFS_SB_UUID);
-	if (flags & SYNC_WAIT)
-		xfs_trans_set_sync(tp);
+	xfs_trans_set_sync(tp);
 	return xfs_trans_commit(tp, 0);
 }
 
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
 
 #endif	/* __XFS_FSOPS_H__ */
-- 
cgit v1.2.2


From a46db60834883c1c8c665d7fcc7b4ab66f5966fc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 7 Jan 2011 13:02:04 +0000
Subject: xfs: add FITRIM support

Allow manual discards from userspace using the FITRIM ioctl.  This is not
intended to be run during normal workloads, as the freepsace btree walks
can cause large performance degradation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/Makefile                |   1 +
 fs/xfs/linux-2.6/xfs_discard.c | 191 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_discard.h |   8 ++
 fs/xfs/linux-2.6/xfs_ioctl.c   |   3 +
 fs/xfs/linux-2.6/xfs_trace.h   |  33 +++++++
 fs/xfs/xfs_alloc.c             |  10 +--
 fs/xfs/xfs_alloc.h             |  25 ++++--
 7 files changed, 259 insertions(+), 12 deletions(-)
 create mode 100644 fs/xfs/linux-2.6/xfs_discard.c
 create mode 100644 fs/xfs/linux-2.6/xfs_discard.h

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   kmem.o \
 				   xfs_aops.o \
 				   xfs_buf.o \
+				   xfs_discard.o \
 				   xfs_export.o \
 				   xfs_file.o \
 				   xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_fsblock_t		start,
+	xfs_fsblock_t		len,
+	xfs_fsblock_t		minlen,
+	__uint64_t		*blocks_trimmed)
+{
+	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	struct xfs_perag	*pag;
+	int			error;
+	int			i;
+
+	pag = xfs_perag_get(mp, agno);
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error || !agbp)
+		goto out_put_perag;
+
+	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+	/*
+	 * Force out the log.  This means any transactions that might have freed
+	 * space before we took the AGF buffer lock are now on disk, and the
+	 * volatile disk cache is flushed.
+	 */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Look up the longest btree in the AGF and start with it.
+	 */
+	error = xfs_alloc_lookup_le(cur, 0,
+				    XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+	if (error)
+		goto out_del_cursor;
+
+	/*
+	 * Loop until we are done with all extents that are large
+	 * enough to be worth discarding.
+	 */
+	while (i) {
+		xfs_agblock_t fbno;
+		xfs_extlen_t flen;
+
+		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+		if (error)
+			goto out_del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+		ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+
+		/*
+		 * Too small?  Give up.
+		 */
+		if (flen < minlen) {
+			trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+			goto out_del_cursor;
+		}
+
+		/*
+		 * If the extent is entirely outside of the range we are
+		 * supposed to discard skip it.  Do not bother to trim
+		 * down partially overlapping ranges for now.
+		 */
+		if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+		    XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+			trace_xfs_discard_exclude(mp, agno, fbno, flen);
+			goto next_extent;
+		}
+
+		/*
+		 * If any blocks in the range are still busy, skip the
+		 * discard and try again the next time.
+		 */
+		if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+			trace_xfs_discard_busy(mp, agno, fbno, flen);
+			goto next_extent;
+		}
+
+		trace_xfs_discard_extent(mp, agno, fbno, flen);
+		error = -blkdev_issue_discard(bdev,
+				XFS_AGB_TO_DADDR(mp, agno, fbno),
+				XFS_FSB_TO_BB(mp, flen),
+				GFP_NOFS, 0);
+		if (error)
+			goto out_del_cursor;
+		*blocks_trimmed += flen;
+
+next_extent:
+		error = xfs_btree_decrement(cur, 0, &i);
+		if (error)
+			goto out_del_cursor;
+	}
+
+out_del_cursor:
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+out_put_perag:
+	xfs_perag_put(pag);
+	return error;
+}
+
+int
+xfs_ioc_trim(
+	struct xfs_mount		*mp,
+	struct fstrim_range __user	*urange)
+{
+	struct request_queue	*q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+	unsigned int		granularity = q->limits.discard_granularity;
+	struct fstrim_range	range;
+	xfs_fsblock_t		start, len, minlen;
+	xfs_agnumber_t		start_agno, end_agno, agno;
+	__uint64_t		blocks_trimmed = 0;
+	int			error, last_error = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -XFS_ERROR(EPERM);
+	if (copy_from_user(&range, urange, sizeof(range)))
+		return -XFS_ERROR(EFAULT);
+
+	/*
+	 * Truncating down the len isn't actually quite correct, but using
+	 * XFS_B_TO_FSB would mean we trivially get overflows for values
+	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+	 * used by the fstrim application.  In the end it really doesn't
+	 * matter as trimming blocks is an advisory interface.
+	 */
+	start = XFS_B_TO_FSBT(mp, range.start);
+	len = XFS_B_TO_FSBT(mp, range.len);
+	minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+	start_agno = XFS_FSB_TO_AGNO(mp, start);
+	if (start_agno >= mp->m_sb.sb_agcount)
+		return -XFS_ERROR(EINVAL);
+
+	end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+	if (end_agno >= mp->m_sb.sb_agcount)
+		end_agno = mp->m_sb.sb_agcount - 1;
+
+	for (agno = start_agno; agno <= end_agno; agno++) {
+		error = -xfs_trim_extents(mp, agno, start, len, minlen,
+					  &blocks_trimmed);
+		if (error)
+			last_error = error;
+	}
+
+	if (last_error)
+		return last_error;
+
+	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+	if (copy_to_user(urange, &range, sizeof(range)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+
+extern int	xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..b06ede1d0bed 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
 #include "xfs_dfrag.h"
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
+#include "xfs_discard.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
@@ -1294,6 +1295,8 @@ xfs_file_ioctl(
 	trace_xfs_file_ioctl(ip);
 
 	switch (cmd) {
+	case FITRIM:
+		return xfs_ioc_trim(mp, arg);
 	case XFS_IOC_ALLOCSP:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 647af2a2e7aa..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
 
+DECLARE_EVENT_CLASS(xfs_discard_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len), \
+	TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index fa8723f5870a..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-static int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
-		    xfs_agblock_t bno, xfs_extlen_t len);
-
 /*
  * Prototypes for per-ag allocation routines
  */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
  * Lookup the first record less than or equal to [bno, len]
  * in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_lookup_le(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
 /*
  * Get the data from the pointed-to record.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_get_rec(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		*bno,	/* output: starting block of extent */
@@ -2615,7 +2611,7 @@ restart:
  * will require a synchronous transaction, but it can still be
  * used to distinguish between a partial or exact match.
  */
-static int
+int
 xfs_alloc_busy_search(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..0ab56b32c7eb 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
 #define	__XFS_ALLOC_H__
 
 struct xfs_buf;
+struct xfs_btree_cur;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
@@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 
 #ifdef __KERNEL__
-
 void
-xfs_alloc_busy_insert(xfs_trans_t *tp,
-		xfs_agnumber_t agno,
-		xfs_agblock_t bno,
-		xfs_extlen_t len);
+xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+	xfs_agblock_t bno, xfs_extlen_t len);
 
 void
 xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
+int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+	xfs_agblock_t bno, xfs_extlen_t len);
 #endif	/* __KERNEL__ */
 
 /*
@@ -205,4 +206,18 @@ xfs_free_extent(
 	xfs_fsblock_t	bno,	/* starting block number of extent */
 	xfs_extlen_t	len);	/* length of extent */
 
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat);	/* success/failure */
+
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat);	/* output: success/failure */
+
 #endif	/* __XFS_ALLOC_H__ */
-- 
cgit v1.2.2


From bfc60177f8ab509bc225becbb58f7e53a0e33e81 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 7 Jan 2011 13:02:23 +0000
Subject: xfs: fix error handling for synchronous writes

If we get an IO error on a synchronous superblock write, we attach an
error release function to it so that when the last reference goes away
the release function is called and the buffer is invalidated and
unlocked. The buffer is left locked until the release function is
called so that other concurrent users of the buffer will be locked out
until the buffer error is fully processed.

Unfortunately, for the superblock buffer the filesyetm itself holds a
reference to the buffer which prevents the reference count from
dropping to zero and the release function being called. As a result,
once an IO error occurs on a sync write, the buffer will never be
unlocked and all future attempts to lock the buffer will hang.

To make matters worse, this problems is not unique to such buffers;
if there is a concurrent _xfs_buf_find() running, the lookup will grab
a reference to the buffer and then wait on the buffer lock, preventing
the reference count from ever falling to zero and hence unlocking the
buffer.

As such, the whole b_relse function implementation is broken because it
cannot rely on the buffer reference count falling to zero to unlock the
errored buffer. The synchronous write error path is the only path that
uses this callback - it is used to ensure that the synchronous waiter
gets the buffer error before the error state is cleared from the buffer
by the release function.

Given that the only sychronous buffer writes now go through xfs_bwrite
and the error path in question can only occur for a write of a dirty,
logged buffer, we can move most of the b_relse processing to happen
inline in xfs_buf_iodone_callbacks, just like a normal I/O completion.
In addition to that we make sure the error is not cleared in
xfs_buf_iodone_callbacks, so that xfs_bwrite can reliably check it.
Given that xfs_bwrite keeps the buffer locked until it has waited for
it and checked the error this allows to reliably propagate the error
to the caller, and make sure that the buffer is reliably unlocked.

Given that xfs_buf_iodone_callbacks was the only instance of the
b_relse callback we can remove it entirely.

Based on earlier patches by Dave Chinner and Ajeet Yadav.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Ajeet Yadav <ajeet.yadav.77@gmail.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c |   7 +--
 fs/xfs/linux-2.6/xfs_buf.h |   7 +--
 fs/xfs/xfs_buf_item.c      | 151 +++++++++++++++------------------------------
 3 files changed, 51 insertions(+), 114 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 92f1f2acc6ab..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -896,7 +896,6 @@ xfs_buf_rele(
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
 	if (!pag) {
-		ASSERT(!bp->b_relse);
 		ASSERT(list_empty(&bp->b_lru));
 		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
 
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-		if (bp->b_relse) {
-			atomic_inc(&bp->b_hold);
-			spin_unlock(&pag->pag_buf_lock);
-			bp->b_relse(bp);
-		} else if (!(bp->b_flags & XBF_STALE) &&
+		if (!(bp->b_flags & XBF_STALE) &&
 			   atomic_read(&bp->b_lru_ref)) {
 			xfs_buf_lru_add(bp);
 			spin_unlock(&pag->pag_buf_lock);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a76c2428faff..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
 
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
 #define XB_PAGES	2
 
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
-	xfs_buf_relse_t		b_relse;	/* releasing function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_FSPRIVATE2(bp, type)		((type)(bp)->b_fspriv2)
 #define XFS_BUF_SET_FSPRIVATE2(bp, val)		((bp)->b_fspriv2 = (void*)(val))
 #define XFS_BUF_SET_START(bp)			do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)	((bp)->b_relse = (func))
 
 #define XFS_BUF_PTR(bp)			(xfs_caddr_t)((bp)->b_addr)
 #define XFS_BUF_SET_PTR(bp, val, cnt)	xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
 
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
-	if (!bp->b_relse)
-		xfs_buf_unlock(bp);
+	xfs_buf_unlock(bp);
 	xfs_buf_rele(bp);
 }
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ed2b65f3f8b9..98c6f73b6752 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,7 +141,6 @@ xfs_buf_item_log_check(
 #define		xfs_buf_item_log_check(x)
 #endif
 
-STATIC void	xfs_buf_error_relse(xfs_buf_t *bp);
 STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);
 
 /*
@@ -959,128 +958,76 @@ xfs_buf_do_callbacks(
  */
 void
 xfs_buf_iodone_callbacks(
-	xfs_buf_t	*bp)
+	struct xfs_buf		*bp)
 {
-	xfs_log_item_t	*lip;
-	static ulong	lasttime;
-	static xfs_buftarg_t *lasttarg;
-	xfs_mount_t	*mp;
+	struct xfs_log_item	*lip = bp->b_fspriv;
+	struct xfs_mount	*mp = lip->li_mountp;
+	static ulong		lasttime;
+	static xfs_buftarg_t	*lasttarg;
 
-	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	if (likely(!XFS_BUF_GETERROR(bp)))
+		goto do_callbacks;
 
-	if (XFS_BUF_GETERROR(bp) != 0) {
-		/*
-		 * If we've already decided to shutdown the filesystem
-		 * because of IO errors, there's no point in giving this
-		 * a retry.
-		 */
-		mp = lip->li_mountp;
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-			XFS_BUF_SUPER_STALE(bp);
-			trace_xfs_buf_item_iodone(bp, _RET_IP_);
-			xfs_buf_do_callbacks(bp);
-			XFS_BUF_SET_FSPRIVATE(bp, NULL);
-			XFS_BUF_CLR_IODONE_FUNC(bp);
-			xfs_buf_ioend(bp, 0);
-			return;
-		}
+	/*
+	 * If we've already decided to shutdown the filesystem because of
+	 * I/O errors, there's no point in giving this a retry.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		XFS_BUF_SUPER_STALE(bp);
+		trace_xfs_buf_item_iodone(bp, _RET_IP_);
+		goto do_callbacks;
+	}
 
-		if ((XFS_BUF_TARGET(bp) != lasttarg) ||
-		    (time_after(jiffies, (lasttime + 5*HZ)))) {
-			lasttime = jiffies;
-			cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
-					" block 0x%llx in %s",
-				XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
-			      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
-		}
-		lasttarg = XFS_BUF_TARGET(bp);
+	if (XFS_BUF_TARGET(bp) != lasttarg ||
+	    time_after(jiffies, (lasttime + 5*HZ))) {
+		lasttime = jiffies;
+		cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+				" block 0x%llx in %s",
+			XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+		      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
+	}
+	lasttarg = XFS_BUF_TARGET(bp);
 
-		if (XFS_BUF_ISASYNC(bp)) {
-			/*
-			 * If the write was asynchronous then noone will be
-			 * looking for the error.  Clear the error state
-			 * and write the buffer out again delayed write.
-			 *
-			 * XXXsup This is OK, so long as we catch these
-			 * before we start the umount; we don't want these
-			 * DELWRI metadata bufs to be hanging around.
-			 */
-			XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
-
-			if (!(XFS_BUF_ISSTALE(bp))) {
-				XFS_BUF_DELAYWRITE(bp);
-				XFS_BUF_DONE(bp);
-				XFS_BUF_SET_START(bp);
-			}
-			ASSERT(XFS_BUF_IODONE_FUNC(bp));
-			trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-			xfs_buf_relse(bp);
-		} else {
-			/*
-			 * If the write of the buffer was not asynchronous,
-			 * then we want to make sure to return the error
-			 * to the caller of bwrite().  Because of this we
-			 * cannot clear the B_ERROR state at this point.
-			 * Instead we install a callback function that
-			 * will be called when the buffer is released, and
-			 * that routine will clear the error state and
-			 * set the buffer to be written out again after
-			 * some delay.
-			 */
-			/* We actually overwrite the existing b-relse
-			   function at times, but we're gonna be shutting down
-			   anyway. */
-			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+	/*
+	 * If the write was asynchronous then noone will be looking for the
+	 * error.  Clear the error state and write the buffer out again.
+	 *
+	 * During sync or umount we'll write all pending buffers again
+	 * synchronous, which will catch these errors if they keep hanging
+	 * around.
+	 */
+	if (XFS_BUF_ISASYNC(bp)) {
+		XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
+
+		if (!XFS_BUF_ISSTALE(bp)) {
+			XFS_BUF_DELAYWRITE(bp);
 			XFS_BUF_DONE(bp);
-			XFS_BUF_FINISH_IOWAIT(bp);
+			XFS_BUF_SET_START(bp);
 		}
+		ASSERT(XFS_BUF_IODONE_FUNC(bp));
+		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+		xfs_buf_relse(bp);
 		return;
 	}
 
-	xfs_buf_do_callbacks(bp);
-	XFS_BUF_SET_FSPRIVATE(bp, NULL);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	xfs_buf_ioend(bp, 0);
-}
-
-/*
- * This is a callback routine attached to a buffer which gets an error
- * when being written out synchronously.
- */
-STATIC void
-xfs_buf_error_relse(
-	xfs_buf_t	*bp)
-{
-	xfs_log_item_t	*lip;
-	xfs_mount_t	*mp;
-
-	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-	mp = (xfs_mount_t *)lip->li_mountp;
-	ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-
+	/*
+	 * If the write of the buffer was synchronous, we want to make
+	 * sure to return the error to the caller of xfs_bwrite().
+	 */
 	XFS_BUF_STALE(bp);
 	XFS_BUF_DONE(bp);
 	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_ERROR(bp,0);
 
 	trace_xfs_buf_error_relse(bp, _RET_IP_);
+	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 
-	if (! XFS_FORCED_SHUTDOWN(mp))
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	/*
-	 * We have to unpin the pinned buffers so do the
-	 * callbacks.
-	 */
+do_callbacks:
 	xfs_buf_do_callbacks(bp);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
-	xfs_buf_relse(bp);
+	xfs_buf_ioend(bp, 0);
 }
 
-
 /*
  * This is the iodone() function for buffers which have been
  * logged.  It is called when they are eventually flushed out.
-- 
cgit v1.2.2


From 1884bd8354c9aec4ca501dc4773c13ad2a09af7b Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sat, 25 Dec 2010 20:14:53 +0000
Subject: xfs: fix an assignment within an ASSERT()

In fs/xfs/xfs_trans.c::xfs_trans_unreserve_and_mod_sb() at the out:
label we have this:
	ASSERT(error = 0);
I believe a comparison was intended, not an assignment. If I'm
right, the patch below fixes that up.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_trans.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f80a067a4658..33dbc4e0ad62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
 	if (blkdelta)
 		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
 out:
-	ASSERT(error = 0);
+	ASSERT(error == 0);
 	return;
 }
 
-- 
cgit v1.2.2


From 65a84a0f7567ea244e5246e642920260cfc2744a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Fri, 7 Jan 2011 03:30:41 +0000
Subject: xfs: Add log level to assertion printk

I received a ppc64 bug report involving xfs but the assertion was
filtered out by the console log level. Use KERN_CRIT to ensure it
makes it out.

Signed-off-by: Anton Blanchard <anton@samba.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/support/debug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..86162e5f9a21 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -104,7 +104,8 @@ xfs_fs_vcmn_err(
 void
 assfail(char *expr, char *file, int line)
 {
-	printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
+	printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
+	       file, line);
 	BUG();
 }
 
-- 
cgit v1.2.2


From 73efe4a4ddf8eb2b1cc7039e8a66a23a424961af Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 12 Jan 2011 00:35:42 +0000
Subject: xfs: prevent NMI timeouts in cmn_err

We currently have a global error message buffer in cmn_err that is
protected by a spin lock that disables interrupts.  Recently there
have been reports of NMI timeouts occurring when the console is
being flooded by SCSI error reports due to cmn_err() getting stuck
trying to print to the console while holding this lock (i.e. with
interrupts disabled). The NMI watchdog is seeing this CPU as
non-responding and so is triggering a panic.  While the trigger for
the reported case is SCSI errors, pretty much anything that spams
the kernel log could cause this to occur.

Realistically the only reason that we have the intemediate message
buffer is to prepend the correct kernel log level prefix to the log
message. The only reason we have the lock is to protect the global
message buffer and the only reason the message buffer is global is
to keep it off the stack. Hence if we can avoid needing a global
message buffer we avoid needing the lock, and we can do this with a
small amount of cleanup and some preprocessor tricks:

	1. clean up xfs_cmn_err() panic mask functionality to avoid
	   needing debug code in xfs_cmn_err()
	2. remove the couple of "!" message prefixes that still exist that
	   the existing cmn_err() code steps over.
	3. redefine CE_* levels directly to KERN_*
	4. redefine cmn_err() and friends to use printk() directly
	   via variable argument length macros.

By doing this, we can completely remove the cmn_err() code and the
lock that is causing the problems, and rely solely on printk()
serialisation to ensure that we don't get garbled messages.

A series of followup patches is really needed to clean up all the
cmn_err() calls and related messages properly, but that results in a
series that is not easily back portable to enterprise kernels. Hence
this initial fix is only to address the direct problem in the lowest
impact way possible.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sysctl.c |  23 ++++++++-
 fs/xfs/support/debug.c        | 109 +++++++++++++++++++-----------------------
 fs/xfs/support/debug.h        |  25 ++++++----
 fs/xfs/xfs_error.c            |  31 ------------
 fs/xfs/xfs_error.h            |  18 +++----
 fs/xfs/xfs_log.c              |   2 +-
 fs/xfs/xfs_log_recover.c      |   2 +-
 7 files changed, 96 insertions(+), 114 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
+#include "xfs_error.h"
 
 static struct ctl_table_header *xfs_table_header;
 
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
 
 	return ret;
 }
+
+STATIC int
+xfs_panic_mask_proc_handler(
+	ctl_table	*ctl,
+	int		write,
+	void		__user *buffer,
+	size_t		*lenp,
+	loff_t		*ppos)
+{
+	int		ret, *valp = ctl->data;
+
+	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+	if (!ret && write) {
+		xfs_panic_mask = *valp;
+#ifdef DEBUG
+		xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+	}
+	return ret;
+}
 #endif /* CONFIG_PROC_FS */
 
 static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
 		.data		= &xfs_params.panic_mask.val,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= xfs_panic_mask_proc_handler,
 		.extra1		= &xfs_params.panic_mask.min,
 		.extra2		= &xfs_params.panic_mask.max
 	},
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 86162e5f9a21..e6cf955ec0fc 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,80 +25,71 @@
 #include "xfs_mount.h"
 #include "xfs_error.h"
 
-static char		message[1024];	/* keep it off the stack */
-static DEFINE_SPINLOCK(xfs_err_lock);
-
-/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
-#define XFS_MAX_ERR_LEVEL	7
-#define XFS_ERR_MASK		((1 << 3) - 1)
-static const char * const	err_level[XFS_MAX_ERR_LEVEL+1] =
-					{KERN_EMERG, KERN_ALERT, KERN_CRIT,
-					 KERN_ERR, KERN_WARNING, KERN_NOTICE,
-					 KERN_INFO, KERN_DEBUG};
-
 void
-cmn_err(register int level, char *fmt, ...)
+cmn_err(
+	const char	*lvl,
+	const char	*fmt,
+	...)
 {
-	char	*fp = fmt;
-	int	len;
-	ulong	flags;
-	va_list	ap;
-
-	level &= XFS_ERR_MASK;
-	if (level > XFS_MAX_ERR_LEVEL)
-		level = XFS_MAX_ERR_LEVEL;
-	spin_lock_irqsave(&xfs_err_lock,flags);
-	va_start(ap, fmt);
-	if (*fmt == '!') fp++;
-	len = vsnprintf(message, sizeof(message), fp, ap);
-	if (len >= sizeof(message))
-		len = sizeof(message) - 1;
-	if (message[len-1] == '\n')
-		message[len-1] = 0;
-	printk("%s%s\n", err_level[level], message);
-	va_end(ap);
-	spin_unlock_irqrestore(&xfs_err_lock,flags);
-	BUG_ON(level == CE_PANIC);
+	struct va_format vaf;
+	va_list		args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%s%pV", lvl, &vaf);
+	va_end(args);
+
+	BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
 }
 
 void
-xfs_fs_vcmn_err(
-	int			level,
+xfs_fs_cmn_err(
+	const char		*lvl,
 	struct xfs_mount	*mp,
-	char			*fmt,
-	va_list			ap)
+	const char		*fmt,
+	...)
 {
-	unsigned long		flags;
-	int			len = 0;
+	struct va_format	vaf;
+	va_list			args;
 
-	level &= XFS_ERR_MASK;
-	if (level > XFS_MAX_ERR_LEVEL)
-		level = XFS_MAX_ERR_LEVEL;
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 
-	spin_lock_irqsave(&xfs_err_lock,flags);
+	printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
+	va_end(args);
 
-	if (mp) {
-		len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
+	BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
+}
+
+/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
+void
+xfs_cmn_err(
+	int			panic_tag,
+	const char		*lvl,
+	struct xfs_mount	*mp,
+	const char		*fmt,
+	...)
+{
+	struct va_format	vaf;
+	va_list			args;
+	int			panic = 0;
 
-		/*
-		 * Skip the printk if we can't print anything useful
-		 * due to an over-long device name.
-		 */
-		if (len >= sizeof(message))
-			goto out;
+	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+		printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
+		panic = 1;
 	}
 
-	len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
-	if (len >= sizeof(message))
-		len = sizeof(message) - 1;
-	if (message[len-1] == '\n')
-		message[len-1] = 0;
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 
-	printk("%s%s\n", err_level[level], message);
- out:
-	spin_unlock_irqrestore(&xfs_err_lock,flags);
+	printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
+	va_end(args);
 
-	BUG_ON(level == CE_PANIC);
+	BUG_ON(panic);
 }
 
 void
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
 
 #include <stdarg.h>
 
-#define CE_DEBUG        7               /* debug        */
-#define CE_CONT         6               /* continuation */
-#define CE_NOTE         5               /* notice       */
-#define CE_WARN         4               /* warning      */
-#define CE_ALERT        1               /* alert        */
-#define CE_PANIC        0               /* panic        */
-
-extern void cmn_err(int, char *, ...)
-	__attribute__ ((format (printf, 2, 3)));
+struct xfs_mount;
+
+#define CE_DEBUG        KERN_DEBUG
+#define CE_CONT         KERN_INFO
+#define CE_NOTE         KERN_NOTICE
+#define CE_WARN         KERN_WARNING
+#define CE_ALERT        KERN_ALERT
+#define CE_PANIC        KERN_EMERG
+
+void cmn_err(const char *lvl, const char *fmt, ...)
+		__attribute__ ((format (printf, 2, 3)));
+void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
+		const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
+		const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
+
 extern void assfail(char *expr, char *f, int l);
 
 #define ASSERT_ALWAYS(expr)	\
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 }
 #endif /* DEBUG */
 
-
-void
-xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	xfs_fs_vcmn_err(level, mp, fmt, ap);
-	va_end(ap);
-}
-
-void
-xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
-{
-	va_list ap;
-
-#ifdef DEBUG
-	xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
-#endif
-
-	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
-	    && (level & CE_ALERT)) {
-		level &= ~CE_ALERT;
-		level |= CE_PANIC;
-		cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
-	}
-	va_start(ap, fmt);
-	xfs_fs_vcmn_err(level, mp, fmt, ap);
-	va_end(ap);
-}
-
 void
 xfs_error_report(
 	const char		*tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
 	 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
 			(rf))))
 
-extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
-extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)	(expr)
 #define xfs_errortag_add(tag, mp)		(ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
 
 struct xfs_mount;
 
-extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
-		char *fmt, va_list ap)
-	__attribute__ ((format (printf, 3, 0)));
-extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
-			char *fmt, ...)
-	__attribute__ ((format (printf, 4, 5)));
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
-	__attribute__ ((format (printf, 3, 4)));
-
 extern void xfs_hex_dump(void *p, int length);
 
 #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
 	xfs_fs_cmn_err(level, mp, fmt "  Unmount and run xfs_repair.", ## args)
 
 #define xfs_fs_mount_cmn_err(f, fmt, args...) \
-	((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
+	do { \
+		if (!(f & XFS_MFSI_QUIET)) 	\
+			cmn_err(CE_WARN, "XFS: " fmt, ## args); \
+	} while (0)
 
 #endif	/* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0bf24b11d0c4..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -377,7 +377,7 @@ xfs_log_mount(
 		cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
 	else {
 		cmn_err(CE_NOTE,
-			"!Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
+			"Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
 			mp->m_fsname);
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 	}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 204d8e5fa7fa..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3800,7 +3800,7 @@ xlog_recover_finish(
 		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
 	} else {
 		cmn_err(CE_DEBUG,
-			"!Ending clean XFS mount for filesystem: %s\n",
+			"Ending clean XFS mount for filesystem: %s\n",
 			log->l_mp->m_fsname);
 	}
 	return 0;
-- 
cgit v1.2.2


From f00c9e44ad1a9660fe8cd3ca15b6cd9497172eab Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Sep 2010 17:38:58 +0200
Subject: quota: Fix deadlock during path resolution

As Al Viro pointed out path resolution during Q_QUOTAON calls to quotactl
is prone to deadlocks. We hold s_umount semaphore for reading during the
path resolution and resolution itself may need to acquire the semaphore
for writing when e. g. autofs mountpoint is passed.

Solve the problem by performing the resolution before we get hold of the
superblock (and thus s_umount semaphore). The whole thing is complicated
by the fact that some filesystems (OCFS2) ignore the path argument. So to
distinguish between filesystem which want the path and which do not we
introduce new .quota_on_meta callback which does not get the path. OCFS2
then uses this callback instead of old .quota_on.

CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Christoph Hellwig <hch@lst.de>
CC: Ted Ts'o <tytso@mit.edu>
CC: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c     | 25 +++++++------------------
 fs/ext4/super.c     | 25 +++++++------------------
 fs/ocfs2/super.c    |  5 ++---
 fs/quota/dquot.c    | 18 ++----------------
 fs/quota/quota.c    | 41 +++++++++++++++++++++++++++--------------
 fs/reiserfs/super.c | 17 ++++++-----------
 6 files changed, 51 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b7d0554631e4..0e0d391626be 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -755,7 +755,7 @@ static int ext3_release_dquot(struct dquot *dquot);
 static int ext3_mark_dquot_dirty(struct dquot *dquot);
 static int ext3_write_info(struct super_block *sb, int type);
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-				char *path);
+			 struct path *path);
 static int ext3_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -2885,27 +2885,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name)
+			 struct path *path)
 {
 	int err;
-	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
-
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
-		path_put(&path);
+	if (path->mnt->mnt_sb != sb)
 		return -EXDEV;
-	}
 	/* Journaling quota? */
 	if (EXT3_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			ext3_msg(sb, KERN_WARNING,
 				"warning: Quota file not on filesystem root. "
 				"Journaled quota will not work.");
@@ -2915,7 +2908,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 	 * When we journal data on quota file, we have to flush journal to see
 	 * all updates to the file when we bypass pagecache...
 	 */
-	if (ext3_should_journal_data(path.dentry->d_inode)) {
+	if (ext3_should_journal_data(path->dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
@@ -2923,15 +2916,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
 		err = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-		if (err) {
-			path_put(&path);
+		if (err)
 			return err;
-		}
 	}
 
-	err = dquot_quota_on_path(sb, type, format_id, &path);
-	path_put(&path);
-	return err;
+	return dquot_quota_on(sb, type, format_id, path);
 }
 
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 29c80f6d8b27..0f10ccd6bfc0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1162,7 +1162,7 @@ static int ext4_release_dquot(struct dquot *dquot);
 static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-				char *path);
+			 struct path *path);
 static int ext4_quota_off(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -4566,27 +4566,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name)
+			 struct path *path)
 {
 	int err;
-	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
-
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
-		path_put(&path);
+	if (path->mnt->mnt_sb != sb)
 		return -EXDEV;
-	}
 	/* Journaling quota? */
 	if (EXT4_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not in fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			ext4_msg(sb, KERN_WARNING,
 				"Quota file not on filesystem root. "
 				"Journaled quota will not work");
@@ -4597,7 +4590,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	 * all updates to the file when we bypass pagecache...
 	 */
 	if (EXT4_SB(sb)->s_journal &&
-	    ext4_should_journal_data(path.dentry->d_inode)) {
+	    ext4_should_journal_data(path->dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
@@ -4605,15 +4598,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
 		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-		if (err) {
-			path_put(&path);
+		if (err)
 			return err;
-		}
 	}
 
-	err = dquot_quota_on_path(sb, type, format_id, &path);
-	path_put(&path);
-	return err;
+	return dquot_quota_on(sb, type, format_id, path);
 }
 
 static int ext4_quota_off(struct super_block *sb, int type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 17ff46fa8a10..31c3ffd2f8d0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -993,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 }
 
 /* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
-			  char *path)
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
 {
 	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
 					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -1013,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
 }
 
 static const struct quotactl_ops ocfs2_quotactl_ops = {
-	.quota_on	= ocfs2_quota_on,
+	.quota_on_meta	= ocfs2_quota_on,
 	.quota_off	= ocfs2_quota_off,
 	.quota_sync	= dquot_quota_sync,
 	.get_info	= dquot_get_dqinfo,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 84becd3e4772..a2a622e079f0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2189,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_resume);
 
-int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
-		      struct path *path)
+int dquot_quota_on(struct super_block *sb, int type, int format_id,
+		   struct path *path)
 {
 	int error = security_quota_on(path->dentry);
 	if (error)
@@ -2204,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
 					     DQUOT_LIMITS_ENABLED);
 	return error;
 }
-EXPORT_SYMBOL(dquot_quota_on_path);
-
-int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
-{
-	struct path path;
-	int error;
-
-	error = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (!error) {
-		error = dquot_quota_on_path(sb, type, format_id, &path);
-		path_put(&path);
-	}
-	return error;
-}
 EXPORT_SYMBOL(dquot_quota_on);
 
 /*
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1edb..b34bdb25490c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
 }
 
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
-		         void __user *addr)
+		         struct path *path)
 {
-	char *pathname;
-	int ret = -ENOSYS;
-
-	pathname = getname(addr);
-	if (IS_ERR(pathname))
-		return PTR_ERR(pathname);
-	if (sb->s_qcop->quota_on)
-		ret = sb->s_qcop->quota_on(sb, type, id, pathname);
-	putname(pathname);
-	return ret;
+	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+		return -ENOSYS;
+	if (sb->s_qcop->quota_on_meta)
+		return sb->s_qcop->quota_on_meta(sb, type, id);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	return sb->s_qcop->quota_on(sb, type, id, path);
 }
 
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 
 /* Copy parameters and call proper function */
 static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
-		       void __user *addr)
+		       void __user *addr, struct path *path)
 {
 	int ret;
 
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 
 	switch (cmd) {
 	case Q_QUOTAON:
-		return quota_quotaon(sb, type, cmd, id, addr);
+		return quota_quotaon(sb, type, cmd, id, path);
 	case Q_QUOTAOFF:
 		if (!sb->s_qcop->quota_off)
 			return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 {
 	uint cmds, type;
 	struct super_block *sb = NULL;
+	struct path path, *pathp = NULL;
 	int ret;
 
 	cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 		return -ENODEV;
 	}
 
+	/*
+	 * Path for quotaon has to be resolved before grabbing superblock
+	 * because that gets s_umount sem which is also possibly needed by path
+	 * resolution (think about autofs) and thus deadlocks could arise.
+	 */
+	if (cmds == Q_QUOTAON) {
+		ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
+		if (ret)
+			pathp = ERR_PTR(ret);
+		else
+			pathp = &path;
+	}
+
 	sb = quotactl_block(special);
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
 
-	ret = do_quotactl(sb, type, cmds, id, addr);
+	ret = do_quotactl(sb, type, cmds, id, addr, pathp);
 
 	drop_super(sb);
+	if (pathp && !IS_ERR(pathp))
+		path_put(pathp);
 	return ret;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2575682a9ead..0aab04f46827 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -632,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *);
+static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -2048,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name)
+			     struct path *path)
 {
 	int err;
-	struct path path;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
+	if (path->mnt->mnt_sb != sb) {
 		err = -EXDEV;
 		goto out;
 	}
-	inode = path.dentry->d_inode;
+	inode = path->dentry->d_inode;
 	/* We must not pack tails for quota files on reiserfs for quota IO to work */
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
@@ -2082,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	/* Journaling quota? */
 	if (REISERFS_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			reiserfs_warning(sb, "super-6521",
 				 "Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
@@ -2101,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on(sb, type, format_id, path);
 out:
-	path_put(&path);
 	return err;
 }
 
-- 
cgit v1.2.2


From 8a0eebf66e3b1deae036553ba641a9c2bdbae678 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 13 Jan 2011 14:15:50 -0500
Subject: NFS: Fix NFSv3 exclusive open semantics

Commit c0204fd2b8fe047b18b67e07e1bf2a03691240cd (NFS: Clean up
nfs4_proc_create()) broke NFSv3 exclusive open by removing the code
that passes the O_EXCL flag down to nfs3_proc_create(). This patch
reverts that offending hunk from the original commit.

Reported-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org    [2.6.37]
Tested-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/dir.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95b081bc9e25..64ee240f3c80 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
 	struct iattr attr;
 	int error;
+	int open_flags = 0;
 
 	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
 
-	error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+	if ((nd->flags & LOOKUP_CREATE) != 0)
+		open_flags = nd->intent.open.flags;
+
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
 	if (error != 0)
 		goto out_err;
 	return 0;
-- 
cgit v1.2.2


From 81bb8debd0d570dc67dc1e9d8b612632cb941893 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 9 Dec 2010 02:02:29 +0000
Subject: Squashfs: add XZ compression support

Add support for reading file systems compressed with the
XZ compression algorithm.

This patch adds the XZ decompressor wrapper code.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/squashfs_fs.h |   1 +
 fs/squashfs/xz_wrapper.c  | 153 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 fs/squashfs/xz_wrapper.c

(limited to 'fs')

diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
 #define ZLIB_COMPRESSION	1
 #define LZMA_COMPRESSION	2
 #define LZO_COMPRESSION		3
+#define XZ_COMPRESSION		4
 
 struct squashfs_super_block {
 	__le32			s_magic;
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..856756ca5ee4
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,153 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xz_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/xz.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+
+struct squashfs_xz {
+	struct xz_dec *state;
+	struct xz_buf buf;
+};
+
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
+{
+	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+
+	struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+
+	stream->state = xz_dec_init(XZ_PREALLOC, block_size);
+	if (stream->state == NULL)
+		goto failed;
+
+	return stream;
+
+failed:
+	ERROR("Failed to allocate xz workspace\n");
+	kfree(stream);
+	return NULL;
+}
+
+
+static void squashfs_xz_free(void *strm)
+{
+	struct squashfs_xz *stream = strm;
+
+	if (stream) {
+		xz_dec_end(stream->state);
+		kfree(stream);
+	}
+}
+
+
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+	struct buffer_head **bh, int b, int offset, int length, int srclength,
+	int pages)
+{
+	enum xz_ret xz_err;
+	int avail, total = 0, k = 0, page = 0;
+	struct squashfs_xz *stream = msblk->stream;
+
+	mutex_lock(&msblk->read_data_mutex);
+
+	xz_dec_reset(stream->state);
+	stream->buf.in_pos = 0;
+	stream->buf.in_size = 0;
+	stream->buf.out_pos = 0;
+	stream->buf.out_size = PAGE_CACHE_SIZE;
+	stream->buf.out = buffer[page++];
+
+	do {
+		if (stream->buf.in_pos == stream->buf.in_size && k < b) {
+			avail = min(length, msblk->devblksize - offset);
+			length -= avail;
+			wait_on_buffer(bh[k]);
+			if (!buffer_uptodate(bh[k]))
+				goto release_mutex;
+
+			if (avail == 0) {
+				offset = 0;
+				put_bh(bh[k++]);
+				continue;
+			}
+
+			stream->buf.in = bh[k]->b_data + offset;
+			stream->buf.in_size = avail;
+			stream->buf.in_pos = 0;
+			offset = 0;
+		}
+
+		if (stream->buf.out_pos == stream->buf.out_size
+							&& page < pages) {
+			stream->buf.out = buffer[page++];
+			stream->buf.out_pos = 0;
+			total += PAGE_CACHE_SIZE;
+		}
+
+		xz_err = xz_dec_run(stream->state, &stream->buf);
+
+		if (stream->buf.in_pos == stream->buf.in_size && k < b)
+			put_bh(bh[k++]);
+	} while (xz_err == XZ_OK);
+
+	if (xz_err != XZ_STREAM_END) {
+		ERROR("xz_dec_run error, data probably corrupt\n");
+		goto release_mutex;
+	}
+
+	if (k < b) {
+		ERROR("xz_uncompress error, input remaining\n");
+		goto release_mutex;
+	}
+
+	total += stream->buf.out_pos;
+	mutex_unlock(&msblk->read_data_mutex);
+	return total;
+
+release_mutex:
+	mutex_unlock(&msblk->read_data_mutex);
+
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_xz_comp_ops = {
+	.init = squashfs_xz_init,
+	.free = squashfs_xz_free,
+	.decompress = squashfs_xz_uncompress,
+	.id = XZ_COMPRESSION,
+	.name = "xz",
+	.supported = 1
+};
-- 
cgit v1.2.2


From 7a43ae523744c01b6187013e781f44c2281c579c Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 9 Dec 2010 02:08:31 +0000
Subject: Squashfs: Add XZ compression configuration option

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/Kconfig        | 15 +++++++++++++++
 fs/squashfs/Makefile       |  1 +
 fs/squashfs/decompressor.c |  7 +++++++
 fs/squashfs/decompressor.h |  5 +++++
 4 files changed, 28 insertions(+)

(limited to 'fs')

diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..50cc4edbca06 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -53,6 +53,21 @@ config SQUASHFS_LZO
 
 	  If unsure, say N.
 
+config SQUASHFS_XZ
+	bool "Include support for XZ compressed file systems"
+	depends on SQUASHFS
+	select XZ_DEC
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with XZ compresssion.  XZ gives better compression than
+	  the default zlib compression, at the expense of greater CPU and
+	  memory overhead.
+
+	  XZ is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 	bool "Additional option for memory-constrained systems"
 	depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..482d78197811 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -46,6 +46,12 @@ static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
 };
 #endif
 
+#ifndef CONFIG_SQUASHFS_XZ
+static const struct squashfs_decompressor squashfs_xz_comp_ops = {
+	NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+};
+#endif
+
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 	NULL, NULL, NULL, 0, "unknown", 0
 };
@@ -58,6 +64,7 @@ static const struct squashfs_decompressor *decompressor[] = {
 #else
 	&squashfs_lzo_unsupported_comp_ops,
 #endif
+	&squashfs_xz_comp_ops,
 	&squashfs_unknown_comp_ops
 };
 
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..57e1acb4c6a9 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,9 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
 	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
 		length, srclength, pages);
 }
+
+#ifdef CONFIG_SQUASHFS_XZ
+extern const struct squashfs_decompressor squashfs_xz_comp_ops;
+#endif
+
 #endif
-- 
cgit v1.2.2


From 170cf02165272dfe026eba183563bad973ca4f05 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Wed, 5 Jan 2011 17:52:26 +0000
Subject: Squashfs: remove unnecessary variable in zlib_wrapper

Get rid of unnecessary bytes variable, and remove redundant
initialisation of zlib_err.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/zlib_wrapper.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..1f4833b87ea3 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -66,8 +66,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 	struct buffer_head **bh, int b, int offset, int length, int srclength,
 	int pages)
 {
-	int zlib_err = 0, zlib_init = 0;
-	int avail, bytes, k = 0, page = 0;
+	int zlib_err, zlib_init = 0;
+	int k = 0, page = 0;
 	z_stream *stream = msblk->stream;
 
 	mutex_lock(&msblk->read_data_mutex);
@@ -75,11 +75,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 	stream->avail_out = 0;
 	stream->avail_in = 0;
 
-	bytes = length;
 	do {
 		if (stream->avail_in == 0 && k < b) {
-			avail = min(bytes, msblk->devblksize - offset);
-			bytes -= avail;
+			int avail = min(length, msblk->devblksize - offset);
+			length -= avail;
 			wait_on_buffer(bh[k]);
 			if (!buffer_uptodate(bh[k]))
 				goto release_mutex;
-- 
cgit v1.2.2


From e7ee11f0ecd587caed0063c5f68ca20fef699f32 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Wed, 5 Jan 2011 18:02:37 +0000
Subject: Squashfs: add missing check in zlib_wrapper

On file system corruption zlib can return Z_STREAM_OK with
input buffers remaining, which will not be released.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/zlib_wrapper.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 1f4833b87ea3..ab5801f66e26 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -127,6 +127,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		goto release_mutex;
 	}
 
+	if (k < b) {
+		ERROR("zlib_uncompress error, data remaining\n");
+		goto release_mutex;
+	}
+
 	length = stream->total_out;
 	mutex_unlock(&msblk->read_data_mutex);
 	return length;
-- 
cgit v1.2.2


From 6197fd86789a28760f8375b5ae8885cd7258042f Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Wed, 5 Jan 2011 18:15:58 +0000
Subject: Squashfs: get rid of default n in Kconfig

As pointed out by Geert Uytterhoeven, "default n" is the default,
no reason to specify it.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/Kconfig | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 50cc4edbca06..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
 config SQUASHFS_XATTR
 	bool "Squashfs XATTR support"
 	depends on SQUASHFS
-	default n
 	help
 	  Saying Y here includes support for extended attributes (xattrs).
 	  Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
 config SQUASHFS_LZO
 	bool "Include support for LZO compressed file systems"
 	depends on SQUASHFS
-	default n
 	select LZO_DECOMPRESS
 	help
 	  Saying Y here includes support for reading Squashfs file systems
@@ -71,7 +69,6 @@ config SQUASHFS_XZ
 config SQUASHFS_EMBEDDED
 	bool "Additional option for memory-constrained systems"
 	depends on SQUASHFS
-	default n
 	help
 	  Saying Y here allows you to specify cache size.
 
-- 
cgit v1.2.2


From 8fcd97216f45b1691f8f91f35cc108d06e0bfca8 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 6 Jan 2011 06:08:50 +0000
Subject: Squashfs: move squashfs_i() definition from squashfs.h

Move squashfs_i() definition out of squashfs.h, this eliminates
the need to #include squashfs_fs_i.h from numerous files.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/block.c         | 1 -
 fs/squashfs/cache.c         | 1 -
 fs/squashfs/decompressor.c  | 1 -
 fs/squashfs/fragment.c      | 1 -
 fs/squashfs/id.c            | 1 -
 fs/squashfs/lzo_wrapper.c   | 1 -
 fs/squashfs/squashfs.h      | 5 -----
 fs/squashfs/squashfs_fs_i.h | 6 ++++++
 fs/squashfs/xattr_id.c      | 1 -
 fs/squashfs/zlib_wrapper.c  | 1 -
 10 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..2fb2882f0fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 
 /*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 482d78197811..50b22d330cec 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "decompressor.h"
 #include "squashfs.h"
 
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 
 /*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 
 /*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..18f187fb486b 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
 
 #define WARNING(s, args...)	pr_warning("SQUASHFS: "s, ## args)
 
-static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
-{
-	return list_entry(inode, struct squashfs_inode_info, vfs_inode);
-}
-
 /* block.c */
 extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
 				int, int);
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
 	};
 	struct inode	vfs_inode;
 };
+
+
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+	return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+}
 #endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c32..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "xattr.h"
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index ab5801f66e26..818a5e063faf 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
-- 
cgit v1.2.2


From 01a678c5a2f41663b8faf03d17e2bbdbf44158a9 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Wed, 5 Jan 2011 18:23:53 +0000
Subject: Squashfs: simplify CONFIG_SQUASHFS_LZO handling

Get rid of messy repeated #if(n)def CONFIG_SQUASHFS_LZO code
in decompressor.c

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/decompressor.c | 8 ++------
 fs/squashfs/decompressor.h | 4 ++++
 fs/squashfs/squashfs.h     | 3 ---
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 50b22d330cec..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,7 +40,7 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
 };
 
 #ifndef CONFIG_SQUASHFS_LZO
-static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
 	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
 #endif
@@ -57,13 +57,9 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 
 static const struct squashfs_decompressor *decompressor[] = {
 	&squashfs_zlib_comp_ops,
-	&squashfs_lzma_unsupported_comp_ops,
-#ifdef CONFIG_SQUASHFS_LZO
 	&squashfs_lzo_comp_ops,
-#else
-	&squashfs_lzo_unsupported_comp_ops,
-#endif
 	&squashfs_xz_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
 	&squashfs_unknown_comp_ops
 };
 
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 57e1acb4c6a9..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -57,4 +57,8 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
 extern const struct squashfs_decompressor squashfs_xz_comp_ops;
 #endif
 
+#ifdef CONFIG_SQUASHFS_LZO
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
+#endif
+
 #endif
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 18f187fb486b..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -99,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
 
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
-
-/* lzo_wrapper.c */
-extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
-- 
cgit v1.2.2


From 1c1266bb916e6a6b362d3be95f2cc7f3c41277a6 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 12 Jan 2011 16:53:27 -0800
Subject: ceph: fix getattr on directory when using norbytes

The norbytes mount option was broken, and when doing getattr
on a directory it return the rbytes instead of the number of
entities. This commit fixes it.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e791fa34b23d..50001de66c69 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -701,10 +701,6 @@ static int fill_inode(struct inode *inode,
 			ci->i_ceph_flags |= CEPH_I_COMPLETE;
 			ci->i_max_offset = 2;
 		}
-
-		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
-			inode->i_size = ci->i_rbytes;
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -1805,7 +1801,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		else
 			stat->dev = 0;
 		if (S_ISDIR(inode->i_mode)) {
-			stat->size = ci->i_rbytes;
+			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+						RBYTES))
+				stat->size = ci->i_rbytes;
+			else
+				stat->size = ci->i_files + ci->i_subdirs;
 			stat->blocks = 0;
 			stat->blksize = 65536;
 		}
-- 
cgit v1.2.2


From 17db143fc091238c43ab9f373974ca2224a4c3f8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 13 Jan 2011 15:27:29 -0800
Subject: ceph: fix xattr rbtree search

Fix xattr name comparison in rbtree search for strings that share a prefix.
The *name argument is null terminated, but the xattr name is not, so we
need to use strncmp, but that means adjusting for the case where name is
a prefix of xattr->name.

The corresponding case in __set_xattr() already handles this properly
(although in that case *name is also not null terminated).

Reported-by: Sergiy Kibrik <sakib@meta.ua>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f79..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
+	int name_len = strlen(name);
 	int c;
 
 	p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
 		parent = *p;
 		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
 		c = strncmp(name, xattr->name, xattr->name_len);
+		if (c == 0 && name_len > xattr->name_len)
+			c = 1;
 		if (c < 0)
 			p = &(*p)->rb_left;
 		else if (c > 0)
-- 
cgit v1.2.2


From 6254b32b5791e47ba1c679d023f26985fa34755a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 13 Jan 2011 17:19:38 -0800
Subject: ecryptfs: fix broken build

Stephen Rothwell reports that the vfs merge broke the build of ecryptfs.
The breakage comes from commit 66cb76666d69 ("sanitize ecryptfs
->mount()") which was obviously not even build tested. Tssk, tssk, Al.

This is the minimal build fixup for the situation, although I don't have
a filesystem to actually test it with.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/main.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9ed476906327..d3b28abdd6aa 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	return rc;
 }
 
-static inode *ecryptfs_get_inode(struct inode *lower_inode,
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
 		       struct super_block *sb)
 {
 	struct inode *inode;
 	int rc = 0;
 
-	lower_inode = lower_dentry->d_inode;
 	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
 		rc = -EXDEV;
 		goto out;
@@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 {
 	struct inode *lower_inode = lower_dentry->d_inode;
 	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
-	if (IS_ERR(inode)
+	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
 		d_add(dentry, inode);
-- 
cgit v1.2.2


From 6585027a5e8cb490e3a761b2f3f3c3acf722aff2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 13 Jan 2011 15:45:44 -0800
Subject: writeback: integrated background writeback work

Check whether background writeback is needed after finishing each work.

When bdi flusher thread finishes doing some work check whether any kind of
background writeback needs to be done (either because
dirty_background_ratio is exceeded or because we need to start flushing
old inodes).  If so, just do background write back.

This way, bdi_start_background_writeback() just needs to wake up the
flusher thread.  It will do background writeback as soon as there is no
other work.

This is a preparatory patch for the next patch which stops background
writeback as soon as there is other work to do.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 61 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..3a07f6d8bc0b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
 	return list_entry(head, struct inode, i_wb_list);
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-		struct wb_writeback_work *work)
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
 {
-	trace_writeback_queue(bdi, work);
-
-	spin_lock_bh(&bdi->wb_lock);
-	list_add_tail(&work->list, &bdi->work_list);
 	if (bdi->wb.task) {
 		wake_up_process(bdi->wb.task);
 	} else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 		 * The bdi thread isn't there, wake up the forker thread which
 		 * will create and run it.
 		 */
-		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
 	}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+			   struct wb_writeback_work *work)
+{
+	trace_writeback_queue(bdi, work);
+
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (!bdi->wb.task)
+		trace_writeback_nothread(bdi, work);
+	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		bool range_cyclic, bool for_background)
+		      bool range_cyclic)
 {
 	struct wb_writeback_work *work;
 
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
-	work->for_background = for_background;
 
 	bdi_queue_work(bdi, work);
 }
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, false);
+	__bdi_start_writeback(bdi, nr_pages, true);
 }
 
 /**
@@ -152,13 +158,20 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
  * @bdi: the backing device to write from
  *
  * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
- *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for given BDI
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	__bdi_start_writeback(bdi, LONG_MAX, true, true);
+	/*
+	 * We just wake up the flusher thread. It will perform background
+	 * writeback as soon as there is no other work to do.
+	 */
+	spin_lock_bh(&bdi->wb_lock);
+	bdi_wakeup_flusher(bdi);
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 /*
@@ -718,6 +731,23 @@ static unsigned long get_nr_dirty_pages(void)
 		get_nr_dirty_inodes();
 }
 
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+	if (over_bground_thresh()) {
+
+		struct wb_writeback_work work = {
+			.nr_pages	= LONG_MAX,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_background	= 1,
+			.range_cyclic	= 1,
+		};
+
+		return wb_writeback(wb, &work);
+	}
+
+	return 0;
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
 	unsigned long expired;
@@ -787,6 +817,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	wrote += wb_check_background_flush(wb);
 	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
@@ -873,7 +904,7 @@ void wakeup_flusher_threads(long nr_pages)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, false);
+		__bdi_start_writeback(bdi, nr_pages, false);
 	}
 	rcu_read_unlock();
 }
-- 
cgit v1.2.2


From 71927e84e0aebfbe5a91565c3b207af25a4e9162 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 13 Jan 2011 15:45:46 -0800
Subject: writeback: trace wakeup event for background writeback

This tracks when balance_dirty_pages() tries to wakeup the flusher thread
for background writeback (if it was not started already).

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3a07f6d8bc0b..482de0a92ca7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -169,6 +169,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	 * We just wake up the flusher thread. It will perform background
 	 * writeback as soon as there is no other work to do.
 	 */
+	trace_writeback_wake_background(bdi);
 	spin_lock_bh(&bdi->wb_lock);
 	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
-- 
cgit v1.2.2


From aa373cf550994623efb5d49a4d8775bafd10bbc1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 13 Jan 2011 15:45:47 -0800
Subject: writeback: stop background/kupdate works from livelocking other works

Background writeback is easily livelockable in a loop in wb_writeback() by
a process continuously re-dirtying pages (or continuously appending to a
file).  This is in fact intended as the target of background writeback is
to write dirty pages it can find as long as we are over
dirty_background_threshold.

But the above behavior gets inconvenient at times because no other work
queued in the flusher thread's queue gets processed.  In particular, since
e.g.  sync(1) relies on flusher thread to do all the IO for it, sync(1)
can hang forever waiting for flusher thread to do the work.

Generally, when a flusher thread has some work queued, someone submitted
the work to achieve a goal more specific than what background writeback
does.  Moreover by working on the specific work, we also reduce amount of
dirty pages which is exactly the target of background writeout.  So it
makes sense to give specific work a priority over a generic page cleaning.

Thus we interrupt background writeback if there is some other work to do.
We return to the background writeback after completing all the queued
work.

This may delay the writeback of expired inodes for a while, however the
expired inodes will eventually be flushed to disk as long as the other
works won't livelock.

[fengguang.wu@intel.com: update comment]
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Jens Axboe <axboe@kernel.dk>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 482de0a92ca7..9e72d04e706e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -650,6 +650,16 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (work->nr_pages <= 0)
 			break;
 
+		/*
+		 * Background writeout and kupdate-style writeback may
+		 * run forever. Stop them if there is other work to do
+		 * so that e.g. sync can proceed. They'll be restarted
+		 * after the other works are all done.
+		 */
+		if ((work->for_background || work->for_kupdate) &&
+		    !list_empty(&wb->bdi->work_list))
+			break;
+
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
-- 
cgit v1.2.2


From b9543dac5bbc4aef0a598965b6b34f6259ab9a9b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 13 Jan 2011 15:45:48 -0800
Subject: writeback: avoid livelocking WB_SYNC_ALL writeback

When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
usually set to LONG_MAX.  The logic in wb_writeback() then calls
__writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and we
easily end up with non-positive nr_to_write after the function returns, if
the inode has more than MAX_WRITEBACK_PAGES dirty pages at the moment.

When nr_to_write is <= 0 wb_writeback() decides we need another round of
writeback but this is wrong in some cases!  For example when a single
large file is continuously dirtied, we would never finish syncing it
because each pass would be able to write MAX_WRITEBACK_PAGES and inode
dirty timestamp never gets updated (as inode is never completely clean).
Thus __writeback_inodes_sb() would write the redirtied inode again and
again.

Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode.  We
do not need nr_to_write in WB_SYNC_ALL mode anyway since
write_cache_pages() does livelock avoidance using page tagging in
WB_SYNC_ALL mode.

This makes wb_writeback() call __writeback_inodes_sb() only once on
WB_SYNC_ALL.  The latter function won't livelock because it works on

- a finite set of files by doing queue_io() once at the beginning
- a finite set of pages by PAGECACHE_TAG_TOWRITE page tagging

After this patch, program from http://lkml.org/lkml/2010/10/24/154 is no
longer able to stall sync forever.

[fengguang.wu@intel.com: fix locking comment]
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9e72d04e706e..e8063c938dd2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -630,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	long write_chunk;
 	struct inode *inode;
 
 	if (wbc.for_kupdate) {
@@ -642,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          __writeback_inodes_sb()     <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                   (quickly) tag currently dirty pages
+	 *                   (maybe slowly) sync all tagged pages
+	 */
+	if (wbc.sync_mode == WB_SYNC_NONE)
+		write_chunk = MAX_WRITEBACK_PAGES;
+	else
+		write_chunk = LONG_MAX;
+
 	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
@@ -668,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 			break;
 
 		wbc.more_io = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
 		trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -678,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 			writeback_inodes_wb(wb, &wbc);
 		trace_wbc_writeback_written(&wbc, wb->bdi);
 
-		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
 
 		/*
 		 * If we consumed everything, see if we have more
@@ -694,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		/*
 		 * Did we write something? Try for more
 		 */
-		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+		if (wbc.nr_to_write < write_chunk)
 			continue;
 		/*
 		 * Nothing written. Wait for some inode to
-- 
cgit v1.2.2


From c691b9d983d7015d54057034f4cd9b6d8affd976 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 13 Jan 2011 15:45:48 -0800
Subject: sync_inode_metadata: fix comment

Use correct function name, remove incorrect apostrophe

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e8063c938dd2..05aab263e9aa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1303,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 EXPORT_SYMBOL(sync_inode);
 
 /**
- * sync_inode - write an inode to disk
+ * sync_inode_metadata - write an inode to disk
  * @inode: the inode to sync
  * @wait: wait for I/O to complete.
  *
- * Write an inode to disk and adjust it's dirty state after completion.
+ * Write an inode to disk and adjust its dirty state after completion.
  *
  * Note: only writes the actual inode, no associated data or other metadata.
  */
-- 
cgit v1.2.2


From c32b0d4b3f19c2f5d29568f8b7b72b61693f1277 Mon Sep 17 00:00:00 2001
From: Hai Shan <haishan.bai@gmail.com>
Date: Thu, 13 Jan 2011 15:45:51 -0800
Subject: fs/mpage.c: consolidate code

Merge mpage_end_io_read() and mpage_end_io_write() into mpage_end_io() to
eliminate code duplication.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Hai Shan <shan.hai@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/mpage.c | 49 +++++++++++++++++--------------------------------
 1 file changed, 17 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
-static void mpage_end_io_read(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
-
-		if (uptodate) {
-			SetPageUptodate(page);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-		unlock_page(page);
-	} while (bvec >= bio->bi_io_vec);
-	bio_put(bio);
-}
-
-static void mpage_end_io_write(struct bio *bio, int err)
-{
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
-	do {
-		struct page *page = bvec->bv_page;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (!uptodate){
-			SetPageError(page);
-			if (page->mapping)
-				set_bit(AS_EIO, &page->mapping->flags);
+		if (bio_data_dir(bio) == READ) {
+			if (uptodate) {
+				SetPageUptodate(page);
+			} else {
+				ClearPageUptodate(page);
+				SetPageError(page);
+			}
+			unlock_page(page);
+		} else { /* bio_data_dir(bio) == WRITE */
+			if (!uptodate) {
+				SetPageError(page);
+				if (page->mapping)
+					set_bit(AS_EIO, &page->mapping->flags);
+			}
+			end_page_writeback(page);
 		}
-		end_page_writeback(page);
 	} while (bvec >= bio->bi_io_vec);
 	bio_put(bio);
 }
 
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
-	bio->bi_end_io = mpage_end_io_read;
-	if (rw == WRITE)
-		bio->bi_end_io = mpage_end_io_write;
+	bio->bi_end_io = mpage_end_io;
 	submit_bio(rw, bio);
 	return NULL;
 }
-- 
cgit v1.2.2


From 2d90508f638241a2e7422d884767398296ebe720 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Thu, 13 Jan 2011 15:45:53 -0800
Subject: mm: smaps: export mlock information

Currently there is no way to find whether a process has locked its pages
in memory or not.  And which of the memory regions are locked in memory.

Add a new field "Locked" to export this information via the smaps file.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3755bd8dd3e..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Anonymous:      %8lu kB\n"
 		   "Swap:           %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
-		   "MMUPageSize:    %8lu kB\n",
+		   "MMUPageSize:    %8lu kB\n"
+		   "Locked:         %8lu kB\n",
 		   (vma->vm_end - vma->vm_start) >> 10,
 		   mss.resident >> 10,
 		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.anonymous >> 10,
 		   mss.swap >> 10,
 		   vma_kernel_pagesize(vma) >> 10,
-		   vma_mmu_pagesize(vma) >> 10);
+		   vma_mmu_pagesize(vma) >> 10,
+		   (vma->vm_flags & VM_LOCKED) ?
+			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
-- 
cgit v1.2.2


From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Thu, 13 Jan 2011 15:46:05 -0800
Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down

We'd like to be able to oom_score_adj a process up/down as it
enters/leaves the foreground.  Currently, it is not possible to oom_adj
down without CAP_SYS_RESOURCE.  This patch allows a task to decrease its
oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to
or its inherited value at fork.  Assuming the thread that has forked it
has oom_score_adj of 0, each process could decrease it back from 0 upon
activation unless a CAP_SYS_RESOURCE thread elevated it to something
higher.

Alternative considered:

* a setuid binary
* a daemon with CAP_SYS_RESOURCE

Since you don't wan't all processes to be able to reduce their oom_adj, a
setuid or daemon implementation would be complex.  The alternatives also
have much higher overhead.

This patch updated from original patch based on feedback from David
Rientjes.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f1cdd5d3d7..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto err_task_lock;
 	}
 
-	if (oom_score_adj < task->signal->oom_score_adj &&
+	if (oom_score_adj < task->signal->oom_score_adj_min &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 			atomic_dec(&task->mm->oom_disable_count);
 	}
 	task->signal->oom_score_adj = oom_score_adj;
+	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+		task->signal->oom_score_adj_min = oom_score_adj;
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
-- 
cgit v1.2.2


From 79134171df238171daa4c024a42b77b401ccb00b Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:58 -0800
Subject: thp: transparent hugepage vmstat

Add hugepage stat information to /proc/vmstat and /proc/meminfo.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -100,6 +100,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"VmallocChunk:   %8lu kB\n"
 #ifdef CONFIG_MEMORY_FAILURE
 		"HardwareCorrupted: %5lu kB\n"
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		"AnonHugePages:  %8lu kB\n"
 #endif
 		,
 		K(i.totalram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
 		K(global_page_state(NR_WRITEBACK)),
-		K(global_page_state(NR_ANON_PAGES)),
+		K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		  + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+		  HPAGE_PMD_NR
+#endif
+		  ),
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SHMEM)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -150,6 +158,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		vmi.largest_chunk >> 10
 #ifdef CONFIG_MEMORY_FAILURE
 		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+		   HPAGE_PMD_NR)
 #endif
 		);
 
-- 
cgit v1.2.2


From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:47:00 -0800
Subject: thp: remove PG_buddy

PG_buddy can be converted to _mapcount == -2.  So the PG_compound_lock can
be added to page->flags without overflowing (because of the sparse section
bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y.  This also
has to move the memory hotplug code from _mapcount to lru.next to avoid
any risk of clashes.  We can't use lru.next for PG_buddy removal, but
memory hotplug can use lru.next even more easily than the mapcount
instead.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/page.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index b06c674624e6..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
 	if (PageHuge(page))
 		u |= 1 << KPF_HUGE;
 
-	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
-
 	/*
-	 * Caveats on high order pages:
-	 * PG_buddy will only be set on the head page; SLUB/SLQB do the same
-	 * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+	 * Caveats on high order pages: page->_count will only be set
+	 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+	 * SLOB won't set PG_slab at all on compound pages.
 	 */
+	if (PageBuddy(page))
+		u |= 1 << KPF_BUDDY;
+
+	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
+
 	u |= kpf_copy_bit(k, KPF_SLAB,		PG_slab);
-	u |= kpf_copy_bit(k, KPF_BUDDY,		PG_buddy);
 
 	u |= kpf_copy_bit(k, KPF_ERROR,		PG_error);
 	u |= kpf_copy_bit(k, KPF_DIRTY,		PG_dirty);
-- 
cgit v1.2.2


From cb9ef8d5e394f70db64bda79c20d3569a20d2574 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Thu, 13 Jan 2011 15:47:26 -0800
Subject: fs/fs-writeback.c: fix sync_inodes_sb() return value kernel-doc

The sync_inodes_sb() function does not have a return value.  Remove the
outdated documentation comment.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 05aab263e9aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1225,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
  * @sb: the superblock
  *
  * This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-- 
cgit v1.2.2


From 9ee1ba5402e9d35fb35f8e61c968f4987b5fb443 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 13 Jan 2011 17:08:19 -0500
Subject: nfsd4: initialize cb_per_client

Otherwise a callback that is aborted before it runs will result in a
list_del on an uninitialized list head.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f1d9dd45553a..209e186386a0 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -866,6 +866,7 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
 	cb->cb_ops = &nfsd4_cb_recall_ops;
 	dp->dl_retries = 1;
 
+	INIT_LIST_HEAD(&cb->cb_per_client);
 	cb->cb_done = true;
 
 	run_nfsd4_cb(&dp->dl_recall);
-- 
cgit v1.2.2


From 9ce137eee4febaabca81143be07d4205d2bd52d4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 11 Jan 2011 14:07:12 -0500
Subject: nfsd: don't support msnfs export option

We've long had these pointless #ifdef MSNFS's sprinkled throughout the
code--pointless because MSNFS is always defined (and we give no config
option to make that easy to change).  So we could just remove the
ifdef's and compile the resulting code unconditionally.

But as long as we're there: why not just rip out this code entirely?
The only purpose is to implement the "msnfs" export option which turns
on Windows-like behavior in some cases, and:

	- the export option isn't documented anywhere;
	- the userland utilities (which would need to be able to parse
	  "msnfs" in an export file) don't support it;
	- I don't know how to maintain this, as I don't know what the
	  proper behavior is; and
	- google shows no evidence that anyone has ever used this.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c |  4 ----
 fs/nfsd/vfs.c    | 41 ++---------------------------------------
 2 files changed, 2 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6d..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
-#define MSNFS	/* HACK HACK */
 /*
  * NFS exporting and validation.
  *
@@ -1444,9 +1443,6 @@ static struct flags {
 	{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
 	{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
 	{ NFSEXP_V4ROOT, {"v4root", ""}},
-#ifdef MSNFS
-	{ NFSEXP_MSNFS, {"msnfs", ""}},
-#endif
 	{ 0, {"", ""}}
 };
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b991125ce4a5..0a01e2fc5dda 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
-#define MSNFS	/* HACK HACK */
 /*
  * File operations used by nfsd. Some of these have been ripped from
  * other parts of the kernel because they weren't exported, others
@@ -875,15 +874,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
 	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
 
-static inline int svc_msnfs(struct svc_fh *ffhp)
-{
-#ifdef MSNFS
-	return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
-#else
-	return 0;
-#endif
-}
-
 static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -896,9 +886,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	err = nfserr_perm;
 	inode = file->f_path.dentry->d_inode;
 
-	if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
-		goto out;
-
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
 			.len		= 0,
@@ -923,7 +910,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		fsnotify_access(file);
 	} else 
 		err = nfserrno(host_err);
-out:
 	return err;
 }
 
@@ -988,14 +974,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	int			stable = *stablep;
 	int			use_wgather;
 
-#ifdef MSNFS
-	err = nfserr_perm;
-
-	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-		(!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
-		goto out;
-#endif
-
 	dentry = file->f_path.dentry;
 	inode = dentry->d_inode;
 	exp   = fhp->fh_export;
@@ -1046,7 +1024,6 @@ out_nfserr:
 		err = 0;
 	else
 		err = nfserrno(host_err);
-out:
 	return err;
 }
 
@@ -1751,13 +1728,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if (ndentry == trap)
 		goto out_dput_new;
 
-	if (svc_msnfs(ffhp) &&
-		((atomic_read(&odentry->d_count) > 1)
-		 || (atomic_read(&ndentry->d_count) > 1))) {
-			host_err = -EPERM;
-			goto out_dput_new;
-	}
-
 	host_err = -EXDEV;
 	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
 		goto out_dput_new;
@@ -1836,17 +1806,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (host_err)
 		goto out_nfserr;
 
-	if (type != S_IFDIR) { /* It's UNLINK */
-#ifdef MSNFS
-		if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-			(atomic_read(&rdentry->d_count) > 1)) {
-			host_err = -EPERM;
-		} else
-#endif
+	if (type != S_IFDIR)
 		host_err = vfs_unlink(dirp, rdentry);
-	} else { /* It's RMDIR */
+	else
 		host_err = vfs_rmdir(dirp, rdentry);
-	}
 
 	dput(rdentry);
 
-- 
cgit v1.2.2


From 6a76bebefe15d9a08864f824d7f8d5beaf37c997 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 11 Jan 2011 12:54:39 -0500
Subject: nfsd4: break lease on nfsd setattr

Leases (delegations) should really be broken on any metadata change, not
just on size change.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0a01e2fc5dda..f97d4356431b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -374,14 +374,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 				goto out;
 		}
 
-		/*
-		 * If we are changing the size of the file, then
-		 * we need to break all leases.
-		 */
-		host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
-		if (host_err) /* ENOMEM or EWOULDBLOCK */
-			goto out_nfserr;
-
 		host_err = get_write_access(inode);
 		if (host_err)
 			goto out_nfserr;
@@ -422,7 +414,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 
 	err = nfserr_notsync;
 	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
+		host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
+		if (host_err)
+			goto out_nfserr;
 		fh_lock(fhp);
+
 		host_err = notify_change(dentry, iap);
 		err = nfserrno(host_err);
 		fh_unlock(fhp);
-- 
cgit v1.2.2


From 4795bb37effb7b8fe77e2d2034545d062d3788a8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 11 Jan 2011 13:55:46 -0500
Subject: nfsd: break lease on unlink, link, and rename

Any change to any of the links pointing to an entry should also break
delegations.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f97d4356431b..a110adbb3673 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -272,6 +272,13 @@ out:
 	return err;
 }
 
+static int nfsd_break_lease(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	return break_lease(inode, O_WRONLY | O_NONBLOCK);
+}
+
 /*
  * Commit metadata changes to stable storage.
  */
@@ -414,7 +421,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 
 	err = nfserr_notsync;
 	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
-		host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
+		host_err = nfsd_break_lease(inode);
 		if (host_err)
 			goto out_nfserr;
 		fh_lock(fhp);
@@ -1639,6 +1646,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		err = nfserrno(host_err);
 		goto out_dput;
 	}
+	err = nfserr_noent;
+	if (!dold->d_inode)
+		goto out_drop_write;
+	host_err = nfsd_break_lease(dold->d_inode);
+	if (host_err)
+		goto out_drop_write;
 	host_err = vfs_link(dold, dirp, dnew);
 	if (!host_err) {
 		err = nfserrno(commit_metadata(ffhp));
@@ -1650,6 +1663,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		else
 			err = nfserrno(host_err);
 	}
+out_drop_write:
 	mnt_drop_write(tfhp->fh_export->ex_path.mnt);
 out_dput:
 	dput(dnew);
@@ -1731,15 +1745,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if (host_err)
 		goto out_dput_new;
 
+	host_err = nfsd_break_lease(odentry->d_inode);
+	if (host_err)
+		goto out_drop_write;
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!host_err) {
 		host_err = commit_metadata(tfhp);
 		if (!host_err)
 			host_err = commit_metadata(ffhp);
 	}
-
+out_drop_write:
 	mnt_drop_write(ffhp->fh_export->ex_path.mnt);
-
  out_dput_new:
 	dput(ndentry);
  out_dput_old:
@@ -1802,11 +1818,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (host_err)
 		goto out_nfserr;
 
+	host_err = nfsd_break_lease(rdentry->d_inode);
+	if (host_err)
+		goto out_put;
 	if (type != S_IFDIR)
 		host_err = vfs_unlink(dirp, rdentry);
 	else
 		host_err = vfs_rmdir(dirp, rdentry);
-
+out_put:
 	dput(rdentry);
 
 	if (!host_err)
-- 
cgit v1.2.2


From bb20c18db6fbb5e6ba499c76473a487d35073467 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 14 Jan 2011 02:35:53 +0000
Subject: fs: force_reval_path drop rcu-walk before d_invalidate

d_revalidate can return in rcu-walk mode even when it returns 0.  We can't just
call any old dcache function on rcu-walk dentry (the dentry is unstable, so
even through d_lock can safely be taken, the result may no longer be what we
expect -- careful re-checks would be required). So just drop rcu in this case.

(I missed this conversion when switching to the rcu-walk convention that Linus
suggested)

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/namei.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 19433cdba011..0f02359ce685 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -583,6 +583,13 @@ void release_open_intent(struct nameidata *nd)
 		fput(nd->intent.open.file);
 }
 
+/*
+ * Call d_revalidate and handle filesystems that request rcu-walk
+ * to be dropped. This may be called and return in rcu-walk mode,
+ * regardless of success or error. If -ECHILD is returned, the caller
+ * must return -ECHILD back up the path walk stack so path walk may
+ * be restarted in ref-walk mode.
+ */
 static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	int status;
@@ -673,6 +680,9 @@ force_reval_path(struct path *path, struct nameidata *nd)
 		return 0;
 
 	if (!status) {
+		/* Don't d_invalidate in rcu-walk mode */
+		if (nameidata_drop_rcu(nd))
+			return -ECHILD;
 		d_invalidate(dentry);
 		status = -ESTALE;
 	}
-- 
cgit v1.2.2


From 90dbb77ba48dddb87445d238e84cd137cf97dd98 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 14 Jan 2011 02:36:19 +0000
Subject: fs: fix dropping of rcu-walk from force_reval_path

As J. R. Okajima noted, force_reval_path passes in the same dentry to
d_revalidate as the one in the nameidata structure (other callers pass in a
child), so the locking breaks. This can oops with a chrooted nfs mount, for
example. Similarly there can be other problems with revalidating a dentry
which is already in nameidata of the path walk.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/namei.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0f02359ce685..14c73edca9ce 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -479,6 +479,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
 
+	/*
+	 * It can be possible to revalidate the dentry that we started
+	 * the path walk with. force_reval_path may also revalidate the
+	 * dentry already committed to the nameidata.
+	 */
+	if (unlikely(parent == dentry))
+		return nameidata_drop_rcu(nd);
+
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 	if (nd->root.mnt) {
 		spin_lock(&fs->lock);
-- 
cgit v1.2.2


From 657e94b673a805b427903c5628e95348235fad06 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 14 Jan 2011 02:48:39 +0000
Subject: nfs: add missing rcu-walk check

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/nfs/dir.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d33da530097a..a0d8320bed9c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1410,11 +1410,15 @@ no_open:
 static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct dentry *parent = NULL;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	struct inode *dir;
 	struct nfs_open_context *ctx;
 	int openflags, ret = 0;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
 	if (!is_atomic_open(nd) || d_mountpoint(dentry))
 		goto no_open;
 
-- 
cgit v1.2.2


From f20877d94a74557b7c28b4ed8920d834c31e0ea5 Mon Sep 17 00:00:00 2001
From: "J. R. Okajima" <hooanon05@yahoo.co.jp>
Date: Fri, 14 Jan 2011 03:56:04 +0000
Subject: fs: fix do_last error case when need_reval_dot

When open(2) without O_DIRECTORY opens an existing dir, it should return
EISDIR. In do_last(), the variable 'error' is initialized EISDIR, but it
is changed by d_revalidate() which returns any positive to represent
'the target dir is valid.'

Should we keep and return the initialized 'error' in this case.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/namei.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 14c73edca9ce..bc24894c5f14 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2122,11 +2122,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		dir = nd->path.dentry;
 	case LAST_DOT:
 		if (need_reval_dot(dir)) {
-			error = d_revalidate(nd->path.dentry, nd);
-			if (!error)
-				error = -ESTALE;
-			if (error < 0)
+			int status = d_revalidate(nd->path.dentry, nd);
+			if (!status)
+				status = -ESTALE;
+			if (status < 0) {
+				error = status;
 				goto exit;
+			}
 		}
 		/* fallthrough */
 	case LAST_ROOT:
-- 
cgit v1.2.2


From 7b9337aaf98f9941d0927a75217d3ff31afec609 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 14 Jan 2011 08:42:43 +0000
Subject: fs: namei fix ->put_link on wrong inode in do_filp_open

J. R. Okajima noticed that ->put_link is being attempted on the
wrong inode, and suggested the way to fix it. I changed it a bit
according to Al's suggestion to keep an explicit link path around.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/namei.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index bc24894c5f14..9cda4c452a6d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -779,7 +779,8 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
 		mntput(path->mnt);
 }
 
-static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
+static inline void path_to_nameidata(const struct path *path,
+					struct nameidata *nd)
 {
 	if (!(nd->flags & LOOKUP_RCU)) {
 		dput(nd->path.dentry);
@@ -791,20 +792,20 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 }
 
 static __always_inline int
-__do_follow_link(struct path *path, struct nameidata *nd, void **p)
+__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 {
 	int error;
-	struct dentry *dentry = path->dentry;
+	struct dentry *dentry = link->dentry;
 
-	touch_atime(path->mnt, dentry);
+	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
-	if (path->mnt != nd->path.mnt) {
-		path_to_nameidata(path, nd);
+	if (link->mnt != nd->path.mnt) {
+		path_to_nameidata(link, nd);
 		nd->inode = nd->path.dentry->d_inode;
 		dget(dentry);
 	}
-	mntget(path->mnt);
+	mntget(link->mnt);
 
 	nd->last_type = LAST_BIND;
 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -2347,11 +2348,12 @@ reval:
 	nd.flags = flags;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
-		struct path holder;
+		struct path link = path;
+		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
 		error = -ELOOP;
 		/* S_ISDIR part is a temporary automount kludge */
-		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
+		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(linki->i_mode))
 			goto exit_dput;
 		if (count++ == 32)
 			goto exit_dput;
@@ -2367,23 +2369,22 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
-		error = security_inode_follow_link(path.dentry, &nd);
+		error = security_inode_follow_link(link.dentry, &nd);
 		if (error)
 			goto exit_dput;
-		error = __do_follow_link(&path, &nd, &cookie);
+		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
-			if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
-				nd.inode->i_op->put_link(path.dentry, &nd, cookie);
+			if (!IS_ERR(cookie) && linki->i_op->put_link)
+				linki->i_op->put_link(link.dentry, &nd, cookie);
 			/* nd.path had been dropped */
-			nd.path = path;
+			nd.path = link;
 			goto out_path;
 		}
-		holder = path;
 		nd.flags &= ~LOOKUP_PARENT;
 		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		if (nd.inode->i_op->put_link)
-			nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
-		path_put(&holder);
+		if (linki->i_op->put_link)
+			linki->i_op->put_link(link.dentry, &nd, cookie);
+		path_put(&link);
 	}
 out:
 	if (nd.root.mnt)
-- 
cgit v1.2.2


From ba28b93a5227cc69ec811507f7d85ac25fa20fe2 Mon Sep 17 00:00:00 2001
From: Akshat Aranya <aranya@nec-labs.com>
Date: Fri, 14 Jan 2011 16:00:47 +0000
Subject: FS-Cache: Fix operation handling

fscache_submit_exclusive_op() adds an operation to the pending list if
other operations are pending.  Fix the check for pending ops as n_ops
must be greater than 0 at the point it is checked as it is incremented
immediately before under lock.

Signed-off-by: Akshat Aranya <aranya@nec-labs.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fscache/operation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
 
-		if (object->n_ops > 0) {
+		if (object->n_ops > 1) {
 			atomic_inc(&op->usage);
 			list_add_tail(&op->pend_link, &object->pending_ops);
 			fscache_stat(&fscache_n_op_pend);
-- 
cgit v1.2.2


From 0ad53eeefcbb2620b6a71ffdaad4add20b450b8b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Jan 2011 15:56:37 +0000
Subject: afs: add afs_wq and use it instead of the system workqueue

flush_scheduled_work() is going away.  afs needs to make sure all the
works it has queued have finished before being unloaded and there can
be arbitrary number of pending works.  Add afs_wq and use it as the
flush domain instead of the system workqueue.

Also, convert cancel_delayed_work() + flush_scheduled_work() to
cancel_delayed_work_sync() in afs_mntpt_kill_timer().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Cc: linux-afs@lists.infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/cmservice.c | 12 ++++++------
 fs/afs/internal.h  |  1 +
 fs/afs/main.c      | 13 +++++++++++--
 fs/afs/mntpt.c     | 11 +++++------
 fs/afs/rxrpc.c     |  2 +-
 fs/afs/server.c    | 13 +++++++------
 fs/afs/vlocation.c | 14 +++++++-------
 7 files changed, 38 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 	call->server = server;
 
 	INIT_WORK(&call->work, SRXAFSCB_CallBack);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 	call->server = server;
 
 	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
 	call->server = server;
 
 	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_Probe);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ab6db5abaf53..58c633b80246 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -577,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
 /*
  * main.c
  */
+extern struct workqueue_struct *afs_wq;
 extern struct afs_uuid afs_uuid;
 
 /*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
 struct afs_uuid afs_uuid;
+struct workqueue_struct *afs_wq;
 
 /*
  * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
+	/* create workqueue */
+	ret = -ENOMEM;
+	afs_wq = alloc_workqueue("afs", 0, 0);
+	if (!afs_wq)
+		return ret;
+
 	/* register the /proc stuff */
 	ret = afs_proc_init();
 	if (ret < 0)
-		return ret;
+		goto error_proc;
 
 #ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
 error_cache:
 #endif
 	afs_proc_cleanup();
+error_proc:
+	destroy_workqueue(afs_wq);
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
 	return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
 	afs_purge_servers();
 	afs_callback_update_kill();
 	afs_vlocation_purge();
-	flush_scheduled_work();
+	destroy_workqueue(afs_wq);
 	afs_cell_purge();
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf57..e83c0336e7b5 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -268,8 +268,8 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 		path_put(&nd->path);
 		nd->path.mnt = newmnt;
 		nd->path.dentry = dget(newmnt->mnt_root);
-		schedule_delayed_work(&afs_mntpt_expiry_timer,
-				      afs_mntpt_expiry_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+				   afs_mntpt_expiry_timeout * HZ);
 		break;
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
@@ -295,8 +295,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
 
 	if (!list_empty(&afs_vfsmounts)) {
 		mark_mounts_for_expiry(&afs_vfsmounts);
-		schedule_delayed_work(&afs_mntpt_expiry_timer,
-				      afs_mntpt_expiry_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+				   afs_mntpt_expiry_timeout * HZ);
 	}
 
 	_leave("");
@@ -310,6 +310,5 @@ void afs_mntpt_kill_timer(void)
 	_enter("");
 
 	ASSERT(list_empty(&afs_vfsmounts));
-	cancel_delayed_work(&afs_mntpt_expiry_timer);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
 	if (!call) {
 		/* its an incoming call for our callback service */
 		skb_queue_tail(&afs_incoming_calls, skb);
-		schedule_work(&afs_collect_incoming_call_work);
+		queue_work(afs_wq, &afs_collect_incoming_call_work);
 	} else {
 		/* route the messages directly to the appropriate call */
 		skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
 	if (atomic_read(&server->usage) == 0) {
 		list_move_tail(&server->grave, &afs_server_graveyard);
 		server->time_of_death = get_seconds();
-		schedule_delayed_work(&afs_server_reaper,
-				      afs_server_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_server_reaper,
+				   afs_server_timeout * HZ);
 	}
 	spin_unlock(&afs_server_graveyard_lock);
 	_leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
 		expiry = server->time_of_death + afs_server_timeout;
 		if (expiry > now) {
 			delay = (expiry - now) * HZ;
-			if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+			if (!queue_delayed_work(afs_wq, &afs_server_reaper,
+						delay)) {
 				cancel_delayed_work(&afs_server_reaper);
-				schedule_delayed_work(&afs_server_reaper,
-						      delay);
+				queue_delayed_work(afs_wq, &afs_server_reaper,
+						   delay);
 			}
 			break;
 		}
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
 {
 	afs_server_timeout = 0;
 	cancel_delayed_work(&afs_server_reaper);
-	schedule_delayed_work(&afs_server_reaper, 0);
+	queue_delayed_work(afs_wq, &afs_server_reaper, 0);
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
 		_debug("buried");
 		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
 		vl->time_of_death = get_seconds();
-		schedule_delayed_work(&afs_vlocation_reap,
-				      afs_vlocation_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_vlocation_reap,
+				   afs_vlocation_timeout * HZ);
 
 		/* suspend updates on this record */
 		if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
 		if (expiry > now) {
 			delay = (expiry - now) * HZ;
 			_debug("delay %lu", delay);
-			if (!schedule_delayed_work(&afs_vlocation_reap,
-						   delay)) {
+			if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
+						delay)) {
 				cancel_delayed_work(&afs_vlocation_reap);
-				schedule_delayed_work(&afs_vlocation_reap,
-						      delay);
+				queue_delayed_work(afs_wq, &afs_vlocation_reap,
+						   delay);
 			}
 			break;
 		}
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
 	destroy_workqueue(afs_vlocation_update_worker);
 
 	cancel_delayed_work(&afs_vlocation_reap);
-	schedule_delayed_work(&afs_vlocation_reap, 0);
+	queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
 }
 
 /*
-- 
cgit v1.2.2


From 49731baa41df404c2c3f44555869ab387363af43 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Jan 2011 18:43:57 +0100
Subject: block: restore multiple bd_link_disk_holder() support

Commit e09b457b (block: simplify holder symlink handling) incorrectly
assumed that there is only one link at maximum.  dm may use multiple
links and expects block layer to track reference count for each link,
which is different from and unrelated to the exclusive device holder
identified by @holder when the device is opened.

Remove the single holder assumption and automatic removal of the link
and revive the per-link reference count tracking.  The code
essentially behaves the same as before commit e09b457b sans the
unnecessary kobject reference count dancing.

While at it, note that this facility should not be used by anyone else
than the current ones.  Sysfs symlinks shouldn't be abused like this
and the whole thing doesn't belong in the block layer at all.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Milan Broz <mbroz@redhat.com>
Cc: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Neil Brown <neilb@suse.de>
Cc: linux-raid@vger.kernel.org
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/block_dev.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 75 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe3f59c14a02..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -432,6 +432,9 @@ static void init_once(void *foo)
 	mutex_init(&bdev->bd_mutex);
 	INIT_LIST_HEAD(&bdev->bd_inodes);
 	INIT_LIST_HEAD(&bdev->bd_list);
+#ifdef CONFIG_SYSFS
+	INIT_LIST_HEAD(&bdev->bd_holder_disks);
+#endif
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
@@ -779,6 +782,23 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 }
 
 #ifdef CONFIG_SYSFS
+struct bd_holder_disk {
+	struct list_head	list;
+	struct gendisk		*disk;
+	int			refcnt;
+};
+
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
+						  struct gendisk *disk)
+{
+	struct bd_holder_disk *holder;
+
+	list_for_each_entry(holder, &bdev->bd_holder_disks, list)
+		if (holder->disk == disk)
+			return holder;
+	return NULL;
+}
+
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	return sysfs_create_link(from, to, kobject_name(to));
@@ -794,6 +814,8 @@ static void del_symlink(struct kobject *from, struct kobject *to)
  * @bdev: the claimed slave bdev
  * @disk: the holding disk
  *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
  * This functions creates the following sysfs symlinks.
  *
  * - from "slaves" directory of the holder @disk to the claimed @bdev
@@ -817,47 +839,83 @@ static void del_symlink(struct kobject *from, struct kobject *to)
  */
 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 {
+	struct bd_holder_disk *holder;
 	int ret = 0;
 
 	mutex_lock(&bdev->bd_mutex);
 
-	WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk);
+	WARN_ON_ONCE(!bdev->bd_holder);
 
 	/* FIXME: remove the following once add_disk() handles errors */
 	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
 		goto out_unlock;
 
-	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
-	if (ret)
+	holder = bd_find_holder_disk(bdev, disk);
+	if (holder) {
+		holder->refcnt++;
 		goto out_unlock;
+	}
 
-	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
-	if (ret) {
-		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+	if (!holder) {
+		ret = -ENOMEM;
 		goto out_unlock;
 	}
 
-	bdev->bd_holder_disk = disk;
+	INIT_LIST_HEAD(&holder->list);
+	holder->disk = disk;
+	holder->refcnt = 1;
+
+	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	if (ret)
+		goto out_free;
+
+	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+	if (ret)
+		goto out_del;
+
+	list_add(&holder->list, &bdev->bd_holder_disks);
+	goto out_unlock;
+
+out_del:
+	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+out_free:
+	kfree(holder);
 out_unlock:
 	mutex_unlock(&bdev->bd_mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
 
-static void bd_unlink_disk_holder(struct block_device *bdev)
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the calimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 {
-	struct gendisk *disk = bdev->bd_holder_disk;
+	struct bd_holder_disk *holder;
 
-	bdev->bd_holder_disk = NULL;
-	if (!disk)
-		return;
+	mutex_lock(&bdev->bd_mutex);
 
-	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
-	del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+	holder = bd_find_holder_disk(bdev, disk);
+
+	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+		del_symlink(bdev->bd_part->holder_dir,
+			    &disk_to_dev(disk)->kobj);
+		list_del_init(&holder->list);
+		kfree(holder);
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
 }
-#else
-static inline void bd_unlink_disk_holder(struct block_device *bdev)
-{ }
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
 #endif
 
 /**
@@ -1380,7 +1438,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 		 * unblock evpoll if it was a write holder.
 		 */
 		if (bdev_free) {
-			bd_unlink_disk_holder(bdev);
 			if (bdev->bd_write_holder) {
 				disk_unblock_events(bdev->bd_disk);
 				bdev->bd_write_holder = false;
-- 
cgit v1.2.2


From 56c24305d1494a7e345c75669dc60e8b231b735b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:25 -0500
Subject: cifs: cFYI the entire error code in map_smb_to_linux_error

We currently only print the DOS error part.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/netmisc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..6783ce6cdc89 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	}
 	/* else ERRHRD class errors or junk  - return EIO */
 
-	cFYI(1, "Mapping smb error code %d to POSIX err %d",
-		 smberrcode, rc);
+	cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+		 le32_to_cpu(smb->Status.CifsError), rc);
 
 	/* generic corrective action e.g. reconnect SMB session on
 	 * ERRbaduid could be added */
-- 
cgit v1.2.2


From bd7633195581c7665ce9dd80c665ec93466d1b64 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 10:33:24 -0500
Subject: cifs: add cruid= mount option

In commit 3e4b3e1f we separated the "uid" mount option such that it
no longer determined the owner of the credential cache by default. When
we did this, we added a new option to cifs.upcall (--legacy-uid) to
try to make it so that it would behave the same was as it did before.

This ignored a rather important point -- the kernel has no way to know
what options are being passed to cifs.upcall, so it doesn't know what
uid it should use to determine whether to match an existing krb5 session.

The simplest solution is to simply add a new "cruid=" mount option that
only governs the uid owner of the credential cache for the mount.

Unfortunately, this means that the --legacy-uid option in cifs.upcall was
ill-considered and is now useless, but I don't see a better way to deal
with this.

A patch for the mount.cifs manpage will follow once this patch has been
accepted.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a65d311d163a..9f59887badd2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1113,6 +1113,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (!strnicmp(data, "uid", 3) && value && *value) {
 			vol->linux_uid = simple_strtoul(value, &value, 0);
 			uid_specified = true;
+		} else if (!strnicmp(data, "cruid", 5) && value && *value) {
+			vol->cred_uid = simple_strtoul(value, &value, 0);
 		} else if (!strnicmp(data, "forceuid", 8)) {
 			override_uid = 1;
 		} else if (!strnicmp(data, "noforceuid", 10)) {
-- 
cgit v1.2.2


From a8f2800b4f7b76cecb7209cb6a7d2b14904fc711 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 14 Jan 2011 14:25:48 -0500
Subject: nfsd4: fix callback restarting

Ensure a new callback is added to the client's list of callbacks at most
once.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 209e186386a0..ae93c5c83e87 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -639,9 +639,12 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 		if (!nfsd41_cb_get_slot(clp, task))
 			return;
 	}
-	cb->cb_done = false;
 	spin_lock(&clp->cl_lock);
-	list_add(&cb->cb_per_client, &clp->cl_callbacks);
+	if (list_empty(&cb->cb_per_client)) {
+		/* This is the first call, not a restart */
+		cb->cb_done = false;
+		list_add(&cb->cb_per_client, &clp->cl_callbacks);
+	}
 	spin_unlock(&clp->cl_lock);
 	rpc_call_start(task);
 }
@@ -678,10 +681,10 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 
 	nfsd4_cb_done(task, calldata);
 
-	if (current_rpc_client == NULL) {
-		/* We're shutting down; give up. */
-		/* XXX: err, or is it ok just to fall through
-		 * and rpc_restart_call? */
+	if (current_rpc_client != task->tk_client) {
+		/* We're shutting down or changing cl_cb_client; leave
+		 * it to nfsd4_process_cb_update to restart the call if
+		 * necessary. */
 		return;
 	}
 
@@ -699,12 +702,6 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 	default:
 		/* Network partition? */
 		nfsd4_mark_cb_down(clp, task->tk_status);
-		if (current_rpc_client != task->tk_client) {
-			/* queue a callback on the new connection: */
-			atomic_inc(&dp->dl_count);
-			run_nfsd4_cb(&dp->dl_recall);
-			return;
-		}
 	}
 	if (dp->dl_retries--) {
 		rpc_delay(task, 2*HZ);
-- 
cgit v1.2.2


From 6f7f7caab259026234277b659485d22c1dcb1ab4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 14 Jan 2011 13:26:18 -0800
Subject: Turn d_set_d_op() BUG_ON() into WARN_ON_ONCE()

It's indicative of a real problem, and it actually triggers with
autofs4, but the BUG_ON() is excessive.  The autofs4 case is being fixed
(to only set d_op in the ->lookup method) but not merged yet.  In the
meantime this gets the code limping along.

Reported-by: Alex Elder <aelder@sgi.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0c6d5c549d84..274a22250380 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1357,8 +1357,8 @@ EXPORT_SYMBOL(d_alloc_name);
 
 void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 {
-	BUG_ON(dentry->d_op);
-	BUG_ON(dentry->d_flags & (DCACHE_OP_HASH	|
+	WARN_ON_ONCE(dentry->d_op);
+	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
 				DCACHE_OP_COMPARE	|
 				DCACHE_OP_REVALIDATE	|
 				DCACHE_OP_DELETE ));
-- 
cgit v1.2.2


From 1a8edf40e7c3eee955e0dd0316a7c9d85e36f597 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 15 Jan 2011 13:12:53 -0500
Subject: do_lookup() fix

do_lookup() has a path leading from LOOKUP_RCU case to non-RCU
crossing of mountpoints, which breaks things badly.  If we
hit need_revalidate: and do nothing in there, we need to come
back into LOOKUP_RCU half of things, not to done: in non-RCU
one.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8df7a78ace58..529e917ad2fc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1089,6 +1089,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 		nd->seq = seq;
 		if (dentry->d_flags & DCACHE_OP_REVALIDATE)
 			goto need_revalidate;
+done2:
 		path->mnt = mnt;
 		path->dentry = dentry;
 		__follow_mount_rcu(nd, path, inode);
@@ -1143,6 +1144,8 @@ need_revalidate:
 		goto need_lookup;
 	if (IS_ERR(dentry))
 		goto fail;
+	if (nd->flags & LOOKUP_RCU)
+		goto done2;
 	goto done;
 
 fail:
-- 
cgit v1.2.2


From 9875cf806403fae66b2410a3c2cc820d97731e04 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:45:21 +0000
Subject: Add a dentry op to handle automounting rather than abusing
 follow_link()

Add a dentry op (d_automount) to handle automounting directories rather than
abusing the follow_link() inode operation.  The operation is keyed off a new
dentry flag (DCACHE_NEED_AUTOMOUNT).

This also makes it easier to add an AT_ flag to suppress terminal segment
automount during pathwalk and removes the need for the kludge code in the
pathwalk algorithm to handle directories with follow_link() semantics.

The ->d_automount() dentry operation:

	struct vfsmount *(*d_automount)(struct path *mountpoint);

takes a pointer to the directory to be mounted upon, which is expected to
provide sufficient data to determine what should be mounted.  If successful, it
should return the vfsmount struct it creates (which it should also have added
to the namespace using do_add_mount() or similar).  If there's a collision with
another automount attempt, NULL should be returned.  If the directory specified
by the parameter should be used directly rather than being mounted upon,
-EISDIR should be returned.  In any other case, an error code should be
returned.

The ->d_automount() operation is called with no locks held and may sleep.  At
this point the pathwalk algorithm will be in ref-walk mode.

Within fs/namei.c itself, a new pathwalk subroutine (follow_automount()) is
added to handle mountpoints.  It will return -EREMOTE if the automount flag was
set, but no d_automount() op was supplied, -ELOOP if we've encountered too many
symlinks or mountpoints, -EISDIR if the walk point should be used without
mounting and 0 if successful.  The path will be updated to point to the mounted
filesystem if a successful automount took place.

__follow_mount() is replaced by follow_managed() which is more generic
(especially with the patch that adds ->d_manage()).  This handles transits from
directories during pathwalk, including automounting and skipping over
mountpoints (and holding processes with the next patch).

__follow_mount_rcu() will jump out of RCU-walk mode if it encounters an
automount point with nothing mounted on it.

follow_dotdot*() does not handle automounts as you don't want to trigger them
whilst following "..".

I've also extracted the mount/don't-mount logic from autofs4 and included it
here.  It makes the mount go ahead anyway if someone calls open() or creat(),
tries to traverse the directory, tries to chdir/chroot/etc. into the directory,
or sticks a '/' on the end of the pathname.  If they do a stat(), however,
they'll only trigger the automount if they didn't also say O_NOFOLLOW.

I've also added an inode flag (S_AUTOMOUNT) so that filesystems can mark their
inodes as automount points.  This flag is automatically propagated to the
dentry as DCACHE_NEED_AUTOMOUNT by __d_instantiate().  This saves NFS and could
save AFS a private flag bit apiece, but is not strictly necessary.  It would be
preferable to do the propagation in d_set_d_op(), but that doesn't normally
have access to the inode.

[AV: fixed breakage in case if __follow_mount_rcu() fails and nameidata_drop_rcu()
succeeds in RCU case of do_lookup(); we need to fall through to non-RCU case after
that, rather than just returning with ungrabbed *path]

Signed-off-by: David Howells <dhowells@redhat.com>
Was-Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c |   5 +-
 fs/namei.c  | 226 +++++++++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 174 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0c6d5c549d84..51f7bb6463af 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1380,8 +1380,11 @@ EXPORT_SYMBOL(d_set_d_op);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
-	if (inode)
+	if (inode) {
+		if (unlikely(IS_AUTOMOUNT(inode)))
+			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
 		list_add(&dentry->d_alias, &inode->i_dentry);
+	}
 	dentry->d_inode = inode;
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
diff --git a/fs/namei.c b/fs/namei.c
index 529e917ad2fc..16109da68bbf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -896,51 +896,120 @@ int follow_up(struct path *path)
 }
 
 /*
- * serialization is taken care of in namespace.c
+ * Perform an automount
+ * - return -EISDIR to tell follow_managed() to stop and return the path we
+ *   were called with.
  */
-static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
-				struct inode **inode)
+static int follow_automount(struct path *path, unsigned flags,
+			    bool *need_mntput)
 {
-	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted;
-		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
-		if (!mounted)
-			return;
-		path->mnt = mounted;
-		path->dentry = mounted->mnt_root;
-		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
-		*inode = path->dentry->d_inode;
+	struct vfsmount *mnt;
+
+	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
+		return -EREMOTE;
+
+	/* We want to mount if someone is trying to open/create a file of any
+	 * type under the mountpoint, wants to traverse through the mountpoint
+	 * or wants to open the mounted directory.
+	 *
+	 * We don't want to mount if someone's just doing a stat and they've
+	 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
+	 * appended a '/' to the name.
+	 */
+	if (!(flags & LOOKUP_FOLLOW) &&
+	    !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
+		       LOOKUP_OPEN | LOOKUP_CREATE)))
+		return -EISDIR;
+
+	current->total_link_count++;
+	if (current->total_link_count >= 40)
+		return -ELOOP;
+
+	mnt = path->dentry->d_op->d_automount(path);
+	if (IS_ERR(mnt)) {
+		/*
+		 * The filesystem is allowed to return -EISDIR here to indicate
+		 * it doesn't want to automount.  For instance, autofs would do
+		 * this so that its userspace daemon can mount on this dentry.
+		 *
+		 * However, we can only permit this if it's a terminal point in
+		 * the path being looked up; if it wasn't then the remainder of
+		 * the path is inaccessible and we should say so.
+		 */
+		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
+			return -EREMOTE;
+		return PTR_ERR(mnt);
 	}
-}
+	if (!mnt) /* mount collision */
+		return 0;
 
-static int __follow_mount(struct path *path)
-{
-	int res = 0;
-	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
-		if (!mounted)
-			break;
-		dput(path->dentry);
-		if (res)
-			mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
-		res = 1;
+	if (mnt->mnt_sb == path->mnt->mnt_sb &&
+	    mnt->mnt_root == path->dentry) {
+		mntput(mnt);
+		return -ELOOP;
 	}
-	return res;
+
+	dput(path->dentry);
+	if (*need_mntput)
+		mntput(path->mnt);
+	path->mnt = mnt;
+	path->dentry = dget(mnt->mnt_root);
+	*need_mntput = true;
+	return 0;
 }
 
-static void follow_mount(struct path *path)
+/*
+ * Handle a dentry that is managed in some way.
+ * - Flagged as mountpoint
+ * - Flagged as automount point
+ *
+ * This may only be called in refwalk mode.
+ *
+ * Serialization is taken care of in namespace.c
+ */
+static int follow_managed(struct path *path, unsigned flags)
 {
-	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
-		if (!mounted)
-			break;
-		dput(path->dentry);
-		mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
+	unsigned managed;
+	bool need_mntput = false;
+	int ret;
+
+	/* Given that we're not holding a lock here, we retain the value in a
+	 * local variable for each dentry as we look at it so that we don't see
+	 * the components of that value change under us */
+	while (managed = ACCESS_ONCE(path->dentry->d_flags),
+	       managed &= DCACHE_MANAGED_DENTRY,
+	       unlikely(managed != 0)) {
+		/* Transit to a mounted filesystem. */
+		if (managed & DCACHE_MOUNTED) {
+			struct vfsmount *mounted = lookup_mnt(path);
+			if (mounted) {
+				dput(path->dentry);
+				if (need_mntput)
+					mntput(path->mnt);
+				path->mnt = mounted;
+				path->dentry = dget(mounted->mnt_root);
+				need_mntput = true;
+				continue;
+			}
+
+			/* Something is mounted on this dentry in another
+			 * namespace and/or whatever was mounted there in this
+			 * namespace got unmounted before we managed to get the
+			 * vfsmount_lock */
+		}
+
+		/* Handle an automount point */
+		if (managed & DCACHE_NEED_AUTOMOUNT) {
+			ret = follow_automount(path, flags, &need_mntput);
+			if (ret < 0)
+				return ret == -EISDIR ? 0 : ret;
+			continue;
+		}
+
+		/* We didn't change the current path point */
+		break;
 	}
+	return 0;
 }
 
 int follow_down(struct path *path)
@@ -958,13 +1027,37 @@ int follow_down(struct path *path)
 	return 0;
 }
 
+/*
+ * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
+ * meet an automount point and we're not walking to "..".  True is returned to
+ * continue, false to abort.
+ */
+static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
+			       struct inode **inode, bool reverse_transit)
+{
+	while (d_mountpoint(path->dentry)) {
+		struct vfsmount *mounted;
+		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+		if (!mounted)
+			break;
+		path->mnt = mounted;
+		path->dentry = mounted->mnt_root;
+		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+		*inode = path->dentry->d_inode;
+	}
+
+	if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+		return reverse_transit;
+	return true;
+}
+
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct inode *inode = nd->inode;
 
 	set_root_rcu(nd);
 
-	while(1) {
+	while (1) {
 		if (nd->path.dentry == nd->root.dentry &&
 		    nd->path.mnt == nd->root.mnt) {
 			break;
@@ -987,12 +1080,28 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 		inode = nd->path.dentry->d_inode;
 	}
-	__follow_mount_rcu(nd, &nd->path, &inode);
+	__follow_mount_rcu(nd, &nd->path, &inode, true);
 	nd->inode = inode;
 
 	return 0;
 }
 
+/*
+ * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
+ */
+static void follow_mount(struct path *path)
+{
+	while (d_mountpoint(path->dentry)) {
+		struct vfsmount *mounted = lookup_mnt(path);
+		if (!mounted)
+			break;
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
+	}
+}
+
 static void follow_dotdot(struct nameidata *nd)
 {
 	set_root(nd);
@@ -1057,12 +1166,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
 	struct inode *dir;
+	int err;
+
 	/*
 	 * See if the low-level filesystem might want
 	 * to use its own hash..
 	 */
 	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-		int err = parent->d_op->d_hash(parent, nd->inode, name);
+		err = parent->d_op->d_hash(parent, nd->inode, name);
 		if (err < 0)
 			return err;
 	}
@@ -1092,20 +1203,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 done2:
 		path->mnt = mnt;
 		path->dentry = dentry;
-		__follow_mount_rcu(nd, path, inode);
-	} else {
-		dentry = __d_lookup(parent, name);
-		if (!dentry)
-			goto need_lookup;
+		if (likely(__follow_mount_rcu(nd, path, inode, false)))
+			return 0;
+		if (nameidata_drop_rcu(nd))
+			return -ECHILD;
+		/* fallthru */
+	}
+	dentry = __d_lookup(parent, name);
+	if (!dentry)
+		goto need_lookup;
 found:
-		if (dentry->d_flags & DCACHE_OP_REVALIDATE)
-			goto need_revalidate;
+	if (dentry->d_flags & DCACHE_OP_REVALIDATE)
+		goto need_revalidate;
 done:
-		path->mnt = mnt;
-		path->dentry = dentry;
-		__follow_mount(path);
-		*inode = path->dentry->d_inode;
-	}
+	path->mnt = mnt;
+	path->dentry = dentry;
+	err = follow_managed(path, nd->flags);
+	if (unlikely(err < 0))
+		return err;
+	*inode = path->dentry->d_inode;
 	return 0;
 
 need_lookup:
@@ -2203,11 +2319,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (open_flag & O_EXCL)
 		goto exit_dput;
 
-	if (__follow_mount(path)) {
-		error = -ELOOP;
-		if (open_flag & O_NOFOLLOW)
-			goto exit_dput;
-	}
+	error = follow_managed(path, nd->flags);
+	if (error < 0)
+		goto exit_dput;
 
 	error = -ENOENT;
 	if (!path->dentry->d_inode)
-- 
cgit v1.2.2


From cc53ce53c86924bfe98a12ea20b7465038a08792 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:45:26 +0000
Subject: Add a dentry op to allow processes to be held during pathwalk transit

Add a dentry op (d_manage) to permit a filesystem to hold a process and make it
sleep when it tries to transit away from one of that filesystem's directories
during a pathwalk.  The operation is keyed off a new dentry flag
(DCACHE_MANAGE_TRANSIT).

The filesystem is allowed to be selective about which processes it holds and
which it permits to continue on or prohibits from transiting from each flagged
directory.  This will allow autofs to hold up client processes whilst letting
its userspace daemon through to maintain the directory or the stuff behind it
or mounted upon it.

The ->d_manage() dentry operation:

	int (*d_manage)(struct path *path, bool mounting_here);

takes a pointer to the directory about to be transited away from and a flag
indicating whether the transit is undertaken by do_add_mount() or
do_move_mount() skipping through a pile of filesystems mounted on a mountpoint.

It should return 0 if successful and to let the process continue on its way;
-EISDIR to prohibit the caller from skipping to overmounted filesystems or
automounting, and to use this directory; or some other error code to return to
the user.

->d_manage() is called with namespace_sem writelocked if mounting_here is true
and no other locks held, so it may sleep.  However, if mounting_here is true,
it may not initiate or wait for a mount or unmount upon the parameter
directory, even if the act is actually performed by userspace.

Within fs/namei.c, follow_managed() is extended to check with d_manage() first
on each managed directory, before transiting away from it or attempting to
automount upon it.

follow_down() is renamed follow_down_one() and should only be used where the
filesystem deliberately intends to avoid management steps (e.g. autofs).

A new follow_down() is added that incorporates the loop done by all other
callers of follow_down() (do_add/move_mount(), autofs and NFSD; whilst AFS, NFS
and CIFS do use it, their use is removed by converting them to use
d_automount()).  The new follow_down() calls d_manage() as appropriate.  It
also takes an extra parameter to indicate if it is being called from mount code
(with namespace_sem writelocked) which it passes to d_manage().  follow_down()
ignores automount points so that it can be used to mount on them.

__follow_mount_rcu() is made to abort rcu-walk mode if it hits a directory with
DCACHE_MANAGE_TRANSIT set on the basis that we're probably going to have to
sleep.  It would be possible to enter d_manage() in rcu-walk mode too, and have
that determine whether to abort or not itself.  That would allow the autofs
daemon to continue on in rcu-walk mode.

Note that DCACHE_MANAGE_TRANSIT on a directory should be cleared when it isn't
required as every tranist from that directory will cause d_manage() to be
invoked.  It can always be set again when necessary.

==========================
WHAT THIS MEANS FOR AUTOFS
==========================

Autofs currently uses the lookup() inode op and the d_revalidate() dentry op to
trigger the automounting of indirect mounts, and both of these can be called
with i_mutex held.

autofs knows that the i_mutex will be held by the caller in lookup(), and so
can drop it before invoking the daemon - but this isn't so for d_revalidate(),
since the lock is only held on _some_ of the code paths that call it.  This
means that autofs can't risk dropping i_mutex from its d_revalidate() function
before it calls the daemon.

The bug could manifest itself as, for example, a process that's trying to
validate an automount dentry that gets made to wait because that dentry is
expired and needs cleaning up:

	mkdir         S ffffffff8014e05a     0 32580  24956
	Call Trace:
	 [<ffffffff885371fd>] :autofs4:autofs4_wait+0x674/0x897
	 [<ffffffff80127f7d>] avc_has_perm+0x46/0x58
	 [<ffffffff8009fdcf>] autoremove_wake_function+0x0/0x2e
	 [<ffffffff88537be6>] :autofs4:autofs4_expire_wait+0x41/0x6b
	 [<ffffffff88535cfc>] :autofs4:autofs4_revalidate+0x91/0x149
	 [<ffffffff80036d96>] __lookup_hash+0xa0/0x12f
	 [<ffffffff80057a2f>] lookup_create+0x46/0x80
	 [<ffffffff800e6e31>] sys_mkdirat+0x56/0xe4

versus the automount daemon which wants to remove that dentry, but can't
because the normal process is holding the i_mutex lock:

	automount     D ffffffff8014e05a     0 32581      1              32561
	Call Trace:
	 [<ffffffff80063c3f>] __mutex_lock_slowpath+0x60/0x9b
	 [<ffffffff8000ccf1>] do_path_lookup+0x2ca/0x2f1
	 [<ffffffff80063c89>] .text.lock.mutex+0xf/0x14
	 [<ffffffff800e6d55>] do_rmdir+0x77/0xde
	 [<ffffffff8005d229>] tracesys+0x71/0xe0
	 [<ffffffff8005d28d>] tracesys+0xd5/0xe0

which means that the system is deadlocked.

This patch allows autofs to hold up normal processes whilst the daemon goes
ahead and does things to the dentry tree behind the automouter point without
risking a deadlock as almost no locks are held in d_manage() and none in
d_automount().

Signed-off-by: David Howells <dhowells@redhat.com>
Was-Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/mntpt.c         |  5 +---
 fs/autofs4/autofs_i.h  | 13 ---------
 fs/autofs4/dev-ioctl.c |  2 +-
 fs/autofs4/expire.c    |  2 +-
 fs/autofs4/root.c      | 11 ++++----
 fs/cifs/cifs_dfs_ref.c |  5 +---
 fs/namei.c             | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/namespace.c         | 14 +++++-----
 fs/nfs/namespace.c     |  5 +---
 fs/nfsd/vfs.c          |  5 ++--
 10 files changed, 91 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index e83c0336e7b5..f3e891d57a2c 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -273,10 +273,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 		break;
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
-		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path))
-			;
-		err = 0;
+		err = follow_down(&nd->path, false);
 	default:
 		mntput(newmnt);
 		break;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0fffe1c24cec..eb67953452bb 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -229,19 +229,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
-static inline int autofs4_follow_mount(struct path *path)
-{
-	int res = 0;
-
-	while (d_mountpoint(path->dentry)) {
-		int followed = follow_down(path);
-		if (!followed)
-			break;
-		res = 1;
-	}
-	return res;
-}
-
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
 {
 	return new_encode_dev(sbi->sb->s_dev);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469a..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (follow_down(&path))
+		if (follow_down_one(&path))
 			magic = path.mnt->mnt_sb->s_magic;
 	}
 
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cc1d01365905..6a930b90d389 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	path_get(&path);
 
-	if (!follow_down(&path))
+	if (!follow_down_one(&path))
 		goto done;
 
 	if (is_autofs4_dentry(path.dentry)) {
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 651e4ef563b1..20225636a4e9 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -234,7 +234,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		nd->flags);
 	/*
 	 * For an expire of a covered direct or offset mount we need
-	 * to break out of follow_down() at the autofs mount trigger
+	 * to break out of follow_down_one() at the autofs mount trigger
 	 * (d_mounted--), so we can see the expiring flag, and manage
 	 * the blocking and following here until the expire is completed.
 	 */
@@ -243,7 +243,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		if (ino->flags & AUTOFS_INF_EXPIRING) {
 			spin_unlock(&sbi->fs_lock);
 			/* Follow down to our covering mount. */
-			if (!follow_down(&nd->path))
+			if (!follow_down_one(&nd->path))
 				goto done;
 			goto follow;
 		}
@@ -292,11 +292,10 @@ follow:
 	 * multi-mount with no root offset so we don't need
 	 * to follow it.
 	 */
-	if (d_mountpoint(dentry)) {
-		if (!autofs4_follow_mount(&nd->path)) {
-			status = -ENOENT;
+	if (d_managed(dentry)) {
+		status = follow_down(&nd->path, false);
+		if (status < 0)
 			goto out_error;
-		}
 	}
 
 done:
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27fd..83479cf63f96 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -273,10 +273,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 		break;
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
-		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path))
-			;
-		err = 0;
+		err = follow_down(&nd->path, false);
 	default:
 		mntput(newmnt);
 		break;
diff --git a/fs/namei.c b/fs/namei.c
index 16109da68bbf..9d3033dc22e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -960,6 +960,7 @@ static int follow_automount(struct path *path, unsigned flags,
 
 /*
  * Handle a dentry that is managed in some way.
+ * - Flagged for transit management (autofs)
  * - Flagged as mountpoint
  * - Flagged as automount point
  *
@@ -979,6 +980,16 @@ static int follow_managed(struct path *path, unsigned flags)
 	while (managed = ACCESS_ONCE(path->dentry->d_flags),
 	       managed &= DCACHE_MANAGED_DENTRY,
 	       unlikely(managed != 0)) {
+		/* Allow the filesystem to manage the transit without i_mutex
+		 * being held. */
+		if (managed & DCACHE_MANAGE_TRANSIT) {
+			BUG_ON(!path->dentry->d_op);
+			BUG_ON(!path->dentry->d_op->d_manage);
+			ret = path->dentry->d_op->d_manage(path->dentry, false);
+			if (ret < 0)
+				return ret == -EISDIR ? 0 : ret;
+		}
+
 		/* Transit to a mounted filesystem. */
 		if (managed & DCACHE_MOUNTED) {
 			struct vfsmount *mounted = lookup_mnt(path);
@@ -1012,7 +1023,7 @@ static int follow_managed(struct path *path, unsigned flags)
 	return 0;
 }
 
-int follow_down(struct path *path)
+int follow_down_one(struct path *path)
 {
 	struct vfsmount *mounted;
 
@@ -1029,14 +1040,19 @@ int follow_down(struct path *path)
 
 /*
  * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
- * meet an automount point and we're not walking to "..".  True is returned to
+ * meet a managed dentry and we're not walking to "..".  True is returned to
  * continue, false to abort.
  */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			       struct inode **inode, bool reverse_transit)
 {
+	unsigned abort_mask =
+		reverse_transit ? 0 : DCACHE_MANAGE_TRANSIT;
+
 	while (d_mountpoint(path->dentry)) {
 		struct vfsmount *mounted;
+		if (path->dentry->d_flags & abort_mask)
+			return true;
 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
 		if (!mounted)
 			break;
@@ -1086,6 +1102,57 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	return 0;
 }
 
+/*
+ * Follow down to the covering mount currently visible to userspace.  At each
+ * point, the filesystem owning that dentry may be queried as to whether the
+ * caller is permitted to proceed or not.
+ *
+ * Care must be taken as namespace_sem may be held (indicated by mounting_here
+ * being true).
+ */
+int follow_down(struct path *path, bool mounting_here)
+{
+	unsigned managed;
+	int ret;
+
+	while (managed = ACCESS_ONCE(path->dentry->d_flags),
+	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
+		/* Allow the filesystem to manage the transit without i_mutex
+		 * being held.
+		 *
+		 * We indicate to the filesystem if someone is trying to mount
+		 * something here.  This gives autofs the chance to deny anyone
+		 * other than its daemon the right to mount on its
+		 * superstructure.
+		 *
+		 * The filesystem may sleep at this point.
+		 */
+		if (managed & DCACHE_MANAGE_TRANSIT) {
+			BUG_ON(!path->dentry->d_op);
+			BUG_ON(!path->dentry->d_op->d_manage);
+			ret = path->dentry->d_op->d_manage(path->dentry, mounting_here);
+			if (ret < 0)
+				return ret == -EISDIR ? 0 : ret;
+		}
+
+		/* Transit to a mounted filesystem. */
+		if (managed & DCACHE_MOUNTED) {
+			struct vfsmount *mounted = lookup_mnt(path);
+			if (!mounted)
+				break;
+			dput(path->dentry);
+			mntput(path->mnt);
+			path->mnt = mounted;
+			path->dentry = dget(mounted->mnt_root);
+			continue;
+		}
+
+		/* Don't handle automount points here */
+		break;
+	}
+	return 0;
+}
+
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
  */
@@ -3530,6 +3597,7 @@ const struct inode_operations page_symlink_inode_operations = {
 };
 
 EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(follow_down_one);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index 3ddfd9046c44..d94ccd6ddafd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1844,9 +1844,10 @@ static int do_move_mount(struct path *path, char *old_name)
 		return err;
 
 	down_write(&namespace_sem);
-	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
-		;
+	err = follow_down(path, true);
+	if (err < 0)
+		goto out;
+
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
 		goto out;
@@ -1940,9 +1941,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
-	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
-		;
+	err = follow_down(path, true);
+	if (err < 0)
+		goto unlock;
+
 	err = -EINVAL;
 	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
 		goto unlock;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 74aaf3963c10..bfcb933e5755 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -176,10 +176,7 @@ out_err:
 	path_put(&nd->path);
 	goto out;
 out_follow:
-	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path))
-		;
-	err = 0;
+	err = follow_down(&nd->path, false);
 	goto out;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 230b79fbf005..0f79e33a65d7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -88,8 +88,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 			    .dentry = dget(dentry)};
 	int err = 0;
 
-	while (d_mountpoint(path.dentry) && follow_down(&path))
-		;
+	err = follow_down(&path, false);
+	if (err < 0)
+		goto out;
 
 	exp2 = rqst_exp_get_by_name(rqstp, &path);
 	if (IS_ERR(exp2)) {
-- 
cgit v1.2.2


From 6f45b65672c8017d5e210e338bb5858a938ef445 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:45:31 +0000
Subject: Add an AT_NO_AUTOMOUNT flag to suppress terminal automount

Add an AT_NO_AUTOMOUNT flag to suppress terminal automounting of automount
point directories.  This can be used by fstatat() users to permit the
gathering of attributes on an automount point and also prevent
mass-automounting of a directory of automount points by ls.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 6 ++++++
 fs/stat.c  | 4 +++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9d3033dc22e9..dc50bfb2f5d6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -908,6 +908,12 @@ static int follow_automount(struct path *path, unsigned flags,
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
 		return -EREMOTE;
 
+	/* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
+	 * and this is the terminal part of the path.
+	 */
+	if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
+		return -EISDIR; /* we actually want to stop here */
+
 	/* We want to mount if someone is trying to open/create a file of any
 	 * type under the mountpoint, wants to traverse through the mountpoint
 	 * or wants to open the mounted directory.
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 	int error = -EINVAL;
 	int lookup_flags = 0;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
 		goto out;
 
 	if (!(flag & AT_SYMLINK_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
+	if (flag & AT_NO_AUTOMOUNT)
+		lookup_flags |= LOOKUP_NO_AUTOMOUNT;
 
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
-- 
cgit v1.2.2


From d18610b0ce9eb48c60649d8fcbf68374c84349d3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 19:04:05 +0000
Subject: AFS: Use d_automount() rather than abusing follow_link()

Make AFS use the new d_automount() dentry operation rather than abusing
follow_link() on directories.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/dir.c      |  1 +
 fs/afs/inode.c    |  3 ++-
 fs/afs/internal.h |  1 +
 fs/afs/mntpt.c    | 44 +++++++++++++++-----------------------------
 4 files changed, 19 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index e6a4ab980e31..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -66,6 +66,7 @@ const struct dentry_operations afs_fs_dentry_operations = {
 	.d_revalidate	= afs_d_revalidate,
 	.d_delete	= afs_d_delete,
 	.d_release	= afs_d_release,
+	.d_automount	= afs_d_automount,
 };
 
 #define AFS_DIR_HASHTBL_SIZE	128
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
 	inode->i_generation	= 0;
 
 	set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
-	inode->i_flags |= S_NOATIME;
+	set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+	inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
 	unlock_new_inode(inode);
 	_leave(" = %p", inode);
 	return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 58c633b80246..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -592,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
 extern const struct inode_operations afs_autocell_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
 
+extern struct vfsmount *afs_d_automount(struct path *);
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
 extern void afs_mntpt_kill_timer(void);
 
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index f3e891d57a2c..d23b2e344a78 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
 				       struct dentry *dentry,
 				       struct nameidata *nd);
 static int afs_mntpt_open(struct inode *inode, struct file *file);
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
 static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
 
 const struct inode_operations afs_mntpt_inode_operations = {
 	.lookup		= afs_mntpt_lookup,
-	.follow_link	= afs_mntpt_follow_link,
 	.readlink	= page_readlink,
 	.getattr	= afs_getattr,
 };
 
 const struct inode_operations afs_autocell_inode_operations = {
-	.follow_link	= afs_mntpt_follow_link,
 	.getattr	= afs_getattr,
 };
 
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 		_debug("symlink is a mountpoint");
 		spin_lock(&vnode->lock);
 		set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+		vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
 		spin_unlock(&vnode->lock);
 	}
 
@@ -238,49 +236,37 @@ error_no_devname:
 }
 
 /*
- * follow a link from a mountpoint directory, thus causing it to be mounted
+ * handle an automount point
  */
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *afs_d_automount(struct path *path)
 {
 	struct vfsmount *newmnt;
 	int err;
 
-	_enter("%p{%s},{%s:%p{%s},}",
-	       dentry,
-	       dentry->d_name.name,
-	       nd->path.mnt->mnt_devname,
-	       dentry,
-	       nd->path.dentry->d_name.name);
-
-	dput(nd->path.dentry);
-	nd->path.dentry = dget(dentry);
+	_enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
 
-	newmnt = afs_mntpt_do_automount(nd->path.dentry);
-	if (IS_ERR(newmnt)) {
-		path_put(&nd->path);
-		return (void *)newmnt;
-	}
+	newmnt = afs_mntpt_do_automount(path->dentry);
+	if (IS_ERR(newmnt))
+		return newmnt;
 
 	mntget(newmnt);
-	err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
+	err = do_add_mount(newmnt, path, MNT_SHRINKABLE, &afs_vfsmounts);
 	switch (err) {
 	case 0:
-		path_put(&nd->path);
-		nd->path.mnt = newmnt;
-		nd->path.dentry = dget(newmnt->mnt_root);
 		queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
 				   afs_mntpt_expiry_timeout * HZ);
-		break;
+		_leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+		return newmnt;
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
-		err = follow_down(&nd->path, false);
+		mntput(newmnt);
+		_leave(" = NULL [EBUSY]");
+		return NULL;
 	default:
 		mntput(newmnt);
-		break;
+		_leave(" = %d", err);
+		return ERR_PTR(err);
 	}
-
-	_leave(" = %d", err);
-	return ERR_PTR(err);
 }
 
 /*
-- 
cgit v1.2.2


From 36d43a43761b004ad1879ac21471d8fc5f3157ec Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:45:42 +0000
Subject: NFS: Use d_automount() rather than abusing follow_link()

Make NFS use the new d_automount() dentry operation rather than abusing
follow_link() on directories.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c       |  4 ++-
 fs/nfs/inode.c     |  4 +--
 fs/nfs/internal.h  |  1 +
 fs/nfs/namespace.c | 84 ++++++++++++++++++++++++++----------------------------
 4 files changed, 46 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index df8c03a02161..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 
-	if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags))
+	if (IS_AUTOMOUNT(inode))
 		return 0;
 	if (nd != NULL) {
 		/* VFS wants an on-the-wire revalidation */
@@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
 	.d_revalidate	= nfs_lookup_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
+	.d_automount	= nfs_d_automount,
 };
 
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1246,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
 	.d_revalidate	= nfs_open_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
+	.d_automount	= nfs_d_automount,
 };
 
 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ce00b704452c..d8512423ba72 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				else
 					inode->i_op = &nfs_mountpoint_inode_operations;
 				inode->i_fop = NULL;
-				set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags);
+				inode->i_flags |= S_AUTOMOUNT;
 			}
 		} else if (S_ISLNK(inode->i_mode))
 			inode->i_op = &nfs_symlink_inode_operations;
@@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/* Update the fsid? */
 	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
 			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
-			!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
+			!IS_AUTOMOUNT(inode))
 		server->fsid = fattr->fsid;
 
 	/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bfa3a34af801..4644f04b4b46 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -252,6 +252,7 @@ extern char *nfs_path(const char *base,
 		      const struct dentry *droot,
 		      const struct dentry *dentry,
 		      char *buffer, ssize_t buflen);
+extern struct vfsmount *nfs_d_automount(struct path *path);
 
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index bfcb933e5755..f3fbb1bf3f18 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -97,9 +97,8 @@ Elong:
 }
 
 /*
- * nfs_follow_mountpoint - handle crossing a mountpoint on the server
- * @dentry - dentry of mountpoint
- * @nd - nameidata info
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
  *
  * When we encounter a mountpoint on the server, we want to set up
  * a mountpoint on the client too, to prevent inode numbers from
@@ -109,84 +108,81 @@ Elong:
  * situation, and that different filesystems may want to use
  * different security flavours.
  */
-static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *nfs_d_automount(struct path *path)
 {
 	struct vfsmount *mnt;
-	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
 	struct dentry *parent;
 	struct nfs_fh *fh = NULL;
 	struct nfs_fattr *fattr = NULL;
 	int err;
 
-	dprintk("--> nfs_follow_mountpoint()\n");
+	dprintk("--> nfs_d_automount()\n");
 
-	err = -ESTALE;
-	if (IS_ROOT(dentry))
-		goto out_err;
+	mnt = ERR_PTR(-ESTALE);
+	if (IS_ROOT(path->dentry))
+		goto out_nofree;
 
-	err = -ENOMEM;
+	mnt = ERR_PTR(-ENOMEM);
 	fh = nfs_alloc_fhandle();
 	fattr = nfs_alloc_fattr();
 	if (fh == NULL || fattr == NULL)
-		goto out_err;
+		goto out;
 
 	dprintk("%s: enter\n", __func__);
-	dput(nd->path.dentry);
-	nd->path.dentry = dget(dentry);
 
-	/* Look it up again */
-	parent = dget_parent(nd->path.dentry);
+	/* Look it up again to get its attributes */
+	parent = dget_parent(path->dentry);
 	err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
-						  &nd->path.dentry->d_name,
+						  &path->dentry->d_name,
 						  fh, fattr);
 	dput(parent);
-	if (err != 0)
-		goto out_err;
+	if (err != 0) {
+		mnt = ERR_PTR(err);
+		goto out;
+	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
+		mnt = nfs_do_refmount(path->mnt, path->dentry);
 	else
-		mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
-				      fattr);
-	err = PTR_ERR(mnt);
+		mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
-		goto out_err;
+		goto out;
 
 	mntget(mnt);
-	err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
+	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE,
 			   &nfs_automount_list);
-	if (err < 0) {
+	switch (err) {
+	case 0:
+		dprintk("%s: done, success\n", __func__);
+		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+		break;
+	case -EBUSY:
+		/* someone else made a mount here whilst we were busy */
 		mntput(mnt);
-		if (err == -EBUSY)
-			goto out_follow;
-		goto out_err;
+		dprintk("%s: done, collision\n", __func__);
+		mnt = NULL;
+		break;
+	default:
+		mntput(mnt);
+		dprintk("%s: done, error %d\n", __func__, err);
+		mnt = ERR_PTR(err);
+		break;
 	}
-	path_put(&nd->path);
-	nd->path.mnt = mnt;
-	nd->path.dentry = dget(mnt->mnt_root);
-	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+
 out:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fh);
-	dprintk("%s: done, returned %d\n", __func__, err);
-
-	dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
-	return ERR_PTR(err);
-out_err:
-	path_put(&nd->path);
-	goto out;
-out_follow:
-	err = follow_down(&nd->path, false);
-	goto out;
+out_nofree:
+	dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
+	return mnt;
 }
 
 const struct inode_operations nfs_mountpoint_inode_operations = {
-	.follow_link	= nfs_follow_mountpoint,
 	.getattr	= nfs_getattr,
 };
 
 const struct inode_operations nfs_referral_inode_operations = {
-	.follow_link	= nfs_follow_mountpoint,
 };
 
 static void nfs_expire_automounts(struct work_struct *work)
-- 
cgit v1.2.2


From 01c64feac45cea1317263eabc4f7ee1b240f297f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:45:47 +0000
Subject: CIFS: Use d_automount() rather than abusing follow_link()

Make CIFS use the new d_automount() dentry operation rather than abusing
follow_link() on directories.

[NOTE: THIS IS UNTESTED!]

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Steve French <sfrench@samba.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/cifs_dfs_ref.c | 131 +++++++++++++++++++++++++------------------------
 fs/cifs/cifsfs.h       |   6 +++
 fs/cifs/dir.c          |   2 +
 fs/cifs/inode.c        |   8 +--
 4 files changed, 80 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83479cf63f96..0fc163808de3 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,32 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 
 }
 
-static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
-				struct list_head *mntlist)
-{
-	/* stolen from afs code */
-	int err;
-
-	mntget(newmnt);
-	err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
-	switch (err) {
-	case 0:
-		path_put(&nd->path);
-		nd->path.mnt = newmnt;
-		nd->path.dentry = dget(newmnt->mnt_root);
-		schedule_delayed_work(&cifs_dfs_automount_task,
-				      cifs_dfs_mountpoint_expiry_timeout);
-		break;
-	case -EBUSY:
-		/* someone else made a mount here whilst we were busy */
-		err = follow_down(&nd->path, false);
-	default:
-		mntput(newmnt);
-		break;
-	}
-	return err;
-}
-
 static void dump_referral(const struct dfs_info3_param *ref)
 {
 	cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -290,45 +264,43 @@ static void dump_referral(const struct dfs_info3_param *ref)
 				ref->path_consumed);
 }
 
-
-static void*
-cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+/*
+ * Create a vfsmount that we can automount
+ */
+static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 {
 	struct dfs_info3_param *referrals = NULL;
 	unsigned int num_referrals = 0;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsSesInfo *ses;
-	char *full_path = NULL;
+	char *full_path;
 	int xid, i;
-	int rc = 0;
-	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	int rc;
+	struct vfsmount *mnt;
 	struct tcon_link *tlink;
 
 	cFYI(1, "in %s", __func__);
-	BUG_ON(IS_ROOT(dentry));
+	BUG_ON(IS_ROOT(mntpt));
 
 	xid = GetXid();
 
-	dput(nd->path.dentry);
-	nd->path.dentry = dget(dentry);
-
 	/*
 	 * The MSDFS spec states that paths in DFS referral requests and
 	 * responses must be prefixed by a single '\' character instead of
 	 * the double backslashes usually used in the UNC. This function
 	 * gives us the latter, so we must adjust the result.
 	 */
-	full_path = build_path_from_dentry(dentry);
-	if (full_path == NULL) {
-		rc = -ENOMEM;
-		goto out_err;
-	}
+	mnt = ERR_PTR(-ENOMEM);
+	full_path = build_path_from_dentry(mntpt);
+	if (full_path == NULL)
+		goto free_xid;
 
-	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+	cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
+	mnt = ERR_PTR(-EINVAL);
 	if (IS_ERR(tlink)) {
-		rc = PTR_ERR(tlink);
-		goto out_err;
+		mnt = ERR_CAST(tlink);
+		goto free_full_path;
 	}
 	ses = tlink_tcon(tlink)->ses;
 
@@ -338,46 +310,77 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 
 	cifs_put_tlink(tlink);
 
+	mnt = ERR_PTR(-ENOENT);
 	for (i = 0; i < num_referrals; i++) {
 		int len;
-		dump_referral(referrals+i);
+		dump_referral(referrals + i);
 		/* connect to a node */
 		len = strlen(referrals[i].node_name);
 		if (len < 2) {
 			cERROR(1, "%s: Net Address path too short: %s",
 					__func__, referrals[i].node_name);
-			rc = -EINVAL;
-			goto out_err;
+			mnt = ERR_PTR(-EINVAL);
+			break;
 		}
 		mnt = cifs_dfs_do_refmount(cifs_sb,
 				full_path, referrals + i);
 		cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
 					referrals[i].node_name, mnt);
-
-		/* complete mount procedure if we accured submount */
 		if (!IS_ERR(mnt))
-			break;
+			goto success;
 	}
 
-	/* we need it cause for() above could exit without valid submount */
-	rc = PTR_ERR(mnt);
-	if (IS_ERR(mnt))
-		goto out_err;
-
-	rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
+	/* no valid submounts were found; return error from get_dfs_path() by
+	 * preference */
+	if (rc != 0)
+		mnt = ERR_PTR(rc);
 
-out:
-	FreeXid(xid);
+success:
 	free_dfs_info_array(referrals, num_referrals);
+free_full_path:
 	kfree(full_path);
+free_xid:
+	FreeXid(xid);
 	cFYI(1, "leaving %s" , __func__);
-	return ERR_PTR(rc);
-out_err:
-	path_put(&nd->path);
-	goto out;
+	return mnt;
+}
+
+/*
+ * Attempt to automount the referral
+ */
+struct vfsmount *cifs_dfs_d_automount(struct path *path)
+{
+	struct vfsmount *newmnt;
+	int err;
+
+	cFYI(1, "in %s", __func__);
+
+	newmnt = cifs_dfs_do_automount(path->dentry);
+	if (IS_ERR(newmnt)) {
+		cFYI(1, "leaving %s [automount failed]" , __func__);
+		return newmnt;
+	}
+
+	mntget(newmnt);
+	err = do_add_mount(newmnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE,
+			   &cifs_dfs_automount_list);
+	switch (err) {
+	case 0:
+		schedule_delayed_work(&cifs_dfs_automount_task,
+				      cifs_dfs_mountpoint_expiry_timeout);
+		cFYI(1, "leaving %s [ok]" , __func__);
+		return newmnt;
+	case -EBUSY:
+		/* someone else made a mount here whilst we were busy */
+		mntput(newmnt);
+		cFYI(1, "leaving %s [EBUSY]" , __func__);
+		return NULL;
+	default:
+		mntput(newmnt);
+		cFYI(1, "leaving %s [error %d]" , __func__, err);
+		return ERR_PTR(err);
+	}
 }
 
 const struct inode_operations cifs_dfs_referral_inode_operations = {
-	.follow_link = cifs_dfs_follow_mountpoint,
 };
-
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b5..851030f74939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -93,6 +93,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
 extern const struct dentry_operations cifs_dentry_ops;
 extern const struct dentry_operations cifs_ci_dentry_ops;
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
+#else
+#define cifs_dfs_d_automount NULL
+#endif
+
 /* Functions related to symlinks */
 extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
 extern void cifs_put_link(struct dentry *direntry,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1e95dd635632..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -675,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 
 const struct dentry_operations cifs_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
+	.d_automount = cifs_dfs_d_automount,
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
 
@@ -711,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 	.d_hash = cifs_ci_hash,
 	.d_compare = cifs_ci_compare,
+	.d_automount = cifs_dfs_d_automount,
 };
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b06b60620240..6c9ee8014ff0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
 #include "fscache.h"
 
 
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -60,7 +60,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 		break;
 	case S_IFDIR:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-		if (is_dfs_referral) {
+		if (IS_AUTOMOUNT(inode)) {
 			inode->i_op = &cifs_dfs_referral_inode_operations;
 		} else {
 #else /* NO DFS support, treat as a directory */
@@ -167,7 +167,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	}
 	spin_unlock(&inode->i_lock);
 
-	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+		inode->i_flags |= S_AUTOMOUNT;
+	cifs_set_ops(inode);
 }
 
 void
-- 
cgit v1.2.2


From db3729153e82ba3ada89681f26c4f1b6d6807a80 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:45:53 +0000
Subject: Remove the automount through follow_link() kludge code from pathwalk

Remove the automount through follow_link() kludge code from pathwalk in favour
of using d_automount().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index dc50bfb2f5d6..61995fba4e21 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1341,17 +1341,6 @@ fail:
 	return PTR_ERR(dentry);
 }
 
-/*
- * This is a temporary kludge to deal with "automount" symlinks; proper
- * solution is to trigger them on follow_mount(), so that do_lookup()
- * would DTRT.  To be killed before 2.6.34-final.
- */
-static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
-{
-	return inode && unlikely(inode->i_op->follow_link) &&
-		((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
-}
-
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1490,7 +1479,8 @@ last_component:
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-		if (follow_on_final(inode, lookup_flags)) {
+		if (inode && unlikely(inode->i_op->follow_link) &&
+		    (lookup_flags & LOOKUP_FOLLOW)) {
 			if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
 				return -ECHILD;
 			BUG_ON(inode != next.dentry->d_inode);
@@ -2543,8 +2533,7 @@ reval:
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
 		error = -ELOOP;
-		/* S_ISDIR part is a temporary automount kludge */
-		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(linki->i_mode))
+		if (!(nd.flags & LOOKUP_FOLLOW))
 			goto exit_dput;
 		if (count++ == 32)
 			goto exit_dput;
-- 
cgit v1.2.2


From 10584211e48036182212b598cc53331776406d60 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:45:58 +0000
Subject: autofs4: Add d_automount() dentry operation

Add a function to use the newly defined ->d_automount() dentry operation
for triggering mounts instead of doing the user space callback in ->lookup()
and ->d_revalidate().

Note, to be useful the subsequent patch to add the ->d_manage() dentry
operation is also needed so the discussion of functionality is deferred to
that patch.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  30 ++++++
 fs/autofs4/expire.c   |   6 ++
 fs/autofs4/inode.c    |   4 +
 fs/autofs4/root.c     | 261 ++++++++++++++++++++++++++++----------------------
 4 files changed, 189 insertions(+), 112 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index eb67953452bb..1ebfe53872b5 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -218,6 +218,36 @@ extern const struct inode_operations autofs4_direct_root_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
 
+/* Operations methods */
+
+struct vfsmount *autofs4_d_automount(struct path *);
+
+/* VFS automount flags management functions */
+
+static inline void __managed_dentry_set_automount(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_set_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_automount(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_clear_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 6a930b90d389..0571ec8352b7 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -300,6 +300,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 			spin_unlock(&root->d_lock);
 		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
+		managed_dentry_set_automount(root);
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
@@ -408,6 +409,7 @@ found:
 		expired, (int)expired->d_name.len, expired->d_name.name);
 	ino = autofs4_dentry_ino(expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
+	managed_dentry_set_automount(expired);
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&autofs4_lock);
@@ -479,6 +481,8 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	if (!d_unhashed(dentry))
+		managed_dentry_clear_automount(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -516,6 +520,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		if (ret)
+			managed_dentry_clear_automount(dentry);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a7bdb9dcac84..d0aa38cac302 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -252,6 +252,7 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
 }
 
 static const struct dentry_operations autofs4_sb_dentry_operations = {
+	.d_automount	= autofs4_d_automount,
 	.d_release      = autofs4_dentry_release,
 };
 
@@ -320,6 +321,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_dput;
 	}
 
+	if (autofs_type_trigger(sbi->type))
+		__managed_dentry_set_automount(root);
+
 	root_inode->i_fop = &autofs4_root_operations;
 	root_inode->i_op = autofs_type_trigger(sbi->type) ?
 			&autofs4_direct_root_inode_operations :
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 20225636a4e9..27dc53e111fd 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -35,7 +35,6 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
-static void *autofs4_follow_link(struct dentry *, struct nameidata *);
 
 #define TRIGGER_FLAGS   (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
 #define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
@@ -73,7 +72,6 @@ const struct inode_operations autofs4_direct_root_inode_operations = {
 	.unlink		= autofs4_dir_unlink,
 	.mkdir		= autofs4_dir_mkdir,
 	.rmdir		= autofs4_dir_rmdir,
-	.follow_link	= autofs4_follow_link,
 };
 
 const struct inode_operations autofs4_dir_inode_operations = {
@@ -420,13 +418,12 @@ void autofs4_dentry_release(struct dentry *de)
 
 /* For dentries of directories in the root dir */
 static const struct dentry_operations autofs4_root_dentry_operations = {
-	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
 
 /* For other dentries */
 static const struct dentry_operations autofs4_dentry_operations = {
-	.d_revalidate	= autofs4_revalidate,
+	.d_automount	= autofs4_d_automount,
 	.d_release	= autofs4_dentry_release,
 };
 
@@ -540,50 +537,176 @@ next:
 	return NULL;
 }
 
+static int autofs4_mount_wait(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
+
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		DPRINTK("waiting for mount name=%.*s",
+			dentry->d_name.len, dentry->d_name.name);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+		DPRINTK("mount wait done status=%d", status);
+		ino->last_used = jiffies;
+		return status;
+	}
+	return 0;
+}
+
+static int do_expire_wait(struct dentry *dentry)
+{
+	struct dentry *expiring;
+
+	expiring = autofs4_lookup_expiring(dentry);
+	if (!expiring)
+		return autofs4_expire_wait(dentry);
+	else {
+		/*
+		 * If we are racing with expire the request might not
+		 * be quite complete, but the directory has been removed
+		 * so it must have been successful, just wait for it.
+		 */
+		autofs4_expire_wait(expiring);
+		autofs4_del_expiring(expiring);
+		dput(expiring);
+	}
+	return 0;
+}
+
+static struct dentry *autofs4_mountpoint_changed(struct path *path)
+{
+	struct dentry *dentry = path->dentry;
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+	/*
+	 * If this is an indirect mount the dentry could have gone away
+	 * as a result of an expire and a new one created.
+	 */
+	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
+		struct dentry *parent = dentry->d_parent;
+		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		if (!new)
+			return NULL;
+		dput(path->dentry);
+		path->dentry = new;
+	}
+	return path->dentry;
+}
+
+struct vfsmount *autofs4_d_automount(struct path *path)
+{
+	struct dentry *dentry = path->dentry;
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
+
+	DPRINTK("dentry=%p %.*s",
+		dentry, dentry->d_name.len, dentry->d_name.name);
+
+	/* The daemon never triggers a mount. */
+	if (autofs4_oz_mode(sbi))
+		return NULL;
+
+	/*
+	 * If an expire request is pending everyone must wait.
+	 * If the expire fails we're still mounted so continue
+	 * the follow and return. A return of -EAGAIN (which only
+	 * happens with indirect mounts) means the expire completed
+	 * and the directory was removed, so just go ahead and try
+	 * the mount.
+	 */
+	status = do_expire_wait(dentry);
+	if (status && status != -EAGAIN)
+		return NULL;
+
+	/* Callback to the daemon to perform the mount or wait */
+	spin_lock(&sbi->fs_lock);
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_mount_wait(dentry);
+		if (status)
+			return ERR_PTR(status);
+		spin_lock(&sbi->fs_lock);
+		goto done;
+	}
+
+	/*
+	 * If the dentry is a symlink it's equivalent to a directory
+	 * having d_mounted() true, so there's no need to call back
+	 * to the daemon.
+	 */
+	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+		goto done;
+	spin_lock(&dentry->d_lock);
+	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_mount_wait(dentry);
+		if (status)
+			return ERR_PTR(status);
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		goto done;
+	}
+	spin_unlock(&dentry->d_lock);
+done:
+	/*
+	 * Any needed mounting has been completed and the path updated
+	 * so turn this into a normal dentry so we don't continually
+	 * call ->d_automount().
+	 */
+	managed_dentry_clear_automount(dentry);
+	spin_unlock(&sbi->fs_lock);
+
+	/* Mount succeeded, check if we ended up with a new dentry */
+	dentry = autofs4_mountpoint_changed(path);
+	if (!dentry)
+		return ERR_PTR(-ENOENT);
+
+	return NULL;
+}
+
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
-	struct dentry *expiring, *active;
-	int oz_mode;
+	struct dentry *active;
 
-	DPRINTK("name = %.*s",
-		dentry->d_name.len, dentry->d_name.name);
+	DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
 
 	/* File name too long to exist */
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
 
 	sbi = autofs4_sbi(dir->i_sb);
-	oz_mode = autofs4_oz_mode(sbi);
 
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
-		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
+		current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
 	active = autofs4_lookup_active(dentry);
 	if (active) {
-		dentry = active;
-		ino = autofs4_dentry_ino(dentry);
+		return active;
 	} else {
-		/*
-		 * Mark the dentry incomplete but don't hash it. We do this
-		 * to serialize our inode creation operations (symlink and
-		 * mkdir) which prevents deadlock during the callback to
-		 * the daemon. Subsequent user space lookups for the same
-		 * dentry are placed on the wait queue while the daemon
-		 * itself is allowed passage unresticted so the create
-		 * operation itself can then hash the dentry. Finally,
-		 * we check for the hashed dentry and return the newly
-		 * hashed dentry.
-		 */
 		d_set_d_op(dentry, &autofs4_root_dentry_operations);
 
 		/*
-		 * And we need to ensure that the same dentry is used for
-		 * all following lookup calls until it is hashed so that
-		 * the dentry flags are persistent throughout the request.
+		 * A dentry that is not within the root can never trigger a
+		 * mount operation, unless the directory already exists, so we
+		 * can return fail immediately.  The daemon however does need
+		 * to create directories within the file system.
 		 */
+		if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
+			return ERR_PTR(-ENOENT);
+
+		/* Mark entries in the root as mount triggers */
+		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) {
+			d_set_d_op(dentry, &autofs4_dentry_operations);
+			managed_dentry_set_automount(dentry);
+		}
+
 		ino = autofs4_init_ino(NULL, sbi, 0555);
 		if (!ino)
 			return ERR_PTR(-ENOMEM);
@@ -595,82 +718,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 
 		d_instantiate(dentry, NULL);
 	}
-
-	if (!oz_mode) {
-		mutex_unlock(&dir->i_mutex);
-		expiring = autofs4_lookup_expiring(dentry);
-		if (expiring) {
-			/*
-			 * If we are racing with expire the request might not
-			 * be quite complete but the directory has been removed
-			 * so it must have been successful, so just wait for it.
-			 */
-			autofs4_expire_wait(expiring);
-			autofs4_del_expiring(expiring);
-			dput(expiring);
-		}
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags |= AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate)
-			(dentry->d_op->d_revalidate)(dentry, nd);
-		mutex_lock(&dir->i_mutex);
-	}
-
-	/*
-	 * If we are still pending, check if we had to handle
-	 * a signal. If so we can force a restart..
-	 */
-	if (ino->flags & AUTOFS_INF_PENDING) {
-		/* See if we were interrupted */
-		if (signal_pending(current)) {
-			sigset_t *sigset = &current->pending.signal;
-			if (sigismember (sigset, SIGKILL) ||
-			    sigismember (sigset, SIGQUIT) ||
-			    sigismember (sigset, SIGINT)) {
-			    if (active)
-				dput(active);
-			    return ERR_PTR(-ERESTARTNOINTR);
-			}
-		}
-		if (!oz_mode) {
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
-		}
-	}
-
-	/*
-	 * If this dentry is unhashed, then we shouldn't honour this
-	 * lookup.  Returning ENOENT here doesn't do the right thing
-	 * for all system calls, but it should be OK for the operations
-	 * we permit from an autofs.
-	 */
-	if (!oz_mode && d_unhashed(dentry)) {
-		/*
-		 * A user space application can (and has done in the past)
-		 * remove and re-create this directory during the callback.
-		 * This can leave us with an unhashed dentry, but a
-		 * successful mount!  So we need to perform another
-		 * cached lookup in case the dentry now exists.
-		 */
-		struct dentry *parent = dentry->d_parent;
-		struct dentry *new = d_lookup(parent, &dentry->d_name);
-		if (new != NULL)
-			dentry = new;
-		else
-			dentry = ERR_PTR(-ENOENT);
-
-		if (active)
-			dput(active);
-
-		return dentry;
-	}
-
-	if (active)
-		return active;
-
 	return NULL;
 }
 
@@ -715,11 +762,6 @@ static int autofs4_dir_symlink(struct inode *dir,
 	}
 	d_add(dentry, inode);
 
-	if (dir == dir->i_sb->s_root->d_inode)
-		d_set_d_op(dentry, &autofs4_root_dentry_operations);
-	else
-		d_set_d_op(dentry, &autofs4_dentry_operations);
-
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
 	atomic_inc(&ino->count);
@@ -850,11 +892,6 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 	d_add(dentry, inode);
 
-	if (dir == dir->i_sb->s_root->d_inode)
-		d_set_d_op(dentry, &autofs4_root_dentry_operations);
-	else
-		d_set_d_op(dentry, &autofs4_dentry_operations);
-
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
 	atomic_inc(&ino->count);
-- 
cgit v1.2.2


From b5b801779d59165c4ecf1009009109545bd1f642 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:03 +0000
Subject: autofs4: Add d_manage() dentry operation

This patch required a previous patch to add the ->d_automount()
dentry operation.

Add a function to use the newly defined ->d_manage() dentry operation
for blocking during mount and expire.

Whether the VFS calls the dentry operations d_automount() and d_manage()
is controled by the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags. autofs
uses the d_automount() operation to callback to user space to request
mount operations and the d_manage() operation to block walks into mounts
that are under construction or destruction.

In order to prevent these functions from being called unnecessarily the
DMANAGED_* flags are cleared for cases which would cause this. In the
common case the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags are both
set for dentrys waiting to be mounted. The DMANAGED_TRANSIT flag is
cleared upon successful mount request completion and set during expire
runs, both during the dentry expire check, and if selected for expire,
is left set until a subsequent successful mount request completes.

The exception to this is the so-called rootless multi-mount which has
no actual mount at its base. In this case the DMANAGED_AUTOMOUNT flag
is cleared upon successful mount request completion as well and set
again after a successful expire.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h | 50 ++++++++++++++++++++++++++-
 fs/autofs4/expire.c   | 51 +++++++++++++--------------
 fs/autofs4/inode.c    |  3 +-
 fs/autofs4/root.c     | 95 ++++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 159 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 1ebfe53872b5..f0c95e0460cf 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -99,7 +99,6 @@ struct autofs_info {
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
-#define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
 struct autofs_wait_queue {
@@ -221,6 +220,7 @@ extern const struct file_operations autofs4_root_operations;
 /* Operations methods */
 
 struct vfsmount *autofs4_d_automount(struct path *);
+int autofs4_d_manage(struct dentry *, bool);
 
 /* VFS automount flags management functions */
 
@@ -248,6 +248,54 @@ static inline void managed_dentry_clear_automount(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 }
 
+static inline void __managed_dentry_set_transit(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_set_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_transit(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_clear_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+	dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+	dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 0571ec8352b7..3ed79d76c233 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 	if (ino == NULL)
 		return 0;
 
-	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING)
-		return 0;
-
 	if (!do_now) {
 		/* Too young to die */
 		if (!timeout || time_after(ino->last_used + timeout, now))
@@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	struct autofs_info *ino;
 
 	if (!root)
 		return NULL;
@@ -291,20 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(root);
+	/* No point expiring a pending mount */
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		spin_unlock(&sbi->fs_lock);
+		return NULL;
+	}
+	managed_dentry_set_transit(root);
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
-		if (d_mountpoint(root)) {
-			ino->flags |= AUTOFS_INF_MOUNTPOINT;
-			spin_lock(&root->d_lock);
-			root->d_flags &= ~DCACHE_MOUNTED;
-			spin_unlock(&root->d_lock);
-		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
-		managed_dentry_set_automount(root);
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
+	managed_dentry_clear_transit(root);
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -341,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	while ((dentry = get_next_positive_dentry(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
+		/* No point expiring a pending mount */
+		if (ino->flags & AUTOFS_INF_PENDING)
+			goto cont;
+		managed_dentry_set_transit(dentry);
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -400,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
+		managed_dentry_clear_transit(dentry);
+cont:
 		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
@@ -409,7 +413,6 @@ found:
 		expired, (int)expired->d_name.len, expired->d_name.name);
 	ino = autofs4_dentry_ino(expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
-	managed_dentry_set_automount(expired);
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&autofs4_lock);
@@ -482,7 +485,7 @@ int autofs4_expire_run(struct super_block *sb,
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
 	if (!d_unhashed(dentry))
-		managed_dentry_clear_automount(dentry);
+		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -508,20 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			spin_lock(&sb->s_root->d_lock);
-			/*
-			 * If we haven't been expired away, then reset
-			 * mounted status.
-			 */
-			if (mnt->mnt_parent != mnt)
-				sb->s_root->d_flags |= DCACHE_MOUNTED;
-			spin_unlock(&sb->s_root->d_lock);
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		spin_lock(&dentry->d_lock);
 		if (ret)
-			managed_dentry_clear_automount(dentry);
+			__managed_dentry_clear_transit(dentry);
+		else {
+			if ((IS_ROOT(dentry) ||
+			    (autofs_type_indirect(sbi->type) &&
+			     IS_ROOT(dentry->d_parent))) &&
+			    !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+				__managed_dentry_set_automount(dentry);
+		}
+		spin_unlock(&dentry->d_lock);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d0aa38cac302..75c1ed8e2fb9 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -253,6 +253,7 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
 
 static const struct dentry_operations autofs4_sb_dentry_operations = {
 	.d_automount	= autofs4_d_automount,
+	.d_manage	= autofs4_d_manage,
 	.d_release      = autofs4_dentry_release,
 };
 
@@ -322,7 +323,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	if (autofs_type_trigger(sbi->type))
-		__managed_dentry_set_automount(root);
+		__managed_dentry_set_managed(root);
 
 	root_inode->i_fop = &autofs4_root_operations;
 	root_inode->i_op = autofs_type_trigger(sbi->type) ?
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 27dc53e111fd..f1076b91a0fa 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -424,6 +424,7 @@ static const struct dentry_operations autofs4_root_dentry_operations = {
 /* For other dentries */
 static const struct dentry_operations autofs4_dentry_operations = {
 	.d_automount	= autofs4_d_automount,
+	.d_manage	= autofs4_d_manage,
 	.d_release	= autofs4_dentry_release,
 };
 
@@ -604,6 +605,18 @@ struct vfsmount *autofs4_d_automount(struct path *path)
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
+	/*
+	 * Someone may have manually umounted this or it was a submount
+	 * that has gone away.
+	 */
+	spin_lock(&dentry->d_lock);
+	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+		if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
+		     (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+			__managed_dentry_set_transit(path->dentry);
+	}
+	spin_unlock(&dentry->d_lock);
+
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
 		return NULL;
@@ -633,31 +646,63 @@ struct vfsmount *autofs4_d_automount(struct path *path)
 
 	/*
 	 * If the dentry is a symlink it's equivalent to a directory
-	 * having d_mounted() true, so there's no need to call back
+	 * having d_mountpoint() true, so there's no need to call back
 	 * to the daemon.
 	 */
 	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
 		goto done;
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+	if (!d_mountpoint(dentry)) {
+		/*
+		 * It's possible that user space hasn't removed directories
+		 * after umounting a rootless multi-mount, although it
+		 * should. For v5 have_submounts() is sufficient to handle
+		 * this because the leaves of the directory tree under the
+		 * mount never trigger mounts themselves (they have an autofs
+		 * trigger mount mounted on them). But v4 pseudo direct mounts
+		 * do need the leaves to to trigger mounts. In this case we
+		 * have no choice but to use the list_empty() check and
+		 * require user space behave.
+		 */
+		if (sbi->version > 4) {
+			if (have_submounts(dentry))
+				goto done;
+		} else {
+			spin_lock(&dentry->d_lock);
+			if (!list_empty(&dentry->d_subdirs)) {
+				spin_unlock(&dentry->d_lock);
+				goto done;
+			}
+			spin_unlock(&dentry->d_lock);
+		}
 		ino->flags |= AUTOFS_INF_PENDING;
-		spin_unlock(&dentry->d_lock);
 		spin_unlock(&sbi->fs_lock);
 		status = autofs4_mount_wait(dentry);
 		if (status)
 			return ERR_PTR(status);
 		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_PENDING;
-		goto done;
 	}
-	spin_unlock(&dentry->d_lock);
 done:
-	/*
-	 * Any needed mounting has been completed and the path updated
-	 * so turn this into a normal dentry so we don't continually
-	 * call ->d_automount().
-	 */
-	managed_dentry_clear_automount(dentry);
+	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+		/*
+		 * Any needed mounting has been completed and the path updated
+		 * so turn this into a normal dentry so we don't continually
+		 * call ->d_automount() and ->d_manage().
+		 */
+		spin_lock(&dentry->d_lock);
+		__managed_dentry_clear_transit(dentry);
+		/*
+		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
+		 * symlinks as in all other cases the dentry will be covered by
+		 * an actual mount so ->d_automount() won't be called during
+		 * the follow.
+		 */
+		if ((!d_mountpoint(dentry) &&
+		    !list_empty(&dentry->d_subdirs)) ||
+		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+			__managed_dentry_clear_automount(dentry);
+		spin_unlock(&dentry->d_lock);
+	}
 	spin_unlock(&sbi->fs_lock);
 
 	/* Mount succeeded, check if we ended up with a new dentry */
@@ -668,6 +713,30 @@ done:
 	return NULL;
 }
 
+int autofs4_d_manage(struct dentry *dentry, bool mounting_here)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+	DPRINTK("dentry=%p %.*s",
+		dentry, dentry->d_name.len, dentry->d_name.name);
+
+	/* The daemon never waits. */
+	if (autofs4_oz_mode(sbi) || mounting_here) {
+		if (!d_mountpoint(dentry))
+			return -EISDIR;
+		return 0;
+	}
+
+	/* Wait for pending expires */
+	do_expire_wait(dentry);
+
+	/*
+	 * This dentry may be under construction so wait on mount
+	 * completion.
+	 */
+	return autofs4_mount_wait(dentry);
+}
+
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
@@ -704,7 +773,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 		/* Mark entries in the root as mount triggers */
 		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) {
 			d_set_d_op(dentry, &autofs4_dentry_operations);
-			managed_dentry_set_automount(dentry);
+			__managed_dentry_set_managed(dentry);
 		}
 
 		ino = autofs4_init_ino(NULL, sbi, 0555);
-- 
cgit v1.2.2


From 8c13a676d5a56495c350f3141824a5ef6c6b4606 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:08 +0000
Subject: autofs4: Remove unused code

Remove code that is not used due to the use of ->d_automount()
and ->d_manage().

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |   7 --
 fs/autofs4/root.c     | 243 --------------------------------------------------
 2 files changed, 250 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index f0c95e0460cf..1ee3b9afbe9e 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -175,13 +175,6 @@ static inline int autofs4_ispending(struct dentry *dentry)
 	return 0;
 }
 
-static inline void autofs4_copy_atime(struct file *src, struct file *dst)
-{
-	dst->f_path.dentry->d_inode->i_atime =
-		src->f_path.dentry->d_inode->i_atime;
-	return;
-}
-
 struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
 void autofs4_free_ino(struct autofs_info *);
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f1076b91a0fa..b2498c8cb0a7 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,9 +36,6 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 
-#define TRIGGER_FLAGS   (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
-#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
-
 const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
@@ -114,14 +111,6 @@ static void autofs4_del_active(struct dentry *dentry)
 	return;
 }
 
-static unsigned int autofs4_need_mount(unsigned int flags)
-{
-	unsigned int res = 0;
-	if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
-		res = 1;
-	return res;
-}
-
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
@@ -156,238 +145,6 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-static int try_to_fill_dentry(struct dentry *dentry, int flags)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int status;
-
-	DPRINTK("dentry=%p %.*s ino=%p",
-		 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-
-	/*
-	 * Wait for a pending mount, triggering one if there
-	 * isn't one already
-	 */
-	if (dentry->d_inode == NULL) {
-		DPRINTK("waiting for mount name=%.*s",
-			 dentry->d_name.len, dentry->d_name.name);
-
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
-		DPRINTK("mount done status=%d", status);
-
-		/* Turn this into a real negative dentry? */
-		if (status == -ENOENT) {
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
-			return status;
-		} else if (status) {
-			/* Return a negative dentry, but leave it "pending" */
-			return status;
-		}
-	/* Trigger mount for path component or follow link */
-	} else if (ino->flags & AUTOFS_INF_PENDING ||
-			autofs4_need_mount(flags)) {
-		DPRINTK("waiting for mount name=%.*s",
-			dentry->d_name.len, dentry->d_name.name);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags |= AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
-		DPRINTK("mount done status=%d", status);
-
-		if (status) {
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
-			return status;
-		}
-	}
-
-	/* Initialize expiry counter after successful mount */
-	ino->last_used = jiffies;
-
-	spin_lock(&sbi->fs_lock);
-	ino->flags &= ~AUTOFS_INF_PENDING;
-	spin_unlock(&sbi->fs_lock);
-
-	return 0;
-}
-
-/* For autofs direct mounts the follow link triggers the mount */
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int oz_mode = autofs4_oz_mode(sbi);
-	unsigned int lookup_type;
-	int status;
-
-	DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
-		dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
-		nd->flags);
-	/*
-	 * For an expire of a covered direct or offset mount we need
-	 * to break out of follow_down_one() at the autofs mount trigger
-	 * (d_mounted--), so we can see the expiring flag, and manage
-	 * the blocking and following here until the expire is completed.
-	 */
-	if (oz_mode) {
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_EXPIRING) {
-			spin_unlock(&sbi->fs_lock);
-			/* Follow down to our covering mount. */
-			if (!follow_down_one(&nd->path))
-				goto done;
-			goto follow;
-		}
-		spin_unlock(&sbi->fs_lock);
-		goto done;
-	}
-
-	/* If an expire request is pending everyone must wait. */
-	autofs4_expire_wait(dentry);
-
-	/* We trigger a mount for almost all flags */
-	lookup_type = autofs4_need_mount(nd->flags);
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&autofs4_lock);
-	spin_lock(&dentry->d_lock);
-	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
-		spin_unlock(&sbi->fs_lock);
-		goto follow;
-	}
-
-	/*
-	 * If the dentry contains directories then it is an autofs
-	 * multi-mount with no root mount offset. So don't try to
-	 * mount it again.
-	 */
-	if (ino->flags & AUTOFS_INF_PENDING ||
-	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
-		spin_unlock(&sbi->fs_lock);
-
-		status = try_to_fill_dentry(dentry, nd->flags);
-		if (status)
-			goto out_error;
-
-		goto follow;
-	}
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
-	spin_unlock(&sbi->fs_lock);
-follow:
-	/*
-	 * If there is no root mount it must be an autofs
-	 * multi-mount with no root offset so we don't need
-	 * to follow it.
-	 */
-	if (d_managed(dentry)) {
-		status = follow_down(&nd->path, false);
-		if (status < 0)
-			goto out_error;
-	}
-
-done:
-	return NULL;
-
-out_error:
-	path_put(&nd->path);
-	return ERR_PTR(status);
-}
-
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *dir;
-	struct autofs_sb_info *sbi;
-	int oz_mode;
-	int flags = nd ? nd->flags : 0;
-	int status = 1;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	dir = dentry->d_parent->d_inode;
-	sbi = autofs4_sbi(dir->i_sb);
-	oz_mode = autofs4_oz_mode(sbi);
-
-	/* Pending dentry */
-	spin_lock(&sbi->fs_lock);
-	if (autofs4_ispending(dentry)) {
-		/* The daemon never causes a mount to trigger */
-		spin_unlock(&sbi->fs_lock);
-
-		if (oz_mode)
-			return 1;
-
-		/*
-		 * If the directory has gone away due to an expire
-		 * we have been called as ->d_revalidate() and so
-		 * we need to return false and proceed to ->lookup().
-		 */
-		if (autofs4_expire_wait(dentry) == -EAGAIN)
-			return 0;
-
-		/*
-		 * A zero status is success otherwise we have a
-		 * negative error code.
-		 */
-		status = try_to_fill_dentry(dentry, flags);
-		if (status == 0)
-			return 1;
-
-		return status;
-	}
-	spin_unlock(&sbi->fs_lock);
-
-	/* Negative dentry.. invalidate if "old" */
-	if (dentry->d_inode == NULL)
-		return 0;
-
-	/* Check for a non-mountpoint directory with no contents */
-	spin_lock(&autofs4_lock);
-	spin_lock(&dentry->d_lock);
-	if (S_ISDIR(dentry->d_inode->i_mode) &&
-	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		DPRINTK("dentry=%p %.*s, emptydir",
-			 dentry, dentry->d_name.len, dentry->d_name.name);
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
-
-		/* The daemon never causes a mount to trigger */
-		if (oz_mode)
-			return 1;
-
-		/*
-		 * A zero status is success otherwise we have a
-		 * negative error code.
-		 */
-		status = try_to_fill_dentry(dentry, flags);
-		if (status == 0)
-			return 1;
-
-		return status;
-	}
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
-
-	return 1;
-}
-
 void autofs4_dentry_release(struct dentry *de)
 {
 	struct autofs_info *inf;
-- 
cgit v1.2.2


From e61da20a50d21725ff27571a6dff9468e4fb7146 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:14 +0000
Subject: autofs4: Clean up inode operations

Since the use of ->follow_link() has been eliminated there is no
need to separate the indirect and direct inode operations.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  3 ---
 fs/autofs4/inode.c    |  4 +---
 fs/autofs4/root.c     | 15 ---------------
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 1ee3b9afbe9e..f4b4030cf406 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -204,9 +204,6 @@ void autofs_dev_ioctl_exit(void);
 
 extern const struct inode_operations autofs4_symlink_inode_operations;
 extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct inode_operations autofs4_root_inode_operations;
-extern const struct inode_operations autofs4_indirect_root_inode_operations;
-extern const struct inode_operations autofs4_direct_root_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 75c1ed8e2fb9..dac3dc79ccb4 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -326,9 +326,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		__managed_dentry_set_managed(root);
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = autofs_type_trigger(sbi->type) ?
-			&autofs4_direct_root_inode_operations :
-			&autofs4_indirect_root_inode_operations;
+	root_inode->i_op = &autofs4_dir_inode_operations;
 
 	/* Couldn't this be tested earlier? */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index b2498c8cb0a7..52af410a831d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -56,21 +56,6 @@ const struct file_operations autofs4_dir_operations = {
 	.llseek		= dcache_dir_lseek,
 };
 
-const struct inode_operations autofs4_indirect_root_inode_operations = {
-	.lookup		= autofs4_lookup,
-	.unlink		= autofs4_dir_unlink,
-	.symlink	= autofs4_dir_symlink,
-	.mkdir		= autofs4_dir_mkdir,
-	.rmdir		= autofs4_dir_rmdir,
-};
-
-const struct inode_operations autofs4_direct_root_inode_operations = {
-	.lookup		= autofs4_lookup,
-	.unlink		= autofs4_dir_unlink,
-	.mkdir		= autofs4_dir_mkdir,
-	.rmdir		= autofs4_dir_rmdir,
-};
-
 const struct inode_operations autofs4_dir_inode_operations = {
 	.lookup		= autofs4_lookup,
 	.unlink		= autofs4_dir_unlink,
-- 
cgit v1.2.2


From 71e469db242c2eeb00faf9caf7d9e00150c00a6e Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:19 +0000
Subject: autofs4: Clean up dentry operations

There are now two distinct dentry operations uses. One for dentrys
that trigger mounts and one for dentrys that do not.

Rationalize the use of these dentry operations and rename them to
reflect their function.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  7 ++-----
 fs/autofs4/inode.c    | 12 ++++--------
 fs/autofs4/root.c     | 36 ++++++++++++++++++++----------------
 3 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index f4b4030cf406..c28085cb82a5 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -206,11 +206,8 @@ extern const struct inode_operations autofs4_symlink_inode_operations;
 extern const struct inode_operations autofs4_dir_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
-
-/* Operations methods */
-
-struct vfsmount *autofs4_d_automount(struct path *);
-int autofs4_d_manage(struct dentry *, bool);
+extern const struct dentry_operations autofs4_dentry_operations;
+extern const struct dentry_operations autofs4_mount_dentry_operations;
 
 /* VFS automount flags management functions */
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index dac3dc79ccb4..427c35746340 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -251,12 +251,6 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
 	return ino;
 }
 
-static const struct dentry_operations autofs4_sb_dentry_operations = {
-	.d_automount	= autofs4_d_automount,
-	.d_manage	= autofs4_d_manage,
-	.d_release      = autofs4_dentry_release,
-};
-
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode * root_inode;
@@ -311,7 +305,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	d_set_d_op(root, &autofs4_sb_dentry_operations);
+	d_set_d_op(root, &autofs4_dentry_operations);
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
@@ -322,8 +316,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_dput;
 	}
 
-	if (autofs_type_trigger(sbi->type))
+	if (autofs_type_trigger(sbi->type)) {
+		d_set_d_op(root, &autofs4_mount_dentry_operations);
 		__managed_dentry_set_managed(root);
+	}
 
 	root_inode->i_fop = &autofs4_root_operations;
 	root_inode->i_op = &autofs4_dir_inode_operations;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 52af410a831d..62c1229a4d31 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -35,6 +35,8 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
+static struct vfsmount *autofs4_d_automount(struct path *);
+static int autofs4_d_manage(struct dentry *, bool);
 
 const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
@@ -64,6 +66,18 @@ const struct inode_operations autofs4_dir_inode_operations = {
 	.rmdir		= autofs4_dir_rmdir,
 };
 
+/* For dentries that don't initiate mounting */
+const struct dentry_operations autofs4_dentry_operations = {
+	.d_release	= autofs4_dentry_release,
+};
+
+/* For dentries that do initiate mounting */
+const struct dentry_operations autofs4_mount_dentry_operations = {
+	.d_automount	= autofs4_d_automount,
+	.d_manage	= autofs4_d_manage,
+	.d_release	= autofs4_dentry_release,
+};
+
 static void autofs4_add_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
@@ -158,18 +172,6 @@ void autofs4_dentry_release(struct dentry *de)
 	}
 }
 
-/* For dentries of directories in the root dir */
-static const struct dentry_operations autofs4_root_dentry_operations = {
-	.d_release	= autofs4_dentry_release,
-};
-
-/* For other dentries */
-static const struct dentry_operations autofs4_dentry_operations = {
-	.d_automount	= autofs4_d_automount,
-	.d_manage	= autofs4_d_manage,
-	.d_release	= autofs4_dentry_release,
-};
-
 static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
@@ -337,7 +339,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	return path->dentry;
 }
 
-struct vfsmount *autofs4_d_automount(struct path *path)
+static struct vfsmount *autofs4_d_automount(struct path *path)
 {
 	struct dentry *dentry = path->dentry;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
@@ -501,7 +503,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	if (active) {
 		return active;
 	} else {
-		d_set_d_op(dentry, &autofs4_root_dentry_operations);
+		d_set_d_op(dentry, &autofs4_dentry_operations);
 
 		/*
 		 * A dentry that is not within the root can never trigger a
@@ -514,7 +516,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 
 		/* Mark entries in the root as mount triggers */
 		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) {
-			d_set_d_op(dentry, &autofs4_dentry_operations);
+			d_set_d_op(dentry, &autofs4_mount_dentry_operations);
 			__managed_dentry_set_managed(dentry);
 		}
 
@@ -573,6 +575,8 @@ static int autofs4_dir_symlink(struct inode *dir,
 	}
 	d_add(dentry, inode);
 
+	d_set_d_op(dentry, &autofs4_dentry_operations);
+
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
 	atomic_inc(&ino->count);
@@ -791,7 +795,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
 int is_autofs4_dentry(struct dentry *dentry)
 {
 	return dentry && dentry->d_inode &&
-		(dentry->d_op == &autofs4_root_dentry_operations ||
+		(dentry->d_op == &autofs4_mount_dentry_operations ||
 		 dentry->d_op == &autofs4_dentry_operations) &&
 		dentry->d_fsdata != NULL;
 }
-- 
cgit v1.2.2


From 6651149371b842715906311b4631b8489cebf7e8 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:24 +0000
Subject: autofs4: Clean up autofs4_free_ino()

When this function is called the local reference count does't need to
be updated since the dentry is going away and dput definitely must
not be called here.

Also the autofs info struct field inode isn't used so remove it.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/inode.c | 13 -------------
 fs/autofs4/root.c  |  9 ---------
 2 files changed, 22 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 427c35746340..3ecd2e2bcdbd 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -45,7 +45,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 
 	if (!reinit) {
 		ino->flags = 0;
-		ino->inode = NULL;
 		ino->dentry = NULL;
 		ino->size = 0;
 		INIT_LIST_HEAD(&ino->active);
@@ -76,19 +75,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 
 void autofs4_free_ino(struct autofs_info *ino)
 {
-	struct autofs_info *p_ino;
-
 	if (ino->dentry) {
 		ino->dentry->d_fsdata = NULL;
-		if (ino->dentry->d_inode) {
-			struct dentry *parent = ino->dentry->d_parent;
-			if (atomic_dec_and_test(&ino->count)) {
-				p_ino = autofs4_dentry_ino(parent);
-				if (p_ino && parent != ino->dentry)
-					atomic_dec(&p_ino->count);
-			}
-			dput(ino->dentry);
-		}
 		ino->dentry = NULL;
 	}
 	if (ino->free)
@@ -390,7 +378,6 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	if (inode == NULL)
 		return NULL;
 
-	inf->inode = inode;
 	inode->i_mode = inf->mode;
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 62c1229a4d31..8315565ed7d4 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -151,11 +151,8 @@ void autofs4_dentry_release(struct dentry *de)
 	DPRINTK("releasing %p", de);
 
 	inf = autofs4_dentry_ino(de);
-	de->d_fsdata = NULL;
-
 	if (inf) {
 		struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
-
 		if (sbi) {
 			spin_lock(&sbi->lookup_lock);
 			if (!list_empty(&inf->active))
@@ -164,10 +161,6 @@ void autofs4_dentry_release(struct dentry *de)
 				list_del(&inf->expiring);
 			spin_unlock(&sbi->lookup_lock);
 		}
-
-		inf->dentry = NULL;
-		inf->inode = NULL;
-
 		autofs4_free_ino(inf);
 	}
 }
@@ -583,7 +576,6 @@ static int autofs4_dir_symlink(struct inode *dir,
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
 	if (p_ino && dentry->d_parent != dentry)
 		atomic_inc(&p_ino->count);
-	ino->inode = inode;
 
 	ino->u.symlink = cp;
 	dir->i_mtime = CURRENT_TIME;
@@ -713,7 +705,6 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
 	if (p_ino && dentry->d_parent != dentry)
 		atomic_inc(&p_ino->count);
-	ino->inode = inode;
 	inc_nlink(dir);
 	dir->i_mtime = CURRENT_TIME;
 
-- 
cgit v1.2.2


From 9e3fea16ba386fa549a0b2de8a203e5d412997a0 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:30 +0000
Subject: autofs4: Fix wait validation

It is possible for the check in wait.c:validate_request() to return
an incorrect result if the dentry that was mounted upon has changed
during the callback.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/waitq.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c5f8459c905e..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -309,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
 	 * completed while we waited on the mutex ...
 	 */
 	if (notify == NFY_MOUNT) {
+		struct dentry *new = NULL;
+		int valid = 1;
+
 		/*
 		 * If the dentry was successfully mounted while we slept
 		 * on the wait queue mutex we can return success. If it
@@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
 		 * a multi-mount with no mount at it's base) we can
 		 * continue on and create a new request.
 		 */
+		if (!IS_ROOT(dentry)) {
+			if (dentry->d_inode && d_unhashed(dentry)) {
+				struct dentry *parent = dentry->d_parent;
+				new = d_lookup(parent, &dentry->d_name);
+				if (new)
+					dentry = new;
+			}
+		}
 		if (have_submounts(dentry))
-			return 0;
+			valid = 0;
+
+		if (new)
+			dput(new);
+		return valid;
 	}
 
 	return 1;
-- 
cgit v1.2.2


From dd89f90d2deb9aa5bc8e1b15d726ff5c0bb2b623 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 14 Jan 2011 18:46:35 +0000
Subject: autofs4: Add v4 pseudo direct mount support

Version 4 of autofs provides a pseudo direct mount implementation
that relies on directories at the leaves of a directory tree under
an indirect mount to trigger mounts.

This patch adds support for that functionality.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 8315565ed7d4..9194e274f849 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -630,6 +630,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	return 0;
 }
 
+/*
+ * Version 4 of autofs provides a pseudo direct mount implementation
+ * that relies on directories at the leaves of a directory tree under
+ * an indirect mount to trigger mounts. To allow for this we need to
+ * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
+ * of the directory tree. There is no need to clear the automount flag
+ * following a mount or restore it after an expire because these mounts
+ * are always covered. However, it is neccessary to ensure that these
+ * flags are clear on non-empty directories to avoid unnecessary calls
+ * during path walks.
+ */
+static void autofs_set_leaf_automount_flags(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	/* root and dentrys in the root are already handled */
+	if (IS_ROOT(dentry->d_parent))
+		return;
+
+	managed_dentry_set_managed(dentry);
+
+	parent = dentry->d_parent;
+	/* only consider parents below dentrys in the root */
+	if (IS_ROOT(parent->d_parent))
+		return;
+	managed_dentry_clear_managed(parent);
+	return;
+}
+
+static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
+{
+	struct list_head *d_child;
+	struct dentry *parent;
+
+	/* flags for dentrys in the root are handled elsewhere */
+	if (IS_ROOT(dentry->d_parent))
+		return;
+
+	managed_dentry_clear_managed(dentry);
+
+	parent = dentry->d_parent;
+	/* only consider parents below dentrys in the root */
+	if (IS_ROOT(parent->d_parent))
+		return;
+	d_child = &dentry->d_u.d_child;
+	/* Set parent managed if it's becoming empty */
+	if (d_child->next == &parent->d_subdirs &&
+	    d_child->prev == &parent->d_subdirs)
+		managed_dentry_set_managed(parent);
+	return;
+}
+
 static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -657,6 +709,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&autofs4_lock);
 
+	if (sbi->version < 5)
+		autofs_clear_leaf_automount_flags(dentry);
+
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
@@ -699,6 +754,9 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 	d_add(dentry, inode);
 
+	if (sbi->version < 5)
+		autofs_set_leaf_automount_flags(dentry);
+
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
 	atomic_inc(&ino->count);
-- 
cgit v1.2.2


From 87556ef19926e97464e0163a7840140527ae6615 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:46:46 +0000
Subject: Remove a further kludge from __do_follow_link()

Remove a further kludge from __do_follow_link() as it's no longer required with
the automount code.

This reverts the non-helper-function parts of
051d381259eb57d6074d02a6ba6e90e744f1a29f, which breaks union mounts.

Reported-by: vaurora@redhat.com
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 61995fba4e21..373852012713 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -800,12 +800,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
-	if (link->mnt != nd->path.mnt) {
-		path_to_nameidata(link, nd);
-		nd->inode = nd->path.dentry->d_inode;
-		dget(dentry);
-	}
-	mntget(link->mnt);
+	if (link->mnt == nd->path.mnt)
+		mntget(link->mnt);
 
 	nd->last_type = LAST_BIND;
 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
-- 
cgit v1.2.2


From ab90911ff90cdab59b31c045c3f0ae480d14f29d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 18:46:51 +0000
Subject: Allow d_manage() to be used in RCU-walk mode

Allow d_manage() to be called from pathwalk when it is in RCU-walk mode as well
as when it is in Ref-walk mode.  This permits __follow_mount_rcu() to call
d_manage() directly.  d_manage() needs a parameter to indicate that it is in
RCU-walk mode as it isn't allowed to sleep if in that mode (but should return
-ECHILD instead).

autofs4_d_manage() can then be set to retain RCU-walk mode if the daemon
accesses it and otherwise request dropping back to ref-walk mode.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c |  8 ++++++--
 fs/namei.c        | 15 ++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 9194e274f849..dbd95512808c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,7 +36,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool);
+static int autofs4_d_manage(struct dentry *, bool, bool);
 
 const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
@@ -450,7 +450,7 @@ done:
 	return NULL;
 }
 
-int autofs4_d_manage(struct dentry *dentry, bool mounting_here)
+int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
@@ -464,6 +464,10 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here)
 		return 0;
 	}
 
+	/* We need to sleep, so we need pathwalk to be in ref-mode */
+	if (rcu_walk)
+		return -ECHILD;
+
 	/* Wait for pending expires */
 	do_expire_wait(dentry);
 
diff --git a/fs/namei.c b/fs/namei.c
index 373852012713..5c89695ae1e4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -987,7 +987,8 @@ static int follow_managed(struct path *path, unsigned flags)
 		if (managed & DCACHE_MANAGE_TRANSIT) {
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
-			ret = path->dentry->d_op->d_manage(path->dentry, false);
+			ret = path->dentry->d_op->d_manage(path->dentry,
+							   false, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
@@ -1048,13 +1049,12 @@ int follow_down_one(struct path *path)
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			       struct inode **inode, bool reverse_transit)
 {
-	unsigned abort_mask =
-		reverse_transit ? 0 : DCACHE_MANAGE_TRANSIT;
-
 	while (d_mountpoint(path->dentry)) {
 		struct vfsmount *mounted;
-		if (path->dentry->d_flags & abort_mask)
-			return true;
+		if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
+		    !reverse_transit &&
+		    path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+			return false;
 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
 		if (!mounted)
 			break;
@@ -1132,7 +1132,8 @@ int follow_down(struct path *path, bool mounting_here)
 		if (managed & DCACHE_MANAGE_TRANSIT) {
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
-			ret = path->dentry->d_op->d_manage(path->dentry, mounting_here);
+			ret = path->dentry->d_op->d_manage(
+				path->dentry, mounting_here, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
-- 
cgit v1.2.2


From ea5b778a8b98c85a87d66bf844904f9c3802b869 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2011 19:10:03 +0000
Subject: Unexport do_add_mount() and add in follow_automount(), not
 ->d_automount()

Unexport do_add_mount() and make ->d_automount() return the vfsmount to be
added rather than calling do_add_mount() itself.  follow_automount() will then
do the addition.

This slightly complicates things as ->d_automount() normally wants to add the
new vfsmount to an expiration list and start an expiration timer.  The problem
with that is that the vfsmount will be deleted if it has a refcount of 1 and
the timer will not repeat if the expiration list is empty.

To this end, we require the vfsmount to be returned from d_automount() with a
refcount of (at least) 2.  One of these refs will be dropped unconditionally.
In addition, follow_automount() must get a 3rd ref around the call to
do_add_mount() lest it eat a ref and return an error, leaving the mount we
have open to being expired as we would otherwise have only 1 ref on it.

d_automount() should also add the the vfsmount to the expiration list (by
calling mnt_set_expiry()) and start the expiration timer before returning, if
this mechanism is to be used.  The vfsmount will be unlinked from the
expiration list by follow_automount() if do_add_mount() fails.

This patch also fixes the call to do_add_mount() for AFS to propagate the mount
flags from the parent vfsmount.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/mntpt.c         | 25 ++++++-------------------
 fs/cifs/cifs_dfs_ref.c | 26 ++++++--------------------
 fs/internal.h          |  2 ++
 fs/namei.c             | 42 +++++++++++++++++++++++++++++++++++-------
 fs/namespace.c         | 41 +++++++++++++++++++++++++++++++++--------
 fs/nfs/namespace.c     | 24 ++++--------------------
 6 files changed, 86 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index d23b2e344a78..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -241,7 +241,6 @@ error_no_devname:
 struct vfsmount *afs_d_automount(struct path *path)
 {
 	struct vfsmount *newmnt;
-	int err;
 
 	_enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
 
@@ -249,24 +248,12 @@ struct vfsmount *afs_d_automount(struct path *path)
 	if (IS_ERR(newmnt))
 		return newmnt;
 
-	mntget(newmnt);
-	err = do_add_mount(newmnt, path, MNT_SHRINKABLE, &afs_vfsmounts);
-	switch (err) {
-	case 0:
-		queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
-				   afs_mntpt_expiry_timeout * HZ);
-		_leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
-		return newmnt;
-	case -EBUSY:
-		/* someone else made a mount here whilst we were busy */
-		mntput(newmnt);
-		_leave(" = NULL [EBUSY]");
-		return NULL;
-	default:
-		mntput(newmnt);
-		_leave(" = %d", err);
-		return ERR_PTR(err);
-	}
+	mntget(newmnt); /* prevent immediate expiration */
+	mnt_set_expiry(newmnt, &afs_vfsmounts);
+	queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+			   afs_mntpt_expiry_timeout * HZ);
+	_leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+	return newmnt;
 }
 
 /*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 0fc163808de3..7ed36536e754 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -351,7 +351,6 @@ free_xid:
 struct vfsmount *cifs_dfs_d_automount(struct path *path)
 {
 	struct vfsmount *newmnt;
-	int err;
 
 	cFYI(1, "in %s", __func__);
 
@@ -361,25 +360,12 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
 		return newmnt;
 	}
 
-	mntget(newmnt);
-	err = do_add_mount(newmnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE,
-			   &cifs_dfs_automount_list);
-	switch (err) {
-	case 0:
-		schedule_delayed_work(&cifs_dfs_automount_task,
-				      cifs_dfs_mountpoint_expiry_timeout);
-		cFYI(1, "leaving %s [ok]" , __func__);
-		return newmnt;
-	case -EBUSY:
-		/* someone else made a mount here whilst we were busy */
-		mntput(newmnt);
-		cFYI(1, "leaving %s [EBUSY]" , __func__);
-		return NULL;
-	default:
-		mntput(newmnt);
-		cFYI(1, "leaving %s [error %d]" , __func__, err);
-		return ERR_PTR(err);
-	}
+	mntget(newmnt); /* prevent immediate expiration */
+	mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
+	schedule_delayed_work(&cifs_dfs_automount_task,
+			      cifs_dfs_mountpoint_expiry_timeout);
+	cFYI(1, "leaving %s [ok]" , __func__);
+	return newmnt;
 }
 
 const struct inode_operations cifs_dfs_referral_inode_operations = {
diff --git a/fs/internal.h b/fs/internal.h
index 9687c2ee2735..4931060fd089 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 extern void release_mounts(struct list_head *);
 extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern int do_add_mount(struct vfsmount *, struct path *, int);
+extern void mnt_clear_expiry(struct vfsmount *);
 
 extern void __init mnt_init(void);
 
diff --git a/fs/namei.c b/fs/namei.c
index 5c89695ae1e4..c2e37727e3ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -900,6 +900,7 @@ static int follow_automount(struct path *path, unsigned flags,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
+	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
 		return -EREMOTE;
@@ -942,22 +943,49 @@ static int follow_automount(struct path *path, unsigned flags,
 			return -EREMOTE;
 		return PTR_ERR(mnt);
 	}
+
 	if (!mnt) /* mount collision */
 		return 0;
 
+	/* The new mount record should have at least 2 refs to prevent it being
+	 * expired before we get a chance to add it
+	 */
+	BUG_ON(mnt_get_count(mnt) < 2);
+
 	if (mnt->mnt_sb == path->mnt->mnt_sb &&
 	    mnt->mnt_root == path->dentry) {
+		mnt_clear_expiry(mnt);
+		mntput(mnt);
 		mntput(mnt);
 		return -ELOOP;
 	}
 
-	dput(path->dentry);
-	if (*need_mntput)
-		mntput(path->mnt);
-	path->mnt = mnt;
-	path->dentry = dget(mnt->mnt_root);
-	*need_mntput = true;
-	return 0;
+	/* We need to add the mountpoint to the parent.  The filesystem may
+	 * have placed it on an expiry list, and so we need to make sure it
+	 * won't be expired under us if do_add_mount() fails (do_add_mount()
+	 * will eat a reference unconditionally).
+	 */
+	mntget(mnt);
+	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	switch (err) {
+	case -EBUSY:
+		/* Someone else made a mount here whilst we were busy */
+		err = 0;
+	default:
+		mnt_clear_expiry(mnt);
+		mntput(mnt);
+		mntput(mnt);
+		return err;
+	case 0:
+		mntput(mnt);
+		dput(path->dentry);
+		if (*need_mntput)
+			mntput(path->mnt);
+		path->mnt = mnt;
+		path->dentry = dget(mnt->mnt_root);
+		*need_mntput = true;
+		return 0;
+	}
 }
 
 /*
diff --git a/fs/namespace.c b/fs/namespace.c
index d94ccd6ddafd..bfcb701f9490 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1925,15 +1925,14 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
-	return do_add_mount(mnt, path, mnt_flags, NULL);
+	return do_add_mount(mnt, path, mnt_flags);
 }
 
 /*
  * add a mount into a namespace's mount tree
- * - provide the option of adding the new mount to an expiration list
+ * - this unconditionally eats one of the caller's references to newmnt.
  */
-int do_add_mount(struct vfsmount *newmnt, struct path *path,
-		 int mnt_flags, struct list_head *fslist)
+int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
 {
 	int err;
 
@@ -1963,9 +1962,6 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	if ((err = graft_tree(newmnt, path)))
 		goto unlock;
 
-	if (fslist) /* add to the specified expiration list */
-		list_add_tail(&newmnt->mnt_expire, fslist);
-
 	up_write(&namespace_sem);
 	return 0;
 
@@ -1975,7 +1971,36 @@ unlock:
 	return err;
 }
 
-EXPORT_SYMBOL_GPL(do_add_mount);
+/**
+ * mnt_set_expiry - Put a mount on an expiration list
+ * @mnt: The mount to list.
+ * @expiry_list: The list to add the mount to.
+ */
+void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
+{
+	down_write(&namespace_sem);
+	br_write_lock(vfsmount_lock);
+
+	list_add_tail(&mnt->mnt_expire, expiry_list);
+
+	br_write_unlock(vfsmount_lock);
+	up_write(&namespace_sem);
+}
+EXPORT_SYMBOL(mnt_set_expiry);
+
+/*
+ * Remove a vfsmount from any expiration list it may be on
+ */
+void mnt_clear_expiry(struct vfsmount *mnt)
+{
+	if (!list_empty(&mnt->mnt_expire)) {
+		down_write(&namespace_sem);
+		br_write_lock(vfsmount_lock);
+		list_del_init(&mnt->mnt_expire);
+		br_write_unlock(vfsmount_lock);
+		up_write(&namespace_sem);
+	}
+}
 
 /*
  * process a list of expirable mountpoints with the intent of discarding any
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f3fbb1bf3f18..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -149,26 +149,10 @@ struct vfsmount *nfs_d_automount(struct path *path)
 	if (IS_ERR(mnt))
 		goto out;
 
-	mntget(mnt);
-	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE,
-			   &nfs_automount_list);
-	switch (err) {
-	case 0:
-		dprintk("%s: done, success\n", __func__);
-		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
-		break;
-	case -EBUSY:
-		/* someone else made a mount here whilst we were busy */
-		mntput(mnt);
-		dprintk("%s: done, collision\n", __func__);
-		mnt = NULL;
-		break;
-	default:
-		mntput(mnt);
-		dprintk("%s: done, error %d\n", __func__, err);
-		mnt = ERR_PTR(err);
-		break;
-	}
+	dprintk("%s: done, success\n", __func__);
+	mntget(mnt); /* prevent immediate expiration */
+	mnt_set_expiry(mnt, &nfs_automount_list);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 
 out:
 	nfs_free_fattr(fattr);
-- 
cgit v1.2.2


From b650c858c26bd9ba29ebc82d30f09355845a294a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 15 Jan 2011 10:51:57 +0000
Subject: autofs4: Merge the remaining dentry ops tables

Merge the remaining autofs4 dentry ops tables.  It doesn't matter if
d_automount and d_manage are present on something that's not mountable or
holdable as these ops are only used if the appropriate flags are set in
dentry->d_flags.

[AV] switch to ->s_d_op, since now _everything_ on autofs4 is using the
same dentry_operations.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  1 -
 fs/autofs4/inode.c    |  6 ++----
 fs/autofs4/root.c     | 17 ++---------------
 3 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c28085cb82a5..1f016bfb42d5 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -207,7 +207,6 @@ extern const struct inode_operations autofs4_dir_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
 extern const struct dentry_operations autofs4_dentry_operations;
-extern const struct dentry_operations autofs4_mount_dentry_operations;
 
 /* VFS automount flags management functions */
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 3ecd2e2bcdbd..9e1a9dad23e1 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -276,6 +276,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs4_sops;
+	s->s_d_op = &autofs4_dentry_operations;
 	s->s_time_gran = 1;
 
 	/*
@@ -293,7 +294,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	d_set_d_op(root, &autofs4_dentry_operations);
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
@@ -304,10 +304,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_dput;
 	}
 
-	if (autofs_type_trigger(sbi->type)) {
-		d_set_d_op(root, &autofs4_mount_dentry_operations);
+	if (autofs_type_trigger(sbi->type))
 		__managed_dentry_set_managed(root);
-	}
 
 	root_inode->i_fop = &autofs4_root_operations;
 	root_inode->i_op = &autofs4_dir_inode_operations;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index dbd95512808c..1dba035fc376 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -66,13 +66,7 @@ const struct inode_operations autofs4_dir_inode_operations = {
 	.rmdir		= autofs4_dir_rmdir,
 };
 
-/* For dentries that don't initiate mounting */
 const struct dentry_operations autofs4_dentry_operations = {
-	.d_release	= autofs4_dentry_release,
-};
-
-/* For dentries that do initiate mounting */
-const struct dentry_operations autofs4_mount_dentry_operations = {
 	.d_automount	= autofs4_d_automount,
 	.d_manage	= autofs4_d_manage,
 	.d_release	= autofs4_dentry_release,
@@ -500,8 +494,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	if (active) {
 		return active;
 	} else {
-		d_set_d_op(dentry, &autofs4_dentry_operations);
-
 		/*
 		 * A dentry that is not within the root can never trigger a
 		 * mount operation, unless the directory already exists, so we
@@ -512,10 +504,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			return ERR_PTR(-ENOENT);
 
 		/* Mark entries in the root as mount triggers */
-		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) {
-			d_set_d_op(dentry, &autofs4_mount_dentry_operations);
+		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
 			__managed_dentry_set_managed(dentry);
-		}
 
 		ino = autofs4_init_ino(NULL, sbi, 0555);
 		if (!ino)
@@ -572,8 +562,6 @@ static int autofs4_dir_symlink(struct inode *dir,
 	}
 	d_add(dentry, inode);
 
-	d_set_d_op(dentry, &autofs4_dentry_operations);
-
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
 	atomic_inc(&ino->count);
@@ -848,8 +836,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
 int is_autofs4_dentry(struct dentry *dentry)
 {
 	return dentry && dentry->d_inode &&
-		(dentry->d_op == &autofs4_mount_dentry_operations ||
-		 dentry->d_op == &autofs4_dentry_operations) &&
+		dentry->d_op == &autofs4_dentry_operations &&
 		dentry->d_fsdata != NULL;
 }
 
-- 
cgit v1.2.2


From f580eb0931fbcb6dc3916f094f471671facd1daa Mon Sep 17 00:00:00 2001
From: Stefan Schmidt <stefan@datenfreihafen.org>
Date: Wed, 12 Jan 2011 09:30:42 +0000
Subject: fs/btrfs: Fix build of ctree

CC [M]  fs/btrfs/ctree.o
In file included from fs/btrfs/ctree.c:21:0:
fs/btrfs/ctree.h:1003:17: error: field <91>super_kobj<92> has incomplete type
fs/btrfs/ctree.h:1074:17: error: field <91>root_kobj<92> has incomplete type
make[2]: *** [fs/btrfs/ctree.o] Error 1
make[1]: *** [fs/btrfs] Error 2
make: *** [fs] Error 2

We need to include kobject.h here.

Reported-by: Jeff Garzik <jeff@garzik.org>
Fix-suggested-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4acd4c611efa..0cb322cc4fc0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kobject.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
-- 
cgit v1.2.2


From 299a08b1c34f9397797946a0fa215c5fd145c5cf Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 5 Jan 2011 10:07:15 +0000
Subject: btrfs: fix wrong data space statistics

Josef has implemented mixed data/metadata chunks, we must add those chunks'
space just like data chunks.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a1a76b2a61f9..caa5bcc62f16 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -790,11 +790,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
-				    BTRFS_BLOCK_GROUP_SYSTEM))
-			total_used_data += found->disk_total;
-		else
+		if (found->flags & BTRFS_BLOCK_GROUP_DATA)
 			total_used_data += found->disk_used;
+		else
+			total_used_data += found->disk_total;
 		total_used += found->disk_used;
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.2


From d52a5b5f1fa40804f681cf9868d4a8f90661bdf3 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 5 Jan 2011 10:07:18 +0000
Subject: btrfs: try to reclaim some space when chunk allocation fails

We cannot write data into files when when there is tiny space in the filesystem.

Reproduce steps:
 # mkfs.btrfs /dev/sda1
 # mount /dev/sda1 /mnt
 # dd if=/dev/zero of=/mnt/tmpfile0 bs=4K count=1
 # dd if=/dev/zero of=/mnt/tmpfile1 bs=4K count=99999999999999
   (fill the filesystem)
 # umount /mnt
 # mount /dev/sda1 /mnt
 # rm -f /mnt/tmpfile0
 # dd if=/dev/zero of=/mnt/tmpfile0 bs=4K count=1
   (failed with nospec)

But if we do the last step again, we can write data successfully. The reason of
the problem is that btrfs didn't try to commit the current transaction and
reclaim some space when chunk allocation failed.

This patch fixes it by committing the current transaction to reclaim some
space when chunk allocation fails.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b180efdc8b68..3c71d95111fe 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3162,8 +3162,12 @@ alloc:
 					     bytes + 2 * 1024 * 1024,
 					     alloc_target, 0);
 			btrfs_end_transaction(trans, root);
-			if (ret < 0)
-				return ret;
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					return ret;
+				else
+					goto commit_trans;
+			}
 
 			if (!data_sinfo) {
 				btrfs_set_inode_space_info(root, inode);
@@ -3174,6 +3178,7 @@ alloc:
 		spin_unlock(&data_sinfo->lock);
 
 		/* commit the current transaction and try again */
+commit_trans:
 		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
 			trans = btrfs_join_transaction(root, 1);
-- 
cgit v1.2.2


From 1974a3b42d8cf7a9c74f1e0310c593023617037a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 5 Jan 2011 10:07:24 +0000
Subject: btrfs: fix wrong calculation of stripe size

There are two tiny problem:
- One is When we check the chunk size is greater than the max chunk size or not,
  we should take mirrors into account, but the original code didn't.
- The other is btrfs shouldn't use the size of the residual free space as the
  length of of a dup chunk when doing chunk allocation. It is because the device
  space that a dup chunk needs is twice as large as the chunk size, if we use
  the size of the residual free space as the length of a dup chunk, we can not
  get enough free space. Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 177b73179590..c50a85e0d08f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2177,6 +2177,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int num_stripes = 1;
 	int min_stripes = 1;
 	int sub_stripes = 0;
+	int ncopies = 1;
 	int looped = 0;
 	int ret;
 	int index;
@@ -2197,12 +2198,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
 		num_stripes = 2;
 		min_stripes = 2;
+		ncopies = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		if (fs_devices->rw_devices < 2)
 			return -ENOSPC;
 		num_stripes = 2;
 		min_stripes = 2;
+		ncopies = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = fs_devices->rw_devices;
@@ -2210,6 +2213,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
+		ncopies = 2;
 		min_stripes = 4;
 	}
 
@@ -2239,8 +2243,8 @@ again:
 		map->num_stripes = num_stripes;
 	}
 
-	if (calc_size * num_stripes > max_chunk_size) {
-		calc_size = max_chunk_size;
+	if (calc_size * num_stripes > max_chunk_size * ncopies) {
+		calc_size = max_chunk_size * ncopies;
 		do_div(calc_size, num_stripes);
 		do_div(calc_size, stripe_len);
 		calc_size *= stripe_len;
@@ -2321,6 +2325,8 @@ again:
 		if (!looped && max_avail > 0) {
 			looped = 1;
 			calc_size = max_avail;
+			if (type & BTRFS_BLOCK_GROUP_DUP)
+				do_div(calc_size, 2);
 			goto again;
 		}
 		kfree(map);
-- 
cgit v1.2.2


From 7bfc837df935d850fe996dfe92ef48975cd4170a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 5 Jan 2011 10:07:26 +0000
Subject: btrfs: restructure find_free_dev_extent()

- make it return the start position and length of the max free space when it can
  not find a suitable free space.
- make it more readability

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |   4 +-
 fs/btrfs/volumes.c     | 155 ++++++++++++++++++++++++++++---------------------
 2 files changed, 91 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3c71d95111fe..1e1c9a177626 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8099,7 +8099,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 	mutex_lock(&root->fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 min_free = btrfs_block_group_used(&block_group->item);
-		u64 dev_offset, max_avail;
+		u64 dev_offset;
 
 		/*
 		 * check to make sure we can actually find a chunk with enough
@@ -8107,7 +8107,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		 */
 		if (device->total_bytes > device->bytes_used + min_free) {
 			ret = find_free_dev_extent(NULL, device, min_free,
-						   &dev_offset, &max_avail);
+						   &dev_offset, NULL);
 			if (!ret)
 				break;
 			ret = -1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c50a85e0d08f..4838bd395e49 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -729,58 +729,82 @@ error:
 }
 
 /*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans:	transaction handler
+ * @device:	the device which we search the free space in
+ * @num_bytes:	the size of the free space that we need
+ * @start:	store the start of the free space.
+ * @len:	the size of the free space. that we find, or the size of the max
+ * 		free space if we don't find suitable free space
+ *
  * this uses a pretty simple search, the expectation is that it is
  * called very infrequently and that a given device has a small number
  * of extents
+ *
+ * @start is used to store the start of the free space if we find. But if we
+ * don't find suitable free space, it will be used to store the start position
+ * of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
  */
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
-			 u64 *start, u64 *max_avail)
+			 u64 *start, u64 *len)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
-	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_dev_extent *dev_extent;
 	struct btrfs_path *path;
-	u64 hole_size = 0;
-	u64 last_byte = 0;
-	u64 search_start = 0;
+	u64 hole_size;
+	u64 max_hole_start;
+	u64 max_hole_size;
+	u64 extent_end;
+	u64 search_start;
 	u64 search_end = device->total_bytes;
 	int ret;
-	int slot = 0;
-	int start_found;
+	int slot;
 	struct extent_buffer *l;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = 2;
-	start_found = 0;
-
 	/* FIXME use last free of some kind */
 
 	/* we don't want to overwrite the superblock on the drive,
 	 * so we make sure to start at an offset of at least 1MB
 	 */
-	search_start = max((u64)1024 * 1024, search_start);
+	search_start = 1024 * 1024;
 
-	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+	if (root->fs_info->alloc_start + num_bytes <= search_end)
 		search_start = max(root->fs_info->alloc_start, search_start);
 
+	max_hole_start = search_start;
+	max_hole_size = 0;
+
+	if (search_start >= search_end) {
+		ret = -ENOSPC;
+		goto error;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	path->reada = 2;
+
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
+
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 	if (ret < 0)
-		goto error;
+		goto out;
 	if (ret > 0) {
 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
 		if (ret < 0)
-			goto error;
-		if (ret > 0)
-			start_found = 1;
+			goto out;
 	}
-	l = path->nodes[0];
-	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
 	while (1) {
 		l = path->nodes[0];
 		slot = path->slots[0];
@@ -789,24 +813,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			if (ret == 0)
 				continue;
 			if (ret < 0)
-				goto error;
-no_more_items:
-			if (!start_found) {
-				if (search_start >= search_end) {
-					ret = -ENOSPC;
-					goto error;
-				}
-				*start = search_start;
-				start_found = 1;
-				goto check_pending;
-			}
-			*start = last_byte > search_start ?
-				last_byte : search_start;
-			if (search_end <= *start) {
-				ret = -ENOSPC;
-				goto error;
-			}
-			goto check_pending;
+				goto out;
+
+			break;
 		}
 		btrfs_item_key_to_cpu(l, &key, slot);
 
@@ -814,48 +823,62 @@ no_more_items:
 			goto next;
 
 		if (key.objectid > device->devid)
-			goto no_more_items;
+			break;
 
-		if (key.offset >= search_start && key.offset > last_byte &&
-		    start_found) {
-			if (last_byte < search_start)
-				last_byte = search_start;
-			hole_size = key.offset - last_byte;
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+			goto next;
 
-			if (hole_size > *max_avail)
-				*max_avail = hole_size;
+		if (key.offset > search_start) {
+			hole_size = key.offset - search_start;
 
-			if (key.offset > last_byte &&
-			    hole_size >= num_bytes) {
-				*start = last_byte;
-				goto check_pending;
+			if (hole_size > max_hole_size) {
+				max_hole_start = search_start;
+				max_hole_size = hole_size;
+			}
+
+			/*
+			 * If this free space is greater than which we need,
+			 * it must be the max free space that we have found
+			 * until now, so max_hole_start must point to the start
+			 * of this free space and the length of this free space
+			 * is stored in max_hole_size. Thus, we return
+			 * max_hole_start and max_hole_size and go back to the
+			 * caller.
+			 */
+			if (hole_size >= num_bytes) {
+				ret = 0;
+				goto out;
 			}
 		}
-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
-			goto next;
 
-		start_found = 1;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+		extent_end = key.offset + btrfs_dev_extent_length(l,
+								  dev_extent);
+		if (extent_end > search_start)
+			search_start = extent_end;
 next:
 		path->slots[0]++;
 		cond_resched();
 	}
-check_pending:
-	/* we have to make sure we didn't find an extent that has already
-	 * been allocated by the map tree or the original allocation
-	 */
-	BUG_ON(*start < search_start);
 
-	if (*start + num_bytes > search_end) {
-		ret = -ENOSPC;
-		goto error;
+	hole_size = search_end- search_start;
+	if (hole_size > max_hole_size) {
+		max_hole_start = search_start;
+		max_hole_size = hole_size;
 	}
-	/* check for pending inserts here */
-	ret = 0;
 
-error:
+	/* See above. */
+	if (hole_size < num_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+
+out:
 	btrfs_free_path(path);
+error:
+	*start = max_hole_start;
+	if (len && max_hole_size > *len)
+		*len = max_hole_size;
 	return ret;
 }
 
-- 
cgit v1.2.2


From b2117a39fa96cf4814e7cab8c11494149ba6f29d Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 5 Jan 2011 10:07:28 +0000
Subject: btrfs: make the chunk allocator utilize the devices better

With this patch, we change the handling method when we can not get enough free
extents with default size.

Implementation:
1. Look up the suitable free extent on each device and keep the search result.
   If not find a suitable free extent, keep the max free extent
2. If we get enough suitable free extents with default size, chunk allocation
   succeeds.
3. If we can not get enough free extents, but the number of the extent with
   default size is >= min_stripes, we just change the mapping information
   (reduce the number of stripes in the extent map), and chunk allocation
   succeeds.
4. If the number of the extent with default size is < min_stripes, sort the
   devices by its max free extent's size descending
5. Use the size of the max free extent on the (num_stripes - 1)th device as the
   stripe size to allocate the device space

By this way, the chunk allocator can allocate chunks as large as possible when
the devices' space is not enough and make full use of the devices.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 379 ++++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/volumes.h |  24 ++++
 2 files changed, 300 insertions(+), 103 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4838bd395e49..c22784b989b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -877,7 +877,7 @@ out:
 	btrfs_free_path(path);
 error:
 	*start = max_hole_start;
-	if (len && max_hole_size > *len)
+	if (len)
 		*len = max_hole_size;
 	return ret;
 }
@@ -2176,70 +2176,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
 		return calc_size * num_stripes;
 }
 
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root,
-			       struct map_lookup **map_ret,
-			       u64 *num_bytes, u64 *stripe_size,
-			       u64 start, u64 type)
+/* Used to sort the devices by max_avail(descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
 {
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_device *device = NULL;
-	struct btrfs_fs_devices *fs_devices = info->fs_devices;
-	struct list_head *cur;
-	struct map_lookup *map = NULL;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
-	struct list_head private_devs;
-	int min_stripe_size = 1 * 1024 * 1024;
-	u64 calc_size = 1024 * 1024 * 1024;
-	u64 max_chunk_size = calc_size;
-	u64 min_free;
-	u64 avail;
-	u64 max_avail = 0;
-	u64 dev_offset;
-	int num_stripes = 1;
-	int min_stripes = 1;
-	int sub_stripes = 0;
-	int ncopies = 1;
-	int looped = 0;
-	int ret;
-	int index;
-	int stripe_len = 64 * 1024;
+	if (((struct btrfs_device_info *)dev_info1)->max_avail >
+	    ((struct btrfs_device_info *)dev_info2)->max_avail)
+		return -1;
+	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+		 ((struct btrfs_device_info *)dev_info2)->max_avail)
+		return 1;
+	else
+		return 0;
+}
 
-	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (type & BTRFS_BLOCK_GROUP_DUP)) {
-		WARN_ON(1);
-		type &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
-	if (list_empty(&fs_devices->alloc_list))
-		return -ENOSPC;
+static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+				 int *num_stripes, int *min_stripes,
+				 int *sub_stripes)
+{
+	*num_stripes = 1;
+	*min_stripes = 1;
+	*sub_stripes = 0;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = fs_devices->rw_devices;
-		min_stripes = 2;
+		*num_stripes = fs_devices->rw_devices;
+		*min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-		num_stripes = 2;
-		min_stripes = 2;
-		ncopies = 2;
+		*num_stripes = 2;
+		*min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		if (fs_devices->rw_devices < 2)
 			return -ENOSPC;
-		num_stripes = 2;
-		min_stripes = 2;
-		ncopies = 2;
+		*num_stripes = 2;
+		*min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = fs_devices->rw_devices;
-		if (num_stripes < 4)
+		*num_stripes = fs_devices->rw_devices;
+		if (*num_stripes < 4)
 			return -ENOSPC;
-		num_stripes &= ~(u32)1;
-		sub_stripes = 2;
-		ncopies = 2;
-		min_stripes = 4;
+		*num_stripes &= ~(u32)1;
+		*sub_stripes = 2;
+		*min_stripes = 4;
 	}
 
+	return 0;
+}
+
+static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+				    u64 proposed_size, u64 type,
+				    int num_stripes, int small_stripe)
+{
+	int min_stripe_size = 1 * 1024 * 1024;
+	u64 calc_size = proposed_size;
+	u64 max_chunk_size = calc_size;
+	int ncopies = 1;
+
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+		    BTRFS_BLOCK_GROUP_DUP |
+		    BTRFS_BLOCK_GROUP_RAID10))
+		ncopies = 2;
+
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_chunk_size = 10 * calc_size;
 		min_stripe_size = 64 * 1024 * 1024;
@@ -2256,51 +2253,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
-again:
-	max_avail = 0;
-	if (!map || map->num_stripes != num_stripes) {
-		kfree(map);
-		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-		if (!map)
-			return -ENOMEM;
-		map->num_stripes = num_stripes;
-	}
-
 	if (calc_size * num_stripes > max_chunk_size * ncopies) {
 		calc_size = max_chunk_size * ncopies;
 		do_div(calc_size, num_stripes);
-		do_div(calc_size, stripe_len);
-		calc_size *= stripe_len;
+		do_div(calc_size, BTRFS_STRIPE_LEN);
+		calc_size *= BTRFS_STRIPE_LEN;
 	}
 
 	/* we don't want tiny stripes */
-	if (!looped)
+	if (!small_stripe)
 		calc_size = max_t(u64, min_stripe_size, calc_size);
 
 	/*
-	 * we're about to do_div by the stripe_len so lets make sure
+	 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
 	 * we end up with something bigger than a stripe
 	 */
-	calc_size = max_t(u64, calc_size, stripe_len * 4);
+	calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+
+	do_div(calc_size, BTRFS_STRIPE_LEN);
+	calc_size *= BTRFS_STRIPE_LEN;
+
+	return calc_size;
+}
+
+static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+						      int num_stripes)
+{
+	struct map_lookup *new;
+	size_t len = map_lookup_size(num_stripes);
+
+	BUG_ON(map->num_stripes < num_stripes);
+
+	if (map->num_stripes == num_stripes)
+		return map;
+
+	new = kmalloc(len, GFP_NOFS);
+	if (!new) {
+		/* just change map->num_stripes */
+		map->num_stripes = num_stripes;
+		return map;
+	}
+
+	memcpy(new, map, len);
+	new->num_stripes = num_stripes;
+	kfree(map);
+	return new;
+}
+
+/*
+ * helper to allocate device space from btrfs_device_info, in which we stored
+ * max free space information of every device. It is used when we can not
+ * allocate chunks by default size.
+ *
+ * By this helper, we can allocate a new chunk as larger as possible.
+ */
+static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_devices *fs_devices,
+				    struct btrfs_device_info *devices,
+				    int nr_device, u64 type,
+				    struct map_lookup **map_lookup,
+				    int min_stripes, u64 *stripe_size)
+{
+	int i, index, sort_again = 0;
+	int min_devices = min_stripes;
+	u64 max_avail, min_free;
+	struct map_lookup *map = *map_lookup;
+	int ret;
+
+	if (nr_device < min_stripes)
+		return -ENOSPC;
+
+	btrfs_descending_sort_devices(devices, nr_device);
+
+	max_avail = devices[0].max_avail;
+	if (!max_avail)
+		return -ENOSPC;
+
+	for (i = 0; i < nr_device; i++) {
+		/*
+		 * if dev_offset = 0, it means the free space of this device
+		 * is less than what we need, and we didn't search max avail
+		 * extent on this device, so do it now.
+		 */
+		if (!devices[i].dev_offset) {
+			ret = find_free_dev_extent(trans, devices[i].dev,
+						   max_avail,
+						   &devices[i].dev_offset,
+						   &devices[i].max_avail);
+			if (ret != 0 && ret != -ENOSPC)
+				return ret;
+			sort_again = 1;
+		}
+	}
+
+	/* we update the max avail free extent of each devices, sort again */
+	if (sort_again)
+		btrfs_descending_sort_devices(devices, nr_device);
+
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_devices = 1;
+
+	if (!devices[min_devices - 1].max_avail)
+		return -ENOSPC;
+
+	max_avail = devices[min_devices - 1].max_avail;
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		do_div(max_avail, 2);
 
-	do_div(calc_size, stripe_len);
-	calc_size *= stripe_len;
+	max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+					     min_stripes, 1);
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_free = max_avail * 2;
+	else
+		min_free = max_avail;
+
+	if (min_free > devices[min_devices - 1].max_avail)
+		return -ENOSPC;
+
+	map = __shrink_map_lookup_stripes(map, min_stripes);
+	*stripe_size = max_avail;
+
+	index = 0;
+	for (i = 0; i < min_stripes; i++) {
+		map->stripes[i].dev = devices[index].dev;
+		map->stripes[i].physical = devices[index].dev_offset;
+		if (type & BTRFS_BLOCK_GROUP_DUP) {
+			i++;
+			map->stripes[i].dev = devices[index].dev;
+			map->stripes[i].physical = devices[index].dev_offset +
+						   max_avail;
+		}
+		index++;
+	}
+	*map_lookup = map;
+
+	return 0;
+}
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root,
+			       struct map_lookup **map_ret,
+			       u64 *num_bytes, u64 *stripe_size,
+			       u64 start, u64 type)
+{
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct btrfs_device *device = NULL;
+	struct btrfs_fs_devices *fs_devices = info->fs_devices;
+	struct list_head *cur;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_device_info *devices_info;
+	struct list_head private_devs;
+	u64 calc_size = 1024 * 1024 * 1024;
+	u64 min_free;
+	u64 avail;
+	u64 dev_offset;
+	int num_stripes;
+	int min_stripes;
+	int sub_stripes;
+	int min_devices;	/* the min number of devices we need */
+	int i;
+	int ret;
+	int index;
+
+	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+	    (type & BTRFS_BLOCK_GROUP_DUP)) {
+		WARN_ON(1);
+		type &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
+	if (list_empty(&fs_devices->alloc_list))
+		return -ENOSPC;
+
+	ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+				    &min_stripes, &sub_stripes);
+	if (ret)
+		return ret;
+
+	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+			       GFP_NOFS);
+	if (!devices_info)
+		return -ENOMEM;
+
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	map->num_stripes = num_stripes;
 
 	cur = fs_devices->alloc_list.next;
 	index = 0;
+	i = 0;
 
-	if (type & BTRFS_BLOCK_GROUP_DUP)
+	calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+					     num_stripes, 0);
+
+	if (type & BTRFS_BLOCK_GROUP_DUP) {
 		min_free = calc_size * 2;
-	else
+		min_devices = 1;
+	} else {
 		min_free = calc_size;
-
-	/*
-	 * we add 1MB because we never use the first 1MB of the device, unless
-	 * we've looped, then we are likely allocating the maximum amount of
-	 * space left already
-	 */
-	if (!looped)
-		min_free += 1024 * 1024;
+		min_devices = min_stripes;
+	}
 
 	INIT_LIST_HEAD(&private_devs);
 	while (index < num_stripes) {
@@ -2313,27 +2468,39 @@ again:
 		cur = cur->next;
 
 		if (device->in_fs_metadata && avail >= min_free) {
-			ret = find_free_dev_extent(trans, device,
-						   min_free, &dev_offset,
-						   &max_avail);
+			ret = find_free_dev_extent(trans, device, min_free,
+						   &devices_info[i].dev_offset,
+						   &devices_info[i].max_avail);
 			if (ret == 0) {
 				list_move_tail(&device->dev_alloc_list,
 					       &private_devs);
 				map->stripes[index].dev = device;
-				map->stripes[index].physical = dev_offset;
+				map->stripes[index].physical =
+						devices_info[i].dev_offset;
 				index++;
 				if (type & BTRFS_BLOCK_GROUP_DUP) {
 					map->stripes[index].dev = device;
 					map->stripes[index].physical =
-						dev_offset + calc_size;
+						devices_info[i].dev_offset +
+						calc_size;
 					index++;
 				}
-			}
-		} else if (device->in_fs_metadata && avail > max_avail)
-			max_avail = avail;
+			} else if (ret != -ENOSPC)
+				goto error;
+
+			devices_info[i].dev = device;
+			i++;
+		} else if (device->in_fs_metadata &&
+			   avail >= BTRFS_STRIPE_LEN) {
+			devices_info[i].dev = device;
+			devices_info[i].max_avail = avail;
+			i++;
+		}
+
 		if (cur == &fs_devices->alloc_list)
 			break;
 	}
+
 	list_splice(&private_devs, &fs_devices->alloc_list);
 	if (index < num_stripes) {
 		if (index >= min_stripes) {
@@ -2342,36 +2509,36 @@ again:
 				num_stripes /= sub_stripes;
 				num_stripes *= sub_stripes;
 			}
-			looped = 1;
-			goto again;
-		}
-		if (!looped && max_avail > 0) {
-			looped = 1;
-			calc_size = max_avail;
-			if (type & BTRFS_BLOCK_GROUP_DUP)
-				do_div(calc_size, 2);
-			goto again;
+
+			map = __shrink_map_lookup_stripes(map, num_stripes);
+		} else if (i >= min_devices) {
+			ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+						       devices_info, i, type,
+						       &map, min_stripes,
+						       &calc_size);
+			if (ret)
+				goto error;
+		} else {
+			ret = -ENOSPC;
+			goto error;
 		}
-		kfree(map);
-		return -ENOSPC;
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = stripe_len;
-	map->io_align = stripe_len;
-	map->io_width = stripe_len;
+	map->stripe_len = BTRFS_STRIPE_LEN;
+	map->io_align = BTRFS_STRIPE_LEN;
+	map->io_width = BTRFS_STRIPE_LEN;
 	map->type = type;
-	map->num_stripes = num_stripes;
 	map->sub_stripes = sub_stripes;
 
 	*map_ret = map;
 	*stripe_size = calc_size;
 	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
+					 map->num_stripes, sub_stripes);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
-		kfree(map);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto error;
 	}
 	em->bdev = (struct block_device *)map;
 	em->start = start;
@@ -2404,7 +2571,13 @@ again:
 		index++;
 	}
 
+	kfree(devices_info);
 	return 0;
+
+error:
+	kfree(map);
+	kfree(devices_info);
+	return ret;
 }
 
 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a668c0116982..a5cfedf393f9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
 #define __BTRFS_VOLUMES_
 
 #include <linux/bio.h>
+#include <linux/sort.h>
 #include "async-thread.h"
 
+#define BTRFS_STRIPE_LEN	(64 * 1024)
+
 struct buffer_head;
 struct btrfs_pending_bios {
 	struct bio *head;
@@ -137,6 +140,27 @@ struct btrfs_multi_bio {
 	struct btrfs_bio_stripe stripes[];
 };
 
+struct btrfs_device_info {
+	struct btrfs_device *dev;
+	u64 dev_offset;
+	u64 max_avail;
+};
+
+/* Used to sort the devices by max_avail(descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+
+/*
+ * sort the devices by max_avail, in which max free extent size of each device
+ * is stored.(Descending Sort)
+ */
+static inline void btrfs_descending_sort_devices(
+					struct btrfs_device_info *devices,
+					size_t nr_devices)
+{
+	sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+	     btrfs_cmp_device_free_bytes, NULL);
+}
+
 #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
-- 
cgit v1.2.2


From 6d07bcec969af335d4e35b3921131b7929bd634e Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 5 Jan 2011 10:07:31 +0000
Subject: btrfs: fix wrong free space information of btrfs

When we store data by raid profile in btrfs with two or more different size
disks, df command shows there is some free space in the filesystem, but the
user can not write any data in fact, df command shows the wrong free space
information of btrfs.

 # mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10
 # btrfs-show
 Label: none  uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
 	Total devices 2 FS bytes used 28.00KB
 	devid    1 size 5.01GB used 2.03GB path /dev/sda9
 	devid    2 size 10.00GB used 2.01GB path /dev/sda10
 # btrfs device scan /dev/sda9 /dev/sda10
 # mount /dev/sda9 /mnt
 # dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999
   (fill the filesystem)
 # sync
 # df -TH
 Filesystem	Type	Size	Used	Avail	Use%	Mounted on
 /dev/sda9	btrfs	17G	8.6G	5.4G	62%	/mnt
 # btrfs-show
 Label: none  uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
 	Total devices 2 FS bytes used 3.99GB
 	devid    1 size 5.01GB used 5.01GB path /dev/sda9
 	devid    2 size 10.00GB used 4.99GB path /dev/sda10

It is because btrfs cannot allocate chunks when one of the pairing disks has
no space, the free space on the other disks can not be used for ever, and should
be subtracted from the total space, but btrfs doesn't subtract this space from
the total. It is strange to the user.

This patch fixes it by calcing the free space that can be used to allocate
chunks.

Implementation:
1. get all the devices free space, and align them by stripe length.
2. sort the devices by the free space.
3. check the free space of the devices,
   3.1. if it is not zero, and then check the number of the devices that has
        more free space than this device,
        if the number of the devices is beyond the min stripe number, the free
        space can be used, and add into total free space.
        if the number of the devices is below the min stripe number, we can not
        use the free space, the check ends.
   3.2. if the free space is zero, check the next devices, goto 3.1

This implementation is just likely fake chunk allocation.

After appling this patch, df can show correct space information:
 # df -TH
 Filesystem	Type	Size	Used	Avail	Use%	Mounted on
 /dev/sda9	btrfs	17G	8.6G	0	100%	/mnt

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c |  58 +++++++++++++++++++-
 fs/btrfs/super.c       | 146 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.c     |  84 ++++++++++++++++++++++++++++
 fs/btrfs/volumes.h     |   3 +
 5 files changed, 286 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0cb322cc4fc0..0995f4f68d7a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2158,6 +2158,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2201,6 +2202,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1e1c9a177626..04bfc3a2bd9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3090,7 +3090,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 	return btrfs_reduce_alloc_profile(root, flags);
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
 
@@ -8019,6 +8019,62 @@ out:
 	return ret;
 }
 
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * list. takes mirrors into account.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 free_bytes = 0;
+	int factor;
+
+	list_for_each_entry(block_group, groups_list, list) {
+		spin_lock(&block_group->lock);
+
+		if (!block_group->ro) {
+			spin_unlock(&block_group->lock);
+			continue;
+		}
+
+		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					  BTRFS_BLOCK_GROUP_RAID10 |
+					  BTRFS_BLOCK_GROUP_DUP))
+			factor = 2;
+		else
+			factor = 1;
+
+		free_bytes += (block_group->key.offset -
+			       btrfs_block_group_used(&block_group->item)) *
+			       factor;
+
+		spin_unlock(&block_group->lock);
+	}
+
+	return free_bytes;
+}
+
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+	int i;
+	u64 free_bytes = 0;
+
+	spin_lock(&sinfo->lock);
+
+	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		if (!list_empty(&sinfo->block_groups[i]))
+			free_bytes += __btrfs_get_ro_block_group_free_space(
+						&sinfo->block_groups[i]);
+
+	spin_unlock(&sinfo->lock);
+
+	return free_bytes;
+}
+
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			      struct btrfs_block_group_cache *cache)
 {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index caa5bcc62f16..2963376e77f4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -777,6 +777,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
+/*
+ * The helper to calc the free space on the devices that can be used to store
+ * file data.
+ */
+static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_device_info *devices_info;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	u64 skip_space;
+	u64 type;
+	u64 avail_space;
+	u64 used_space;
+	u64 min_stripe_size;
+	int min_stripes = 1;
+	int i = 0, nr_devices;
+	int ret;
+
+	nr_devices = fs_info->fs_devices->rw_devices;
+	BUG_ON(!nr_devices);
+
+	devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+			       GFP_NOFS);
+	if (!devices_info)
+		return -ENOMEM;
+
+	/* calc min stripe number for data space alloction */
+	type = btrfs_get_alloc_profile(root, 1);
+	if (type & BTRFS_BLOCK_GROUP_RAID0)
+		min_stripes = 2;
+	else if (type & BTRFS_BLOCK_GROUP_RAID1)
+		min_stripes = 2;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		min_stripes = 4;
+
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+	else
+		min_stripe_size = BTRFS_STRIPE_LEN;
+
+	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+		if (!device->in_fs_metadata)
+			continue;
+
+		avail_space = device->total_bytes - device->bytes_used;
+
+		/* align with stripe_len */
+		do_div(avail_space, BTRFS_STRIPE_LEN);
+		avail_space *= BTRFS_STRIPE_LEN;
+
+		/*
+		 * In order to avoid overwritting the superblock on the drive,
+		 * btrfs starts at an offset of at least 1MB when doing chunk
+		 * allocation.
+		 */
+		skip_space = 1024 * 1024;
+
+		/* user can set the offset in fs_info->alloc_start. */
+		if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+		    device->total_bytes)
+			skip_space = max(fs_info->alloc_start, skip_space);
+
+		/*
+		 * btrfs can not use the free space in [0, skip_space - 1],
+		 * we must subtract it from the total. In order to implement
+		 * it, we account the used space in this range first.
+		 */
+		ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+						     &used_space);
+		if (ret) {
+			kfree(devices_info);
+			return ret;
+		}
+
+		/* calc the free space in [0, skip_space - 1] */
+		skip_space -= used_space;
+
+		/*
+		 * we can use the free space in [0, skip_space - 1], subtract
+		 * it from the total.
+		 */
+		if (avail_space && avail_space >= skip_space)
+			avail_space -= skip_space;
+		else
+			avail_space = 0;
+
+		if (avail_space < min_stripe_size)
+			continue;
+
+		devices_info[i].dev = device;
+		devices_info[i].max_avail = avail_space;
+
+		i++;
+	}
+
+	nr_devices = i;
+
+	btrfs_descending_sort_devices(devices_info, nr_devices);
+
+	i = nr_devices - 1;
+	avail_space = 0;
+	while (nr_devices >= min_stripes) {
+		if (devices_info[i].max_avail >= min_stripe_size) {
+			int j;
+			u64 alloc_size;
+
+			avail_space += devices_info[i].max_avail * min_stripes;
+			alloc_size = devices_info[i].max_avail;
+			for (j = i + 1 - min_stripes; j <= i; j++)
+				devices_info[j].max_avail -= alloc_size;
+		}
+		i--;
+		nr_devices--;
+	}
+
+	kfree(devices_info);
+	*free_bytes = avail_space;
+	return 0;
+}
+
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -784,16 +905,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct list_head *head = &root->fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
-	u64 total_used_data = 0;
+	u64 total_free_data = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
 	__be32 *fsid = (__be32 *)root->fs_info->fsid;
+	int ret;
 
+	/* holding chunk_muext to avoid allocating new chunks */
+	mutex_lock(&root->fs_info->chunk_mutex);
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-			total_used_data += found->disk_used;
-		else
-			total_used_data += found->disk_total;
+		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+			total_free_data += found->disk_total - found->disk_used;
+			total_free_data -=
+				btrfs_account_ro_block_groups_free_space(found);
+		}
+
 		total_used += found->disk_used;
 	}
 	rcu_read_unlock();
@@ -801,9 +927,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
 	buf->f_bfree = buf->f_blocks - (total_used >> bits);
-	buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
+	buf->f_bavail = total_free_data;
+	ret = btrfs_calc_avail_data_space(root, &total_free_data);
+	if (ret) {
+		mutex_unlock(&root->fs_info->chunk_mutex);
+		return ret;
+	}
+	buf->f_bavail += total_free_data;
+	buf->f_bavail = buf->f_bavail >> bits;
+	mutex_unlock(&root->fs_info->chunk_mutex);
 
 	/* We treat it as constant endianness (it doesn't matter _which_)
 	   because we want the fsid to come out the same whether mounted
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c22784b989b7..0c7f478cf645 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -728,6 +728,90 @@ error:
 	return ret;
 }
 
+/* helper to account the used device space in the range */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+				   u64 end, u64 *length)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent;
+	struct btrfs_path *path;
+	u64 extent_end;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+
+	*length = 0;
+
+	if (start >= device->total_bytes)
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 2;
+
+	key.objectid = device->devid;
+	key.offset = start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid, key.type);
+		if (ret < 0)
+			goto out;
+	}
+
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto out;
+
+			break;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			break;
+
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+			goto next;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		extent_end = key.offset + btrfs_dev_extent_length(l,
+								  dev_extent);
+		if (key.offset <= start && extent_end > end) {
+			*length = end - start + 1;
+			break;
+		} else if (key.offset <= start && extent_end > start)
+			*length += extent_end - start;
+		else if (key.offset > start && extent_end <= end)
+			*length += extent_end - key.offset;
+		else if (key.offset > start && key.offset <= end) {
+			*length += end - key.offset + 1;
+			break;
+		} else if (key.offset > end)
+			break;
+
+next:
+		path->slots[0]++;
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * find_free_dev_extent - find free space in the specified device
  * @trans:	transaction handler
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a5cfedf393f9..7af6144a7954 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -161,6 +161,9 @@ static inline void btrfs_descending_sort_devices(
 	     btrfs_cmp_device_free_bytes, NULL);
 }
 
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+				   u64 end, u64 *length);
+
 #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
-- 
cgit v1.2.2


From 42838bb265b9cff3de9587fcacc398b5112dc2d9 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 6 Jan 2011 21:45:21 +0000
Subject: btrfs: Mem leak in btrfs_get_acl()

It seems to me that we leak the memory allocated to 'value' in
btrfs_get_acl() if the call to posix_acl_from_xattr() fails.
Here's a patch that attempts to correct that problem.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..6d1410e392d3 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl))
+			if (IS_ERR(acl)) {
+				kfree(value);
 				return acl;
+			}
 			set_cached_acl(inode, type, acl);
 		}
 		kfree(value);
-- 
cgit v1.2.2


From 20b450773d17e325190c158e10bfdb25dc21d2d6 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Sat, 8 Jan 2011 10:09:13 +0000
Subject: btrfs: mount failure return value fix

I happened to pass swap partition as root partition in cmdline,
then kernel panic and tell me about "Cannot open root device".
It is not correct, in fact it is a fs type mismatch instead of 'no device'.

Eventually I found btrfs mounting failed with -EIO, it should be -EINVAL.
The logic in init/do_mounts.c:
        for (p = fs_names; *p; p += strlen(p)+1) {
                int err = do_mount_root(name, p, flags, root_mount_data);
                switch (err) {
                        case 0:
                                goto out;
                        case -EACCES:
                                flags |= MS_RDONLY;
                                goto retry;
                        case -EINVAL:
                                continue;
                }
		print "Cannot open root device"
		panic
	}
SO fs type after btrfs will have no chance to mount

Here fix the return value as -EINVAL

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 +++-
 fs/btrfs/volumes.c | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f88eb2ce7919..f9efb68fc2e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1713,8 +1713,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (!bh)
+	if (!bh) {
+		err = -EINVAL;
 		goto fail_iput;
+	}
 
 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0c7f478cf645..e8be478178aa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -600,8 +600,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 
 		bh = btrfs_read_dev_super(bdev);
-		if (!bh)
+		if (!bh) {
+			ret = -EINVAL;
 			goto error_close;
+		}
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -702,7 +704,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error_close;
 	bh = btrfs_read_dev_super(bdev);
 	if (!bh) {
-		ret = -EIO;
+		ret = -EINVAL;
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1302,7 +1304,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		set_blocksize(bdev, 4096);
 		bh = btrfs_read_dev_super(bdev);
 		if (!bh) {
-			ret = -EIO;
+			ret = -EINVAL;
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-- 
cgit v1.2.2


From ff175d57f057f77d2d3031d674c2af9167a4af02 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sat, 25 Dec 2010 21:22:30 +0000
Subject: btrfs: Don't pass NULL ptr to func that may deref it.

Hi,

In fs/btrfs/inode.c::fixup_tree_root_location() we have this code:

...
 		if (!path) {
 			err = -ENOMEM;
 			goto out;
 		}
...
 	out:
 		btrfs_free_path(path);
 		return err;

btrfs_free_path() passes its argument on to other functions and some of
them end up dereferencing the pointer.
In the code above that pointer is clearly NULL, so btrfs_free_path() will
eventually cause a NULL dereference.

There are many ways to cut this cake (fix the bug). The one I chose was to
make btrfs_free_path() deal gracefully with NULL pointers. If you
disagree, feel free to come up with an alternative patch.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..99599f1c1554 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+	if (!p)
+		return;
 	btrfs_release_path(NULL, p);
 	kmem_cache_free(btrfs_path_cachep, p);
 }
-- 
cgit v1.2.2


From 91ca338d776e0cefb255bf2979b6448febd880f5 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Wed, 5 Jan 2011 02:32:22 +0000
Subject: btrfs: check NULL or not

Should check if functions returns NULL or not.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 6 ++++++
 fs/btrfs/disk-io.c   | 8 ++++++++
 fs/btrfs/extent_io.c | 2 ++
 3 files changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 99599f1c1554..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2516,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);
 
 	right = read_node_slot(root, upper, slot + 1);
+	if (right == NULL)
+		return 1;
+
 	btrfs_tree_lock(right);
 	btrfs_set_lock_blocking(right);
 
@@ -2766,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
+	if (left == NULL)
+		return 1;
+
 	btrfs_tree_lock(left);
 	btrfs_set_lock_blocking(left);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f9efb68fc2e3..a0c37b2ee9ed 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -353,6 +353,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		WARN_ON(1);
+		goto out;
+	}
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
@@ -427,6 +431,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		ret = -EIO;
+		goto out;
+	}
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f1d198128959..8b8d3d99ae68 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3075,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 #endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+	if (eb == NULL)
+		return NULL;
 	eb->start = start;
 	eb->len = len;
 	spin_lock_init(&eb->lock);
-- 
cgit v1.2.2


From 5e540f7715b8cd83b8e60beaaa525b125cc122de Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 27 Dec 2010 06:53:10 +0000
Subject: btrfs: Fix memory leak in btrfs_read_fs_root_no_radix()

In btrfs_read_fs_root_no_radix(), 'root' is not freed if
btrfs_search_slot() returns error.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a0c37b2ee9ed..9b1dd4138072 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,6 +1153,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	}
 	btrfs_free_path(path);
 	if (ret) {
+		kfree(root);
 		if (ret > 0)
 			ret = -ENOENT;
 		return ERR_PTR(ret);
-- 
cgit v1.2.2


From f690efb1aa2a961dd6655529c1797fcac60ad6d9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 12 Jan 2011 21:04:22 +0000
Subject: Btrfs: don't warn if we get ENOSPC in btrfs_block_rsv_check

If we run low on space we could get a bunch of warnings out of
btrfs_block_rsv_check, but this is mostly just called via the transaction code
to see if we need to end the transaction, it expects to see failures, so let's
not WARN and freak everybody out for no reason.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 04bfc3a2bd9f..055b837eab19 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3727,11 +3727,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	WARN_ON(1);
-	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-		block_rsv->size, block_rsv->reserved,
-		block_rsv->freed[0], block_rsv->freed[1]);
-
 	return -ENOSPC;
 }
 
-- 
cgit v1.2.2


From 6f88a4403def422bd8e276ddf6863d6ac71435d2 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Wed, 29 Dec 2010 14:55:03 +0000
Subject: btrfs: Require CAP_SYS_ADMIN for filesystem rebalance

Filesystem rebalancing (BTRFS_IOC_BALANCE) affects the entire
filesystem and may run uninterruptibly for a long time.  This does not
seem to be something that an unprivileged user should be able to do.

Reported-by: Aron Xu <happyaron.xu@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e8be478178aa..f2d2f4ccc738 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/random.h>
 #include <linux/iocontext.h>
+#include <linux/capability.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -2024,6 +2025,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
 
-- 
cgit v1.2.2


From 7b8a53fd815deb39542085897743fa0063f9fe06 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 15 Jan 2011 20:08:44 -0500
Subject: fix old umount_tree() breakage

Expiry-related code calls umount_tree() several times with
the same list to collect vfsmounts to.  Which is fine, except
that umount_tree() implicitly assumed that the list would
be empty on each call - it moves the victims over there and
then iterates through the list kicking them out.  It's *almost*
idempotent, so everything nearly worked.  However, mnt->ghosts
handling (and thus expirability checks) had been broken - that
part was not idempotent...

The fix is trivial - use local temporary list, splice it to
the the collector list when we are through.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index bfcb701f9490..d7fc05fac753 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1226,15 +1226,16 @@ void release_mounts(struct list_head *head)
  */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
+	LIST_HEAD(tmp_list);
 	struct vfsmount *p;
 
 	for (p = mnt; p; p = next_mnt(p, mnt))
-		list_move(&p->mnt_hash, kill);
+		list_move(&p->mnt_hash, &tmp_list);
 
 	if (propagate)
-		propagate_umount(kill);
+		propagate_umount(&tmp_list);
 
-	list_for_each_entry(p, kill, mnt_hash) {
+	list_for_each_entry(p, &tmp_list, mnt_hash) {
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
@@ -1246,6 +1247,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
+	list_splice(&tmp_list, kill);
 }
 
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
-- 
cgit v1.2.2


From f03c65993b98eeb909a4012ce7833c5857d74755 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 14 Jan 2011 22:30:21 -0500
Subject: sanitize vfsmount refcounting changes

Instead of splitting refcount between (per-cpu) mnt_count
and (SMP-only) mnt_longrefs, make all references contribute
to mnt_count again and keep track of how many are longterm
ones.

Accounting rules for longterm count:
	* 1 for each fs_struct.root.mnt
	* 1 for each fs_struct.pwd.mnt
	* 1 for having non-NULL ->mnt_ns
	* decrement to 0 happens only under vfsmount lock exclusive

That allows nice common case for mntput() - since we can't drop the
final reference until after mnt_longterm has reached 0 due to the rules
above, mntput() can grab vfsmount lock shared and check mnt_longterm.
If it turns out to be non-zero (which is the common case), we know
that this is not the final mntput() and can just blindly decrement
percpu mnt_count.  Otherwise we grab vfsmount lock exclusive and
do usual decrement-and-check of percpu mnt_count.

For fs_struct.c we have mnt_make_longterm() and mnt_make_shortterm();
namespace.c uses the latter in places where we don't already hold
vfsmount lock exclusive and opencodes a few remaining spots where
we need to manipulate mnt_longterm.

Note that we mostly revert the code outside of fs/namespace.c back
to what we used to have; in particular, normal code doesn't need
to care about two kinds of references, etc.  And we get to keep
the optimization Nick's variant had bought us...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/anon_inodes.c |   2 +-
 fs/fs_struct.c   |  35 +++++++++++------
 fs/internal.h    |   3 ++
 fs/namei.c       |  24 ------------
 fs/namespace.c   | 116 +++++++++++++++++++++----------------------------------
 fs/pipe.c        |   2 +-
 fs/super.c       |   2 +-
 7 files changed, 73 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index cbe57f3c4d89..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -233,7 +233,7 @@ static int __init anon_inode_init(void)
 	return 0;
 
 err_mntput:
-	mntput_long(anon_inode_mnt);
+	mntput(anon_inode_mnt);
 err_unregister_filesystem:
 	unregister_filesystem(&anon_inode_fs_type);
 err_exit:
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 68ca487bedb1..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
 #include <linux/path.h>
 #include <linux/slab.h>
 #include <linux/fs_struct.h>
+#include "internal.h"
+
+static inline void path_get_longterm(struct path *path)
+{
+	path_get(path);
+	mnt_make_longterm(path->mnt);
+}
+
+static inline void path_put_longterm(struct path *path)
+{
+	mnt_make_shortterm(path->mnt);
+	path_put(path);
+}
 
 /*
  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 	write_seqcount_begin(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
-	path_get_long(path);
+	path_get_longterm(path);
 	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 	if (old_root.dentry)
-		path_put_long(&old_root);
+		path_put_longterm(&old_root);
 }
 
 /*
@@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 	write_seqcount_begin(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
-	path_get_long(path);
+	path_get_longterm(path);
 	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
-		path_put_long(&old_pwd);
+		path_put_longterm(&old_pwd);
 }
 
 void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 			write_seqcount_begin(&fs->seq);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
-				path_get_long(new_root);
+				path_get_longterm(new_root);
 				fs->root = *new_root;
 				count++;
 			}
 			if (fs->pwd.dentry == old_root->dentry
 			    && fs->pwd.mnt == old_root->mnt) {
-				path_get_long(new_root);
+				path_get_longterm(new_root);
 				fs->pwd = *new_root;
 				count++;
 			}
@@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 	while (count--)
-		path_put_long(old_root);
+		path_put_longterm(old_root);
 }
 
 void free_fs_struct(struct fs_struct *fs)
 {
-	path_put_long(&fs->root);
-	path_put_long(&fs->pwd);
+	path_put_longterm(&fs->root);
+	path_put_longterm(&fs->pwd);
 	kmem_cache_free(fs_cachep, fs);
 }
 
@@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 
 		spin_lock(&old->lock);
 		fs->root = old->root;
-		path_get_long(&fs->root);
+		path_get_longterm(&fs->root);
 		fs->pwd = old->pwd;
-		path_get_long(&fs->pwd);
+		path_get_longterm(&fs->pwd);
 		spin_unlock(&old->lock);
 	}
 	return fs;
diff --git a/fs/internal.h b/fs/internal.h
index 4931060fd089..12ccb86edef7 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,6 +73,9 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 extern int do_add_mount(struct vfsmount *, struct path *, int);
 extern void mnt_clear_expiry(struct vfsmount *);
 
+extern void mnt_make_longterm(struct vfsmount *);
+extern void mnt_make_shortterm(struct vfsmount *);
+
 extern void __init mnt_init(void);
 
 DECLARE_BRLOCK(vfsmount_lock);
diff --git a/fs/namei.c b/fs/namei.c
index c2e37727e3ab..8f7b41a14882 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -367,18 +367,6 @@ void path_get(struct path *path)
 }
 EXPORT_SYMBOL(path_get);
 
-/**
- * path_get_long - get a long reference to a path
- * @path: path to get the reference to
- *
- * Given a path increment the reference count to the dentry and the vfsmount.
- */
-void path_get_long(struct path *path)
-{
-	mntget_long(path->mnt);
-	dget(path->dentry);
-}
-
 /**
  * path_put - put a reference to a path
  * @path: path to put the reference to
@@ -392,18 +380,6 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
-/**
- * path_put_long - put a long reference to a path
- * @path: path to put the reference to
- *
- * Given a path decrement the reference count to the dentry and the vfsmount.
- */
-void path_put_long(struct path *path)
-{
-	dput(path->dentry);
-	mntput_long(path->mnt);
-}
-
 /**
  * nameidata_drop_rcu - drop this nameidata out of rcu-walk
  * @nd: nameidata pathwalk data to drop
diff --git a/fs/namespace.c b/fs/namespace.c
index d7fc05fac753..48809e21f270 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt)
 unsigned int mnt_get_count(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-	unsigned int count = atomic_read(&mnt->mnt_longrefs);
+	unsigned int count = 0;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
@@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		if (!mnt->mnt_pcp)
 			goto out_free_devname;
 
-		atomic_set(&mnt->mnt_longrefs, 1);
+		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
 #else
 		mnt->mnt_count = 1;
 		mnt->mnt_writers = 0;
@@ -624,8 +624,11 @@ static void commit_tree(struct vfsmount *mnt)
 	BUG_ON(parent == mnt);
 
 	list_add_tail(&head, &mnt->mnt_list);
-	list_for_each_entry(m, &head, mnt_list)
+	list_for_each_entry(m, &head, mnt_list) {
 		m->mnt_ns = n;
+		atomic_inc(&m->mnt_longterm);
+	}
+
 	list_splice(&head, n->list.prev);
 
 	list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -734,51 +737,30 @@ static inline void mntfree(struct vfsmount *mnt)
 	deactivate_super(sb);
 }
 
-#ifdef CONFIG_SMP
-static inline void __mntput(struct vfsmount *mnt, int longrefs)
+static void mntput_no_expire(struct vfsmount *mnt)
 {
-	if (!longrefs) {
 put_again:
-		br_read_lock(vfsmount_lock);
-		if (likely(atomic_read(&mnt->mnt_longrefs))) {
-			mnt_dec_count(mnt);
-			br_read_unlock(vfsmount_lock);
-			return;
-		}
+#ifdef CONFIG_SMP
+	br_read_lock(vfsmount_lock);
+	if (likely(atomic_read(&mnt->mnt_longterm))) {
+		mnt_dec_count(mnt);
 		br_read_unlock(vfsmount_lock);
-	} else {
-		BUG_ON(!atomic_read(&mnt->mnt_longrefs));
-		if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
-			return;
+		return;
 	}
+	br_read_unlock(vfsmount_lock);
 
 	br_write_lock(vfsmount_lock);
-	if (!longrefs)
-		mnt_dec_count(mnt);
-	else
-		atomic_dec(&mnt->mnt_longrefs);
+	mnt_dec_count(mnt);
 	if (mnt_get_count(mnt)) {
 		br_write_unlock(vfsmount_lock);
 		return;
 	}
-	if (unlikely(mnt->mnt_pinned)) {
-		mnt_add_count(mnt, mnt->mnt_pinned + 1);
-		mnt->mnt_pinned = 0;
-		br_write_unlock(vfsmount_lock);
-		acct_auto_close_mnt(mnt);
-		goto put_again;
-	}
-	br_write_unlock(vfsmount_lock);
-	mntfree(mnt);
-}
 #else
-static inline void __mntput(struct vfsmount *mnt, int longrefs)
-{
-put_again:
 	mnt_dec_count(mnt);
 	if (likely(mnt_get_count(mnt)))
 		return;
 	br_write_lock(vfsmount_lock);
+#endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
@@ -789,12 +771,6 @@ put_again:
 	br_write_unlock(vfsmount_lock);
 	mntfree(mnt);
 }
-#endif
-
-static void mntput_no_expire(struct vfsmount *mnt)
-{
-	__mntput(mnt, 0);
-}
 
 void mntput(struct vfsmount *mnt)
 {
@@ -802,7 +778,7 @@ void mntput(struct vfsmount *mnt)
 		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
 		if (unlikely(mnt->mnt_expiry_mark))
 			mnt->mnt_expiry_mark = 0;
-		__mntput(mnt, 0);
+		mntput_no_expire(mnt);
 	}
 }
 EXPORT_SYMBOL(mntput);
@@ -815,33 +791,6 @@ struct vfsmount *mntget(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(mntget);
 
-void mntput_long(struct vfsmount *mnt)
-{
-#ifdef CONFIG_SMP
-	if (mnt) {
-		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
-		if (unlikely(mnt->mnt_expiry_mark))
-			mnt->mnt_expiry_mark = 0;
-		__mntput(mnt, 1);
-	}
-#else
-	mntput(mnt);
-#endif
-}
-EXPORT_SYMBOL(mntput_long);
-
-struct vfsmount *mntget_long(struct vfsmount *mnt)
-{
-#ifdef CONFIG_SMP
-	if (mnt)
-		atomic_inc(&mnt->mnt_longrefs);
-	return mnt;
-#else
-	return mntget(mnt);
-#endif
-}
-EXPORT_SYMBOL(mntget_long);
-
 void mnt_pin(struct vfsmount *mnt)
 {
 	br_write_lock(vfsmount_lock);
@@ -1216,7 +1165,7 @@ void release_mounts(struct list_head *head)
 			dput(dentry);
 			mntput(m);
 		}
-		mntput_long(mnt);
+		mntput(mnt);
 	}
 }
 
@@ -1240,6 +1189,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
+		atomic_dec(&p->mnt_longterm);
 		list_del_init(&p->mnt_child);
 		if (p->mnt_parent != p) {
 			p->mnt_parent->mnt_ghosts++;
@@ -1969,7 +1919,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
 
 unlock:
 	up_write(&namespace_sem);
-	mntput_long(newmnt);
+	mntput(newmnt);
 	return err;
 }
 
@@ -2291,6 +2241,20 @@ static struct mnt_namespace *alloc_mnt_ns(void)
 	return new_ns;
 }
 
+void mnt_make_longterm(struct vfsmount *mnt)
+{
+	atomic_inc(&mnt->mnt_longterm);
+}
+
+void mnt_make_shortterm(struct vfsmount *mnt)
+{
+	if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
+		return;
+	br_write_lock(vfsmount_lock);
+	atomic_dec(&mnt->mnt_longterm);
+	br_write_unlock(vfsmount_lock);
+}
+
 /*
  * Allocate a new namespace structure and populate it with contents
  * copied from the namespace of the passed in task structure.
@@ -2328,14 +2292,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	q = new_ns->root;
 	while (p) {
 		q->mnt_ns = new_ns;
+		atomic_inc(&q->mnt_longterm);
 		if (fs) {
 			if (p == fs->root.mnt) {
+				fs->root.mnt = mntget(q);
+				atomic_inc(&q->mnt_longterm);
+				mnt_make_shortterm(p);
 				rootmnt = p;
-				fs->root.mnt = mntget_long(q);
 			}
 			if (p == fs->pwd.mnt) {
+				fs->pwd.mnt = mntget(q);
+				atomic_inc(&q->mnt_longterm);
+				mnt_make_shortterm(p);
 				pwdmnt = p;
-				fs->pwd.mnt = mntget_long(q);
 			}
 		}
 		p = next_mnt(p, mnt_ns->root);
@@ -2344,9 +2313,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	up_write(&namespace_sem);
 
 	if (rootmnt)
-		mntput_long(rootmnt);
+		mntput(rootmnt);
 	if (pwdmnt)
-		mntput_long(pwdmnt);
+		mntput(pwdmnt);
 
 	return new_ns;
 }
@@ -2379,6 +2348,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
 	new_ns = alloc_mnt_ns();
 	if (!IS_ERR(new_ns)) {
 		mnt->mnt_ns = new_ns;
+		atomic_inc(&mnt->mnt_longterm);
 		new_ns->root = mnt;
 		list_add(&new_ns->list, &new_ns->root->mnt_list);
 	}
diff --git a/fs/pipe.c b/fs/pipe.c
index e2e95fb46a1e..89e9e19b1b2e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
 static void __exit exit_pipe_fs(void)
 {
 	unregister_filesystem(&pipe_fs_type);
-	mntput_long(pipe_mnt);
+	mntput(pipe_mnt);
 }
 
 fs_initcall(init_pipe_fs);
diff --git a/fs/super.c b/fs/super.c
index 4f6a3571a634..74e149efed81 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1141,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 	return mnt;
 
  err:
-	mntput_long(mnt);
+	mntput(mnt);
 	return ERR_PTR(err);
 }
 
-- 
cgit v1.2.2


From f8b18087fd3277e424a24e13ce0edf30abe97ce0 Mon Sep 17 00:00:00 2001
From: Stefan Schmidt <stefan@datenfreihafen.org>
Date: Wed, 12 Jan 2011 10:30:42 +0100
Subject: fs/btrfs: Fix build of ctree

Fix the build failure in some configurations:

     CC [M]  fs/btrfs/ctree.o
  In file included from fs/btrfs/ctree.c:21:0:
  fs/btrfs/ctree.h:1003:17: error: field 'super_kobj' has incomplete type
  fs/btrfs/ctree.h:1074:17: error: field 'root_kobj' has incomplete type
  make[2]: *** [fs/btrfs/ctree.o] Error 1
  make[1]: *** [fs/btrfs] Error 2
  make: *** [fs] Error 2

caused by commit 57cc7215b708 ("headers: kobject.h redux")

We need to include kobject.h here.

Reported-by: Jeff Garzik <jeff@garzik.org>
Fix-suggested-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/ctree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a142d204b526..b875d445ea81 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kobject.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
-- 
cgit v1.2.2


From e205117285d6035af135c9d6c34a30ee6b8d1f2e Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger <nab@linux-iscsi.org>
Date: Sun, 16 Jan 2011 21:11:25 +0000
Subject: configfs: change depends -> select SYSFS

This patch changes configfs to select SYSFS to fix the following:

warning: (TARGET_CORE && GFS2_FS) selects CONFIGFS_FS which has unmet direct dependencies (SYSFS)

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Nicholas A. Bellinger <nab@linux-iscsi.org>
Acked-by: Joel Becker <jlbec@evilplan.org>
---
 fs/configfs/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem"
-	depends on SYSFS
+	select SYSFS
 	help
-	  configfs is a ram-based filesystem that provides the converse
+	  configfs is a RAM-based filesystem that provides the converse
 	  of sysfs's functionality. Where sysfs is a filesystem-based
 	  view of kernel objects, configfs is a filesystem-based manager
 	  of kernel objects, or config_items.
-- 
cgit v1.2.2


From 86c747d2a4f028fe2fdf091c3a81d0e187827682 Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger <nab@linux-iscsi.org>
Date: Sun, 16 Jan 2011 21:14:52 +0000
Subject: dlm: Make DLM depend on CONFIGFS_FS

This patch fixes the following kconfig error after changing
CONFIGFS_FS -> select SYSFS:

fs/sysfs/Kconfig:1:error: recursive dependency detected!
fs/sysfs/Kconfig:1:	symbol SYSFS is selected by CONFIGFS_FS
fs/configfs/Kconfig:1:	symbol CONFIGFS_FS is selected by DLM
fs/dlm/Kconfig:1:	symbol DLM depends on SYSFS

Signed-off-by: Nicholas A. Bellinger <nab@linux-iscsi.org>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: James Bottomley <James.Bottomley@suse.de>
---
 fs/dlm/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
 menuconfig DLM
 	tristate "Distributed Lock Manager (DLM)"
 	depends on EXPERIMENTAL && INET
-	depends on SYSFS && (IPV6 || IPV6=n)
-	select CONFIGFS_FS
+	depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
 	select IP_SCTP
 	help
 	A general purpose distributed lock manager for kernel or userspace
-- 
cgit v1.2.2


From 7b1fff7e4fdf2805fce7afd6247912588d72d604 Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger <nab@linux-iscsi.org>
Date: Sun, 16 Jan 2011 21:16:07 +0000
Subject: ocfs2: Make OCFS2_FS depend on CONFIGFS_FS

This patch fixes the following kconfig error after changing
CONFIGFS_FS -> select SYSFS:

fs/sysfs/Kconfig:1:error: recursive dependency detected!
fs/sysfs/Kconfig:1:	symbol SYSFS is selected by CONFIGFS_FS
fs/configfs/Kconfig:1:	symbol CONFIGFS_FS is selected by OCFS2_FS
fs/ocfs2/Kconfig:1:	symbol OCFS2_FS depends on SYSFS

Signed-off-by: Nicholas A. Bellinger <nab@linux-iscsi.org>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: James Bottomley <James.Bottomley@suse.de>
---
 fs/ocfs2/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index ab152c00cd3a..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
 config OCFS2_FS
 	tristate "OCFS2 file system support"
-	depends on NET && SYSFS
-	select CONFIGFS_FS
+	depends on NET && SYSFS && CONFIGFS_FS
 	select JBD2
 	select CRC32
 	select QUOTA
-- 
cgit v1.2.2


From 7e3d0eb0b028ed9e9384e6afcae2f22993bbdf25 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 16:32:11 -0500
Subject: VFS: Fix UP compile error in fs/namespace.c

mnt_longterm is there only on SMP

Reported-and-tested-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 48809e21f270..9f544f35ed34 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 	list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
 }
 
+static inline void __mnt_make_longterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	atomic_inc(&mnt->mnt_longterm);
+#endif
+}
+
+/* needs vfsmount lock for write */
+static inline void __mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&mnt->mnt_longterm);
+#endif
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -626,7 +641,7 @@ static void commit_tree(struct vfsmount *mnt)
 	list_add_tail(&head, &mnt->mnt_list);
 	list_for_each_entry(m, &head, mnt_list) {
 		m->mnt_ns = n;
-		atomic_inc(&m->mnt_longterm);
+		__mnt_make_longterm(m);
 	}
 
 	list_splice(&head, n->list.prev);
@@ -1189,7 +1204,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
-		atomic_dec(&p->mnt_longterm);
+		__mnt_make_shortterm(p);
 		list_del_init(&p->mnt_child);
 		if (p->mnt_parent != p) {
 			p->mnt_parent->mnt_ghosts++;
@@ -2243,16 +2258,18 @@ static struct mnt_namespace *alloc_mnt_ns(void)
 
 void mnt_make_longterm(struct vfsmount *mnt)
 {
-	atomic_inc(&mnt->mnt_longterm);
+	__mnt_make_longterm(mnt);
 }
 
 void mnt_make_shortterm(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
 	if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
 		return;
 	br_write_lock(vfsmount_lock);
 	atomic_dec(&mnt->mnt_longterm);
 	br_write_unlock(vfsmount_lock);
+#endif
 }
 
 /*
@@ -2292,17 +2309,17 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	q = new_ns->root;
 	while (p) {
 		q->mnt_ns = new_ns;
-		atomic_inc(&q->mnt_longterm);
+		__mnt_make_longterm(q);
 		if (fs) {
 			if (p == fs->root.mnt) {
 				fs->root.mnt = mntget(q);
-				atomic_inc(&q->mnt_longterm);
+				__mnt_make_longterm(q);
 				mnt_make_shortterm(p);
 				rootmnt = p;
 			}
 			if (p == fs->pwd.mnt) {
 				fs->pwd.mnt = mntget(q);
-				atomic_inc(&q->mnt_longterm);
+				__mnt_make_longterm(q);
 				mnt_make_shortterm(p);
 				pwdmnt = p;
 			}
@@ -2348,7 +2365,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
 	new_ns = alloc_mnt_ns();
 	if (!IS_ERR(new_ns)) {
 		mnt->mnt_ns = new_ns;
-		atomic_inc(&mnt->mnt_longterm);
+		__mnt_make_longterm(mnt);
 		new_ns->root = mnt;
 		list_add(&new_ns->list, &new_ns->root->mnt_list);
 	}
-- 
cgit v1.2.2


From 19a167af7c97248ec646552ebc9140bc6aa3552a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Jan 2011 01:35:23 -0500
Subject: Take the completion of automount into new helper

... and shift it from namei.c to namespace.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h  |  1 +
 fs/namei.c     | 31 +++++--------------------------
 fs/namespace.c | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index 12ccb86edef7..e8a0b245177d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,7 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 extern void release_mounts(struct list_head *);
 extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern int finish_automount(struct vfsmount *, struct path *);
 extern int do_add_mount(struct vfsmount *, struct path *, int);
 extern void mnt_clear_expiry(struct vfsmount *);
 
diff --git a/fs/namei.c b/fs/namei.c
index 8f7b41a14882..b753192d8c3f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -923,37 +923,13 @@ static int follow_automount(struct path *path, unsigned flags,
 	if (!mnt) /* mount collision */
 		return 0;
 
-	/* The new mount record should have at least 2 refs to prevent it being
-	 * expired before we get a chance to add it
-	 */
-	BUG_ON(mnt_get_count(mnt) < 2);
-
-	if (mnt->mnt_sb == path->mnt->mnt_sb &&
-	    mnt->mnt_root == path->dentry) {
-		mnt_clear_expiry(mnt);
-		mntput(mnt);
-		mntput(mnt);
-		return -ELOOP;
-	}
+	err = finish_automount(mnt, path);
 
-	/* We need to add the mountpoint to the parent.  The filesystem may
-	 * have placed it on an expiry list, and so we need to make sure it
-	 * won't be expired under us if do_add_mount() fails (do_add_mount()
-	 * will eat a reference unconditionally).
-	 */
-	mntget(mnt);
-	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
 	switch (err) {
 	case -EBUSY:
 		/* Someone else made a mount here whilst we were busy */
-		err = 0;
-	default:
-		mnt_clear_expiry(mnt);
-		mntput(mnt);
-		mntput(mnt);
-		return err;
+		return 0;
 	case 0:
-		mntput(mnt);
 		dput(path->dentry);
 		if (*need_mntput)
 			mntput(path->mnt);
@@ -961,7 +937,10 @@ static int follow_automount(struct path *path, unsigned flags,
 		path->dentry = dget(mnt->mnt_root);
 		*need_mntput = true;
 		return 0;
+	default:
+		return err;
 	}
+
 }
 
 /*
diff --git a/fs/namespace.c b/fs/namespace.c
index 9f544f35ed34..bec51e4e0549 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1895,6 +1895,39 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	return do_add_mount(mnt, path, mnt_flags);
 }
 
+int finish_automount(struct vfsmount *m, struct path *path)
+{
+	int err;
+	/* The new mount record should have at least 2 refs to prevent it being
+	 * expired before we get a chance to add it
+	 */
+	BUG_ON(mnt_get_count(m) < 2);
+
+	if (m->mnt_sb == path->mnt->mnt_sb &&
+	    m->mnt_root == path->dentry) {
+		mnt_clear_expiry(m);
+		mntput(m);
+		mntput(m);
+		return -ELOOP;
+	}
+
+	/* We need to add the mountpoint to the parent.  The filesystem may
+	 * have placed it on an expiry list, and so we need to make sure it
+	 * won't be expired under us if do_add_mount() fails (do_add_mount()
+	 * will eat a reference unconditionally).
+	 */
+	mntget(m);
+	err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	if (err) {
+		mnt_clear_expiry(m);
+		mntput(m);
+		mntput(m);
+	} else {
+		mntput(m);
+	}
+	return err;
+}
+
 /*
  * add a mount into a namespace's mount tree
  * - this unconditionally eats one of the caller's references to newmnt.
-- 
cgit v1.2.2


From 15f9a3f3e199647fe0cac19302c5033cf031372d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Jan 2011 01:41:58 -0500
Subject: don't drop newmnt on error in do_add_mount()

That gets rid of the kludge in finish_automount() - we need
to keep refcount on the vfsmount as-is until we evict it from
expiry list.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index bec51e4e0549..31aefc8e5fa6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1880,6 +1880,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 			int mnt_flags, char *name, void *data)
 {
 	struct vfsmount *mnt;
+	int err;
 
 	if (!type)
 		return -EINVAL;
@@ -1892,7 +1893,10 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
-	return do_add_mount(mnt, path, mnt_flags);
+	err = do_add_mount(mnt, path, mnt_flags);
+	if (err)
+		mntput(mnt);
+	return err;
 }
 
 int finish_automount(struct vfsmount *m, struct path *path)
@@ -1911,26 +1915,17 @@ int finish_automount(struct vfsmount *m, struct path *path)
 		return -ELOOP;
 	}
 
-	/* We need to add the mountpoint to the parent.  The filesystem may
-	 * have placed it on an expiry list, and so we need to make sure it
-	 * won't be expired under us if do_add_mount() fails (do_add_mount()
-	 * will eat a reference unconditionally).
-	 */
-	mntget(m);
 	err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
 	if (err) {
 		mnt_clear_expiry(m);
 		mntput(m);
 		mntput(m);
-	} else {
-		mntput(m);
 	}
 	return err;
 }
 
 /*
  * add a mount into a namespace's mount tree
- * - this unconditionally eats one of the caller's references to newmnt.
  */
 int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
 {
@@ -1967,7 +1962,6 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
 
 unlock:
 	up_write(&namespace_sem);
-	mntput(newmnt);
 	return err;
 }
 
-- 
cgit v1.2.2


From b1e75df45a3d8a490b8648e44632debc5eea04b1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Jan 2011 01:47:59 -0500
Subject: tidy up around finish_automount()

do_add_mount() and mnt_clear_expiry() are not needed outside of
namespace.c anymore, now that namei has finish_automount() to
use.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h  |  2 --
 fs/namespace.c | 46 ++++++++++++++++++----------------------------
 2 files changed, 18 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index e8a0b245177d..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -71,8 +71,6 @@ extern void release_mounts(struct list_head *);
 extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 extern int finish_automount(struct vfsmount *, struct path *);
-extern int do_add_mount(struct vfsmount *, struct path *, int);
-extern void mnt_clear_expiry(struct vfsmount *);
 
 extern void mnt_make_longterm(struct vfsmount *);
 extern void mnt_make_shortterm(struct vfsmount *);
diff --git a/fs/namespace.c b/fs/namespace.c
index 31aefc8e5fa6..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1872,6 +1872,8 @@ out:
 	return err;
 }
 
+static int do_add_mount(struct vfsmount *, struct path *, int);
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
@@ -1909,25 +1911,31 @@ int finish_automount(struct vfsmount *m, struct path *path)
 
 	if (m->mnt_sb == path->mnt->mnt_sb &&
 	    m->mnt_root == path->dentry) {
-		mnt_clear_expiry(m);
-		mntput(m);
-		mntput(m);
-		return -ELOOP;
+		err = -ELOOP;
+		goto fail;
 	}
 
 	err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
-	if (err) {
-		mnt_clear_expiry(m);
-		mntput(m);
-		mntput(m);
+	if (!err)
+		return 0;
+fail:
+	/* remove m from any expiration list it may be on */
+	if (!list_empty(&m->mnt_expire)) {
+		down_write(&namespace_sem);
+		br_write_lock(vfsmount_lock);
+		list_del_init(&m->mnt_expire);
+		br_write_unlock(vfsmount_lock);
+		up_write(&namespace_sem);
 	}
+	mntput(m);
+	mntput(m);
 	return err;
 }
 
 /*
  * add a mount into a namespace's mount tree
  */
-int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
+static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
 {
 	int err;
 
@@ -1954,11 +1962,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
 		goto unlock;
 
 	newmnt->mnt_flags = mnt_flags;
-	if ((err = graft_tree(newmnt, path)))
-		goto unlock;
-
-	up_write(&namespace_sem);
-	return 0;
+	err = graft_tree(newmnt, path);
 
 unlock:
 	up_write(&namespace_sem);
@@ -1982,20 +1986,6 @@ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 }
 EXPORT_SYMBOL(mnt_set_expiry);
 
-/*
- * Remove a vfsmount from any expiration list it may be on
- */
-void mnt_clear_expiry(struct vfsmount *mnt)
-{
-	if (!list_empty(&mnt->mnt_expire)) {
-		down_write(&namespace_sem);
-		br_write_lock(vfsmount_lock);
-		list_del_init(&mnt->mnt_expire);
-		br_write_unlock(vfsmount_lock);
-		up_write(&namespace_sem);
-	}
-}
-
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * mountpoints that aren't in use and haven't been touched since last we came
-- 
cgit v1.2.2


From 64c23e86873ee410554d6d1c76b60da47025e96f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Jan 2011 13:07:30 +0100
Subject: make the feature checks in ->fallocate future proof

Instead of various home grown checks that might need updates for new
flags just check for any bit outside the mask of the features supported
by the filesystem.  This makes the check future proof for any newly
added flag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c            | 2 +-
 fs/ext4/extents.c           | 2 +-
 fs/gfs2/ops_inode.c         | 2 +-
 fs/ocfs2/file.c             | 2 ++
 fs/xfs/linux-2.6/xfs_iops.c | 3 +++
 5 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a3798a3aa0d2..64daf2acd0d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7116,7 +7116,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	alloc_end =  (offset + len + mask) & ~mask;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode && (mode != FALLOC_FL_KEEP_SIZE))
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
 		return -EOPNOTSUPP;
 
 	/*
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c4068f6abf03..4bdd160854eb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3645,7 +3645,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	unsigned int credits, blkbits = inode->i_blkbits;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode && (mode != FALLOC_FL_KEEP_SIZE))
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
 		return -EOPNOTSUPP;
 
 	/*
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 040b5a2e6556..c09528c07f3d 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1426,7 +1426,7 @@ static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
 	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode && (mode != FALLOC_FL_KEEP_SIZE))
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
 		return -EOPNOTSUPP;
 
 	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 63e3fca266e0..cf254ce8c941 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1997,6 +1997,8 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
 	int change_size = 1;
 	int cmd = OCFS2_IOC_RESVSP64;
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
 	if (!ocfs2_writes_unwritten_extents(osb))
 		return -EOPNOTSUPP;
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da54403633b6..a4ecc2188a09 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -518,6 +518,9 @@ xfs_vn_fallocate(
 	xfs_inode_t	*ip = XFS_I(inode);
 	int		cmd = XFS_IOC_RESVSP;
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	/* preallocation on directories not yet supported */
 	error = -ENODEV;
 	if (S_ISDIR(inode->i_mode))
-- 
cgit v1.2.2


From 2fe17c1075836b66678ed2a305fd09b6773883aa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Jan 2011 13:07:43 +0100
Subject: fallocate should be a file operation

Currently all filesystems except XFS implement fallocate asynchronously,
while XFS forced a commit.  Both of these are suboptimal - in case of O_SYNC
I/O we really want our allocation on disk, especially for the !KEEP_SIZE
case where we actually grow the file with user-visible zeroes.  On the
other hand always commiting the transaction is a bad idea for fast-path
uses of fallocate like for example in recent Samba versions.   Given
that block allocation is a data plane operation anyway change it from
an inode operation to a file operation so that we have the file structure
available that lets us check for O_SYNC.

This also includes moving the code around for a few of the filesystems,
and remove the already unnedded S_ISDIR checks given that we only wire
up fallocate for regular files.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/file.c             | 113 +++++++++++++++++++
 fs/btrfs/inode.c            | 111 -------------------
 fs/ext4/ext4.h              |   2 +-
 fs/ext4/extents.c           |   9 +-
 fs/ext4/file.c              |   2 +-
 fs/gfs2/file.c              | 258 ++++++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/ops_inode.c         | 258 --------------------------------------------
 fs/ocfs2/file.c             |   8 +-
 fs/open.c                   |   4 +-
 fs/xfs/linux-2.6/xfs_file.c |  56 ++++++++++
 fs/xfs/linux-2.6/xfs_iops.c |  60 -----------
 11 files changed, 437 insertions(+), 444 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..a9e0a4eaf3d9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
@@ -1237,6 +1238,117 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 	return 0;
 }
 
+static long btrfs_fallocate(struct file *file, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct extent_state *cached_state = NULL;
+	u64 cur_offset;
+	u64 last_byte;
+	u64 alloc_start;
+	u64 alloc_end;
+	u64 alloc_hint = 0;
+	u64 locked_end;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	struct extent_map *em;
+	int ret;
+
+	alloc_start = offset & ~mask;
+	alloc_end =  (offset + len + mask) & ~mask;
+
+	/* We only support the FALLOC_FL_KEEP_SIZE mode */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, alloc_end);
+	if (ret)
+		goto out;
+
+	if (alloc_start > inode->i_size) {
+		ret = btrfs_cont_expand(inode, alloc_start);
+		if (ret)
+			goto out;
+	}
+
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+	if (ret)
+		goto out;
+
+	locked_end = alloc_end - 1;
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    alloc_end - 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > alloc_start &&
+		    ordered->file_offset < alloc_end) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
+			btrfs_wait_ordered_range(inode, alloc_start,
+						 alloc_end - alloc_start);
+		} else {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+	}
+
+	cur_offset = alloc_start;
+	while (1) {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				      alloc_end - cur_offset, 0);
+		BUG_ON(IS_ERR(em) || !em);
+		last_byte = min(extent_map_end(em), alloc_end);
+		last_byte = (last_byte + mask) & ~mask;
+		if (em->block_start == EXTENT_MAP_HOLE ||
+		    (cur_offset >= inode->i_size &&
+		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+		}
+		free_extent_map(em);
+
+		cur_offset = last_byte;
+		if (cur_offset >= alloc_end) {
+			ret = 0;
+			break;
+		}
+	}
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
+
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -1248,6 +1360,7 @@ const struct file_operations btrfs_file_operations = {
 	.open		= generic_file_open,
 	.release	= btrfs_release_file,
 	.fsync		= btrfs_sync_file,
+	.fallocate	= btrfs_fallocate,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64daf2acd0d5..902afbf50811 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7098,116 +7098,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 					   min_size, actual_len, alloc_hint, trans);
 }
 
-static long btrfs_fallocate(struct inode *inode, int mode,
-			    loff_t offset, loff_t len)
-{
-	struct extent_state *cached_state = NULL;
-	u64 cur_offset;
-	u64 last_byte;
-	u64 alloc_start;
-	u64 alloc_end;
-	u64 alloc_hint = 0;
-	u64 locked_end;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-	struct extent_map *em;
-	int ret;
-
-	alloc_start = offset & ~mask;
-	alloc_end =  (offset + len + mask) & ~mask;
-
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
-		return -EOPNOTSUPP;
-
-	/*
-	 * wait for ordered IO before we have any locks.  We'll loop again
-	 * below with the locks held.
-	 */
-	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
-	mutex_lock(&inode->i_mutex);
-	ret = inode_newsize_ok(inode, alloc_end);
-	if (ret)
-		goto out;
-
-	if (alloc_start > inode->i_size) {
-		ret = btrfs_cont_expand(inode, alloc_start);
-		if (ret)
-			goto out;
-	}
-
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
-	if (ret)
-		goto out;
-
-	locked_end = alloc_end - 1;
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		/* the extent lock is ordered inside the running
-		 * transaction
-		 */
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
-				 locked_end, 0, &cached_state, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode,
-							    alloc_end - 1);
-		if (ordered &&
-		    ordered->file_offset + ordered->len > alloc_start &&
-		    ordered->file_offset < alloc_end) {
-			btrfs_put_ordered_extent(ordered);
-			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-					     alloc_start, locked_end,
-					     &cached_state, GFP_NOFS);
-			/*
-			 * we can't wait on the range with the transaction
-			 * running or with the extent lock held
-			 */
-			btrfs_wait_ordered_range(inode, alloc_start,
-						 alloc_end - alloc_start);
-		} else {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-	}
-
-	cur_offset = alloc_start;
-	while (1) {
-		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
-				      alloc_end - cur_offset, 0);
-		BUG_ON(IS_ERR(em) || !em);
-		last_byte = min(extent_map_end(em), alloc_end);
-		last_byte = (last_byte + mask) & ~mask;
-		if (em->block_start == EXTENT_MAP_HOLE ||
-		    (cur_offset >= inode->i_size &&
-		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-							last_byte - cur_offset,
-							1 << inode->i_blkbits,
-							offset + len,
-							&alloc_hint);
-			if (ret < 0) {
-				free_extent_map(em);
-				break;
-			}
-		}
-		free_extent_map(em);
-
-		cur_offset = last_byte;
-		if (cur_offset >= alloc_end) {
-			ret = 0;
-			break;
-		}
-	}
-	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-			     &cached_state, GFP_NOFS);
-
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
-out:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
 static int btrfs_set_page_dirty(struct page *page)
 {
 	return __set_page_dirty_nobuffers(page);
@@ -7310,7 +7200,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
 	.listxattr      = btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
-	.fallocate	= btrfs_fallocate,
 	.fiemap		= btrfs_fiemap,
 };
 static const struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1de65f572033..0c8d97b56f34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			  ssize_t len);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4bdd160854eb..63a75810b7c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
 }
 
 /*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
  * operation, which gets called from sys_fallocate system call.
  * For block-mapped files, posix_fallocate should fall back to the method
  * of writing zeroes to the required new blocks (the same behavior which is
  * expected for file systems which do not support fallocate() system call).
  */
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	handle_t *handle;
 	loff_t new_size;
 	unsigned int max_blocks;
@@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
-	/* preallocation to directories is currently not supported */
-	if (S_ISDIR(inode->i_mode))
-		return -ENODEV;
-
 	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bb003dc9ffff..2e8322c8aa88 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = {
 	.fsync		= ext4_sync_file,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.fallocate	= ext4_fallocate,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
@@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.check_acl	= ext4_check_acl,
-	.fallocate	= ext4_fallocate,
 	.fiemap		= ext4_fiemap,
 };
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fca6689e12e6..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <asm/uaccess.h>
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
+static void empty_write_end(struct page *page, unsigned from,
+			   unsigned to)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+	page_zero_new_buffers(page, from, to);
+	flush_dcache_page(page);
+	mark_page_accessed(page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	block_commit_write(page, from, to);
+}
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+	unsigned start, end, next;
+	struct buffer_head *bh, *head;
+	int error;
+
+	if (!page_has_buffers(page)) {
+		error = __block_write_begin(page, from, to - from, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+
+		empty_write_end(page, from, to);
+		return 0;
+	}
+
+	bh = head = page_buffers(page);
+	next = end = 0;
+	while (next < from) {
+		next += bh->b_size;
+		bh = bh->b_this_page;
+	}
+	start = next;
+	do {
+		next += bh->b_size;
+		if (buffer_mapped(bh)) {
+			if (end) {
+				error = __block_write_begin(page, start, end - start,
+							    gfs2_block_map);
+				if (unlikely(error))
+					return error;
+				empty_write_end(page, start, end);
+				end = 0;
+			}
+			start = next;
+		}
+		else
+			end = next;
+		bh = bh->b_this_page;
+	} while (next < to);
+
+	if (end) {
+		error = __block_write_begin(page, start, end - start, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+		empty_write_end(page, start, end);
+	}
+
+	return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	u64 start = offset >> PAGE_CACHE_SHIFT;
+	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t curr;
+	struct page *page;
+	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+	unsigned int from, to;
+
+	if (!end_offset)
+		end_offset = PAGE_CACHE_SIZE;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		goto out;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	curr = start;
+	offset = start << PAGE_CACHE_SHIFT;
+	from = start_offset;
+	to = PAGE_CACHE_SIZE;
+	while (curr <= end) {
+		page = grab_cache_page_write_begin(inode->i_mapping, curr,
+						   AOP_FLAG_NOFS);
+		if (unlikely(!page)) {
+			error = -ENOMEM;
+			goto out;
+		}
+
+		if (curr == end)
+			to = end_offset;
+		error = write_empty_blocks(page, from, to);
+		if (!error && offset + to > inode->i_size &&
+		    !(mode & FALLOC_FL_KEEP_SIZE)) {
+			i_size_write(inode, offset + to);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+		if (error)
+			goto out;
+		curr++;
+		offset += PAGE_CACHE_SIZE;
+		from = 0;
+	}
+
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(inode);
+
+	brelse(dibh);
+
+out:
+	return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		max_data -= tmp;
+	}
+	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+	   so it might end up with fewer data blocks */
+	if (max_data <= *data_blocks)
+		return;
+	*data_blocks = max_data;
+	*ind_blocks = max_blocks - max_data;
+	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (*len > max) {
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	}
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
+			   loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	loff_t bytes, max_bytes;
+	struct gfs2_alloc *al;
+	int error;
+	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+	/* We only support the FALLOC_FL_KEEP_SIZE mode */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+		 sdp->sd_sb.sb_bsize_shift;
+
+	len = next - offset;
+	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+	if (!bytes)
+		bytes = UINT_MAX;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	if (!gfs2_write_alloc_required(ip, offset, len))
+		goto out_unlock;
+
+	while (len > 0) {
+		if (len < bytes)
+			bytes = len;
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto out_alloc_put;
+
+retry:
+		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error) {
+			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+				bytes >>= 1;
+				goto retry;
+			}
+			goto out_qunlock;
+		}
+		max_bytes = bytes;
+		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		al->al_requested = data_blocks + ind_blocks;
+
+		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+			  RES_RG_HDR + gfs2_rg_blocks(al);
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks ? data_blocks : 1;
+
+		error = gfs2_trans_begin(sdp, rblocks,
+					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+		if (error)
+			goto out_trans_fail;
+
+		error = fallocate_chunk(inode, offset, max_bytes, mode);
+		gfs2_trans_end(sdp);
+
+		if (error)
+			goto out_trans_fail;
+
+		len -= max_bytes;
+		offset += max_bytes;
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	goto out_unlock;
+
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_qunlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 
 /**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 	.setlease	= gfs2_setlease,
+	.fallocate	= gfs2_fallocate,
 };
 
 const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 	.setlease	= generic_setlease,
+	.fallocate	= gfs2_fallocate,
 };
 
 const struct file_operations gfs2_dir_fops_nolock = {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c09528c07f3d..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/fiemap.h>
-#include <linux/swap.h>
-#include <linux/falloc.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -1257,261 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return ret;
 }
 
-static void empty_write_end(struct page *page, unsigned from,
-			   unsigned to)
-{
-	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-
-	page_zero_new_buffers(page, from, to);
-	flush_dcache_page(page);
-	mark_page_accessed(page);
-
-	if (!gfs2_is_writeback(ip))
-		gfs2_page_add_databufs(ip, page, from, to);
-
-	block_commit_write(page, from, to);
-}
-
-
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
-{
-	unsigned start, end, next;
-	struct buffer_head *bh, *head;
-	int error;
-
-	if (!page_has_buffers(page)) {
-		error = __block_write_begin(page, from, to - from, gfs2_block_map);
-		if (unlikely(error))
-			return error;
-
-		empty_write_end(page, from, to);
-		return 0;
-	}
-
-	bh = head = page_buffers(page);
-	next = end = 0;
-	while (next < from) {
-		next += bh->b_size;
-		bh = bh->b_this_page;
-	}
-	start = next;
-	do {
-		next += bh->b_size;
-		if (buffer_mapped(bh)) {
-			if (end) {
-				error = __block_write_begin(page, start, end - start,
-							    gfs2_block_map);
-				if (unlikely(error))
-					return error;
-				empty_write_end(page, start, end);
-				end = 0;
-			}
-			start = next;
-		}
-		else
-			end = next;
-		bh = bh->b_this_page;
-	} while (next < to);
-
-	if (end) {
-		error = __block_write_begin(page, start, end - start, gfs2_block_map);
-		if (unlikely(error))
-			return error;
-		empty_write_end(page, start, end);
-	}
-
-	return 0;
-}
-
-static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
-			   int mode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct buffer_head *dibh;
-	int error;
-	u64 start = offset >> PAGE_CACHE_SHIFT;
-	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
-	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
-	pgoff_t curr;
-	struct page *page;
-	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
-	unsigned int from, to;
-
-	if (!end_offset)
-		end_offset = PAGE_CACHE_SIZE;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (unlikely(error))
-		goto out;
-
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-
-	if (gfs2_is_stuffed(ip)) {
-		error = gfs2_unstuff_dinode(ip, NULL);
-		if (unlikely(error))
-			goto out;
-	}
-
-	curr = start;
-	offset = start << PAGE_CACHE_SHIFT;
-	from = start_offset;
-	to = PAGE_CACHE_SIZE;
-	while (curr <= end) {
-		page = grab_cache_page_write_begin(inode->i_mapping, curr,
-						   AOP_FLAG_NOFS);
-		if (unlikely(!page)) {
-			error = -ENOMEM;
-			goto out;
-		}
-
-		if (curr == end)
-			to = end_offset;
-		error = write_empty_blocks(page, from, to);
-		if (!error && offset + to > inode->i_size &&
-		    !(mode & FALLOC_FL_KEEP_SIZE)) {
-			i_size_write(inode, offset + to);
-		}
-		unlock_page(page);
-		page_cache_release(page);
-		if (error)
-			goto out;
-		curr++;
-		offset += PAGE_CACHE_SIZE;
-		from = 0;
-	}
-
-	gfs2_dinode_out(ip, dibh->b_data);
-	mark_inode_dirty(inode);
-
-	brelse(dibh);
-
-out:
-	return error;
-}
-
-static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
-			    unsigned int *data_blocks, unsigned int *ind_blocks)
-{
-	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
-	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
-
-	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
-		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
-		max_data -= tmp;
-	}
-	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
-	   so it might end up with fewer data blocks */
-	if (max_data <= *data_blocks)
-		return;
-	*data_blocks = max_data;
-	*ind_blocks = max_blocks - max_data;
-	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
-	if (*len > max) {
-		*len = max;
-		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
-	}
-}
-
-static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
-			   loff_t len)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_inode *ip = GFS2_I(inode);
-	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
-	loff_t bytes, max_bytes;
-	struct gfs2_alloc *al;
-	int error;
-	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
-	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
-
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
-		return -EOPNOTSUPP;
-
-	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
-		 sdp->sd_sb.sb_bsize_shift;
-
-	len = next - offset;
-	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
-	if (!bytes)
-		bytes = UINT_MAX;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-	error = gfs2_glock_nq(&ip->i_gh);
-	if (unlikely(error))
-		goto out_uninit;
-
-	if (!gfs2_write_alloc_required(ip, offset, len))
-		goto out_unlock;
-
-	while (len > 0) {
-		if (len < bytes)
-			bytes = len;
-		al = gfs2_alloc_get(ip);
-		if (!al) {
-			error = -ENOMEM;
-			goto out_unlock;
-		}
-
-		error = gfs2_quota_lock_check(ip);
-		if (error)
-			goto out_alloc_put;
-
-retry:
-		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
-
-		al->al_requested = data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip);
-		if (error) {
-			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
-				bytes >>= 1;
-				goto retry;
-			}
-			goto out_qunlock;
-		}
-		max_bytes = bytes;
-		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
-		al->al_requested = data_blocks + ind_blocks;
-
-		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
-			  RES_RG_HDR + gfs2_rg_blocks(al);
-		if (gfs2_is_jdata(ip))
-			rblocks += data_blocks ? data_blocks : 1;
-
-		error = gfs2_trans_begin(sdp, rblocks,
-					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
-		if (error)
-			goto out_trans_fail;
-
-		error = fallocate_chunk(inode, offset, max_bytes, mode);
-		gfs2_trans_end(sdp);
-
-		if (error)
-			goto out_trans_fail;
-
-		len -= max_bytes;
-		offset += max_bytes;
-		gfs2_inplace_release(ip);
-		gfs2_quota_unlock(ip);
-		gfs2_alloc_put(ip);
-	}
-	goto out_unlock;
-
-out_trans_fail:
-	gfs2_inplace_release(ip);
-out_qunlock:
-	gfs2_quota_unlock(ip);
-out_alloc_put:
-	gfs2_alloc_put(ip);
-out_unlock:
-	gfs2_glock_dq(&ip->i_gh);
-out_uninit:
-	gfs2_holder_uninit(&ip->i_gh);
-	return error;
-}
-
-
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len)
 {
@@ -1562,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
-	.fallocate = gfs2_fallocate,
 	.fiemap = gfs2_fiemap,
 };
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index cf254ce8c941..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1989,9 +1989,10 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 	return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
 }
 
-static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
 			    loff_t len)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_space_resv sr;
 	int change_size = 1;
@@ -2002,9 +2003,6 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
 	if (!ocfs2_writes_unwritten_extents(osb))
 		return -EOPNOTSUPP;
 
-	if (S_ISDIR(inode->i_mode))
-		return -ENODEV;
-
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		change_size = 0;
 
@@ -2612,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
 	.getxattr	= generic_getxattr,
 	.listxattr	= ocfs2_listxattr,
 	.removexattr	= generic_removexattr,
-	.fallocate	= ocfs2_fallocate,
 	.fiemap		= ocfs2_fiemap,
 };
 
@@ -2644,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
 	.flock		= ocfs2_flock,
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
+	.fallocate	= ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops = {
diff --git a/fs/open.c b/fs/open.c
index 5b6ef7e2859e..e52389e1f05b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -255,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
 		return -EFBIG;
 
-	if (!inode->i_op->fallocate)
+	if (!file->f_op->fallocate)
 		return -EOPNOTSUPP;
 
-	return inode->i_op->fallocate(inode, mode, offset, len);
+	return file->f_op->fallocate(file, mode, offset, len);
 }
 
 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ef51eb43e137..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,6 +37,7 @@
 #include "xfs_trace.h"
 
 #include <linux/dcache.h>
+#include <linux/falloc.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -882,6 +883,60 @@ out_unlock:
 	return ret;
 }
 
+STATIC long
+xfs_file_fallocate(
+	struct file	*file,
+	int		mode,
+	loff_t		offset,
+	loff_t		len)
+{
+	struct inode	*inode = file->f_path.dentry->d_inode;
+	long		error;
+	loff_t		new_size = 0;
+	xfs_flock64_t	bf;
+	xfs_inode_t	*ip = XFS_I(inode);
+	int		cmd = XFS_IOC_RESVSP;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	bf.l_whence = 0;
+	bf.l_start = offset;
+	bf.l_len = len;
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		cmd = XFS_IOC_UNRESVSP;
+
+	/* check the new inode size is valid before allocating */
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		error = inode_newsize_ok(inode, new_size);
+		if (error)
+			goto out_unlock;
+	}
+
+	error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
+	if (error)
+		goto out_unlock;
+
+	/* Change file size if needed */
+	if (new_size) {
+		struct iattr iattr;
+
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = new_size;
+		error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+	}
+
+out_unlock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
+
+
 STATIC int
 xfs_file_open(
 	struct inode	*inode,
@@ -1000,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
+	.fallocate	= xfs_file_fallocate,
 };
 
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index a4ecc2188a09..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
 #include <linux/namei.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/falloc.h>
 #include <linux/fiemap.h>
 #include <linux/slab.h>
 
@@ -505,64 +504,6 @@ xfs_vn_setattr(
 	return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 }
 
-STATIC long
-xfs_vn_fallocate(
-	struct inode	*inode,
-	int		mode,
-	loff_t		offset,
-	loff_t		len)
-{
-	long		error;
-	loff_t		new_size = 0;
-	xfs_flock64_t	bf;
-	xfs_inode_t	*ip = XFS_I(inode);
-	int		cmd = XFS_IOC_RESVSP;
-
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
-		return -EOPNOTSUPP;
-
-	/* preallocation on directories not yet supported */
-	error = -ENODEV;
-	if (S_ISDIR(inode->i_mode))
-		goto out_error;
-
-	bf.l_whence = 0;
-	bf.l_start = offset;
-	bf.l_len = len;
-
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-	if (mode & FALLOC_FL_PUNCH_HOLE)
-		cmd = XFS_IOC_UNRESVSP;
-
-	/* check the new inode size is valid before allocating */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-	    offset + len > i_size_read(inode)) {
-		new_size = offset + len;
-		error = inode_newsize_ok(inode, new_size);
-		if (error)
-			goto out_unlock;
-	}
-
-	error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
-	if (error)
-		goto out_unlock;
-
-	/* Change file size if needed */
-	if (new_size) {
-		struct iattr iattr;
-
-		iattr.ia_valid = ATTR_SIZE;
-		iattr.ia_size = new_size;
-		error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
-	}
-
-out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-out_error:
-	return error;
-}
-
 #define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
 /*
@@ -656,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
 	.getxattr		= generic_getxattr,
 	.removexattr		= generic_removexattr,
 	.listxattr		= xfs_vn_listxattr,
-	.fallocate		= xfs_vn_fallocate,
 	.fiemap			= xfs_vn_fiemap,
 };
 
-- 
cgit v1.2.2


From 3bc0ba4305fa99b32caac8c60df84a2f14fce228 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Dec 2010 19:38:09 -0500
Subject: fs: Remove unlikely() from fget_light()

There's an unlikely() in fget_light() that assumes the file ref count
will be 1. Running the annotate branch profiler on a desktop that is
performing daily tasks (running firefox, evolution, xchat and is also part
of a distcc farm), it shows that the ref count is not 1 that often.

 correct incorrect      %    Function                  File              Line
 ------- ---------      -    --------                  ----              ----
1035099358 6209599193  85    fget_light              file_table.c         315

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b4..c3e89adf53c0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 	struct files_struct *files = current->files;
 
 	*fput_needed = 0;
-	if (likely((atomic_read(&files->count) == 1))) {
+	if (atomic_read(&files->count) == 1) {
 		file = fcheck_files(files, fd);
 	} else {
 		rcu_read_lock();
-- 
cgit v1.2.2


From 16ebe911eba8afa88879213dc4388f2dd701561e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 2 Jan 2011 14:44:00 -0800
Subject: fs: FS_POSIX_ACL does not depend on BLOCK

- Fix a kconfig unmet dependency warning.
- Remove the comment that identifies which filesystems use POSIX ACL
  utility routines.
- Move the FS_POSIX_ACL symbol outside of the BLOCK symbol if/endif block
  because its functions do not depend on BLOCK and some of the filesystems
  that use it do not depend on BLOCK.

warning: (GENERIC_ACL && JFFS2_FS_POSIX_ACL && NFSD_V4 && NFS_ACL_SUPPORT && 9P_FS_POSIX_ACL) selects FS_POSIX_ACL which has unmet direct dependencies (BLOCK)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d4..9a7921ae4763 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
 
-config FS_POSIX_ACL
-# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
-#
-# NOTE: you can implement Posix ACLs without these helpers (XFS does).
-# 	Never use this symbol for ifdefs.
-#
-	bool
-	default n
-
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
@@ -47,6 +38,14 @@ source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
+# Posix ACL utility routines
+#
+# Note: Posix ACLs can be implemented without these helpers.  Never use
+# this symbol for ifdefs in core code.
+#
+config FS_POSIX_ACL
+	def_bool n
+
 config EXPORTFS
 	tristate
 
-- 
cgit v1.2.2


From 6a5640f10255a8941a3a57396dda20af7a5c9a9e Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 27 Dec 2010 01:41:52 +0900
Subject: compat: remove unnecessary assignment in
 compat_rw_copy_check_uvector()

*@ret_pointer is initialized to @fast_pointer thus the assignment is
redundant.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index eb1740ac8c0a..d717442c4133 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -597,10 +597,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	if (nr_segs > fast_segs) {
 		ret = -ENOMEM;
 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-		if (iov == NULL) {
-			*ret_pointer = fast_pointer;
+		if (iov == NULL)
 			goto out;
-		}
 	}
 	*ret_pointer = iov;
 
-- 
cgit v1.2.2


From 974d879e8070bbb132bd4e79ef314703853d0b82 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 27 Dec 2010 01:41:53 +0900
Subject: compat: update comment of compat statfs syscalls

The commit 7ed1ee6118ae ("Take statfs variants to fs/statfs.c")
separates out statfs syscalls from fs/open.c. Thus the comment
should be changed also.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Jiri Kosina <trivial@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index d717442c4133..c62b5e6a1c15 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 }
 
 /*
- * The following statfs calls are copies of code from fs/open.c and
+ * The following statfs calls are copies of code from fs/statfs.c and
  * should be checked against those from time to time
  */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
-- 
cgit v1.2.2


From e0bb6bda43e20aa1db5774c73a519cd52c463a55 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 27 Dec 2010 01:41:54 +0900
Subject: compat: copy missing fields in compat_statfs64 to user

f_flags and f_spare fields were not copied to userspace when
compat_sys_[f]statfs64 called.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index c62b5e6a1c15..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
 	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
 	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
-	    __put_user(kbuf->f_frsize, &ubuf->f_frsize))
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
 		return -EFAULT;
 	return 0;
 }
-- 
cgit v1.2.2


From 274052ef0bac011249925f6616d147b1491fc601 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 13 Dec 2010 17:09:52 +0000
Subject: hpfs_setattr error case avoids unlock_kernel

This fixed a case that 'sparse' spotted where hpfs_setattr has an error return
that didn't go through it's path that unlocks.

This is against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
version 6313e3c21743cc88bb5bd8aa72948ee1e83937b6.

Build tested only, I don't have an hpfs file system to test.

Dave

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hpfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	    attr->ia_size != i_size_read(inode)) {
 		error = vmtruncate(inode, attr->ia_size);
 		if (error)
-			return error;
+			goto out_unlock;
 	}
 
 	setattr_copy(inode, attr);
-- 
cgit v1.2.2


From 27eaa1c90c608aa907336c2743d5ecf35c469440 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 14 Dec 2010 00:06:25 +0900
Subject: aio: check return value of create_workqueue()

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 5e00f15c54aa..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
 
 	aio_wq = create_workqueue("aio");
 	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!abe_pool);
+	BUG_ON(!aio_wq || !abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
-- 
cgit v1.2.2


From ecf5632dd189ab4c366cef853d6e5fe7adfe52e5 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sun, 16 Jan 2011 23:28:17 +0900
Subject: fs: fix address space warnings in ioctl_fiemap()

The fi_extents_start field of struct fiemap_extent_info is a
user pointer but was not marked as __user. This makes sparse
emit following warnings:

  CHECK   fs/ioctl.c
fs/ioctl.c:114:26: warning: incorrect type in argument 1 (different address spaces)
fs/ioctl.c:114:26:    expected void [noderef] <asn:1>*dst
fs/ioctl.c:114:26:    got struct fiemap_extent *[assigned] dest
fs/ioctl.c:202:14: warning: incorrect type in argument 1 (different address spaces)
fs/ioctl.c:202:14:    expected void const volatile [noderef] <asn:1>*<noident>
fs/ioctl.c:202:14:    got struct fiemap_extent *[assigned] fi_extents_start
fs/ioctl.c:212:27: warning: incorrect type in argument 1 (different address spaces)
fs/ioctl.c:212:27:    expected void [noderef] <asn:1>*dst
fs/ioctl.c:212:27:    got char *<noident>

Also add 'ufiemap' variable to eliminate unnecessary casts.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ioctl.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index d6cc16476620..a59635e295fa 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
 			    u64 phys, u64 len, u32 flags)
 {
 	struct fiemap_extent extent;
-	struct fiemap_extent *dest = fieinfo->fi_extents_start;
+	struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
 
 	/* only count the extents */
 	if (fieinfo->fi_extents_max == 0) {
@@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
 static int ioctl_fiemap(struct file *filp, unsigned long arg)
 {
 	struct fiemap fiemap;
+	struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
 	struct fiemap_extent_info fieinfo = { 0, };
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
@@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	if (!inode->i_op->fiemap)
 		return -EOPNOTSUPP;
 
-	if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
-			   sizeof(struct fiemap)))
+	if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
 		return -EFAULT;
 
 	if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 
 	fieinfo.fi_flags = fiemap.fm_flags;
 	fieinfo.fi_extents_max = fiemap.fm_extent_count;
-	fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+	fieinfo.fi_extents_start = ufiemap->fm_extents;
 
 	if (fiemap.fm_extent_count != 0 &&
 	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
 	fiemap.fm_flags = fieinfo.fi_flags;
 	fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
-	if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+	if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
 		error = -EFAULT;
 
 	return error;
-- 
cgit v1.2.2


From 38a708d7759476318d0eec64af174513032ec67a Mon Sep 17 00:00:00 2001
From: Edward Shishkin <edward.shishkin@gmail.com>
Date: Sat, 30 Oct 2010 00:11:50 +0200
Subject: ecryptfs: fix truncation error in ecryptfs_read_update_atime

This is similar to the bug found in direct-io not so long ago.

Fix up truncation (ssize_t->int).  This only matters with >2G
reads/writes, which the kernel doesn't permit.

Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Eric Sandeen <esandeen@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bff..679817e82484 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
 				const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
-	int rc;
+	ssize_t rc;
 	struct dentry *lower_dentry;
 	struct vfsmount *lower_vfsmount;
 	struct file *file = iocb->ki_filp;
-- 
cgit v1.2.2


From 2a8652f4e0d11ee27b1d2870c600fd1300661a6e Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 3 Nov 2010 11:11:15 +0100
Subject: ecryptfs: moved ECRYPTFS_SUPER_MAGIC definition to linux/magic.h

The definition of ECRYPTFS_SUPER_MAGIC has been moved to the include
file 'linux/magic.h' to become available to other kernel subsystems.

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/ecryptfs_kernel.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0bb..bc530a81e4ce 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
 		(((struct user_key_payload*)key->payload.data)->data);
 }
 
-#define ECRYPTFS_SUPER_MAGIC 0xf15f
 #define ECRYPTFS_MAX_KEYSET_SIZE 1024
 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64
-- 
cgit v1.2.2


From 070baa51286e5cf59dde6be52fa23647ffb5d32d Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 3 Nov 2010 11:11:22 +0100
Subject: ecryptfs: missing initialization of the superblock 'magic' field

This patch initializes the 'magic' field of ecryptfs filesystems to
ECRYPTFS_SUPER_MAGIC.

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
[tyhicks: merge with 66cb76666d69]
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/main.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d3b28abdd6aa..19f04504f625 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -564,6 +565,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	s->s_blocksize = path.dentry->d_sb->s_blocksize;
+	s->s_magic = ECRYPTFS_SUPER_MAGIC;
 
 	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
 	rc = PTR_ERR(inode);
-- 
cgit v1.2.2


From 27992890b02d340198a3a22fc210d13684a41564 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 3 Nov 2010 11:11:28 +0100
Subject: ecryptfs: test lower_file pointer when lower_file_mutex is locked

This patch prevents the lower_file pointer in the 'ecryptfs_inode_info'
structure to be checked when the mutex 'lower_file_mutex' is not locked.

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/file.c  | 16 +++++++---------
 fs/ecryptfs/inode.c | 32 ++++++++++++++------------------
 2 files changed, 21 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 679817e82484..99259f850e58 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,15 +191,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 				      | ECRYPTFS_ENCRYPTED);
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	if (!ecryptfs_inode_to_private(inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out_free;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the persistent file for the dentry with name "
+			"[%s]; rc = [%d]\n", __func__,
+			ecryptfs_dentry->d_name.name, rc);
+		goto out_free;
 	}
 	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
 	    && !(file->f_flags & O_RDONLY)) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 64ff02330752..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
 				"context; rc = [%d]\n", rc);
 		goto out;
 	}
-	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the persistent file for the dentry with name "
+			"[%s]; rc = [%d]\n", __func__,
+			ecryptfs_dentry->d_name.name, rc);
+		goto out;
 	}
 	rc = ecryptfs_write_metadata(ecryptfs_dentry);
 	if (rc) {
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		rc = -ENOMEM;
 		goto out;
 	}
-	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out_free_kmem;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the persistent file for the dentry with name "
+			"[%s]; rc = [%d]\n", __func__,
+			ecryptfs_dentry->d_name.name, rc);
+		goto out_free_kmem;
 	}
 	crypt_stat = &ecryptfs_inode_to_private(
 					ecryptfs_dentry->d_inode)->crypt_stat;
-- 
cgit v1.2.2


From 0abe1169470571c473ee720c35fe5b3481c46c46 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 3 Nov 2010 11:11:34 +0100
Subject: ecryptfs: fixed testing of file descriptor flags

This patch replaces the check (lower_file->f_flags & O_RDONLY) with
((lower_file & O_ACCMODE) == O_RDONLY).

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 99259f850e58..e069e786da43 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -199,8 +199,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 			ecryptfs_dentry->d_name.name, rc);
 		goto out_free;
 	}
-	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
-	    && !(file->f_flags & O_RDONLY)) {
+	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+	    == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
 		rc = -EPERM;
 		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
 		       "file must hence be opened RO\n", __func__);
-- 
cgit v1.2.2


From 888d57bbc91ebd031451d4ab1c669baee826a06c Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 10 Nov 2010 15:46:16 -0800
Subject: fs/ecryptfs: Add printf format/argument verification and fix fallout

Add __attribute__((format... to __ecryptfs_printk
Make formats and arguments match.
Add casts to (unsigned long long) for %llu.

Signed-off-by: Joe Perches <joe@perches.com>
[tyhicks: 80 columns cleanup and fixed typo]
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c          | 26 ++++++++++++--------------
 fs/ecryptfs/ecryptfs_kernel.h |  1 +
 fs/ecryptfs/file.c            |  6 +++---
 fs/ecryptfs/keystore.c        |  6 +++---
 fs/ecryptfs/main.c            |  7 ++++---
 fs/ecryptfs/mmap.c            | 13 +++++++------
 6 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..57bdd7a13207 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+			"extent [0x%.16llx]; rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
+			"rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"encryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+			"extent [0x%.16llx]; rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
+			"rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"decryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(page)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bc530a81e4ce..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -583,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
 
 #define ecryptfs_printk(type, fmt, arg...) \
         __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__attribute__ ((format(printf, 1, 2)))
 void __ecryptfs_printk(const char *fmt, ...);
 
 extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e069e786da43..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -241,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
-			"size: [0x%.16x]\n", inode, inode->i_ino,
-			i_size_read(inode));
+	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
+			"[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
+			(unsigned long long)i_size_read(inode));
 	goto out;
 out_free:
 	kmem_cache_free(ecryptfs_file_info_cache,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a5223..25fd7f595c99 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
 		break;
 	default:
 		ecryptfs_printk(KERN_WARNING, "Unknown error code: "
-				"[0x%.16x]\n", err_code);
+				"[0x%.16lx]\n", err_code);
 		rc = -EINVAL;
 	}
 	return rc;
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
 				"session key for authentication token with sig "
 				"[%.*s]; rc = [%d]. Removing auth tok "
 				"candidate from the list and searching for "
-				"the next match.\n", candidate_auth_tok_sig,
-				ECRYPTFS_SIG_SIZE_HEX, rc);
+				"the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
+				candidate_auth_tok_sig,	rc);
 		list_for_each_entry_safe(auth_tok_list_item,
 					 auth_tok_list_item_tmp,
 					 &auth_tok_list, list) {
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 19f04504f625..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -810,9 +810,10 @@ static int __init ecryptfs_init(void)
 		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
 				"larger than the host's page size, and so "
 				"eCryptfs cannot run on this system. The "
-				"default eCryptfs extent size is [%d] bytes; "
-				"the page size is [%d] bytes.\n",
-				ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+				"default eCryptfs extent size is [%u] bytes; "
+				"the page size is [%lu] bytes.\n",
+				ECRYPTFS_DEFAULT_EXTENT_SIZE,
+				(unsigned long)PAGE_CACHE_SIZE);
 		goto out;
 	}
 	rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..4b9011392736 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting "
-				"page (upper index [0x%.16x])\n", page->index);
+				"page (upper index [0x%.16lx])\n", page->index);
 		ClearPageUptodate(page);
 		goto out;
 	}
@@ -237,7 +237,7 @@ out:
 		ClearPageUptodate(page);
 	else
 		SetPageUptodate(page);
-	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
 			page->index);
 	unlock_page(page);
 	return rc;
@@ -488,7 +488,7 @@ static int ecryptfs_write_end(struct file *file,
 	} else
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
-			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+			"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
 	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
 						       to);
@@ -503,19 +503,20 @@ static int ecryptfs_write_end(struct file *file,
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
-			"zeros in page with index = [0x%.16x]\n", index);
+			"zeros in page with index = [0x%.16lx]\n", index);
 		goto out;
 	}
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
-				"index [0x%.16x])\n", index);
+				"index [0x%.16lx])\n", index);
 		goto out;
 	}
 	if (pos + copied > i_size_read(ecryptfs_inode)) {
 		i_size_write(ecryptfs_inode, pos + copied);
 		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
-				"[0x%.16x]\n", i_size_read(ecryptfs_inode));
+			"[0x%.16llx]\n",
+			(unsigned long long)i_size_read(ecryptfs_inode));
 	}
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	if (rc)
-- 
cgit v1.2.2


From f24b38874e1e37bb70291bbc4c5c3c13f5f9dac8 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 15 Nov 2010 17:36:38 -0600
Subject: ecryptfs: Fix ecryptfs_printk() size_t warnings

Commit cb55d21f6fa19d8c6c2680d90317ce88c1f57269 revealed a number of
missing 'z' length modifiers in calls to ecryptfs_printk() when
printing variables of type size_t. This patch fixes those compiler
warnings.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c   |  4 ++--
 fs/ecryptfs/keystore.c | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 57bdd7a13207..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 	BUG_ON(!crypt_stat || !crypt_stat->tfm
 	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -778,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 	}
 	ecryptfs_printk(KERN_DEBUG,
 			"Initializing cipher [%s]; strlen = [%d]; "
-			"key_size_bits = [%d]\n",
+			"key_size_bits = [%zd]\n",
 			crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
 			crypt_stat->key_size << 3);
 	if (crypt_stat->tfm) {
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 25fd7f595c99..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
 	} else {
 		rc = -EINVAL;
 		ecryptfs_printk(KERN_WARNING,
-				"Unsupported packet size: [%d]\n", size);
+				"Unsupported packet size: [%zd]\n", size);
 	}
 	return rc;
 }
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	       auth_tok->session_key.decrypted_key_size);
 	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 			if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
 				ecryptfs_printk(KERN_ERR, "Expected "
 						"signature of size [%d]; "
-						"read size [%d]\n",
+						"read size [%zd]\n",
 						ECRYPTFS_SIG_SIZE,
 						tag_11_contents_size);
 				rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 			goto out_wipe_list;
 			break;
 		default:
-			ecryptfs_printk(KERN_DEBUG, "No packet at offset "
-					"[%d] of the file header; hex value of "
+			ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
+					"of the file header; hex value of "
 					"character is [0x%.2x]\n", i, src[i]);
 			next_packet_is_auth_tok_packet = 0;
 		}
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (encrypted_session_key_valid) {
 		ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
 				"using auth_tok->session_key.encrypted_key, "
-				"where key_rec->enc_key_size = [%d]\n",
+				"where key_rec->enc_key_size = [%zd]\n",
 				key_rec->enc_key_size);
 		memcpy(key_rec->enc_key,
 		       auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
-				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
+				"got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
 				rc, key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
-				"key_rec->enc_key_size = [%d]\n", rc,
+				"key_rec->enc_key_size = [%zd]\n", rc,
 				key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = 0;
-	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
 			crypt_stat->key_size);
 	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	}
 	ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
 	if (ecryptfs_verbosity > 0) {
-		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
 				key_rec->enc_key_size);
 		ecryptfs_dump_hex(key_rec->enc_key,
 				  key_rec->enc_key_size);
-- 
cgit v1.2.2


From 24562486be76cf223b8d911f45e1d26eb3364b13 Mon Sep 17 00:00:00 2001
From: Frank Swiderski <fes@chromium.org>
Date: Mon, 15 Nov 2010 10:43:22 -0800
Subject: ecryptfs: remove unnecessary decrypt when extending a file

Removes an unecessary page decrypt from ecryptfs_begin_write when the
page is beyond the current file size. Previously, the call to
ecryptfs_decrypt_page would result in a read of 0 bytes, but still
attempt to decrypt an entire page. This patch detects that case and
merely zeros the page before marking it up-to-date.

Signed-off-by: Frank Swiderski <fes@chromium.org>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/mmap.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 4b9011392736..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
 		return -ENOMEM;
 	*pagep = page;
 
+	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
 			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
 				SetPageUptodate(page);
 			}
 		} else {
-			rc = ecryptfs_decrypt_page(page);
-			if (rc) {
-				printk(KERN_ERR "%s: Error decrypting page "
-				       "at index [%ld]; rc = [%d]\n",
-				       __func__, page->index, rc);
-				ClearPageUptodate(page);
-				goto out;
+			if (prev_page_end_size
+			    >= i_size_read(page->mapping->host)) {
+				zero_user(page, 0, PAGE_CACHE_SIZE);
+			} else {
+				rc = ecryptfs_decrypt_page(page);
+				if (rc) {
+					printk(KERN_ERR "%s: Error decrypting "
+					       "page at index [%ld]; "
+					       "rc = [%d]\n",
+					       __func__, page->index, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
 			}
 			SetPageUptodate(page);
 		}
 	}
-	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	/* If creating a page or more of holes, zero them out via truncate.
 	 * Note, this will increase i_size. */
 	if (index != 0) {
-- 
cgit v1.2.2


From acce952b0263825da32cf10489413dec78053347 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Thu, 6 Jan 2011 19:30:25 +0800
Subject: Btrfs: forced readonly mounts on errors

This patch comes from "Forced readonly mounts on errors" ideas.

As we know, this is the first step in being more fault tolerant of disk
corruptions instead of just using BUG() statements.

The major content:
- add a framework for generating errors that should result in filesystems
  going readonly.
- keep FS state in disk super block.
- make sure that all of resource will be freed and released at umount time.
- make sure that fter FS is forced readonly on error, there will be no more
  disk change before FS is corrected. For this, we should stop write operation.

After this patch is applied, the conversion from BUG() to such a framework can
happen incrementally.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  24 +++
 fs/btrfs/disk-io.c     | 391 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/disk-io.h     |   1 +
 fs/btrfs/extent-tree.c |  11 ++
 fs/btrfs/file.c        |  11 ++
 fs/btrfs/super.c       |  84 +++++++++++
 fs/btrfs/transaction.c |   3 +
 7 files changed, 523 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0995f4f68d7a..72195378bef9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -295,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 
@@ -1058,6 +1066,9 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;
 
 	void *bdev_holder;
+
+	/* filesystem state */
+	u64 fs_state;
 };
 
 /*
@@ -2203,6 +2214,11 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2556,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		     unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)				\
+do {								\
+	if ((errno))						\
+		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9b1dd4138072..1a3af9e8e0c4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -1738,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_iput;
 
+	/* check FS state, whether FS is broken. */
+	fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
 		err = ret;
@@ -1968,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
-	if (btrfs_super_log_root(disk_super) != 0) {
+	/* do not make disk changes in broken FS */
+	if (btrfs_super_log_root(disk_super) != 0 &&
+	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
@@ -2464,8 +2485,28 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();
 
 	btrfs_put_block_group_cache(fs_info);
+
+	/*
+	 * Here come 2 situations when btrfs is broken to flip readonly:
+	 *
+	 * 1. when btrfs flips readonly somewhere else before
+	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+	 * and btrfs will skip to write sb directly to keep
+	 * ERROR state on disk.
+	 *
+	 * 2. when btrfs flips readonly just in btrfs_commit_super,
+	 * and in such case, btrfs cannnot write sb via btrfs_commit_super,
+	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+	 * btrfs will cleanup all FS resources first and write sb then.
+	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-		ret =  btrfs_commit_super(root);
+		ret = btrfs_commit_super(root);
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+	}
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		ret = btrfs_error_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
@@ -2641,6 +2682,352 @@ out:
 	return 0;
 }
 
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+			      int read_only)
+{
+	if (read_only)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		printk(KERN_WARNING "warning: mount fs with errors, "
+		       "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+	int ret;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+
+	ret = write_ctree_super(NULL, root, 0);
+
+	return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while (!list_empty(&splice)) {
+		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+				     root_extent_list);
+
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+
+		/* the inode may be getting freed (in sys_unlink path). */
+		inode = igrab(ordered->inode);
+
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+		if (inode)
+			iput(inode);
+
+		atomic_set(&ordered->refs, 1);
+		btrfs_put_ordered_extent(ordered);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	int ret = 0;
+
+	delayed_refs = &trans->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+	}
+
+	node = rb_first(&delayed_refs->root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+
+		atomic_set(&ref->refs, 1);
+		if (btrfs_delayed_ref_is_head(ref)) {
+			struct btrfs_delayed_ref_head *head;
+
+			head = btrfs_delayed_node_to_head(ref);
+			mutex_lock(&head->mutex);
+			kfree(head->extent_op);
+			delayed_refs->num_heads--;
+			if (list_empty(&head->cluster))
+				delayed_refs->num_heads_ready--;
+			list_del_init(&head->cluster);
+			mutex_unlock(&head->mutex);
+		}
+
+		spin_unlock(&delayed_refs->lock);
+		btrfs_put_delayed_ref(ref);
+
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+
+	spin_unlock(&delayed_refs->lock);
+
+	return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+	struct btrfs_pending_snapshot *snapshot;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&t->pending_snapshots, &splice);
+
+	while (!list_empty(&splice)) {
+		snapshot = list_entry(splice.next,
+				      struct btrfs_pending_snapshot,
+				      list);
+
+		list_del_init(&snapshot->list);
+
+		kfree(snapshot);
+	}
+
+	return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+				    delalloc_inodes);
+
+		list_del_init(&btrfs_inode->delalloc_inodes);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark)
+{
+	int ret;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+	u64 start = 0;
+	u64 end;
+	u64 offset;
+	unsigned long index;
+
+	while (1) {
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark);
+		if (ret)
+			break;
+
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+		while (start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+			page = find_get_page(btree_inode->i_mapping, index);
+			if (!page)
+				continue;
+			offset = page_offset(page);
+
+			spin_lock(&dirty_pages->buffer_lock);
+			eb = radix_tree_lookup(
+			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+					       offset >> PAGE_CACHE_SHIFT);
+			spin_unlock(&dirty_pages->buffer_lock);
+			if (eb) {
+				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+							 &eb->bflags);
+				atomic_set(&eb->refs, 1);
+			}
+			if (PageWriteback(page))
+				end_page_writeback(page);
+
+			lock_page(page);
+			if (PageDirty(page)) {
+				clear_page_dirty_for_io(page);
+				spin_lock_irq(&page->mapping->tree_lock);
+				radix_tree_tag_clear(&page->mapping->page_tree,
+							page_index(page),
+							PAGECACHE_TAG_DIRTY);
+				spin_unlock_irq(&page->mapping->tree_lock);
+			}
+
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents)
+{
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+
+	unpin = pinned_extents;
+	while (1) {
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		/* opt_discard */
+		ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		btrfs_error_unpin_extent_range(root, start, end);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t;
+	LIST_HEAD(list);
+
+	WARN_ON(1);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+	list_splice_init(&root->fs_info->trans_list, &list);
+	while (!list_empty(&list)) {
+		t = list_entry(list.next, struct btrfs_transaction, list);
+		if (!t)
+			break;
+
+		btrfs_destroy_ordered_operations(root);
+
+		btrfs_destroy_ordered_extents(root);
+
+		btrfs_destroy_delayed_refs(t, root);
+
+		btrfs_block_rsv_release(root,
+					&root->fs_info->trans_block_rsv,
+					t->dirty_pages.dirty_bytes);
+
+		/* FIXME: cleanup wait for commit */
+		t->in_commit = 1;
+		t->blocked = 1;
+		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+			wake_up(&root->fs_info->transaction_blocked_wait);
+
+		t->blocked = 0;
+		if (waitqueue_active(&root->fs_info->transaction_wait))
+			wake_up(&root->fs_info->transaction_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		t->commit_done = 1;
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+
+		btrfs_destroy_pending_snapshots(t);
+
+		btrfs_destroy_delalloc_inodes(root);
+
+		spin_lock(&root->fs_info->new_trans_lock);
+		root->fs_info->running_transaction = NULL;
+		spin_unlock(&root->fs_info->new_trans_lock);
+
+		btrfs_destroy_marked_extents(root, &t->dirty_pages,
+					     EXTENT_DIRTY);
+
+		btrfs_destroy_pinned_extent(root,
+					    root->fs_info->pinned_extents);
+
+		t->use_count = 0;
+		list_del_init(&t->list);
+		memset(t, 0, sizeof(*t));
+		kmem_cache_free(btrfs_transaction_cachep, t);
+	}
+
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 055b837eab19..bcf303204f7f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8642,3 +8642,14 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+	return unpin_extent_range(root, start, end);
+}
+
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes)
+{
+	return btrfs_discard_extent(root, bytenr, num_bytes);
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 05df688c96f4..f903433f5bdf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -892,6 +892,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	if (err)
 		goto out;
 
+	/*
+	 * If BTRFS flips readonly due to some impossible error
+	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+	 * although we have opened a file as writable, we have
+	 * to stop this write operation to ensure FS consistency.
+	 */
+	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		err = -EROFS;
+		goto out;
+	}
+
 	file_update_time(file);
 	BTRFS_I(inode)->sequence++;
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2963376e77f4..52e903b0a293 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
 
 static const struct super_operations btrfs_super_ops;
 
+static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+				      char nbuf[16])
+{
+	char *errstr = NULL;
+
+	switch (errno) {
+	case -EIO:
+		errstr = "IO failure";
+		break;
+	case -ENOMEM:
+		errstr = "Out of memory";
+		break;
+	case -EROFS:
+		errstr = "Readonly filesystem";
+		break;
+	default:
+		if (nbuf) {
+			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+				errstr = nbuf;
+		}
+		break;
+	}
+
+	return errstr;
+}
+
+static void __save_error_info(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * today we only save the error info into ram.  Long term we'll
+	 * also send it down to the disk
+	 */
+	fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+}
+
+/* NOTE:
+ *	We move write_super stuff at umount in order to avoid deadlock
+ *	for umount hold all lock.
+ */
+static void save_error_info(struct btrfs_fs_info *fs_info)
+{
+	__save_error_info(fs_info);
+}
+
+/* btrfs handle error by forcing the filesystem readonly */
+static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+{
+	struct super_block *sb = fs_info->sb;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		sb->s_flags |= MS_RDONLY;
+		printk(KERN_INFO "btrfs is forced readonly\n");
+	}
+}
+
+/*
+ * __btrfs_std_error decodes expected errors from the caller and
+ * invokes the approciate error response.
+ */
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		     unsigned int line, int errno)
+{
+	struct super_block *sb = fs_info->sb;
+	char nbuf[16];
+	const char *errstr;
+
+	/*
+	 * Special case: if the error is EROFS, and we're already
+	 * under MS_RDONLY, then it is safe here.
+	 */
+	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+		return;
+
+	errstr = btrfs_decode_error(fs_info, errno, nbuf);
+	printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+		sb->s_id, function, line, errstr);
+	save_error_info(fs_info);
+
+	btrfs_handle_error(fs_info);
+}
+
 static void btrfs_put_super(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29e30d832ec9..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	int ret;
+
+	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		return ERR_PTR(-EROFS);
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h)
-- 
cgit v1.2.2


From cf78859f520f8275318f47d7864f4459d940cb6b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 17 Jan 2011 21:21:14 +0100
Subject: xfs: Do not name variables "panic"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On platforms that call panic() inside their BUG() macro (m68k/sun3, and
all platforms that don't set HAVE_ARCH_BUG), compilation fails with:

| fs/xfs/support/debug.c: In function ‘xfs_cmn_err’:
| fs/xfs/support/debug.c:92: error: called object ‘panic’ is not a function

as the local variable "panic" conflicts with the "panic()" function.
Rename the local variable to resolve this.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xfs/support/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index e6cf955ec0fc..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -75,11 +75,11 @@ xfs_cmn_err(
 {
 	struct va_format	vaf;
 	va_list			args;
-	int			panic = 0;
+	int			do_panic = 0;
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
 		printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
-		panic = 1;
+		do_panic = 1;
 	}
 
 	va_start(args, fmt);
@@ -89,7 +89,7 @@ xfs_cmn_err(
 	printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
 	va_end(args);
 
-	BUG_ON(panic);
+	BUG_ON(do_panic);
 }
 
 void
-- 
cgit v1.2.2


From c14cc63a63e94d490ac6517a555113c30d420db4 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 18 Jan 2011 12:06:04 +0800
Subject: autofs4 - fix get_next_positive_dentry()

The initialization condition in fs/autofs4/expire.c:get_next_positive_dentry()
appears to be incorrect. If prev == NULL I believe that root should be
returned.

Further down, at the current dentry check for it being simple_positive()
it looks like the d_lock for dentry p should be dropped instead of dentry
ret, otherwise when p is assinged to ret we end up with no lock on p and
a lost lock on ret, which leads to a deadlock.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/expire.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3ed79d76c233..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -96,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
 	struct dentry *p, *ret;
 
 	if (prev == NULL)
-		return dget(prev);
+		return dget(root);
 
 	spin_lock(&autofs4_lock);
 relock:
@@ -133,7 +133,7 @@ again:
 	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
 	/* Negative dentry - try next */
 	if (!simple_positive(ret)) {
-		spin_unlock(&ret->d_lock);
+		spin_unlock(&p->d_lock);
 		p = ret;
 		goto again;
 	}
-- 
cgit v1.2.2


From 8931221411f9ff950de8fd686dc5ab881394cb9a Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 18 Jan 2011 12:06:10 +0800
Subject: vfs - fix dentry ref count in do_lookup()

There is a ref count problem in fs/namei.c:do_lookup().

When walking in ref-walk mode, if follow_managed() returns a fail we
need to drop dentry and possibly vfsmount.  Clean up properly,
as we do in the other caller of follow_managed().

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b753192d8c3f..7d77f24d32a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1272,8 +1272,10 @@ done:
 	path->mnt = mnt;
 	path->dentry = dentry;
 	err = follow_managed(path, nd->flags);
-	if (unlikely(err < 0))
+	if (unlikely(err < 0)) {
+		path_put_conditional(path, nd);
 		return err;
+	}
 	*inode = path->dentry->d_inode;
 	return 0;
 
-- 
cgit v1.2.2


From c0bcc9d55252012805300ca01b9b7a143b4daf85 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 18 Jan 2011 12:06:15 +0800
Subject: autofs4 - fix debug print in autofs4_lookup()

oz_mode isn't defined any more, use autofs4_oz_mode(sbi) instead.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 1dba035fc376..427129ab5292 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -488,7 +488,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	sbi = autofs4_sbi(dir->i_sb);
 
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
-		current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
+		current->pid, task_pgrp_nr(current), sbi->catatonic,
+		autofs4_oz_mode(sbi));
 
 	active = autofs4_lookup_active(dentry);
 	if (active) {
-- 
cgit v1.2.2


From 292c5ee802e9b969b84ee671a5e3001d94230f5b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Jan 2011 00:47:38 -0500
Subject: autofs4: keep symlink body in inode->i_private

gets rid of all ->free()/->u.symlink machinery in autofs; we simply
keep symlink bodies in inode->i_private and free them in ->evict_inode().

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  5 -----
 fs/autofs4/inode.c    | 27 +++++++--------------------
 fs/autofs4/root.c     |  2 +-
 fs/autofs4/symlink.c  |  3 +--
 4 files changed, 9 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 1f016bfb42d5..99a4af8d9c83 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -91,11 +91,6 @@ struct autofs_info {
 
 	mode_t	mode;
 	size_t	size;
-
-	void (*free)(struct autofs_info *);
-	union {
-		const char *symlink;
-	} u;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 9e1a9dad23e1..cf8abc793d50 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,14 +22,6 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void ino_lnkfree(struct autofs_info *ino)
-{
-	if (ino->u.symlink) {
-		kfree(ino->u.symlink);
-		ino->u.symlink = NULL;
-	}
-}
-
 struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 				     struct autofs_sb_info *sbi, mode_t mode)
 {
@@ -60,16 +52,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 
 	ino->sbi = sbi;
 
-	if (reinit && ino->free)
-		(ino->free)(ino);
-
-	memset(&ino->u, 0, sizeof(ino->u));
-
-	ino->free = NULL;
-
-	if (S_ISLNK(mode))
-		ino->free = ino_lnkfree;
-
 	return ino;
 }
 
@@ -79,8 +61,6 @@ void autofs4_free_ino(struct autofs_info *ino)
 		ino->dentry->d_fsdata = NULL;
 		ino->dentry = NULL;
 	}
-	if (ino->free)
-		(ino->free)(ino);
 	kfree(ino);
 }
 
@@ -136,9 +116,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+static void autofs4_evict_inode(struct inode *inode)
+{
+	end_writeback(inode);
+	kfree(inode->i_private);
+}
+
 static const struct super_operations autofs4_sops = {
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
+	.evict_inode	= autofs4_evict_inode,
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 427129ab5292..f47aceabf58f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -561,6 +561,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 			kfree(ino);
 		return -ENOMEM;
 	}
+	inode->i_private = cp;
 	d_add(dentry, inode);
 
 	dentry->d_fsdata = ino;
@@ -570,7 +571,6 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (p_ino && dentry->d_parent != dentry)
 		atomic_inc(&p_ino->count);
 
-	ino->u.symlink = cp;
 	dir->i_mtime = CURRENT_TIME;
 
 	return 0;
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
 
 static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	nd_set_link(nd, (char *)ino->u.symlink);
+	nd_set_link(nd, dentry->d_inode->i_private);
 	return NULL;
 }
 
-- 
cgit v1.2.2


From 14a2f00bde7668fe18d1c8355d26c7c96961e1f7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 17:13:33 -0500
Subject: autofs4: autofs4_mkroot() is not different from autofs4_init_ino()

Kill it.  Mind you, it's been an obfuscated call of autofs4_init_ino()
ever since 2.3.99pre6-4...

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/inode.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index cf8abc793d50..7421b47b1bb9 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -215,17 +215,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 	return (*pipefd < 0);
 }
 
-static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
-{
-	struct autofs_info *ino;
-
-	ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
-	if (!ino)
-		return NULL;
-
-	return ino;
-}
-
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode * root_inode;
@@ -269,7 +258,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	/*
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
-	ino = autofs4_mkroot(sbi);
+	ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
 	if (!ino)
 		goto fail_free;
 	root_inode = autofs4_get_inode(s, ino);
-- 
cgit v1.2.2


From 09f12c03fa699ce7d030c47add60577138927d4f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 17:20:23 -0500
Subject: autofs4: pass mode to autofs4_get_inode() explicitly

In all cases we'd set inf->mode to know value just before
passing it to autofs4_get_inode().  That kills the need
to store it in autofs_info and pass it to autofs_init_ino()

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  5 ++---
 fs/autofs4/inode.c    | 16 ++++++++--------
 fs/autofs4/root.c     | 10 +++++-----
 3 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 99a4af8d9c83..c6d66db67ff1 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -89,7 +89,6 @@ struct autofs_info {
 	uid_t uid;
 	gid_t gid;
 
-	mode_t	mode;
 	size_t	size;
 };
 
@@ -170,7 +169,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 	return 0;
 }
 
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
+struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *, mode_t);
 void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
@@ -280,7 +279,7 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode);
+struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi);
 
 /* Queue management functions */
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7421b47b1bb9..6b6f43f00c46 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -23,7 +23,7 @@
 #include <linux/module.h>
 
 struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
-				     struct autofs_sb_info *sbi, mode_t mode)
+				     struct autofs_sb_info *sbi)
 {
 	int reinit = 1;
 
@@ -47,7 +47,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 
 	ino->uid = 0;
 	ino->gid = 0;
-	ino->mode = mode;
 	ino->last_used = jiffies;
 
 	ino->sbi = sbi;
@@ -258,10 +257,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	/*
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
-	ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
+	ino = autofs4_init_ino(NULL, sbi);
 	if (!ino)
 		goto fail_free;
-	root_inode = autofs4_get_inode(s, ino);
+	root_inode = autofs4_get_inode(s, ino, S_IFDIR | 0755);
 	if (!root_inode)
 		goto fail_ino;
 
@@ -345,14 +344,15 @@ fail_unlock:
 }
 
 struct inode *autofs4_get_inode(struct super_block *sb,
-				struct autofs_info *inf)
+				struct autofs_info *inf,
+				mode_t mode)
 {
 	struct inode *inode = new_inode(sb);
 
 	if (inode == NULL)
 		return NULL;
 
-	inode->i_mode = inf->mode;
+	inode->i_mode = mode;
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -360,11 +360,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = get_next_ino();
 
-	if (S_ISDIR(inf->mode)) {
+	if (S_ISDIR(mode)) {
 		inode->i_nlink = 2;
 		inode->i_op = &autofs4_dir_inode_operations;
 		inode->i_fop = &autofs4_dir_operations;
-	} else if (S_ISLNK(inf->mode)) {
+	} else if (S_ISLNK(mode)) {
 		inode->i_size = inf->size;
 		inode->i_op = &autofs4_symlink_inode_operations;
 	}
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f47aceabf58f..e55dcdbeb450 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -508,7 +508,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
 			__managed_dentry_set_managed(dentry);
 
-		ino = autofs4_init_ino(NULL, sbi, 0555);
+		ino = autofs4_init_ino(NULL, sbi);
 		if (!ino)
 			return ERR_PTR(-ENOMEM);
 
@@ -538,7 +538,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555);
+	ino = autofs4_init_ino(ino, sbi);
 	if (!ino)
 		return -ENOMEM;
 
@@ -554,7 +554,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 
 	strcpy(cp, symname);
 
-	inode = autofs4_get_inode(dir->i_sb, ino);
+	inode = autofs4_get_inode(dir->i_sb, ino, S_IFLNK | 0555);
 	if (!inode) {
 		kfree(cp);
 		if (!dentry->d_fsdata)
@@ -733,13 +733,13 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("dentry %p, creating %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
-	ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555);
+	ino = autofs4_init_ino(ino, sbi);
 	if (!ino)
 		return -ENOMEM;
 
 	autofs4_del_active(dentry);
 
-	inode = autofs4_get_inode(dir->i_sb, ino);
+	inode = autofs4_get_inode(dir->i_sb, ino, S_IFDIR | 0555);
 	if (!inode) {
 		if (!dentry->d_fsdata)
 			kfree(ino);
-- 
cgit v1.2.2


From 0bf71d4d005176f6b6587ba64a377f9798213f21 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 17:39:15 -0500
Subject: autofs4: kill ->size in autofs_info

It's used only to pass the length of symlink body to
autofs4_get_inode() in autofs4_dir_symlink().  We can
bloody well set inode->i_size in autofs4_dir_symlink()
directly and be done with that.

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h | 2 --
 fs/autofs4/inode.c    | 2 --
 fs/autofs4/root.c     | 5 +++--
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c6d66db67ff1..0925bacb5c3c 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -88,8 +88,6 @@ struct autofs_info {
 
 	uid_t uid;
 	gid_t gid;
-
-	size_t	size;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 6b6f43f00c46..ac1a99ce820b 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -38,7 +38,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 	if (!reinit) {
 		ino->flags = 0;
 		ino->dentry = NULL;
-		ino->size = 0;
 		INIT_LIST_HEAD(&ino->active);
 		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
@@ -365,7 +364,6 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_op = &autofs4_dir_inode_operations;
 		inode->i_fop = &autofs4_dir_operations;
 	} else if (S_ISLNK(mode)) {
-		inode->i_size = inf->size;
 		inode->i_op = &autofs4_symlink_inode_operations;
 	}
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e55dcdbeb450..1ad3c6ca9b03 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -530,6 +530,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 	struct inode *inode;
+	size_t size = strlen(symname);
 	char *cp;
 
 	DPRINTK("%s <- %.*s", symname,
@@ -544,8 +545,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 
 	autofs4_del_active(dentry);
 
-	ino->size = strlen(symname);
-	cp = kmalloc(ino->size + 1, GFP_KERNEL);
+	cp = kmalloc(size + 1, GFP_KERNEL);
 	if (!cp) {
 		if (!dentry->d_fsdata)
 			kfree(ino);
@@ -562,6 +562,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 		return -ENOMEM;
 	}
 	inode->i_private = cp;
+	inode->i_size = size;
 	d_add(dentry, inode);
 
 	dentry->d_fsdata = ino;
-- 
cgit v1.2.2


From 726a5e0688fd344110d8f2979d87f243a4ba1a48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 17:43:52 -0500
Subject: autofs4: autofs4_get_inode() doesn't need autofs_info * argument
 anymore

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h | 2 +-
 fs/autofs4/inode.c    | 6 ++----
 fs/autofs4/root.c     | 4 ++--
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0925bacb5c3c..8f15162f1672 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -167,7 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 	return 0;
 }
 
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *, mode_t);
+struct inode *autofs4_get_inode(struct super_block *, mode_t);
 void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ac1a99ce820b..b3f9477c9745 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -259,7 +259,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	ino = autofs4_init_ino(NULL, sbi);
 	if (!ino)
 		goto fail_free;
-	root_inode = autofs4_get_inode(s, ino, S_IFDIR | 0755);
+	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
 	if (!root_inode)
 		goto fail_ino;
 
@@ -342,9 +342,7 @@ fail_unlock:
 	return -EINVAL;
 }
 
-struct inode *autofs4_get_inode(struct super_block *sb,
-				struct autofs_info *inf,
-				mode_t mode)
+struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
 {
 	struct inode *inode = new_inode(sb);
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 1ad3c6ca9b03..83e5379c5ade 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -554,7 +554,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 
 	strcpy(cp, symname);
 
-	inode = autofs4_get_inode(dir->i_sb, ino, S_IFLNK | 0555);
+	inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
 	if (!inode) {
 		kfree(cp);
 		if (!dentry->d_fsdata)
@@ -740,7 +740,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	autofs4_del_active(dentry);
 
-	inode = autofs4_get_inode(dir->i_sb, ino, S_IFDIR | 0555);
+	inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
 	if (!inode) {
 		if (!dentry->d_fsdata)
 			kfree(ino);
-- 
cgit v1.2.2


From 5a37db302e698a83209eff22ca8f3fd05eb1d84b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 18:29:35 -0500
Subject: autofs4: mkdir and symlink always get a dentry that had passed lookup

... so ->d_fsdata will have been set up before we get there

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 83e5379c5ade..a5b93e8f49b5 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -539,18 +539,15 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	ino = autofs4_init_ino(ino, sbi);
-	if (!ino)
-		return -ENOMEM;
+	BUG_ON(!ino);
+
+	autofs4_init_ino(ino, sbi);
 
 	autofs4_del_active(dentry);
 
 	cp = kmalloc(size + 1, GFP_KERNEL);
-	if (!cp) {
-		if (!dentry->d_fsdata)
-			kfree(ino);
+	if (!cp)
 		return -ENOMEM;
-	}
 
 	strcpy(cp, symname);
 
@@ -565,8 +562,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 	inode->i_size = size;
 	d_add(dentry, inode);
 
-	dentry->d_fsdata = ino;
-	ino->dentry = dget(dentry);
+	dget(dentry);
 	atomic_inc(&ino->count);
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
 	if (p_ino && dentry->d_parent != dentry)
@@ -734,25 +730,21 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("dentry %p, creating %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
-	ino = autofs4_init_ino(ino, sbi);
-	if (!ino)
-		return -ENOMEM;
+	BUG_ON(!ino);
+
+	autofs4_init_ino(ino, sbi);
 
 	autofs4_del_active(dentry);
 
 	inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
-	if (!inode) {
-		if (!dentry->d_fsdata)
-			kfree(ino);
+	if (!inode)
 		return -ENOMEM;
-	}
 	d_add(dentry, inode);
 
 	if (sbi->version < 5)
 		autofs_set_leaf_automount_flags(dentry);
 
-	dentry->d_fsdata = ino;
-	ino->dentry = dget(dentry);
+	dget(dentry);
 	atomic_inc(&ino->count);
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
 	if (p_ino && dentry->d_parent != dentry)
-- 
cgit v1.2.2


From 26e6c910670171410577c7df2aebe94cef76e150 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 18:43:40 -0500
Subject: autofs4: split autofs4_init_ino()

split init_ino into new_ino and clean_ino; the former is
what used to be init_ino(NULL, sbi), the latter is for cases
where we passed non-NULL ino.  Lose unused arguments.

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  3 ++-
 fs/autofs4/inode.c    | 32 ++++++++++----------------------
 fs/autofs4/root.c     |  6 +++---
 3 files changed, 15 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8f15162f1672..bfa0c6e542f2 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -277,7 +277,8 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
 
 /* Queue management functions */
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b3f9477c9745..0df0c7c46fa2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,35 +22,23 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
-				     struct autofs_sb_info *sbi)
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	int reinit = 1;
-
-	if (ino == NULL) {
-		reinit = 0;
-		ino = kmalloc(sizeof(*ino), GFP_KERNEL);
-	}
-
-	if (ino == NULL)
-		return NULL;
-
-	if (!reinit) {
-		ino->flags = 0;
-		ino->dentry = NULL;
+	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
-		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
-		atomic_set(&ino->count, 0);
+		ino->last_used = jiffies;
+		ino->sbi = sbi;
 	}
+	return ino;
+}
 
+void autofs4_clean_ino(struct autofs_info *ino)
+{
 	ino->uid = 0;
 	ino->gid = 0;
 	ino->last_used = jiffies;
-
-	ino->sbi = sbi;
-
-	return ino;
 }
 
 void autofs4_free_ino(struct autofs_info *ino)
@@ -256,7 +244,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	/*
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
-	ino = autofs4_init_ino(NULL, sbi);
+	ino = autofs4_new_ino(sbi);
 	if (!ino)
 		goto fail_free;
 	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a5b93e8f49b5..f7c97c084c1f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -508,7 +508,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
 			__managed_dentry_set_managed(dentry);
 
-		ino = autofs4_init_ino(NULL, sbi);
+		ino = autofs4_new_ino(sbi);
 		if (!ino)
 			return ERR_PTR(-ENOMEM);
 
@@ -541,7 +541,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 
 	BUG_ON(!ino);
 
-	autofs4_init_ino(ino, sbi);
+	autofs4_clean_ino(ino);
 
 	autofs4_del_active(dentry);
 
@@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	BUG_ON(!ino);
 
-	autofs4_init_ino(ino, sbi);
+	autofs4_clean_ino(ino);
 
 	autofs4_del_active(dentry);
 
-- 
cgit v1.2.2


From b89b12b46211d971d75e5ca8249817bc9e11c453 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jan 2011 21:42:32 -0500
Subject: autofs4: clean ->d_release() and autofs4_free_ino() up

The latter is called only when both ino and dentry are about to
be freed, so cleaning ->d_fsdata and ->dentry is pointless.

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  1 -
 fs/autofs4/inode.c    |  4 ----
 fs/autofs4/root.c     | 30 ++++++++++++++++--------------
 3 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index bfa0c6e542f2..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -338,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
 	return;
 }
 
-void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 0df0c7c46fa2..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -43,10 +43,6 @@ void autofs4_clean_ino(struct autofs_info *ino)
 
 void autofs4_free_ino(struct autofs_info *ino)
 {
-	if (ino->dentry) {
-		ino->dentry->d_fsdata = NULL;
-		ino->dentry = NULL;
-	}
 	kfree(ino);
 }
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f7c97c084c1f..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -37,6 +37,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static struct vfsmount *autofs4_d_automount(struct path *);
 static int autofs4_d_manage(struct dentry *, bool, bool);
+static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
@@ -138,25 +139,26 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-void autofs4_dentry_release(struct dentry *de)
+static void autofs4_dentry_release(struct dentry *de)
 {
-	struct autofs_info *inf;
+	struct autofs_info *ino = autofs4_dentry_ino(de);
+	struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
 
 	DPRINTK("releasing %p", de);
 
-	inf = autofs4_dentry_ino(de);
-	if (inf) {
-		struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
-		if (sbi) {
-			spin_lock(&sbi->lookup_lock);
-			if (!list_empty(&inf->active))
-				list_del(&inf->active);
-			if (!list_empty(&inf->expiring))
-				list_del(&inf->expiring);
-			spin_unlock(&sbi->lookup_lock);
-		}
-		autofs4_free_ino(inf);
+	if (!ino)
+		return;
+
+	if (sbi) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->active))
+			list_del(&ino->active);
+		if (!list_empty(&ino->expiring))
+			list_del(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
 	}
+
+	autofs4_free_ino(ino);
 }
 
 static struct dentry *autofs4_lookup_active(struct dentry *dentry)
-- 
cgit v1.2.2


From 23c3010808de86f21436eb822aacfa551bfc17e4 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 14 Jan 2011 22:39:16 -0600
Subject: GFS2: remove iopen glocks from cache on failed deletes

When a file gets deleted on GFS2, if a node can't get an exclusive lock on the
file's iopen glock, it punts on actually freeing up the space, because another
node is using the file.  When it does this, it needs to drop the iopen glock
from its cache so that the other node can get an exclusive lock on it. Now,
gfs2_delete_inode() sets GL_NOCACHE before dropping the shared lock on the
iopen glock in preparation for grabbing it in the exclusive state.  Since the
node needs the glock in the exclusive state, dropping the shared lock from the
cache doesn't slow down the case where no other nodes are using the file.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 16c2ecac7eb7..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	if (error)
 		goto out_truncate;
 
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
-- 
cgit v1.2.2


From 24d9765fc18c7838ccdbb0d71fb706321d9b824c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 18 Jan 2011 14:49:08 +0000
Subject: GFS2: Fix error path in gfs2_lookup_by_inum()

In the (impossible, except if there is fs corruption) error path
in gfs2_lookup_by_inum() if the call to gfs2_inode_refresh()
fails, it was leaving the function by calling iput() rather
than iget_failed(). This would cause future lookups of the same
inode to block forever.

This patch fixes the problem by moving the call to gfs2_inode_refresh()
into gfs2_inode_lookup() where iget_failed() is part of the error path
already. Also this cleans up some unreachable code and makes
gfs2_set_iop() static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 72 ++++++++++++++++++---------------------------------------
 fs/gfs2/inode.h |  1 -
 2 files changed, 22 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2232b3c780bd..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
 }
 
 /**
- * GFS2 lookup code fills in vfs inode contents based on info obtained
- * from directory entry inside gfs2_inode_lookup(). This has caused issues
- * with NFS code path since its get_dentry routine doesn't have the relevant
- * directory entry when gfs2_inode_lookup() is invoked. Part of the code
- * segment inside gfs2_inode_lookup code needs to get moved around.
+ * gfs2_set_iop - Sets inode operations
+ * @inode: The inode with correct i_mode filled in
  *
- * Clears I_NEW as well.
- **/
+ * GFS2 lookup code fills in vfs inode contents based on info obtained
+ * from directory entry inside gfs2_inode_lookup().
+ */
 
-void gfs2_set_iop(struct inode *inode)
+static void gfs2_set_iop(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	umode_t mode = inode->i_mode;
@@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
 		inode->i_op = &gfs2_file_iops;
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 	}
-
-	unlock_new_inode(inode);
 }
 
 /**
@@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
  * Returns: A VFS inode, or an error
  */
 
-struct inode *gfs2_inode_lookup(struct super_block *sb,
-				unsigned int type,
-				u64 no_addr,
-				u64 no_formal_ino)
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
+				u64 no_addr, u64 no_formal_ino)
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
@@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
 		if (unlikely(error))
 			goto fail_iopen;
-		ip->i_iopen_gh.gh_gl->gl_object = ip;
 
+		ip->i_iopen_gh.gh_gl->gl_object = ip;
 		gfs2_glock_put(io_gl);
 		io_gl = NULL;
 
-		if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
-			goto gfs2_nfsbypass;
-
-		inode->i_mode = DT2IF(type);
-
-		/*
-		 * We must read the inode in order to work out its type in
-		 * this case. Note that this doesn't happen often as we normally
-		 * know the type beforehand. This code path only occurs during
-		 * unlinked inode recovery (where it is safe to do this glock,
-		 * which is not true in the general case).
-		 */
 		if (type == DT_UNKNOWN) {
-			struct gfs2_holder gh;
-			error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-			if (unlikely(error))
-				goto fail_glock;
-			/* Inode is now uptodate */
-			gfs2_glock_dq_uninit(&gh);
+			/* Inode glock must be locked already */
+			error = gfs2_inode_refresh(GFS2_I(inode));
+			if (error)
+				goto fail_refresh;
+		} else {
+			inode->i_mode = DT2IF(type);
 		}
 
 		gfs2_set_iop(inode);
+		unlock_new_inode(inode);
 	}
 
-gfs2_nfsbypass:
 	return inode;
-fail_glock:
-	gfs2_glock_dq(&ip->i_iopen_gh);
+
+fail_refresh:
+	ip->i_iopen_gh.gh_gl->gl_object = NULL;
+	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 fail_iopen:
 	if (io_gl)
 		gfs2_glock_put(io_gl);
 fail_put:
-	if (inode->i_state & I_NEW)
-		ip->i_gl->gl_object = NULL;
+	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
 fail:
-	if (inode->i_state & I_NEW)
-		iget_failed(inode);
-	else
-		iput(inode);
+	iget_failed(inode);
 	return ERR_PTR(error);
 }
 
@@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 	if (IS_ERR(inode))
 		goto fail;
 
-	error = gfs2_inode_refresh(GFS2_I(inode));
-	if (error)
-		goto fail_iput;
-
-	/* Pick up the works we bypass in gfs2_inode_lookup */
-	if (inode->i_state & I_NEW) 
-		gfs2_set_iop(inode);
-
 	/* Two extra checks for NFS only */
 	if (no_formal_ino) {
 		error = -ESTALE;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 732a183efdb3..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,7 +96,6 @@ err:
 	return -EIO;
 }
 
-extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
 				       u64 no_addr, u64 no_formal_ino);
 extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
-- 
cgit v1.2.2


From 50aac4fec503960380ab594a93a6fbfdf3f8915f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 07:59:40 -0800
Subject: ceph: fix cap_wanted_delay_{min,max} mount option initialization

These were initialized to 0 instead of the default, fallout from the RBD
refactor in 3d14c5d2b6e15c21d8e5467dc62d33127c23a644.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf6f0f34082a..9c5085465a63 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 
         fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
         fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-- 
cgit v1.2.2


From 24be0c481067560b11441e794e27f166a3568863 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 08:48:06 -0800
Subject: ceph: fix erroneous cap flush to non-auth mds

The int flushing is global and not clear on each iteration of the loop,
which can cause a second flush of caps to any MDSs with ids greater than
the auth.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 60d27bc9eb83..f654c7e933ac 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1658,6 +1658,8 @@ ack:
 
 		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
 			flushing = __mark_caps_flushing(inode, session);
+		else
+			flushing = 0;
 
 		mds = cap->mds;  /* remember mds, so we don't repeat */
 		sent++;
-- 
cgit v1.2.2


From 088b3f5e9ee2649f5cfc2f08d8ce654e3eeba310 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 08:56:01 -0800
Subject: ceph: fix flushing of caps vs cap import

If we are mid-flush and a cap is migrated to another node, we need to
resend the cap flush message to the new MDS, and do so with the original
flush_seq to avoid leaking across a sync boundary.  Previously we didn't
redo the flush (we only flushed newly dirty data), which would cause a
later sync to hang forever.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f654c7e933ac..7def3f5903dd 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1560,9 +1560,10 @@ retry_locked:
 		/* NOTE: no side-effects allowed, until we take s_mutex */
 
 		revoking = cap->implemented & ~cap->issued;
-		if (revoking)
-			dout(" mds%d revoking %s\n", cap->mds,
-			     ceph_cap_string(revoking));
+		dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap->implemented),
+		     ceph_cap_string(revoking));
 
 		if (cap == ci->i_auth_cap &&
 		    (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1942,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 	}
 }
 
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_session *session,
+				     struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	int delayed = 0;
+
+	spin_lock(&inode->i_lock);
+	cap = ci->i_auth_cap;
+	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
+	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+	__ceph_flush_snaps(ci, &session, 1);
+	if (ci->i_flushing_caps) {
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				     __ceph_caps_used(ci),
+				     __ceph_caps_wanted(ci),
+				     cap->issued | cap->implemented,
+				     ci->i_flushing_caps, NULL);
+		if (delayed) {
+			spin_lock(&inode->i_lock);
+			__cap_delay_requeue(mdsc, ci);
+			spin_unlock(&inode->i_lock);
+		}
+	} else {
+		spin_unlock(&inode->i_lock);
+	}
+}
+
 
 /*
  * Take references to capabilities we hold, so that we don't release
@@ -2689,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 	ceph_add_cap(inode, session, cap_id, -1,
 		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
 		     NULL /* no caps context */);
-	try_flush_caps(inode, session, NULL);
+	kick_flushing_inode_caps(mdsc, session, inode);
 	up_read(&mdsc->snap_rwsem);
 
 	/* make sure we re-request max_size, if necessary */
-- 
cgit v1.2.2


From 7e57b81c7688c762bc9e775bc83f9fc17946f527 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 09:00:01 -0800
Subject: ceph: avoid immediate cap check after import

The NODELAY flag avoids the heuristics that delay cap (issued/wanted)
release.  There's no reason for that after we import a cap, and it kills
whatever benefit we get from those delays.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7def3f5903dd..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2817,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, snaptrace_len);
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
-				session);
+		ceph_check_caps(ceph_inode(inode), 0, session);
 		goto done_unlocked;
 	}
 
-- 
cgit v1.2.2


From 12fed00de963433128b5366a21a55808fab2f756 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 17 Jan 2011 20:15:44 +0300
Subject: CIFS: Fix oplock break handling (try #2)

When we get oplock break notification we should set the appropriate
value of OplockLevel field in oplock break acknowledge according to
the oplock level held by the client in this time. As we only can have
level II oplock or no oplock in the case of oplock break, we should be
aware only about clientCanCacheRead field in cifsInodeInfo structure.

Also fix bug connected with wrong interpretation of OplockLevel field
during oplock break notification processing.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/cifssmb.c   |  4 +++-
 fs/cifs/file.c      | 21 +++++++++++----------
 fs/cifs/misc.c      |  2 +-
 4 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e6d1481b16c1..95d5dbbb4c7a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -347,7 +347,7 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 			const __u16 netfid, const __u64 len,
 			const __u64 offset, const __u32 numUnlock,
 			const __u32 numLock, const __u8 lockType,
-			const bool waitFlag);
+			const bool waitFlag, const __u8 oplock_level);
 extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 			const __u16 smb_file_id, const int get_flag,
 			const __u64 len, struct file_lock *,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f6795e524d3..3652cc60314c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1663,7 +1663,8 @@ int
 CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	    const __u16 smb_file_id, const __u64 len,
 	    const __u64 offset, const __u32 numUnlock,
-	    const __u32 numLock, const __u8 lockType, const bool waitFlag)
+	    const __u32 numLock, const __u8 lockType,
+	    const bool waitFlag, const __u8 oplock_level)
 {
 	int rc = 0;
 	LOCK_REQ *pSMB = NULL;
@@ -1691,6 +1692,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	pSMB->NumberOfLocks = cpu_to_le16(numLock);
 	pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
 	pSMB->LockType = lockType;
+	pSMB->OplockLevel = oplock_level;
 	pSMB->AndXCommand = 0xFF;	/* none */
 	pSMB->Fid = smb_file_id; /* netfid stays le */
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d843631c028d..af371910f543 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -726,12 +726,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 		/* BB we could chain these into one lock request BB */
 		rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
-				 0, 1, lockType, 0 /* wait flag */ );
+				 0, 1, lockType, 0 /* wait flag */, 0);
 		if (rc == 0) {
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
 					 pfLock->fl_start, 1 /* numUnlock */ ,
 					 0 /* numLock */ , lockType,
-					 0 /* wait flag */ );
+					 0 /* wait flag */, 0);
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
 				cERROR(1, "Error unlocking previously locked "
@@ -748,13 +748,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 				rc = CIFSSMBLock(xid, tcon, netfid, length,
 					pfLock->fl_start, 0, 1,
 					lockType | LOCKING_ANDX_SHARED_LOCK,
-					0 /* wait flag */);
+					0 /* wait flag */, 0);
 				if (rc == 0) {
 					rc = CIFSSMBLock(xid, tcon, netfid,
 						length, pfLock->fl_start, 1, 0,
 						lockType |
 						LOCKING_ANDX_SHARED_LOCK,
-						0 /* wait flag */);
+						0 /* wait flag */, 0);
 					pfLock->fl_type = F_RDLCK;
 					if (rc != 0)
 						cERROR(1, "Error unlocking "
@@ -797,8 +797,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 		if (numLock) {
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
-					pfLock->fl_start,
-					0, numLock, lockType, wait_flag);
+					 pfLock->fl_start, 0, numLock, lockType,
+					 wait_flag, 0);
 
 			if (rc == 0) {
 				/* For Windows locks we must store them. */
@@ -818,9 +818,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 						(pfLock->fl_start + length) >=
 						(li->offset + li->length)) {
 					stored_rc = CIFSSMBLock(xid, tcon,
-							netfid,
-							li->length, li->offset,
-							1, 0, li->type, false);
+							netfid, li->length,
+							li->offset, 1, 0,
+							li->type, false, 0);
 					if (stored_rc)
 						rc = stored_rc;
 					else {
@@ -2192,7 +2192,8 @@ void cifs_oplock_break(struct work_struct *work)
 	 */
 	if (!cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
-				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+				 cinode->clientCanCacheRead ? 1 : 0);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 43f10281bc19..09bfcf08a90f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -571,7 +571,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);
 
 				cifs_set_oplock_level(pCifsInode,
-						      pSMB->OplockLevel);
+					pSMB->OplockLevel ? OPLOCK_READ : 0);
 				/*
 				 * cifs_oplock_break_put() can't be called
 				 * from here.  Get reference after queueing
-- 
cgit v1.2.2


From 941b853d779de3298e39f1eb4e252984464eaea8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:01 -0500
Subject: cifs: don't fail writepages on -EAGAIN errors

If CIFSSMBWrite2 returns -EAGAIN, then the error should be considered
temporary. CIFS should retry the write instead of setting an error on
the mapping and returning.

For WB_SYNC_ALL, just retry the write immediately. In the WB_SYNC_NONE
case, call redirty_page_for_writeback on all of the pages that didn't
get written out and then move on.

Also, fix up the handling of a short write with a successful return
code. MS-CIFS says that 0 bytes_written means ENOSPC or EFBIG. It
doesn't mention what a short, but non-zero write means, so for now
treat it as we would an -EAGAIN return.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 49 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index af371910f543..cfa2e5ebcafe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1377,6 +1377,7 @@ retry:
 				break;
 		}
 		if (n_iov) {
+retry_write:
 			open_file = find_writable_file(CIFS_I(mapping->host),
 							false);
 			if (!open_file) {
@@ -1389,31 +1390,55 @@ retry:
 						   &bytes_written, iov, n_iov,
 						   long_op);
 				cifsFileInfo_put(open_file);
-				cifs_update_eof(cifsi, offset, bytes_written);
 			}
 
-			if (rc || bytes_written < bytes_to_write) {
-				cERROR(1, "Write2 ret %d, wrote %d",
-					  rc, bytes_written);
-				mapping_set_error(mapping, rc);
-			} else {
+			cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
+
+			/*
+			 * For now, treat a short write as if nothing got
+			 * written. A zero length write however indicates
+			 * ENOSPC or EFBIG. We have no way to know which
+			 * though, so call it ENOSPC for now. EFBIG would
+			 * get translated to AS_EIO anyway.
+			 *
+			 * FIXME: make it take into account the data that did
+			 *	  get written
+			 */
+			if (rc == 0) {
+				if (bytes_written == 0)
+					rc = -ENOSPC;
+				else if (bytes_written < bytes_to_write)
+					rc = -EAGAIN;
+			}
+
+			/* retry on data-integrity flush */
+			if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
+				goto retry_write;
+
+			/* fix the stats and EOF */
+			if (bytes_written > 0) {
 				cifs_stats_bytes_written(tcon, bytes_written);
+				cifs_update_eof(cifsi, offset, bytes_written);
 			}
 
 			for (i = 0; i < n_iov; i++) {
 				page = pvec.pages[first + i];
-				/* Should we also set page error on
-				success rc but too little data written? */
-				/* BB investigate retry logic on temporary
-				server crash cases and how recovery works
-				when page marked as error */
-				if (rc)
+				/* on retryable write error, redirty page */
+				if (rc == -EAGAIN)
+					redirty_page_for_writepage(wbc, page);
+				else if (rc != 0)
 					SetPageError(page);
 				kunmap(page);
 				unlock_page(page);
 				end_page_writeback(page);
 				page_cache_release(page);
 			}
+
+			if (rc != -EAGAIN)
+				mapping_set_error(mapping, rc);
+			else
+				rc = 0;
+
 			if ((wbc->nr_to_write -= n_iov) <= 0)
 				done = 1;
 			index = next;
-- 
cgit v1.2.2


From 9d78315b03fc91228306db42edc533efa69cb518 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:01 -0500
Subject: cifs: no need to mark smb_ses_list as cifs_demultiplex_thread is
 exiting

The TCP_Server_Info is refcounted and every SMB session holds a
reference to it. Thus, smb_ses_list is always going to be empty when
cifsd is coming down. This is dead code.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 44 +++-----------------------------------------
 1 file changed, 3 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9f59887badd2..75b538f50b12 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -346,7 +346,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	struct kvec iov;
 	struct socket *csocket = server->ssocket;
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
 	struct task_struct *task_to_wake = NULL;
 	struct mid_q_entry *mid_entry;
 	char temp;
@@ -677,44 +676,19 @@ multi_t2_fnd:
 	if (smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(smallbuf);
 
-	/*
-	 * BB: we shouldn't have to do any of this. It shouldn't be
-	 * possible to exit from the thread with active SMB sessions
-	 */
-	spin_lock(&cifs_tcp_ses_lock);
-	if (list_empty(&server->pending_mid_q)) {
-		/* loop through server session structures attached to this and
-		    mark them dead */
-		list_for_each(tmp, &server->smb_ses_list) {
-			ses = list_entry(tmp, struct cifsSesInfo,
-					 smb_ses_list);
-			ses->status = CifsExiting;
-			ses->server = NULL;
-		}
-		spin_unlock(&cifs_tcp_ses_lock);
-	} else {
-		/* although we can not zero the server struct pointer yet,
-		since there are active requests which may depnd on them,
-		mark the corresponding SMB sessions as exiting too */
-		list_for_each(tmp, &server->smb_ses_list) {
-			ses = list_entry(tmp, struct cifsSesInfo,
-					 smb_ses_list);
-			ses->status = CifsExiting;
-		}
-
+	if (!list_empty(&server->pending_mid_q)) {
 		spin_lock(&GlobalMid_Lock);
 		list_for_each(tmp, &server->pending_mid_q) {
-		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
 				cFYI(1, "Clearing Mid 0x%x - waking up ",
-					 mid_entry->mid);
+					mid_entry->mid);
 				task_to_wake = mid_entry->tsk;
 				if (task_to_wake)
 					wake_up_process(task_to_wake);
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		spin_unlock(&cifs_tcp_ses_lock);
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
 	}
@@ -732,18 +706,6 @@ multi_t2_fnd:
 		coming home not much else we can do but free the memory */
 	}
 
-	/* last chance to mark ses pointers invalid
-	if there are any pointing to this (e.g
-	if a crazy root user tried to kill cifsd
-	kernel thread explicitly this might happen) */
-	/* BB: This shouldn't be necessary, see above */
-	spin_lock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &server->smb_ses_list) {
-		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-		ses->server = NULL;
-	}
-	spin_unlock(&cifs_tcp_ses_lock);
-
 	kfree(server->hostname);
 	task_to_wake = xchg(&server->tsk, NULL);
 	kfree(server);
-- 
cgit v1.2.2


From c5797a945cac4c470f0113fc839c521aab0d799d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:01 -0500
Subject: cifs: make wait_for_free_request take a TCP_Server_Info pointer

The cifsSesInfo pointer is only used to get at the server.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by:  Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 59ca81b16919..9a14f77e0ab2 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -244,31 +244,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
 	return smb_sendv(server, &iov, 1);
 }
 
-static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
+static int wait_for_free_request(struct TCP_Server_Info *server,
+				 const int long_op)
 {
 	if (long_op == CIFS_ASYNC_OP) {
 		/* oplock breaks must not be held up */
-		atomic_inc(&ses->server->inFlight);
+		atomic_inc(&server->inFlight);
 		return 0;
 	}
 
 	spin_lock(&GlobalMid_Lock);
 	while (1) {
-		if (atomic_read(&ses->server->inFlight) >=
-				cifs_max_pending){
+		if (atomic_read(&server->inFlight) >= cifs_max_pending) {
 			spin_unlock(&GlobalMid_Lock);
 #ifdef CONFIG_CIFS_STATS2
-			atomic_inc(&ses->server->num_waiters);
+			atomic_inc(&server->num_waiters);
 #endif
-			wait_event(ses->server->request_q,
-				   atomic_read(&ses->server->inFlight)
+			wait_event(server->request_q,
+				   atomic_read(&server->inFlight)
 				     < cifs_max_pending);
 #ifdef CONFIG_CIFS_STATS2
-			atomic_dec(&ses->server->num_waiters);
+			atomic_dec(&server->num_waiters);
 #endif
 			spin_lock(&GlobalMid_Lock);
 		} else {
-			if (ses->server->tcpStatus == CifsExiting) {
+			if (server->tcpStatus == CifsExiting) {
 				spin_unlock(&GlobalMid_Lock);
 				return -ENOENT;
 			}
@@ -278,7 +278,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 
 			/* update # of requests on the wire to server */
 			if (long_op != CIFS_BLOCKING_OP)
-				atomic_inc(&ses->server->inFlight);
+				atomic_inc(&server->inFlight);
 			spin_unlock(&GlobalMid_Lock);
 			break;
 		}
@@ -413,7 +413,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	rc = wait_for_free_request(ses, long_op);
+	rc = wait_for_free_request(ses->server, long_op);
 	if (rc) {
 		cifs_small_buf_release(in_buf);
 		return rc;
@@ -610,7 +610,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses, long_op);
+	rc = wait_for_free_request(ses->server, long_op);
 	if (rc)
 		return rc;
 
@@ -845,7 +845,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
+	rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
 	if (rc)
 		return rc;
 
-- 
cgit v1.2.2


From 8097531a5cb55c6472118da094dc88caf9be66ac Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:02 -0500
Subject: cifs: clean up accesses to midCount

It's an atomic_t and the code accesses the "counter" field in it directly
instead of using atomic_read(). It also is sometimes accessed under a
spinlock and sometimes not. Move it out of the spinlock since we don't need
belt-and-suspenders for something that's just informational.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 2 +-
 fs/cifs/connect.c    | 2 +-
 fs/cifs/transport.c  | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ede98300a8cd..e2d0d5d455fa 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -331,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 				atomic_read(&totSmBufAllocCount));
 #endif /* CONFIG_CIFS_STATS2 */
 
-	seq_printf(m, "Operations (MIDs): %d\n", midCount.counter);
+	seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
 	seq_printf(m,
 		"\n%d session %d share reconnects\n",
 		tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 75b538f50b12..465ecad6d7cc 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -628,7 +628,7 @@ multi_t2_fnd:
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
 			cERROR(1, "No task to wake, unknown frame received! "
-				   "NumMids %d", midCount.counter);
+				   "NumMids %d", atomic_read(&midCount));
 			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 				      sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9a14f77e0ab2..b9eb0cffa003 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -61,10 +61,10 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 		temp->tsk = current;
 	}
 
-	spin_lock(&GlobalMid_Lock);
-	list_add_tail(&temp->qhead, &server->pending_mid_q);
 	atomic_inc(&midCount);
 	temp->midState = MID_REQUEST_ALLOCATED;
+	spin_lock(&GlobalMid_Lock);
+	list_add_tail(&temp->qhead, &server->pending_mid_q);
 	spin_unlock(&GlobalMid_Lock);
 	return temp;
 }
@@ -78,8 +78,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
 	spin_lock(&GlobalMid_Lock);
 	midEntry->midState = MID_FREE;
 	list_del(&midEntry->qhead);
-	atomic_dec(&midCount);
 	spin_unlock(&GlobalMid_Lock);
+	atomic_dec(&midCount);
 	if (midEntry->largeBuf)
 		cifs_buf_release(midEntry->resp_buf);
 	else
-- 
cgit v1.2.2


From ddc8cf8fc718587a3788330bf4f32b379f08b250 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:02 -0500
Subject: cifs: move locked sections out of DeleteMidQEntry and AllocMidQEntry

In later patches, we're going to need to have finer-grained control
over the addition and removal of these structs from the pending_mid_q
and we'll need to be able to call the destructor while holding the
spinlock. Move the locked sections out of both routines and into
the callers. Fix up current callers of DeleteMidQEntry to call a new
routine that dequeues the entry and then destroys it.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b9eb0cffa003..801726b11e1e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -63,9 +63,6 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 
 	atomic_inc(&midCount);
 	temp->midState = MID_REQUEST_ALLOCATED;
-	spin_lock(&GlobalMid_Lock);
-	list_add_tail(&temp->qhead, &server->pending_mid_q);
-	spin_unlock(&GlobalMid_Lock);
 	return temp;
 }
 
@@ -75,10 +72,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
 #ifdef CONFIG_CIFS_STATS2
 	unsigned long now;
 #endif
-	spin_lock(&GlobalMid_Lock);
 	midEntry->midState = MID_FREE;
-	list_del(&midEntry->qhead);
-	spin_unlock(&GlobalMid_Lock);
 	atomic_dec(&midCount);
 	if (midEntry->largeBuf)
 		cifs_buf_release(midEntry->resp_buf);
@@ -103,6 +97,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
 	mempool_free(midEntry, cifs_mid_poolp);
 }
 
+static void
+delete_mid(struct mid_q_entry *mid)
+{
+	spin_lock(&GlobalMid_Lock);
+	list_del(&mid->qhead);
+	spin_unlock(&GlobalMid_Lock);
+
+	DeleteMidQEntry(mid);
+}
+
 static int
 smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 {
@@ -308,6 +312,9 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
 	*ppmidQ = AllocMidQEntry(in_buf, ses->server);
 	if (*ppmidQ == NULL)
 		return -ENOMEM;
+	spin_lock(&GlobalMid_Lock);
+	list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
+	spin_unlock(&GlobalMid_Lock);
 	return 0;
 }
 
@@ -508,7 +515,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		/* Update # of requests on wire to server */
 		atomic_dec(&ses->server->inFlight);
 		wake_up(&ses->server->request_q);
@@ -564,14 +571,14 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if ((flags & CIFS_NO_RESP) == 0)
 			midQ->resp_buf = NULL;  /* mark it so buf will
 						   not be freed by
-						   DeleteMidQEntry */
+						   delete_mid */
 	} else {
 		rc = -EIO;
 		cFYI(1, "Bad MID state?");
 	}
 
 out:
-	DeleteMidQEntry(midQ);
+	delete_mid(midQ);
 	atomic_dec(&ses->server->inFlight);
 	wake_up(&ses->server->request_q);
 
@@ -699,7 +706,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		/* Update # of requests on wire to server */
 		atomic_dec(&ses->server->inFlight);
 		wake_up(&ses->server->request_q);
@@ -755,7 +762,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	}
 
 out:
-	DeleteMidQEntry(midQ);
+	delete_mid(midQ);
 	atomic_dec(&ses->server->inFlight);
 	wake_up(&ses->server->request_q);
 
@@ -863,7 +870,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
 	if (rc) {
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		mutex_unlock(&ses->server->srv_mutex);
 		return rc;
 	}
@@ -880,7 +887,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0) {
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		return rc;
 	}
 
@@ -902,7 +909,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 			rc = send_nt_cancel(tcon, in_buf, midQ);
 			if (rc) {
-				DeleteMidQEntry(midQ);
+				delete_mid(midQ);
 				return rc;
 			}
 		} else {
@@ -914,7 +921,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 			/* If we get -ENOLCK back the lock may have
 			   already been removed. Don't exit in this case. */
 			if (rc && rc != -ENOLCK) {
-				DeleteMidQEntry(midQ);
+				delete_mid(midQ);
 				return rc;
 			}
 		}
@@ -951,7 +958,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		return rc;
 	}
 
@@ -1001,7 +1008,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
 
 out:
-	DeleteMidQEntry(midQ);
+	delete_mid(midQ);
 	if (rstart && rc == -EACCES)
 		return -ERESTARTSYS;
 	return rc;
-- 
cgit v1.2.2


From 053d50344568e5a4054266b44040297531125281 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:02 -0500
Subject: cifs: move mid result processing into common function

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 121 +++++++++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 801726b11e1e..15059c7ef2ae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -389,6 +389,42 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	return rc;
 }
 
+static int
+sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+{
+	int rc = 0;
+
+	spin_lock(&GlobalMid_Lock);
+
+	if (mid->resp_buf) {
+		spin_unlock(&GlobalMid_Lock);
+		return rc;
+	}
+
+	cERROR(1, "No response to cmd %d mid %d", mid->command, mid->mid);
+	if (mid->midState == MID_REQUEST_SUBMITTED) {
+		if (server->tcpStatus == CifsExiting)
+			rc = -EHOSTDOWN;
+		else {
+			server->tcpStatus = CifsNeedReconnect;
+			mid->midState = MID_RETRY_NEEDED;
+		}
+	}
+
+	if (rc != -EHOSTDOWN) {
+		if (mid->midState == MID_RETRY_NEEDED) {
+			rc = -EAGAIN;
+			cFYI(1, "marking request for retry");
+		} else {
+			rc = -EIO;
+		}
+	}
+	spin_unlock(&GlobalMid_Lock);
+
+	delete_mid(mid);
+	return rc;
+}
+
 int
 SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	     struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -492,37 +528,13 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	/* No user interrupts in wait - wreaks havoc with performance */
 	wait_for_response(ses, midQ, timeout, 10 * HZ);
 
-	spin_lock(&GlobalMid_Lock);
-
-	if (midQ->resp_buf == NULL) {
-		cERROR(1, "No response to cmd %d mid %d",
-			midQ->command, midQ->mid);
-		if (midQ->midState == MID_REQUEST_SUBMITTED) {
-			if (ses->server->tcpStatus == CifsExiting)
-				rc = -EHOSTDOWN;
-			else {
-				ses->server->tcpStatus = CifsNeedReconnect;
-				midQ->midState = MID_RETRY_NEEDED;
-			}
-		}
-
-		if (rc != -EHOSTDOWN) {
-			if (midQ->midState == MID_RETRY_NEEDED) {
-				rc = -EAGAIN;
-				cFYI(1, "marking request for retry");
-			} else {
-				rc = -EIO;
-			}
-		}
-		spin_unlock(&GlobalMid_Lock);
-		delete_mid(midQ);
-		/* Update # of requests on wire to server */
+	rc = sync_mid_result(midQ, ses->server);
+	if (rc != 0) {
 		atomic_dec(&ses->server->inFlight);
 		wake_up(&ses->server->request_q);
 		return rc;
 	}
 
-	spin_unlock(&GlobalMid_Lock);
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -684,36 +696,13 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	/* No user interrupts in wait - wreaks havoc with performance */
 	wait_for_response(ses, midQ, timeout, 10 * HZ);
 
-	spin_lock(&GlobalMid_Lock);
-	if (midQ->resp_buf == NULL) {
-		cERROR(1, "No response for cmd %d mid %d",
-			  midQ->command, midQ->mid);
-		if (midQ->midState == MID_REQUEST_SUBMITTED) {
-			if (ses->server->tcpStatus == CifsExiting)
-				rc = -EHOSTDOWN;
-			else {
-				ses->server->tcpStatus = CifsNeedReconnect;
-				midQ->midState = MID_RETRY_NEEDED;
-			}
-		}
-
-		if (rc != -EHOSTDOWN) {
-			if (midQ->midState == MID_RETRY_NEEDED) {
-				rc = -EAGAIN;
-				cFYI(1, "marking request for retry");
-			} else {
-				rc = -EIO;
-			}
-		}
-		spin_unlock(&GlobalMid_Lock);
-		delete_mid(midQ);
-		/* Update # of requests on wire to server */
+	rc = sync_mid_result(midQ, ses->server);
+	if (rc != 0) {
 		atomic_dec(&ses->server->inFlight);
 		wake_up(&ses->server->request_q);
 		return rc;
 	}
 
-	spin_unlock(&GlobalMid_Lock);
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -933,35 +922,11 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		}
 	}
 
-	spin_lock(&GlobalMid_Lock);
-	if (midQ->resp_buf) {
-		spin_unlock(&GlobalMid_Lock);
-		receive_len = midQ->resp_buf->smb_buf_length;
-	} else {
-		cERROR(1, "No response for cmd %d mid %d",
-			  midQ->command, midQ->mid);
-		if (midQ->midState == MID_REQUEST_SUBMITTED) {
-			if (ses->server->tcpStatus == CifsExiting)
-				rc = -EHOSTDOWN;
-			else {
-				ses->server->tcpStatus = CifsNeedReconnect;
-				midQ->midState = MID_RETRY_NEEDED;
-			}
-		}
-
-		if (rc != -EHOSTDOWN) {
-			if (midQ->midState == MID_RETRY_NEEDED) {
-				rc = -EAGAIN;
-				cFYI(1, "marking request for retry");
-			} else {
-				rc = -EIO;
-			}
-		}
-		spin_unlock(&GlobalMid_Lock);
-		delete_mid(midQ);
+	rc = sync_mid_result(midQ, ses->server);
+	if (rc != 0)
 		return rc;
-	}
 
+	receive_len = midQ->resp_buf->smb_buf_length;
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
 		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
 			receive_len, xid);
-- 
cgit v1.2.2


From 1cd3508d5eab6a88fa643119cedd34b04215cffe Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 19 Jan 2011 17:53:44 +0000
Subject: [CIFS] Update cifs version number

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 851030f74939..4739a531cded 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -118,5 +118,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.68"
+#define CIFS_VERSION   "1.69"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.2


From 540b2e377797d8715469d408b887baa9310c5f3e Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Tue, 18 Jan 2011 22:33:54 -0600
Subject: cifs: Fix regression during share-level security mounts (Repost)

NTLM response length was changed to 16 bytes instead of 24 bytes
that are sent in Tree Connection Request during share-level security
share mounts.  Revert it back to 24 bytes.

Reported-and-Tested-by: Grzegorz Ozanski <grzegorz.ozanski@intel.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
Cc: stable@kernel.org
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 465ecad6d7cc..5c7f8450dbe0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2927,7 +2927,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		bcc_ptr++;              /* skip password */
 		/* already aligned so no need to do it below */
 	} else {
-		pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+		pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 		/* BB FIXME add code to fail this if NTLMv2 or Kerberos
 		   specified as required (when that support is added to
 		   the vfs in the future) as only NTLM or the much
@@ -2945,7 +2945,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 #endif /* CIFS_WEAK_PW_HASH */
 		SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
 
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
 			/* must align unicode strings */
 			*bcc_ptr = 0; /* null byte password */
-- 
cgit v1.2.2


From 0da2a4ac33c291728d8be5bdb865467dcb078d13 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Wed, 19 Jan 2011 14:18:50 -0500
Subject: NFS: fix handling of malloc failure during nfs_flush_multi()

Cleanup of the allocated list entries should not call
put_nfs_open_context() on each entry, as the context will
always be NULL, causing an oops.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 10d648ea128b..c8278f4046cb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -932,7 +932,7 @@ out_bad:
 	while (!list_empty(&list)) {
 		data = list_entry(list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
-		nfs_writedata_release(data);
+		nfs_writedata_free(data);
 	}
 	nfs_redirty_request(req);
 	return -ENOMEM;
-- 
cgit v1.2.2


From 2fbc2f1729e785a7b2faf9d8d60926bb1ff62af0 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Mon, 6 Dec 2010 14:56:46 -0600
Subject: cifs: Use mask of ACEs for SID Everyone to calculate all three
 permissions user, group, and other

If a DACL has entries for ACEs for SID Everyone and Authenticated Users,
factor in mask in respective entries during calculation of permissions
for all three, user, group, and other.

http://technet.microsoft.com/en-us/library/bb463216.aspx

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index a437ec391a01..1e7636b145a8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
 ;
 
 
-/* security id for everyone */
+/* security id for everyone/world system group */
 static const struct cifs_sid sid_everyone = {
 	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
+/* security id for Authenticated Users system group */
+static const struct cifs_sid sid_authusers = {
+	1, 1, {0, 0, 0, 0, 0, 5}, {11} };
 /* group users */
 static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
 
@@ -365,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 	if (num_aces  > 0) {
 		umode_t user_mask = S_IRWXU;
 		umode_t group_mask = S_IRWXG;
-		umode_t other_mask = S_IRWXO;
+		umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
 
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
@@ -390,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &other_mask);
+			if (compare_sids(&(ppace[i]->sid), &sid_authusers))
+				access_flags_to_mode(ppace[i]->access_req,
+						     ppace[i]->type,
+						     &fattr->cf_mode,
+						     &other_mask);
+
 
 /*			memcpy((void *)(&(cifscred->aces[i])),
 				(void *)ppace[i],
-- 
cgit v1.2.2


From 0ade640e9cda805692dbf688f4bb69e94719275a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:02 -0500
Subject: cifs: wait indefinitely for responses

The client should not be timing out on individual SMB requests. Too much
of the state between client and server is tied to the state of the
socket. If we time out requests and issue spurious disconnects then that
comprimises data integrity.

Instead of doing this complicated dance where we try to decide how long
to wait for a response for particular requests, have the client instead
wait indefinitely for a response. Also, use a TASK_KILLABLE sleep here
so that fatal signals will break out of this waiting.

Later patches will add support for detecting dead peers and forcing
reconnects based on that.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by:  Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 110 ++++++++--------------------------------------------
 1 file changed, 17 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 15059c7ef2ae..c41c9c4f0a79 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -318,48 +318,17 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
 	return 0;
 }
 
-static int wait_for_response(struct cifsSesInfo *ses,
-			struct mid_q_entry *midQ,
-			unsigned long timeout,
-			unsigned long time_to_wait)
+static int
+wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 {
-	unsigned long curr_timeout;
-
-	for (;;) {
-		curr_timeout = timeout + jiffies;
-		wait_event_timeout(ses->server->response_q,
-			midQ->midState != MID_REQUEST_SUBMITTED, timeout);
-
-		if (time_after(jiffies, curr_timeout) &&
-			(midQ->midState == MID_REQUEST_SUBMITTED) &&
-			((ses->server->tcpStatus == CifsGood) ||
-			 (ses->server->tcpStatus == CifsNew))) {
-
-			unsigned long lrt;
+	int error;
 
-			/* We timed out. Is the server still
-			   sending replies ? */
-			spin_lock(&GlobalMid_Lock);
-			lrt = ses->server->lstrp;
-			spin_unlock(&GlobalMid_Lock);
+	error = wait_event_killable(server->response_q,
+				    midQ->midState != MID_REQUEST_SUBMITTED);
+	if (error < 0)
+		return -ERESTARTSYS;
 
-			/* Calculate time_to_wait past last receive time.
-			 Although we prefer not to time out if the
-			 server is still responding - we will time
-			 out if the server takes more than 15 (or 45
-			 or 180) seconds to respond to this request
-			 and has not responded to any request from
-			 other threads on the client within 10 seconds */
-			lrt += time_to_wait;
-			if (time_after(jiffies, lrt)) {
-				/* No replies for time_to_wait. */
-				cERROR(1, "server not responding");
-				return -1;
-			}
-		} else {
-			return 0;
-		}
-	}
+	return 0;
 }
 
 
@@ -433,7 +402,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	int rc = 0;
 	int long_op;
 	unsigned int receive_len;
-	unsigned long timeout;
 	struct mid_q_entry *midQ;
 	struct smb_hdr *in_buf = iov[0].iov_base;
 
@@ -500,33 +468,12 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	if (rc < 0)
 		goto out;
 
-	if (long_op == CIFS_STD_OP)
-		timeout = 15 * HZ;
-	else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
-		timeout = 180 * HZ;
-	else if (long_op == CIFS_LONG_OP)
-		timeout = 45 * HZ; /* should be greater than
-			servers oplock break timeout (about 43 seconds) */
-	else if (long_op == CIFS_ASYNC_OP)
-		goto out;
-	else if (long_op == CIFS_BLOCKING_OP)
-		timeout = 0x7FFFFFFF; /*  large, but not so large as to wrap */
-	else {
-		cERROR(1, "unknown timeout flag %d", long_op);
-		rc = -EIO;
+	if (long_op == CIFS_ASYNC_OP)
 		goto out;
-	}
-
-	/* wait for 15 seconds or until woken up due to response arriving or
-	   due to last connection to this server being unmounted */
-	if (signal_pending(current)) {
-		/* if signal pending do not hold up user for full smb timeout
-		but we still give response a chance to complete */
-		timeout = 2 * HZ;
-	}
 
-	/* No user interrupts in wait - wreaks havoc with performance */
-	wait_for_response(ses, midQ, timeout, 10 * HZ);
+	rc = wait_for_response(ses->server, midQ);
+	if (rc != 0)
+		goto out;
 
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
@@ -604,7 +551,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 {
 	int rc = 0;
 	unsigned int receive_len;
-	unsigned long timeout;
 	struct mid_q_entry *midQ;
 
 	if (ses == NULL) {
@@ -668,33 +614,12 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	if (rc < 0)
 		goto out;
 
-	if (long_op == CIFS_STD_OP)
-		timeout = 15 * HZ;
-	/* wait for 15 seconds or until woken up due to response arriving or
-	   due to last connection to this server being unmounted */
-	else if (long_op == CIFS_ASYNC_OP)
-		goto out;
-	else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
-		timeout = 180 * HZ;
-	else if (long_op == CIFS_LONG_OP)
-		timeout = 45 * HZ; /* should be greater than
-			servers oplock break timeout (about 43 seconds) */
-	else if (long_op == CIFS_BLOCKING_OP)
-		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
-	else {
-		cERROR(1, "unknown timeout flag %d", long_op);
-		rc = -EIO;
+	if (long_op == CIFS_ASYNC_OP)
 		goto out;
-	}
 
-	if (signal_pending(current)) {
-		/* if signal pending do not hold up user for full smb timeout
-		but we still give response a chance to complete */
-		timeout = 2 * HZ;
-	}
-
-	/* No user interrupts in wait - wreaks havoc with performance */
-	wait_for_response(ses, midQ, timeout, 10 * HZ);
+	rc = wait_for_response(ses->server, midQ);
+	if (rc != 0)
+		goto out;
 
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
@@ -915,8 +840,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 			}
 		}
 
-		/* Wait 5 seconds for the response. */
-		if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
+		if (wait_for_response(ses->server, midQ) == 0) {
 			/* We got the response - restart system call. */
 			rstart = 1;
 		}
-- 
cgit v1.2.2


From dad255b182363db1d1124458cd3fb0a4deac0d0f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:02 -0500
Subject: cifs: don't reconnect server when we don't get a response

We only want to force a reconnect to the server under very limited and
specific circumstances. Now that we have processes waiting indefinitely
for responses, we shouldn't reach this point unless a reconnect is
already in process. Thus, there's no reason to re-mark the server for
reconnect here.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by:  Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c41c9c4f0a79..f65cdec042e4 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -374,10 +374,8 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 	if (mid->midState == MID_REQUEST_SUBMITTED) {
 		if (server->tcpStatus == CifsExiting)
 			rc = -EHOSTDOWN;
-		else {
-			server->tcpStatus = CifsNeedReconnect;
+		else
 			mid->midState = MID_RETRY_NEEDED;
-		}
 	}
 
 	if (rc != -EHOSTDOWN) {
-- 
cgit v1.2.2


From 74dd92a881b62014ca3c754db6868e1f142f2fb9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:02 -0500
Subject: cifs: clean up sync_mid_result

Make it use a switch statement based on the value of the midStatus. If
the resp_buf is set, then MID_RESPONSE_RECEIVED is too.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f65cdec042e4..6abd1445c983 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -363,28 +363,29 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 {
 	int rc = 0;
 
-	spin_lock(&GlobalMid_Lock);
+	cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
+		mid->mid, mid->midState);
 
-	if (mid->resp_buf) {
+	spin_lock(&GlobalMid_Lock);
+	switch (mid->midState) {
+	case MID_RESPONSE_RECEIVED:
 		spin_unlock(&GlobalMid_Lock);
 		return rc;
-	}
-
-	cERROR(1, "No response to cmd %d mid %d", mid->command, mid->mid);
-	if (mid->midState == MID_REQUEST_SUBMITTED) {
-		if (server->tcpStatus == CifsExiting)
+	case MID_REQUEST_SUBMITTED:
+		/* socket is going down, reject all calls */
+		if (server->tcpStatus == CifsExiting) {
+			cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
+			       __func__, mid->mid, mid->command, mid->midState);
 			rc = -EHOSTDOWN;
-		else
-			mid->midState = MID_RETRY_NEEDED;
-	}
-
-	if (rc != -EHOSTDOWN) {
-		if (mid->midState == MID_RETRY_NEEDED) {
-			rc = -EAGAIN;
-			cFYI(1, "marking request for retry");
-		} else {
-			rc = -EIO;
+			break;
 		}
+	case MID_RETRY_NEEDED:
+		rc = -EAGAIN;
+		break;
+	default:
+		cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
+			mid->mid, mid->midState);
+		rc = -EIO;
 	}
 	spin_unlock(&GlobalMid_Lock);
 
-- 
cgit v1.2.2


From 2b84a36c5529da136d28b268e75268892d09869c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:21 -0500
Subject: cifs: allow for different handling of received response

In order to incorporate async requests, we need to allow for a more
general way to do things on receive, rather than just waking up a
process.

Turn the task pointer in the mid_q_entry into a callback function and a
generic data pointer. When a response comes in, or the socket is
reconnected, cifsd can call the callback function in order to wake up
the process.

The default is to just wake up the current process which should mean no
change in behavior for existing code.

Also, clean up the locking in cifs_reconnect. There doesn't seem to be
any need to hold both the srv_mutex and GlobalMid_Lock when walking the
list of mids.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |  8 ++++----
 fs/cifs/cifsglob.h   | 15 ++++++++++++++-
 fs/cifs/connect.c    | 53 +++++++++++++++++++++++++---------------------------
 fs/cifs/transport.c  | 19 +++++++++++++++++--
 4 files changed, 60 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index e2d0d5d455fa..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-		cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+		cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
 			mid_entry->midState,
 			(int)mid_entry->command,
 			mid_entry->pid,
-			mid_entry->tsk,
+			mid_entry->callback_data,
 			mid_entry->mid);
 #ifdef CONFIG_CIFS_STATS2
 		cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -218,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 				mid_entry = list_entry(tmp3, struct mid_q_entry,
 					qhead);
 				seq_printf(m, "\tState: %d com: %d pid:"
-						" %d tsk: %p mid %d\n",
+						" %d cbdata: %p mid %d\n",
 						mid_entry->midState,
 						(int)mid_entry->command,
 						mid_entry->pid,
-						mid_entry->tsk,
+						mid_entry->callback_data,
 						mid_entry->mid);
 			}
 			spin_unlock(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 606ca8bb7102..4de737575959 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -508,6 +508,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
 
 #endif
 
+struct mid_q_entry;
+
+/*
+ * This is the prototype for the mid callback function. When creating one,
+ * take special care to avoid deadlocks. Things to bear in mind:
+ *
+ * - it will be called by cifsd
+ * - the GlobalMid_Lock will be held
+ * - the mid will be removed from the pending_mid_q list
+ */
+typedef void (mid_callback_t)(struct mid_q_entry *mid);
+
 /* one of these for every pending CIFS request to the server */
 struct mid_q_entry {
 	struct list_head qhead;	/* mids waiting on reply from this server */
@@ -519,7 +531,8 @@ struct mid_q_entry {
 	unsigned long when_sent; /* time when smb send finished */
 	unsigned long when_received; /* when demux complete (taken off wire) */
 #endif
-	struct task_struct *tsk;	/* task waiting for response */
+	mid_callback_t *callback; /* call completion callback */
+	void *callback_data;	  /* general purpose pointer for callback */
 	struct smb_hdr *resp_buf;	/* response buffer */
 	int midState;	/* wish this were enum but can not pass to wait_event */
 	__u8 command;	/* smb command code */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5c7f8450dbe0..aa66de1db5f5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -152,6 +152,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
+	cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -163,7 +164,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		}
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
+
 	/* do not want to be sending data on a socket we are freeing */
+	cFYI(1, "%s: tearing down socket", __func__);
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
 		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -180,22 +183,19 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	kfree(server->session_key.response);
 	server->session_key.response = NULL;
 	server->session_key.len = 0;
+	mutex_unlock(&server->srv_mutex);
 
+	/* mark submitted MIDs for retry and issue callback */
+	cFYI(1, "%s: issuing mid callbacks", __func__);
 	spin_lock(&GlobalMid_Lock);
-	list_for_each(tmp, &server->pending_mid_q) {
-		mid_entry = list_entry(tmp, struct
-					mid_q_entry,
-					qhead);
-		if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-				/* Mark other intransit requests as needing
-				   retry so we do not immediately mark the
-				   session bad again (ie after we reconnect
-				   below) as they timeout too */
+	list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
+		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+		if (mid_entry->midState == MID_REQUEST_SUBMITTED)
 			mid_entry->midState = MID_RETRY_NEEDED;
-		}
+		list_del_init(&mid_entry->qhead);
+		mid_entry->callback(mid_entry);
 	}
 	spin_unlock(&GlobalMid_Lock);
-	mutex_unlock(&server->srv_mutex);
 
 	while ((server->tcpStatus != CifsExiting) &&
 	       (server->tcpStatus != CifsGood)) {
@@ -212,10 +212,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			if (server->tcpStatus != CifsExiting)
 				server->tcpStatus = CifsGood;
 			spin_unlock(&GlobalMid_Lock);
-	/*		atomic_set(&server->inFlight,0);*/
-			wake_up(&server->response_q);
 		}
 	}
+
 	return rc;
 }
 
@@ -345,7 +344,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	struct msghdr smb_msg;
 	struct kvec iov;
 	struct socket *csocket = server->ssocket;
-	struct list_head *tmp;
+	struct list_head *tmp, *tmp2;
 	struct task_struct *task_to_wake = NULL;
 	struct mid_q_entry *mid_entry;
 	char temp;
@@ -558,10 +557,9 @@ incomplete_rcv:
 			continue;
 		}
 
-
-		task_to_wake = NULL;
+		mid_entry = NULL;
 		spin_lock(&GlobalMid_Lock);
-		list_for_each(tmp, &server->pending_mid_q) {
+		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 
 			if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -602,8 +600,9 @@ incomplete_rcv:
 				mid_entry->resp_buf = smb_buffer;
 				mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
-				task_to_wake = mid_entry->tsk;
 				mid_entry->midState = MID_RESPONSE_RECEIVED;
+				list_del_init(&mid_entry->qhead);
+				mid_entry->callback(mid_entry);
 #ifdef CONFIG_CIFS_STATS2
 				mid_entry->when_received = jiffies;
 #endif
@@ -613,9 +612,11 @@ multi_t2_fnd:
 				server->lstrp = jiffies;
 				break;
 			}
+			mid_entry = NULL;
 		}
 		spin_unlock(&GlobalMid_Lock);
-		if (task_to_wake) {
+
+		if (mid_entry != NULL) {
 			/* Was previous buf put in mpx struct for multi-rsp? */
 			if (!isMultiRsp) {
 				/* smb buffer will be freed by user thread */
@@ -624,7 +625,6 @@ multi_t2_fnd:
 				else
 					smallbuf = NULL;
 			}
-			wake_up_process(task_to_wake);
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
 			cERROR(1, "No task to wake, unknown frame received! "
@@ -678,15 +678,12 @@ multi_t2_fnd:
 
 	if (!list_empty(&server->pending_mid_q)) {
 		spin_lock(&GlobalMid_Lock);
-		list_for_each(tmp, &server->pending_mid_q) {
+		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-				cFYI(1, "Clearing Mid 0x%x - waking up ",
-					mid_entry->mid);
-				task_to_wake = mid_entry->tsk;
-				if (task_to_wake)
-					wake_up_process(task_to_wake);
-			}
+			cFYI(1, "Clearing Mid 0x%x - issuing callback",
+					 mid_entry->mid);
+			list_del_init(&mid_entry->qhead);
+			mid_entry->callback(mid_entry);
 		}
 		spin_unlock(&GlobalMid_Lock);
 		/* 1/8th of sec is more than enough time for them to exit */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 6abd1445c983..d77b6154cf22 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,6 +36,12 @@
 
 extern mempool_t *cifs_mid_poolp;
 
+static void
+wake_up_task(struct mid_q_entry *mid)
+{
+	wake_up_process(mid->callback_data);
+}
+
 static struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 {
@@ -58,7 +64,13 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	/*	do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
 		/* when mid allocated can be before when sent */
 		temp->when_alloc = jiffies;
-		temp->tsk = current;
+
+		/*
+		 * The default is for the mid to be synchronous, so the
+		 * default callback just wakes up the current task.
+		 */
+		temp->callback = wake_up_task;
+		temp->callback_data = current;
 	}
 
 	atomic_inc(&midCount);
@@ -367,6 +379,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 		mid->mid, mid->midState);
 
 	spin_lock(&GlobalMid_Lock);
+	/* ensure that it's no longer on the pending_mid_q */
+	list_del_init(&mid->qhead);
+
 	switch (mid->midState) {
 	case MID_RESPONSE_RECEIVED:
 		spin_unlock(&GlobalMid_Lock);
@@ -389,7 +404,7 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 	}
 	spin_unlock(&GlobalMid_Lock);
 
-	delete_mid(mid);
+	DeleteMidQEntry(mid);
 	return rc;
 }
 
-- 
cgit v1.2.2


From a6827c184ea9f5452e4aaa7c799dd3c7cc9ba05e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:21 -0500
Subject: cifs: add cifs_call_async

Add a function that will send a request, and set up the mid for an
async reply.

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  5 +++++
 fs/cifs/transport.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 95d5dbbb4c7a..7f2988bb8929 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -61,6 +61,11 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
 		const char *fullpath, const struct dfs_info3_param *ref,
 		char **devname);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
+extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
+					struct TCP_Server_Info *server);
+extern int cifs_call_async(struct TCP_Server_Info *server,
+			   struct smb_hdr *in_buf, mid_callback_t *callback,
+			   void *cbdata);
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct smb_hdr * /* input */ ,
 			struct smb_hdr * /* out */ ,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index d77b6154cf22..166b65a3763e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -42,7 +42,7 @@ wake_up_task(struct mid_q_entry *mid)
 	wake_up_process(mid->callback_data);
 }
 
-static struct mid_q_entry *
+struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 {
 	struct mid_q_entry *temp;
@@ -344,6 +344,62 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 }
 
 
+/*
+ * Send a SMB request and set the callback function in the mid to handle
+ * the result. Caller is responsible for dealing with timeouts.
+ */
+int
+cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+		mid_callback_t *callback, void *cbdata)
+{
+	int rc;
+	struct mid_q_entry *mid;
+
+	rc = wait_for_free_request(server, CIFS_ASYNC_OP);
+	if (rc)
+		return rc;
+
+	mutex_lock(&server->srv_mutex);
+	mid = AllocMidQEntry(in_buf, server);
+	if (mid == NULL) {
+		mutex_unlock(&server->srv_mutex);
+		return -ENOMEM;
+	}
+
+	/* put it on the pending_mid_q */
+	spin_lock(&GlobalMid_Lock);
+	list_add_tail(&mid->qhead, &server->pending_mid_q);
+	spin_unlock(&GlobalMid_Lock);
+
+	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+	if (rc) {
+		mutex_unlock(&server->srv_mutex);
+		goto out_err;
+	}
+
+	mid->callback = callback;
+	mid->callback_data = cbdata;
+	mid->midState = MID_REQUEST_SUBMITTED;
+#ifdef CONFIG_CIFS_STATS2
+	atomic_inc(&server->inSend);
+#endif
+	rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+#ifdef CONFIG_CIFS_STATS2
+	atomic_dec(&server->inSend);
+	mid->when_sent = jiffies;
+#endif
+	mutex_unlock(&server->srv_mutex);
+	if (rc)
+		goto out_err;
+
+	return rc;
+out_err:
+	delete_mid(mid);
+	atomic_dec(&server->inFlight);
+	wake_up(&server->request_q);
+	return rc;
+}
+
 /*
  *
  * Send an SMB Request.  No response info (other than return code)
-- 
cgit v1.2.2


From 766fdbb57fdb1e53bc34c431103e95383d7f13ba Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:21 -0500
Subject: cifs: add ability to send an echo request

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifspdu.h   | 15 +++++++++++++++
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/cifssmb.c   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/transport.c |  2 +-
 4 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a8..ea205b4fcad2 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -50,6 +50,7 @@
 #define SMB_COM_SETATTR               0x09 /* trivial response */
 #define SMB_COM_LOCKING_ANDX          0x24 /* trivial response */
 #define SMB_COM_COPY                  0x29 /* trivial rsp, fail filename ignrd*/
+#define SMB_COM_ECHO                  0x2B /* echo request */
 #define SMB_COM_OPEN_ANDX             0x2D /* Legacy open for old servers */
 #define SMB_COM_READ_ANDX             0x2E
 #define SMB_COM_WRITE_ANDX            0x2F
@@ -760,6 +761,20 @@ typedef struct smb_com_tconx_rsp_ext {
  *
  */
 
+typedef struct smb_com_echo_req {
+	struct	smb_hdr hdr;
+	__le16	EchoCount;
+	__le16	ByteCount;
+	char	Data[1];
+} __attribute__((packed)) ECHO_REQ;
+
+typedef struct smb_com_echo_rsp {
+	struct	smb_hdr hdr;
+	__le16	SequenceNumber;
+	__le16	ByteCount;
+	char	Data[1];
+} __attribute__((packed)) ECHO_RSP;
+
 typedef struct smb_com_logoff_andx_req {
 	struct smb_hdr hdr;	/* wct = 2 */
 	__u8 AndXCommand;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7f2988bb8929..982895fa7615 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -63,6 +63,7 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
 					struct TCP_Server_Info *server);
+extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
 extern int cifs_call_async(struct TCP_Server_Info *server,
 			   struct smb_hdr *in_buf, mid_callback_t *callback,
 			   void *cbdata);
@@ -358,6 +359,7 @@ extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 			const __u64 len, struct file_lock *,
 			const __u16 lock_type, const bool waitFlag);
 extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBEcho(struct TCP_Server_Info *server);
 extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
 
 extern struct cifsSesInfo *sesInfoAlloc(void);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3652cc60314c..54b9f5d8d1db 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -706,6 +706,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	return rc;
 }
 
+/*
+ * This is a no-op for now. We're not really interested in the reply, but
+ * rather in the fact that the server sent one and that server->lstrp
+ * gets updated.
+ *
+ * FIXME: maybe we should consider checking that the reply matches request?
+ */
+static void
+cifs_echo_callback(struct mid_q_entry *mid)
+{
+	struct TCP_Server_Info *server = mid->callback_data;
+
+	DeleteMidQEntry(mid);
+	atomic_dec(&server->inFlight);
+	wake_up(&server->request_q);
+}
+
+int
+CIFSSMBEcho(struct TCP_Server_Info *server)
+{
+	ECHO_REQ *smb;
+	int rc = 0;
+
+	cFYI(1, "In echo request");
+
+	rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
+	if (rc)
+		return rc;
+
+	/* set up echo request */
+	smb->hdr.Tid = cpu_to_le16(0xffff);
+	smb->hdr.WordCount = cpu_to_le16(1);
+	smb->EchoCount = cpu_to_le16(1);
+	smb->ByteCount = cpu_to_le16(1);
+	smb->Data[0] = 'a';
+	smb->hdr.smb_buf_length += 3;
+
+	rc = cifs_call_async(server, (struct smb_hdr *)smb,
+				cifs_echo_callback, server);
+	if (rc)
+		cFYI(1, "Echo request failed: %d", rc);
+
+	cifs_small_buf_release(smb);
+
+	return rc;
+}
+
 int
 CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 {
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 166b65a3763e..a0cef4960516 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -78,7 +78,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	return temp;
 }
 
-static void
+void
 DeleteMidQEntry(struct mid_q_entry *midEntry)
 {
 #ifdef CONFIG_CIFS_STATS2
-- 
cgit v1.2.2


From c74093b694998d30105d9904686da5e3576497c4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:23 -0500
Subject: cifs: set up recurring workqueue job to do SMB echo requests

Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/connect.c  | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4de737575959..9c728dd5b146 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -218,6 +218,7 @@ struct TCP_Server_Info {
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
 	bool	sec_ntlmssp;		/* supports NTLMSSP */
 	bool session_estab; /* mark when very first sess is established */
+	struct delayed_work	echo; /* echo ping workqueue job */
 #ifdef CONFIG_CIFS_FSCACHE
 	struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index aa66de1db5f5..f38ca084c9d2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,6 +52,9 @@
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
 
+/* SMB echo "timeout" -- FIXME: tunable? */
+#define SMB_ECHO_INTERVAL (60 * HZ)
+
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
@@ -333,6 +336,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 
 }
 
+static void
+cifs_echo_request(struct work_struct *work)
+{
+	int rc;
+	struct TCP_Server_Info *server = container_of(work,
+					struct TCP_Server_Info, echo.work);
+
+	/* no need to ping if we got a response recently */
+	if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+		goto requeue_echo;
+
+	rc = CIFSSMBEcho(server);
+	if (rc)
+		cFYI(1, "Unable to send echo request to server: %s",
+			server->hostname);
+
+requeue_echo:
+	queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
+}
+
 static int
 cifs_demultiplex_thread(struct TCP_Server_Info *server)
 {
@@ -1571,6 +1594,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 	list_del_init(&server->tcp_ses_list);
 	spin_unlock(&cifs_tcp_ses_lock);
 
+	cancel_delayed_work_sync(&server->echo);
+
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
@@ -1662,6 +1687,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	tcp_ses->sequence_number = 0;
 	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
 	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
+	INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
 
 	/*
 	 * at this point we are the only ones with the pointer
@@ -1710,6 +1736,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	cifs_fscache_get_client_cookie(tcp_ses);
 
+	/* queue echo request delayed work */
+	queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+
 	return tcp_ses;
 
 out_err_crypto_release:
-- 
cgit v1.2.2


From fda3594362184383e73f0a2a5fa5b38ac0e04fd8 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 20 Jan 2011 18:06:34 +0000
Subject: [CIFS] cifs: reconnect unresponsive servers

If the server isn't responding to echoes, we don't want to leave tasks
hung waiting for it to reply. At that point, we'll want to reconnect
so that soft mounts can return an error to userspace quickly.

If the client hasn't received a reply after a specified number of echo
intervals, assume that the transport is down and attempt to reconnect
the socket.

The number of echo_intervals to wait before attempting to reconnect is
tunable via a module parameter. Setting it to 0, means that the client
will never attempt to reconnect. The default is 5.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/cifs/cifsfs.c   |  6 +++++-
 fs/cifs/cifsglob.h |  3 +++
 fs/cifs/connect.c  | 21 +++++++++++++++++----
 3 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d9f652a522a6..99d777a03dd0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, int, 0);
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
 				   "Default: 50 Range: 2 to 256");
-
+unsigned short echo_retries = 5;
+module_param(echo_retries, ushort, 0644);
+MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
+			       "reconnecting server. Default: 5. 0 means "
+			       "never reconnect.");
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9c728dd5b146..7040abc638fa 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -804,6 +804,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
 
+/* reconnect after this many failed echo attempts */
+GLOBAL_EXTERN unsigned short echo_retries;
+
 void cifs_oplock_break(struct work_struct *work);
 void cifs_oplock_break_get(struct cifsFileInfo *cfile);
 void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f38ca084c9d2..f5d7b59a3553 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -186,6 +186,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	kfree(server->session_key.response);
 	server->session_key.response = NULL;
 	server->session_key.len = 0;
+	server->lstrp = jiffies;
 	mutex_unlock(&server->srv_mutex);
 
 	/* mark submitted MIDs for retry and issue callback */
@@ -420,7 +421,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		smb_msg.msg_control = NULL;
 		smb_msg.msg_controllen = 0;
 		pdu_length = 4; /* enough to get RFC1001 header */
+
 incomplete_rcv:
+		if (echo_retries > 0 &&
+		    time_after(jiffies, server->lstrp +
+					(echo_retries * SMB_ECHO_INTERVAL))) {
+			cERROR(1, "Server %s has not responded in %d seconds. "
+				  "Reconnecting...", server->hostname,
+				  (echo_retries * SMB_ECHO_INTERVAL / HZ));
+			cifs_reconnect(server);
+			csocket = server->ssocket;
+			wake_up(&server->response_q);
+			continue;
+		}
+
 		length =
 		    kernel_recvmsg(csocket, &smb_msg,
 				&iov, 1, pdu_length, 0 /* BB other flags? */);
@@ -581,6 +595,8 @@ incomplete_rcv:
 		}
 
 		mid_entry = NULL;
+		server->lstrp = jiffies;
+
 		spin_lock(&GlobalMid_Lock);
 		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
@@ -629,10 +645,6 @@ multi_t2_fnd:
 #ifdef CONFIG_CIFS_STATS2
 				mid_entry->when_received = jiffies;
 #endif
-				/* so we do not time out requests to  server
-				which is still responding (since server could
-				be busy but not dead) */
-				server->lstrp = jiffies;
 				break;
 			}
 			mid_entry = NULL;
@@ -1685,6 +1697,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
 	tcp_ses->session_estab = false;
 	tcp_ses->sequence_number = 0;
+	tcp_ses->lstrp = jiffies;
 	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
 	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
 	INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
-- 
cgit v1.2.2


From 7749981ec31aa40e28a1ef5687e46bc1aa278fae Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:23 -0500
Subject: cifs: remove code for setting timeouts on requests

Since we don't time out individual requests anymore, remove the code
that we used to use for setting timeouts on different requests.

Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h  |  9 +++------
 fs/cifs/cifssmb.c   |  8 ++++----
 fs/cifs/connect.c   |  2 +-
 fs/cifs/file.c      | 44 +++++++-------------------------------------
 fs/cifs/sess.c      |  2 +-
 fs/cifs/transport.c |  2 +-
 6 files changed, 17 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7040abc638fa..571132c95231 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -636,12 +636,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   CIFS_IOVEC            4    /* array of response buffers */
 
 /* Type of Request to SendReceive2 */
-#define   CIFS_STD_OP	        0    /* normal request timeout */
-#define   CIFS_LONG_OP          1    /* long op (up to 45 sec, oplock time) */
-#define   CIFS_VLONG_OP         2    /* sloow op - can take up to 180 seconds */
-#define   CIFS_BLOCKING_OP      4    /* operation can block */
-#define   CIFS_ASYNC_OP         8    /* do not wait for response */
-#define   CIFS_TIMEOUT_MASK 0x00F    /* only one of 5 above set in req */
+#define   CIFS_BLOCKING_OP      1    /* operation can block */
+#define   CIFS_ASYNC_OP         2    /* do not wait for response */
+#define   CIFS_TIMEOUT_MASK 0x003    /* only one of above set in req */
 #define   CIFS_LOG_ERROR    0x010    /* log NT STATUS if non-zero */
 #define   CIFS_LARGE_BUF_OP 0x020    /* large request buffer */
 #define   CIFS_NO_RESP      0x040    /* no response buffer required */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 54b9f5d8d1db..37113450757b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1240,7 +1240,7 @@ OldOpenRetry:
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
+			(struct smb_hdr *)pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
 		cFYI(1, "Error in Open = %d", rc);
@@ -1353,7 +1353,7 @@ openRetry:
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
+			(struct smb_hdr *)pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
 		cFYI(1, "Error in Open = %d", rc);
@@ -1435,7 +1435,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	iov[0].iov_base = (char *)pSMB;
 	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
-			 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR);
+			 &resp_buf_type, CIFS_LOG_ERROR);
 	cifs_stats_inc(&tcon->num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
@@ -3136,7 +3136,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
 
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
-			 CIFS_STD_OP);
+			 0);
 	cifs_stats_inc(&tcon->num_acl_get);
 	if (rc) {
 		cFYI(1, "Send error in QuerySecDesc = %d", rc);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f5d7b59a3553..8d4657596301 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3022,7 +3022,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
-			 CIFS_STD_OP);
+			 0);
 
 	/* above now done in SendReceive */
 	if ((rc == 0) && (tcon != NULL)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cfa2e5ebcafe..bd2a028af833 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -839,29 +839,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	return rc;
 }
 
-/*
- * Set the timeout on write requests past EOF. For some servers (Windows)
- * these calls can be very long.
- *
- * If we're writing >10M past the EOF we give a 180s timeout. Anything less
- * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
- * The 10M cutoff is totally arbitrary. A better scheme for this would be
- * welcome if someone wants to suggest one.
- *
- * We may be able to do a better job with this if there were some way to
- * declare that a file should be sparse.
- */
-static int
-cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
-{
-	if (offset <= cifsi->server_eof)
-		return CIFS_STD_OP;
-	else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
-		return CIFS_VLONG_OP;
-	else
-		return CIFS_LONG_OP;
-}
-
 /* update the file size (if needed) after a write */
 static void
 cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
@@ -882,7 +859,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	unsigned int total_written;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid, long_op;
+	int xid;
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
@@ -903,7 +880,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	xid = GetXid();
 
-	long_op = cifs_write_timeout(cifsi, *poffset);
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
@@ -931,7 +907,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 				min_t(const int, cifs_sb->wsize,
 				      write_size - total_written),
 				*poffset, &bytes_written,
-				NULL, write_data + total_written, long_op);
+				NULL, write_data + total_written, 0);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -944,8 +920,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			cifs_update_eof(cifsi, *poffset, bytes_written);
 			*poffset += bytes_written;
 		}
-		long_op = CIFS_STD_OP; /* subsequent writes fast -
-				    15 seconds is plenty */
 	}
 
 	cifs_stats_bytes_written(pTcon, total_written);
@@ -974,7 +948,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 	unsigned int total_written;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid, long_op;
+	int xid;
 	struct dentry *dentry = open_file->dentry;
 	struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
 
@@ -987,7 +961,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 
 	xid = GetXid();
 
-	long_op = cifs_write_timeout(cifsi, *poffset);
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
@@ -1017,7 +990,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 				rc = CIFSSMBWrite2(xid, pTcon,
 						open_file->netfid, len,
 						*poffset, &bytes_written,
-						iov, 1, long_op);
+						iov, 1, 0);
 			} else
 				rc = CIFSSMBWrite(xid, pTcon,
 					 open_file->netfid,
@@ -1025,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 					       write_size - total_written),
 					 *poffset, &bytes_written,
 					 write_data + total_written,
-					 NULL, long_op);
+					 NULL, 0);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -1038,8 +1011,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 			cifs_update_eof(cifsi, *poffset, bytes_written);
 			*poffset += bytes_written;
 		}
-		long_op = CIFS_STD_OP; /* subsequent writes fast -
-				    15 seconds is plenty */
 	}
 
 	cifs_stats_bytes_written(pTcon, total_written);
@@ -1239,7 +1210,7 @@ static int cifs_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int rc = 0;
 	int scanned = 0;
-	int xid, long_op;
+	int xid;
 
 	cifs_sb = CIFS_SB(mapping->host->i_sb);
 
@@ -1384,11 +1355,10 @@ retry_write:
 				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
-				long_op = cifs_write_timeout(cifsi, offset);
 				rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
-						   long_op);
+						   0);
 				cifsFileInfo_put(open_file);
 			}
 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index eb746486e49e..1cffd82c4f13 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -879,7 +879,7 @@ ssetup_ntlmssp_authenticate:
 	BCC_LE(smb_buf) = cpu_to_le16(count);
 
 	rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
-			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
+			  CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index a0cef4960516..fe92c4cb75f5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -798,7 +798,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
 	pSMB->hdr.Mid = GetNextMid(ses->server);
 
 	return SendReceive(xid, ses, in_buf, out_buf,
-			&bytes_returned, CIFS_STD_OP);
+			&bytes_returned, 0);
 }
 
 int
-- 
cgit v1.2.2


From 76dcc26f1d7f1c98c3f595379dcd9562f01bf38d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Jan 2011 07:24:24 -0500
Subject: cifs: mangle existing header for SMB_COM_NT_CANCEL

The NT_CANCEL command looks just like the original command, except for a
few small differences. The send_nt_cancel function however currently takes
a tcon, which we don't have in SendReceive and SendReceive2.

Instead of "respinning" the entire header for an NT_CANCEL, just mangle
the existing header by replacing just the fields we need. This means we
don't need a tcon and allows us to call it from other places.

Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 63 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index fe92c4cb75f5..c8e2808cd5e6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -464,6 +464,43 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 	return rc;
 }
 
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+		struct mid_q_entry *mid)
+{
+	int rc = 0;
+
+	/* -4 for RFC1001 length and +2 for BCC field */
+	in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4  + 2;
+	in_buf->Command = SMB_COM_NT_CANCEL;
+	in_buf->WordCount = 0;
+	BCC_LE(in_buf) = 0;
+
+	mutex_lock(&server->srv_mutex);
+	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+	if (rc) {
+		mutex_unlock(&server->srv_mutex);
+		return rc;
+	}
+	rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+	mutex_unlock(&server->srv_mutex);
+
+	cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
+		in_buf->Mid, rc);
+
+	return rc;
+}
+
 int
 SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	     struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -753,29 +790,6 @@ out:
 	return rc;
 }
 
-/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
-
-static int
-send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
-		struct mid_q_entry *midQ)
-{
-	int rc = 0;
-	struct cifsSesInfo *ses = tcon->ses;
-	__u16 mid = in_buf->Mid;
-
-	header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
-	in_buf->Mid = mid;
-	mutex_lock(&ses->server->srv_mutex);
-	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
-	if (rc) {
-		mutex_unlock(&ses->server->srv_mutex);
-		return rc;
-	}
-	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-	mutex_unlock(&ses->server->srv_mutex);
-	return rc;
-}
-
 /* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
    blocking lock to return. */
 
@@ -890,8 +904,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		if (in_buf->Command == SMB_COM_TRANSACTION2) {
 			/* POSIX lock. We send a NT_CANCEL SMB to cause the
 			   blocking lock to return. */
-
-			rc = send_nt_cancel(tcon, in_buf, midQ);
+			rc = send_nt_cancel(ses->server, in_buf, midQ);
 			if (rc) {
 				delete_mid(midQ);
 				return rc;
-- 
cgit v1.2.2


From 4f8ba8a0c095933dd54a2c281750c8a85b329b26 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Sun, 21 Nov 2010 22:36:12 +0300
Subject: CIFS: Make cifsFileInfo_put work with strict cache mode

On strict cache mode when we close the last file handle of the inode we
should set invalid_mapping flag on this inode to prevent data coherency
problem when we open it again but it has been modified on the server.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h | 1 +
 fs/cifs/file.c       | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7852cd677051..ac51cd2d33ae 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -40,6 +40,7 @@
 #define CIFS_MOUNT_FSCACHE	0x8000 /* local caching enabled */
 #define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
 #define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
+#define CIFS_MOUNT_STRICT_IO	0x40000 /* strict cache mode */
 
 struct cifs_sb_info {
 	struct rb_root tlink_tree;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bd2a028af833..1b26c2717599 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -287,6 +287,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	struct inode *inode = cifs_file->dentry->d_inode;
 	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsLockInfo *li, *tmp;
 
 	spin_lock(&cifs_file_list_lock);
@@ -302,6 +303,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	if (list_empty(&cifsi->openFileList)) {
 		cFYI(1, "closing last open instance for inode %p",
 			cifs_file->dentry->d_inode);
+
+		/* in strict cache mode we need invalidate mapping on the last
+		   close  because it may cause a error when we open this file
+		   again and get at least level II oplock */
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+			CIFS_I(inode)->invalid_mapping = true;
+
 		cifs_set_oplock_level(cifsi, 0);
 	}
 	spin_unlock(&cifs_file_list_lock);
-- 
cgit v1.2.2


From 8be7e6ba142423e6ad98fed293c96f196f685229 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Sun, 12 Dec 2010 13:11:13 +0300
Subject: CIFS: Implement cifs_strict_fsync

Invalidate inode mapping if we don't have at least Level II oplock in
cifs_strict_fsync. Also remove filemap_write_and_wait call from cifs_fsync
because it is previously called from vfs_fsync_range. Add file operations'
structures for strict cache mode.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 38 ++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifsfs.h |  8 ++++++--
 fs/cifs/file.c   | 36 ++++++++++++++++++++++++++++--------
 fs/cifs/inode.c  |  8 ++++++--
 4 files changed, 78 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 99d777a03dd0..f6093e401c5a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -733,6 +733,25 @@ const struct file_operations cifs_file_ops = {
 	.setlease = cifs_setlease,
 };
 
+const struct file_operations cifs_file_strict_ops = {
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = generic_file_aio_read,
+	.aio_write = cifs_file_aio_write,
+	.open = cifs_open,
+	.release = cifs_close,
+	.lock = cifs_lock,
+	.fsync = cifs_strict_fsync,
+	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
+	.splice_read = generic_file_splice_read,
+	.llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl	= cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.setlease = cifs_setlease,
+};
+
 const struct file_operations cifs_file_direct_ops = {
 	/* no aio, no readv -
 	   BB reevaluate whether they can be done with directio, no cache */
@@ -751,6 +770,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 };
+
 const struct file_operations cifs_file_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
@@ -769,6 +789,24 @@ const struct file_operations cifs_file_nobrl_ops = {
 	.setlease = cifs_setlease,
 };
 
+const struct file_operations cifs_file_strict_nobrl_ops = {
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = generic_file_aio_read,
+	.aio_write = cifs_file_aio_write,
+	.open = cifs_open,
+	.release = cifs_close,
+	.fsync = cifs_strict_fsync,
+	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
+	.splice_read = generic_file_splice_read,
+	.llseek = cifs_llseek,
+#ifdef CONFIG_CIFS_POSIX
+	.unlocked_ioctl	= cifs_ioctl,
+#endif /* CONFIG_CIFS_POSIX */
+	.setlease = cifs_setlease,
+};
+
 const struct file_operations cifs_file_direct_nobrl_ops = {
 	/* no mmap, no aio, no readv -
 	   BB reevaluate whether they can be done with directio, no cache */
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4739a531cded..10c4303c282d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
 extern int cifs_revalidate_dentry(struct dentry *);
+extern void cifs_invalidate_mapping(struct inode *inode);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
@@ -72,8 +73,10 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
 /* Functions related to files and directories */
 extern const struct file_operations cifs_file_ops;
 extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
-extern const struct file_operations cifs_file_nobrl_ops;
-extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */
+extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
+extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
+extern const struct file_operations cifs_file_direct_nobrl_ops;
+extern const struct file_operations cifs_file_strict_nobrl_ops;
 extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
@@ -83,6 +86,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			 size_t write_size, loff_t *poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, int);
+extern int cifs_strict_fsync(struct file *, int);
 extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1b26c2717599..5790fab7349b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1528,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, int datasync)
 {
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	xid = GetXid();
 
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
 		file->f_path.dentry->d_name.name, datasync);
 
-	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc == 0) {
-		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
 
-		tcon = tlink_tcon(smbfile->tlink);
-		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
-	}
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+
+	FreeXid(xid);
+	return rc;
+}
+
+int cifs_fsync(struct file *file, int datasync)
+{
+	int xid;
+	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+
+	xid = GetXid();
+
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		file->f_path.dentry->d_name.name, datasync);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
 	return rc;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6c9ee8014ff0..8852470b4fbb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode)
 				inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				inode->i_fop = &cifs_file_strict_nobrl_ops;
+			else
+				inode->i_fop = &cifs_file_strict_ops;
 		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else { /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 		}
 
-
 		/* check if server can support readpages */
 		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
@@ -1679,7 +1683,7 @@ cifs_inode_needs_reval(struct inode *inode)
 /*
  * Zap the cache. Called when invalid_mapping flag is set.
  */
-static void
+void
 cifs_invalidate_mapping(struct inode *inode)
 {
 	int rc;
-- 
cgit v1.2.2


From 7a6a19b17ab9103ec708c18befd28f2a3908d4c1 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Tue, 14 Dec 2010 11:29:51 +0300
Subject: CIFS: Implement cifs_file_strict_mmap (try #2)

Invalidate inode mapping if we don't have at least Level II oplock.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c |  4 ++--
 fs/cifs/cifsfs.h |  1 +
 fs/cifs/file.c   | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f6093e401c5a..e24d966fb214 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -743,7 +743,7 @@ const struct file_operations cifs_file_strict_ops = {
 	.lock = cifs_lock,
 	.fsync = cifs_strict_fsync,
 	.flush = cifs_flush,
-	.mmap = cifs_file_mmap,
+	.mmap = cifs_file_strict_mmap,
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_POSIX
@@ -798,7 +798,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
 	.release = cifs_close,
 	.fsync = cifs_strict_fsync,
 	.flush = cifs_flush,
-	.mmap = cifs_file_mmap,
+	.mmap = cifs_file_strict_mmap,
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_POSIX
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 10c4303c282d..710072e36912 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -89,6 +89,7 @@ extern int cifs_fsync(struct file *, int);
 extern int cifs_strict_fsync(struct file *, int);
 extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
+extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5790fab7349b..0b32377ef8b7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1769,6 +1769,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	return total_read;
 }
 
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	xid = GetXid();
+
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	rc = generic_file_mmap(file, vma);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int rc, xid;
-- 
cgit v1.2.2


From a70307eeeb25b89f6b2baf3cf3f0cef83c96ba12 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Tue, 14 Dec 2010 11:50:41 +0300
Subject: CIFS: Implement cifs_strict_readv (try #4)

Read from the cache if we have at least Level II oplock - otherwise
read from the server. Add cifs_user_readv to let the client read into
iovec buffers.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c |   4 +-
 fs/cifs/cifsfs.h |   4 +-
 fs/cifs/file.c   | 116 +++++++++++++++++++++++++++++++++++++------------------
 3 files changed, 84 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e24d966fb214..a8323f1dc1c4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -736,7 +736,7 @@ const struct file_operations cifs_file_ops = {
 const struct file_operations cifs_file_strict_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
-	.aio_read = generic_file_aio_read,
+	.aio_read = cifs_strict_readv,
 	.aio_write = cifs_file_aio_write,
 	.open = cifs_open,
 	.release = cifs_close,
@@ -792,7 +792,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 const struct file_operations cifs_file_strict_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
-	.aio_read = generic_file_aio_read,
+	.aio_read = cifs_strict_readv,
 	.aio_write = cifs_file_aio_write,
 	.open = cifs_open,
 	.release = cifs_close,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 710072e36912..f23206d46531 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -81,7 +81,9 @@ extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
 extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
-			 size_t read_size, loff_t *poffset);
+			      size_t read_size, loff_t *poffset);
+extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+				 unsigned long nr_segs, loff_t pos);
 extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			 size_t write_size, loff_t *poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0b32377ef8b7..d7d65a70678e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1619,42 +1619,42 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	return rc;
 }
 
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
-	size_t read_size, loff_t *poffset)
+static ssize_t
+cifs_iovec_read(struct file *file, const struct iovec *iov,
+		 unsigned long nr_segs, loff_t *poffset)
 {
-	int rc = -EACCES;
-	unsigned int bytes_read = 0;
-	unsigned int total_read = 0;
-	unsigned int current_read_size;
+	int rc;
+	int xid;
+	unsigned int total_read, bytes_read = 0;
+	size_t len, cur_len;
+	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid;
 	struct cifsFileInfo *open_file;
-	char *smb_read_data;
-	char __user *current_offset;
 	struct smb_com_read_rsp *pSMBr;
+	char *read_data;
+
+	if (!nr_segs)
+		return 0;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
-	if (file->private_data == NULL) {
-		rc = -EBADF;
-		FreeXid(xid);
-		return rc;
-	}
 	open_file = file->private_data;
 	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
 
-	for (total_read = 0, current_offset = read_data;
-	     read_size > total_read;
-	     total_read += bytes_read, current_offset += bytes_read) {
-		current_read_size = min_t(const int, read_size - total_read,
-					  cifs_sb->rsize);
+	for (total_read = 0; total_read < len; total_read += bytes_read) {
+		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
 		rc = -EAGAIN;
-		smb_read_data = NULL;
+		read_data = NULL;
+
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
 			if (open_file->invalidHandle) {
@@ -1662,27 +1662,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 				if (rc != 0)
 					break;
 			}
-			rc = CIFSSMBRead(xid, pTcon,
-					 open_file->netfid,
-					 current_read_size, *poffset,
-					 &bytes_read, &smb_read_data,
-					 &buf_type);
-			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-			if (smb_read_data) {
-				if (copy_to_user(current_offset,
-						smb_read_data +
-						4 /* RFC1001 length field */ +
-						le16_to_cpu(pSMBr->DataOffset),
-						bytes_read))
+			rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
+					 cur_len, *poffset, &bytes_read,
+					 &read_data, &buf_type);
+			pSMBr = (struct smb_com_read_rsp *)read_data;
+			if (read_data) {
+				char *data_offset = read_data + 4 +
+						le16_to_cpu(pSMBr->DataOffset);
+				if (memcpy_toiovecend(iov, data_offset,
+						      iov_offset, bytes_read))
 					rc = -EFAULT;
-
 				if (buf_type == CIFS_SMALL_BUFFER)
-					cifs_small_buf_release(smb_read_data);
+					cifs_small_buf_release(read_data);
 				else if (buf_type == CIFS_LARGE_BUFFER)
-					cifs_buf_release(smb_read_data);
-				smb_read_data = NULL;
+					cifs_buf_release(read_data);
+				read_data = NULL;
+				iov_offset += bytes_read;
 			}
 		}
+
 		if (rc || (bytes_read == 0)) {
 			if (total_read) {
 				break;
@@ -1695,13 +1693,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			*poffset += bytes_read;
 		}
 	}
+
 	FreeXid(xid);
 	return total_read;
 }
 
+ssize_t cifs_user_read(struct file *file, char __user *read_data,
+		       size_t read_size, loff_t *poffset)
+{
+	struct iovec iov;
+	iov.iov_base = read_data;
+	iov.iov_len = read_size;
+
+	return cifs_iovec_read(file, &iov, 1, poffset);
+}
+
+static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	ssize_t read;
+
+	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
+	if (read > 0)
+		iocb->ki_pos = pos;
+
+	return read;
+}
+
+ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheRead)
+		return generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to read from the server all the time
+	 * if we don't have level II oplock because the server can delay mtime
+	 * change - so we can't make a decision about inode invalidating.
+	 * And we can also fail with pagereading if there are mandatory locks
+	 * on pages affected by this read but not on the region from pos to
+	 * pos+len-1.
+	 */
+
+	return cifs_user_readv(iocb, iov, nr_segs, pos);
+}
 
 static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
-	loff_t *poffset)
+			 loff_t *poffset)
 {
 	int rc = -EACCES;
 	unsigned int bytes_read = 0;
-- 
cgit v1.2.2


From c3dccf48174e50668b7c63544ac8c60c07a45978 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:50 -0500
Subject: cifs: TCP_Server_Info diet

Remove fields that are completely unused, and rearrange struct
according to recommendations by "pahole".

Before:

	/* size: 1112, cachelines: 18, members: 49 */
	/* sum members: 1086, holes: 8, sum holes: 26 */
	/* bit holes: 1, sum bit holes: 7 bits */
	/* last cacheline: 24 bytes */

After:

	/* size: 1072, cachelines: 17, members: 42 */
	/* sum members: 1065, holes: 3, sum holes: 7 */
	/* last cacheline: 48 bytes */

...savings of 40 bytes per struct on x86_64. 21 bytes by field removal,
and 19 by reorganizing to eliminate holes.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 26 +++++++++-----------------
 fs/cifs/cifssmb.c  |  2 --
 2 files changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 571132c95231..36f097e1ee74 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -161,6 +161,7 @@ struct TCP_Server_Info {
 	int srv_count; /* reference counter */
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+	enum statusEnum tcpStatus; /* what we think the status is */
 	char *hostname; /* hostname portion of UNC string */
 	struct socket *ssocket;
 	struct sockaddr_storage dstaddr;
@@ -168,25 +169,16 @@ struct TCP_Server_Info {
 	wait_queue_head_t response_q;
 	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
 	struct list_head pending_mid_q;
-	void *Server_NlsInfo;	/* BB - placeholder for future NLS info  */
-	unsigned short server_codepage;	/* codepage for the server    */
-	enum protocolEnum protocolType;
-	char versionMajor;
-	char versionMinor;
-	bool svlocal:1;			/* local server or remote */
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
 	bool tcp_nodelay;
 	atomic_t inFlight;  /* number of requests on the wire to server */
-#ifdef CONFIG_CIFS_STATS2
-	atomic_t inSend; /* requests trying to send */
-	atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
-#endif
-	enum statusEnum tcpStatus; /* what we think the status is */
 	struct mutex srv_mutex;
 	struct task_struct *tsk;
 	char server_GUID[16];
 	char secMode;
+	bool session_estab; /* mark when very first sess is established */
+	u16 dialect; /* dialect index that server chose */
 	enum securityEnum secType;
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using  */
@@ -199,8 +191,6 @@ struct TCP_Server_Info {
 	unsigned int max_vcs;	/* maximum number of smb sessions, at least
 				   those that can be specified uniquely with
 				   vcnumbers */
-	char sessid[4];		/* unique token id for this session */
-	/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
@@ -210,18 +200,20 @@ struct TCP_Server_Info {
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
-	u16 dialect; /* dialect index that server chose */
 	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
 	/* extended security flavors that server supports */
+	bool	sec_ntlmssp;		/* supports NTLMSSP */
+	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
 	bool	sec_kerberos;		/* supports plain Kerberos */
 	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
-	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
-	bool	sec_ntlmssp;		/* supports NTLMSSP */
-	bool session_estab; /* mark when very first sess is established */
 	struct delayed_work	echo; /* echo ping workqueue job */
 #ifdef CONFIG_CIFS_FSCACHE
 	struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
+#ifdef CONFIG_CIFS_STATS2
+	atomic_t inSend; /* requests trying to send */
+	atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
+#endif
 };
 
 /*
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 37113450757b..5b1f6637f161 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -452,7 +452,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
 				(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
 		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
-		GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
 		/* even though we do not use raw we might as well set this
 		accurately, in case we ever find a need for it */
 		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -566,7 +565,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
 	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
-	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
-- 
cgit v1.2.2


From aae62fdb6b9a6605abdea7370c4a0e005e6c1cd7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:50 -0500
Subject: cifs: move time field in cifsInodeInfo

...and remove length qualifiers from bools.

Before:

	/* size: 1176, cachelines: 19, members: 13 */
	/* sum members: 1165, holes: 2, sum holes: 11 */
	/* bit holes: 1, sum bit holes: 4 bits */
	/* last cacheline: 24 bytes */

After:

	/* size: 1168, cachelines: 19, members: 13 */
	/* last cacheline: 16 bytes */

...savings of 8 bytes per inode.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 36f097e1ee74..5bfb75346cb0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -439,11 +439,11 @@ struct cifsInodeInfo {
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
-	unsigned long time;	/* jiffies of last update/check of inode */
-	bool clientCanCacheRead:1;	/* read oplock */
-	bool clientCanCacheAll:1;	/* read and writebehind oplock */
-	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
-	bool invalid_mapping:1;		/* pagecache is invalid */
+	bool clientCanCacheRead;	/* read oplock */
+	bool clientCanCacheAll;		/* read and writebehind oplock */
+	bool delete_pending;		/* DELETE_ON_CLOSE is set */
+	bool invalid_mapping;		/* pagecache is invalid */
+	unsigned long time;		/* jiffies of last update of inode */
 	u64  server_eof;		/* current file size on server */
 	u64  uniqueid;			/* server inode number */
 	u64  createtime;		/* creation time on server */
-- 
cgit v1.2.2


From 690c522fa5a62825af880775e3ef1e55755667b2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:51 -0500
Subject: cifs: use get/put_unaligned functions to access ByteCount

It's possible that when we access the ByteCount that the alignment
will be off. Most CPUs deal with that transparently, but there's
usually some performance impact. Some CPUs raise an exception on
unaligned accesses.

Fix this by accessing the byte count using the get_unaligned and
put_unaligned inlined functions. While we're at it, fix the types
of some of the variables that end up getting returns from these
functions.

Acked-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifspdu.h   | 47 +++++++++++++++++++++++++++++++++++++++++++----
 fs/cifs/cifssmb.c   | 14 +++++---------
 fs/cifs/connect.c   | 10 +++++-----
 fs/cifs/netmisc.c   |  4 ++--
 fs/cifs/sess.c      | 13 ++++++-------
 fs/cifs/transport.c |  9 ++++-----
 6 files changed, 65 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index ea205b4fcad2..b5c8cc5d7a7f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
 #define _CIFSPDU_H
 
 #include <net/sock.h>
+#include <asm/unaligned.h>
 #include "smbfsctl.h"
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -426,11 +427,49 @@ struct smb_hdr {
 	__u16 Mid;
 	__u8 WordCount;
 } __attribute__((packed));
-/* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
-#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+
+/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
+#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
+			 (2 * (smb_var)->WordCount))
+
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
-#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
+#define pByteArea(smb_var) (BCC(smb_var) + 2)
+
+/* get the converted ByteCount for a SMB packet and return it */
+static inline __u16
+get_bcc(struct smb_hdr *hdr)
+{
+	__u16 *bc_ptr = (__u16 *)BCC(hdr);
+
+	return get_unaligned(bc_ptr);
+}
+
+/* get the unconverted ByteCount for a SMB packet and return it */
+static inline __u16
+get_bcc_le(struct smb_hdr *hdr)
+{
+	__le16 *bc_ptr = (__le16 *)BCC(hdr);
+
+	return get_unaligned_le16(bc_ptr);
+}
+
+/* set the ByteCount for a SMB packet in host-byte order */
+static inline void
+put_bcc(__u16 count, struct smb_hdr *hdr)
+{
+	__u16 *bc_ptr = (__u16 *)BCC(hdr);
+
+	put_unaligned(count, bc_ptr);
+}
+
+/* set the ByteCount for a SMB packet in little-endian */
+static inline void
+put_bcc_le(__u16 count, struct smb_hdr *hdr)
+{
+	__le16 *bc_ptr = (__le16 *)BCC(hdr);
+
+	put_unaligned_le16(count, bc_ptr);
+}
 
 /*
  * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5b1f6637f161..39cec0d9cd1b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -333,7 +333,6 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
 {
 	int rc = -EINVAL;
 	int total_size;
-	char *pBCC;
 
 	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
@@ -346,13 +345,9 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
 			if (total_size < 512) {
 				total_size +=
 					le16_to_cpu(pSMB->t2_rsp.DataCount);
-				/* BCC le converted in SendReceive */
-				pBCC = (pSMB->hdr.WordCount * 2) +
-					sizeof(struct smb_hdr) +
-					(char *)pSMB;
-				if ((total_size <= (*(u16 *)pBCC)) &&
-				   (total_size <
-					CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) {
+				if (total_size <= get_bcc(&pSMB->hdr) &&
+				    total_size <
+					CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
 					return 0;
 				}
 			}
@@ -362,6 +357,7 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
 		sizeof(struct smb_t2_rsp) + 16);
 	return rc;
 }
+
 int
 CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 {
@@ -5609,7 +5605,7 @@ QAllEAsRetry:
 	}
 
 	/* make sure list_len doesn't go past end of SMB */
-	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
+	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
 	if ((char *)ea_response_data + list_len > end_of_smb) {
 		cFYI(1, "EA list appears to go beyond SMB");
 		rc = -EIO;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8d4657596301..ca20e813275d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -318,9 +318,9 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
 	total_in_buf += total_in_buf2;
 	pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf);
-	byte_count = le16_to_cpu(BCC_LE(pTargetSMB));
+	byte_count = get_bcc_le(pTargetSMB);
 	byte_count += total_in_buf2;
-	BCC_LE(pTargetSMB) = cpu_to_le16(byte_count);
+	put_bcc_le(byte_count, pTargetSMB);
 
 	byte_count = pTargetSMB->smb_buf_length;
 	byte_count += total_in_buf2;
@@ -2937,8 +2937,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	TCONX_RSP *pSMBr;
 	unsigned char *bcc_ptr;
 	int rc = 0;
-	int length, bytes_left;
-	__u16 count;
+	int length;
+	__u16 bytes_left, count;
 
 	if (ses == NULL)
 		return -EIO;
@@ -3032,7 +3032,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		tcon->need_reconnect = false;
 		tcon->tid = smb_buffer_response->Tid;
 		bcc_ptr = pByteArea(smb_buffer_response);
-		bytes_left = BCC(smb_buffer_response);
+		bytes_left = get_bcc(smb_buffer_response);
 		length = strnlen(bcc_ptr, bytes_left - 2);
 		if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
 			is_unicode = true;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6783ce6cdc89..8d9189f64477 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -916,14 +916,14 @@ unsigned int
 smbCalcSize(struct smb_hdr *ptr)
 {
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + BCC(ptr));
+		2 /* size of the bcc field */ + get_bcc(ptr));
 }
 
 unsigned int
 smbCalcSize_LE(struct smb_hdr *ptr)
 {
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
+		2 /* size of the bcc field */ + get_bcc_le(ptr));
 }
 
 /* The following are taken from fs/ntfs/util.c */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1cffd82c4f13..1adc9625a344 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 }
 
 static void
-decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
+decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
 		      const struct nls_table *nls_cp)
 {
 	int len;
@@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	return;
 }
 
-static int decode_ascii_ssetup(char **pbcc_area, int bleft,
+static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 			       struct cifsSesInfo *ses,
 			       const struct nls_table *nls_cp)
 {
@@ -575,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	char *str_area;
 	SESSION_SETUP_ANDX *pSMB;
 	__u32 capabilities;
-	int count;
+	__u16 count;
 	int resp_buf_type;
 	struct kvec iov[3];
 	enum securityEnum type;
-	__u16 action;
-	int bytes_remaining;
+	__u16 action, bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	u16 blob_len;
@@ -876,7 +875,7 @@ ssetup_ntlmssp_authenticate:
 	count = iov[1].iov_len + iov[2].iov_len;
 	smb_buf->smb_buf_length += count;
 
-	BCC_LE(smb_buf) = cpu_to_le16(count);
+	put_bcc_le(count, smb_buf);
 
 	rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
 			  CIFS_LOG_ERROR);
@@ -910,7 +909,7 @@ ssetup_ntlmssp_authenticate:
 	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
-	bytes_remaining = BCC(smb_buf);
+	bytes_remaining = get_bcc(smb_buf);
 	bcc_ptr = pByteArea(smb_buf);
 
 	if (smb_buf->WordCount == 4) {
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c8e2808cd5e6..c1ccca1a933f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -484,7 +484,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 	in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4  + 2;
 	in_buf->Command = SMB_COM_NT_CANCEL;
 	in_buf->WordCount = 0;
-	BCC_LE(in_buf) = 0;
+	put_bcc_le(0, in_buf);
 
 	mutex_lock(&server->srv_mutex);
 	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
@@ -632,8 +632,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if (receive_len >= sizeof(struct smb_hdr) - 4
 		    /* do not count RFC1001 header */  +
 		    (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
-			BCC(midQ->resp_buf) =
-				le16_to_cpu(BCC_LE(midQ->resp_buf));
+			put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
 		if ((flags & CIFS_NO_RESP) == 0)
 			midQ->resp_buf = NULL;  /* mark it so buf will
 						   not be freed by
@@ -776,7 +775,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		if (receive_len >= sizeof(struct smb_hdr) - 4
 		    /* do not count RFC1001 header */  +
 		    (2 * out_buf->WordCount) + 2 /* bcc */ )
-			BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
+			put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
 	} else {
 		rc = -EIO;
 		cERROR(1, "Bad MID state?");
@@ -977,7 +976,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	if (receive_len >= sizeof(struct smb_hdr) - 4
 	    /* do not count RFC1001 header */  +
 	    (2 * out_buf->WordCount) + 2 /* bcc */ )
-		BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
+		put_bcc(get_bcc_le(out_buf), out_buf);
 
 out:
 	delete_mid(midQ);
-- 
cgit v1.2.2


From 12df83c9b901cfe8ca7a66fbe0effc6d873cbbb9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:51 -0500
Subject: cifs: clean up unaligned accesses in validate_t2

...and clean up function to reduce indentation.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 39cec0d9cd1b..675041a6949c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -331,31 +331,33 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
 
 static int validate_t2(struct smb_t2_rsp *pSMB)
 {
-	int rc = -EINVAL;
-	int total_size;
+	unsigned int total_size;
+
+	/* check for plausible wct */
+	if (pSMB->hdr.WordCount < 10)
+		goto vt2_err;
 
-	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
-	if (pSMB->hdr.WordCount >= 10) {
-		if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) &&
-		   (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) {
-			/* check that bcc is at least as big as parms + data */
-			/* check that bcc is less than negotiated smb buffer */
-			total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount);
-			if (total_size < 512) {
-				total_size +=
-					le16_to_cpu(pSMB->t2_rsp.DataCount);
-				if (total_size <= get_bcc(&pSMB->hdr) &&
-				    total_size <
-					CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-					return 0;
-				}
-			}
-		}
-	}
+	if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
+	    get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
+		goto vt2_err;
+
+	/* check that bcc is at least as big as parms + data */
+	/* check that bcc is less than negotiated smb buffer */
+	total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
+	if (total_size >= 512)
+		goto vt2_err;
+
+	total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
+	if (total_size > get_bcc(&pSMB->hdr) ||
+	    total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
+		goto vt2_err;
+
+	return 0;
+vt2_err:
 	cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
 		sizeof(struct smb_t2_rsp) + 16);
-	return rc;
+	return -EINVAL;
 }
 
 int
-- 
cgit v1.2.2


From 26ec254869c0158ea8db6de83b7644e2d93cac2a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:51 -0500
Subject: cifs: fix unaligned access in check2ndT2 and coalesce_t2

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ca20e813275d..18d3c7724d6e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -232,9 +232,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
 static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 {
 	struct smb_t2_rsp *pSMBt;
-	int total_data_size;
-	int data_in_this_rsp;
 	int remaining;
+	__u16 total_data_size, data_in_this_rsp;
 
 	if (pSMB->Command != SMB_COM_TRANSACTION2)
 		return 0;
@@ -248,8 +247,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 
 	pSMBt = (struct smb_t2_rsp *)pSMB;
 
-	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
-	data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount);
+	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+	data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
 
 	remaining = total_data_size - data_in_this_rsp;
 
@@ -275,21 +274,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 {
 	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
 	struct smb_t2_rsp *pSMBt  = (struct smb_t2_rsp *)pTargetSMB;
-	int total_data_size;
-	int total_in_buf;
-	int remaining;
-	int total_in_buf2;
 	char *data_area_of_target;
 	char *data_area_of_buf2;
-	__u16 byte_count;
+	int remaining;
+	__u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
 
-	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
+	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
 
-	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
+	if (total_data_size !=
+	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
 		cFYI(1, "total data size of primary and secondary t2 differ");
-	}
 
-	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
+	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
 
 	remaining = total_data_size - total_in_buf;
 
@@ -299,25 +295,25 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	if (remaining == 0) /* nothing to do, ignore */
 		return 0;
 
-	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
+	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
 	if (remaining < total_in_buf2) {
 		cFYI(1, "transact2 2nd response contains too much data");
 	}
 
 	/* find end of first SMB data area */
 	data_area_of_target = (char *)&pSMBt->hdr.Protocol +
-				le16_to_cpu(pSMBt->t2_rsp.DataOffset);
+				get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
 	/* validate target area */
 
-	data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol +
-					le16_to_cpu(pSMB2->t2_rsp.DataOffset);
+	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
+				get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
 
 	data_area_of_target += total_in_buf;
 
 	/* copy second buffer into end of first buffer */
 	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
 	total_in_buf += total_in_buf2;
-	pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf);
+	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
 	byte_count = get_bcc_le(pTargetSMB);
 	byte_count += total_in_buf2;
 	put_bcc_le(byte_count, pTargetSMB);
@@ -334,7 +330,6 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 		return 0; /* we are done */
 	} else /* more responses to go */
 		return 1;
-
 }
 
 static void
-- 
cgit v1.2.2


From ba2dbf30df210b519bdd8d34ac2ecaaeeef34c44 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:51 -0500
Subject: cifs: clean up unaligned accesses in cifs_unicode.c

Make sure we use get/put_unaligned routines when accessing wide
character strings.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_unicode.c | 51 +++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a1720..5f6e71857be6 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
 	int charlen, outlen = 0;
 	int maxwords = maxbytes / 2;
 	char tmp[NLS_MAX_CHARSET_SIZE];
+	__u16 ftmp;
 
-	for (i = 0; i < maxwords && from[i]; i++) {
-		charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
-					     NLS_MAX_CHARSET_SIZE);
+	for (i = 0; i < maxwords; i++) {
+		ftmp = get_unaligned_le16(&from[i]);
+		if (ftmp == 0)
+			break;
+
+		charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
 		if (charlen > 0)
 			outlen += charlen;
 		else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
 }
 
 /*
- * cifs_mapchar - convert a little-endian char to proper char in codepage
+ * cifs_mapchar - convert a host-endian char to proper char in codepage
  * @target - where converted character should be copied
- * @src_char - 2 byte little-endian source character
+ * @src_char - 2 byte host-endian source character
  * @cp - codepage to which character should be converted
  * @mapchar - should character be mapped according to mapchars mount option?
  *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
  * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
  */
 static int
-cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
+cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
 	     bool mapchar)
 {
 	int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
 	 *     build_path_from_dentry are modified, as they use slash as
 	 *     separator.
 	 */
-	switch (le16_to_cpu(src_char)) {
+	switch (src_char) {
 	case UNI_COLON:
 		*target = ':';
 		break;
@@ -109,8 +113,7 @@ out:
 	return len;
 
 cp_convert:
-	len = cp->uni2char(le16_to_cpu(src_char), target,
-			   NLS_MAX_CHARSET_SIZE);
+	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
 	if (len <= 0) {
 		*target = '?';
 		len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
 	int nullsize = nls_nullsize(codepage);
 	int fromwords = fromlen / 2;
 	char tmp[NLS_MAX_CHARSET_SIZE];
+	__u16 ftmp;
 
 	/*
 	 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
 	 */
 	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
 
-	for (i = 0; i < fromwords && from[i]; i++) {
+	for (i = 0; i < fromwords; i++) {
+		ftmp = get_unaligned_le16(&from[i]);
+		if (ftmp == 0)
+			break;
+
 		/*
 		 * check to see if converting this character might make the
 		 * conversion bleed into the null terminator
 		 */
 		if (outlen >= safelen) {
-			charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
+			charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
 			if ((outlen + charlen) > (tolen - nullsize))
 				break;
 		}
 
 		/* put converted char into 'to' buffer */
-		charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
+		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
 		outlen += charlen;
 	}
 
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
 {
 	int charlen;
 	int i;
-	wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */
+	wchar_t wchar_to; /* needed to quiet sparse */
 
 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
-
-		/* works for 2.4.0 kernel or later */
-		charlen = codepage->char2uni(from, len, &wchar_to[i]);
+		charlen = codepage->char2uni(from, len, &wchar_to);
 		if (charlen < 1) {
-			cERROR(1, "strtoUCS: char2uni of %d returned %d",
-				(int)*from, charlen);
+			cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+				*from, charlen);
 			/* A question mark */
-			to[i] = cpu_to_le16(0x003f);
+			wchar_to = 0x003f;
 			charlen = 1;
-		} else
-			to[i] = cpu_to_le16(wchar_to[i]);
-
+		}
+		put_unaligned_le16(wchar_to, &to[i]);
 	}
 
-	to[i] = 0;
+	put_unaligned_le16(0, &to[i]);
 	return i;
 }
 
-- 
cgit v1.2.2


From 84cdf74e8096a10dd6acbb870dd404b92f07a756 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 13:36:51 -0500
Subject: cifs: fix unaligned accesses in cifsConvertToUCS

Move cifsConvertToUCS to cifs_unicode.c where all of the other unicode
related functions live. Have it store mapped characters in 'temp' and
then use put_unaligned_le16 to copy it to the target buffer. Also fix
the comments to match kernel coding style.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_unicode.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/misc.c         | 71 ----------------------------------------------
 2 files changed, 76 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 5f6e71857be6..fc0fd4fde306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -257,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
 	return dst;
 }
 
+/*
+ * Convert 16 bit Unicode pathname to wire format from string in current code
+ * page. Conversion may involve remapping up the six characters that are
+ * only legal in POSIX-like OS (if they are present in the string). Path
+ * names are little endian 16 bit Unicode on the wire
+ */
+int
+cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
+		 const struct nls_table *cp, int mapChars)
+{
+	int i, j, charlen;
+	int len_remaining = maxlen;
+	char src_char;
+	__u16 temp;
+
+	if (!mapChars)
+		return cifs_strtoUCS(target, source, PATH_MAX, cp);
+
+	for (i = 0, j = 0; i < maxlen; j++) {
+		src_char = source[i];
+		switch (src_char) {
+		case 0:
+			put_unaligned_le16(0, &target[j]);
+			goto ctoUCS_out;
+		case ':':
+			temp = UNI_COLON;
+			break;
+		case '*':
+			temp = UNI_ASTERIK;
+			break;
+		case '?':
+			temp = UNI_QUESTION;
+			break;
+		case '<':
+			temp = UNI_LESSTHAN;
+			break;
+		case '>':
+			temp = UNI_GRTRTHAN;
+			break;
+		case '|':
+			temp = UNI_PIPE;
+			break;
+		/*
+		 * FIXME: We can not handle remapping backslash (UNI_SLASH)
+		 * until all the calls to build_path_from_dentry are modified,
+		 * as they use backslash as separator.
+		 */
+		default:
+			charlen = cp->char2uni(source+i, len_remaining,
+						&temp);
+			/*
+			 * if no match, use question mark, which at least in
+			 * some cases serves as wild card
+			 */
+			if (charlen < 1) {
+				temp = 0x003f;
+				charlen = 1;
+			}
+			len_remaining -= charlen;
+			/*
+			 * character may take more than one byte in the source
+			 * string, but will take exactly two bytes in the
+			 * target string
+			 */
+			i += charlen;
+			continue;
+		}
+		put_unaligned_le16(temp, &target[j]);
+		i++; /* move to next char in source string */
+		len_remaining--;
+	}
+
+ctoUCS_out:
+	return i;
+}
+
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 09bfcf08a90f..a09e077ba925 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -637,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
 	return;
 }
 
-/* Convert 16 bit Unicode pathname to wire format from string in current code
-   page.  Conversion may involve remapping up the seven characters that are
-   only legal in POSIX-like OS (if they are present in the string). Path
-   names are little endian 16 bit Unicode on the wire */
-int
-cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-		 const struct nls_table *cp, int mapChars)
-{
-	int i, j, charlen;
-	int len_remaining = maxlen;
-	char src_char;
-	__u16 temp;
-
-	if (!mapChars)
-		return cifs_strtoUCS(target, source, PATH_MAX, cp);
-
-	for (i = 0, j = 0; i < maxlen; j++) {
-		src_char = source[i];
-		switch (src_char) {
-			case 0:
-				target[j] = 0;
-				goto ctoUCS_out;
-			case ':':
-				target[j] = cpu_to_le16(UNI_COLON);
-				break;
-			case '*':
-				target[j] = cpu_to_le16(UNI_ASTERIK);
-				break;
-			case '?':
-				target[j] = cpu_to_le16(UNI_QUESTION);
-				break;
-			case '<':
-				target[j] = cpu_to_le16(UNI_LESSTHAN);
-				break;
-			case '>':
-				target[j] = cpu_to_le16(UNI_GRTRTHAN);
-				break;
-			case '|':
-				target[j] = cpu_to_le16(UNI_PIPE);
-				break;
-			/* BB We can not handle remapping slash until
-			   all the calls to build_path_from_dentry
-			   are modified, as they use slash as separator BB */
-			/* case '\\':
-				target[j] = cpu_to_le16(UNI_SLASH);
-				break;*/
-			default:
-				charlen = cp->char2uni(source+i,
-					len_remaining, &temp);
-				/* if no match, use question mark, which
-				at least in some cases servers as wild card */
-				if (charlen < 1) {
-					target[j] = cpu_to_le16(0x003f);
-					charlen = 1;
-				} else
-					target[j] = cpu_to_le16(temp);
-				len_remaining -= charlen;
-				/* character may take more than one byte in the
-				   the source string, but will take exactly two
-				   bytes in the target string */
-				i += charlen;
-				continue;
-		}
-		i++; /* move to next char in source string */
-		len_remaining--;
-	}
-
-ctoUCS_out:
-	return i;
-}
-
 void
 cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
-- 
cgit v1.2.2


From 28e58ee8ce1f0e69c207f747b7b9054b071e328d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 20 Jan 2011 16:21:59 -0800
Subject: Fix broken "pipe: use event aware wakeups" optimization

Commit e462c448fdc8 ("pipe: use event aware wakeups") optimized the pipe
event wakeup calls to avoid wakeups if the events do not match the
requested set.

However, the optimization was buggy, in that it didn't actually use the
correct sets for the events: when we make room for more data to be
written, the pipe poll() routine will return both the POLLOUT _and_
POLLWRNORM bits.  Similarly for read.

And most critically, when a pipe is released, that will potentially
result in POLLHUP|POLLERR (depending on whether it was the last reader
or writer), not just the regular POLLIN|POLLOUT.

This bug showed itself as a hung gnome-screensaver-dialog process, stuck
forever (or at least until it was poked by a signal or by being traced)
in a poll() system call.

Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 89e9e19b1b2e..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
 			break;
 		}
 		if (do_wakeup) {
-			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT);
+			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
 		pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
-		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT);
+		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
 	if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
 			break;
 		}
 		if (do_wakeup) {
-			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN);
+			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
 		}
@@ -623,7 +623,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
-		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN);
+		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 	if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 	if (!pipe->readers && !pipe->writers) {
 		free_pipe_info(inode);
 	} else {
-		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT);
+		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
-- 
cgit v1.2.2


From 6a108a14fa356ef607be308b68337939e56ea94e Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 20 Jan 2011 14:44:16 -0800
Subject: kconfig: rename CONFIG_EMBEDDED to CONFIG_EXPERT

The meaning of CONFIG_EMBEDDED has long since been obsoleted; the option
is used to configure any non-standard kernel with a much larger scope than
only small devices.

This patch renames the option to CONFIG_EXPERT in init/Kconfig and fixes
references to the option throughout the kernel.  A new CONFIG_EMBEDDED
option is added that automatically selects CONFIG_EXPERT when enabled and
can be used in the future to isolate options that should only be
considered for embedded systems (RISC architectures, SLOB, etc).

Calling the option "EXPERT" more accurately represents its intention: only
expert users who understand the impact of the configuration changes they
are making should enable it.

Reviewed-by: Ingo Molnar <mingo@elte.hu>
Acked-by: David Woodhouse <david.woodhouse@intel.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Greg KH <gregkh@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Robin Holt <holt@sgi.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig       | 2 +-
 fs/proc/Kconfig  | 6 +++---
 fs/sysfs/Kconfig | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9a7921ae4763..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,7 +50,7 @@ config EXPORTFS
 	tristate
 
 config FILE_LOCKING
-	bool "Enable POSIX file locking API" if EMBEDDED
+	bool "Enable POSIX file locking API" if EXPERT
 	default y
 	help
 	  This option enables standard file locking support, required
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d96..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
 config PROC_FS
-	bool "/proc file system support" if EMBEDDED
+	bool "/proc file system support" if EXPERT
 	default y
 	help
 	  This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
         Exports the dump image of crashed kernel in ELF format.
 
 config PROC_SYSCTL
-	bool "Sysctl support (/proc/sys)" if EMBEDDED
+	bool "Sysctl support (/proc/sys)" if EXPERT
 	depends on PROC_FS
 	select SYSCTL
 	default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
 config PROC_PAGE_MONITOR
  	default y
 	depends on PROC_FS && MMU
-	bool "Enable /proc page monitoring" if EMBEDDED
+	bool "Enable /proc page monitoring" if EXPERT
  	help
 	  Various /proc files exist to monitor process memory utilization:
 	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
 config SYSFS
-	bool "sysfs file system support" if EMBEDDED
+	bool "sysfs file system support" if EXPERT
 	default y
 	help
 	The sysfs filesystem is a virtual filesystem that the kernel uses to
-- 
cgit v1.2.2


From 20d9600cb407b0b55fef6ee814b60345c6f58264 Mon Sep 17 00:00:00 2001
From: David Dillow <dillowda@ornl.gov>
Date: Thu, 20 Jan 2011 14:44:22 -0800
Subject: fs/direct-io.c: don't try to allocate more than BIO_MAX_PAGES in a
 bio

When using devices that support max_segments > BIO_MAX_PAGES (256), direct
IO tries to allocate a bio with more pages than allowed, which leads to an
oops in dio_bio_alloc().  Clamp the request to the supported maximum, and
change dio_bio_alloc() to reflect that bio_alloc() will always return a
bio when called with __GFP_WAIT and a valid number of vectors.

[akpm@linux-foundation.org: remove redundant BUG_ON()]
Signed-off-by: David Dillow <dillowda@ornl.gov>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f7..b044705eedd4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
-static int
+static void
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 		sector_t first_sector, int nr_vecs)
 {
 	struct bio *bio;
 
+	/*
+	 * bio_alloc() is guaranteed to return a bio when called with
+	 * __GFP_WAIT and we request a valid number of vectors.
+	 */
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
 	bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 
 	dio->bio = bio;
 	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
-	return 0;
 }
 
 /*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
 		goto out;
 	sector = start_sector << (dio->blkbits - 9);
 	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
+	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
 	dio->boundary = 0;
 out:
 	return ret;
-- 
cgit v1.2.2


From 99d86c8f1b7101d7c55dbf644b32bb1f0d7eb303 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 20 Jan 2011 21:19:25 -0500
Subject: cifs: fix up CIFSSMBEcho for unaligned access

Make sure that CIFSSMBEcho can handle unaligned fields. Also fix a minor
bug that causes this warning:

fs/cifs/cifssmb.c: In function 'CIFSSMBEcho':
fs/cifs/cifssmb.c:740: warning: large integer implicitly truncated to unsigned type

...WordCount is u8, not __le16, so no need to convert it.

This patch should apply cleanly on top of the rest of the patchset to
clean up unaligned access.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 675041a6949c..3106f5e5c633 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -733,9 +733,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 
 	/* set up echo request */
 	smb->hdr.Tid = cpu_to_le16(0xffff);
-	smb->hdr.WordCount = cpu_to_le16(1);
-	smb->EchoCount = cpu_to_le16(1);
-	smb->ByteCount = cpu_to_le16(1);
+	smb->hdr.WordCount = 1;
+	put_unaligned_le16(1, &smb->EchoCount);
+	put_bcc_le(1, &smb->hdr);
 	smb->Data[0] = 'a';
 	smb->hdr.smb_buf_length += 3;
 
-- 
cgit v1.2.2


From 0ca7a5b9ac5d301845dd6382ff25a699b6263a81 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 21 Jan 2011 16:40:31 +0900
Subject: nilfs2: fix crash after one superblock became unavailable

Fixes the following kernel oops in nilfs_setup_super() which could
arise if one of two super-blocks is unavailable.

> BUG: unable to handle kernel NULL pointer dereference at   (null)
> Pid: 3529, comm: mount.nilfs2 Not tainted 2.6.37 #1 /
> EIP: 0060:[<c03196bc>] EFLAGS: 00010202 CPU: 3
> EIP is at memcpy+0xc/0x1b
> Call Trace:
>  [<f953720e>] ? nilfs_setup_super+0x6c/0xa5 [nilfs2]
>  [<f95369e9>] ? nilfs_get_root_dentry+0x81/0xcb [nilfs2]
>  [<f9537a08>] ? nilfs_mount+0x4f9/0x62c [nilfs2]
>  [<c02745cf>] ? kstrdup+0x36/0x3f
>  [<f953750f>] ? nilfs_mount+0x0/0x62c [nilfs2]
>  [<c0293940>] ? vfs_kern_mount+0x4d/0x12c
>  [<c02a5100>] ? get_fs_type+0x76/0x8f
>  [<c0293a68>] ? do_kern_mount+0x33/0xbf
>  [<c02a784a>] ? do_mount+0x2ed/0x714
>  [<c02a6171>] ? copy_mount_options+0x28/0xfc
>  [<c02a7ce3>] ? sys_mount+0x72/0xaf
>  [<c0473085>] ? syscall_call+0x7/0xb

Reported-by: Wakko Warner <wakko@animx.eu.org>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Wakko Warner <wakko@animx.eu.org>
Cc: stable <stable@kernel.org> [2.6.37, 2.6.36]
LKML-Reference: <20110121024918.GA29598@animx.eu.org>
---
 fs/nilfs2/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0994f6a76c07..58fd707174e1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -704,7 +704,8 @@ skip_mount_setup:
 	sbp[0]->s_state =
 		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
 	/* synchronize sbp[1] with sbp[0] */
-	memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	if (sbp[1])
+		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
 
-- 
cgit v1.2.2


From ff5fdb61493d95332945630fcae249f896098652 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 22 Jan 2011 20:16:06 -0800
Subject: fs: fix new dcache.c kernel-doc warnings

Fix new fs/dcache.c kernel-doc warnings:

  Warning(fs/dcache.c:184): No description found for parameter 'dentry'
  Warning(fs/dcache.c:296): No description found for parameter 'parent'
  Warning(fs/dcache.c:1985): No description found for parameter 'dparent'
  Warning(fs/dcache.c:1985): Excess function parameter 'parent' description in 'd_validate'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc:	Alexander Viro <viro@zeniv.linux.org.uk>
Cc:	Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f493ee4dcba..2a6bd9a4ae97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
 
 /**
  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
+ * @parent: parent dentry
  *
  * The dentry must already be unhashed and removed from the LRU.
  *
@@ -1973,7 +1975,7 @@ out:
 /**
  * d_validate - verify dentry provided from insecure source (deprecated)
  * @dentry: The dentry alleged to be valid child of @dparent
- * @parent: The parent dentry (known to be valid)
+ * @dparent: The parent dentry (known to be valid)
  *
  * An insecure source has sent us a dentry, here we verify it and dget() it.
  * This is used by ncpfs in its readdir implementation.
-- 
cgit v1.2.2


From 3f391c79b0686ce183668c6e2b7d02f3e716766c Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sat, 22 Jan 2011 21:07:16 +0100
Subject: CIFS: Remove pointless variable assignment in cifs_dfs_do_automount()

In fs/cifs/cifs_dfs_ref.c::cifs_dfs_do_automount() we have this code:

	...
	mnt = ERR_PTR(-EINVAL);
	if (IS_ERR(tlink)) {
		mnt = ERR_CAST(tlink);
		goto free_full_path;
	}
	ses = tlink_tcon(tlink)->ses;

	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
		&num_referrals, &referrals,
		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);

	cifs_put_tlink(tlink);

	mnt = ERR_PTR(-ENOENT);
	...

The assignment of 'mnt = ERR_PTR(-EINVAL);' is completely pointless. If we
take the 'if (IS_ERR(tlink))' branch we'll set 'mnt' again and we'll also
do so if we do not take the branch. There is no way we'll ever use 'mnt'
with the assigned 'ERR_PTR(-EINVAL)' value, so we may as well just remove
the pointless assignment.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7ed36536e754..f1c68629f277 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -297,7 +297,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 
 	cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
-	mnt = ERR_PTR(-EINVAL);
 	if (IS_ERR(tlink)) {
 		mnt = ERR_CAST(tlink);
 		goto free_full_path;
-- 
cgit v1.2.2


From f1d0c998653f1eeec60ee6420e550135b62dbab4 Mon Sep 17 00:00:00 2001
From: Rob Landley <rlandley@parallels.com>
Date: Sat, 22 Jan 2011 15:44:05 -0600
Subject: Make CIFS mount work in a container.

Teach cifs about network namespaces, so mounting uses adresses/routing
visible from the container rather than from init context.

A container is a chroot on steroids that changes more than just the root
filesystem the new processes see.  One thing containers can isolate is
"network namespaces", meaning each container can have its own set of
ethernet interfaces, each with its own own IP address and routing to the
outside world.  And if you open a socket in _userspace_ from processes
within such a container, this works fine.

But sockets opened from within the kernel still use a single global
networking context in a lot of places, meaning the new socket's address
and routing are correct for PID 1 on the host, but are _not_ what
userspace processes in the container get to use.

So when you mount a network filesystem from within in a container, the
mount code in the CIFS driver uses the host's networking context and not
the container's networking context, so it gets the wrong address, uses
the wrong routing, and may even try to go out an interface that the
container can't even access...  Bad stuff.

This patch copies the mount process's network context into the CIFS
structure that stores the rest of the server information for that mount
point, and changes the socket open code to use the saved network context
instead of the global network context.  I.E. "when you attempt to use
these addresses, do so relative to THIS set of network interfaces and
routing rules, not the old global context from back before we supported
containers".

The big long HOWTO sets up a test environment on the assumption you've
never used ocntainers before.  It basically says:

1) configure and build a new kernel that has container support
2) build a new root filesystem that includes the userspace container
control package (LXC)
3) package/run them under KVM (so you don't have to mess up your host
system in order to play with containers).
4) set up some containers under the KVM system
5) set up contradictory routing in the KVM system and the container so
that the host and the container see different things for the same address
6) try to mount a CIFS share from both contexts so you can both force it
to work and force it to fail.

For a long drawn out test reproduction sequence, see:

  http://landley.livejournal.com/47024.html
  http://landley.livejournal.com/47205.html
  http://landley.livejournal.com/47476.html

Signed-off-by: Rob Landley <rlandley@parallels.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 33 +++++++++++++++++++++++++++++++++
 fs/cifs/connect.c  | 12 ++++++++++--
 2 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5bfb75346cb0..edd5b29b53c9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -166,6 +166,9 @@ struct TCP_Server_Info {
 	struct socket *ssocket;
 	struct sockaddr_storage dstaddr;
 	struct sockaddr_storage srcaddr; /* locally bind to this IP */
+#ifdef CONFIG_NET_NS
+	struct net *net;
+#endif
 	wait_queue_head_t response_q;
 	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
 	struct list_head pending_mid_q;
@@ -216,6 +219,36 @@ struct TCP_Server_Info {
 #endif
 };
 
+/*
+ * Macros to allow the TCP_Server_Info->net field and related code to drop out
+ * when CONFIG_NET_NS isn't set.
+ */
+
+#ifdef CONFIG_NET_NS
+
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+	return srv->net;
+}
+
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+	srv->net = net;
+}
+
+#else
+
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+	return &init_net;
+}
+
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+}
+
+#endif
+
 /*
  * Session structure.  One of these for each uid session with a particular host
  */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 18d3c7724d6e..0cc3b81c2e84 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1568,6 +1568,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+		if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
+			continue;
+
 		if (!match_address(server, addr,
 				   (struct sockaddr *)&vol->srcaddr))
 			continue;
@@ -1598,6 +1601,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 		return;
 	}
 
+	put_net(cifs_net_ns(server));
+
 	list_del_init(&server->tcp_ses_list);
 	spin_unlock(&cifs_tcp_ses_lock);
 
@@ -1672,6 +1677,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
+	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
 	tcp_ses->hostname = extract_hostname(volume_info->UNC);
 	if (IS_ERR(tcp_ses->hostname)) {
 		rc = PTR_ERR(tcp_ses->hostname);
@@ -1752,6 +1758,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 out_err_crypto_release:
 	cifs_crypto_shash_release(tcp_ses);
 
+	put_net(cifs_net_ns(tcp_ses));
+
 out_err:
 	if (tcp_ses) {
 		if (!IS_ERR(tcp_ses->hostname))
@@ -2263,8 +2271,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
 	}
 
 	if (socket == NULL) {
-		rc = sock_create_kern(sfamily, SOCK_STREAM,
-				      IPPROTO_TCP, &socket);
+		rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
+				   IPPROTO_TCP, &socket, 1);
 		if (rc < 0) {
 			cERROR(1, "Error %d creating socket", rc);
 			server->ssocket = NULL;
-- 
cgit v1.2.2


From d66bbd441c08fe00ed2add1cf70cb243ebc2b27e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 21 Jan 2011 21:16:46 -0800
Subject: ceph: avoid picking MDS that is not active

Ignore replication or auth frag data if it indicates an MDS that is not
active.  This can happen if the MDS shuts down and the client has stale
data about the namespace distribution across the MDS cluster.  If that's
the case, fall back to directing the request based on the auth cap (which
should always be accurate).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 509339ceef72..a6949cc7c69a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				dout("choose_mds %p %llx.%llx "
 				     "frag %u mds%d (%d/%d)\n",
 				     inode, ceph_vinop(inode),
-				     frag.frag, frag.mds,
+				     frag.frag, mds,
 				     (int)r, frag.ndist);
-				return mds;
+				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+				    CEPH_MDS_STATE_ACTIVE)
+					return mds;
 			}
 
 			/* since this file/dir wasn't known to be
@@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				dout("choose_mds %p %llx.%llx "
 				     "frag %u mds%d (auth)\n",
 				     inode, ceph_vinop(inode), frag.frag, mds);
-				return mds;
+				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+				    CEPH_MDS_STATE_ACTIVE)
+					return mds;
 			}
 		}
 	}
-- 
cgit v1.2.2


From 93c100c0b423266c0ee28497e90fdf27c05e6b8e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 25 Jan 2011 19:28:43 +0000
Subject: [CIFS] Replace cifs md5 hashing functions with kernel crypto APIs

Replace remaining use of md5 hash functions local to cifs module
with kernel crypto APIs.
Remove header and source file containing those local functions.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Makefile      |   2 +-
 fs/cifs/cifsencrypt.c |   1 -
 fs/cifs/link.c        |  59 ++++++--
 fs/cifs/md5.c         | 366 --------------------------------------------------
 fs/cifs/md5.h         |  38 ------
 fs/cifs/smbencrypt.c  |   1 -
 6 files changed, 51 insertions(+), 416 deletions(-)
 delete mode 100644 fs/cifs/md5.c
 delete mode 100644 fs/cifs/md5.h

(limited to 'fs')

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 43b19dd39191..e1322296cb69 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+	  md4.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 66f3d50d0676..35bf329c90e1 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
-#include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
 #include "ntlmssp.h"
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 306769de2fb5..d3444ea6ac71 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
-#include "md5.h"
 
 #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
 #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -46,6 +45,45 @@
 	md5_hash[8],  md5_hash[9],  md5_hash[10], md5_hash[11],\
 	md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
 
+static int
+symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *md5;
+	struct sdesc *sdescmd5;
+
+	md5 = crypto_alloc_shash("md5", 0, 0);
+	if (!md5 || IS_ERR(md5)) {
+		rc = PTR_ERR(md5);
+		cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
+		return rc;
+	}
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
+	sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!sdescmd5) {
+		rc = -ENOMEM;
+		cERROR(1, "%s: Memory allocation failure\n", __func__);
+		goto symlink_hash_err;
+	}
+	sdescmd5->shash.tfm = md5;
+	sdescmd5->shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init md5 shash\n", __func__);
+		goto symlink_hash_err;
+	}
+	crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+	rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+
+symlink_hash_err:
+	crypto_free_shash(md5);
+	kfree(sdescmd5);
+
+	return rc;
+}
+
 static int
 CIFSParseMFSymlink(const u8 *buf,
 		   unsigned int buf_len,
@@ -56,7 +94,6 @@ CIFSParseMFSymlink(const u8 *buf,
 	unsigned int link_len;
 	const char *md5_str1;
 	const char *link_str;
-	struct MD5Context md5_ctx;
 	u8 md5_hash[16];
 	char md5_str2[34];
 
@@ -70,9 +107,11 @@ CIFSParseMFSymlink(const u8 *buf,
 	if (rc != 1)
 		return -EINVAL;
 
-	cifs_MD5_init(&md5_ctx);
-	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
-	cifs_MD5_final(md5_hash, &md5_ctx);
+	rc = symlink_hash(link_len, link_str, md5_hash);
+	if (rc) {
+		cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+		return rc;
+	}
 
 	snprintf(md5_str2, sizeof(md5_str2),
 		 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +133,9 @@ CIFSParseMFSymlink(const u8 *buf,
 static int
 CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 {
+	int rc;
 	unsigned int link_len;
 	unsigned int ofs;
-	struct MD5Context md5_ctx;
 	u8 md5_hash[16];
 
 	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +146,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
 		return -ENAMETOOLONG;
 
-	cifs_MD5_init(&md5_ctx);
-	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
-	cifs_MD5_final(md5_hash, &md5_ctx);
+	rc = symlink_hash(link_len, link_str, md5_hash);
+	if (rc) {
+		cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+		return rc;
+	}
 
 	snprintf(buf, buf_len,
 		 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c319..000000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * This code implements the MD5 message-digest algorithm.
- * The algorithm is due to Ron Rivest.  This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- *
- * Equivalent code is available from RSA Data Security, Inc.
- * This code has been tested against that, and is equivalent,
- * except that you don't need to include two pages of legalese
- * with every copy.
- *
- * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
- * needed on buffers full of bytes, and then call cifs_MD5_final, which
- * will fill a supplied 16-byte array with the digest.
- */
-
-/* This code slightly modified to fit into Samba by
-   abartlet@samba.org Jun 2001
-   and to fit the cifs vfs by
-   Steve French sfrench@us.ibm.com */
-
-#include <linux/string.h>
-#include "md5.h"
-
-static void MD5Transform(__u32 buf[4], __u32 const in[16]);
-
-/*
- * Note: this code is harmless on little-endian machines.
- */
-static void
-byteReverse(unsigned char *buf, unsigned longs)
-{
-	__u32 t;
-	do {
-		t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
-		    ((unsigned) buf[1] << 8 | buf[0]);
-		*(__u32 *) buf = t;
-		buf += 4;
-	} while (--longs);
-}
-
-/*
- * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
- * initialization constants.
- */
-void
-cifs_MD5_init(struct MD5Context *ctx)
-{
-	ctx->buf[0] = 0x67452301;
-	ctx->buf[1] = 0xefcdab89;
-	ctx->buf[2] = 0x98badcfe;
-	ctx->buf[3] = 0x10325476;
-
-	ctx->bits[0] = 0;
-	ctx->bits[1] = 0;
-}
-
-/*
- * Update context to reflect the concatenation of another buffer full
- * of bytes.
- */
-void
-cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
-{
-	register __u32 t;
-
-	/* Update bitcount */
-
-	t = ctx->bits[0];
-	if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
-		ctx->bits[1]++;	/* Carry from low to high */
-	ctx->bits[1] += len >> 29;
-
-	t = (t >> 3) & 0x3f;	/* Bytes already in shsInfo->data */
-
-	/* Handle any leading odd-sized chunks */
-
-	if (t) {
-		unsigned char *p = (unsigned char *) ctx->in + t;
-
-		t = 64 - t;
-		if (len < t) {
-			memmove(p, buf, len);
-			return;
-		}
-		memmove(p, buf, t);
-		byteReverse(ctx->in, 16);
-		MD5Transform(ctx->buf, (__u32 *) ctx->in);
-		buf += t;
-		len -= t;
-	}
-	/* Process data in 64-byte chunks */
-
-	while (len >= 64) {
-		memmove(ctx->in, buf, 64);
-		byteReverse(ctx->in, 16);
-		MD5Transform(ctx->buf, (__u32 *) ctx->in);
-		buf += 64;
-		len -= 64;
-	}
-
-	/* Handle any remaining bytes of data. */
-
-	memmove(ctx->in, buf, len);
-}
-
-/*
- * Final wrapup - pad to 64-byte boundary with the bit pattern
- * 1 0* (64-bit count of bits processed, MSB-first)
- */
-void
-cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
-{
-	unsigned int count;
-	unsigned char *p;
-
-	/* Compute number of bytes mod 64 */
-	count = (ctx->bits[0] >> 3) & 0x3F;
-
-	/* Set the first char of padding to 0x80.  This is safe since there is
-	   always at least one byte free */
-	p = ctx->in + count;
-	*p++ = 0x80;
-
-	/* Bytes of padding needed to make 64 bytes */
-	count = 64 - 1 - count;
-
-	/* Pad out to 56 mod 64 */
-	if (count < 8) {
-		/* Two lots of padding:  Pad the first block to 64 bytes */
-		memset(p, 0, count);
-		byteReverse(ctx->in, 16);
-		MD5Transform(ctx->buf, (__u32 *) ctx->in);
-
-		/* Now fill the next block with 56 bytes */
-		memset(ctx->in, 0, 56);
-	} else {
-		/* Pad block to 56 bytes */
-		memset(p, 0, count - 8);
-	}
-	byteReverse(ctx->in, 14);
-
-	/* Append length in bits and transform */
-	((__u32 *) ctx->in)[14] = ctx->bits[0];
-	((__u32 *) ctx->in)[15] = ctx->bits[1];
-
-	MD5Transform(ctx->buf, (__u32 *) ctx->in);
-	byteReverse((unsigned char *) ctx->buf, 4);
-	memmove(digest, ctx->buf, 16);
-	memset(ctx, 0, sizeof(*ctx));	/* In case it's sensitive */
-}
-
-/* The four core functions - F1 is optimized somewhat */
-
-/* #define F1(x, y, z) (x & y | ~x & z) */
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-
-/* This is the central step in the MD5 algorithm. */
-#define MD5STEP(f, w, x, y, z, data, s) \
-	(w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x)
-
-/*
- * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data.  cifs_MD5_update blocks
- * the data and converts bytes into longwords for this routine.
- */
-static void
-MD5Transform(__u32 buf[4], __u32 const in[16])
-{
-	register __u32 a, b, c, d;
-
-	a = buf[0];
-	b = buf[1];
-	c = buf[2];
-	d = buf[3];
-
-	MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
-	MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
-	MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
-	MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
-	MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
-	MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
-	MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
-	MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
-	MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
-	MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
-	MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
-	MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
-	MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
-	MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
-	MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
-	MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-
-	MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
-	MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
-	MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
-	MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
-	MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
-	MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
-	MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
-	MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
-	MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
-	MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
-	MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
-	MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
-	MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
-	MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
-	MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
-	MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-
-	MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
-	MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
-	MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
-	MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
-	MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
-	MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
-	MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
-	MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
-	MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
-	MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
-	MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
-	MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
-	MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
-	MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
-	MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
-	MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-
-	MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
-	MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
-	MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
-	MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
-	MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
-	MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
-	MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
-	MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
-	MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
-	MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
-	MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
-	MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
-	MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
-	MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
-	MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
-	MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-
-	buf[0] += a;
-	buf[1] += b;
-	buf[2] += c;
-	buf[3] += d;
-}
-
-#if 0   /* currently unused */
-/***********************************************************************
- the rfc 2104 version of hmac_md5 initialisation.
-***********************************************************************/
-static void
-hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-		      struct HMACMD5Context *ctx)
-{
-	int i;
-
-	/* if key is longer than 64 bytes reset it to key=MD5(key) */
-	if (key_len > 64) {
-		unsigned char tk[16];
-		struct MD5Context tctx;
-
-		cifs_MD5_init(&tctx);
-		cifs_MD5_update(&tctx, key, key_len);
-		cifs_MD5_final(tk, &tctx);
-
-		key = tk;
-		key_len = 16;
-	}
-
-	/* start out by storing key in pads */
-	memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-	memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
-	memcpy(ctx->k_ipad, key, key_len);
-	memcpy(ctx->k_opad, key, key_len);
-
-	/* XOR key with ipad and opad values */
-	for (i = 0; i < 64; i++) {
-		ctx->k_ipad[i] ^= 0x36;
-		ctx->k_opad[i] ^= 0x5c;
-	}
-
-	cifs_MD5_init(&ctx->ctx);
-	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
-}
-#endif
-
-/***********************************************************************
- the microsoft version of hmac_md5 initialisation.
-***********************************************************************/
-void
-hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
-			 struct HMACMD5Context *ctx)
-{
-	int i;
-
-	/* if key is longer than 64 bytes truncate it */
-	if (key_len > 64)
-		key_len = 64;
-
-	/* start out by storing key in pads */
-	memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-	memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
-	memcpy(ctx->k_ipad, key, key_len);
-	memcpy(ctx->k_opad, key, key_len);
-
-	/* XOR key with ipad and opad values */
-	for (i = 0; i < 64; i++) {
-		ctx->k_ipad[i] ^= 0x36;
-		ctx->k_opad[i] ^= 0x5c;
-	}
-
-	cifs_MD5_init(&ctx->ctx);
-	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
-}
-
-/***********************************************************************
- update hmac_md5 "inner" buffer
-***********************************************************************/
-void
-hmac_md5_update(const unsigned char *text, int text_len,
-		struct HMACMD5Context *ctx)
-{
-	cifs_MD5_update(&ctx->ctx, text, text_len);	/* then text of datagram */
-}
-
-/***********************************************************************
- finish off hmac_md5 "inner" buffer and generate outer one.
-***********************************************************************/
-void
-hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
-{
-	struct MD5Context ctx_o;
-
-	cifs_MD5_final(digest, &ctx->ctx);
-
-	cifs_MD5_init(&ctx_o);
-	cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
-	cifs_MD5_update(&ctx_o, digest, 16);
-	cifs_MD5_final(digest, &ctx_o);
-}
-
-/***********************************************************
- single function to calculate an HMAC MD5 digest from data.
- use the microsoft hmacmd5 init method because the key is 16 bytes.
-************************************************************/
-#if 0 /* currently unused */
-static void
-hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-	 unsigned char *digest)
-{
-	struct HMACMD5Context ctx;
-	hmac_md5_init_limK_to_64(key, 16, &ctx);
-	if (data_len != 0)
-		hmac_md5_update(data, data_len, &ctx);
-
-	hmac_md5_final(digest, &ctx);
-}
-#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402fd..000000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef MD5_H
-#define MD5_H
-#ifndef HEADER_MD5_H
-/* Try to avoid clashes with OpenSSL */
-#define HEADER_MD5_H
-#endif
-
-struct MD5Context {
-	__u32 buf[4];
-	__u32 bits[2];
-	unsigned char in[64];
-};
-#endif				/* !MD5_H */
-
-#ifndef _HMAC_MD5_H
-struct HMACMD5Context {
-	struct MD5Context ctx;
-	unsigned char k_ipad[65];
-	unsigned char k_opad[65];
-};
-#endif				/* _HMAC_MD5_H */
-
-void cifs_MD5_init(struct MD5Context *context);
-void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
-			unsigned len);
-void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
-
-/* The following definitions come from lib/hmacmd5.c  */
-
-/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);*/
-void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);
-void hmac_md5_update(const unsigned char *text, int text_len,
-			struct HMACMD5Context *ctx);
-void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-			unsigned char *digest);*/
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20f..30135005e4f3 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,7 +32,6 @@
 #include "cifs_unicode.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
-#include "md5.h"
 #include "cifs_debug.h"
 #include "cifsencrypt.h"
 
-- 
cgit v1.2.2


From 72432ffcf555decbbae47f1be338e1d2f210aa69 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 24 Jan 2011 14:16:35 -0500
Subject: CIFS: Implement cifs_strict_writev (try #4)

If we don't have Exclusive oplock we write a data to the server.
Also set invalidate_mapping flag on the inode if we wrote something
to the server. Add cifs_iovec_write to let the client write iovec
buffers through CIFSSMBWrite2.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    |  15 ++--
 fs/cifs/cifsfs.h    |   4 +-
 fs/cifs/cifsproto.h |   2 +
 fs/cifs/file.c      | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 217 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a8323f1dc1c4..f2970136d17d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -600,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	ssize_t written;
+	int rc;
 
 	written = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	if (!CIFS_I(inode)->clientCanCacheAll)
-		filemap_fdatawrite(inode->i_mapping);
+
+	if (CIFS_I(inode)->clientCanCacheAll)
+		return written;
+
+	rc = filemap_fdatawrite(inode->i_mapping);
+	if (rc)
+		cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
+
 	return written;
 }
 
@@ -737,7 +744,7 @@ const struct file_operations cifs_file_strict_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.aio_read = cifs_strict_readv,
-	.aio_write = cifs_file_aio_write,
+	.aio_write = cifs_strict_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.lock = cifs_lock,
@@ -793,7 +800,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.aio_read = cifs_strict_readv,
-	.aio_write = cifs_file_aio_write,
+	.aio_write = cifs_strict_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.fsync = cifs_strict_fsync,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f23206d46531..14789a97304e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -85,7 +85,9 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 				 unsigned long nr_segs, loff_t pos);
 extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
-			 size_t write_size, loff_t *poffset);
+			       size_t write_size, loff_t *poffset);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+				  unsigned long nr_segs, loff_t pos);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, int);
 extern int cifs_strict_fsync(struct file *, int);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 982895fa7615..35c989f4924f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -85,6 +85,8 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
+extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
+			    unsigned int bytes_written);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d7d65a70678e..0de17c1db608 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -848,7 +848,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 }
 
 /* update the file size (if needed) after a write */
-static void
+void
 cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 		      unsigned int bytes_written)
 {
@@ -1619,6 +1619,206 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	return rc;
 }
 
+static int
+cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
+{
+	int rc = 0;
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = alloc_page(__GFP_HIGHMEM);
+		if (!pages[i]) {
+			/*
+			 * save number of pages we have already allocated and
+			 * return with ENOMEM error
+			 */
+			num_pages = i;
+			rc = -ENOMEM;
+			goto error;
+		}
+	}
+
+	return rc;
+
+error:
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	return rc;
+}
+
+static inline
+size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
+{
+	size_t num_pages;
+	size_t clen;
+
+	clen = min_t(const size_t, len, wsize);
+	num_pages = clen / PAGE_CACHE_SIZE;
+	if (clen % PAGE_CACHE_SIZE)
+		num_pages++;
+
+	if (cur_len)
+		*cur_len = clen;
+
+	return num_pages;
+}
+
+static ssize_t
+cifs_iovec_write(struct file *file, const struct iovec *iov,
+		 unsigned long nr_segs, loff_t *poffset)
+{
+	size_t total_written = 0, written = 0;
+	unsigned long num_pages, npages;
+	size_t copied, len, cur_len, i;
+	struct kvec *to_send;
+	struct page **pages;
+	struct iov_iter it;
+	struct inode *inode;
+	struct cifsFileInfo *open_file;
+	struct cifsTconInfo *pTcon;
+	struct cifs_sb_info *cifs_sb;
+	int xid, rc;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
+
+	rc = generic_write_checks(file, poffset, &len, 0);
+	if (rc)
+		return rc;
+
+	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+
+	pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
+	if (!to_send) {
+		kfree(pages);
+		return -ENOMEM;
+	}
+
+	rc = cifs_write_allocate_pages(pages, num_pages);
+	if (rc) {
+		kfree(pages);
+		kfree(to_send);
+		return rc;
+	}
+
+	xid = GetXid();
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
+	inode = file->f_path.dentry->d_inode;
+
+	iov_iter_init(&it, iov, nr_segs, len, 0);
+	npages = num_pages;
+
+	do {
+		size_t save_len = cur_len;
+		for (i = 0; i < npages; i++) {
+			copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
+			copied = iov_iter_copy_from_user(pages[i], &it, 0,
+							 copied);
+			cur_len -= copied;
+			iov_iter_advance(&it, copied);
+			to_send[i+1].iov_base = kmap(pages[i]);
+			to_send[i+1].iov_len = copied;
+		}
+
+		cur_len = save_len - cur_len;
+
+		do {
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, false);
+				if (rc != 0)
+					break;
+			}
+			rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
+					   cur_len, *poffset, &written,
+					   to_send, npages, 0);
+		} while (rc == -EAGAIN);
+
+		for (i = 0; i < npages; i++)
+			kunmap(pages[i]);
+
+		if (written) {
+			len -= written;
+			total_written += written;
+			cifs_update_eof(CIFS_I(inode), *poffset, written);
+			*poffset += written;
+		} else if (rc < 0) {
+			if (!total_written)
+				total_written = rc;
+			break;
+		}
+
+		/* get length and number of kvecs of the next write */
+		npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+	} while (len > 0);
+
+	if (total_written > 0) {
+		spin_lock(&inode->i_lock);
+		if (*poffset > inode->i_size)
+			i_size_write(inode, *poffset);
+		spin_unlock(&inode->i_lock);
+	}
+
+	cifs_stats_bytes_written(pTcon, total_written);
+	mark_inode_dirty_sync(inode);
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(to_send);
+	kfree(pages);
+	FreeXid(xid);
+	return total_written;
+}
+
+static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	ssize_t written;
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	/*
+	 * BB - optimize the way when signing is disabled. We can drop this
+	 * extra memory-to-memory copying and use iovec buffers for constructing
+	 * write request.
+	 */
+
+	written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+	if (written > 0) {
+		CIFS_I(inode)->invalid_mapping = true;
+		iocb->ki_pos = pos;
+	}
+
+	return written;
+}
+
+ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+			   unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheAll)
+		return generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to write the data to the server exactly
+	 * from the pos to pos+len-1 rather than flush all affected pages
+	 * because it may cause a error with mandatory locks on these pages but
+	 * not on the region from pos to ppos+len-1.
+	 */
+
+	return cifs_user_writev(iocb, iov, nr_segs, pos);
+}
+
 static ssize_t
 cifs_iovec_read(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
-- 
cgit v1.2.2


From d39454ffe4a3c85428483b8a8a8e5e797b6363d5 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 24 Jan 2011 14:16:35 -0500
Subject: CIFS: Add strictcache mount option

Use for switching on strict cache mode. In this mode the
client reads from the cache all the time it has Oplock Level II,
otherwise - read from the server. As for write - the client stores
a data in the cache in Exclusive Oplock case, otherwise - write
directly to the server.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README    | 5 +++++
 fs/cifs/connect.c | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index 46af99ab3614..fe1683590828 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -452,6 +452,11 @@ A partial list of the supported mount options follows:
 		if oplock (caching token) is granted and held. Note that
 		direct allows write operations larger than page size
 		to be sent to the server.
+  strictcache   Use for switching on strict cache mode. In this mode the
+		client read from the cache all the time it has Oplock Level II,
+		otherwise - read from the server. All written data are stored
+		in the cache, but if the client doesn't have Exclusive Oplock,
+		it writes the data to the server.
   acl   	Allow setfacl and getfacl to manage posix ACLs if server
 		supports them.  (default)
   noacl 	Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0cc3b81c2e84..47034af67b09 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,6 +87,7 @@ struct smb_vol {
 	bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
 	bool server_ino:1; /* use inode numbers from server ie UniqueId */
 	bool direct_io:1;
+	bool strict_io:1; /* strict cache behavior */
 	bool remap:1;      /* set to remap seven reserved chars in filenames */
 	bool posix_paths:1; /* unset to not ask for posix pathnames. */
 	bool no_linux_ext:1;
@@ -1344,6 +1345,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
 			vol->direct_io = 1;
+		} else if (strnicmp(data, "strictcache", 11) == 0) {
+			vol->strict_io = 1;
 		} else if (strnicmp(data, "noac", 4) == 0) {
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
@@ -2584,6 +2587,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (pvolume_info->multiuser)
 		cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
 					    CIFS_MOUNT_NO_PERM);
+	if (pvolume_info->strict_io)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
 	if (pvolume_info->direct_io) {
 		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-- 
cgit v1.2.2


From ad3d2eedf0ed3611f5f86b9e4d0d15cc76c63465 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 17 Jan 2011 18:41:50 +0000
Subject: NFS4: Avoid potential NULL pointer dereference in
 decode_and_add_ds().

On Mon, 17 Jan 2011, Mi Jinlong wrote:

>
>
> Jesper Juhl:
> > strrchr() can return NULL if nothing is found. If this happens we'll
> > dereference a NULL pointer in
> > fs/nfs/nfs4filelayoutdev.c::decode_and_add_ds().
> >
> > I tried to find some other code that guarantees that this can never
> > happen but I was unsuccessful. So, unless someone else can point to some
> > code that ensures this can never be a problem, I believe this patch is
> > needed.
> >
> > While I was changing this code I also noticed that all the dprintk()
> > statements, except one, start with "%s:". The one missing the ":" I added
> > it to.
>
>   Maybe another one also should be changed at decode_and_add_ds() at line 243:
>
>    243  printk("%s Decoded address and port %s\n", __func__, buf);
>
Missed that one. Thanks.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayoutdev.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55a..f5c9b125e8cc 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -214,7 +214,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 
 	/* ipv6 length plus port is legal */
 	if (rlen > INET6_ADDRSTRLEN + 8) {
-		dprintk("%s Invalid address, length %d\n", __func__,
+		dprintk("%s: Invalid address, length %d\n", __func__,
 			rlen);
 		goto out_err;
 	}
@@ -225,6 +225,11 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 	/* replace the port dots with dashes for the in4_pton() delimiter*/
 	for (i = 0; i < 2; i++) {
 		char *res = strrchr(buf, '.');
+		if (!res) {
+			dprintk("%s: Failed finding expected dots in port\n",
+				__func__);
+			goto out_free;
+		}
 		*res = '-';
 	}
 
@@ -240,7 +245,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 	port = htons((tmp[0] << 8) | (tmp[1]));
 
 	ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
-	dprintk("%s Decoded address and port %s\n", __func__, buf);
+	dprintk("%s: Decoded address and port %s\n", __func__, buf);
 out_free:
 	kfree(buf);
 out_err:
-- 
cgit v1.2.2


From 839f7ad6932d95f4d5ae7267b95c574714ff3d5b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 15:54:57 +0000
Subject: NFS: Fix "kernel BUG at fs/aio.c:554!"

Nick Piggin reports:

> I'm getting use after frees in aio code in NFS
>
> [ 2703.396766] Call Trace:
> [ 2703.396858]  [<ffffffff8100b057>] ? native_sched_clock+0x27/0x80
> [ 2703.396959]  [<ffffffff8108509e>] ? put_lock_stats+0xe/0x40
> [ 2703.397058]  [<ffffffff81088348>] ? lock_release_holdtime+0xa8/0x140
> [ 2703.397159]  [<ffffffff8108a2a5>] lock_acquire+0x95/0x1b0
> [ 2703.397260]  [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> [ 2703.397361]  [<ffffffff81039701>] ? get_parent_ip+0x11/0x50
> [ 2703.397464]  [<ffffffff81612a31>] _raw_spin_lock_irq+0x41/0x80
> [ 2703.397564]  [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> [ 2703.397662]  [<ffffffff811627db>] aio_put_req+0x2b/0x60
> [ 2703.397761]  [<ffffffff811647fe>] do_io_submit+0x2be/0x7c0
> [ 2703.397895]  [<ffffffff81164d0b>] sys_io_submit+0xb/0x10
> [ 2703.397995]  [<ffffffff8100307b>] system_call_fastpath+0x16/0x1b
>
> Adding some tracing, it is due to nfs completing the request then
> returning something other than -EIOCBQUEUED, so aio.c
> also completes the request.

To address this, prevent the NFS direct I/O engine from completing
async iocbs when the forward path returns an error without starting
any I/O.

This fix appears to survive ^C during both "xfstest no. 208" and "fsx
-Z."

It's likely this bug has existed for a very long while, as we are seeing
very similar symptoms in OEL 5.  Copying stable.

Cc: Stable <stable@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6ace0d93c71..9943a75bb6d1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-
-	if (requested_bytes != 0)
-		return 0;
-
-	if (result < 0)
-		return result;
-	return -EIO;
+	return 0;
 }
 
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
 	if (put_dreq(dreq))
 		nfs_direct_write_complete(dreq, dreq->inode);
-
-	if (requested_bytes != 0)
-		return 0;
-
-	if (result < 0)
-		return result;
-	return -EIO;
+	return 0;
 }
 
 static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
-- 
cgit v1.2.2


From ee5dc7732bd557bae6d10873a0aac606d2c551fb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 03:05:18 +0000
Subject: NFS: Fix "kernel BUG at fs/nfs/nfs3xdr.c:1338!"

Milan Broz <mbroz@redhat.com> reports:

> on today Linus' tree I get OOps if using nfs.
>
> server (2.6.36) exports dir:
> /dir   172.16.1.0/24(rw,async,all_squash,no_subtree_check,anonuid=500,anongid=500)
>
> on client it is mounted  in fstab
> server:/dir  /mnt/tst  nfs  rw,soft 0 0
>
> and these commands OOpses it (simplified from a configure script):
>
> cd /dir
> touch x
> install x y
>
> [  105.327701] ------------[ cut here ]------------
> [  105.327979] kernel BUG at fs/nfs/nfs3xdr.c:1338!
> [  105.328075] invalid opcode: 0000 [#1] PREEMPT SMP
> [  105.328223] last sysfs file: /sys/devices/virtual/bdi/0:16/uevent
> [  105.328349] Modules linked in: usbcore dm_mod
> [  105.328553]
> [  105.328678] Pid: 3710, comm: install Not tainted 2.6.37+ #423 440BX Desktop Reference Platform/VMware Virtual Platform
> [  105.328853] EIP: 0060:[<c116c06c>] EFLAGS: 00010282 CPU: 0
> [  105.329152] EIP is at nfs3_xdr_enc_setacl3args+0x61/0x98
> [  105.329249] EAX: ffffffea EBX: ce941d98 ECX: 00000000 EDX: 00000004
> [  105.329340] ESI: ce941cd0 EDI: 000000a4 EBP: ce941cc0 ESP: ce941cb4
> [  105.329431]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> [  105.329525] Process install (pid: 3710, ti=ce940000 task=ced36f20 task.ti=ce940000)
> [  105.336600] Stack:
> [  105.336693]  ce941cd0 ce9dc000 00000000 ce941cf8 c12ecd02 c12f43e0 c116c00b cf754158
> [  105.336982]  ce9dc004 cf754284 ce9dc004 cf7ffee8 ceff9978 ce9dc000 cf7ffee8 ce9dc000
> [  105.337182]  ce9dc000 ce941d14 c12e698d cf75412c ce941d98 cf7ffee8 cf7fff20 00000000
> [  105.337405] Call Trace:
> [  105.337695]  [<c12ecd02>] rpcauth_wrap_req+0x75/0x7f
> [  105.337806]  [<c12f43e0>] ? xdr_encode_opaque+0x12/0x15
> [  105.337898]  [<c116c00b>] ? nfs3_xdr_enc_setacl3args+0x0/0x98
> [  105.337988]  [<c12e698d>] call_transmit+0x17e/0x1e8
> [  105.338072]  [<c12ec307>] __rpc_execute+0x6d/0x1a6
> [  105.338155]  [<c12ec474>] rpc_execute+0x34/0x37
> [  105.338235]  [<c12e738d>] rpc_run_task+0xb5/0xbd
> [  105.338316]  [<c12e7474>] rpc_call_sync+0x3d/0x58
> [  105.338402]  [<c116d0c6>] nfs3_proc_setacls+0x18e/0x24f
> [  105.338493]  [<c10b3f76>] ? __kmalloc+0x148/0x1c4
> [  105.338579]  [<c10ecd01>] ? posix_acl_alloc+0x12/0x22
> [  105.338665]  [<c116d5c8>] nfs3_proc_setacl+0xa0/0xca
> [  105.338748]  [<c116d69c>] nfs3_setxattr+0x62/0x88
> [  105.338834]  [<c1317042>] ? sub_preempt_count+0x7c/0x89
> [  105.338926]  [<c116d63a>] ? nfs3_setxattr+0x0/0x88
> [  105.339026]  [<c10cfa79>] __vfs_setxattr_noperm+0x26/0x95
> [  105.339114]  [<c10cfb43>] vfs_setxattr+0x5b/0x76
> [  105.339211]  [<c10cfbfb>] setxattr+0x9d/0xc3
> [  105.339298]  [<c10a2ea8>] ? handle_pte_fault+0x258/0x5cb
> [  105.339428]  [<c1091ff6>] ? __free_pages+0x1a/0x23
> [  105.339517]  [<c10498ea>] ? up_read+0x16/0x2c
> [  105.339599]  [<c10b8365>] ? fget+0x0/0xa3
> [  105.339677]  [<c10b8365>] ? fget+0x0/0xa3
> [  105.339760]  [<c1025d23>] ? get_parent_ip+0xb/0x31
> [  105.339843]  [<c1317042>] ? sub_preempt_count+0x7c/0x89
> [  105.339931]  [<c10cfc72>] sys_fsetxattr+0x51/0x79
> [  105.340014]  [<c1002853>] sysenter_do_call+0x12/0x32
> [  105.340133] Code: 2e 76 18 00 58 31 d2 8b 7f 28 f6 43 04 01 74 03 8b 53 08 6a 00 8b 46 04 6a 01 8b 0b 52 89 fa e8 85 10 f8 ff 83 c4 0c 85 c0 79 04 <0f> 0b eb fe 31 c9 f6 43 04 04 74 03 8b 4b 0c 68 00 10 00 00 8d
> [  105.350321] EIP: [<c116c06c>] nfs3_xdr_enc_setacl3args+0x61/0x98 SS:ESP 0068:ce941cb4
> [  105.364385] ---[ end trace 01fcfe7f0f7f6e4a ]---

nfs3_xdr_enc_setacl3args() is not properly setting up the target
buffer before nfsacl_encode() attempts to encode the ACL.

Introduced by commit d9c407b1 "NFS: Introduce new-style XDR encoding
functions for NFSv3."

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3xdr.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01c5e8b1941d..183c6b123d0f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
 
 	encode_nfs_fh3(xdr, NFS_FH(args->inode));
 	encode_uint32(xdr, args->mask);
+
+	base = req->rq_slen;
 	if (args->npages != 0)
 		xdr_write_pages(xdr, args->pages, 0, args->len);
+	else
+		xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
 
-	base = req->rq_slen;
 	error = nfsacl_encode(xdr->buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
 			    args->acl_access : NULL, 1, 0);
-- 
cgit v1.2.2


From 731f3f482ad3b2c58a1af2d0a9a634a82803706a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 03:05:28 +0000
Subject: NFS: nfsacl_{encode,decode} should return signed integer

Clean up.

The nfsacl_encode() and nfsacl_decode() functions return negative
errno values, and each call site verifies that the returned value
is not negative.  Change the synopsis of both of these functions
to reflect this usage.

Document the synopsis and return values.

Reported-by: Trond Myklebust <trond.myklebust@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs_common/nfsacl.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c03..a3e78bd18679 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -72,9 +72,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 	return 0;
 }
 
-unsigned int
-nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
-	      struct posix_acl *acl, int encode_entries, int typeflag)
+/**
+ * nfsacl_encode - Encode an NFSv3 ACL
+ *
+ * @buf: destination xdr_buf to contain XDR encoded ACL
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Returns size of encoded ACL in bytes or a negative errno value.
+ */
+int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+		  struct posix_acl *acl, int encode_entries, int typeflag)
 {
 	int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
 	struct nfsacl_encode_desc nfsacl_desc = {
@@ -224,9 +235,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
 	return 0;
 }
 
-unsigned int
-nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
-	      struct posix_acl **pacl)
+/**
+ * nfsacl_decode - Decode an NFSv3 ACL
+ *
+ * @buf: xdr_buf containing XDR'd ACL data to decode
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @aclcnt: count of ACEs in decoded posix_acl
+ * @pacl: buffer in which to place decoded posix_acl
+ *
+ * Returns the length of the decoded ACL in bytes, or a negative errno value.
+ */
+int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+		  struct posix_acl **pacl)
 {
 	struct nfsacl_decode_desc nfsacl_desc = {
 		.desc = {
-- 
cgit v1.2.2


From f61f6da0d53842e849bab7f69e1431bd3de1136d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 03:05:38 +0000
Subject: NFS: Prevent memory allocation failure in nfsacl_encode()

nfsacl_encode() allocates memory in certain cases.  This of course
is not guaranteed to work.

Since commit 9f06c719 "SUNRPC: New xdr_streams XDR encoder API", the
kernel's XDR encoders can't return a result indicating possibly a
failure, so a memory allocation failure in nfsacl_encode() has become
fatal (ie, the XDR code Oopses) in some cases.

However, the allocated memory is a tiny fixed amount, on the order
of 40-50 bytes.  We can easily use a stack-allocated buffer for
this, with only a wee bit of nose-holding.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3acl.c       |  4 ++--
 fs/nfs_common/nfsacl.c | 22 +++++++++++++++-------
 fs/posix_acl.c         | 17 +++++++++++++----
 3 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e2..274342771655 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
 		goto out;
 
-	/* We are doing this here, because XDR marshalling can only
-	   return -ENOMEM. */
+	/* We are doing this here because XDR marshalling does not
+	 * return any results, it BUGs. */
 	status = -ENOSPC;
 	if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
 		goto out;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index a3e78bd18679..84c27d69d421 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
 	gid_t gid;
 };
 
+struct nfsacl_simple_acl {
+	struct posix_acl acl;
+	struct posix_acl_entry ace[4];
+};
+
 static int
 xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 {
@@ -99,17 +104,22 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 		.uid = inode->i_uid,
 		.gid = inode->i_gid,
 	};
+	struct nfsacl_simple_acl aclbuf;
 	int err;
-	struct posix_acl *acl2 = NULL;
 
 	if (entries > NFS_ACL_MAX_ENTRIES ||
 	    xdr_encode_word(buf, base, entries))
 		return -EINVAL;
 	if (encode_entries && acl && acl->a_count == 3) {
-		/* Fake up an ACL_MASK entry. */
-		acl2 = posix_acl_alloc(4, GFP_KERNEL);
-		if (!acl2)
-			return -ENOMEM;
+		struct posix_acl *acl2 = &aclbuf.acl;
+
+		/* Avoid the use of posix_acl_alloc().  nfsacl_encode() is
+		 * invoked in contexts where a memory allocation failure is
+		 * fatal.  Fortunately this fake ACL is small enough to
+		 * construct on the stack. */
+		memset(acl2, 0, sizeof(acl2));
+		posix_acl_init(acl2, 4);
+
 		/* Insert entries in canonical order: other orders seem
 		 to confuse Solaris VxFS. */
 		acl2->a_entries[0] = acl->a_entries[0];  /* ACL_USER_OBJ */
@@ -120,8 +130,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 		nfsacl_desc.acl = acl2;
 	}
 	err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
-	if (acl2)
-		posix_acl_release(acl2);
 	if (!err)
 		err = 8 + nfsacl_desc.desc.elem_size *
 			  nfsacl_desc.desc.array_len;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec25..b1cf6bf4b41d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
 
 #include <linux/errno.h>
 
+EXPORT_SYMBOL(posix_acl_init);
 EXPORT_SYMBOL(posix_acl_alloc);
 EXPORT_SYMBOL(posix_acl_clone);
 EXPORT_SYMBOL(posix_acl_valid);
@@ -31,6 +32,16 @@ EXPORT_SYMBOL(posix_acl_create_masq);
 EXPORT_SYMBOL(posix_acl_chmod_masq);
 EXPORT_SYMBOL(posix_acl_permission);
 
+/*
+ * Init a fresh posix_acl
+ */
+void
+posix_acl_init(struct posix_acl *acl, int count)
+{
+	atomic_set(&acl->a_refcount, 1);
+	acl->a_count = count;
+}
+
 /*
  * Allocate a new ACL with the specified number of entries.
  */
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
 	const size_t size = sizeof(struct posix_acl) +
 	                    count * sizeof(struct posix_acl_entry);
 	struct posix_acl *acl = kmalloc(size, flags);
-	if (acl) {
-		atomic_set(&acl->a_refcount, 1);
-		acl->a_count = count;
-	}
+	if (acl)
+		posix_acl_init(acl, count);
 	return acl;
 }
 
-- 
cgit v1.2.2


From 80c30e8de4f81851b1f712bcc596e11d53bc76f1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Jan 2011 20:50:26 +0000
Subject: NLM: Fix "kernel BUG at fs/lockd/host.c:417!" or ".../host.c:283!"

Nick Bowler <nbowler@elliptictech.com> reports:

> We were just having some NFS server troubles, and my client machine
> running 2.6.38-rc1+ (specifically, commit 2b1caf6ed7b888c95) crashed
> hard (syslog output appended to this mail).
>
> I'm not sure what the exact timeline was or how to reproduce this,
> but the server was rebooted during all this.  Since I've never seen
> this happen before, it is possibly a regression from previous kernel
> releases.  However, I recently updated my nfs-utils (on the client) to
> version 1.2.3, so that might be related as well.

  [ BUG output redacted ]

When done searching, the for_each_host loop in next_host_state() falls
through and returns the final host on the host chain without bumping
it's reference count.

Since the host's ref count is only one at that point, releasing the
host in nlm_host_rebooted() attempts to destroy the host prematurely,
and therefore hits a BUG().

Likely, the original intent of the for_each_host behavior in
next_host_state() was to handle the case when the host chain is empty.
Searching the chain and finding no suitable host to return needs to be
handled as well.

Defensively restructure next_host_state() always to return NULL when
the loop falls through.

Introduced by commit b10e30f6 "lockd: reorganize nlm_host_rebooted".

Cc: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5f1bcb2f06f3..b7c99bfb3da6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -520,7 +520,7 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 					struct nsm_handle *nsm,
 					const struct nlm_reboot *info)
 {
-	struct nlm_host *host = NULL;
+	struct nlm_host *host;
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 
@@ -532,12 +532,13 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 			host->h_state++;
 
 			nlm_get_host(host);
-			goto out;
+			mutex_unlock(&nlm_host_mutex);
+			return host;
 		}
 	}
-out:
+
 	mutex_unlock(&nlm_host_mutex);
-	return host;
+	return NULL;
 }
 
 /**
-- 
cgit v1.2.2


From 778be232a207e79088ba70d832ac25dfea6fbf1a Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 15:38:01 +0000
Subject: NFS do not find client in NFSv4 pg_authenticate

The information required to find the nfs_client cooresponding to the incoming
back channel request is contained in the NFS layer. Perform minimal checking
in the RPC layer pg_authenticate method, and push more detailed checking into
the NFS layer where the nfs_client can be found.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c      | 109 +++++++++++++------------------------------------
 fs/nfs/callback.h      |   4 +-
 fs/nfs/callback_proc.c |  10 +----
 fs/nfs/callback_xdr.c  |   5 +--
 fs/nfs/client.c        |  15 +++----
 fs/nfs/internal.h      |   3 +-
 fs/nfs/nfs4state.c     |   6 ---
 7 files changed, 41 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 199016528fcb..e3d294269058 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -134,33 +134,6 @@ out_err:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- *  * CB_SEQUENCE operations will fail until the callback sessionid is set.
- *   */
-int nfs4_set_callback_sessionid(struct nfs_client *clp)
-{
-	struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
-	struct nfs4_sessionid *bc_sid;
-
-	if (!serv->sv_bc_xprt)
-		return -EINVAL;
-
-	/* on success freed in xprt_free */
-	bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
-	if (!bc_sid)
-		return -ENOMEM;
-	memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
-		NFS4_MAX_SESSIONID_LEN);
-	spin_lock_bh(&serv->sv_cb_lock);
-	serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
-	spin_unlock_bh(&serv->sv_cb_lock);
-	dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
-		((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
-		((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
-		serv->sv_bc_xprt);
-	return 0;
-}
-
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -266,10 +239,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct nfs_callback_data *cb_info)
 {
 }
-int nfs4_set_callback_sessionid(struct nfs_client *clp)
-{
-	return 0;
-}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -359,78 +328,58 @@ void nfs_callback_down(int minorversion)
 	mutex_unlock(&nfs_callback_mutex);
 }
 
-static int check_gss_callback_principal(struct nfs_client *clp,
-					struct svc_rqst *rqstp)
+/* Boolean check of RPC_AUTH_GSS principal */
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
 {
 	struct rpc_clnt *r = clp->cl_rpcclient;
 	char *p = svc_gss_principal(rqstp);
 
+	if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+		return 1;
+
 	/* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
 	if (clp->cl_minorversion != 0)
-		return SVC_DROP;
+		return 0;
 	/*
 	 * It might just be a normal user principal, in which case
 	 * userspace won't bother to tell us the name at all.
 	 */
 	if (p == NULL)
-		return SVC_DENIED;
+		return 0;
 
 	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
 	if (memcmp(p, "nfs@", 4) != 0)
-		return SVC_DENIED;
+		return 0;
 	p += 4;
 	if (strcmp(p, r->cl_server) != 0)
-		return SVC_DENIED;
-	return SVC_OK;
+		return 0;
+	return 1;
 }
 
-/* pg_authenticate method helper */
-static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp)
-{
-	struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp);
-	int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0;
-
-	dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc);
-	if (svc_is_backchannel(rqstp))
-		/* Sessionid (usually) set after CB_NULL ping */
-		return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
-						  is_cb_compound);
-	else
-		/* No callback identifier in pg_authenticate */
-		return nfs4_find_client_no_ident(svc_addr(rqstp));
-}
-
-/* pg_authenticate method for nfsv4 callback threads. */
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Drop packets with incorrect authflavor.
+ *
+ * All other checking done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ */
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-	struct nfs_client *clp;
-	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-	int ret = SVC_OK;
-
-	/* Don't talk to strangers */
-	clp = nfs_cb_find_client(rqstp);
-	if (clp == NULL)
-		return SVC_DROP;
-
-	dprintk("%s: %s NFSv4 callback!\n", __func__,
-			svc_print_addr(rqstp, buf, sizeof(buf)));
-
 	switch (rqstp->rq_authop->flavour) {
-		case RPC_AUTH_NULL:
-			if (rqstp->rq_proc != CB_NULL)
-				ret = SVC_DENIED;
-			break;
-		case RPC_AUTH_UNIX:
-			break;
-		case RPC_AUTH_GSS:
-			ret = check_gss_callback_principal(clp, rqstp);
-			break;
-		default:
-			ret = SVC_DENIED;
+	case RPC_AUTH_NULL:
+		if (rqstp->rq_proc != CB_NULL)
+			return SVC_DROP;
+		break;
+	case RPC_AUTH_GSS:
+		/* No RPC_AUTH_GSS support yet in NFSv4.1 */
+		 if (svc_is_backchannel(rqstp))
+			return SVC_DROP;
 	}
-	nfs_put_client(clp);
-	return ret;
+	return SVC_OK;
 }
 
 /*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d3b44f9bd747..46d93ce7311b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
  */
 #ifndef __LINUX_FS_NFS_CALLBACK_H
 #define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
 
 #define NFS4_CALLBACK 0x40000000
 #define NFS4_CALLBACK_XDRSIZE 2048
@@ -37,7 +38,6 @@ enum nfs4_callback_opnum {
 struct cb_process_state {
 	__be32			drc_status;
 	struct nfs_client	*clp;
-	struct nfs4_sessionid	*svc_sid; /* v4.1 callback service sessionid */
 };
 
 struct cb_compound_hdr_arg {
@@ -168,7 +168,7 @@ extern unsigned nfs4_callback_layoutrecall(
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
 extern void nfs4_cb_take_slot(struct nfs_client *clp);
 #endif /* CONFIG_NFS_V4_1 */
-
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 				    struct cb_getattrres *res,
 				    struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 4bb91cb2620d..829f406e91dd 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -373,17 +373,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 {
 	struct nfs_client *clp;
 	int i;
-	__be32 status;
+	__be32 status = htonl(NFS4ERR_BADSESSION);
 
 	cps->clp = NULL;
 
-	status = htonl(NFS4ERR_BADSESSION);
-	/* Incoming session must match the callback session */
-	if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
-		goto out;
-
-	clp = nfs4_find_client_sessionid(args->csa_addr,
-					 &args->csa_sessionid, 1);
+	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
 	if (clp == NULL)
 		goto out;
 
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 23112c263f81..14e0f9371d14 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -794,10 +794,9 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 
 	if (hdr_arg.minorversion == 0) {
 		cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
-		if (!cps.clp)
+		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
 			return rpc_drop_reply;
-	} else
-		cps.svc_sid = bc_xprt_sid(rqstp);
+	}
 
 	hdr_res.taglen = hdr_arg.taglen;
 	hdr_res.tag = hdr_arg.tag;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 192f2f860265..bd3ca32879e7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1206,16 +1206,11 @@ nfs4_find_client_ident(int cb_ident)
  * For CB_COMPOUND calls, find a client by IP address, protocol version,
  * minorversion, and sessionID
  *
- * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
- * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
- * can arrive before the callback sessionid is set. For CB_NULL calls,
- * find a client by IP address protocol version, and minorversion.
- *
  * Returns NULL if no such client
  */
 struct nfs_client *
 nfs4_find_client_sessionid(const struct sockaddr *addr,
-			   struct nfs4_sessionid *sid, int is_cb_compound)
+			   struct nfs4_sessionid *sid)
 {
 	struct nfs_client *clp;
 
@@ -1227,9 +1222,9 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
 		if (!nfs4_has_session(clp))
 			continue;
 
-		/* Match sessionid unless cb_null call*/
-		if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data,
-		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0))
+		/* Match sessionid*/
+		if (memcmp(clp->cl_session->sess_id.data,
+		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
 			continue;
 
 		atomic_inc(&clp->cl_count);
@@ -1244,7 +1239,7 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
 
 struct nfs_client *
 nfs4_find_client_sessionid(const struct sockaddr *addr,
-			   struct nfs4_sessionid *sid, int is_cb_compound)
+			   struct nfs4_sessionid *sid)
 {
 	return NULL;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4644f04b4b46..cf9fdbdabc67 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -133,8 +133,7 @@ extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
 extern struct nfs_client *nfs4_find_client_ident(int);
 extern struct nfs_client *
-nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *,
-			   int);
+nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
 					struct nfs_fh *);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2336d532cf66..e6742b57a04c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -232,12 +232,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	status = nfs4_proc_create_session(clp);
 	if (status != 0)
 		goto out;
-	status = nfs4_set_callback_sessionid(clp);
-	if (status != 0) {
-		printk(KERN_WARNING "Sessionid not set. No callback service\n");
-		nfs_callback_down(1);
-		status = 0;
-	}
 	nfs41_setup_state_renewal(clp);
 	nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
-- 
cgit v1.2.2


From 2c4cdf8f6d3cfb48036400952329555099c8c92c Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 15:38:02 +0000
Subject: NFS fix cb_sequence error processing

Always assign the cb_process_state nfs_client pointer so a processing error
in cb_sequence after the nfs_client is found and referenced returns
a non-NULL cb_process_state nfs_client and the matching nfs_put_client in
nfs4_callback_compound dereferences the client.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 829f406e91dd..89587573fe50 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -408,9 +408,9 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	nfs4_cb_take_slot(clp);
-	cps->clp = clp; /* put in nfs4_callback_compound */
 
 out:
+	cps->clp = clp; /* put in nfs4_callback_compound */
 	for (i = 0; i < args->csa_nrclists; i++)
 		kfree(args->csa_rclists[i].rcl_refcalls);
 	kfree(args->csa_rclists);
-- 
cgit v1.2.2


From b2a2897dc4a59684321de425652061c62a0569d0 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 15:38:03 +0000
Subject: NFS improve pnfs_put_deviceid_cache debug print

What we really want to know is the ref count.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bc4089769735..1b1bc1a0fb0a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -951,7 +951,7 @@ pnfs_put_deviceid_cache(struct nfs_client *clp)
 {
 	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
 
-	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+	dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
 	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
 		int i;
 		/* Verify cache is empty */
-- 
cgit v1.2.2


From 27dc1cd3ad9300f81e1219e5fc305d91d85353f8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 25 Jan 2011 15:28:21 -0500
Subject: NFS: nfs_wcc_update_inode() should set nfsi->attr_gencount

If the call to nfs_wcc_update_inode() results in an attribute update, we
need to ensure that the inode's attr_gencount gets bumped too, otherwise
we are not protected against races with other GETATTR calls.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d8512423ba72..1cc600e77bb4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -881,9 +881,10 @@ out:
 	return ret;
 }
 
-static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long ret = 0;
 
 	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
 			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -891,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->change_attr = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		ret |= NFS_INO_INVALID_ATTR;
 	}
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
 			&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
-			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
-			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+		ret |= NFS_INO_INVALID_ATTR;
+	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
 			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
-			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-			if (S_ISDIR(inode->i_mode))
-				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+		if (S_ISDIR(inode->i_mode))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		ret |= NFS_INO_INVALID_ATTR;
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
 			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
-			&& nfsi->npages == 0)
-			i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+			&& nfsi->npages == 0) {
+		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+		ret |= NFS_INO_INVALID_ATTR;
+	}
+	return ret;
 }
 
 /**
@@ -1223,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
-	nfs_wcc_update_inode(inode, fattr);
+	invalid |= nfs_wcc_update_inode(inode, fattr);
 
 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
-- 
cgit v1.2.2


From 3689456b4bd36027022b3215eb2acba51cd0e6b5 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Tue, 25 Jan 2011 15:07:34 -0800
Subject: squashfs: fix use of uninitialised variable in zlib & xz
 decompressors

Fix potential use of uninitialised variable caused by recent
decompressor code optimisations.

In zlib_uncompress (zlib_wrapper.c) we have

	int zlib_err, zlib_init = 0;
	...
	do {
		...
			if (avail == 0) {
				offset = 0;
				put_bh(bh[k++]);
				continue;
			}
		...
		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
		...
	} while (zlib_err == Z_OK);

If continue is executed (avail == 0) then the while condition will be
evaluated testing zlib_err, which is uninitialised first time around the
loop.

Fix this by getting rid of the 'if (avail == 0)' condition test, this
edge condition should not be being handled in the decompressor code, and
instead handle it generically in the caller code.

Similarly for xz_wrapper.c.

Incidentally, on most architectures (bar Mips and Parisc), no
uninitialised variable warning is generated by gcc, this is because the
while condition test on continue is optimised out and not performed
(when executing continue zlib_err has not been changed since entering
the loop, and logically if the while condition was true previously, then
it's still true).

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
Reported-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/squashfs/block.c        | 8 ++++++++
 fs/squashfs/xz_wrapper.c   | 6 ------
 fs/squashfs/zlib_wrapper.c | 6 ------
 3 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2fb2882f0fa7..8ab48bc2fa7d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -63,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
 		*length = (unsigned char) bh->b_data[*offset] |
 			(unsigned char) bh->b_data[*offset + 1] << 8;
 		*offset += 2;
+
+		if (*offset == msblk->devblksize) {
+			put_bh(bh);
+			bh = sb_bread(sb, ++(*cur_index));
+			if (bh == NULL)
+				return NULL;
+			*offset = 0;
+		}
 	}
 
 	return bh;
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 856756ca5ee4..c4eb40018256 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -95,12 +95,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			if (!buffer_uptodate(bh[k]))
 				goto release_mutex;
 
-			if (avail == 0) {
-				offset = 0;
-				put_bh(bh[k++]);
-				continue;
-			}
-
 			stream->buf.in = bh[k]->b_data + offset;
 			stream->buf.in_size = avail;
 			stream->buf.in_pos = 0;
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 818a5e063faf..4661ae2b1cec 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -82,12 +82,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			if (!buffer_uptodate(bh[k]))
 				goto release_mutex;
 
-			if (avail == 0) {
-				offset = 0;
-				put_bh(bh[k++]);
-				continue;
-			}
-
 			stream->next_in = bh[k]->b_data + offset;
 			stream->avail_in = avail;
 			offset = 0;
-- 
cgit v1.2.2


From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Tue, 25 Jan 2011 15:07:35 -0800
Subject: console: rename acquire/release_console_sem() to
 console_lock/unlock()

The -rt patches change the console_semaphore to console_mutex.  As a
result, a quite large chunk of the patches changes all
acquire/release_console_sem() to acquire/release_console_mutex()

This commit makes things use more neutral function names which dont make
implications about the underlying lock.

The only real change is the return value of console_trylock which is
inverted from try_acquire_console_sem()

This patch also paves the way to switching console_sem from a semaphore to
a mutex.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert]
Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Thomas Gleixner <tglx@tglx.de>
Cc: Greg KH <gregkh@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/consoles.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index eafc22ab1fdd..b701eaa482bf 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 	struct console *con;
 	loff_t off = 0;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(con)
 		if (off++ == *pos)
 			break;
@@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void c_stop(struct seq_file *m, void *v)
 {
-	release_console_sem();
+	console_unlock();
 }
 
 static const struct seq_operations consoles_op = {
-- 
cgit v1.2.2


From c7a360b05b5430ac1d75dc7d53c586ada60a05cb Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 19:15:32 -0500
Subject: NFS construct consistent co_ownerid for v4.1

As stated in section 2.4 of RFC 5661, subsequent instances of the client need
to present the same co_ownerid. Concatinate the client's IP dot address,
host name, and the rpc_auth pseudoflavor to form the co_ownerid.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9d992b0346e3..78936a8f40ab 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -50,6 +50,7 @@
 #include <linux/module.h>
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
+#include <linux/utsname.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -4572,27 +4573,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
 	args.verifier = &verifier;
 
-	while (1) {
-		args.id_len = scnprintf(args.id, sizeof(args.id),
-					"%s/%s %u",
-					clp->cl_ipaddr,
-					rpc_peeraddr2str(clp->cl_rpcclient,
-							 RPC_DISPLAY_ADDR),
-					clp->cl_id_uniquifier);
-
-		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
-
-		if (status != -NFS4ERR_CLID_INUSE)
-			break;
-
-		if (signalled())
-			break;
-
-		if (++clp->cl_id_uniquifier == 0)
-			break;
-	}
+	args.id_len = scnprintf(args.id, sizeof(args.id),
+				"%s/%s.%s/%u",
+				clp->cl_ipaddr,
+				init_utsname()->nodename,
+				init_utsname()->domainname,
+				clp->cl_rpcclient->cl_auth->au_flavor);
 
-	status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+	if (!status)
+		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
-- 
cgit v1.2.2


From 8eb2d829ffea3677c21bd038f19e5d8ca6b43e36 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:48:01 +0800
Subject: btrfs: Fix threshold calculation for block groups smaller than 1GB

If a block group is smaller than 1GB, the extent entry threadhold
calculation will always set the threshold to 0.

So as free space gets fragmented, btrfs will switch to use bitmap
to manage free space, but then will never switch back to extents
due to this bug.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..42f4015988ec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1016,14 +1016,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
 	u64 max_bytes;
 	u64 bitmap_bytes;
 	u64 extent_bytes;
+	u64 size = block_group->key.offset;
 
 	/*
 	 * The goal is to keep the total amount of memory used per 1gb of space
 	 * at or below 32k, so we need to adjust how much memory we allow to be
 	 * used by extent based free space tracking
 	 */
-	max_bytes = MAX_CACHE_BYTES_PER_GIG *
-		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+	if (size < 1024 * 1024 * 1024)
+		max_bytes = MAX_CACHE_BYTES_PER_GIG;
+	else
+		max_bytes = MAX_CACHE_BYTES_PER_GIG *
+			div64_u64(size, 1024 * 1024 * 1024);
 
 	/*
 	 * we want to account for 1 more bitmap than what we have so we can make
-- 
cgit v1.2.2


From edf6e2d1ddbac7f326b34a27adbca71ece53ccce Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:50:07 +0800
Subject: btrfs: Add helper function free_bitmap()

Remove some duplicated code.

This prepares for the next patch.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 42f4015988ec..850104f05178 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1175,6 +1175,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
 	recalculate_thresholds(block_group);
 }
 
+static void free_bitmap(struct btrfs_block_group_cache *block_group,
+			struct btrfs_free_space *bitmap_info)
+{
+	unlink_free_space(block_group, bitmap_info);
+	kfree(bitmap_info->bitmap);
+	kfree(bitmap_info);
+	block_group->total_bitmaps--;
+	recalculate_thresholds(block_group);
+}
+
 static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
 			      struct btrfs_free_space *bitmap_info,
 			      u64 *offset, u64 *bytes)
@@ -1215,13 +1225,8 @@ again:
 
 	if (*bytes) {
 		struct rb_node *next = rb_next(&bitmap_info->offset_index);
-		if (!bitmap_info->bytes) {
-			unlink_free_space(block_group, bitmap_info);
-			kfree(bitmap_info->bitmap);
-			kfree(bitmap_info);
-			block_group->total_bitmaps--;
-			recalculate_thresholds(block_group);
-		}
+		if (!bitmap_info->bytes)
+			free_bitmap(block_group, bitmap_info);
 
 		/*
 		 * no entry after this bitmap, but we still have bytes to
@@ -1254,13 +1259,8 @@ again:
 			return -EAGAIN;
 
 		goto again;
-	} else if (!bitmap_info->bytes) {
-		unlink_free_space(block_group, bitmap_info);
-		kfree(bitmap_info->bitmap);
-		kfree(bitmap_info);
-		block_group->total_bitmaps--;
-		recalculate_thresholds(block_group);
-	}
+	} else if (!bitmap_info->bytes)
+		free_bitmap(block_group, bitmap_info);
 
 	return 0;
 }
@@ -1689,13 +1689,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	ret = offset;
 	if (entry->bitmap) {
 		bitmap_clear_bits(block_group, entry, offset, bytes);
-		if (!entry->bytes) {
-			unlink_free_space(block_group, entry);
-			kfree(entry->bitmap);
-			kfree(entry);
-			block_group->total_bitmaps--;
-			recalculate_thresholds(block_group);
-		}
+		if (!entry->bytes)
+			free_bitmap(block_group, entry);
 	} else {
 		unlink_free_space(block_group, entry);
 		entry->offset += bytes;
-- 
cgit v1.2.2


From 70b7da304f9f9bbf1566085155895e32e775a745 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:51:45 +0800
Subject: btrfs: Free fully occupied bitmap in cluster

If there's no more free space in a bitmap, we should free it.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 850104f05178..cb0137e4047f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1788,6 +1788,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 
 	ret = search_start;
 	bitmap_clear_bits(block_group, entry, ret, bytes);
+	if (entry->bytes == 0)
+		free_bitmap(block_group, entry);
 out:
 	spin_unlock(&cluster->lock);
 	spin_unlock(&block_group->tree_lock);
-- 
cgit v1.2.2


From 5e71b5d5ec07e4b3fb4c78c4e4b108ff667f123f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:55:34 +0800
Subject: btrfs: Update stats when allocating from a cluster

When allocating extent entry from a cluster, we should update
the free_space and free_extents fields of the block group.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb0137e4047f..2974c4744d5c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1843,15 +1843,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 		entry->offset += bytes;
 		entry->bytes -= bytes;
 
-		if (entry->bytes == 0) {
+		if (entry->bytes == 0)
 			rb_erase(&entry->offset_index, &cluster->root);
-			kfree(entry);
-		}
 		break;
 	}
 out:
 	spin_unlock(&cluster->lock);
 
+	if (!ret)
+		return 0;
+
+	spin_lock(&block_group->tree_lock);
+
+	block_group->free_space -= bytes;
+	if (entry->bytes == 0) {
+		block_group->free_extents--;
+		kfree(entry);
+	}
+
+	spin_unlock(&block_group->tree_lock);
+
 	return ret;
 }
 
-- 
cgit v1.2.2


From 120d66eec0dcb966fbd03f743598b2ff2513436b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:56:50 +0800
Subject: btrfs: Add a helper try_merge_free_space()

When adding a new extent, we'll firstly see if we can merge
this extent to the left or/and right extent. Extract this as
a helper try_merge_free_space().

As a side effect, we fix a small bug that if the new extent
has non-bitmap left entry but is unmergeble, we'll directly
link the extent without trying to drop it into bitmap.

This also prepares for the next patch.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 75 ++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2974c4744d5c..cf67dc3b7bf8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1363,22 +1363,14 @@ out:
 	return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
+bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
+			  struct btrfs_free_space *info)
 {
-	struct btrfs_free_space *right_info = NULL;
-	struct btrfs_free_space *left_info = NULL;
-	struct btrfs_free_space *info = NULL;
-	int ret = 0;
-
-	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
-	if (!info)
-		return -ENOMEM;
-
-	info->offset = offset;
-	info->bytes = bytes;
-
-	spin_lock(&block_group->tree_lock);
+	struct btrfs_free_space *left_info;
+	struct btrfs_free_space *right_info;
+	bool merged = false;
+	u64 offset = info->offset;
+	u64 bytes = info->bytes;
 
 	/*
 	 * first we want to see if there is free space adjacent to the range we
@@ -1392,27 +1384,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	else
 		left_info = tree_search_offset(block_group, offset - 1, 0, 0);
 
-	/*
-	 * If there was no extent directly to the left or right of this new
-	 * extent then we know we're going to have to allocate a new extent, so
-	 * before we do that see if we need to drop this into a bitmap
-	 */
-	if ((!left_info || left_info->bitmap) &&
-	    (!right_info || right_info->bitmap)) {
-		ret = insert_into_bitmap(block_group, info);
-
-		if (ret < 0) {
-			goto out;
-		} else if (ret) {
-			ret = 0;
-			goto out;
-		}
-	}
-
 	if (right_info && !right_info->bitmap) {
 		unlink_free_space(block_group, right_info);
 		info->bytes += right_info->bytes;
 		kfree(right_info);
+		merged = true;
 	}
 
 	if (left_info && !left_info->bitmap &&
@@ -1421,8 +1397,43 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 		info->offset = left_info->offset;
 		info->bytes += left_info->bytes;
 		kfree(left_info);
+		merged = true;
 	}
 
+	return merged;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+	if (!info)
+		return -ENOMEM;
+
+	info->offset = offset;
+	info->bytes = bytes;
+
+	spin_lock(&block_group->tree_lock);
+
+	if (try_merge_free_space(block_group, info))
+		goto link;
+
+	/*
+	 * There was no extent directly to the left or right of this new
+	 * extent then we know we're going to have to allocate a new extent, so
+	 * before we do that see if we need to drop this into a bitmap
+	 */
+	ret = insert_into_bitmap(block_group, info);
+	if (ret < 0) {
+		goto out;
+	} else if (ret) {
+		ret = 0;
+		goto out;
+	}
+link:
 	ret = link_free_space(block_group, info);
 	if (ret)
 		kfree(info);
-- 
cgit v1.2.2


From f333adb5d64bc1c4d6099072fc341c3c8f84e0cf Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:57:39 +0800
Subject: btrfs: Check mergeable free space when removing a cluster

After returing extents from a cluster to the block group, some
extents in the block group may be mergeable.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cf67dc3b7bf8..a5501edc3c9f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
 	return entry;
 }
 
-static void unlink_free_space(struct btrfs_block_group_cache *block_group,
-			      struct btrfs_free_space *info)
+static inline void
+__unlink_free_space(struct btrfs_block_group_cache *block_group,
+		    struct btrfs_free_space *info)
 {
 	rb_erase(&info->offset_index, &block_group->free_space_offset);
 	block_group->free_extents--;
+}
+
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *info)
+{
+	__unlink_free_space(block_group, info);
 	block_group->free_space -= info->bytes;
 }
 
@@ -1364,7 +1371,7 @@ out:
 }
 
 bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
-			  struct btrfs_free_space *info)
+			  struct btrfs_free_space *info, bool update_stat)
 {
 	struct btrfs_free_space *left_info;
 	struct btrfs_free_space *right_info;
@@ -1385,7 +1392,10 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
 		left_info = tree_search_offset(block_group, offset - 1, 0, 0);
 
 	if (right_info && !right_info->bitmap) {
-		unlink_free_space(block_group, right_info);
+		if (update_stat)
+			unlink_free_space(block_group, right_info);
+		else
+			__unlink_free_space(block_group, right_info);
 		info->bytes += right_info->bytes;
 		kfree(right_info);
 		merged = true;
@@ -1393,7 +1403,10 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (left_info && !left_info->bitmap &&
 	    left_info->offset + left_info->bytes == offset) {
-		unlink_free_space(block_group, left_info);
+		if (update_stat)
+			unlink_free_space(block_group, left_info);
+		else
+			__unlink_free_space(block_group, left_info);
 		info->offset = left_info->offset;
 		info->bytes += left_info->bytes;
 		kfree(left_info);
@@ -1418,7 +1431,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 
 	spin_lock(&block_group->tree_lock);
 
-	if (try_merge_free_space(block_group, info))
+	if (try_merge_free_space(block_group, info, true))
 		goto link;
 
 	/*
@@ -1636,6 +1649,7 @@ __btrfs_return_cluster_to_free_space(
 		node = rb_next(&entry->offset_index);
 		rb_erase(&entry->offset_index, &cluster->root);
 		BUG_ON(entry->bitmap);
+		try_merge_free_space(block_group, entry, false);
 		tree_insert_offset(&block_group->free_space_offset,
 				   entry->offset, &entry->offset_index, 0);
 	}
-- 
cgit v1.2.2


From 83a4d54840c88a4a45c49670f044b8c7ddeaa8c7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 27 Dec 2010 16:19:53 +0800
Subject: Btrfs: Fix memory leak at umount

fs_info, which is allocated in open_ctree(), should be freed
in close_ctree().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5d2249e6da5..089871e5cd5a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2513,6 +2513,8 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
+	kfree(fs_info);
+
 	return 0;
 }
 
-- 
cgit v1.2.2


From bdc924bb4cdac92b945945c3149ab8191c92d75d Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Mon, 27 Dec 2010 16:33:15 +0800
Subject: Btrfs: Fix memory leak on finding existing super

We missed a memory deallocation in commit 450ba0ea.

If an existing super block is found at mount and there is no
error condition then the pre-allocated tree_root and fs_info
are no not used and are not freeded.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 61bd79abb805..f50253c2279d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -654,6 +654,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		}
 
 		btrfs_close_devices(fs_devices);
+		kfree(fs_info);
+		kfree(tree_root);
 	} else {
 		char b[BDEVNAME_SIZE];
 
-- 
cgit v1.2.2


From 3f3d0bc0df041236fad4ffa82188a6e4ef9af75e Mon Sep 17 00:00:00 2001
From: Tero Roponen <tero.roponen@gmail.com>
Date: Mon, 27 Dec 2010 16:43:13 +0800
Subject: Btrfs: Free correct pointer after using strsep

We must save and free the original kstrdup()'ed pointer
because strsep() modifies its first argument.

Signed-off-by: Tero Roponen <tero.roponen@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f50253c2279d..78ee681465af 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -277,7 +277,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
-	char *opts, *p;
+	char *opts, *orig, *p;
 	int error = 0;
 	int intarg;
 
@@ -291,6 +291,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	opts = kstrdup(options, GFP_KERNEL);
 	if (!opts)
 		return -ENOMEM;
+	orig = opts;
 
 	while ((p = strsep(&opts, ",")) != NULL) {
 		int token;
@@ -326,7 +327,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	}
 
  out_free_opts:
-	kfree(opts);
+	kfree(orig);
  out:
 	/*
 	 * If no subvolume name is specified we use the default one.  Allocate
-- 
cgit v1.2.2


From d0f69686c2ae775529aadc7a8acc6f13ad41de66 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 25 Jan 2011 15:46:17 +0800
Subject: Btrfs: Don't return acl info when mounting with noacl option

Steps to reproduce:

  # mkfs.btrfs /dev/sda2
  # mount /dev/sda2 /mnt
  # touch /mnt/file0
  # setfacl -m 'u:root:x,g::x,o::x' /mnt/file0
  # umount /mnt
  # mount /dev/sda2 -o noacl /mnt
  # getfacl /mnt/file0
  ...
  user::rw-
  user:root:--x
  group::--x
  mask::--x
  other::--x

The output should be:

  user::rw-
  group::--x
  other::--x

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/acl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..3c52fc8afe29 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	char *value = NULL;
 	struct posix_acl *acl;
 
+	if (!IS_POSIXACL(inode))
+		return NULL;
+
 	acl = get_cached_acl(inode, type);
 	if (acl != ACL_NOT_CACHED)
 		return acl;
@@ -82,6 +85,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
 	struct posix_acl *acl;
 	int ret = 0;
 
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	acl = btrfs_get_acl(dentry->d_inode, type);
 
 	if (IS_ERR(acl))
-- 
cgit v1.2.2


From b897abec032deb7cc3ce67392a1f544ac965ddea Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 16:19:22 +0800
Subject: Btrfs: Fix memory leak in writepage fixup work

fixup, which is allocated when starting page write to fix up the
extent without ORDERED bit set, should be freed after this work
is done.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f9194438f7c..3a6edc4c5642 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1544,6 +1544,7 @@ out:
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
+	kfree(fixup);
 }
 
 /*
-- 
cgit v1.2.2


From 4d728ec7aefdca5419d2ebfb28c147e81a4b59f4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 14:10:43 +0800
Subject: Btrfs: Fix file clone when source offset is not 0

Suppose:
- the source extent is: [0, 100]
- the src offset is 10
- the clone length is 90
- the dest offset is 0

This statement:

	new_key.offset = key.offset + destoff - off

will produce such an extent for the dest file:

	[ino, BTRFS_EXTENT_DATA_KEY, -10]

, which is obviously wrong.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..1b61dab64062 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1788,7 +1788,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.objectid = inode->i_ino;
-			new_key.offset = key.offset + destoff - off;
+			if (off <= key.offset)
+				new_key.offset = key.offset + destoff - off;
+			else
+				new_key.offset = destoff;
 
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
-- 
cgit v1.2.2


From 7db37c5e6575b229a5051be1d3ef15257ae0ba5d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:02:00 +1100
Subject: xfs: fix log ticket leak on forced shutdown.

The kmemleak detector shows this after test 139:

unreferenced object 0xffff880079b88bb0 (size 264):
  comm "xfs_io", pid 4904, jiffies 4294909382 (age 276.824s)
  hex dump (first 32 bytes):
    00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00  .....N..........
    ff ff ff ff ff ff ff ff 48 7b c9 82 ff ff ff ff  ........H{......
  backtrace:
    [<ffffffff81afb04d>] kmemleak_alloc+0x2d/0x60
    [<ffffffff8115c6cf>] kmem_cache_alloc+0x13f/0x2b0
    [<ffffffff814aaa97>] kmem_zone_alloc+0x77/0xf0
    [<ffffffff814aab2e>] kmem_zone_zalloc+0x1e/0x50
    [<ffffffff8148f394>] xlog_ticket_alloc+0x34/0x170
    [<ffffffff81494444>] xlog_cil_push+0xa4/0x3f0
    [<ffffffff81494eca>] xlog_cil_force_lsn+0x15a/0x160
    [<ffffffff814933a5>] _xfs_log_force_lsn+0x75/0x2d0
    [<ffffffff814a264d>] _xfs_trans_commit+0x2bd/0x2f0
    [<ffffffff8148bfdd>] xfs_iomap_write_allocate+0x1ad/0x350
    [<ffffffff814ac17f>] xfs_map_blocks+0x21f/0x370
    [<ffffffff814ad1b7>] xfs_vm_writepage+0x1c7/0x550
    [<ffffffff8112200a>] __writepage+0x1a/0x50
    [<ffffffff81122df2>] write_cache_pages+0x1c2/0x4c0
    [<ffffffff81123117>] generic_writepages+0x27/0x30
    [<ffffffff814aba5d>] xfs_vm_writepages+0x5d/0x80

By inspection, the leak occurs when xlog_write() returns and error
and we jump to the abort path without dropping the reference on the
active ticket.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log_cil.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9dc8125d04e5..c7eac5acbfea 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -543,7 +543,7 @@ xlog_cil_push(
 
 	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
 	if (error)
-		goto out_abort;
+		goto out_abort_free_ticket;
 
 	/*
 	 * now that we've written the checkpoint into the log, strictly
@@ -569,8 +569,9 @@ restart:
 	}
 	spin_unlock(&cil->xc_cil_lock);
 
+	/* xfs_log_done always frees the ticket on error. */
 	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
-	if (error || commit_lsn == -1)
+	if (commit_lsn == -1)
 		goto out_abort;
 
 	/* attach all the transactions w/ busy extents to iclog */
@@ -600,6 +601,8 @@ out_free_ticket:
 	kmem_free(new_ctx);
 	return 0;
 
+out_abort_free_ticket:
+	xfs_log_ticket_put(tic);
 out_abort:
 	xlog_cil_committed(ctx, XFS_LI_ABORTED);
 	return XFS_ERROR(EIO);
-- 
cgit v1.2.2


From ee2c9258501f83d3ed0fd09ce5df1cec53312cf0 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 27 Jan 2011 09:58:04 -0600
Subject: cifs: More crypto cleanup (try #2)

Replaced md4 hashing function local to cifs module with kernel crypto APIs.
As a result, md4 hashing function and its supporting functions in
file md4.c are not needed anymore.

Cleaned up function declarations, removed forward function declarations,
and removed a header file that is being deleted from being included.

Verified that sec=ntlm/i, sec=ntlmv2/i, and sec=ntlmssp/i work correctly.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Makefile      |   2 +-
 fs/cifs/cifsencrypt.c |  32 +++++---
 fs/cifs/cifsencrypt.h |  33 --------
 fs/cifs/cifsproto.h   |   9 ++-
 fs/cifs/connect.c     |   6 +-
 fs/cifs/link.c        |   5 +-
 fs/cifs/md4.c         | 205 --------------------------------------------------
 fs/cifs/smbdes.c      |   1 -
 fs/cifs/smbencrypt.c  |  90 +++++++++++++++-------
 9 files changed, 97 insertions(+), 286 deletions(-)
 delete mode 100644 fs/cifs/cifsencrypt.h
 delete mode 100644 fs/cifs/md4.c

(limited to 'fs')

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index e1322296cb69..d87558448e3d 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35bf329c90e1..0db5f1de0227 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -36,11 +36,6 @@
 /* Note that the smb header signature field on input contains the
 	sequence number before this function is called */
 
-extern void mdfour(unsigned char *out, unsigned char *in, int n);
-extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
-		       unsigned char *p24);
-
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 				struct TCP_Server_Info *server, char *signature)
 {
@@ -233,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 /* first calculate 24 bytes ntlm response and then 16 byte session key */
 int setup_ntlm_response(struct cifsSesInfo *ses)
 {
+	int rc = 0;
 	unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
 	char temp_key[CIFS_SESS_KEY_SIZE];
 
@@ -246,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
 	}
 	ses->auth_key.len = temp_len;
 
-	SMBNTencrypt(ses->password, ses->server->cryptkey,
+	rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
 			ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+	if (rc) {
+		cFYI(1, "%s Can't generate NTLM response, error: %d",
+			__func__, rc);
+		return rc;
+	}
+
+	rc = E_md4hash(ses->password, temp_key);
+	if (rc) {
+		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+		return rc;
+	}
 
-	E_md4hash(ses->password, temp_key);
-	mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+	rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+	if (rc)
+		cFYI(1, "%s Can't generate NTLM session key, error: %d",
+			__func__, rc);
 
-	return 0;
+	return rc;
 }
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -699,14 +708,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
 	unsigned int size;
 
 	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
-	if (!server->secmech.hmacmd5 ||
-			IS_ERR(server->secmech.hmacmd5)) {
+	if (IS_ERR(server->secmech.hmacmd5)) {
 		cERROR(1, "could not allocate crypto hmacmd5\n");
 		return PTR_ERR(server->secmech.hmacmd5);
 	}
 
 	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
-	if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+	if (IS_ERR(server->secmech.md5)) {
 		cERROR(1, "could not allocate crypto md5\n");
 		rc = PTR_ERR(server->secmech.md5);
 		goto crypto_allocate_md5_fail;
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec006474..000000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *   fs/cifs/cifsencrypt.h
- *
- *   Copyright (c) International Business Machines  Corp., 2005
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   Externs for misc. small encryption routines
- *   so we do not have to put them in cifsproto.h
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-/* md4.c */
-extern void mdfour(unsigned char *out, unsigned char *in, int n);
-/* smbdes.c */
-extern void E_P16(unsigned char *p14, unsigned char *p16);
-extern void E_P24(unsigned char *p21, const unsigned char *c8,
-		  unsigned char *p24);
-
-
-
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 35c989f4924f..8096f27ad9a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -375,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 extern int cifs_verify_signature(struct smb_hdr *,
 				 struct TCP_Server_Info *server,
 				__u32 expected_sequence_number);
-extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
+extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
 extern int setup_ntlm_response(struct cifsSesInfo *);
 extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
 extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -425,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
 extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 		const unsigned char *path,
 		struct cifs_sb_info *cifs_sb, int xid);
+extern int mdfour(unsigned char *, unsigned char *, int);
+extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
+extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
+			unsigned char *p24);
+extern void E_P16(unsigned char *p14, unsigned char *p16);
+extern void E_P24(unsigned char *p21, const unsigned char *c8,
+			unsigned char *p24);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 47034af67b09..47d8ff623683 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -55,9 +55,6 @@
 /* SMB echo "timeout" -- FIXME: tunable? */
 #define SMB_ECHO_INTERVAL (60 * HZ)
 
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
-			 unsigned char *p24);
-
 extern mempool_t *cifs_req_poolp;
 
 struct smb_vol {
@@ -2990,7 +2987,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 					 bcc_ptr);
 		else
 #endif /* CIFS_WEAK_PW_HASH */
-		SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
+		rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
+					bcc_ptr);
 
 		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index d3444ea6ac71..02cd60aefbff 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -54,10 +54,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
 	struct sdesc *sdescmd5;
 
 	md5 = crypto_alloc_shash("md5", 0, 0);
-	if (!md5 || IS_ERR(md5)) {
-		rc = PTR_ERR(md5);
+	if (IS_ERR(md5)) {
 		cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
-		return rc;
+		return PTR_ERR(md5);
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
 	sdescmd5 = kmalloc(size, GFP_KERNEL);
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d67..000000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
-   Unix SMB/Netbios implementation.
-   Version 1.9.
-   a implementation of MD4 designed for use in the SMB authentication protocol
-   Copyright (C) Andrew Tridgell 1997-1998.
-   Modified by Steve French (sfrench@us.ibm.com) 2002-2003
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include "cifsencrypt.h"
-
-/* NOTE: This code makes no attempt to be fast! */
-
-static __u32
-F(__u32 X, __u32 Y, __u32 Z)
-{
-	return (X & Y) | ((~X) & Z);
-}
-
-static __u32
-G(__u32 X, __u32 Y, __u32 Z)
-{
-	return (X & Y) | (X & Z) | (Y & Z);
-}
-
-static __u32
-H(__u32 X, __u32 Y, __u32 Z)
-{
-	return X ^ Y ^ Z;
-}
-
-static __u32
-lshift(__u32 x, int s)
-{
-	x &= 0xFFFFFFFF;
-	return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
-}
-
-#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
-#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
-#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
-
-/* this applies md4 to 64 byte chunks */
-static void
-mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
-{
-	int j;
-	__u32 AA, BB, CC, DD;
-	__u32 X[16];
-
-
-	for (j = 0; j < 16; j++)
-		X[j] = M[j];
-
-	AA = *A;
-	BB = *B;
-	CC = *C;
-	DD = *D;
-
-	ROUND1(A, B, C, D, 0, 3);
-	ROUND1(D, A, B, C, 1, 7);
-	ROUND1(C, D, A, B, 2, 11);
-	ROUND1(B, C, D, A, 3, 19);
-	ROUND1(A, B, C, D, 4, 3);
-	ROUND1(D, A, B, C, 5, 7);
-	ROUND1(C, D, A, B, 6, 11);
-	ROUND1(B, C, D, A, 7, 19);
-	ROUND1(A, B, C, D, 8, 3);
-	ROUND1(D, A, B, C, 9, 7);
-	ROUND1(C, D, A, B, 10, 11);
-	ROUND1(B, C, D, A, 11, 19);
-	ROUND1(A, B, C, D, 12, 3);
-	ROUND1(D, A, B, C, 13, 7);
-	ROUND1(C, D, A, B, 14, 11);
-	ROUND1(B, C, D, A, 15, 19);
-
-	ROUND2(A, B, C, D, 0, 3);
-	ROUND2(D, A, B, C, 4, 5);
-	ROUND2(C, D, A, B, 8, 9);
-	ROUND2(B, C, D, A, 12, 13);
-	ROUND2(A, B, C, D, 1, 3);
-	ROUND2(D, A, B, C, 5, 5);
-	ROUND2(C, D, A, B, 9, 9);
-	ROUND2(B, C, D, A, 13, 13);
-	ROUND2(A, B, C, D, 2, 3);
-	ROUND2(D, A, B, C, 6, 5);
-	ROUND2(C, D, A, B, 10, 9);
-	ROUND2(B, C, D, A, 14, 13);
-	ROUND2(A, B, C, D, 3, 3);
-	ROUND2(D, A, B, C, 7, 5);
-	ROUND2(C, D, A, B, 11, 9);
-	ROUND2(B, C, D, A, 15, 13);
-
-	ROUND3(A, B, C, D, 0, 3);
-	ROUND3(D, A, B, C, 8, 9);
-	ROUND3(C, D, A, B, 4, 11);
-	ROUND3(B, C, D, A, 12, 15);
-	ROUND3(A, B, C, D, 2, 3);
-	ROUND3(D, A, B, C, 10, 9);
-	ROUND3(C, D, A, B, 6, 11);
-	ROUND3(B, C, D, A, 14, 15);
-	ROUND3(A, B, C, D, 1, 3);
-	ROUND3(D, A, B, C, 9, 9);
-	ROUND3(C, D, A, B, 5, 11);
-	ROUND3(B, C, D, A, 13, 15);
-	ROUND3(A, B, C, D, 3, 3);
-	ROUND3(D, A, B, C, 11, 9);
-	ROUND3(C, D, A, B, 7, 11);
-	ROUND3(B, C, D, A, 15, 15);
-
-	*A += AA;
-	*B += BB;
-	*C += CC;
-	*D += DD;
-
-	*A &= 0xFFFFFFFF;
-	*B &= 0xFFFFFFFF;
-	*C &= 0xFFFFFFFF;
-	*D &= 0xFFFFFFFF;
-
-	for (j = 0; j < 16; j++)
-		X[j] = 0;
-}
-
-static void
-copy64(__u32 *M, unsigned char *in)
-{
-	int i;
-
-	for (i = 0; i < 16; i++)
-		M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
-		    (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
-}
-
-static void
-copy4(unsigned char *out, __u32 x)
-{
-	out[0] = x & 0xFF;
-	out[1] = (x >> 8) & 0xFF;
-	out[2] = (x >> 16) & 0xFF;
-	out[3] = (x >> 24) & 0xFF;
-}
-
-/* produce a md4 message digest from data of length n bytes */
-void
-mdfour(unsigned char *out, unsigned char *in, int n)
-{
-	unsigned char buf[128];
-	__u32 M[16];
-	__u32 b = n * 8;
-	int i;
-	__u32 A = 0x67452301;
-	__u32 B = 0xefcdab89;
-	__u32 C = 0x98badcfe;
-	__u32 D = 0x10325476;
-
-	while (n > 64) {
-		copy64(M, in);
-		mdfour64(M, &A, &B, &C, &D);
-		in += 64;
-		n -= 64;
-	}
-
-	for (i = 0; i < 128; i++)
-		buf[i] = 0;
-	memcpy(buf, in, n);
-	buf[n] = 0x80;
-
-	if (n <= 55) {
-		copy4(buf + 56, b);
-		copy64(M, buf);
-		mdfour64(M, &A, &B, &C, &D);
-	} else {
-		copy4(buf + 120, b);
-		copy64(M, buf);
-		mdfour64(M, &A, &B, &C, &D);
-		copy64(M, buf + 64);
-		mdfour64(M, &A, &B, &C, &D);
-	}
-
-	for (i = 0; i < 128; i++)
-		buf[i] = 0;
-	copy64(M, buf);
-
-	copy4(out, A);
-	copy4(out + 4, B);
-	copy4(out + 8, C);
-	copy4(out + 12, D);
-
-	A = B = C = D = 0;
-}
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500bf..04721485925d 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
    up with a different answer to the one above)
 */
 #include <linux/slab.h>
-#include "cifsencrypt.h"
 #define uchar unsigned char
 
 static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 30135005e4f3..b5450e9f40c0 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -33,7 +33,7 @@
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
-#include "cifsencrypt.h"
+#include "cifsproto.h"
 
 #ifndef false
 #define false 0
@@ -47,14 +47,57 @@
 #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
 #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
 
-/*The following definitions come from  libsmb/smbencrypt.c  */
+/* produce a md4 message digest from data of length n bytes */
+int
+mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *md4;
+	struct sdesc *sdescmd4;
+
+	md4 = crypto_alloc_shash("md4", 0, 0);
+	if (IS_ERR(md4)) {
+		cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
+		return PTR_ERR(md4);
+	}
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
+	sdescmd4 = kmalloc(size, GFP_KERNEL);
+	if (!sdescmd4) {
+		rc = -ENOMEM;
+		cERROR(1, "%s: Memory allocation failure\n", __func__);
+		goto mdfour_err;
+	}
+	sdescmd4->shash.tfm = md4;
+	sdescmd4->shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdescmd4->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init md4 shash\n", __func__);
+		goto mdfour_err;
+	}
+	crypto_shash_update(&sdescmd4->shash, link_str, link_len);
+	rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
 
-void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
-		unsigned char *p24);
-void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-		   unsigned char p24[24]);
-void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
+mdfour_err:
+	crypto_free_shash(md4);
+	kfree(sdescmd4);
+
+	return rc;
+}
+
+/* Does the des encryption from the NT or LM MD4 hash. */
+static void
+SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
+	      unsigned char p24[24])
+{
+	unsigned char p21[21];
+
+	memset(p21, '\0', 21);
+
+	memcpy(p21, passwd, 16);
+	E_P24(p21, c8, p24);
+}
 
 /*
    This implements the X/Open SMB password encryption
@@ -117,9 +160,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
  * Creates the MD4 Hash of the users password in NT UNICODE.
  */
 
-void
+int
 E_md4hash(const unsigned char *passwd, unsigned char *p16)
 {
+	int rc;
 	int len;
 	__u16 wpwd[129];
 
@@ -138,8 +182,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	/* Calculate length in bytes */
 	len = _my_wcslen(wpwd) * sizeof(__u16);
 
-	mdfour(p16, (unsigned char *) wpwd, len);
+	rc = mdfour(p16, (unsigned char *) wpwd, len);
 	memset(wpwd, 0, 129 * 2);
+
+	return rc;
 }
 
 #if 0 /* currently unused */
@@ -211,19 +257,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
 }
 #endif
 
-/* Does the des encryption from the NT or LM MD4 hash. */
-static void
-SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-	      unsigned char p24[24])
-{
-	unsigned char p21[21];
-
-	memset(p21, '\0', 21);
-
-	memcpy(p21, passwd, 16);
-	E_P24(p21, c8, p24);
-}
-
 /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
 #if 0 /* currently unused */
 static void
@@ -241,16 +274,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
 #endif
 
 /* Does the NT MD4 hash then des encryption. */
-
-void
+int
 SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 {
+	int rc;
 	unsigned char p21[21];
 
 	memset(p21, '\0', 21);
 
-	E_md4hash(passwd, p21);
+	rc = E_md4hash(passwd, p21);
+	if (rc) {
+		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+		return rc;
+	}
 	SMBOWFencrypt(p21, c8, p24);
+	return rc;
 }
 
 
-- 
cgit v1.2.2


From e34a314c5e49fe6b763568f6576b19f1299c33c2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:13:35 +1100
Subject: xfs: fix efi item leak on forced shutdown

After test 139, kmemleak shows:

unreferenced object 0xffff880078b405d8 (size 400):
  comm "xfs_io", pid 4904, jiffies 4294909383 (age 1186.728s)
  hex dump (first 32 bytes):
    60 c1 17 79 00 88 ff ff 60 c1 17 79 00 88 ff ff  `..y....`..y....
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff81afb04d>] kmemleak_alloc+0x2d/0x60
    [<ffffffff8115c6cf>] kmem_cache_alloc+0x13f/0x2b0
    [<ffffffff814aaa97>] kmem_zone_alloc+0x77/0xf0
    [<ffffffff814aab2e>] kmem_zone_zalloc+0x1e/0x50
    [<ffffffff8147cd6b>] xfs_efi_init+0x4b/0xb0
    [<ffffffff814a4ee8>] xfs_trans_get_efi+0x58/0x90
    [<ffffffff81455fab>] xfs_bmap_finish+0x8b/0x1d0
    [<ffffffff814851b4>] xfs_itruncate_finish+0x2c4/0x5d0
    [<ffffffff814a970f>] xfs_setattr+0x8df/0xa70
    [<ffffffff814b5c7b>] xfs_vn_setattr+0x1b/0x20
    [<ffffffff8117dc00>] notify_change+0x170/0x2e0
    [<ffffffff81163bf6>] do_truncate+0x66/0xa0
    [<ffffffff81163d0b>] sys_ftruncate+0xdb/0xe0
    [<ffffffff8103a002>] system_call_fastpath+0x16/0x1b
    [<ffffffffffffffff>] 0xffffffffffffffff

The cause of the leak is that the "remove" parameter of IOP_UNPIN()
is never set when a CIL push is aborted. This means that the EFI
item is never freed if it was in the push being cancelled. The
problem is specific to delayed logging, but has uncovered a couple
of problems with the handling of IOP_UNPIN(remove).

Firstly, we cannot safely call xfs_trans_del_item() from IOP_UNPIN()
in the CIL commit failure path or the iclog write failure path
because for delayed loging we have no transaction context. Hence we
must only call xfs_trans_del_item() if the log item being unpinned
has an active log item descriptor.

Secondly, xfs_trans_uncommit() does not handle log item descriptor
freeing during the traversal of log items on a transaction. It can
reference a freed log item descriptor when unpinning an EFI item.
Hence it needs to use a safe list traversal method to allow items to
be removed from the transaction during IOP_UNPIN().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_buf_item.c     | 12 +++++++-----
 fs/xfs/xfs_extfree_item.c |  3 ++-
 fs/xfs/xfs_trans.c        | 36 +++++++++++++++++++++++++++++-------
 3 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 98c6f73b6752..6f8c21ce0d6d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -427,13 +427,15 @@ xfs_buf_item_unpin(
 
 		if (remove) {
 			/*
-			 * We have to remove the log item from the transaction
-			 * as we are about to release our reference to the
-			 * buffer.  If we don't, the unlock that occurs later
-			 * in xfs_trans_uncommit() will ry to reference the
+			 * If we are in a transaction context, we have to
+			 * remove the log item from the transaction as we are
+			 * about to release our reference to the buffer.  If we
+			 * don't, the unlock that occurs later in
+			 * xfs_trans_uncommit() will try to reference the
 			 * buffer which we no longer have a hold on.
 			 */
-			xfs_trans_del_item(lip);
+			if (lip->li_desc)
+				xfs_trans_del_item(lip);
 
 			/*
 			 * Since the transaction no longer refers to the buffer,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 75f2ef60e579..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -138,7 +138,8 @@ xfs_efi_item_unpin(
 
 	if (remove) {
 		ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
-		xfs_trans_del_item(lip);
+		if (lip->li_desc)
+			xfs_trans_del_item(lip);
 		xfs_efi_item_free(efip);
 		return;
 	}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 33dbc4e0ad62..29f5e5424897 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert(
  * Bulk operation version of xfs_trans_committed that takes a log vector of
  * items to insert into the AIL. This uses bulk AIL insertion techniques to
  * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is IOP_COMMITTED processing, followed by an
+ * IOP_UNPIN(aborted) call.
  */
 void
 xfs_trans_committed_bulk(
@@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk(
 		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
 			continue;
 
+		/*
+		 * if we are aborting the operation, no point in inserting the
+		 * object into the AIL as we are in a shutdown situation.
+		 */
+		if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+			IOP_UNPIN(lip, 1);
+			continue;
+		}
+
 		if (item_lsn != commit_lsn) {
 
 			/*
@@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk(
 }
 
 /*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
+ * Called from the trans_commit code when we notice that the filesystem is in
+ * the middle of a forced shutdown.
+ *
+ * When we are called here, we have already pinned all the items in the
+ * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
+ * so we can simply walk the items in the transaction, unpin them with an abort
+ * flag and then free the items. Note that unpinning the items can result in
+ * them being freed immediately, so we need to use a safe list traversal method
+ * here.
  */
 STATIC void
 xfs_trans_uncommit(
 	struct xfs_trans	*tp,
 	uint			flags)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item_desc *lidp, *n;
 
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
+	list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
 		if (lidp->lid_flags & XFS_LID_DIRTY)
 			IOP_UNPIN(lidp->lid_item, 1);
 	}
-- 
cgit v1.2.2


From b8fc82630ae289bb4e661567808afc59e3298dce Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:14:12 +1100
Subject: xfs: speculative delayed allocation uses rounddown_power_of_2 badly

rounddown_power_of_2() returns an undefined result when passed a
value of zero. The specualtive delayed allocation code is doing this
when the inode is zero length. Hence occasionally the preallocation
is much, much larger than is necessary (e.g. 8GB for a 270 _byte_
file). Ensure we don't even pass a zero value to this function so
the result of preallocation is always the desired size.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_iomap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 55582bd66659..8a0f044750c3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -337,7 +337,12 @@ xfs_iomap_prealloc_size(
 		int shift = 0;
 		int64_t freesp;
 
-		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+		/*
+		 * rounddown_pow_of_two() returns an undefined result
+		 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
+		 * ensure we always pass in a non-zero value.
+		 */
+		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
 		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
 					rounddown_pow_of_two(alloc_blocks));
 
-- 
cgit v1.2.2


From 14b064ceaa6f51a7426cc45b4b43685b94380658 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:16:28 +1100
Subject: xfs: limit extent length for allocation to AG size

Delayed allocation extents can be larger than AGs, so when trying to
convert a large range we may scan every AG inside
xfs_bmap_alloc_nullfb() trying to find an AG with a size larger than
an AG. We should stop when we find the first AG with a maximum
possible allocation size. This causes excessive CPU usage when there
are lots of AGs.

The same problem occurs when doing preallocation of a range larger
than an AG.

Fix the problem by limiting real allocation lengths to the maximum
that an AG can support. This means if we have empty AGs, we'll stop
the search at the first of them. If there are no empty AGs, we'll
still scan them all, but that is a different problem....

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.h | 16 ++++++++++++++++
 fs/xfs/xfs_bmap.c  | 18 ++++++++++--------
 2 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 0ab56b32c7eb..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -74,6 +74,22 @@ typedef unsigned int xfs_alloctype_t;
  */
 #define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
 
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *	- the AG superblock, AGF, AGI and AGFL
+ *	- the AGF (bno and cnt) and AGI btree root blocks
+ *	- 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)	\
+	((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+
+
 /*
  * Argument structure for xfs_alloc routines.
  * This is turned into a structure to avoid having 20 arguments passed
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..f3a3768189bb 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2430,7 +2430,7 @@ xfs_bmap_btalloc_nullfb(
 		startag = ag = 0;
 
 	pag = xfs_perag_get(mp, ag);
-	while (*blen < ap->alen) {
+	while (*blen < args->maxlen) {
 		if (!pag->pagf_init) {
 			error = xfs_alloc_pagf_init(mp, args->tp, ag,
 						    XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2452,7 @@ xfs_bmap_btalloc_nullfb(
 			notinit = 1;
 
 		if (xfs_inode_is_filestream(ap->ip)) {
-			if (*blen >= ap->alen)
+			if (*blen >= args->maxlen)
 				break;
 
 			if (ap->userdata) {
@@ -2498,14 +2498,14 @@ xfs_bmap_btalloc_nullfb(
 	 * If the best seen length is less than the request
 	 * length, use the best as the minimum.
 	 */
-	else if (*blen < ap->alen)
+	else if (*blen < args->maxlen)
 		args->minlen = *blen;
 	/*
-	 * Otherwise we've seen an extent as big as alen,
+	 * Otherwise we've seen an extent as big as maxlen,
 	 * use that as the minimum.
 	 */
 	else
-		args->minlen = ap->alen;
+		args->minlen = args->maxlen;
 
 	/*
 	 * set the failure fallback case to look in the selected
@@ -2573,7 +2573,9 @@ xfs_bmap_btalloc(
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->rval;
-	args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+
+	/* Trim the allocation back to the maximum an AG can fit. */
+	args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
 	args.firstblock = ap->firstblock;
 	blen = 0;
 	if (nullfb) {
@@ -2621,7 +2623,7 @@ xfs_bmap_btalloc(
 			/*
 			 * Adjust for alignment
 			 */
-			if (blen > args.alignment && blen <= ap->alen)
+			if (blen > args.alignment && blen <= args.maxlen)
 				args.minlen = blen - args.alignment;
 			args.minalignslop = 0;
 		} else {
@@ -2640,7 +2642,7 @@ xfs_bmap_btalloc(
 			 * of minlen+alignment+slop doesn't go up
 			 * between the calls.
 			 */
-			if (blen > mp->m_dalign && blen <= ap->alen)
+			if (blen > mp->m_dalign && blen <= args.maxlen)
 				nextminlen = blen - mp->m_dalign;
 			else
 				nextminlen = args.minlen;
-- 
cgit v1.2.2


From 4ce159890c00e2cc705e955a939bf1dca7b07ab8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:17:58 +1100
Subject: xfs: prevent extsize alignment from exceeding maximum extent size

When doing delayed allocation, if the allocation size is for a
maximally sized extent, extent size alignment can push it over this
limit. This results in an assert failure in xfs_bmbt_set_allf() as
the extent length is too large to find in the extent record.

Fix this by ensuring that we allow for space that extent size
alignment requires (up to 2 * (extsize -1) blocks as we have to
handle both head and tail alignment) when limiting the maximum size
of the extent.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f3a3768189bb..3e9c278a8f78 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4487,6 +4487,16 @@ xfs_bmapi(
 				/* Figure out the extent size, adjust alen */
 				extsz = xfs_get_extsz_hint(ip);
 				if (extsz) {
+					/*
+					 * make sure we don't exceed a single
+					 * extent length when we align the
+					 * extent by reducing length we are
+					 * going to allocate by the maximum
+					 * amount extent size aligment may
+					 * require.
+					 */
+					alen = XFS_FILBLKS_MIN(len,
+						   MAXEXTLEN - (2 * extsz - 1));
 					error = xfs_bmap_extsize_align(mp,
 							&got, &prev, extsz,
 							rt, eof,
-- 
cgit v1.2.2


From 5315837daee7ed76c31ef643915f7d76ef8c1aa3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:18:18 +1100
Subject: xfs: limit extsize to size of AGs and/or MAXEXTLEN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The extent size hint can be set to larger than an AG. This means
that the alignment process can push the range to be allocated
outside the bounds of the AG, resulting in assert failures or
corrupted bmbt records. Similarly, if the extsize is larger than the
maximum extent size supported, the alignment process will produce
extents that are too large to fit into the bmbt records, resulting
in a different type of assert/corruption failure.

Fix this by limiting extsize at the time іt is set firstly to be
less than MAXEXTLEN, then to be a maximum of half the size of the
AGs in the filesystem for non-realtime inodes. Realtime inodes do
not allocate out of AGs, so don't have to be restricted by the size
of AGs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index b06ede1d0bed..f5e2a19e0f8e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -985,10 +985,22 @@ xfs_ioctl_setattr(
 
 		/*
 		 * Extent size must be a multiple of the appropriate block
-		 * size, if set at all.
+		 * size, if set at all. It must also be smaller than the
+		 * maximum extent size supported by the filesystem.
+		 *
+		 * Also, for non-realtime files, limit the extent size hint to
+		 * half the size of the AGs in the filesystem so alignment
+		 * doesn't result in extents larger than an AG.
 		 */
 		if (fa->fsx_extsize != 0) {
-			xfs_extlen_t	size;
+			xfs_extlen_t    size;
+			xfs_fsblock_t   extsize_fsb;
+
+			extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+			if (extsize_fsb > MAXEXTLEN) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
 
 			if (XFS_IS_REALTIME_INODE(ip) ||
 			    ((mask & FSX_XFLAGS) &&
@@ -997,6 +1009,10 @@ xfs_ioctl_setattr(
 				       mp->m_sb.sb_blocklog;
 			} else {
 				size = mp->m_sb.sb_blocksize;
+				if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
+					code = XFS_ERROR(EINVAL);
+					goto error_return;
+				}
 			}
 
 			if (fa->fsx_extsize % size) {
-- 
cgit v1.2.2


From c6f990d1ff8e4e53b12f4175eb7d7ea710c3ca73 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 13:23:28 +1100
Subject: xfs: handle CIl transaction commit failures correctly

Failure to commit a transaction into the CIL is not handled
correctly. This currently can only happen when racing with a
shutdown and requires an explicit shutdown check, so it rare and can
be avoided. Remove the shutdown check and make the CIL commit a void
function to indicate it will always succeed, thereby removing the
incorrectly handled failure case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log.h     | 2 +-
 fs/xfs/xfs_log_cil.c | 8 +-------
 fs/xfs/xfs_trans.c   | 5 +----
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
 xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
 
-int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				struct xfs_log_vec *log_vector,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c7eac5acbfea..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -625,7 +625,7 @@ out_abort:
  * background commit, returns without it held once background commits are
  * allowed again.
  */
-int
+void
 xfs_log_commit_cil(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -640,11 +640,6 @@ xfs_log_commit_cil(
 	if (flags & XFS_TRANS_RELEASE_LOG_RES)
 		log_flags = XFS_LOG_REL_PERM_RESERV;
 
-	if (XLOG_FORCED_SHUTDOWN(log)) {
-		xlog_cil_free_logvec(log_vector);
-		return XFS_ERROR(EIO);
-	}
-
 	/*
 	 * do all the hard work of formatting items (including memory
 	 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -704,7 +699,6 @@ xfs_log_commit_cil(
 	 */
 	if (push)
 		xlog_cil_push(log, 0);
-	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 29f5e5424897..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1755,7 +1755,6 @@ xfs_trans_commit_cil(
 	int			flags)
 {
 	struct xfs_log_vec	*log_vector;
-	int			error;
 
 	/*
 	 * Get each log item to allocate a vector structure for
@@ -1766,9 +1765,7 @@ xfs_trans_commit_cil(
 	if (!log_vector)
 		return ENOMEM;
 
-	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
-	if (error)
-		return error;
+	xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);
-- 
cgit v1.2.2


From 0fbca4d1c3932c27c4794bf5c2b5fc961cf5a54f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 28 Jan 2011 11:20:46 +1100
Subject: xfs: fix dquot shaker deadlock

Commit 368e136 ("xfs: remove duplicate code from dquot reclaim") fails
to unlock the dquot freelist when the number of loop restarts is
exceeded in xfs_qm_dqreclaim_one(). This causes hangs in memory
reclaim.

Rework the loop control logic into an unwind stack that all the
different cases jump into. This means there is only one set of code
that processes the loop exit criteria, and simplifies the unlocking
of all the items from different points in the loop. It also fixes a
double increment of the restart counter from the qi_dqlist_lock
case.

Reported-by: Malcolm Scott <lkml@malc.org.uk>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..206a2815ced6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
 	xfs_dquot_t	*dqpout;
 	xfs_dquot_t	*dqp;
 	int		restarts;
+	int		startagain;
 
 	restarts = 0;
 	dqpout = NULL;
 
 	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
-startagain:
+again:
+	startagain = 0;
 	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
 	list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1887,10 @@ startagain:
 			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
 
 			trace_xfs_dqreclaim_want(dqp);
-
-			xfs_dqunlock(dqp);
-			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
 			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto startagain;
+			restarts++;
+			startagain = 1;
+			goto dqunlock;
 		}
 
 		/*
@@ -1906,23 +1905,20 @@ startagain:
 			ASSERT(list_empty(&dqp->q_mplist));
 			list_del_init(&dqp->q_freelist);
 			xfs_Gqm->qm_dqfrlist_cnt--;
-			xfs_dqunlock(dqp);
 			dqpout = dqp;
 			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			break;
+			goto dqunlock;
 		}
 
 		ASSERT(dqp->q_hash);
 		ASSERT(!list_empty(&dqp->q_mplist));
 
 		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
+		 * Try to grab the flush lock. If this dquot is in the process
+		 * of getting flushed to disk, we don't want to reclaim it.
 		 */
-		if (!xfs_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
+		if (!xfs_dqflock_nowait(dqp))
+			goto dqunlock;
 
 		/*
 		 * We have the flush lock so we know that this is not in the
@@ -1944,8 +1940,7 @@ startagain:
 				xfs_fs_cmn_err(CE_WARN, mp,
 			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
 			}
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			continue;
+			goto dqunlock;
 		}
 
 		/*
@@ -1967,13 +1962,8 @@ startagain:
 		 */
 		if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
 			restarts++;
-			mutex_unlock(&dqp->q_hash->qh_lock);
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-			if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
-			goto startagain;
+			startagain = 1;
+			goto qhunlock;
 		}
 
 		ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1976,20 @@ startagain:
 		xfs_Gqm->qm_dqfrlist_cnt--;
 		dqpout = dqp;
 		mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+qhunlock:
 		mutex_unlock(&dqp->q_hash->qh_lock);
 dqfunlock:
 		xfs_dqfunlock(dqp);
+dqunlock:
 		xfs_dqunlock(dqp);
 		if (dqpout)
 			break;
 		if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-			return NULL;
+			break;
+		if (startagain) {
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+			goto again;
+		}
 	}
 	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 	return dqpout;
-- 
cgit v1.2.2


From 24446fc66fdebbdd8baca0f44fd2a47ad77ba580 Mon Sep 17 00:00:00 2001
From: "bpm@sgi.com" <bpm@sgi.com>
Date: Wed, 19 Jan 2011 17:41:58 +0000
Subject: xfs: xfs_bmap_add_extent_delay_real should init br_startblock

When filling in the middle of a previous delayed allocation in
xfs_bmap_add_extent_delay_real, set br_startblock of the new delay
extent to the right to nullstartblock instead of 0 before inserting
the extent into the ifork (xfs_iext_insert), rather than setting
br_startblock afterward.

Adding the extent into the ifork with br_startblock=0 can lead to
the extent being copied into the btree by xfs_bmap_extent_to_btree
if we happen to convert from extents format to btree format before
updating br_startblock with the correct value.  The unexpected
addition of this delay extent to the btree can cause subsequent
XFS_WANT_CORRUPTED_GOTO filesystem shutdown in several
xfs_bmap_add_extent_delay_real cases where we are converting a delay
extent to real and unexpectedly find an extent already inserted.
For example:

911         case BMAP_LEFT_FILLING:
912                 /*
913                  * Filling in the first part of a previous delayed allocation.
914                  * The left neighbor is not contiguous.
915                  */
916                 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
917                 xfs_bmbt_set_startoff(ep, new_endoff);
918                 temp = PREV.br_blockcount - new->br_blockcount;
919                 xfs_bmbt_set_blockcount(ep, temp);
920                 xfs_iext_insert(ip, idx, 1, new, state);
921                 ip->i_df.if_lastex = idx;
922                 ip->i_d.di_nextents++;
923                 if (cur == NULL)
924                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
925                 else {
926                         rval = XFS_ILOG_CORE;
927                         if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
928                                         new->br_startblock, new->br_blockcount,
929                                         &i)))
930                                 goto done;
931                         XFS_WANT_CORRUPTED_GOTO(i == 0, done);

With the bogus extent in the btree we shutdown the filesystem at
931.  The conversion from extents to btree format happens when the
number of extents in the inode increases above ip->i_df.if_ext_max.
xfs_bmap_extent_to_btree copies extents from the ifork into the
btree, ignoring all delalloc extents which are denoted by
br_startblock having some value of nullstartblock.

SGI-PV: 1013221

Signed-off-by: Ben Myers <bpm@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3e9c278a8f78..dc3afd7739ff 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the middle part of a previous delayed allocation.
 		 * Contiguity is impossible here.
 		 * This case is avoided almost all the time.
+		 *
+		 * We start with a delayed allocation:
+		 *
+		 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+		 *  PREV @ idx
+		 *
+	         * and we are allocating:
+		 *                     +rrrrrrrrrrrrrrrrr+
+		 *			      new
+		 *
+		 * and we set it up for insertion as:
+		 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+		 *                            new
+		 *  PREV @ idx          LEFT              RIGHT
+		 *                      inserted at idx + 1
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
-		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		r[0] = *new;
-		r[1].br_state = PREV.br_state;
-		r[1].br_startblock = 0;
-		r[1].br_startoff = new_endoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		r[1].br_blockcount = temp2;
-		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
+		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
+		LEFT = *new;
+		RIGHT.br_state = PREV.br_state;
+		RIGHT.br_startblock = nullstartblock(
+				(int)xfs_bmap_worst_indlen(ip, temp2));
+		RIGHT.br_startoff = new_endoff;
+		RIGHT.br_blockcount = temp2;
+		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+		xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
-- 
cgit v1.2.2


From e00b8a24041f37e56b4b8415ce4eba1cbc238065 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 27 Jan 2011 14:55:39 -0500
Subject: NFS: Fix an NFS client lockdep issue

There is no reason to be freeing the delegation cred in the rcu callback,
and doing so is resulting in a lockdep complaint that rpc_credcache_lock
is being called from both softirq and non-softirq contexts.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/delegation.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 364e4328f392..bbbc6bf5cb2e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -23,8 +23,6 @@
 
 static void nfs_do_free_delegation(struct nfs_delegation *delegation)
 {
-	if (delegation->cred)
-		put_rpccred(delegation->cred);
 	kfree(delegation);
 }
 
@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
 
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
+	if (delegation->cred) {
+		put_rpccred(delegation->cred);
+		delegation->cred = NULL;
+	}
 	call_rcu(&delegation->rcu, nfs_free_delegation_callback);
 }
 
-- 
cgit v1.2.2


From c08e76d0cd4beb759a73c1835d98f5fccc126ed1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 28 Jan 2011 12:40:55 -0500
Subject: NFS: Micro-optimize nfs4_decode_dirent()

Make the decoding of NFSv4 directory entries slightly more efficient
by:

  1.  Avoiding unnecessary byte swapping when checking XDR booleans,
      and

  2.  Not bumping "p" when its value will be immediately replaced by
      xdr_inline_decode()

This commit makes nfs4_decode_dirent() consistent with similar logic
in the other two decode_dirent() functions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2ab8e5cb8f59..009aef9e12bc 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6086,11 +6086,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	__be32 *p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	if (!ntohl(*p++)) {
+	if (*p == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		if (!ntohl(*p++))
+		if (*p == xdr_zero)
 			return -EAGAIN;
 		entry->eof = 1;
 		return -EBADCOOKIE;
@@ -6101,7 +6101,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		goto out_overflow;
 	entry->prev_cookie = entry->cookie;
 	p = xdr_decode_hyper(p, &entry->cookie);
-	entry->len = ntohl(*p++);
+	entry->len = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, entry->len);
 	if (unlikely(!p))
-- 
cgit v1.2.2


From d1205f87bbb8040c1408bbd9e0a720310b2b0b9b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 28 Jan 2011 12:41:05 -0500
Subject: NFS: NFSv4 readdir loses entries

On recent 2.6.38-rc kernels, connectathon basic test 6 fails on
NFSv4 mounts of OpenSolaris with something like:

> ./test6: readdir
> 	./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.12' dir entry, pass 0
> 	./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.82' dir entry, pass 0
> 	./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.164' dir entry, pass 0
> 	./test6: (/mnt/klimt/matisse.test) Test failed with 3 errors
> basic tests failed
> Tests failed, leaving /mnt/klimt mounted
> [cel@matisse cthon04]$

I narrowed the problem down to nfs4_decode_dirent() reporting that the
decode buffer had overflowed while decoding the entries for those
missing files.

verify_attr_len() assumes both it's pointer arguments reside on the
same page.  When these arguments point to locations on two different
pages, verify_attr_len() can report false errors.  This can happen now
that a large NFSv4 readdir result can span pages.

We have reasonably good checking in nfs4_decode_dirent() anyway, so
it should be safe to simply remove the extra checking.

At a guess, this was introduced by commit 6650239a, "NFS: Don't use
vm_map_ram() in readdir".

Cc: stable@kernel.org [2.6.37]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 009aef9e12bc..4e2c168b6ee9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6132,9 +6132,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
 		entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
 
-	if (verify_attr_len(xdr, p, len) < 0)
-		goto out_overflow;
-
 	return 0;
 
 out_overflow:
-- 
cgit v1.2.2


From 6b82ce8d824bd46053e46a895876cde39d9026e4 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 06:21:39 +0000
Subject: btrfs: fix uncheck memory allocation in btrfs_submit_compressed_read

btrfs_submit_compressed_read() is lack of memory allocation checks and
corresponding error route.

After this fix, if it comes to "no memory" case, errno will be returned
to userland step by step, and tell users this operation cannot go on.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 25 +++++++++++++++++++++++--
 fs/btrfs/extent_io.c   |  4 ++--
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f745287fbf2e..3a932f183da1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -562,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
-	int ret;
+	int ret = -ENOMEM;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -577,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+	if (!cb)
+		goto out;
+
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
@@ -597,13 +600,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
 				 PAGE_CACHE_SIZE;
-	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
+	if (!cb->compressed_pages)
+		goto fail1;
+
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	for (page_index = 0; page_index < nr_pages; page_index++) {
 		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
+		if (!cb->compressed_pages[page_index])
+			goto fail2;
 	}
 	cb->nr_pages = nr_pages;
 
@@ -614,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->len = uncompressed_len;
 
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	if (!comp_bio)
+		goto fail2;
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
 	atomic_inc(&cb->pending_bios);
@@ -681,6 +691,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	bio_put(comp_bio);
 	return 0;
+
+fail2:
+	for (page_index = 0; page_index < nr_pages; page_index++)
+		free_page((unsigned long)cb->compressed_pages[page_index]);
+
+	kfree(cb->compressed_pages);
+fail1:
+	kfree(cb);
+out:
+	free_extent_map(em);
+	return ret;
 }
 
 static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8b8d3d99ae68..6411ed6ca449 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1865,7 +1865,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
@@ -2126,7 +2126,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
 				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0, bio_flags);
+		ret = submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 
-- 
cgit v1.2.2


From 2a29edc6b60a5248ccab588e7ba7dad38cef0235 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 06:22:08 +0000
Subject: btrfs: fix several uncheck memory allocations

To make btrfs more stable, add several missing necessary memory allocation
checks, and when no memory, return proper errno.

We've checked that some of those -ENOMEM errors will be returned to
userspace, and some will be catched by BUG_ON() in the upper callers,
and none will be ignored silently.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/export.c    |  2 ++
 fs/btrfs/file-item.c |  2 ++
 fs/btrfs/file.c      |  4 ++++
 fs/btrfs/tree-log.c  | 25 +++++++++++++++++++++++++
 4 files changed, 33 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 6f0444473594..3220ad1aafc8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -176,6 +176,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	int ret;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
 
 	if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
 		key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..d0bc72657cd7 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	root = root->fs_info->csum_root;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	while (1) {
 		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f903433f5bdf..65b2424a4116 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -945,6 +945,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
 		     (sizeof(struct page *)));
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	/* generic_write_checks can change our pos */
 	start_pos = pos;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..c25a41d86118 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 		}
 		dst_copy = kmalloc(item_size, GFP_NOFS);
 		src_copy = kmalloc(item_size, GFP_NOFS);
+		if (!dst_copy || !src_copy) {
+			btrfs_release_path(root, path);
+			kfree(dst_copy);
+			kfree(src_copy);
+			return -ENOMEM;
+		}
 
 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
 
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
 	name_len = btrfs_dir_name_len(leaf, di);
 	name = kmalloc(name_len, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 	btrfs_release_path(root, path);
 
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
 	int match = 0;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 	if (ret != 0)
 		goto out;
@@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 
 	name_len = btrfs_dir_name_len(eb, di);
 	name = kmalloc(name_len, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
 	log_type = btrfs_dir_type(eb, di);
 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
 		   name_len);
@@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		root_owner = btrfs_header_owner(parent);
 
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 
 		if (*level == 1) {
 			wc->process_func(root, next, wc, ptr_gen);
@@ -2194,6 +2213,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 
 	log = root->log_root;
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
 				   name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -2594,6 +2616,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
 			   nr * sizeof(u32), GFP_NOFS);
+	if (!ins_data)
+		return -ENOMEM;
+
 	ins_sizes = (u32 *)ins_data;
 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
 
-- 
cgit v1.2.2


From 333e8105445d4f51101fc3d23199a919d66730b3 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 06:22:33 +0000
Subject: btrfs: fix missing break in switch phrase

There is a missing break in switch, fix it.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/print-tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 #else
 			BUG();
 #endif
+			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
-- 
cgit v1.2.2


From 34d19bada00f4825588b338a8ee193820f9ceeb0 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 24 Jan 2011 19:55:19 +0000
Subject: fs/btrfs/inode.c: Add missing IS_ERR test

After the conditional that precedes the following code, inode may be an
ERR_PTR value.  This can eg result from a memory allocation failure via the
call to btrfs_iget, and thus does not imply that root is different than
sub_root.  Thus, an IS_ERR check is added to ensure that there is no
dereference of inode in this case.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@r@
identifier f;
@@
f(...) { ... return ERR_PTR(...); }

@@
identifier r.f, fld;
expression x;
statement S1,S2;
@@
 x = f(...)
 ... when != IS_ERR(x)
(
 if (IS_ERR(x) ||...) S1 else S2
|
*x->fld
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2c9a2f7d5631..2b7d251d6ad1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4137,7 +4137,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
-	if (root != sub_root) {
+	if (!IS_ERR(inode) && root != sub_root) {
 		down_read(&root->fs_info->cleanup_work_sem);
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			btrfs_orphan_cleanup(sub_root);
-- 
cgit v1.2.2


From 3612b49598c303cfb22a4b609427f829828e2427 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Tue, 25 Jan 2011 02:51:38 +0000
Subject: btrfs: fix return value check of btrfs_join_transaction()

The error check of btrfs_join_transaction()/btrfs_join_transaction_nolock()
is added, and the mistake of the error check in several places is
corrected.

For more stable Btrfs, I think that we should reduce BUG_ON().
But, I think that long time is necessary for this.
So, I propose this patch as a short-term solution.

With this patch:
 - To more stable Btrfs, the part that should be corrected is clarified.
 - The panic isn't done by the NULL pointer reference etc. (even if
   BUG_ON() is increased temporarily)
 - The error code is returned in the place where the error can be easily
   returned.

As a long-term plan:
 - BUG_ON() is reduced by using the forced-readonly framework, etc.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  5 +++++
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/inode.c       | 24 ++++++++++++++++--------
 fs/btrfs/ioctl.c       |  2 +-
 fs/btrfs/relocation.c  | 26 +++++++++++++++++++++++---
 fs/btrfs/transaction.c |  5 +++++
 6 files changed, 51 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2887b8be6fdd..b36eeef19194 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1550,6 +1550,7 @@ static int transaction_kthread(void *arg)
 		spin_unlock(&root->fs_info->new_trans_lock);
 
 		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
 		if (transid == trans->transid) {
 			ret = btrfs_commit_transaction(trans, root);
 			BUG_ON(ret);
@@ -2464,10 +2465,14 @@ int btrfs_commit_super(struct btrfs_root *root)
 	up_write(&root->fs_info->cleanup_work_sem);
 
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bcf303204f7f..98ee139885cc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7478,7 +7478,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
 		BUG_ON(reloc_root->commit_root != NULL);
 		while (1) {
 			trans = btrfs_join_transaction(root, 1);
-			BUG_ON(!trans);
+			BUG_ON(IS_ERR(trans));
 
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, reloc_root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2b7d251d6ad1..40fee137dd11 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -416,7 +416,7 @@ again:
 	}
 	if (start == 0) {
 		trans = btrfs_join_transaction(root, 1);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -612,6 +612,7 @@ retry:
 			    GFP_NOFS);
 
 		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
 		ret = btrfs_reserve_extent(trans, root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
@@ -771,7 +772,7 @@ static noinline int cow_file_range(struct inode *inode,
 
 	BUG_ON(root == root->fs_info->tree_root);
 	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -1049,7 +1050,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	} else {
 		trans = btrfs_join_transaction(root, 1);
 	}
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	cow_start = (u64)-1;
 	cur_offset = start;
@@ -1704,7 +1705,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				trans = btrfs_join_transaction_nolock(root, 1);
 			else
 				trans = btrfs_join_transaction(root, 1);
-			BUG_ON(!trans);
+			BUG_ON(IS_ERR(trans));
 			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
@@ -1721,6 +1722,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		trans = btrfs_join_transaction_nolock(root, 1);
 	else
 		trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -2382,6 +2384,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 
 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
 		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
 		btrfs_end_transaction(trans, root);
 	}
 
@@ -4350,6 +4353,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 			trans = btrfs_join_transaction_nolock(root, 1);
 		else
 			trans = btrfs_join_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
 		btrfs_set_trans_block_group(trans, inode);
 		if (nolock)
 			ret = btrfs_end_transaction_nolock(trans, root);
@@ -4375,6 +4380,7 @@ void btrfs_dirty_inode(struct inode *inode)
 		return;
 
 	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 
 	ret = btrfs_update_inode(trans, root, inode);
@@ -5179,6 +5185,8 @@ again:
 				em = NULL;
 				btrfs_release_path(root, path);
 				trans = btrfs_join_transaction(root, 1);
+				if (IS_ERR(trans))
+					return ERR_CAST(trans);
 				goto again;
 			}
 			map = kmap(page);
@@ -5283,8 +5291,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
 
 	trans = btrfs_join_transaction(root, 0);
-	if (!trans)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(trans))
+		return ERR_CAST(trans);
 
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -5508,7 +5516,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		 * while we look for nocow cross refs
 		 */
 		trans = btrfs_join_transaction(root, 0);
-		if (!trans)
+		if (IS_ERR(trans))
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5643,7 +5651,7 @@ again:
 	BUG_ON(!ordered);
 
 	trans = btrfs_join_transaction(root, 1);
-	if (!trans) {
+	if (IS_ERR(trans)) {
 		err = -ENOMEM;
 		goto out;
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index edd82becbb9e..04b4fb9144a9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -203,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 
 	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..ea9965430241 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2147,6 +2147,12 @@ again:
 	}
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
+	if (IS_ERR(trans)) {
+		if (!err)
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+		return PTR_ERR(trans);
+	}
 
 	if (!err) {
 		if (num_bytes != rc->merging_rsv_size) {
@@ -3222,6 +3228,7 @@ truncate:
 	trans = btrfs_join_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
+		ret = PTR_ERR(trans);
 		goto out;
 	}
 
@@ -3628,6 +3635,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
+	BUG_ON(IS_ERR(trans));
 	btrfs_commit_transaction(trans, rc->extent_root);
 	return 0;
 }
@@ -3804,7 +3812,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 	/* get rid of pinned extents */
 	trans = btrfs_join_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else
+		btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
 	btrfs_free_path(path);
@@ -4125,6 +4136,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
+	if (IS_ERR(trans)) {
+		unset_reloc_control(rc);
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
 
 	rc->merge_reloc_tree = 1;
 
@@ -4154,9 +4170,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	unset_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
-out:
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else
+		btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
 	kfree(rc);
+out:
 	while (!list_empty(&reloc_roots)) {
 		reloc_root = list_entry(reloc_roots.next,
 					struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bae5c7b8bbe2..3d73c8d93bbb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1161,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	INIT_DELAYED_WORK(&ac->work, do_async_commit);
 	ac->root = root;
 	ac->newtrans = btrfs_join_transaction(root, 0);
+	if (IS_ERR(ac->newtrans)) {
+		int err = PTR_ERR(ac->newtrans);
+		kfree(ac);
+		return err;
+	}
 
 	/* take transaction reference */
 	mutex_lock(&root->fs_info->trans_mutex);
-- 
cgit v1.2.2


From abd30bb0af9d4671506502278e8631bed9e3c35c Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 24 Jan 2011 00:57:10 +0000
Subject: btrfs: check return value of btrfs_start_ioctl_transaction() properly

btrfs_start_ioctl_transaction() returns ERR_PTR(), not NULL.
So, it is necessary to use IS_ERR() to check the return value.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 04b4fb9144a9..12dabe28cf54 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2085,7 +2085,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 
 	ret = -ENOMEM;
 	trans = btrfs_start_ioctl_transaction(root, 0);
-	if (!trans)
+	if (IS_ERR(trans))
 		goto out_drop;
 
 	file->private_data = trans;
-- 
cgit v1.2.2


From dedefd7215d3ec451291ca393e5c8e4c1882c8c6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 24 Jan 2011 21:43:18 +0000
Subject: Btrfs: fix check_path_shared so it returns the right value

When running xfstests 224 I kept getting ENOSPC when trying to remove the files,
and this is because we were returning ret from check_path_shared while it was
uninitalized, which isn't right.  Fix this to return 0 properly, and now
xfstests 224 doesn't freak out when it tries to clean itself up.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 40fee137dd11..5621818921f8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2718,9 +2718,10 @@ static int check_path_shared(struct btrfs_root *root,
 	struct extent_buffer *eb;
 	int level;
 	u64 refs = 1;
-	int uninitialized_var(ret);
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		int ret;
+
 		if (!path->nodes[level])
 			break;
 		eb = path->nodes[level];
@@ -2731,7 +2732,7 @@ static int check_path_shared(struct btrfs_root *root,
 		if (refs > 1)
 			return 1;
 	}
-	return ret; /* XXX callers? */
+	return 0;
 }
 
 /*
-- 
cgit v1.2.2


From e9e22899de661af94cb9995885fd04e4c738838b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 24 Jan 2011 21:43:19 +0000
Subject: Btrfs: do not release more reserved bytes to the global_block_rsv
 than we need

When we do btrfs_block_rsv_release, if global_block_rsv is not full we will
release all the extra bytes to global_block_rsv, even if it's only a little
short of the amount of space that we need to reserve.  This causes us to starve
ourselves of reservable space during the transaction which will force us to
shrink delalloc bytes and commit the transaction more often than we should.  So
instead just add the amount of bytes we need to add to the global reserve so
reserved == size, and then add the rest back into the space_info for general
use.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 98ee139885cc..7af618dcf2c0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3589,8 +3589,20 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 
 	if (num_bytes > 0) {
 		if (dest) {
-			block_rsv_add_bytes(dest, num_bytes, 0);
-		} else {
+			spin_lock(&dest->lock);
+			if (!dest->full) {
+				u64 bytes_to_add;
+
+				bytes_to_add = dest->size - dest->reserved;
+				bytes_to_add = min(num_bytes, bytes_to_add);
+				dest->reserved += bytes_to_add;
+				if (dest->reserved >= dest->size)
+					dest->full = 1;
+				num_bytes -= bytes_to_add;
+			}
+			spin_unlock(&dest->lock);
+		}
+		if (num_bytes) {
 			spin_lock(&space_info->lock);
 			space_info->bytes_reserved -= num_bytes;
 			spin_unlock(&space_info->lock);
-- 
cgit v1.2.2


From 68a82277b8619e6d0f2738b1d9b160b627e81e92 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 24 Jan 2011 21:43:20 +0000
Subject: Btrfs: use the global block reserve if we cannot reserve space

We call use_block_rsv right before we make an allocation in order to make sure
we have enough space.  Now normally people have called btrfs_start_transaction()
with the appropriate amount of space that we need, so we just use some of that
pre-reserved space and move along happily.  The problem is where people use
btrfs_join_transaction(), which doesn't actually reserve any space.  So we try
and reserve space here, but we cannot flush delalloc, so this forces us to
return -ENOSPC when in reality we have plenty of space.  The most common symptom
is seeing a bunch of "couldn't dirty inode" messages in syslog.  With
xfstests 224 we end up falling back to start_transaction and then doing all the
flush delalloc stuff which causes to hang for a very long time.

So instead steal from the global reserve, which is what this is meant for
anyway.  With this patch and the other 2 I have sent xfstests 224 now passes
successfully.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7af618dcf2c0..ff6bbfd75cf7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5646,6 +5646,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	      struct btrfs_root *root, u32 blocksize)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	int ret;
 
 	block_rsv = get_block_rsv(trans, root);
@@ -5653,14 +5654,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (block_rsv->size == 0) {
 		ret = reserve_metadata_bytes(trans, root, block_rsv,
 					     blocksize, 0);
-		if (ret)
+		/*
+		 * If we couldn't reserve metadata bytes try and use some from
+		 * the global reserve.
+		 */
+		if (ret && block_rsv != global_rsv) {
+			ret = block_rsv_use_bytes(global_rsv, blocksize);
+			if (!ret)
+				return global_rsv;
+			return ERR_PTR(ret);
+		} else if (ret) {
 			return ERR_PTR(ret);
+		}
 		return block_rsv;
 	}
 
 	ret = block_rsv_use_bytes(block_rsv, blocksize);
 	if (!ret)
 		return block_rsv;
+	if (ret) {
+		WARN_ON(1);
+		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
+					     0);
+		if (!ret) {
+			spin_lock(&block_rsv->lock);
+			block_rsv->size += blocksize;
+			spin_unlock(&block_rsv->lock);
+			return block_rsv;
+		} else if (ret && block_rsv != global_rsv) {
+			ret = block_rsv_use_bytes(global_rsv, blocksize);
+			if (!ret)
+				return global_rsv;
+		}
+	}
 
 	return ERR_PTR(-ENOSPC);
 }
-- 
cgit v1.2.2


From ad0397a7a97f55fd7f70998ec208c5d8b90310ff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 28 Jan 2011 18:44:44 +0000
Subject: Btrfs: do error checking in btrfs_del_csums

Got a report of a box panicing because we got a NULL eb in read_extent_buffer.
His fs was borked and btrfs_search_path returned EIO, but we don't check for
errors so the box paniced.  Yes I know this will just make something higher up
the stack panic, but that's a problem for future Josef.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d0bc72657cd7..4f19a3e1bf32 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -550,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 			if (path->slots[0] == 0)
 				goto out;
 			path->slots[0]--;
+		} else if (ret < 0) {
+			goto out;
 		}
+
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
-- 
cgit v1.2.2


From 7adf5dfbb3af65a00e20b3ead224c3a1b40e4ec4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 25 Jan 2011 22:11:54 +0000
Subject: Btrfs: handle no memory properly in prepare_pages

Instead of doing a BUG_ON(1) in prepare_pages if grab_cache_page() fails, just
loop through the pages we've already grabbed and unlock and release them, then
return -ENOMEM like we should.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65b2424a4116..9e097fbfc78d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -792,8 +792,12 @@ again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
-			err = -ENOMEM;
-			BUG_ON(1);
+			int c;
+			for (c = i - 1; c >= 0; c--) {
+				unlock_page(pages[c]);
+				page_cache_release(pages[c]);
+			}
+			return -ENOMEM;
 		}
 		wait_on_page_writeback(pages[i]);
 	}
-- 
cgit v1.2.2


From af5eb745efe97d91d2cbe793029838b3311c15da Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <anton@tuxera.com>
Date: Fri, 28 Jan 2011 20:45:28 +0000
Subject: NTFS: Fix invalid pointer dereference in ntfs_mft_record_alloc().

In ntfs_mft_record_alloc() when mapping the new extent mft record with
map_extent_mft_record() we overwrite @m with the return value and on
error, we then try to use the old @m but that is no longer there as @m
now contains an error code instead so we crash when dereferencing the
error code as if it were a pointer.

The simple fix is to use a temporary variable to store the return value
thus preserving the original @m for later use.  This is a backport from
the commercial Tuxera-NTFS driver and is well tested...

Thanks go to Julia Lawall for pointing this out (whilst I had fixed it
in the commercial driver I had failed to fix it in the Linux kernel).

Signed-off-by: Anton Altaparmakov <anton@tuxera.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/mft.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b6727181..326e7475a22a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
 /**
  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
 	flush_dcache_page(page);
 	SetPageUptodate(page);
 	if (base_ni) {
+		MFT_RECORD *m_tmp;
+
 		/*
 		 * Setup the base mft record in the extent mft record.  This
 		 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
 		 * attach it to the base inode @base_ni and map, pin, and lock
 		 * its, i.e. the allocated, mft record.
 		 */
-		m = map_extent_mft_record(base_ni, bit, &ni);
-		if (IS_ERR(m)) {
+		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
+		if (IS_ERR(m_tmp)) {
 			ntfs_error(vol->sb, "Failed to map allocated extent "
 					"mft record 0x%llx.", (long long)bit);
-			err = PTR_ERR(m);
+			err = PTR_ERR(m_tmp);
 			/* Set the mft record itself not in use. */
 			m->flags &= cpu_to_le16(
 					~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
 			ntfs_unmap_page(page);
 			goto undo_mftbmp_alloc;
 		}
+		BUG_ON(m != m_tmp);
 		/*
 		 * Make sure the allocated mft record is written out to disk.
 		 * No need to set the inode dirty because the caller is going
-- 
cgit v1.2.2


From ffeb414a59291d5891f09727beb793c109f19f08 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 29 Jan 2011 07:03:02 -0500
Subject: cifs: fix two compiler warning about uninitialized vars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fs/cifs/link.c: In function ‘symlink_hash’:
fs/cifs/link.c:58:3: warning: ‘rc’ may be used uninitialized in this
function [-Wuninitialized]

fs/cifs/smbencrypt.c: In function ‘mdfour’:
fs/cifs/smbencrypt.c:61:3: warning: ‘rc’ may be used uninitialized in this
function [-Wuninitialized]

Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/link.c       | 3 ++-
 fs/cifs/smbencrypt.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 02cd60aefbff..e8804d373404 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -55,8 +55,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
 
 	md5 = crypto_alloc_shash("md5", 0, 0);
 	if (IS_ERR(md5)) {
+		rc = PTR_ERR(md5);
 		cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
-		return PTR_ERR(md5);
+		return rc;
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
 	sdescmd5 = kmalloc(size, GFP_KERNEL);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5450e9f40c0..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -58,8 +58,9 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
 
 	md4 = crypto_alloc_shash("md4", 0, 0);
 	if (IS_ERR(md4)) {
+		rc = PTR_ERR(md4);
 		cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
-		return PTR_ERR(md4);
+		return rc;
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
 	sdescmd4 = kmalloc(size, GFP_KERNEL);
-- 
cgit v1.2.2


From 1be912dde772b77aaaa21770eeabb0a7a5e297a6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 07:08:28 -0500
Subject: cifs: handle cancelled requests better

Currently, when a request is cancelled via signal, we delete the mid
immediately. If the request was already transmitted however, the client
is still likely to receive a response. When it does, it won't recognize
it however and will pop a printk.

It's also a little dangerous to just delete the mid entry like this. We
may end up reusing that mid. If we do then we could potentially get the
response from the first request confused with the later one.

Prevent the reuse of mids by marking them as cancelled and keeping them
on the pending_mid_q list. If the reply comes in, we'll delete it from
the list then. If it never comes, then we'll delete it at reconnect
or when cifsd comes down.

Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 43 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1ccca1a933f..9b2d0373a8a7 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -579,8 +579,17 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		goto out;
 
 	rc = wait_for_response(ses->server, midQ);
-	if (rc != 0)
-		goto out;
+	if (rc != 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->midState == MID_REQUEST_SUBMITTED) {
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			atomic_dec(&ses->server->inFlight);
+			wake_up(&ses->server->request_q);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
 
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
@@ -724,8 +733,18 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		goto out;
 
 	rc = wait_for_response(ses->server, midQ);
-	if (rc != 0)
-		goto out;
+	if (rc != 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->midState == MID_REQUEST_SUBMITTED) {
+			/* no longer considered to be "in-flight" */
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			atomic_dec(&ses->server->inFlight);
+			wake_up(&ses->server->request_q);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
 
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
@@ -922,10 +941,20 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 			}
 		}
 
-		if (wait_for_response(ses->server, midQ) == 0) {
-			/* We got the response - restart system call. */
-			rstart = 1;
+		rc = wait_for_response(ses->server, midQ);
+		if (rc) {
+			spin_lock(&GlobalMid_Lock);
+			if (midQ->midState == MID_REQUEST_SUBMITTED) {
+				/* no longer considered to be "in-flight" */
+				midQ->callback = DeleteMidQEntry;
+				spin_unlock(&GlobalMid_Lock);
+				return rc;
+			}
+			spin_unlock(&GlobalMid_Lock);
 		}
+
+		/* We got the response - restart system call. */
+		rstart = 1;
 	}
 
 	rc = sync_mid_result(midQ, ses->server);
-- 
cgit v1.2.2


From 2db7c5815555d8daabf7d4ab1253ce690852c140 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 07:08:28 -0500
Subject: cifs: send an NT_CANCEL request when a process is signalled

Use the new send_nt_cancel function to send an NT_CANCEL when the
process is delivered a fatal signal. This is a "best effort" enterprise
however, so don't bother to check the return code. There's nothing we
can reasonably do if it fails anyway.

Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9b2d0373a8a7..bdaa4aa58b03 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -570,20 +570,25 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #endif
 
 	mutex_unlock(&ses->server->srv_mutex);
-	cifs_small_buf_release(in_buf);
 
-	if (rc < 0)
+	if (rc < 0) {
+		cifs_small_buf_release(in_buf);
 		goto out;
+	}
 
-	if (long_op == CIFS_ASYNC_OP)
+	if (long_op == CIFS_ASYNC_OP) {
+		cifs_small_buf_release(in_buf);
 		goto out;
+	}
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
+		send_nt_cancel(ses->server, in_buf, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			midQ->callback = DeleteMidQEntry;
 			spin_unlock(&GlobalMid_Lock);
+			cifs_small_buf_release(in_buf);
 			atomic_dec(&ses->server->inFlight);
 			wake_up(&ses->server->request_q);
 			return rc;
@@ -591,6 +596,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		spin_unlock(&GlobalMid_Lock);
 	}
 
+	cifs_small_buf_release(in_buf);
+
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
 		atomic_dec(&ses->server->inFlight);
@@ -734,6 +741,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
+		send_nt_cancel(ses->server, in_buf, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			/* no longer considered to be "in-flight" */
@@ -943,6 +951,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 		rc = wait_for_response(ses->server, midQ);
 		if (rc) {
+			send_nt_cancel(ses->server, in_buf, midQ);
 			spin_lock(&GlobalMid_Lock);
 			if (midQ->midState == MID_REQUEST_SUBMITTED) {
 				/* no longer considered to be "in-flight" */
-- 
cgit v1.2.2


From 68abaffa6bbd3cadfaa4b7216d10bcd32406090b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 15:05:42 -0500
Subject: cifs: simplify SMB header check routine

...just cleanup. There should be no behavior change.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a09e077ba925..72e99ece78cf 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -381,29 +381,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 }
 
 static int
-checkSMBhdr(struct smb_hdr *smb, __u16 mid)
+check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 {
-	/* Make sure that this really is an SMB, that it is a response,
-	   and that the message ids match */
-	if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) &&
-		(mid == smb->Mid)) {
-		if (smb->Flags & SMBFLG_RESPONSE)
-			return 0;
-		else {
-		/* only one valid case where server sends us request */
-			if (smb->Command == SMB_COM_LOCKING_ANDX)
-				return 0;
-			else
-				cERROR(1, "Received Request not response");
-		}
-	} else { /* bad signature or mid */
-		if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
-			cERROR(1, "Bad protocol string signature header %x",
-				*(unsigned int *) smb->Protocol);
-		if (mid != smb->Mid)
-			cERROR(1, "Mids do not match");
+	/* does it have the right SMB "signature" ? */
+	if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
+		cERROR(1, "Bad protocol string signature header 0x%x",
+			*(unsigned int *)smb->Protocol);
+		return 1;
 	}
-	cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
+
+	/* Make sure that message ids match */
+	if (mid != smb->Mid) {
+		cERROR(1, "Mids do not match. received=%u expected=%u",
+			smb->Mid, mid);
+		return 1;
+	}
+
+	/* if it's a response then accept */
+	if (smb->Flags & SMBFLG_RESPONSE)
+		return 0;
+
+	/* only one valid case where server sends us request */
+	if (smb->Command == SMB_COM_LOCKING_ANDX)
+		return 0;
+
+	cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
 	return 1;
 }
 
@@ -448,7 +450,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 		return 1;
 	}
 
-	if (checkSMBhdr(smb, mid))
+	if (check_smb_hdr(smb, mid))
 		return 1;
 	clc_len = smbCalcSize_LE(smb);
 
-- 
cgit v1.2.2


From d804d41d163c0975d2890c82d7135ada7a2f23a4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 15:05:43 -0500
Subject: cifs: don't pop a printk when sending on a socket is interrupted

If we kill the process while it's sending on a socket then the
kernel_sendmsg will return -EINTR. This is normal. No need to spam the
ring buffer with this info.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bdaa4aa58b03..b8c5e2eb43d0 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		server->tcpStatus = CifsNeedReconnect;
 	}
 
-	if (rc < 0) {
+	if (rc < 0 && rc != -EINTR)
 		cERROR(1, "Error %d sending data on socket to server", rc);
-	} else
+	else
 		rc = 0;
 
 	/* Don't want to modify the buffer as a
-- 
cgit v1.2.2


From 92a4e0f0169498867ecb19c2244510dd4beba149 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 29 Jan 2011 07:02:28 -0500
Subject: cifs: force a reconnect if there are too many MIDs in flight

Currently, we allow the pending_mid_q to grow without bound with
SIGKILL'ed processes. This could eventually be a DoS'able problem. An
unprivileged user could a process that does a long-running call and then
SIGKILL it.

If he can also intercept the NT_CANCEL calls or the replies from the
server, then the pending_mid_q could grow very large, possibly even to
2^16 entries which might leave GetNextMid in an infinite loop. Fix this
by imposing a hard limit of 32k calls per server. If we cross that
limit, set the tcpStatus to CifsNeedReconnect to force cifsd to
eventually reconnect the socket and clean out the pending_mid_q.

While we're at it, clean up the function a bit and eliminate an
unnecessary NULL pointer check.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 72e99ece78cf..24f0a9d97ad8 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
 {
 	__u16 mid = 0;
 	__u16 last_mid;
-	int   collision;
-
-	if (server == NULL)
-		return mid;
+	bool collision;
 
 	spin_lock(&GlobalMid_Lock);
 	last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
 	(and it would also have to have been a request that
 	 did not time out) */
 	while (server->CurrentMid != last_mid) {
-		struct list_head *tmp;
 		struct mid_q_entry *mid_entry;
+		unsigned int num_mids;
 
-		collision = 0;
+		collision = false;
 		if (server->CurrentMid == 0)
 			server->CurrentMid++;
 
-		list_for_each(tmp, &server->pending_mid_q) {
-			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-
-			if ((mid_entry->mid == server->CurrentMid) &&
-			    (mid_entry->midState == MID_REQUEST_SUBMITTED)) {
+		num_mids = 0;
+		list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+			++num_mids;
+			if (mid_entry->mid == server->CurrentMid &&
+			    mid_entry->midState == MID_REQUEST_SUBMITTED) {
 				/* This mid is in use, try a different one */
-				collision = 1;
+				collision = true;
 				break;
 			}
 		}
-		if (collision == 0) {
+
+		/*
+		 * if we have more than 32k mids in the list, then something
+		 * is very wrong. Possibly a local user is trying to DoS the
+		 * box by issuing long-running calls and SIGKILL'ing them. If
+		 * we get to 2^16 mids then we're in big trouble as this
+		 * function could loop forever.
+		 *
+		 * Go ahead and assign out the mid in this situation, but force
+		 * an eventual reconnect to clean out the pending_mid_q.
+		 */
+		if (num_mids > 32768)
+			server->tcpStatus = CifsNeedReconnect;
+
+		if (!collision) {
 			mid = server->CurrentMid;
 			break;
 		}
-- 
cgit v1.2.2


From f855f6cbeb4f94cd4e4a225c2246ee8012c384a2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jan 2011 08:41:36 -0500
Subject: cifs: make CIFS depend on CRYPTO_MD4

Recently CIFS was changed to use the kernel crypto API for MD4 hashes,
but the Kconfig dependencies were not changed to reflect this.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reported-and-Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1a..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
 	depends on INET
 	select NLS
 	select CRYPTO
+	select CRYPTO_MD4
 	select CRYPTO_MD5
 	select CRYPTO_HMAC
 	select CRYPTO_ARC4
-- 
cgit v1.2.2


From 31c2659d78c8be970833bc1e633593d291553ed3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jan 2011 07:24:46 -0500
Subject: cifs: clean up some compiler warnings

New compiler warnings that I noticed when building a patchset based
on recent Fedora kernel:

fs/cifs/cifssmb.c: In function 'CIFSSMBSetFileSize':
fs/cifs/cifssmb.c:4813:8: warning: variable 'data_offset' set but not used
[-Wunused-but-set-variable]

fs/cifs/file.c: In function 'cifs_open':
fs/cifs/file.c:349:24: warning: variable 'pCifsInode' set but not used
[-Wunused-but-set-variable]
fs/cifs/file.c: In function 'cifs_partialpagewrite':
fs/cifs/file.c:1149:23: warning: variable 'cifs_sb' set but not used
[-Wunused-but-set-variable]
fs/cifs/file.c: In function 'cifs_iovec_write':
fs/cifs/file.c:1740:9: warning: passing argument 6 of 'CIFSSMBWrite2' from
incompatible pointer type [enabled by default]
fs/cifs/cifsproto.h:337:12: note: expected 'unsigned int *' but argument is
of type 'size_t *'

fs/cifs/readdir.c: In function 'cifs_readdir':
fs/cifs/readdir.c:767:23: warning: variable 'cifs_sb' set but not used
[-Wunused-but-set-variable]

fs/cifs/cifs_dfs_ref.c: In function 'cifs_dfs_d_automount':
fs/cifs/cifs_dfs_ref.c:342:2: warning: 'rc' may be used uninitialized in
this function [-Wuninitialized]
fs/cifs/cifs_dfs_ref.c:278:6: note: 'rc' was declared here

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 9 ++++-----
 fs/cifs/cifssmb.c      | 3 ---
 fs/cifs/file.c         | 8 ++------
 fs/cifs/readdir.c      | 3 ---
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f1c68629f277..0a265ad9e426 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	cFYI(1, "in %s", __func__);
 	BUG_ON(IS_ROOT(mntpt));
 
-	xid = GetXid();
-
 	/*
 	 * The MSDFS spec states that paths in DFS referral requests and
 	 * responses must be prefixed by a single '\' character instead of
@@ -293,7 +291,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	mnt = ERR_PTR(-ENOMEM);
 	full_path = build_path_from_dentry(mntpt);
 	if (full_path == NULL)
-		goto free_xid;
+		goto cdda_exit;
 
 	cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -303,9 +301,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	}
 	ses = tlink_tcon(tlink)->ses;
 
+	xid = GetXid();
 	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
 		&num_referrals, &referrals,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	FreeXid(xid);
 
 	cifs_put_tlink(tlink);
 
@@ -338,8 +338,7 @@ success:
 	free_dfs_info_array(referrals, num_referrals);
 free_full_path:
 	kfree(full_path);
-free_xid:
-	FreeXid(xid);
+cdda_exit:
 	cFYI(1, "leaving %s" , __func__);
 	return mnt;
 }
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3106f5e5c633..46c66ed01af4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4914,7 +4914,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 		   __u16 fid, __u32 pid_of_opener, bool SetAllocation)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
-	char *data_offset;
 	struct file_end_of_file_info *parm_data;
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
@@ -4938,8 +4937,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
 	offset = param_offset + params;
 
-	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
-
 	count = sizeof(struct file_end_of_file_info);
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0de17c1db608..74c0a282d012 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
 	struct cifsTconInfo *tcon;
 	struct tcon_link *tlink;
 	struct cifsFileInfo *pCifsFile = NULL;
-	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
 	bool posix_open_ok = false;
 	__u16 netfid;
@@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 	tcon = tlink_tcon(tlink);
 
-	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
@@ -1146,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	char *write_data;
 	int rc = -EFAULT;
 	int bytes_written = 0;
-	struct cifs_sb_info *cifs_sb;
 	struct inode *inode;
 	struct cifsFileInfo *open_file;
 
@@ -1154,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 		return -EFAULT;
 
 	inode = page->mapping->host;
-	cifs_sb = CIFS_SB(inode->i_sb);
 
 	offset += (loff_t)from;
 	write_data = kmap(page);
@@ -1667,7 +1662,8 @@ static ssize_t
 cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
-	size_t total_written = 0, written = 0;
+	size_t total_written = 0;
+	unsigned int written = 0;
 	unsigned long num_pages, npages;
 	size_t copied, len, cur_len, i;
 	struct kvec *to_send;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d2256..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
 	int rc = 0;
 	int xid, i;
-	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 	struct cifsFileInfo *cifsFile = NULL;
 	char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
 	/*
 	 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
 	 * '..'. Otherwise we won't be able to notify VFS in case of failure.
-- 
cgit v1.2.2


From 7a8587e7c8e4e32ba778bfbbb822a0a7e8d5f3e3 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Sat, 29 Jan 2011 13:54:58 -0600
Subject: cifs: No need to check crypto blockcipher allocation

Missed one change as per earlier suggestion.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 0db5f1de0227..a51585f9852b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -657,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses)
 	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
 
 	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
-	if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+	if (IS_ERR(tfm_arc4)) {
+		rc = PTR_ERR(tfm_arc4);
 		cERROR(1, "could not allocate crypto API arc4\n");
-		return PTR_ERR(tfm_arc4);
+		return rc;
 	}
 
 	desc.tfm = tfm_arc4;
-- 
cgit v1.2.2


From b1953bcec95c189b1eea690a08e89646d7750bda Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 21 Jan 2011 21:10:01 +0000
Subject: Btrfs: make shrink_delalloc a little friendlier

Xfstests 224 will just sit there and spin for ever until eventually we give up
flushing delalloc and exit.  On my box this took several hours.  I could not
interrupt this process either, even though we use INTERRUPTIBLE.  So do 2 things

1) Keep us from looping over and over again without reclaiming anything
2) If we get interrupted exit the loop

I tested this and the test now exits in a reasonable amount of time, and can be
interrupted with ctrl+c.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ff6bbfd75cf7..f96641a93fc9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3345,8 +3345,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
+	long time_left;
 	int pause = 1;
 	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+	int loops = 0;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
@@ -3359,7 +3361,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 
 	max_reclaim = min(reserved, to_reclaim);
 
-	while (1) {
+	while (loops < 1024) {
 		/* have the flusher threads jump in and do some IO */
 		smp_mb();
 		nr_pages = min_t(unsigned long, nr_pages,
@@ -3367,8 +3369,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved)
+		if (reserved > space_info->bytes_reserved) {
+			loops = 0;
 			reclaimed += reserved - space_info->bytes_reserved;
+		} else {
+			loops++;
+		}
 		reserved = space_info->bytes_reserved;
 		spin_unlock(&space_info->lock);
 
@@ -3379,7 +3385,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 			return -EAGAIN;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(pause);
+		time_left = schedule_timeout(pause);
+
+		/* We were interrupted, exit */
+		if (time_left)
+			break;
+
 		pause <<= 1;
 		if (pause > HZ / 10)
 			pause = HZ / 10;
-- 
cgit v1.2.2


From b31eabd86eb68d3c217e6821078249bc045e698a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 31 Jan 2011 16:48:24 -0500
Subject: Btrfs: catch errors from btrfs_sync_log

btrfs_sync_log returns -EAGAIN when we need full transaction commits
instead of small log commits, but sometimes we were dropping the return
value.

In practice, we check for this a few different ways, but this is still a
bug that can leave off full log commits when we really need them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c25a41d86118..42dfc3077040 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2051,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
+		ret = 0;
 		goto out;
 	}
 	atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2115,7 +2116,7 @@ out:
 	smp_mb();
 	if (waitqueue_active(&root->log_commit_wait[index1]))
 		wake_up(&root->log_commit_wait[index1]);
-	return 0;
+	return ret;
 }
 
 static void free_log_tree(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.2


From cab6958da0094e36a098751f844409fc9ee26251 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 31 Jan 2011 21:56:35 +0000
Subject: [CIFS] Update cifs minor version

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 14789a97304e..4a3330235d55 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.69"
+#define CIFS_VERSION   "1.70"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.2


From 6284644e8de1f4005166c918c3d2aa4c510ab9f6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jan 2011 09:14:17 -0500
Subject: cifs: fix length checks in checkSMB

The cERROR message in checkSMB when the calculated length doesn't match
the RFC1001 length is incorrect in many cases. It always says that the
RFC1001 length is bigger than the SMB, even when it's actually the
reverse.

Fix the error message to say the reverse of what it does now when the
SMB length goes beyond the end of the received data. Also, clarify the
error message when the RFC length is too big. Finally, clarify the
comments to show that the 512 byte limit on extra data at the end of
the packet is arbitrary.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 24f0a9d97ad8..2a930a752a78 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -478,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 			if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
 				return 0; /* bcc wrapped */
 		}
-		cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
+		cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
 				clc_len, 4 + len, smb->Mid);
-		/* Windows XP can return a few bytes too much, presumably
-		an illegal pad, at the end of byte range lock responses
-		so we allow for that three byte pad, as long as actual
-		received length is as long or longer than calculated length */
-		/* We have now had to extend this more, since there is a
-		case in which it needs to be bigger still to handle a
-		malformed response to transact2 findfirst from WinXP when
-		access denied is returned and thus bcc and wct are zero
-		but server says length is 0x21 bytes too long as if the server
-		forget to reset the smb rfc1001 length when it reset the
-		wct and bcc to minimum size and drop the t2 parms and data */
-		if ((4+len > clc_len) && (len <= clc_len + 512))
-			return 0;
-		else {
-			cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
+
+		if (4 + len < clc_len) {
+			cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
 					len, smb->Mid);
 			return 1;
+		} else if (len > clc_len + 512) {
+			/*
+			 * Some servers (Windows XP in particular) send more
+			 * data than the lengths in the SMB packet would
+			 * indicate on certain calls (byte range locks and
+			 * trans2 find first calls in particular). While the
+			 * client can handle such a frame by ignoring the
+			 * trailing data, we choose limit the amount of extra
+			 * data to 512 bytes.
+			 */
+			cERROR(1, "RFC1001 size %u more than 512 bytes larger "
+				  "than SMB for mid=%u", len, smb->Mid);
+			return 1;
 		}
 	}
 	return 0;
-- 
cgit v1.2.2


From c87fb6fdcaf7560940b31a0c78c3e6370e3433cf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 31 Jan 2011 19:54:59 -0500
Subject: Btrfs: avoid uninit variable warnings in ordered-data.c

This one isn't really an uninit variable, but for pretty
obscure reasons.  Let's make it clearly correct.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2b61e1ddcd99..083a55477375 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 					  u64 file_offset)
 {
 	struct rb_root *root = &tree->tree;
-	struct rb_node *prev;
+	struct rb_node *prev = NULL;
 	struct rb_node *ret;
 	struct btrfs_ordered_extent *entry;
 
-- 
cgit v1.2.2


From 5df67083488ccbad925f583b698ab38f8629a016 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Tue, 1 Feb 2011 09:17:35 +0000
Subject: btrfs: checking NULL or not in some functions

Because NULL is returned when the memory allocation fails,
it is checked whether it is NULL.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 fs/btrfs/extent_io.c   | 2 ++
 fs/btrfs/tree-log.c    | 6 ++++++
 3 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f96641a93fc9..9de4ff03882a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6496,6 +6496,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
 	int ret = 0;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
 
 	mutex_lock(&inode->i_mutex);
 	first_index = start >> PAGE_CACHE_SHIFT;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6411ed6ca449..8862dda46ff6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1920,6 +1920,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		nr = bio_get_nr_vecs(bdev);
 
 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio)
+		return -ENOMEM;
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 42dfc3077040..6d66e5caff97 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2751,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	log = root->log_root;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 	dst_path = btrfs_alloc_path();
+	if (!dst_path) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
 
 	min_key.objectid = inode->i_ino;
 	min_key.type = BTRFS_INODE_ITEM_KEY;
-- 
cgit v1.2.2


From 98d5dc13e7e74b77ca3b4c3cbded9f48d2dbbbb7 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 20 Jan 2011 06:19:37 +0000
Subject: btrfs: fix return value check of btrfs_start_transaction()

The error check of btrfs_start_transaction() is added, and the mistake
of the error check on several places is corrected.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  7 +++++--
 fs/btrfs/inode.c       |  1 +
 fs/btrfs/ioctl.c       | 10 ++++++++--
 fs/btrfs/relocation.c  |  3 +++
 fs/btrfs/super.c       |  2 ++
 fs/btrfs/tree-log.c    |  1 +
 fs/btrfs/volumes.c     | 19 +++++++++++++++++--
 7 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9de4ff03882a..f07ba21cbf06 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6271,6 +6271,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	BUG_ON(!wc);
 
 	trans = btrfs_start_transaction(tree_root, 0);
+	BUG_ON(IS_ERR(trans));
+
 	if (block_rsv)
 		trans->block_rsv = block_rsv;
 
@@ -6368,6 +6370,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 
 			btrfs_end_transaction_throttle(trans, tree_root);
 			trans = btrfs_start_transaction(tree_root, 0);
+			BUG_ON(IS_ERR(trans));
 			if (block_rsv)
 				trans->block_rsv = block_rsv;
 		}
@@ -7587,7 +7590,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
 
 	if (found) {
 		trans = btrfs_start_transaction(root, 1);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 		ret = btrfs_commit_transaction(trans, root);
 		BUG_ON(ret);
 	}
@@ -7831,7 +7834,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 
 
 	trans = btrfs_start_transaction(extent_root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	if (extent_key->objectid == 0) {
 		ret = del_extent_zero(trans, extent_root, path, extent_key);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5621818921f8..36bc3f49ebf9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2357,6 +2357,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 */
 		if (is_bad_inode(inode)) {
 			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
 			btrfs_orphan_del(trans, inode);
 			btrfs_end_transaction(trans, root);
 			iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 12dabe28cf54..02d224e8c83f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -907,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out_unlock;
+		}
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -2141,9 +2145,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	path->leave_spinning = 1;
 
 	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
+	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
-		return -ENOMEM;
+		return PTR_ERR(trans);
 	}
 
 	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2337,6 +2341,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
 	u64 transid;
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	transid = trans->transid;
 	btrfs_commit_transaction_async(trans, root, 0);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ea9965430241..1f5556acb530 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2028,6 +2028,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
+		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = rc->block_rsv;
 
 		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -3665,6 +3666,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 	while (1) {
 		trans = btrfs_start_transaction(rc->extent_root, 0);
+		BUG_ON(IS_ERR(trans));
 
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
@@ -4033,6 +4035,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
 	int ret;
 
 	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
+	BUG_ON(IS_ERR(trans));
 
 	memset(&root->root_item.drop_progress, 0,
 		sizeof(root->root_item.drop_progress));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f4e45fdded30..0209b5fc772c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -623,6 +623,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_wait_ordered_extents(root, 0, 0);
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d66e5caff97..a4bbb854dfd2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3112,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	BUG_ON(!path);
 
 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
+	BUG_ON(IS_ERR(trans));
 
 	wc.trans = trans;
 	wc.pin = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2d2f4ccc738..7cad59353b09 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1212,6 +1212,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 		return -ENOMEM;
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
@@ -1604,6 +1608,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		kfree(device);
+		ret = PTR_ERR(trans);
+		goto error;
+	}
+
 	lock_chunks(root);
 
 	device->barriers = 1;
@@ -1872,7 +1882,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 		return ret;
 
 	trans = btrfs_start_transaction(root, 0);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	lock_chunks(root);
 
@@ -2046,7 +2056,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		BUG_ON(ret);
 
 		trans = btrfs_start_transaction(dev_root, 0);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 
 		ret = btrfs_grow_device(trans, device, old_size);
 		BUG_ON(ret);
@@ -2212,6 +2222,11 @@ again:
 
 	/* Shrinking succeeded, else we would be at "done". */
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto done;
+	}
+
 	lock_chunks(root);
 
 	device->disk_total_bytes = new_size;
-- 
cgit v1.2.2


From 9587fcff42f5bece3c0a44066b079235ee73cbb3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 1 Feb 2011 08:40:43 -0500
Subject: cifs: fix length vs. total_read confusion in cifs_demultiplex_thread

length at this point is the length returned by the last kernel_recvmsg
call. total_read is the length of all of the data read so far. length
is more or less meaningless at this point, so use total_read for
everything.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 47d8ff623683..945b2202275f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -578,12 +578,12 @@ incomplete_rcv:
 		else if (reconnect == 1)
 			continue;
 
-		length += 4; /* account for rfc1002 hdr */
+		total_read += 4; /* account for rfc1002 hdr */
 
-
-		dump_smb(smb_buffer, length);
-		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) {
-			cifs_dump_mem("Bad SMB: ", smb_buffer, 48);
+		dump_smb(smb_buffer, total_read);
+		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
+			cifs_dump_mem("Bad SMB: ", smb_buffer,
+					total_read < 48 ? total_read : 48);
 			continue;
 		}
 
-- 
cgit v1.2.2


From 0781b909b5586f4db720b5d1838b78f9d8e42f14 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 1 Feb 2011 15:52:35 -0800
Subject: epoll: epoll_wait() should not use timespec_add_ns()

commit 95aac7b1cd224f ("epoll: make epoll_wait() use the hrtimer range
feature") added a performance regression because it uses timespec_add_ns()
with potential very large 'ns' values.

[akpm@linux-foundation.org: s/epoll_set_mstimeout/ep_set_mstimeout/, per Davide]
Reported-by: Simon Kirby <sim@hostway.ca>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: <stable@kernel.org>		[2.6.37.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..267d0ada4541 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1114,6 +1114,17 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
+static inline struct timespec ep_set_mstimeout(long ms)
+{
+	struct timespec now, ts = {
+		.tv_sec = ms / MSEC_PER_SEC,
+		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
+	};
+
+	ktime_get_ts(&now);
+	return timespec_add_safe(now, ts);
+}
+
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
@@ -1121,12 +1132,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	unsigned long flags;
 	long slack;
 	wait_queue_t wait;
-	struct timespec end_time;
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
-		ktime_get_ts(&end_time);
-		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		struct timespec end_time = ep_set_mstimeout(timeout);
+
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
 		*to = timespec_to_ktime(end_time);
-- 
cgit v1.2.2


From 3cd90ea42f2c15f928b70ed66f6d8ed0a8e7aadd Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 1 Feb 2011 15:52:46 -0800
Subject: vfs: sparse: add __FMODE_EXEC

FMODE_EXEC is a constant type of fmode_t but was used with normal integer
constants.  This results in following warnings from sparse.  Fix it using
new macro __FMODE_EXEC.

 fs/exec.c:116:58: warning: restricted fmode_t degrades to integer
 fs/exec.c:689:58: warning: restricted fmode_t degrades to integer
 fs/fcntl.c:777:9: warning: restricted fmode_t degrades to integer

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c  | 4 ++--
 fs/fcntl.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c7..52a447d9b6ab 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -120,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		goto out;
 
 	file = do_filp_open(AT_FDCWD, tmp,
-				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
 				MAY_READ | MAY_EXEC | MAY_OPEN);
 	putname(tmp);
 	error = PTR_ERR(file);
@@ -723,7 +723,7 @@ struct file *open_exec(const char *name)
 	int err;
 
 	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
 				MAY_EXEC | MAY_OPEN);
 	if (IS_ERR(file))
 		goto out;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed6..cb1026181bdc 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -815,7 +815,7 @@ static int __init fcntl_init(void)
 		__O_SYNC	| O_DSYNC	| FASYNC	|
 		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
 		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
-		FMODE_EXEC
+		__FMODE_EXEC
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
-- 
cgit v1.2.2


From d54cdc8ca7aabc69e145a62155855db42b04ed0b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 1 Feb 2011 15:52:47 -0800
Subject: fs: make block fiemap mapping length at least blocksize long

Some filesystems don't deal well with being asked to map less than
blocksize blocks (GFS2 for example).  Since we are always mapping at least
blocksize sections anyway, just make sure len is at least as big as a
blocksize so we don't trip up any filesystems.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index a59635e295fa..1eebeb72b202 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
 		len = isize;
 	}
 
+	/*
+	 * Some filesystems can't deal with being asked to map less than
+	 * blocksize, so make sure our len is at least block length.
+	 */
+	if (logical_to_blk(inode, len) == 0)
+		len = blk_to_logical(inode, 1);
+
 	start_blk = logical_to_blk(inode, start);
 	last_blk = logical_to_blk(inode, start + len - 1);
 
-- 
cgit v1.2.2


From 0b0abeaf3d30cec03ac6497fe978b8f7edecc5ae Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Wed, 2 Feb 2011 21:02:12 +0200
Subject: Revert "exofs: Set i_mapping->backing_dev_info anyway"

This reverts commit 115e19c53501edc11f730191f7f047736815ae3d.

Apparently setting inode->bdi to one's own sb->s_bdi stops VFS from
sending *read-aheads*.  This problem was bisected to this commit.  A
revert fixes it.  I'll investigate farther why is this happening for the
next Kernel, but for now a revert.

I'm sending to stable@kernel.org as well, since it exists also in
2.6.37.  2.6.36 is good and does not have this patch.

CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exofs/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817b..a7555238c41a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1030,7 +1030,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1131,7 +1130,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	sbi = sb->s_fs_info;
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
-- 
cgit v1.2.2


From 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 3 Feb 2011 14:33:15 -0500
Subject: ext4: fix panic on module unload when stopping lazyinit thread

https://bugzilla.kernel.org/show_bug.cgi?id=27652

If the lazyinit thread is running, the teardown function
ext4_destroy_lazyinit_thread() has problems:

        ext4_clear_request_list();
        while (ext4_li_info->li_task) {
                wake_up(&ext4_li_info->li_wait_daemon);
                wait_event(ext4_li_info->li_wait_task,
                           ext4_li_info->li_task == NULL);
        }

Clearing the request list will cause the thread to exit and free
ext4_li_info, so then we're waiting on something which is getting
freed.

Fix this up by making the thread respond to kthread_stop, and exit,
without the need to wait for that exit in some other homegrown way.

Cc: stable@kernel.org
Reported-and-Tested-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafac..3d8cf2cab379 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
@@ -2716,6 +2717,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
 	mutex_unlock(&ext4_li_info->li_list_mtx);
 }
 
+static struct task_struct *ext4_lazyinit_task;
+
 /*
  * This is the function where ext4lazyinit thread lives. It walks
  * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2787,10 @@ cont_thread:
 		if (time_before(jiffies, next_wakeup))
 			schedule();
 		finish_wait(&eli->li_wait_daemon, &wait);
+		if (kthread_should_stop()) {
+			ext4_clear_request_list();
+			goto exit_thread;
+		}
 	}
 
 exit_thread:
@@ -2808,6 +2815,7 @@ exit_thread:
 	wake_up(&eli->li_wait_task);
 
 	kfree(ext4_li_info);
+	ext4_lazyinit_task = NULL;
 	ext4_li_info = NULL;
 	mutex_unlock(&ext4_li_mtx);
 
@@ -2830,11 +2838,10 @@ static void ext4_clear_request_list(void)
 
 static int ext4_run_lazyinit_thread(void)
 {
-	struct task_struct *t;
-
-	t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
-	if (IS_ERR(t)) {
-		int err = PTR_ERR(t);
+	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+					 ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(ext4_lazyinit_task)) {
+		int err = PTR_ERR(ext4_lazyinit_task);
 		ext4_clear_request_list();
 		del_timer_sync(&ext4_li_info->li_timer);
 		kfree(ext4_li_info);
@@ -2985,16 +2992,10 @@ static void ext4_destroy_lazyinit_thread(void)
 	 * If thread exited earlier
 	 * there's nothing to be done.
 	 */
-	if (!ext4_li_info)
+	if (!ext4_li_info || !ext4_lazyinit_task)
 		return;
 
-	ext4_clear_request_list();
-
-	while (ext4_li_info->li_task) {
-		wake_up(&ext4_li_info->li_wait_daemon);
-		wait_event(ext4_li_info->li_wait_task,
-			   ext4_li_info->li_task == NULL);
-	}
+	kthread_stop(ext4_lazyinit_task);
 }
 
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
-- 
cgit v1.2.2


From 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 3 Feb 2011 14:33:33 -0500
Subject: ext4: unregister features interface on module unload

Ext4 features interface was not properly unregistered which led to
problems while unloading/reloading ext4 module. This commit fixes that by
adding proper kobject unregistration code into ext4_exit_fs() as well as
fail-path of ext4_init_fs()

Reported-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/super.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3d8cf2cab379..4898cb1ff606 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4769,7 +4769,7 @@ static struct file_system_type ext4_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-int __init ext4_init_feat_adverts(void)
+static int __init ext4_init_feat_adverts(void)
 {
 	struct ext4_features *ef;
 	int ret = -ENOMEM;
@@ -4793,6 +4793,13 @@ out:
 	return ret;
 }
 
+static void ext4_exit_feat_adverts(void)
+{
+	kobject_put(&ext4_feat->f_kobj);
+	wait_for_completion(&ext4_feat->f_kobj_unregister);
+	kfree(ext4_feat);
+}
+
 static int __init ext4_init_fs(void)
 {
 	int err;
@@ -4839,7 +4846,7 @@ out1:
 out2:
 	ext4_exit_mballoc();
 out3:
-	kfree(ext4_feat);
+	ext4_exit_feat_adverts();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 out4:
@@ -4858,6 +4865,7 @@ static void __exit ext4_exit_fs(void)
 	destroy_inodecache();
 	ext4_exit_xattr();
 	ext4_exit_mballoc();
+	ext4_exit_feat_adverts();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 	ext4_exit_system_zone();
-- 
cgit v1.2.2


From dd68314ccf3fb918c1fb6471817edbc60ece4b52 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 3 Feb 2011 14:33:49 -0500
Subject: ext4: fix up ext4 error handling

Make sure we the correct cleanup happens if we die while trying to
load the ext4 file system.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4898cb1ff606..86b05486dc63 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4810,13 +4810,17 @@ static int __init ext4_init_fs(void)
 		return err;
 	err = ext4_init_system_zone();
 	if (err)
-		goto out5;
+		goto out7;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 	if (!ext4_kset)
-		goto out4;
+		goto out6;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+	if (!ext4_proc_root)
+		goto out5;
 
 	err = ext4_init_feat_adverts();
+	if (err)
+		goto out4;
 
 	err = ext4_init_mballoc();
 	if (err)
@@ -4847,11 +4851,13 @@ out2:
 	ext4_exit_mballoc();
 out3:
 	ext4_exit_feat_adverts();
+out4:
 	remove_proc_entry("fs/ext4", NULL);
+out5:
 	kset_unregister(ext4_kset);
-out4:
+out6:
 	ext4_exit_system_zone();
-out5:
+out7:
 	ext4_exit_pageio();
 	return err;
 }
-- 
cgit v1.2.2


From c5b8d0bce052949e173b5b32f96bd59bceaa2ab0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Wed, 2 Feb 2011 09:32:39 -0700
Subject: hfsplus: fix failed mount handling

Currently the error handling in hfsplus_fill_super is a mess, and can
lead to accessing fields in the superblock that haven't been even set
up yet.  Fix this by making sure we do not set up sb->s_root until we
have the mount fully set up, and before that do proper step by step
unwinding instead of using hfsplus_put_super as a big hammer.

Reported-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 106 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43c..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root, *inode;
 	struct qstr str;
 	struct nls_table *nls = NULL;
-	int err = -EINVAL;
+	int err;
 
+	err = -EINVAL;
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
+		goto out;
 
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->alloc_mutex);
 	mutex_init(&sbi->vh_mutex);
 	hfsplus_fill_defaults(sbi);
+
+	err = -EINVAL;
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
-		err = -EINVAL;
-		goto cleanup;
+		goto out_unload_nls;
 	}
 
 	/* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->nls = load_nls("utf8");
 	if (!sbi->nls) {
 		printk(KERN_ERR "hfs: unable to load nls for utf8\n");
-		err = -EINVAL;
-		goto cleanup;
+		goto out_unload_nls;
 	}
 
 	/* Grab the volume header */
 	if (hfsplus_read_wrapper(sb)) {
 		if (!silent)
 			printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
-		err = -EINVAL;
-		goto cleanup;
+		goto out_unload_nls;
 	}
 	vhdr = sbi->s_vhdr;
 
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
 	    be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
 		printk(KERN_ERR "hfs: wrong filesystem version\n");
-		goto cleanup;
+		goto out_free_vhdr;
 	}
 	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
 	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
 	if (!sbi->ext_tree) {
 		printk(KERN_ERR "hfs: failed to load extents file\n");
-		goto cleanup;
+		goto out_free_vhdr;
 	}
 	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
 	if (!sbi->cat_tree) {
 		printk(KERN_ERR "hfs: failed to load catalog file\n");
-		goto cleanup;
+		goto out_close_ext_tree;
 	}
 
 	inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
 	if (IS_ERR(inode)) {
 		printk(KERN_ERR "hfs: failed to load allocation file\n");
 		err = PTR_ERR(inode);
-		goto cleanup;
+		goto out_close_cat_tree;
 	}
 	sbi->alloc_file = inode;
 
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "hfs: failed to load root directory\n");
 		err = PTR_ERR(root);
-		goto cleanup;
-	}
-	sb->s_d_op = &hfsplus_dentry_operations;
-	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root) {
-		iput(root);
-		err = -ENOMEM;
-		goto cleanup;
+		goto out_put_alloc_file;
 	}
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
 		if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
-			goto cleanup;
+			goto out_put_root;
 		inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
-			goto cleanup;
+			goto out_put_root;
 		}
 		sbi->hidden_dir = inode;
 	} else
 		hfs_find_exit(&fd);
 
-	if (sb->s_flags & MS_RDONLY)
-		goto out;
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/*
+		 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
+		 * all three are registered with Apple for our use
+		 */
+		vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
+		vhdr->modify_date = hfsp_now2mt();
+		be32_add_cpu(&vhdr->write_count, 1);
+		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
+		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
+		hfsplus_sync_fs(sb, 1);
 
-	/* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
-	 * all three are registered with Apple for our use
-	 */
-	vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
-	vhdr->modify_date = hfsp_now2mt();
-	be32_add_cpu(&vhdr->write_count, 1);
-	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
-	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	hfsplus_sync_fs(sb, 1);
-
-	if (!sbi->hidden_dir) {
-		mutex_lock(&sbi->vh_mutex);
-		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
-				   &str, sbi->hidden_dir);
-		mutex_unlock(&sbi->vh_mutex);
-
-		hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
+		if (!sbi->hidden_dir) {
+			mutex_lock(&sbi->vh_mutex);
+			sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+			hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
+					   sbi->hidden_dir);
+			mutex_unlock(&sbi->vh_mutex);
+
+			hfsplus_mark_inode_dirty(sbi->hidden_dir,
+						 HFSPLUS_I_CAT_DIRTY);
+		}
 	}
-out:
+
+	sb->s_d_op = &hfsplus_dentry_operations;
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto out_put_hidden_dir;
+	}
+
 	unload_nls(sbi->nls);
 	sbi->nls = nls;
 	return 0;
 
-cleanup:
-	hfsplus_put_super(sb);
+out_put_hidden_dir:
+	iput(sbi->hidden_dir);
+out_put_root:
+	iput(sbi->alloc_file);
+out_put_alloc_file:
+	iput(sbi->alloc_file);
+out_close_cat_tree:
+	hfs_btree_close(sbi->cat_tree);
+out_close_ext_tree:
+	hfs_btree_close(sbi->ext_tree);
+out_free_vhdr:
+	kfree(sbi->s_vhdr);
+	kfree(sbi->s_backup_vhdr);
+out_unload_nls:
+	unload_nls(sbi->nls);
 	unload_nls(nls);
+	kfree(sbi);
+out:
 	return err;
 }
 
-- 
cgit v1.2.2


From 14dd01f88319a37b06ca909738044e39ec5bfdee Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Tue, 1 Feb 2011 16:41:55 -0500
Subject: hfsplus: do not leak buffer on error

Signed-Off-By: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/part_tbl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1cc..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
 	res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
 				 data, READ);
 	if (res)
-		return res;
+		goto out;
 
 	switch (be16_to_cpu(*((__be16 *)data))) {
 	case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
 		res = -ENOENT;
 		break;
 	}
-
+out:
 	kfree(data);
 	return res;
 }
-- 
cgit v1.2.2


From a1dbcef0172555464b5329f8ba47d43c98132dfa Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Wed, 2 Feb 2011 10:55:06 -0500
Subject: hfsplus: fix two memory leaks in wrapper.c

Signed-Off-By: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/wrapper.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f64..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
 		break;
 	case cpu_to_be16(HFSP_WRAP_MAGIC):
 		if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
-			goto out;
+			goto out_free_backup_vhdr;
 		wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
 		part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
 		part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
 		 * (should do this only for cdrom/loop though)
 		 */
 		if (hfs_part_find(sb, &part_start, &part_size))
-			goto out;
+			goto out_free_backup_vhdr;
 		goto reread;
 	}
 
-- 
cgit v1.2.2


From 1065348d472f97b4b8eb53b60ec67e99148cbbca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Wed, 2 Feb 2011 09:40:33 -0700
Subject: hfsplus: fix up a comparism in hfsplus_file_extend

Revert an incorrect hunk from commit b2837fcf4994e699a4def002e26f274d95b387c1,

	"hfsplus: %L-to-%ll, macro correction, and remove unneeded braces"

revert a pointless change of comparism operation argument order, which turned
out to not even be equivalent.

Reported-by: Joe Perches <joe@perches.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/extents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6d..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
 	u32 start, len, goal;
 	int res;
 
-	if (sbi->total_blocks - sbi->free_blocks + 8 >
-			sbi->alloc_file->i_size * 8) {
+	if (sbi->alloc_file->i_size * 8 <
+	    sbi->total_blocks - sbi->free_blocks + 8) {
 		/* extend alloc file */
 		printk(KERN_ERR "hfs: extend alloc file! "
 				"(%llu,%u,%u)\n",
-- 
cgit v1.2.2


From 76429c148b939f5a6863c0a024eb8960ae91469a Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastry@etersoft.ru>
Date: Mon, 31 Jan 2011 16:03:08 +0300
Subject: CIFS: Fix variable types in cifs_iovec_read/write (try #2)

Variable 'i' should be unsigned long as it's used in circle with num_pages,
and bytes_read/total_written should be ssize_t according to return value.

Signed-off-by: Pavel Shilovsky <piastry@etersoft.ru>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 74c0a282d012..e964b1cd5dd0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1662,10 +1662,10 @@ static ssize_t
 cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
-	size_t total_written = 0;
-	unsigned int written = 0;
-	unsigned long num_pages, npages;
-	size_t copied, len, cur_len, i;
+	unsigned int written;
+	unsigned long num_pages, npages, i;
+	size_t copied, len, cur_len;
+	ssize_t total_written = 0;
 	struct kvec *to_send;
 	struct page **pages;
 	struct iov_iter it;
@@ -1821,7 +1821,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 {
 	int rc;
 	int xid;
-	unsigned int total_read, bytes_read = 0;
+	ssize_t total_read;
+	unsigned int bytes_read = 0;
 	size_t len, cur_len;
 	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
-- 
cgit v1.2.2


From 78d2978874e4e10e97dfd4fd79db45bdc0748550 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Fri, 4 Feb 2011 18:13:24 +0000
Subject: CRED: Fix kernel panic upon security_file_alloc() failure.

In get_empty_filp() since 2.6.29, file_free(f) is called with f->f_cred == NULL
when security_file_alloc() returned an error.  As a result, kernel will panic()
due to put_cred(NULL) call within RCU callback.

Fix this bug by assigning f->f_cred before calling security_file_alloc().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file_table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index c3e89adf53c0..eb36b6b17e26 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
 		goto fail;
 
 	percpu_counter_inc(&nr_files);
+	f->f_cred = get_cred(cred);
 	if (security_file_alloc(f))
 		goto fail_sec;
 
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
-	f->f_cred = get_cred(cred);
 	spin_lock_init(&f->f_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
-- 
cgit v1.2.2


From 64474bdd07f673cc48509ea0375274422c8f73bf Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 3 Feb 2011 14:31:18 -0600
Subject: cifs: Possible slab memory corruption while updating extended stats
 (repost)

Updating extended statistics here can cause slab memory corruption
if a callback function frees slab memory (mid_entry).

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 945b2202275f..1f32a2893b5f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -633,11 +633,11 @@ incomplete_rcv:
 				mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
 				mid_entry->midState = MID_RESPONSE_RECEIVED;
-				list_del_init(&mid_entry->qhead);
-				mid_entry->callback(mid_entry);
 #ifdef CONFIG_CIFS_STATS2
 				mid_entry->when_received = jiffies;
 #endif
+				list_del_init(&mid_entry->qhead);
+				mid_entry->callback(mid_entry);
 				break;
 			}
 			mid_entry = NULL;
-- 
cgit v1.2.2


From e3f0dadb2b44746f6223ce4560406d19e02fb1cc Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 4 Feb 2011 07:21:26 -0500
Subject: cifs: enable signing flag in SMB header when server has it on

cifs_sign_smb only generates a signature if the correct Flags2 bit is
set. Make sure that it gets set correctly if we're sending an async
call.

This patch fixes:

    https://bugzilla.kernel.org/show_bug.cgi?id=28142

Reported-and-Tested-by: JG <jg@cms.ac>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b8c5e2eb43d0..fbc5aace54b1 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 	if (rc)
 		return rc;
 
+	/* enable signing if server requires it */
+	if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+		in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
 	mutex_lock(&server->srv_mutex);
 	mid = AllocMidQEntry(in_buf, server);
 	if (mid == NULL) {
-- 
cgit v1.2.2


From 247ec9b418ba50c9022280035330059364d54540 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 4 Feb 2011 17:09:50 -0500
Subject: cifs: don't send an echo request unless NegProt has been done

When the socket to the server is disconnected, the client more or less
immediately calls cifs_reconnect to reconnect the socket. The NegProt
and SessSetup however are not done until an actual call needs to be
made.

With the addition of the SMB echo code, it's possible that the server
will initiate a disconnect on an idle socket. The client will then
reconnect the socket but no NegotiateProtocol request is done. The
SMBEcho workqueue job will then eventually pop, and an SMBEcho will be
sent on the socket. The server will then reject it since no NegProt was
done.

The ideal fix would be to either have the socket not be reconnected
until we plan to use it, or to immediately do a NegProt when the
reconnect occurs. The code is not structured for this however. For now
we must just settle for not sending any echoes until the NegProt is
done.

Reported-by: JG <jg@cms.ac>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f32a2893b5f..257b6d895e20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -337,8 +337,12 @@ cifs_echo_request(struct work_struct *work)
 	struct TCP_Server_Info *server = container_of(work,
 					struct TCP_Server_Info, echo.work);
 
-	/* no need to ping if we got a response recently */
-	if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+	/*
+	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
+	 * Also, no need to ping if we got a response recently
+	 */
+	if (server->tcpStatus != CifsGood ||
+	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
 		goto requeue_echo;
 
 	rc = CIFSSMBEcho(server);
-- 
cgit v1.2.2


From 8132b65bc6ce6d9a4baafdfc28c7cd9c258ed6e4 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <kernel@fomichev.me>
Date: Sun, 6 Feb 2011 02:05:28 +0300
Subject: cifs: add check for kmalloc in parse_dacl

Exit from parse_dacl if no memory returned from the call to kmalloc.

Signed-off-by: Stanislav Fomichev <kernel@fomichev.me>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1e7636b145a8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
+		if (!ppace) {
+			cERROR(1, "DACL memory allocation error");
+			return;
+		}
 
 		for (i = 0; i < num_aces; ++i) {
 			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
-- 
cgit v1.2.2


From 13dbc08987f25d9dba488a34b44b43e3844b027c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 3 Feb 2011 02:39:52 +0000
Subject: Btrfs: make sure search_bitmap finds something in remove_from_bitmap

When we're cleaning up the tree log we need to be able to remove free space from
the block group.  The problem is if that free space spans bitmaps we would not
find the space since we're looking for too many bytes.  So make sure the amount
of bytes we search for is limited to either the number of bytes we want, or the
number of bytes left in the bitmap.  This was tested by a user who was hitting
the BUG() after search_bitmap.  With this patch he can now mount his fs.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a5501edc3c9f..a0390657451b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1216,6 +1216,7 @@ again:
 	 */
 	search_start = *offset;
 	search_bytes = *bytes;
+	search_bytes = min(search_bytes, end - search_start + 1);
 	ret = search_bitmap(block_group, bitmap_info, &search_start,
 			    &search_bytes);
 	BUG_ON(ret < 0 || search_start != *offset);
-- 
cgit v1.2.2


From 3c14874acc71180553fb5aba528e3cf57c5b958b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 2 Feb 2011 15:53:47 +0000
Subject: Btrfs: exclude super blocks when we read in block groups

This has been resulting in a BUT_ON(ret) after btrfs_reserve_extent in
btrfs_cow_file_range.  The reason is we don't actually calculate the bytes_super
for a block group until we go to cache it, which means that the space_info can
hand out reservations for space that it doesn't actually have, and we can run
out of data space.  This is also a problem if you are using space caching since
we don't ever calculate bytes_super for the block groups.  So instead everytime
we read a block group call exclude_super_stripes, which calculates the
bytes_super for the block group so it can be left out of the space_info.  Then
whenever caching completes we just call free_excluded_extents so that the super
excluded extents are freed up.  Also if we are unmounting and we hit any block
groups that haven't been cached we still need to call free_excluded_extents to
make sure things are cleaned up properly.  Thanks,

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f07ba21cbf06..565e22d77b1b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,11 +320,6 @@ static int caching_kthread(void *data)
 	if (!path)
 		return -ENOMEM;
 
-	exclude_super_stripes(extent_root, block_group);
-	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_readonly += block_group->bytes_super;
-	spin_unlock(&block_group->space_info->lock);
-
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
 	/*
@@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			cache->cached = BTRFS_CACHE_NO;
 		}
 		spin_unlock(&cache->lock);
-		if (ret == 1)
+		if (ret == 1) {
+			free_excluded_extents(fs_info->extent_root, cache);
 			return 0;
+		}
 	}
 
 	if (load_cache_only)
@@ -4036,6 +4033,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
 
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
@@ -8325,6 +8323,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		if (block_group->cached == BTRFS_CACHE_STARTED)
 			wait_block_group_cache_done(block_group);
 
+		/*
+		 * We haven't cached this block group, which means we could
+		 * possibly have excluded extents on this block group.
+		 */
+		if (block_group->cached == BTRFS_CACHE_NO)
+			free_excluded_extents(info->extent_root, block_group);
+
 		btrfs_remove_free_space_cache(block_group);
 		btrfs_put_block_group(block_group);
 
@@ -8439,6 +8444,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
 
+		/*
+		 * We need to exclude the super stripes now so that the space
+		 * info has super bytes accounted for, otherwise we'll think
+		 * we have more space than we actually do.
+		 */
+		exclude_super_stripes(root, cache);
+
 		/*
 		 * check for two cases, either we are full, and therefore
 		 * don't need to bother with the caching work since we won't
@@ -8447,12 +8459,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		 * time, particularly in the full case.
 		 */
 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
-			exclude_super_stripes(root, cache);
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			free_excluded_extents(root, cache);
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
-			exclude_super_stripes(root, cache);
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			add_new_free_space(cache, root->fs_info,
-- 
cgit v1.2.2


From 554233a6e0e8557e8e81e54cc70628d101291122 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 3 Feb 2011 03:16:25 +0000
Subject: btrfs: cleanup error handling in btrfs_unlink_inode()

When btrfs_alloc_path() fails, btrfs_free_path() need not be called.
Therefore, it changes the branch ahead.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 36bc3f49ebf9..c9bc0afdbfc6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2646,7 +2646,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
-		goto err;
+		goto out;
 	}
 
 	path->leave_spinning = 1;
-- 
cgit v1.2.2


From 8e4eef7a60eeca0fe7503e5cbd3b24ff4941c732 Mon Sep 17 00:00:00 2001
From: Alexey Charkov <alchark@gmail.com>
Date: Wed, 2 Feb 2011 21:15:35 +0000
Subject: btrfs: Drop __exit attribute on btrfs_exit_compress

As this function is called in some error paths while not
removing the module, the __exit attribute prevents the kernel
image from linking when btrfs is compiled in statically.

Signed-off-by: Alexey Charkov <alchark@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3a932f183da1..4d2110eafe29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -921,7 +921,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 	return ret;
 }
 
-void __exit btrfs_exit_compress(void)
+void btrfs_exit_compress(void)
 {
 	free_workspaces();
 }
-- 
cgit v1.2.2


From d402539b8fc3fa21f16eb5e654be742670399e8a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 7 Feb 2011 08:54:35 -0500
Subject: cifs: remove checks for ses->status == CifsExiting

ses->status is never set to CifsExiting, so these checks are
always false.

Tested-by: JG <jg@cms.ac>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 46c66ed01af4..904aa47e3515 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		}
 	}
 
-	if (ses->status == CifsExiting)
-		return -EIO;
-
 	/*
 	 * Give demultiplex thread up to 10 seconds to reconnect, should be
 	 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		 * retrying until process is killed or server comes
 		 * back on-line
 		 */
-		if (!tcon->retry || ses->status == CifsExiting) {
+		if (!tcon->retry) {
 			cFYI(1, "gave up waiting on reconnect in smb_init");
 			return -EHOSTDOWN;
 		}
-- 
cgit v1.2.2


From d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Mon, 7 Feb 2011 12:46:14 -0500
Subject: ext4: Fix data corruption with multi-block writepages support

This fixes a corruption problem with the multi-block
writepages submittal change for ext4, from commit
bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc ("ext4: use bio
layer instead of buffer layer in mpage_da_submit_io").

(Note that this corruption is not present in 2.6.37 on
ext4, because the corruption was detected after the
feature was merged in 2.6.37-rc1, and so it was turned
off by adding a non-default mount option,
mblk_io_submit.  With this commit, which hopefully
fixes the last of the bugs with this feature, we'll be
able to turn on this performance feature by default in
2.6.38, and remove the mblk_io_submit option.)

The ext4 code path to bundle multiple pages for
writeback in ext4_bio_write_page() had a bug: we should
be clearing buffer head dirty flags *before* we submit
the bio, not in the completion routine.

The patch below was tested on 2.6.37 under KVM with the
postgresql script which was submitted by Jon Nelson as
documented in commit 1449032be1.

Without the patch, I'd hit the corruption problem about
50-70% of the time.  With the patch, I executed the
script > 100 times with no corruption seen.

I also fixed a bug to make sure ext4_end_bio() doesn't
dereference the bio after the bio_put() call.

Reported-by: Jon Nelson <jnelson@jamponi.net>
Reported-by: Matthias Bayer <jackdachef@gmail.com>
Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/page-io.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..4e9b0a242f4c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -190,6 +190,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 	struct inode *inode;
 	unsigned long flags;
 	int i;
+	sector_t bi_sector = bio->bi_sector;
 
 	BUG_ON(!io_end);
 	bio->bi_private = NULL;
@@ -207,9 +208,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 		if (error)
 			SetPageError(page);
 		BUG_ON(!head);
-		if (head->b_size == PAGE_CACHE_SIZE)
-			clear_buffer_dirty(head);
-		else {
+		if (head->b_size != PAGE_CACHE_SIZE) {
 			loff_t offset;
 			loff_t io_end_offset = io_end->offset + io_end->size;
 
@@ -221,7 +220,6 @@ static void ext4_end_bio(struct bio *bio, int error)
 					if (error)
 						buffer_io_error(bh);
 
-					clear_buffer_dirty(bh);
 				}
 				if (buffer_delay(bh))
 					partial_write = 1;
@@ -257,7 +255,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 			     (unsigned long long) io_end->offset,
 			     (long) io_end->size,
 			     (unsigned long long)
-			     bio->bi_sector >> (inode->i_blkbits - 9));
+			     bi_sector >> (inode->i_blkbits - 9));
 	}
 
 	/* Add the io_end to per-inode completed io list*/
@@ -380,6 +378,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 
 	blocksize = 1 << inode->i_blkbits;
 
+	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 	ClearPageError(page);
@@ -397,12 +396,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	for (bh = head = page_buffers(page), block_start = 0;
 	     bh != head || !block_start;
 	     block_start = block_end, bh = bh->b_this_page) {
+
 		block_end = block_start + blocksize;
 		if (block_start >= len) {
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
 			continue;
 		}
+		clear_buffer_dirty(bh);
 		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
 		if (ret) {
 			/*
-- 
cgit v1.2.2


From 3a90983dbdcb2f4f48c0d771d8e5b4d88f27fae6 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@linux.intel.com>
Date: Tue, 18 Jan 2011 13:34:40 +0800
Subject: Btrfs: Fix page count calculation

take offset of start position into account when calculating page count.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9e097fbfc78d..b0ff34b96607 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -991,8 +991,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		size_t write_bytes = min(iov_iter_count(&i),
 					 nrptrs * (size_t)PAGE_CACHE_SIZE -
 					 offset);
-		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
+		size_t num_pages = (write_bytes + offset +
+				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
@@ -1022,8 +1022,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, &i);
-		dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
+		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
+				PAGE_CACHE_SHIFT;
 
 		if (num_pages > dirty_pages) {
 			if (copied > 0)
-- 
cgit v1.2.2


From 7e90d705fc9f8c5e3a1549281dce0654d049243b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 8 Feb 2011 23:52:32 +0000
Subject: [CIFS] Do not send SMBEcho requests on new sockets until SMBNegotiate

In order to determine whether an SMBEcho request can be sent
we need to know that the socket is established (server tcpStatus == CifsGood)
AND that an SMB NegotiateProtocol has been sent (server maxBuf != 0).
Without the second check we can send an Echo request during reconnection
before the server can accept it.

CC: JG <jg@cms.ac>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 2 ++
 fs/cifs/connect.c  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index edd5b29b53c9..1ab33eb71d95 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -188,6 +188,8 @@ struct TCP_Server_Info {
 	/* multiplexed reads or writes */
 	unsigned int maxBuf;	/* maxBuf specifies the maximum */
 	/* message size the server can send or receive for non-raw SMBs */
+	/* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
+	/* when socket is setup (and during reconnect) before NegProt sent */
 	unsigned int max_rw;	/* maxRw specifies the maximum */
 	/* message size the server can send or receive for */
 	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 257b6d895e20..10011e99b34d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -341,7 +341,7 @@ cifs_echo_request(struct work_struct *work)
 	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
 	 * Also, no need to ping if we got a response recently
 	 */
-	if (server->tcpStatus != CifsGood ||
+	if ((server->tcpStatus != CifsGood) || (server->maxBuf == 0) ||
 	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
 		goto requeue_echo;
 
-- 
cgit v1.2.2


From 195291e68c2ad59a046fc56d32bf59635b100e5c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 9 Feb 2011 12:01:42 -0500
Subject: cifs: clean up checks in cifs_echo_request

Follow-on patch to 7e90d705 which is already in Steve's tree...

The check for tcpStatus == CifsGood is not meaningful since it doesn't
indicate whether the NEGOTIATE request has been done. Also, clarify
why we're checking for maxBuf == 0.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 10011e99b34d..161f24ca4f6e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -338,10 +338,11 @@ cifs_echo_request(struct work_struct *work)
 					struct TCP_Server_Info, echo.work);
 
 	/*
-	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
-	 * Also, no need to ping if we got a response recently
+	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
+	 * done, which is indicated by maxBuf != 0. Also, no need to ping if
+	 * we got a response recently
 	 */
-	if ((server->tcpStatus != CifsGood) || (server->maxBuf == 0) ||
+	if (server->maxBuf == 0 ||
 	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
 		goto requeue_echo;
 
-- 
cgit v1.2.2


From 71823baff1978be892e7a36eddf6170e1cc6650d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 10 Feb 2011 08:03:50 -0500
Subject: cifs: don't always drop malformed replies on the floor (try #3)

Slight revision to this patch...use min_t() instead of conditional
assignment. Also, remove the FIXME comment and replace it with the
explanation that Steve gave earlier.

After receiving a packet, we currently check the header. If it's no
good, then we toss it out and continue the loop, leaving the caller
waiting on that response.

In cases where the packet has length inconsistencies, but the MID is
valid, this leads to unneeded delays. That's especially problematic now
that the client waits indefinitely for responses.

Instead, don't immediately discard the packet if checkSMB fails. Try to
find a matching mid_q_entry, mark it as having a malformed response and
issue the callback.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h  |  2 +-
 fs/cifs/connect.c   | 30 ++++++++++++++++++++++++------
 fs/cifs/transport.c |  3 +++
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1ab33eb71d95..17afb0fbcaed 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -654,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   MID_REQUEST_SUBMITTED 2
 #define   MID_RESPONSE_RECEIVED 4
 #define   MID_RETRY_NEEDED      8 /* session closed while this request out */
-#define   MID_NO_RESP_NEEDED 0x10
+#define   MID_RESPONSE_MALFORMED 0x10
 
 /* Types of response buffer returned from SendReceive2 */
 #define   CIFS_NO_BUFFER        0    /* Response buffer not returned */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 161f24ca4f6e..8d6c17ab593d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -586,11 +586,20 @@ incomplete_rcv:
 		total_read += 4; /* account for rfc1002 hdr */
 
 		dump_smb(smb_buffer, total_read);
-		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
+
+		/*
+		 * We know that we received enough to get to the MID as we
+		 * checked the pdu_length earlier. Now check to see
+		 * if the rest of the header is OK. We borrow the length
+		 * var for the rest of the loop to avoid a new stack var.
+		 *
+		 * 48 bytes is enough to display the header and a little bit
+		 * into the payload for debugging purposes.
+		 */
+		length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
+		if (length != 0)
 			cifs_dump_mem("Bad SMB: ", smb_buffer,
-					total_read < 48 ? total_read : 48);
-			continue;
-		}
+					min_t(unsigned int, total_read, 48));
 
 		mid_entry = NULL;
 		server->lstrp = jiffies;
@@ -602,7 +611,8 @@ incomplete_rcv:
 			if ((mid_entry->mid == smb_buffer->Mid) &&
 			    (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
 			    (mid_entry->command == smb_buffer->Command)) {
-				if (check2ndT2(smb_buffer,server->maxBuf) > 0) {
+				if (length == 0 &&
+				   check2ndT2(smb_buffer, server->maxBuf) > 0) {
 					/* We have a multipart transact2 resp */
 					isMultiRsp = true;
 					if (mid_entry->resp_buf) {
@@ -637,7 +647,12 @@ incomplete_rcv:
 				mid_entry->resp_buf = smb_buffer;
 				mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
-				mid_entry->midState = MID_RESPONSE_RECEIVED;
+				if (length == 0)
+					mid_entry->midState =
+							MID_RESPONSE_RECEIVED;
+				else
+					mid_entry->midState =
+							MID_RESPONSE_MALFORMED;
 #ifdef CONFIG_CIFS_STATS2
 				mid_entry->when_received = jiffies;
 #endif
@@ -658,6 +673,9 @@ multi_t2_fnd:
 				else
 					smallbuf = NULL;
 			}
+		} else if (length != 0) {
+			/* response sanity checks failed */
+			continue;
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
 			cERROR(1, "No task to wake, unknown frame received! "
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index fbc5aace54b1..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -457,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 	case MID_RETRY_NEEDED:
 		rc = -EAGAIN;
 		break;
+	case MID_RESPONSE_MALFORMED:
+		rc = -EIO;
+		break;
 	default:
 		cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
 			mid->mid, mid->midState);
-- 
cgit v1.2.2


From 6b155c8fd4d239f7d883d455bbad1be47724bbfc Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 11 Feb 2011 16:44:31 -0600
Subject: dlm: use single thread workqueues

The recent commit to use cmwq for send and recv threads
dcce240ead802d42b1e45ad2fcb2ed4a399cb255 introduced problems,
apparently due to multiple workqueue threads.  Single threads
make the problems go away, so return to that until we fully
understand the concurrency issues with multiple threads.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..2d8c87b951c2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,15 +1468,13 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	recv_workqueue = create_singlethread_workqueue("dlm_recv");
 	if (!recv_workqueue) {
 		log_print("can't start dlm_recv");
 		return -ENOMEM;
 	}
 
-	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	send_workqueue = create_singlethread_workqueue("dlm_send");
 	if (!send_workqueue) {
 		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
-- 
cgit v1.2.2


From 2dab597441667d6c04451a7dcf215241ad4c74f6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 11 Feb 2011 15:53:38 -0800
Subject: Fix possible filp_cachep memory corruption

In commit 31e6b01f4183 ("fs: rcu-walk for path lookup") we started doing
path lookup using RCU, which then falls back to a careful non-RCU lookup
in case of problems (LOOKUP_REVAL).  So do_filp_open() has this "re-do
the lookup carefully" looping case.

However, that means that we must not release the open-intent file data
if we are going to loop around and use it once more!

Fix this by moving the release of the open-intent data to the function
that allocates it (do_filp_open() itself) rather than the helper
functions that can get called multiple times (finish_open() and
do_last()).  This makes the logic for the lifetime of that field much
more obvious, and avoids the possible double free.

Reported-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 20 ++++++++++----------
 fs/open.c  |  2 ++
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a9..ec4b2d0190a8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -561,10 +561,14 @@ static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
  */
 void release_open_intent(struct nameidata *nd)
 {
-	if (nd->intent.open.file->f_path.dentry == NULL)
-		put_filp(nd->intent.open.file);
-	else
-		fput(nd->intent.open.file);
+	struct file *file = nd->intent.open.file;
+
+	if (file && !IS_ERR(file)) {
+		if (file->f_path.dentry == NULL)
+			put_filp(file);
+		else
+			fput(file);
+	}
 }
 
 /*
@@ -2265,8 +2269,6 @@ static struct file *finish_open(struct nameidata *nd,
 	return filp;
 
 exit:
-	if (!IS_ERR(nd->intent.open.file))
-		release_open_intent(nd);
 	path_put(&nd->path);
 	return ERR_PTR(error);
 }
@@ -2389,8 +2391,6 @@ exit_mutex_unlock:
 exit_dput:
 	path_put_conditional(path, nd);
 exit:
-	if (!IS_ERR(nd->intent.open.file))
-		release_open_intent(nd);
 	path_put(&nd->path);
 	return ERR_PTR(error);
 }
@@ -2477,6 +2477,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, open_flag, acc_mode);
+	release_open_intent(&nd);
 	return filp;
 
 creat:
@@ -2553,6 +2554,7 @@ out:
 		path_put(&nd.root);
 	if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
 		goto reval;
+	release_open_intent(&nd);
 	return filp;
 
 exit_dput:
@@ -2560,8 +2562,6 @@ exit_dput:
 out_path:
 	path_put(&nd.path);
 out_filp:
-	if (!IS_ERR(nd.intent.open.file))
-		release_open_intent(&nd);
 	filp = ERR_PTR(error);
 	goto out;
 }
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05b..5a2c6ebc22b5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -790,6 +790,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
 
 	/* Pick up the filp from the open intent */
 	filp = nd->intent.open.file;
+	nd->intent.open.file = NULL;
+
 	/* Has the filesystem initialised the file for us? */
 	if (filp->f_path.dentry == NULL) {
 		path_get(&nd->path);
-- 
cgit v1.2.2


From d863b50ab01333659314c2034890cb76d9fdc3c7 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 10 Feb 2011 15:01:20 -0800
Subject: vfs: call rcu_barrier after ->kill_sb()

In commit fa0d7e3de6d6 ("fs: icache RCU free inodes"), we use rcu free
inode instead of freeing the inode directly.  It causes a crash when we
rmmod immediately after we umount the volume[1].

So we need to call rcu_barrier after we kill_sb so that the inode is
freed before we do rmmod.  The idea is inspired by Aneesh Kumar.
rcu_barrier will wait for all callbacks to end before preceding.  The
original patch was done by Tao Ma, but synchronize_rcu() is not enough
here.

1. http://marc.info/?l=linux-fsdevel&m=129680863330185&w=2

Tested-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 74e149efed81..7e9dd4cc2c01 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -177,6 +177,11 @@ void deactivate_locked_super(struct super_block *s)
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		fs->kill_sb(s);
+		/*
+		 * We need to call rcu_barrier so all the delayed rcu free
+		 * inodes are flushed before we release the fs module.
+		 */
+		rcu_barrier();
 		put_filesystem(fs);
 		put_super(s);
 	} else {
-- 
cgit v1.2.2


From 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 12 Feb 2011 08:12:18 -0500
Subject: ext4: make grpinfo slab cache names static

In 2.6.37 I was running into oopses with repeated module
loads & unloads.  I tracked this down to:

fb1813f4 ext4: use dedicated slab caches for group_info structures

(this was in addition to the features advert unload problem)

The kstrdup & subsequent kfree of the cache name was causing
a double free.  In slub, at least, if I read it right it allocates
& frees the name itself, slab seems to do something different...
so in slub I think we were leaking -our- cachep->name, and double
freeing the one allocated by slub.

After getting lost in slab/slub/slob a bit, I just looked at other
sized-caches that get allocated.  jbd2, biovec, sgpool all do it
more or less the way jbd2 does.  Below patch follows the jbd2
method of dynamically allocating a cache at mount time from
a list of static names.

(This might also possibly fix a race creating the caches with
parallel mounts running).

[Folded in a fix from Dan Carpenter which fixed an off-by-one error in
the original patch]

Cc: stable@kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 100 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d1fe09aea73d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
  * each unique s_blocksize_bits */
-#define NR_GRPINFO_CACHES	\
-	(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+#define NR_GRPINFO_CACHES 8
 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2414,6 +2419,55 @@ err_freesgi:
 	return -ENOMEM;
 }
 
+static void ext4_groupinfo_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		if (ext4_groupinfo_caches[i])
+			kmem_cache_destroy(ext4_groupinfo_caches[i]);
+		ext4_groupinfo_caches[i] = NULL;
+	}
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+	int slab_size;
+	int blocksize_bits = order_base_2(size);
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep;
+
+	if (cache_index >= NR_GRPINFO_CACHES)
+		return -EINVAL;
+
+	if (unlikely(cache_index < 0))
+		cache_index = 0;
+
+	mutex_lock(&ext4_grpinfo_slab_create_mutex);
+	if (ext4_groupinfo_caches[cache_index]) {
+		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+		return 0;	/* Already created */
+	}
+
+	slab_size = offsetof(struct ext4_group_info,
+				bb_counters[blocksize_bits + 2]);
+
+	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+					NULL);
+
+	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+	if (!cachep) {
+		printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
+		return -ENOMEM;
+	}
+
+	ext4_groupinfo_caches[cache_index] = cachep;
+
+	return 0;
+}
+
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned offset;
 	unsigned max;
 	int ret;
-	int cache_index;
-	struct kmem_cache *cachep;
-	char *namep = NULL;
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		goto out;
 	}
 
-	cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
-	cachep = ext4_groupinfo_caches[cache_index];
-	if (!cachep) {
-		char name[32];
-		int len = offsetof(struct ext4_group_info,
-					bb_counters[sb->s_blocksize_bits + 2]);
-
-		sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
-		namep = kstrdup(name, GFP_KERNEL);
-		if (!namep) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		/* Need to free the kmem_cache_name() when we
-		 * destroy the slab */
-		cachep = kmem_cache_create(namep, len, 0,
-					     SLAB_RECLAIM_ACCOUNT, NULL);
-		if (!cachep) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		ext4_groupinfo_caches[cache_index] = cachep;
-	}
+	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+	if (ret < 0)
+		goto out;
 
 	/* order 0 is regular bitmap */
 	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2550,6 @@ out:
 	if (ret) {
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
-		kfree(namep);
 	}
 	return ret;
 }
@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
 
 void ext4_exit_mballoc(void)
 {
-	int i;
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
-
-	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
-		struct kmem_cache *cachep = ext4_groupinfo_caches[i];
-		if (cachep) {
-			char *name = (char *)kmem_cache_name(cachep);
-			kmem_cache_destroy(cachep);
-			kfree(name);
-		}
-	}
+	ext4_groupinfo_destroy_slabs();
 	ext4_remove_debugfs_entry();
 }
 
-- 
cgit v1.2.2


From e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 12 Feb 2011 08:17:34 -0500
Subject: ext4: serialize unaligned asynchronous DIO

ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.

The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions.  When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.

Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO.  I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write().  But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.

I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex.  So that won't work.

This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment.  I've
tested a backport of this patch with qemu, and it does
avoid the corruption.  It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.

Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.

[tytso@mit.edu: Keep the mutex as a hashed array instead
 of bloating the ext4 inode]

[tytso@mit.edu: Fix up namespace issues so that global
 variables are protected with an "ext4_" prefix.]

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 10 ++++++++++
 fs/ext4/extents.c | 10 ++++++----
 fs/ext4/file.c    | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/page-io.c | 25 ++++++++++++-----------
 fs/ext4/super.c   | 13 +++++++++++-
 5 files changed, 100 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..3aa0b72b3b94 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
 
 	spinlock_t i_block_reservation_lock;
 
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ		37
+#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
+					    EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
+					     EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..ccce8a7e94ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * that this IO needs to convertion to written when IO is
 		 * completed
 		 */
-		if (io)
+		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
 			io->flag = EXT4_IO_END_UNWRITTEN;
-		else
+			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+		} else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
 			map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 * that we need to perform convertion when IO is done.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-			if (io)
+			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
 				io->flag = EXT4_IO_END_UNWRITTEN;
-			else
+				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+			} else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static void ext4_aiodio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete.  Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block.  If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
+		   unsigned long nr_segs, loff_t pos)
+{
+	struct super_block *sb = inode->i_sb;
+	int blockmask = sb->s_blocksize - 1;
+	size_t count = iov_length(iov, nr_segs);
+	loff_t final_size = pos + count;
+
+	if (pos >= inode->i_size)
+		return 0;
+
+	if ((pos & blockmask) || (final_size & blockmask))
+		return 1;
+
+	return 0;
+}
+
 static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int unaligned_aio = 0;
+	int ret;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 			nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
 					      sbi->s_bitmap_maxbytes - pos);
 		}
+	} else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+		   !is_sync_kiocb(iocb))) {
+		unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
 	}
 
-	return generic_file_aio_write(iocb, iov, nr_segs, pos);
+	/* Unaligned direct AIO must be serialized; see comment above */
+	if (unaligned_aio) {
+		static unsigned long unaligned_warn_time;
+
+		/* Warn about this once per day */
+		if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+			ext4_msg(inode->i_sb, KERN_WARNING,
+				 "Unaligned AIO/DIO on inode %ld by %s; "
+				 "performance will be poor.",
+				 inode->i_ino, current->comm);
+		mutex_lock(ext4_aio_mutex(inode));
+		ext4_aiodio_wait(inode);
+	}
+
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	if (unaligned_aio)
+		mutex_unlock(ext4_aio_mutex(inode));
+
+	return ret;
 }
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4e9b0a242f4c..955cc309142f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
-#define WQ_HASH_SZ		37
-#define to_ioend_wq(v)	(&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
-static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
-
 int __init ext4_init_pageio(void)
 {
-	int i;
-
 	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
 	if (io_page_cachep == NULL)
 		return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
 		kmem_cache_destroy(io_page_cachep);
 		return -ENOMEM;
 	}
-	for (i = 0; i < WQ_HASH_SZ; i++)
-		init_waitqueue_head(&ioend_wq[i]);
-
 	return 0;
 }
 
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
 
 void ext4_ioend_wait(struct inode *inode)
 {
-	wait_queue_head_t *wq = to_ioend_wq(inode);
+	wait_queue_head_t *wq = ext4_ioend_wq(inode);
 
 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
 }
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
 	for (i = 0; i < io->num_io_pages; i++)
 		put_io_page(io->pages[i]);
 	io->num_io_pages = 0;
-	wq = to_ioend_wq(io->inode);
+	wq = ext4_ioend_wq(io->inode);
 	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
 	    waitqueue_active(wq))
 		wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	ssize_t size = io->size;
+	wait_queue_head_t *wq;
 	int ret = 0;
 
 	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 	/* clear the DIO AIO unwritten flag */
-	io->flag &= ~EXT4_IO_END_UNWRITTEN;
+	if (io->flag & EXT4_IO_END_UNWRITTEN) {
+		io->flag &= ~EXT4_IO_END_UNWRITTEN;
+		/* Wake up anyone waiting on unwritten extent conversion */
+		wq = ext4_ioend_wq(io->inode);
+		if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
+		    waitqueue_active(wq)) {
+			wake_up_all(wq);
+		}
+	}
+
 	return ret;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 86b05486dc63..f6a318f836b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_ioend_count, 0);
+	atomic_set(&ei->i_aiodio_unwritten, 0);
 
 	return &ei->vfs_inode;
 }
@@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void)
 	kfree(ext4_feat);
 }
 
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
 static int __init ext4_init_fs(void)
 {
-	int err;
+	int i, err;
 
 	ext4_check_flag_values();
+
+	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+		mutex_init(&ext4__aio_mutex[i]);
+		init_waitqueue_head(&ext4__ioend_wq[i]);
+	}
+
 	err = ext4_init_pageio();
 	if (err)
 		return err;
-- 
cgit v1.2.2


From e44718318004a5618d1dfe2d080e2862532d8e5f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 12 Feb 2011 08:18:24 -0500
Subject: jbd2: call __jbd2_log_start_commit with j_state_lock write locked

On an SMP ARM system running ext4, I've received a report that the
first J_ASSERT in jbd2_journal_commit_transaction has been triggering:

	J_ASSERT(journal->j_running_transaction != NULL);

While investigating possible causes for this problem, I noticed that
__jbd2_log_start_commit() is getting called with j_state_lock only
read-locked, in spite of the fact that it's possible for it might
j_commit_request.  Fix this by grabbing the necessary information so
we can test to see if we need to start a new transaction before
dropping the read lock, and then calling jbd2_log_start_commit() which
will grab the write lock.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c     |  9 +++++++--
 fs/jbd2/transaction.c | 21 ++++++++++++++-------
 2 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..97e73469b2c4 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction commit was started.
+ * Called with j_state_lock locked for writing.
+ * Returns true if a transaction commit was started.
  */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 {
 	transaction_t *transaction = NULL;
 	tid_t tid;
+	int need_to_start = 0;
 
 	read_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction && !current->journal_info) {
 		transaction = journal->j_running_transaction;
-		__jbd2_log_start_commit(journal, transaction->t_tid);
+		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
+			need_to_start = 1;
 	} else if (journal->j_committing_transaction)
 		transaction = journal->j_committing_transaction;
 
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 
 	tid = transaction->t_tid;
 	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
 	jbd2_log_wait_commit(journal, tid);
 	return 1;
 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..1d1191050f99 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
 static int start_this_handle(journal_t *journal, handle_t *handle,
 			     int gfp_mask)
 {
-	transaction_t *transaction;
-	int needed;
-	int nblocks = handle->h_buffer_credits;
-	transaction_t *new_transaction = NULL;
+	transaction_t	*transaction, *new_transaction = NULL;
+	tid_t		tid;
+	int		needed, need_to_start;
+	int		nblocks = handle->h_buffer_credits;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
 		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 				TASK_UNINTERRUPTIBLE);
-		__jbd2_log_start_commit(journal, transaction->t_tid);
+		tid = transaction->t_tid;
+		need_to_start = !tid_geq(journal->j_commit_request, tid);
 		read_unlock(&journal->j_state_lock);
+		if (need_to_start)
+			jbd2_log_start_commit(journal, tid);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int ret;
+	tid_t		tid;
+	int		need_to_start, ret;
 
 	/* If we've had an abort of any type, don't even think about
 	 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 	spin_unlock(&transaction->t_handle_lock);
 
 	jbd_debug(2, "restarting handle %p\n", handle);
-	__jbd2_log_start_commit(journal, transaction->t_tid);
+	tid = transaction->t_tid;
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
 
 	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
-- 
cgit v1.2.2