Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 30
-rw-r--r--  fs/9p/vfs_inode.c | 8
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/binfmt_elf.c | 1
-rw-r--r--  fs/binfmt_elf_fdpic.c | 1
-rw-r--r--  fs/bio-integrity.c | 44
-rw-r--r--  fs/bio.c | 231
-rw-r--r--  fs/block_dev.c | 68
-rw-r--r--  fs/btrfs/backref.c | 299
-rw-r--r--  fs/btrfs/backref.h | 10
-rw-r--r--  fs/btrfs/btrfs_inode.h | 15
-rw-r--r--  fs/btrfs/check-integrity.c | 16
-rw-r--r--  fs/btrfs/compression.c | 13
-rw-r--r--  fs/btrfs/ctree.c | 148
-rw-r--r--  fs/btrfs/ctree.h | 109
-rw-r--r--  fs/btrfs/delayed-inode.c | 6
-rw-r--r--  fs/btrfs/disk-io.c | 230
-rw-r--r--  fs/btrfs/disk-io.h | 2
-rw-r--r--  fs/btrfs/extent-tree.c | 376
-rw-r--r--  fs/btrfs/extent_io.c | 128
-rw-r--r--  fs/btrfs/extent_io.h | 23
-rw-r--r--  fs/btrfs/extent_map.c | 55
-rw-r--r--  fs/btrfs/extent_map.h | 8
-rw-r--r--  fs/btrfs/file-item.c | 5
-rw-r--r--  fs/btrfs/file.c | 447
-rw-r--r--  fs/btrfs/free-space-cache.c | 10
-rw-r--r--  fs/btrfs/hash.h | 10
-rw-r--r--  fs/btrfs/inode-item.c | 285
-rw-r--r--  fs/btrfs/inode.c | 386
-rw-r--r--  fs/btrfs/ioctl.c | 102
-rw-r--r--  fs/btrfs/ordered-data.c | 97
-rw-r--r--  fs/btrfs/ordered-data.h | 12
-rw-r--r--  fs/btrfs/qgroup.c | 40
-rw-r--r--  fs/btrfs/relocation.c | 11
-rw-r--r--  fs/btrfs/root-tree.c | 29
-rw-r--r--  fs/btrfs/scrub.c | 30
-rw-r--r--  fs/btrfs/send.c | 915
-rw-r--r--  fs/btrfs/send.h | 1
-rw-r--r--  fs/btrfs/super.c | 74
-rw-r--r--  fs/btrfs/transaction.c | 283
-rw-r--r--  fs/btrfs/transaction.h | 20
-rw-r--r--  fs/btrfs/tree-log.c | 889
-rw-r--r--  fs/btrfs/ulist.c | 7
-rw-r--r--  fs/btrfs/ulist.h | 9
-rw-r--r--  fs/btrfs/volumes.c | 73
-rw-r--r--  fs/btrfs/zlib.c | 8
-rw-r--r--  fs/ceph/export.c | 18
-rw-r--r--  fs/cifs/cifs_spnego.c | 6
-rw-r--r--  fs/cifs/cifs_unicode.c | 22
-rw-r--r--  fs/cifs/cifsacl.c | 8
-rw-r--r--  fs/cifs/connect.c | 9
-rw-r--r--  fs/cifs/transport.c | 6
-rw-r--r--  fs/compat.c | 12
-rw-r--r--  fs/coredump.c | 5
-rw-r--r--  fs/exec.c | 67
-rw-r--r--  fs/exofs/ore.c | 5
-rw-r--r--  fs/exofs/super.c | 4
-rw-r--r--  fs/ext3/super.c | 6
-rw-r--r--  fs/fat/dir.c | 4
-rw-r--r--  fs/fat/fat.h | 5
-rw-r--r--  fs/fat/inode.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 26
-rw-r--r--  fs/fat/namei_vfat.c | 30
-rw-r--r--  fs/file.c | 3
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/filesystems.c | 4
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  fs/gfs2/export.c | 4
-rw-r--r--  fs/hostfs/hostfs.h | 2
-rw-r--r--  fs/hostfs/hostfs_kern.c | 12
-rw-r--r--  fs/hostfs/hostfs_user.c | 1
-rw-r--r--  fs/hpfs/super.c | 3
-rw-r--r--  fs/hppfs/hppfs.c | 4
-rw-r--r--  fs/internal.h | 4
-rw-r--r--  fs/isofs/export.c | 2
-rw-r--r--  fs/jffs2/super.c | 4
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/lockd/mon.c | 86
-rw-r--r--  fs/lockd/netns.h | 4
-rw-r--r--  fs/lockd/svc.c | 18
-rw-r--r--  fs/locks.c | 6
-rw-r--r--  fs/namei.c | 216
-rw-r--r--  fs/namespace.c | 16
-rw-r--r--  fs/nfs/Kconfig | 4
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 306
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 25
-rw-r--r--  fs/nfs/blocklayout/extents.c | 3
-rw-r--r--  fs/nfs/callback.c | 337
-rw-r--r--  fs/nfs/callback.h | 3
-rw-r--r--  fs/nfs/callback_proc.c | 31
-rw-r--r--  fs/nfs/client.c | 23
-rw-r--r--  fs/nfs/dir.c | 16
-rw-r--r--  fs/nfs/direct.c | 32
-rw-r--r--  fs/nfs/file.c | 41
-rw-r--r--  fs/nfs/getroot.c | 2
-rw-r--r--  fs/nfs/idmap.c | 114
-rw-r--r--  fs/nfs/inode.c | 10
-rw-r--r--  fs/nfs/internal.h | 15
-rw-r--r--  fs/nfs/netns.h | 4
-rw-r--r--  fs/nfs/nfs4_fs.h | 19
-rw-r--r--  fs/nfs/nfs4client.c | 256
-rw-r--r--  fs/nfs/nfs4file.c | 29
-rw-r--r--  fs/nfs/nfs4filelayout.c | 41
-rw-r--r--  fs/nfs/nfs4filelayout.h | 16
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 17
-rw-r--r--  fs/nfs/nfs4namespace.c | 16
-rw-r--r--  fs/nfs/nfs4proc.c | 342
-rw-r--r--  fs/nfs/nfs4state.c | 228
-rw-r--r--  fs/nfs/nfs4sysctl.c | 1
-rw-r--r--  fs/nfs/nfs4xdr.c | 31
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 9
-rw-r--r--  fs/nfs/pagelist.c | 12
-rw-r--r--  fs/nfs/pnfs.c | 417
-rw-r--r--  fs/nfs/pnfs.h | 57
-rw-r--r--  fs/nfs/pnfs_dev.c | 27
-rw-r--r--  fs/nfs/super.c | 31
-rw-r--r--  fs/nfs/write.c | 11
-rw-r--r--  fs/nfsd/nfs2acl.c | 3
-rw-r--r--  fs/nfsd/nfs3proc.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 1
-rw-r--r--  fs/nfsd/nfs4idmap.c | 4
-rw-r--r--  fs/nfsd/nfs4proc.c | 6
-rw-r--r--  fs/nfsd/nfs4state.c | 351
-rw-r--r--  fs/nfsd/nfs4xdr.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 84
-rw-r--r--  fs/nfsd/nfsd.h | 4
-rw-r--r--  fs/nfsd/nfssvc.c | 26
-rw-r--r--  fs/nfsd/state.h | 8
-rw-r--r--  fs/nfsd/vfs.c | 2
-rw-r--r--  fs/open.c | 29
-rw-r--r--  fs/proc/base.c | 5
-rw-r--r--  fs/proc/task_mmu.c | 7
-rw-r--r--  fs/quota/quota.c | 4
-rw-r--r--  fs/reiserfs/inode.c | 6
-rw-r--r--  fs/super.c | 23
-rw-r--r--  fs/sysv/balloc.c | 18
-rw-r--r--  fs/sysv/ialloc.c | 14
-rw-r--r--  fs/sysv/inode.c | 4
-rw-r--r--  fs/sysv/super.c | 1
-rw-r--r--  fs/sysv/sysv.h | 1
-rw-r--r--  fs/ufs/balloc.c | 30
-rw-r--r--  fs/ufs/ialloc.c | 16
-rw-r--r--  fs/ufs/super.c | 21
-rw-r--r--  fs/ufs/ufs.h | 1
-rw-r--r--  fs/xattr.c | 8
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/xfs_export.c | 3
148 files changed, 6376 insertions(+), 3047 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 392c5dac1981..d934f04e7736 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -184,10 +184,20 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			v9ses->afid = option;
 			break;
 		case Opt_uname:
-			match_strlcpy(v9ses->uname, &args[0], PATH_MAX);
+			kfree(v9ses->uname);
+			v9ses->uname = match_strdup(&args[0]);
+			if (!v9ses->uname) {
+				ret = -ENOMEM;
+				goto free_and_return;
+			}
 			break;
 		case Opt_remotename:
-			match_strlcpy(v9ses->aname, &args[0], PATH_MAX);
+			kfree(v9ses->aname);
+			v9ses->aname = match_strdup(&args[0]);
+			if (!v9ses->aname) {
+				ret = -ENOMEM;
+				goto free_and_return;
+			}
 			break;
 		case Opt_nodevmap:
 			v9ses->nodev = 1;
@@ -287,21 +297,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	struct p9_fid *fid;
 	int rc;
 
-	v9ses->uname = __getname();
+	v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
 	if (!v9ses->uname)
 		return ERR_PTR(-ENOMEM);
 
-	v9ses->aname = __getname();
+	v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
 	if (!v9ses->aname) {
-		__putname(v9ses->uname);
+		kfree(v9ses->uname);
 		return ERR_PTR(-ENOMEM);
 	}
 	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
 	if (rc) {
-		__putname(v9ses->aname);
-		__putname(v9ses->uname);
+		kfree(v9ses->aname);
+		kfree(v9ses->uname);
 		return ERR_PTR(rc);
 	}
 
@@ -309,8 +319,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	strcpy(v9ses->uname, V9FS_DEFUSER);
-	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
@@ -412,8 +420,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 		kfree(v9ses->cachetag);
 	}
 #endif
-	__putname(v9ses->uname);
-	__putname(v9ses->aname);
+	kfree(v9ses->uname);
+	kfree(v9ses->aname);
 
 	bdi_destroy(&v9ses->bdi);
 
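The v9fs change above swaps fixed PATH_MAX buffers (__getname/__putname) for heap strings, so every option assignment must free the previous value and cope with allocation failure. A minimal userspace sketch of the same pattern (names here are illustrative, not the 9p API):

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    /* Free the old value, duplicate the new one, report -ENOMEM on
     * failure -- the same ordering the patched option parser uses.
     * A NULL field after failure is safe: free(NULL) is a no-op,
     * just as kfree(NULL) is on the session-close path. */
    static int set_session_string(char **field, const char *value)
    {
        free(*field);               /* like kfree() on the default */
        *field = strdup(value);     /* like match_strdup()/kstrdup() */
        if (!*field)
            return -ENOMEM;         /* caller unwinds, as free_and_return does */
        return 0;
    }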
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index cbf9dbb1b2a2..890bed538f9b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1276,12 +1276,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	strncpy(buffer, st->extension, buflen);
+	retval = min(strlen(st->extension)+1, (size_t)buflen);
+	memcpy(buffer, st->extension, retval);
 
-	p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
-		 dentry->d_name.name, st->extension, buffer);
+	p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
+		 dentry->d_name.name, st->extension, buflen, buffer);
 
-	retval = strnlen(buffer, buflen);
 done:
 	p9stat_free(st);
 	kfree(st);
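The readlink fix replaces strncpy() — which neither NUL-terminates on truncation nor reports a length — with an explicit bounded copy. A self-contained sketch of the same computation (hypothetical helper, not the v9fs function):

    #include <stddef.h>
    #include <string.h>

    /* Copy up to buflen bytes of target, including its NUL when it
     * fits, and return the byte count: min(strlen(target)+1, buflen). */
    static size_t copy_link_target(char *buf, size_t buflen, const char *target)
    {
        size_t n = strlen(target) + 1;

        if (n > buflen)
            n = buflen;             /* truncated, like readlink(2) semantics */
        memcpy(buf, target, n);
        return n;
    }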
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e7396cfdb109..91b11650722e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -392,10 +392,12 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
 		status = autofs4_mount_wait(dentry);
-		if (status)
-			return ERR_PTR(status);
 		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_PENDING;
+		if (status) {
+			spin_unlock(&sbi->fs_lock);
+			return ERR_PTR(status);
+		}
 	}
 done:
 	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
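The autofs4 change is purely about ordering: AUTOFS_INF_PENDING must be cleared under sbi->fs_lock even when the mount wait failed, so the early error return moves inside the locked region. A pthread sketch of the corrected shape (stand-in names, not the autofs code):

    #include <pthread.h>

    static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int ino_flags;
    #define INF_PENDING 0x1

    static int finish_mount_wait(int status)
    {
        pthread_mutex_lock(&fs_lock);
        ino_flags &= ~INF_PENDING;      /* now cleared on both paths */
        if (status) {
            pthread_mutex_unlock(&fs_lock);
            return status;              /* kernel returns ERR_PTR(status) */
        }
        pthread_mutex_unlock(&fs_lock);
        return 0;
    }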
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e800dec958c3..fbd9f60bd763 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -36,7 +36,6 @@
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
-#include <asm/exec.h>
 
 #ifndef user_long_t
 #define user_long_t long
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 262db114ff01..a46049154107 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -39,7 +39,6 @@
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/pgalloc.h>
-#include <asm/exec.h>
 
 typedef char *elf_caddr_t;
 
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e85c04b9f61c..a3f28f331b2b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx)
 }
 
 /**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
 * @bio: bio to attach integrity metadata to
 * @gfp_mask: Memory allocation mask
 * @nr_vecs: Number of integrity metadata scatter-gather elements
- * @bs: bio_set to allocate from
 *
 * Description: This function prepares a bio for attaching integrity
 * metadata. nr_vecs specifies the maximum number of pages containing
 * integrity metadata that can be attached.
 */
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
-							 gfp_t gfp_mask,
-							 unsigned int nr_vecs,
-							 struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
 {
 	struct bio_integrity_payload *bip;
 	unsigned int idx = vecs_to_idx(nr_vecs);
+	struct bio_set *bs = bio->bi_pool;
+
+	if (!bs)
+		bs = fs_bio_set;
 
 	BUG_ON(bio == NULL);
 	bip = NULL;
@@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	return bip;
 }
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
-{
-	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
 * bio_integrity_free - Free bio integrity payload
 * @bio: bio containing bip to be freed
- * @bs: bio_set this bio was allocated from
 *
 * Description: Used to free the integrity portion of a bio. Usually
 * called from bio_free().
 */
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_set *bs = bio->bi_pool;
+
+	if (!bs)
+		bs = fs_bio_set;
 
 	BUG_ON(bip == NULL);
 
@@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split);
 * @bio: New bio
 * @bio_src: Original bio
 * @gfp_mask: Memory allocation mask
- * @bs: bio_set to allocate bip from
 *
 * Description: Called to allocate a bip when cloning a bio
 */
 int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-			gfp_t gfp_mask, struct bio_set *bs)
+			gfp_t gfp_mask)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
+	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
 
 	if (bip == NULL)
 		return -EIO;
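All three integrity helpers now derive the bio_set from the bio itself instead of taking it as a parameter; a NULL bi_pool means "fall back to the global fs_bio_set". Sketch of that lookup rule with stand-in declarations (not the real block-layer headers):

    struct bio_set;
    static struct bio_set *fs_bio_set;  /* global fallback pool (stand-in) */

    struct bio {
        struct bio_set *bi_pool;        /* recorded at allocation time */
        /* ... */
    };

    /* One place decides which pool owns the bio's integrity payload. */
    static struct bio_set *bio_owning_pool(struct bio *bio)
    {
        return bio->bi_pool ? bio->bi_pool : fs_bio_set;
    }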
diff --git a/fs/bio.c b/fs/bio.c
index 71072ab99128..9298c65ad9c7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 * IO code that does not need private memory pools.
 */
 struct bio_set *fs_bio_set;
+EXPORT_SYMBOL(fs_bio_set);
 
 /*
 * Our slab pool management
@@ -233,26 +234,37 @@ fallback:
 	return bvl;
 }
 
-void bio_free(struct bio *bio, struct bio_set *bs)
+static void __bio_free(struct bio *bio)
 {
+	bio_disassociate_task(bio);
+
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+}
+
+static void bio_free(struct bio *bio)
+{
+	struct bio_set *bs = bio->bi_pool;
 	void *p;
 
-	if (bio_has_allocated_vec(bio))
-		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+	__bio_free(bio);
 
-	if (bio_integrity(bio))
-		bio_integrity_free(bio, bs);
+	if (bs) {
+		if (bio_has_allocated_vec(bio))
+			bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
-	/*
-	 * If we have front padding, adjust the bio pointer before freeing
-	 */
-	p = bio;
-	if (bs->front_pad)
+		/*
+		 * If we have front padding, adjust the bio pointer before freeing
+		 */
+		p = bio;
 		p -= bs->front_pad;
 
-	mempool_free(p, bs->bio_pool);
+		mempool_free(p, bs->bio_pool);
+	} else {
+		/* Bio was allocated by bio_kmalloc() */
+		kfree(bio);
+	}
 }
-EXPORT_SYMBOL(bio_free);
 
 void bio_init(struct bio *bio)
 {
@@ -263,48 +275,85 @@ void bio_init(struct bio *bio)
 EXPORT_SYMBOL(bio_init);
 
 /**
+ * bio_reset - reinitialize a bio
+ * @bio: bio to reset
+ *
+ * Description:
+ *   After calling bio_reset(), @bio will be in the same state as a freshly
+ *   allocated bio returned bio bio_alloc_bioset() - the only fields that are
+ *   preserved are the ones that are initialized by bio_alloc_bioset(). See
+ *   comment in struct bio.
+ */
+void bio_reset(struct bio *bio)
+{
+	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
+	__bio_free(bio);
+
+	memset(bio, 0, BIO_RESET_BYTES);
+	bio->bi_flags = flags|(1 << BIO_UPTODATE);
+}
+EXPORT_SYMBOL(bio_reset);
+
+/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 * @bs: the bio_set to allocate from.
 *
 * Description:
- *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
- *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
+ *   backed by the @bs's mempool.
 *
- * Note that the caller must set ->bi_destructor on successful return
- * of a bio, to do the appropriate freeing of the bio once the reference
- * count drops to zero.
- **/
+ *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
+ *   able to allocate a bio. This is due to the mempool guarantees. To make this
+ *   work, callers must never allocate more than 1 bio at a time from this pool.
+ *   Callers that need to allocate more than 1 bio must always submit the
+ *   previously allocated bio for IO before attempting to allocate a new one.
+ *   Failure to do so can cause deadlocks under memory pressure.
+ *
+ * RETURNS:
+ *   Pointer to new bio on success, NULL on failure.
+ */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned front_pad;
+	unsigned inline_vecs;
 	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
 	struct bio *bio;
 	void *p;
 
-	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (!bs) {
+		if (nr_iovecs > UIO_MAXIOV)
+			return NULL;
+
+		p = kmalloc(sizeof(struct bio) +
+			    nr_iovecs * sizeof(struct bio_vec),
+			    gfp_mask);
+		front_pad = 0;
+		inline_vecs = nr_iovecs;
+	} else {
+		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		front_pad = bs->front_pad;
+		inline_vecs = BIO_INLINE_VECS;
+	}
+
 	if (unlikely(!p))
 		return NULL;
-	bio = p + bs->front_pad;
 
+	bio = p + front_pad;
 	bio_init(bio);
 
-	if (unlikely(!nr_iovecs))
-		goto out_set;
-
-	if (nr_iovecs <= BIO_INLINE_VECS) {
-		bvl = bio->bi_inline_vecs;
-		nr_iovecs = BIO_INLINE_VECS;
-	} else {
+	if (nr_iovecs > inline_vecs) {
 		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 		if (unlikely(!bvl))
 			goto err_free;
-
-		nr_iovecs = bvec_nr_vecs(idx);
+	} else if (nr_iovecs) {
+		bvl = bio->bi_inline_vecs;
 	}
-out_set:
+
+	bio->bi_pool = bs;
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
 	bio->bi_io_vec = bvl;
@@ -316,80 +365,6 @@ err_free:
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-/**
- * bio_alloc - allocate a new bio, memory pool backed
- * @gfp_mask: allocation mask to use
- * @nr_iovecs: number of iovecs
- *
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
-	if (bio)
-		bio->bi_destructor = bio_fs_destructor;
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_alloc);
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_integrity(bio))
-		bio_integrity_free(bio, fs_bio_set);
-	kfree(bio);
-}
-
-/**
- * bio_kmalloc - allocate a bio for I/O using kmalloc()
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- *
- * Description:
- *   Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
- *   %__GFP_WAIT, the allocation is guaranteed to succeed.
- *
- **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-	struct bio *bio;
-
-	if (nr_iovecs > UIO_MAXIOV)
-		return NULL;
-
-	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
-		      gfp_mask);
-	if (unlikely(!bio))
-		return NULL;
-
-	bio_init(bio);
-	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
-	bio->bi_max_vecs = nr_iovecs;
-	bio->bi_io_vec = bio->bi_inline_vecs;
-	bio->bi_destructor = bio_kmalloc_destructor;
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_kmalloc);
-
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
@@ -420,11 +395,8 @@ void bio_put(struct bio *bio)
 	/*
 	 * last put frees it
 	 */
-	if (atomic_dec_and_test(&bio->bi_cnt)) {
-		bio_disassociate_task(bio);
-		bio->bi_next = NULL;
-		bio->bi_destructor(bio);
-	}
+	if (atomic_dec_and_test(&bio->bi_cnt))
+		bio_free(bio);
 }
 EXPORT_SYMBOL(bio_put);
 
@@ -466,26 +438,28 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 EXPORT_SYMBOL(__bio_clone);
 
 /**
- * bio_clone - clone a bio
+ * bio_clone_bioset - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
 *
 * Like __bio_clone, only also allocates the returned bio
 */
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
+			     struct bio_set *bs)
 {
-	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
+	struct bio *b;
 
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
 	if (!b)
 		return NULL;
 
-	b->bi_destructor = bio_fs_destructor;
 	__bio_clone(b, bio);
 
 	if (bio_integrity(bio)) {
 		int ret;
 
-		ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
+		ret = bio_integrity_clone(b, bio, gfp_mask);
 
 		if (ret < 0) {
 			bio_put(b);
@@ -495,7 +469,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 
 	return b;
 }
-EXPORT_SYMBOL(bio_clone);
+EXPORT_SYMBOL(bio_clone_bioset);
 
 /**
 * bio_get_nr_vecs - return approx number of vecs
@@ -1501,7 +1475,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
			  bi->bi_sector + first_sectors);
 
-	BUG_ON(bi->bi_vcnt != 1);
+	BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
 	bp->error = 0;
@@ -1511,17 +1485,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	bp->bio2.bi_size -= first_sectors << 9;
 	bp->bio1.bi_size = first_sectors << 9;
 
-	bp->bv1 = bi->bi_io_vec[0];
-	bp->bv2 = bi->bi_io_vec[0];
-	bp->bv2.bv_offset += first_sectors << 9;
-	bp->bv2.bv_len -= first_sectors << 9;
-	bp->bv1.bv_len = first_sectors << 9;
+	if (bi->bi_vcnt != 0) {
+		bp->bv1 = bi->bi_io_vec[0];
+		bp->bv2 = bi->bi_io_vec[0];
+
+		if (bio_is_rw(bi)) {
+			bp->bv2.bv_offset += first_sectors << 9;
+			bp->bv2.bv_len -= first_sectors << 9;
+			bp->bv1.bv_len = first_sectors << 9;
+		}
 
-	bp->bio1.bi_io_vec = &bp->bv1;
-	bp->bio2.bi_io_vec = &bp->bv2;
+		bp->bio1.bi_io_vec = &bp->bv1;
+		bp->bio2.bi_io_vec = &bp->bv2;
 
-	bp->bio1.bi_max_vecs = 1;
-	bp->bio2.bi_max_vecs = 1;
+		bp->bio1.bi_max_vecs = 1;
+		bp->bio2.bi_max_vecs = 1;
+	}
 
 	bp->bio1.bi_end_io = bio_pair_end_1;
 	bp->bio2.bi_end_io = bio_pair_end_2;
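fs/bio.c folds bio_alloc(), bio_kmalloc() and the per-bio bi_destructor callbacks into one allocator: the object records where it came from (bi_pool) and bio_free() dispatches on that field instead of on a destructor pointer. A userspace analog of the idiom, with a trivial malloc-backed pool standing in for the mempool (all names here are illustrative):

    #include <stdlib.h>
    #include <string.h>

    struct pool { size_t obj_size; };   /* stand-in for struct bio_set */

    static void *pool_alloc(struct pool *p) { return malloc(p->obj_size); }
    static void pool_free(struct pool *p, void *o) { (void)p; free(o); }

    struct obj {
        struct pool *home;  /* like bio->bi_pool; NULL == plain malloc */
        char payload[64];
    };

    static struct obj *obj_alloc(struct pool *p)
    {
        struct obj *o = p ? pool_alloc(p) : malloc(sizeof(*o));

        if (!o)
            return NULL;
        memset(o, 0, sizeof(*o));
        o->home = p;                    /* the free side dispatches on this */
        return o;
    }

    static void obj_free(struct obj *o)
    {
        if (o->home)
            pool_free(o->home, o);      /* back to the owning pool */
        else
            free(o);                    /* was a plain allocation */
    }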
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38e721b35d45..b3c1d3dae77d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
+	struct address_space *mapping;
+
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -124,6 +126,19 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
+	/* Prevent starting I/O or mapping the device */
+	percpu_down_write(&bdev->bd_block_size_semaphore);
+
+	/* Check that the block device is not memory mapped */
+	mapping = bdev->bd_inode->i_mapping;
+	mutex_lock(&mapping->i_mmap_mutex);
+	if (mapping_mapped(mapping)) {
+		mutex_unlock(&mapping->i_mmap_mutex);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
+		return -EBUSY;
+	}
+	mutex_unlock(&mapping->i_mmap_mutex);
+
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
@@ -131,6 +146,9 @@ int set_blocksize(struct block_device *bdev, int size)
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
+
+	percpu_up_write(&bdev->bd_block_size_semaphore);
+
 	return 0;
 }
 
@@ -441,6 +459,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -449,6 +473,8 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1567,6 +1593,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
+{
+	ssize_t ret;
+	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_read);
+
 /*
 * Write data to the block device. Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
@@ -1578,12 +1620,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1592,11 +1638,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
 	blk_finish_plug(&plug);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
 
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_mmap(file, vma);
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+
 /*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
@@ -1627,9 +1691,9 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.aio_read	= generic_file_aio_read,
+	.aio_read	= blkdev_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= generic_file_mmap,
+	.mmap		= blkdev_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
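Every I/O entry point in block_dev.c now brackets its work with a shared hold of bd_block_size_semaphore, while set_blocksize() takes it exclusively, so the block size cannot change under an in-flight read, write, or mmap. The kernel uses a per-CPU rwsem to keep the hot read side cheap; a plain pthread rwlock sketches the same protocol (stand-in code, not the kernel API):

    #include <pthread.h>

    static pthread_rwlock_t block_size_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int block_size = 512;

    static int read_path(void)              /* cf. blkdev_aio_read() */
    {
        int ret;

        pthread_rwlock_rdlock(&block_size_lock);    /* percpu_down_read() */
        ret = block_size;   /* the real code performs the file I/O here */
        pthread_rwlock_unlock(&block_size_lock);
        return ret;
    }

    static void change_block_size(int size) /* cf. set_blocksize() */
    {
        pthread_rwlock_wrlock(&block_size_lock);    /* percpu_down_write() */
        block_size = size;  /* exclusive: no reader is in flight */
        pthread_rwlock_unlock(&block_size_lock);
    }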
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d6..f3187938e081 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
 * Boston, MA 021110-1307, USA.
 */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 		}
 		if (!ret) {
 			ret = ulist_add(parents, eb->start,
-					(unsigned long)eie, GFP_NOFS);
+					(uintptr_t)eie, GFP_NOFS);
 			if (ret < 0)
 				break;
 			if (!extent_item_pos) {
@@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
-		ref->inode_list =
-			node ? (struct extent_inode_elem *)node->aux : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
-			new_ref->inode_list =
-				(struct extent_inode_elem *)node->aux;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -914,8 +915,8 @@ again:
 			free_extent_buffer(eb);
 		}
 		ret = ulist_add_merge(refs, ref->parent,
-				      (unsigned long)ref->inode_list,
-				      (unsigned long *)&eie, GFP_NOFS);
+				      (uintptr_t)ref->inode_list,
+				      (u64 *)&eie, GFP_NOFS);
 		if (!ret && extent_item_pos) {
 			/*
 			 * we've recorded that parent, so we must extend
@@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
-		eie = (struct extent_inode_elem *)node->aux;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
 		for (; eie; eie = eie_next) {
 			eie_next = eie->next;
 			kfree(eie);
@@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
				found_key);
 }
 
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref,
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_REF_KEY_V2 item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
+static char *ref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb_in, u64 parent,
			 char *dest, u32 size)
 {
-	u32 len;
 	int slot;
 	u64 next_inum;
 	int ret;
@@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
 	path->leave_spinning = 1;
 	while (1) {
-		len = btrfs_inode_ref_name_len(eb, iref);
-		bytes_left -= len;
+		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
-					   (unsigned long)(iref + 1), len);
+					   name_off, name_len);
 		if (eb != eb_in) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			ret = -ENOENT;
 		if (ret)
 			break;
+
 		next_inum = found_key.offset;
 
 		/* regular exit ahead */
@@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		}
 		btrfs_release_path(path);
-
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
 		parent = next_inum;
 		--bytes_left;
 		if (bytes_left >= 0)
@@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 }
 
 /*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
+{
+	return ref_to_path(fs_root, path,
+			   btrfs_inode_ref_name_len(eb_in, iref),
+			   (unsigned long)(iref + 1),
+			   eb_in, parent, dest, size);
+}
+
+/*
 * this makes the path point to (logical EXTENT_ITEM *)
 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
 * tree blocks and <0 on error.
 */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key)
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
 {
 	int ret;
 	u64 flags;
@@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
		 (unsigned long long)found_key->objectid,
		 (unsigned long long)found_key->offset,
		 (unsigned long long)flags, item_size);
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		return BTRFS_EXTENT_FLAG_DATA;
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
 
 	return -EIO;
 }
@@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&root_uiter);
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
-				 "%#lx\n", root_node->val, ref_node->val,
-				 ref_node->aux);
-			ret = iterate_leaf_refs(
-				(struct extent_inode_elem *)ref_node->aux,
-				root_node->val, extent_item_objectid,
-				iterate, ctx);
+				 "%#llx\n", root_node->val, ref_node->val,
+				 (long long)ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
 		}
 		ulist_free(roots);
 		roots = NULL;
@@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 {
 	int ret;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_key found_key;
 	int search_commit_root = path->search_commit_root;
 
-	ret = extent_from_logical(fs_info, logical, path,
-				  &found_key);
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 	btrfs_release_path(path);
 	if (ret < 0)
 		return ret;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 
 	extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-			 struct btrfs_path *path,
-			 iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
 {
 	int ret = 0;
 	int slot;
@@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	while (!ret) {
 		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-					&found_key);
+				     &found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
@@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
				 "tree %llu\n", cur,
				 (unsigned long long)found_key.objectid,
				 (unsigned long long)fs_root->objectid);
-			ret = iterate(parent, iref, eb, ctx);
+			ret = iterate(parent, name_len,
+				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
 				break;
 			len = sizeof(*iref) + name_len;
@@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
 /*
 * returns 0 if the path could be dumped (probably truncated)
 * returns <0 in case of an error
 */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	char *fspath;
@@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
			ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-				    inum, fspath_min, bytes_left);
+	fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+			     name_off, eb, inum, fspath_min,
+			     bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
-		pr_debug("missed path, not enough space. missing bytes: %lu, "
-			 "constructed so far: %s\n",
-			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
@@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-				inode_to_path, ipath);
+			     inode_to_path, ipath);
 }
 
 struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = kmalloc(alloc_bytes, GFP_NOFS);
+	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (!ipath)
 		return;
-	kfree(ipath->fspath);
+	vfree(ipath->fspath);
 	kfree(ipath);
 }
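The recurring (uintptr_t) casts in backref.c exist because the ulist aux field is a u64: storing a pointer in a u64 and pulling it back out must go through uintptr_t, or 32-bit builds warn about integer/pointer size mismatches. Minimal round-trip illustration (hypothetical helpers, not btrfs functions):

    #include <stdint.h>

    struct extent_inode_elem;   /* payload type carried in the ulist aux */

    static uint64_t pack_aux(struct extent_inode_elem *eie)
    {
        return (uintptr_t)eie;  /* pointer -> uintptr_t -> u64, no warning */
    }

    static struct extent_inode_elem *unpack_aux(uint64_t aux)
    {
        return (struct extent_inode_elem *)(uintptr_t)aux;
    }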
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab8..e75533043a5f 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,14 +33,13 @@ struct inode_fs_paths {
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
				      void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-			      struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
		    struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key);
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
			    struct btrfs_extent_item *ei, u32 item_size,
@@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
			 struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+
 #endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe7..ed8ca7ca5eff 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
38#define BTRFS_INODE_DELALLOC_META_RESERVED 4 38#define BTRFS_INODE_DELALLOC_META_RESERVED 4
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
41 42
42/* in memory btrfs inode */ 43/* in memory btrfs inode */
43struct btrfs_inode { 44struct btrfs_inode {
@@ -143,6 +144,9 @@ struct btrfs_inode {
143 /* flags field from the on disk inode */ 144 /* flags field from the on disk inode */
144 u32 flags; 145 u32 flags;
145 146
147 /* a local copy of root's last_log_commit */
148 unsigned long last_log_commit;
149
146 /* 150 /*
147 * Counters to keep track of the number of extent item's we may use due 151 * Counters to keep track of the number of extent item's we may use due
148 * to delalloc and such. outstanding_extents is the number of extent 152 * to delalloc and such. outstanding_extents is the number of extent
@@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
202 206
203static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) 207static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
204{ 208{
205 struct btrfs_root *root = BTRFS_I(inode)->root;
206 int ret = 0;
207
208 mutex_lock(&root->log_mutex);
209 if (BTRFS_I(inode)->logged_trans == generation && 209 if (BTRFS_I(inode)->logged_trans == generation &&
210 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 210 BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
211 ret = 1; 211 return 1;
212 mutex_unlock(&root->log_mutex); 212 return 0;
213 return ret;
214} 213}
215 214
216#endif 215#endif
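
Caching the root's last_log_commit inside each inode is what lets btrfs_inode_in_log() shed root->log_mutex: the test now compares three fields that all belong to one inode. A compact model of the check, with the struct reduced to the fields involved; the intended property is that a stale read can only err toward logging once more than necessary, never toward skipping a needed log.

#include <linux/types.h>

struct inode_log_state {
	u64 logged_trans;	/* transaction the inode was last logged in */
	u64 last_sub_trans;	/* sub-transaction of its last modification */
	u64 last_log_commit;	/* root's last_log_commit, copied at log time */
};

static inline int inode_in_log(const struct inode_log_state *s, u64 generation)
{
	/* Lock-free: both comparisons read inode-local state only. */
	return s->logged_trans == generation &&
	       s->last_sub_trans <= s->last_log_commit;
}
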
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e33407..5a3e45db642a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
37 * the file system was mounted, (i.e., they have been 37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been 38 * referenced by the super block) or they have been
39 * written since then and the write completion callback 39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where 40 * was called and no write error was indicated and a
41 * these blocks are located was received and completed. 41 * FLUSH request to the device where these blocks are
42 * located was received and completed.
42 * 2b. All referenced blocks need to have a generation 43 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number. 44 * number which is equal to the parent's number.
44 * 45 *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2601 (unsigned long long)l->block_ref_to->dev_bytenr, 2602 (unsigned long long)l->block_ref_to->dev_bytenr,
2602 l->block_ref_to->mirror_num); 2603 l->block_ref_to->mirror_num);
2603 ret = -1; 2604 ret = -1;
2605 } else if (l->block_ref_to->iodone_w_error) {
2606 printk(KERN_INFO "btrfs: attempt to write superblock"
2607 " which references block %c @%llu (%s/%llu/%d)"
2608 " which has write error!\n",
2609 btrfsic_get_block_type(state, l->block_ref_to),
2610 (unsigned long long)
2611 l->block_ref_to->logical_bytenr,
2612 l->block_ref_to->dev_state->name,
2613 (unsigned long long)l->block_ref_to->dev_bytenr,
2614 l->block_ref_to->mirror_num);
2615 ret = -1;
2604 } else if (l->parent_generation != 2616 } else if (l->parent_generation !=
2605 l->block_ref_to->generation && 2617 l->block_ref_to->generation &&
2606 BTRFSIC_GENERATION_UNKNOWN != 2618 BTRFSIC_GENERATION_UNKNOWN !=
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a030..c6467aa88bee 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 u64 em_start; 577 u64 em_start;
578 struct extent_map *em; 578 struct extent_map *em;
579 int ret = -ENOMEM; 579 int ret = -ENOMEM;
580 int faili = 0;
580 u32 *sums; 581 u32 *sums;
581 582
582 tree = &BTRFS_I(inode)->io_tree; 583 tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
626 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 627 for (pg_index = 0; pg_index < nr_pages; pg_index++) {
627 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | 628 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
628 __GFP_HIGHMEM); 629 __GFP_HIGHMEM);
629 if (!cb->compressed_pages[pg_index]) 630 if (!cb->compressed_pages[pg_index]) {
631 faili = pg_index - 1;
632 ret = -ENOMEM;
630 goto fail2; 633 goto fail2;
634 }
631 } 635 }
636 faili = nr_pages - 1;
632 cb->nr_pages = nr_pages; 637 cb->nr_pages = nr_pages;
633 638
634 add_ra_bio_pages(inode, em_start + em_len, cb); 639 add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
713 return 0; 718 return 0;
714 719
715fail2: 720fail2:
716 for (pg_index = 0; pg_index < nr_pages; pg_index++) 721 while (faili >= 0) {
717 free_page((unsigned long)cb->compressed_pages[pg_index]); 722 __free_page(cb->compressed_pages[faili]);
723 faili--;
724 }
718 725
719 kfree(cb->compressed_pages); 726 kfree(cb->compressed_pages);
720fail1: 727fail1:
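
Two problems go away in this hunk: the old unwind iterated over all nr_pages slots even when allocation failed partway (handing NULL to the free routine), and it called free_page(), which takes a kernel virtual address, on what are struct page pointers from alloc_page(), where __free_page() is the correct counterpart. The pattern it adopts, as a self-contained sketch:

#include <linux/gfp.h>		/* alloc_page, __free_page */

static int alloc_page_array(struct page **pages, int nr_pages)
{
	int i, faili;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!pages[i]) {
			faili = i - 1;	/* last slot that really got a page */
			goto fail;
		}
	}
	return 0;
fail:
	while (faili >= 0)
		__free_page(pages[faili--]);	/* struct page *, not an address */
	return -ENOMEM;
}
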
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63a..b33436211000 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
4402} 4402}
4403 4403
4404/* 4404/*
4405 * Given a key and some data, insert items into the tree.
4406 * This does all the path init required, making room in the tree if needed.
4407 * Returns the number of keys that were inserted.
4408 */
4409int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4410 struct btrfs_root *root,
4411 struct btrfs_path *path,
4412 struct btrfs_key *cpu_key, u32 *data_size,
4413 int nr)
4414{
4415 struct extent_buffer *leaf;
4416 struct btrfs_item *item;
4417 int ret = 0;
4418 int slot;
4419 int i;
4420 u32 nritems;
4421 u32 total_data = 0;
4422 u32 total_size = 0;
4423 unsigned int data_end;
4424 struct btrfs_disk_key disk_key;
4425 struct btrfs_key found_key;
4426 struct btrfs_map_token token;
4427
4428 btrfs_init_map_token(&token);
4429
4430 for (i = 0; i < nr; i++) {
4431 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4432 BTRFS_LEAF_DATA_SIZE(root)) {
4433 break;
4434 nr = i;
4435 }
4436 total_data += data_size[i];
4437 total_size += data_size[i] + sizeof(struct btrfs_item);
4438 }
4439 BUG_ON(nr == 0);
4440
4441 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4442 if (ret == 0)
4443 return -EEXIST;
4444 if (ret < 0)
4445 goto out;
4446
4447 leaf = path->nodes[0];
4448
4449 nritems = btrfs_header_nritems(leaf);
4450 data_end = leaf_data_end(root, leaf);
4451
4452 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4453 for (i = nr; i >= 0; i--) {
4454 total_data -= data_size[i];
4455 total_size -= data_size[i] + sizeof(struct btrfs_item);
4456 if (total_size < btrfs_leaf_free_space(root, leaf))
4457 break;
4458 }
4459 nr = i;
4460 }
4461
4462 slot = path->slots[0];
4463 BUG_ON(slot < 0);
4464
4465 if (slot != nritems) {
4466 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4467
4468 item = btrfs_item_nr(leaf, slot);
4469 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4470
4471 /* figure out how many keys we can insert in here */
4472 total_data = data_size[0];
4473 for (i = 1; i < nr; i++) {
4474 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4475 break;
4476 total_data += data_size[i];
4477 }
4478 nr = i;
4479
4480 if (old_data < data_end) {
4481 btrfs_print_leaf(root, leaf);
4482 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4483 slot, old_data, data_end);
4484 BUG_ON(1);
4485 }
4486 /*
4487 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4488 */
4489 /* first correct the data pointers */
4490 for (i = slot; i < nritems; i++) {
4491 u32 ioff;
4492
4493 item = btrfs_item_nr(leaf, i);
4494 ioff = btrfs_token_item_offset(leaf, item, &token);
4495 btrfs_set_token_item_offset(leaf, item,
4496 ioff - total_data, &token);
4497 }
4498 /* shift the items */
4499 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4500 btrfs_item_nr_offset(slot),
4501 (nritems - slot) * sizeof(struct btrfs_item));
4502
4503 /* shift the data */
4504 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4505 data_end - total_data, btrfs_leaf_data(leaf) +
4506 data_end, old_data - data_end);
4507 data_end = old_data;
4508 } else {
4509 /*
4510 * this sucks but it has to be done, if we are inserting at
4511 * the end of the leaf only insert 1 of the items, since we
4512 * have no way of knowing whats on the next leaf and we'd have
4513 * to drop our current locks to figure it out
4514 */
4515 nr = 1;
4516 }
4517
4518 /* setup the item for the new data */
4519 for (i = 0; i < nr; i++) {
4520 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4521 btrfs_set_item_key(leaf, &disk_key, slot + i);
4522 item = btrfs_item_nr(leaf, slot + i);
4523 btrfs_set_token_item_offset(leaf, item,
4524 data_end - data_size[i], &token);
4525 data_end -= data_size[i];
4526 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4527 }
4528 btrfs_set_header_nritems(leaf, nritems + nr);
4529 btrfs_mark_buffer_dirty(leaf);
4530
4531 ret = 0;
4532 if (slot == 0) {
4533 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4534 fixup_low_keys(trans, root, path, &disk_key, 1);
4535 }
4536
4537 if (btrfs_leaf_free_space(root, leaf) < 0) {
4538 btrfs_print_leaf(root, leaf);
4539 BUG();
4540 }
4541out:
4542 if (!ret)
4543 ret = nr;
4544 return ret;
4545}
4546
4547/*
4548 * this is a helper for btrfs_insert_empty_items, the main goal here is 4405 * this is a helper for btrfs_insert_empty_items, the main goal here is
4549 * to save stack depth by doing the bulk of the work in a function 4406 * to save stack depth by doing the bulk of the work in a function
4550 * that doesn't call btrfs_search_slot 4407 * that doesn't call btrfs_search_slot
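
btrfs_insert_some_items() is deleted wholesale; it apparently had no remaining callers, and its leaf-space sizing loop carried a latent bug worth noting: the nr = i clamp sits after the break, so it could never run. The order the author presumably intended:

for (i = 0; i < nr; i++) {
	if (total_size + data_size[i] + sizeof(struct btrfs_item) >
	    BTRFS_LEAF_DATA_SIZE(root)) {
		nr = i;		/* clamp the item count first ... */
		break;		/* ... then leave the loop */
	}
	total_data += data_size[i];
	total_size += data_size[i] + sizeof(struct btrfs_item);
}
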
@@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,
5073 struct btrfs_path *path, 4930 struct btrfs_path *path,
5074 int *level, int root_level) 4931 int *level, int root_level)
5075{ 4932{
4933 BUG_ON(*level == 0);
5076 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4934 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5077 path->slots[*level]); 4935 path->slots[*level]);
5078 path->slots[*level - 1] = 0; 4936 path->slots[*level - 1] = 0;
@@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
5089 4947
5090 path->slots[*level]++; 4948 path->slots[*level]++;
5091 4949
5092 while (path->slots[*level] == nritems) { 4950 while (path->slots[*level] >= nritems) {
5093 if (*level == root_level) 4951 if (*level == root_level)
5094 return -1; 4952 return -1;
5095 4953
@@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5433 goto out; 5291 goto out;
5434 advance_right = ADVANCE; 5292 advance_right = ADVANCE;
5435 } else { 5293 } else {
5294 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5436 ret = tree_compare_item(left_root, left_path, 5295 ret = tree_compare_item(left_root, left_path,
5437 right_path, tmp_buf); 5296 right_path, tmp_buf);
5438 if (ret) { 5297 if (ret) {
5298 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5439 ret = changed_cb(left_root, right_root, 5299 ret = changed_cb(left_root, right_root,
5440 left_path, right_path, 5300 left_path, right_path,
5441 &left_key, 5301 &left_key,
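
Two small hardening changes close out ctree.c: tree_move_down() now asserts it is never asked to descend below level 0, and tree_move_next_or_upnext() tests the slot with >= rather than ==, so the climb terminates even if a slot ever overshoots nritems instead of landing exactly on it. The loop shape, modelled with hypothetical accessors standing in for direct btrfs_path field access:

struct walk_path;				/* stand-in for struct btrfs_path */
int path_slot(struct walk_path *p, int level);	/* hypothetical accessors */
int node_nritems(struct walk_path *p, int level);
void path_set_slot(struct walk_path *p, int level, int slot);

static int move_next_or_upnext(struct walk_path *path, int *level,
			       int root_level)
{
	int slot = path_slot(path, *level) + 1;
	int nritems = node_nritems(path, *level);

	while (slot >= nritems) {	/* '>=', not '==': survives overshoot */
		if (*level == root_level)
			return -1;	/* walked off the root: iteration done */
		(*level)++;		/* go up one level and step right */
		slot = path_slot(path, *level) + 1;
		nritems = node_nritems(path, *level);
	}
	path_set_slot(path, *level, slot);
	return 0;
}
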
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9821b672f5a2..926c9ffc66d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
154 */ 154 */
155#define BTRFS_NAME_LEN 255 155#define BTRFS_NAME_LEN 255
156 156
157/*
158 * Theoretical limit is larger, but we keep this down to a sane
159 * value. That should limit greatly the possibility of collisions on
160 * inode ref items.
161 */
162#define BTRFS_LINK_MAX 65535U
163
157/* 32 bytes in various csum fields */ 164/* 32 bytes in various csum fields */
158#define BTRFS_CSUM_SIZE 32 165#define BTRFS_CSUM_SIZE 32
159 166
@@ -489,6 +496,8 @@ struct btrfs_super_block {
489 */ 496 */
490#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 497#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
491 498
499#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
500
492#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 501#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
493#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 502#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
494#define BTRFS_FEATURE_INCOMPAT_SUPP \ 503#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -496,7 +505,8 @@ struct btrfs_super_block {
496 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 505 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
497 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 506 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
498 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 507 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
499 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) 508 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
509 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
500 510
501/* 511/*
502 * A leaf is full of items. offset and size tell us where to find 512 * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
643 /* name goes here */ 653 /* name goes here */
644} __attribute__ ((__packed__)); 654} __attribute__ ((__packed__));
645 655
656struct btrfs_inode_extref {
657 __le64 parent_objectid;
658 __le64 index;
659 __le16 name_len;
660 __u8 name[0];
661 /* name goes here */
662} __attribute__ ((__packed__));
663
646struct btrfs_timespec { 664struct btrfs_timespec {
647 __le64 sec; 665 __le64 sec;
648 __le32 nsec; 666 __le32 nsec;
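
btrfs_inode_extref is a packed, variable-length on-disk item: an 18-byte fixed header (two __le64 plus one __le16) followed immediately by name_len bytes of name. In the kernel proper these fields are read through the extent_buffer accessors generated further down in this file; the sketch below assumes the item has already been copied into plain memory.

#include <linux/string.h>

static size_t extref_item_size(const struct btrfs_inode_extref *e)
{
	return sizeof(*e) + le16_to_cpu(e->name_len);
}

/* dst must hold at least BTRFS_NAME_LEN + 1 bytes. */
static void extref_copy_name(const struct btrfs_inode_extref *e, char *dst)
{
	u16 len = le16_to_cpu(e->name_len);

	memcpy(dst, e->name, len);	/* name is inline, not a pointer */
	dst[len] = '\0';
}
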
@@ -1028,12 +1046,22 @@ struct btrfs_space_info {
1028 wait_queue_head_t wait; 1046 wait_queue_head_t wait;
1029}; 1047};
1030 1048
1049#define BTRFS_BLOCK_RSV_GLOBAL 1
1050#define BTRFS_BLOCK_RSV_DELALLOC 2
1051#define BTRFS_BLOCK_RSV_TRANS 3
1052#define BTRFS_BLOCK_RSV_CHUNK 4
1053#define BTRFS_BLOCK_RSV_DELOPS 5
1054#define BTRFS_BLOCK_RSV_EMPTY 6
1055#define BTRFS_BLOCK_RSV_TEMP 7
1056
1031struct btrfs_block_rsv { 1057struct btrfs_block_rsv {
1032 u64 size; 1058 u64 size;
1033 u64 reserved; 1059 u64 reserved;
1034 struct btrfs_space_info *space_info; 1060 struct btrfs_space_info *space_info;
1035 spinlock_t lock; 1061 spinlock_t lock;
1036 unsigned int full; 1062 unsigned short full;
1063 unsigned short type;
1064 unsigned short failfast;
1037}; 1065};
1038 1066
1039/* 1067/*
@@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {
1127 * Today it will only have one thing on it, but that may change 1155 * Today it will only have one thing on it, but that may change
1128 */ 1156 */
1129 struct list_head cluster_list; 1157 struct list_head cluster_list;
1158
1159 /* For delayed block group creation */
1160 struct list_head new_bg_list;
1130}; 1161};
1131 1162
1132/* delayed seq elem */ 1163/* delayed seq elem */
@@ -1240,7 +1271,6 @@ struct btrfs_fs_info {
1240 struct mutex reloc_mutex; 1271 struct mutex reloc_mutex;
1241 1272
1242 struct list_head trans_list; 1273 struct list_head trans_list;
1243 struct list_head hashers;
1244 struct list_head dead_roots; 1274 struct list_head dead_roots;
1245 struct list_head caching_block_groups; 1275 struct list_head caching_block_groups;
1246 1276
@@ -1366,9 +1396,6 @@ struct btrfs_fs_info {
1366 struct rb_root defrag_inodes; 1396 struct rb_root defrag_inodes;
1367 atomic_t defrag_running; 1397 atomic_t defrag_running;
1368 1398
1369 spinlock_t ref_cache_lock;
1370 u64 total_ref_cache_size;
1371
1372 /* 1399 /*
1373 * these three are in extended format (availability of single 1400 * these three are in extended format (availability of single
1374 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other 1401 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1441,6 +1468,8 @@ struct btrfs_fs_info {
1441 1468
1442 /* next backup root to be overwritten */ 1469 /* next backup root to be overwritten */
1443 int backup_root_index; 1470 int backup_root_index;
1471
1472 int num_tolerated_disk_barrier_failures;
1444}; 1473};
1445 1474
1446/* 1475/*
@@ -1481,9 +1510,9 @@ struct btrfs_root {
1481 wait_queue_head_t log_commit_wait[2]; 1510 wait_queue_head_t log_commit_wait[2];
1482 atomic_t log_writers; 1511 atomic_t log_writers;
1483 atomic_t log_commit[2]; 1512 atomic_t log_commit[2];
1513 atomic_t log_batch;
1484 unsigned long log_transid; 1514 unsigned long log_transid;
1485 unsigned long last_log_commit; 1515 unsigned long last_log_commit;
1486 unsigned long log_batch;
1487 pid_t log_start_pid; 1516 pid_t log_start_pid;
1488 bool log_multiple_pids; 1517 bool log_multiple_pids;
1489 1518
@@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {
1592 */ 1621 */
1593#define BTRFS_INODE_ITEM_KEY 1 1622#define BTRFS_INODE_ITEM_KEY 1
1594#define BTRFS_INODE_REF_KEY 12 1623#define BTRFS_INODE_REF_KEY 12
1624#define BTRFS_INODE_EXTREF_KEY 13
1595#define BTRFS_XATTR_ITEM_KEY 24 1625#define BTRFS_XATTR_ITEM_KEY 24
1596#define BTRFS_ORPHAN_ITEM_KEY 48 1626#define BTRFS_ORPHAN_ITEM_KEY 48
1597/* reserve 2-15 close to the inode for later flexibility */ 1627/* reserve 2-15 close to the inode for later flexibility */
@@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1978BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); 2008BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1979BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); 2009BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1980 2010
2011/* struct btrfs_inode_extref */
2012BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
2013 parent_objectid, 64);
2014BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
2015 name_len, 16);
2016BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
2017
1981/* struct btrfs_inode_item */ 2018/* struct btrfs_inode_item */
1982BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); 2019BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1983BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); 2020BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2858 u64 size); 2895 u64 size);
2859int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2896int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2860 struct btrfs_root *root, u64 group_start); 2897 struct btrfs_root *root, u64 group_start);
2898void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2899 struct btrfs_root *root);
2861u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2862u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2863void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2874void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); 2913void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2875int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); 2914int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2876void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); 2915void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2877void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); 2916void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2878struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2917struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2918 unsigned short type);
2879void btrfs_free_block_rsv(struct btrfs_root *root, 2919void btrfs_free_block_rsv(struct btrfs_root *root,
2880 struct btrfs_block_rsv *rsv); 2920 struct btrfs_block_rsv *rsv);
2881int btrfs_block_rsv_add(struct btrfs_root *root, 2921int btrfs_block_rsv_add(struct btrfs_root *root,
@@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3172 struct btrfs_root *root, 3212 struct btrfs_root *root,
3173 const char *name, int name_len, 3213 const char *name, int name_len,
3174 u64 inode_objectid, u64 ref_objectid, u64 *index); 3214 u64 inode_objectid, u64 ref_objectid, u64 *index);
3175struct btrfs_inode_ref * 3215int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3176btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 3216 struct btrfs_root *root,
3177 struct btrfs_root *root, 3217 struct btrfs_path *path,
3178 struct btrfs_path *path, 3218 const char *name, int name_len,
3179 const char *name, int name_len, 3219 u64 inode_objectid, u64 ref_objectid, int mod,
3180 u64 inode_objectid, u64 ref_objectid, int mod); 3220 u64 *ret_index);
3181int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3221int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3182 struct btrfs_root *root, 3222 struct btrfs_root *root,
3183 struct btrfs_path *path, u64 objectid); 3223 struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
3185 *root, struct btrfs_path *path, 3225 *root, struct btrfs_path *path,
3186 struct btrfs_key *location, int mod); 3226 struct btrfs_key *location, int mod);
3187 3227
3228struct btrfs_inode_extref *
3229btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
3230 struct btrfs_root *root,
3231 struct btrfs_path *path,
3232 const char *name, int name_len,
3233 u64 inode_objectid, u64 ref_objectid, int ins_len,
3234 int cow);
3235
3236int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
3237 u64 ref_objectid, const char *name,
3238 int name_len,
3239 struct btrfs_inode_extref **extref_ret);
3240
3188/* file-item.c */ 3241/* file-item.c */
3189int btrfs_del_csums(struct btrfs_trans_handle *trans, 3242int btrfs_del_csums(struct btrfs_trans_handle *trans,
3190 struct btrfs_root *root, u64 bytenr, u64 len); 3243 struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3249 struct btrfs_root *root, 3302 struct btrfs_root *root,
3250 struct inode *dir, u64 objectid, 3303 struct inode *dir, u64 objectid,
3251 const char *name, int name_len); 3304 const char *name, int name_len);
3305int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3306 int front);
3252int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3307int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3253 struct btrfs_root *root, 3308 struct btrfs_root *root,
3254 struct inode *inode, u64 new_size, 3309 struct inode *inode, u64 new_size,
@@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
3308int btrfs_defrag_file(struct inode *inode, struct file *file, 3363int btrfs_defrag_file(struct inode *inode, struct file *file,
3309 struct btrfs_ioctl_defrag_range_args *range, 3364 struct btrfs_ioctl_defrag_range_args *range,
3310 u64 newer_than, unsigned long max_pages); 3365 u64 newer_than, unsigned long max_pages);
3366void btrfs_get_block_group_info(struct list_head *groups_list,
3367 struct btrfs_ioctl_space_info *space);
3368
3311/* file.c */ 3369/* file.c */
3312int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3370int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3313 struct inode *inode); 3371 struct inode *inode);
3314int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3372int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3315int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3373int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3316int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3374void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3317 int skip_pinned); 3375 int skip_pinned);
3376int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
3377 u64 start, u64 end, int skip_pinned,
3378 int modified);
3318extern const struct file_operations btrfs_file_operations; 3379extern const struct file_operations btrfs_file_operations;
3319int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 3380int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3320 u64 start, u64 end, u64 *hint_byte, int drop_cache); 3381 struct btrfs_root *root, struct inode *inode,
3382 struct btrfs_path *path, u64 start, u64 end,
3383 u64 *drop_end, int drop_cache);
3384int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3385 struct btrfs_root *root, struct inode *inode, u64 start,
3386 u64 end, int drop_cache);
3321int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 3387int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
3322 struct inode *inode, u64 start, u64 end); 3388 struct inode *inode, u64 start, u64 end);
3323int btrfs_release_file(struct inode *inode, struct file *file); 3389int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3378 } 3444 }
3379} 3445}
3380 3446
3447/*
3448 * Call btrfs_abort_transaction as early as possible when an error condition is
3449 * detected, so that the exact line number is reported.
3450 */
3451
3381#define btrfs_abort_transaction(trans, root, errno) \ 3452#define btrfs_abort_transaction(trans, root, errno) \
3382do { \ 3453do { \
3383 __btrfs_abort_transaction(trans, root, __func__, \ 3454 __btrfs_abort_transaction(trans, root, __func__, \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 52c85e2b95d0..478f66bdc57b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
29 29
30int __init btrfs_delayed_inode_init(void) 30int __init btrfs_delayed_inode_init(void)
31{ 31{
32 delayed_node_cache = kmem_cache_create("delayed_node", 32 delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
33 sizeof(struct btrfs_delayed_node), 33 sizeof(struct btrfs_delayed_node),
34 0, 34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
650 * we're accounted for. 650 * we're accounted for.
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv != &root->fs_info->delalloc_block_rsv)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
655 /* 655 /*
656 * Since we're under a transaction reserve_metadata_bytes could 656 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
668 num_bytes, 1); 668 num_bytes, 1);
669 } 669 }
670 return ret; 670 return ret;
671 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 671 } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
672 spin_lock(&BTRFS_I(inode)->lock); 672 spin_lock(&BTRFS_I(inode)->lock);
673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
674 &BTRFS_I(inode)->runtime_flags)) { 674 &BTRFS_I(inode)->runtime_flags)) {
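
Here is the first payoff of the new type field: delayed-inode used to recognize the delalloc reservation by pointer identity against &fs_info->delalloc_block_rsv, which misclassifies any reservation that is a copy of it or is reached through another pointer. Matching on what the reservation is for, rather than where it lives, reduces to a sketch like:

/* Classify a reservation by purpose rather than by address. */
static bool rsv_is_delalloc(const struct btrfs_block_rsv *rsv)
{
	return rsv && rsv->type == BTRFS_BLOCK_RSV_DELALLOC;
}

The same test works for stack-allocated or cloned reservations, which the address comparison never could.
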
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..7cda51995c1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48 48
49#ifdef CONFIG_X86
50#include <asm/cpufeature.h>
51#endif
52
49static struct extent_io_ops btree_extent_io_ops; 53static struct extent_io_ops btree_extent_io_ops;
50static void end_workqueue_fn(struct btrfs_work *work); 54static void end_workqueue_fn(struct btrfs_work *work);
51static void free_fs_root(struct btrfs_root *root); 55static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
217 write_lock(&em_tree->lock); 221 write_lock(&em_tree->lock);
218 ret = add_extent_mapping(em_tree, em); 222 ret = add_extent_mapping(em_tree, em);
219 if (ret == -EEXIST) { 223 if (ret == -EEXIST) {
220 u64 failed_start = em->start;
221 u64 failed_len = em->len;
222
223 free_extent_map(em); 224 free_extent_map(em);
224 em = lookup_extent_mapping(em_tree, start, len); 225 em = lookup_extent_mapping(em_tree, start, len);
225 if (em) { 226 if (!em)
226 ret = 0; 227 em = ERR_PTR(-EIO);
227 } else {
228 em = lookup_extent_mapping(em_tree, failed_start,
229 failed_len);
230 ret = -EIO;
231 }
232 } else if (ret) { 228 } else if (ret) {
233 free_extent_map(em); 229 free_extent_map(em);
234 em = NULL; 230 em = ERR_PTR(ret);
235 } 231 }
236 write_unlock(&em_tree->lock); 232 write_unlock(&em_tree->lock);
237 233
238 if (ret)
239 em = ERR_PTR(ret);
240out: 234out:
241 return em; 235 return em;
242} 236}
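
After this hunk btree_get_extent() has exactly two outcomes: a usable extent_map or an ERR_PTR() encoding the failure, never NULL and never an em/ret pair that can fall out of sync. Callers then collapse to the standard pattern; a sketch with the caller invented for illustration:

#include <linux/err.h>		/* IS_ERR, PTR_ERR */

static int read_one_mapping(struct inode *inode, u64 start, u64 len)
{
	struct extent_map *em;

	em = btree_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);	/* one check covers every failure */
	/* ... use em ... */
	free_extent_map(em);
	return 0;
}
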
@@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
439 WARN_ON(1); 433 WARN_ON(1);
440 return 0; 434 return 0;
441 } 435 }
442 if (eb->pages[0] != page) {
443 WARN_ON(1);
444 return 0;
445 }
446 if (!PageUptodate(page)) { 436 if (!PageUptodate(page)) {
447 WARN_ON(1); 437 WARN_ON(1);
448 return 0; 438 return 0;
@@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
869 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
870} 860}
871 861
862static int check_async_write(struct inode *inode, unsigned long bio_flags)
863{
864 if (bio_flags & EXTENT_BIO_TREE_LOG)
865 return 0;
866#ifdef CONFIG_X86
867 if (cpu_has_xmm4_2)
868 return 0;
869#endif
870 return 1;
871}
872
872static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 873static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
873 int mirror_num, unsigned long bio_flags, 874 int mirror_num, unsigned long bio_flags,
874 u64 bio_offset) 875 u64 bio_offset)
875{ 876{
877 int async = check_async_write(inode, bio_flags);
876 int ret; 878 int ret;
877 879
878 if (!(rw & REQ_WRITE)) { 880 if (!(rw & REQ_WRITE)) {
@@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
887 return ret; 889 return ret;
888 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
889 mirror_num, 0); 891 mirror_num, 0);
892 } else if (!async) {
893 ret = btree_csum_one_bio(bio);
894 if (ret)
895 return ret;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0);
890 } 898 }
891 899
892 /* 900 /*
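
check_async_write() decides whether checksumming a metadata write is worth bouncing to the worker pool: tree-log bios stay inline because fsync latency matters, and on x86 parts with SSE4.2 (the cpu_has_xmm4_2 probe) hardware CRC32c makes inline checksumming cheap enough that the offload buys nothing. The new else-if branch in btree_submit_bio_hook() is that inline path. The decision in isolation, with a stand-in flag value:

#define TREE_LOG_FLAG_MODEL (1UL << 1)	/* stand-in for EXTENT_BIO_TREE_LOG */

static int want_async_csum(unsigned long bio_flags, int hw_crc32c)
{
	if (bio_flags & TREE_LOG_FLAG_MODEL)
		return 0;	/* fsync path: checksum inline, submit now */
	if (hw_crc32c)
		return 0;	/* SSE4.2 CRC32c: offloading gains nothing */
	return 1;		/* software csum: let the workers absorb it */
}
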
@@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1168 atomic_set(&root->log_commit[0], 0); 1176 atomic_set(&root->log_commit[0], 0);
1169 atomic_set(&root->log_commit[1], 0); 1177 atomic_set(&root->log_commit[1], 0);
1170 atomic_set(&root->log_writers, 0); 1178 atomic_set(&root->log_writers, 0);
1179 atomic_set(&root->log_batch, 0);
1171 atomic_set(&root->orphan_inodes, 0); 1180 atomic_set(&root->orphan_inodes, 0);
1172 root->log_batch = 0;
1173 root->log_transid = 0; 1181 root->log_transid = 0;
1174 root->last_log_commit = 0; 1182 root->last_log_commit = 0;
1175 extent_io_tree_init(&root->dirty_log_pages, 1183 extent_io_tree_init(&root->dirty_log_pages,
@@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)
1667 spin_unlock(&root->fs_info->trans_lock); 1675 spin_unlock(&root->fs_info->trans_lock);
1668 1676
1669 /* If the file system is aborted, this will always fail. */ 1677 /* If the file system is aborted, this will always fail. */
1670 trans = btrfs_join_transaction(root); 1678 trans = btrfs_attach_transaction(root);
1671 if (IS_ERR(trans)) { 1679 if (IS_ERR(trans)) {
1672 cannot_commit = true; 1680 if (PTR_ERR(trans) != -ENOENT)
1681 cannot_commit = true;
1673 goto sleep; 1682 goto sleep;
1674 } 1683 }
1675 if (transid == trans->transid) { 1684 if (transid == trans->transid) {
@@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,
1994 INIT_LIST_HEAD(&fs_info->trans_list); 2003 INIT_LIST_HEAD(&fs_info->trans_list);
1995 INIT_LIST_HEAD(&fs_info->dead_roots); 2004 INIT_LIST_HEAD(&fs_info->dead_roots);
1996 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2005 INIT_LIST_HEAD(&fs_info->delayed_iputs);
1997 INIT_LIST_HEAD(&fs_info->hashers);
1998 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2006 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1999 INIT_LIST_HEAD(&fs_info->ordered_operations); 2007 INIT_LIST_HEAD(&fs_info->ordered_operations);
2000 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2008 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2001 spin_lock_init(&fs_info->delalloc_lock); 2009 spin_lock_init(&fs_info->delalloc_lock);
2002 spin_lock_init(&fs_info->trans_lock); 2010 spin_lock_init(&fs_info->trans_lock);
2003 spin_lock_init(&fs_info->ref_cache_lock);
2004 spin_lock_init(&fs_info->fs_roots_radix_lock); 2011 spin_lock_init(&fs_info->fs_roots_radix_lock);
2005 spin_lock_init(&fs_info->delayed_iput_lock); 2012 spin_lock_init(&fs_info->delayed_iput_lock);
2006 spin_lock_init(&fs_info->defrag_inodes_lock); 2013 spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,
2014 INIT_LIST_HEAD(&fs_info->space_info); 2021 INIT_LIST_HEAD(&fs_info->space_info);
2015 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2022 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2016 btrfs_mapping_init(&fs_info->mapping_tree); 2023 btrfs_mapping_init(&fs_info->mapping_tree);
2017 btrfs_init_block_rsv(&fs_info->global_block_rsv); 2024 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2018 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 2025 BTRFS_BLOCK_RSV_GLOBAL);
2019 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 2026 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2020 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 2027 BTRFS_BLOCK_RSV_DELALLOC);
2021 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 2028 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2022 btrfs_init_block_rsv(&fs_info->delayed_block_rsv); 2029 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2030 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2031 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2032 BTRFS_BLOCK_RSV_DELOPS);
2023 atomic_set(&fs_info->nr_async_submits, 0); 2033 atomic_set(&fs_info->nr_async_submits, 0);
2024 atomic_set(&fs_info->async_delalloc_pages, 0); 2034 atomic_set(&fs_info->async_delalloc_pages, 0);
2025 atomic_set(&fs_info->async_submit_draining, 0); 2035 atomic_set(&fs_info->async_submit_draining, 0);
@@ -2491,6 +2501,8 @@ retry_root_backup:
2491 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2501 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2492 goto fail_block_groups; 2502 goto fail_block_groups;
2493 } 2503 }
2504 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2494 2506
2495 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2496 "btrfs-cleaner"); 2508 "btrfs-cleaner");
@@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2874 printk_in_rcu("btrfs: disabling barriers on dev %s\n", 2886 printk_in_rcu("btrfs: disabling barriers on dev %s\n",
2875 rcu_str_deref(device->name)); 2887 rcu_str_deref(device->name));
2876 device->nobarriers = 1; 2888 device->nobarriers = 1;
2877 } 2889 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
2878 if (!bio_flagged(bio, BIO_UPTODATE)) {
2879 ret = -EIO; 2890 ret = -EIO;
2880 if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2891 btrfs_dev_stat_inc_and_print(device,
2881 btrfs_dev_stat_inc_and_print(device, 2892 BTRFS_DEV_STAT_FLUSH_ERRS);
2882 BTRFS_DEV_STAT_FLUSH_ERRS);
2883 } 2893 }
2884 2894
2885 /* drop the reference from the wait == 0 run */ 2895 /* drop the reference from the wait == 0 run */
@@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2918{ 2928{
2919 struct list_head *head; 2929 struct list_head *head;
2920 struct btrfs_device *dev; 2930 struct btrfs_device *dev;
2921 int errors = 0; 2931 int errors_send = 0;
2932 int errors_wait = 0;
2922 int ret; 2933 int ret;
2923 2934
2924 /* send down all the barriers */ 2935 /* send down all the barriers */
2925 head = &info->fs_devices->devices; 2936 head = &info->fs_devices->devices;
2926 list_for_each_entry_rcu(dev, head, dev_list) { 2937 list_for_each_entry_rcu(dev, head, dev_list) {
2927 if (!dev->bdev) { 2938 if (!dev->bdev) {
2928 errors++; 2939 errors_send++;
2929 continue; 2940 continue;
2930 } 2941 }
2931 if (!dev->in_fs_metadata || !dev->writeable) 2942 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2933 2944
2934 ret = write_dev_flush(dev, 0); 2945 ret = write_dev_flush(dev, 0);
2935 if (ret) 2946 if (ret)
2936 errors++; 2947 errors_send++;
2937 } 2948 }
2938 2949
2939 /* wait for all the barriers */ 2950 /* wait for all the barriers */
2940 list_for_each_entry_rcu(dev, head, dev_list) { 2951 list_for_each_entry_rcu(dev, head, dev_list) {
2941 if (!dev->bdev) { 2952 if (!dev->bdev) {
2942 errors++; 2953 errors_wait++;
2943 continue; 2954 continue;
2944 } 2955 }
2945 if (!dev->in_fs_metadata || !dev->writeable) 2956 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2947 2958
2948 ret = write_dev_flush(dev, 1); 2959 ret = write_dev_flush(dev, 1);
2949 if (ret) 2960 if (ret)
2950 errors++; 2961 errors_wait++;
2951 } 2962 }
2952 if (errors) 2963 if (errors_send > info->num_tolerated_disk_barrier_failures ||
2964 errors_wait > info->num_tolerated_disk_barrier_failures)
2953 return -EIO; 2965 return -EIO;
2954 return 0; 2966 return 0;
2955} 2967}
2956 2968
2969int btrfs_calc_num_tolerated_disk_barrier_failures(
2970 struct btrfs_fs_info *fs_info)
2971{
2972 struct btrfs_ioctl_space_info space;
2973 struct btrfs_space_info *sinfo;
2974 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2975 BTRFS_BLOCK_GROUP_SYSTEM,
2976 BTRFS_BLOCK_GROUP_METADATA,
2977 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2978 int num_types = 4;
2979 int i;
2980 int c;
2981 int num_tolerated_disk_barrier_failures =
2982 (int)fs_info->fs_devices->num_devices;
2983
2984 for (i = 0; i < num_types; i++) {
2985 struct btrfs_space_info *tmp;
2986
2987 sinfo = NULL;
2988 rcu_read_lock();
2989 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
2990 if (tmp->flags == types[i]) {
2991 sinfo = tmp;
2992 break;
2993 }
2994 }
2995 rcu_read_unlock();
2996
2997 if (!sinfo)
2998 continue;
2999
3000 down_read(&sinfo->groups_sem);
3001 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3002 if (!list_empty(&sinfo->block_groups[c])) {
3003 u64 flags;
3004
3005 btrfs_get_block_group_info(
3006 &sinfo->block_groups[c], &space);
3007 if (space.total_bytes == 0 ||
3008 space.used_bytes == 0)
3009 continue;
3010 flags = space.flags;
3011 /*
3012 * return
3013 * 0: if dup, single or RAID0 is configured for
3014 * any of metadata, system or data, else
3015 * 1: if RAID5 is configured, or if RAID1 or
3016 * RAID10 is configured and only two mirrors
3017 * are used, else
3018 * 2: if RAID6 is configured, else
3019 * num_mirrors - 1: if RAID1 or RAID10 is
3020 * configured and more than
3021 * 2 mirrors are used.
3022 */
3023 if (num_tolerated_disk_barrier_failures > 0 &&
3024 ((flags & (BTRFS_BLOCK_GROUP_DUP |
3025 BTRFS_BLOCK_GROUP_RAID0)) ||
3026 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3027 == 0)))
3028 num_tolerated_disk_barrier_failures = 0;
3029 else if (num_tolerated_disk_barrier_failures > 1
3030 &&
3031 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3032 BTRFS_BLOCK_GROUP_RAID10)))
3033 num_tolerated_disk_barrier_failures = 1;
3034 }
3035 }
3036 up_read(&sinfo->groups_sem);
3037 }
3038
3039 return num_tolerated_disk_barrier_failures;
3040}
3041
2957int write_all_supers(struct btrfs_root *root, int max_mirrors) 3042int write_all_supers(struct btrfs_root *root, int max_mirrors)
2958{ 3043{
2959 struct list_head *head; 3044 struct list_head *head;
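
btrfs_calc_num_tolerated_disk_barrier_failures() is a minimum over the redundancy of every profile actually holding data: start from the device count and only ever clamp downward. A user-space model following the comment embedded above (RAID5/6 appear in that comment but are not yet handled by this code, and DUP counts as zero because both copies share one device):

#include <stdint.h>

#define M_DUP	 (1u << 0)	/* stand-ins for BTRFS_BLOCK_GROUP_* bits */
#define M_RAID0	 (1u << 1)
#define M_RAID1	 (1u << 2)
#define M_RAID10 (1u << 3)
#define M_PROFILE_MASK (M_DUP | M_RAID0 | M_RAID1 | M_RAID10)

static int tolerated_failures(int ndevs, const uint32_t *profiles, int n)
{
	int tol = ndevs;	/* upper bound, only ever lowered */
	int i;

	for (i = 0; i < n; i++) {
		if ((profiles[i] & (M_DUP | M_RAID0)) ||
		    !(profiles[i] & M_PROFILE_MASK))
			tol = 0;	/* single/dup/raid0: no flush may fail */
		else if (tol > 1 && (profiles[i] & (M_RAID1 | M_RAID10)))
			tol = 1;	/* two mirrors: one flush may fail */
	}
	return tol;
}
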
@@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2976 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3061 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2977 head = &root->fs_info->fs_devices->devices; 3062 head = &root->fs_info->fs_devices->devices;
2978 3063
2979 if (do_barriers) 3064 if (do_barriers) {
2980 barrier_all_devices(root->fs_info); 3065 ret = barrier_all_devices(root->fs_info);
3066 if (ret) {
3067 mutex_unlock(
3068 &root->fs_info->fs_devices->device_list_mutex);
3069 btrfs_error(root->fs_info, ret,
3070 "errors while submitting device barriers.");
3071 return ret;
3072 }
3073 }
2981 3074
2982 list_for_each_entry_rcu(dev, head, dev_list) { 3075 list_for_each_entry_rcu(dev, head, dev_list) {
2983 if (!dev->bdev) { 3076 if (!dev->bdev) {
@@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
3211 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3304 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3212 (unsigned long long)fs_info->delalloc_bytes); 3305 (unsigned long long)fs_info->delalloc_bytes);
3213 } 3306 }
3214 if (fs_info->total_ref_cache_size) {
3215 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
3216 (unsigned long long)fs_info->total_ref_cache_size);
3217 }
3218 3307
3219 free_extent_buffer(fs_info->extent_root->node); 3308 free_extent_buffer(fs_info->extent_root->node);
3220 free_extent_buffer(fs_info->extent_root->commit_root); 3309 free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3360 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3449 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3361} 3450}
3362 3451
3363int btree_lock_page_hook(struct page *page, void *data,
3364 void (*flush_fn)(void *))
3365{
3366 struct inode *inode = page->mapping->host;
3367 struct btrfs_root *root = BTRFS_I(inode)->root;
3368 struct extent_buffer *eb;
3369
3370 /*
3371 * We culled this eb but the page is still hanging out on the mapping,
3372 * carry on.
3373 */
3374 if (!PagePrivate(page))
3375 goto out;
3376
3377 eb = (struct extent_buffer *)page->private;
3378 if (!eb) {
3379 WARN_ON(1);
3380 goto out;
3381 }
3382 if (page != eb->pages[0])
3383 goto out;
3384
3385 if (!btrfs_try_tree_write_lock(eb)) {
3386 flush_fn(data);
3387 btrfs_tree_lock(eb);
3388 }
3389 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3390
3391 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3392 spin_lock(&root->fs_info->delalloc_lock);
3393 if (root->fs_info->dirty_metadata_bytes >= eb->len)
3394 root->fs_info->dirty_metadata_bytes -= eb->len;
3395 else
3396 WARN_ON(1);
3397 spin_unlock(&root->fs_info->delalloc_lock);
3398 }
3399
3400 btrfs_tree_unlock(eb);
3401out:
3402 if (!trylock_page(page)) {
3403 flush_fn(data);
3404 lock_page(page);
3405 }
3406 return 0;
3407}
3408
3409static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 3452static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3410 int read_only) 3453 int read_only)
3411{ 3454{
@@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3608 3651
3609 while (1) { 3652 while (1) {
3610 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 3653 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3611 mark); 3654 mark, NULL);
3612 if (ret) 3655 if (ret)
3613 break; 3656 break;
3614 3657
@@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3663again: 3706again:
3664 while (1) { 3707 while (1) {
3665 ret = find_first_extent_bit(unpin, 0, &start, &end, 3708 ret = find_first_extent_bit(unpin, 0, &start, &end,
3666 EXTENT_DIRTY); 3709 EXTENT_DIRTY, NULL);
3667 if (ret) 3710 if (ret)
3668 break; 3711 break;
3669 3712
@@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3800} 3843}
3801 3844
3802static struct extent_io_ops btree_extent_io_ops = { 3845static struct extent_io_ops btree_extent_io_ops = {
3803 .write_cache_pages_lock_hook = btree_lock_page_hook,
3804 .readpage_end_io_hook = btree_readpage_end_io_hook, 3846 .readpage_end_io_hook = btree_readpage_end_io_hook,
3805 .readpage_io_failed_hook = btree_io_failed_hook, 3847 .readpage_io_failed_hook = btree_io_failed_hook,
3806 .submit_bio_hook = btree_submit_bio_hook, 3848 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..2025a9132c16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
95 u64 objectid); 95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data, 96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *)); 97 void (*flush_fn)(void *));
98int btrfs_calc_num_tolerated_disk_barrier_failures(
99 struct btrfs_fs_info *fs_info);
98 100
99#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
100void btrfs_init_lockdep(void); 102void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d3..3d3e2c17d8d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 u64 flags, struct btrfs_disk_key *key, 94 u64 flags, struct btrfs_disk_key *key,
95 int level, struct btrfs_key *ins); 95 int level, struct btrfs_key *ins);
96static int do_chunk_alloc(struct btrfs_trans_handle *trans, 96static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97 struct btrfs_root *extent_root, u64 alloc_bytes, 97 struct btrfs_root *extent_root, u64 flags,
98 u64 flags, int force); 98 int force);
99static int find_next_key(struct btrfs_path *path, int level, 99static int find_next_key(struct btrfs_path *path, int level,
100 struct btrfs_key *key); 100 struct btrfs_key *key);
101static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 101static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
312 while (start < end) { 312 while (start < end) {
313 ret = find_first_extent_bit(info->pinned_extents, start, 313 ret = find_first_extent_bit(info->pinned_extents, start,
314 &extent_start, &extent_end, 314 &extent_start, &extent_end,
315 EXTENT_DIRTY | EXTENT_UPTODATE); 315 EXTENT_DIRTY | EXTENT_UPTODATE,
316 NULL);
316 if (ret) 317 if (ret)
317 break; 318 break;
318 319
@@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2361 } 2362 }
2362 2363
2363next: 2364next:
2364 do_chunk_alloc(trans, fs_info->extent_root,
2365 2 * 1024 * 1024,
2366 btrfs_get_alloc_profile(root, 0),
2367 CHUNK_ALLOC_NO_FORCE);
2368 cond_resched(); 2365 cond_resched();
2369 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2370 } 2367 }
@@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2478 if (root == root->fs_info->extent_root) 2475 if (root == root->fs_info->extent_root)
2479 root = root->fs_info->tree_root; 2476 root = root->fs_info->tree_root;
2480 2477
2481 do_chunk_alloc(trans, root->fs_info->extent_root,
2482 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2483 CHUNK_ALLOC_NO_FORCE);
2484
2485 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2478 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2486 2479
2487 delayed_refs = &trans->transaction->delayed_refs; 2480 delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2544,12 @@ again:
2551 } 2544 }
2552 2545
2553 if (run_all) { 2546 if (run_all) {
2547 if (!list_empty(&trans->new_bgs)) {
2548 spin_unlock(&delayed_refs->lock);
2549 btrfs_create_pending_block_groups(trans, root);
2550 spin_lock(&delayed_refs->lock);
2551 }
2552
2554 node = rb_first(&delayed_refs->root); 2553 node = rb_first(&delayed_refs->root);
2555 if (!node) 2554 if (!node)
2556 goto out; 2555 goto out;
@@ -3406,7 +3405,6 @@ alloc:
3406 return PTR_ERR(trans); 3405 return PTR_ERR(trans);
3407 3406
3408 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3407 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3409 bytes + 2 * 1024 * 1024,
3410 alloc_target, 3408 alloc_target,
3411 CHUNK_ALLOC_NO_FORCE); 3409 CHUNK_ALLOC_NO_FORCE);
3412 btrfs_end_transaction(trans, root); 3410 btrfs_end_transaction(trans, root);
@@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3488} 3486}
3489 3487
3490static int should_alloc_chunk(struct btrfs_root *root, 3488static int should_alloc_chunk(struct btrfs_root *root,
3491 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3489 struct btrfs_space_info *sinfo, int force)
3492 int force)
3493{ 3490{
3494 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3491 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3495 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3492 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3504 * and purposes it's used space. Don't worry about locking the 3501 * and purposes it's used space. Don't worry about locking the
3505 * global_rsv, it doesn't change except when the transaction commits. 3502 * global_rsv, it doesn't change except when the transaction commits.
3506 */ 3503 */
3507 num_allocated += global_rsv->size; 3504 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505 num_allocated += global_rsv->size;
3508 3506
3509 /* 3507 /*
3510 * in limited mode, we want to have some free space up to 3508 * in limited mode, we want to have some free space up to
@@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3518 if (num_bytes - num_allocated < thresh) 3516 if (num_bytes - num_allocated < thresh)
3519 return 1; 3517 return 1;
3520 } 3518 }
3521 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3522 3519
3523 /* 256MB or 2% of the FS */ 3520 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3524 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3525 /* system chunks need a much small threshold */
3526 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3527 thresh = 32 * 1024 * 1024;
3528
3529 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3530 return 0; 3521 return 0;
3531 return 1; 3522 return 1;
3532} 3523}
@@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
3576} 3567}
3577 3568
3578static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3569static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3579 struct btrfs_root *extent_root, u64 alloc_bytes, 3570 struct btrfs_root *extent_root, u64 flags, int force)
3580 u64 flags, int force)
3581{ 3571{
3582 struct btrfs_space_info *space_info; 3572 struct btrfs_space_info *space_info;
3583 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3573 struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3591,7 @@ again:
3601 return 0; 3591 return 0;
3602 } 3592 }
3603 3593
3604 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3594 if (!should_alloc_chunk(extent_root, space_info, force)) {
3605 spin_unlock(&space_info->lock); 3595 spin_unlock(&space_info->lock);
3606 return 0; 3596 return 0;
3607 } else if (space_info->chunk_alloc) { 3597 } else if (space_info->chunk_alloc) {
@@ -3669,6 +3659,46 @@ out:
3669 return ret; 3659 return ret;
3670} 3660}
3671 3661
3662static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush)
3665{
3666 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail;
3668 u64 used;
3669
3670 used = space_info->bytes_used + space_info->bytes_reserved +
3671 space_info->bytes_pinned + space_info->bytes_readonly +
3672 space_info->bytes_may_use;
3673
3674 spin_lock(&root->fs_info->free_chunk_lock);
3675 avail = root->fs_info->free_chunk_space;
3676 spin_unlock(&root->fs_info->free_chunk_lock);
3677
3678 /*
3679 * If we have dup, raid1 or raid10 then only half of the free
3680 * space is actually useable.
3681 */
3682 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683 BTRFS_BLOCK_GROUP_RAID1 |
3684 BTRFS_BLOCK_GROUP_RAID10))
3685 avail >>= 1;
3686
3687 /*
3688 * If we aren't flushing don't let us overcommit too much, say
3689 * 1/8th of the space. If we can flush, let it overcommit up to
3690 * 1/2 of the space.
3691 */
3692 if (flush)
3693 avail >>= 3;
3694 else
3695 avail >>= 1;
3696
3697 if (used + bytes < space_info->total_bytes + avail)
3698 return 1;
3699 return 0;
3700}
3701
3672/* 3702/*
3673 * shrink metadata reservation for delalloc 3703 * shrink metadata reservation for delalloc
3674 */ 3704 */
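
can_overcommit() centralizes the heuristic that reserve_metadata_bytes() used to open-code below: how far bytes_may_use may run past total_bytes on the strength of still-unallocated chunk space. Its used input is the sum of bytes_used, bytes_reserved, bytes_pinned, bytes_readonly and bytes_may_use. One oddity is worth flagging: the in-line comment and the shifts disagree about which case gets 1/8th; the model below follows the code as written.

#include <stdint.h>

static int can_overcommit_model(uint64_t used, uint64_t total_bytes,
				uint64_t free_chunk_space, uint64_t bytes,
				int mirrored, int flush)
{
	uint64_t avail = free_chunk_space;

	if (mirrored)		/* dup/raid1/raid10 store everything twice */
		avail >>= 1;
	if (flush)		/* per the code: 1/8th when flushing ... */
		avail >>= 3;
	else			/* ... 1/2 when not (the comment says the reverse) */
		avail >>= 1;
	return used + bytes < total_bytes + avail;
}
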
@@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3693 if (delalloc_bytes == 0) { 3723 if (delalloc_bytes == 0) {
3694 if (trans) 3724 if (trans)
3695 return; 3725 return;
3696 btrfs_wait_ordered_extents(root, 0, 0); 3726 btrfs_wait_ordered_extents(root, 0);
3697 return; 3727 return;
3698 } 3728 }
3699 3729
@@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3703 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3704 WB_REASON_FS_FREE_SPACE); 3734 WB_REASON_FS_FREE_SPACE);
3705 3735
3736 /*
3737 * We need to wait for the async pages to actually start before
3738 * we do anything.
3739 */
3740 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages));
3742
3706 spin_lock(&space_info->lock); 3743 spin_lock(&space_info->lock);
3707 if (space_info->bytes_used + space_info->bytes_reserved + 3744 if (can_overcommit(root, space_info, orig, !trans)) {
3708 space_info->bytes_pinned + space_info->bytes_readonly +
3709 space_info->bytes_may_use + orig <=
3710 space_info->total_bytes) {
3711 spin_unlock(&space_info->lock); 3745 spin_unlock(&space_info->lock);
3712 break; 3746 break;
3713 } 3747 }
@@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3715 3749
3716 loops++; 3750 loops++;
3717 if (wait_ordered && !trans) { 3751 if (wait_ordered && !trans) {
3718 btrfs_wait_ordered_extents(root, 0, 0); 3752 btrfs_wait_ordered_extents(root, 0);
3719 } else { 3753 } else {
3720 time_left = schedule_timeout_killable(1); 3754 time_left = schedule_timeout_killable(1);
3721 if (time_left) 3755 if (time_left)
@@ -3784,11 +3818,12 @@ commit:
3784} 3818}
3785 3819
3786enum flush_state { 3820enum flush_state {
3787 FLUSH_DELALLOC = 1, 3821 FLUSH_DELAYED_ITEMS_NR = 1,
3788 FLUSH_DELALLOC_WAIT = 2, 3822 FLUSH_DELAYED_ITEMS = 2,
3789 FLUSH_DELAYED_ITEMS_NR = 3, 3823 FLUSH_DELALLOC = 3,
3790 FLUSH_DELAYED_ITEMS = 4, 3824 FLUSH_DELALLOC_WAIT = 4,
3791 COMMIT_TRANS = 5, 3825 ALLOC_CHUNK = 5,
3826 COMMIT_TRANS = 6,
3792}; 3827};
3793 3828
3794static int flush_space(struct btrfs_root *root, 3829static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,
3800 int ret = 0; 3835 int ret = 0;
3801 3836
3802 switch (state) { 3837 switch (state) {
3803 case FLUSH_DELALLOC:
3804 case FLUSH_DELALLOC_WAIT:
3805 shrink_delalloc(root, num_bytes, orig_bytes,
3806 state == FLUSH_DELALLOC_WAIT);
3807 break;
3808 case FLUSH_DELAYED_ITEMS_NR: 3838 case FLUSH_DELAYED_ITEMS_NR:
3809 case FLUSH_DELAYED_ITEMS: 3839 case FLUSH_DELAYED_ITEMS:
3810 if (state == FLUSH_DELAYED_ITEMS_NR) { 3840 if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,
3825 ret = btrfs_run_delayed_items_nr(trans, root, nr); 3855 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3826 btrfs_end_transaction(trans, root); 3856 btrfs_end_transaction(trans, root);
3827 break; 3857 break;
3858 case FLUSH_DELALLOC:
3859 case FLUSH_DELALLOC_WAIT:
3860 shrink_delalloc(root, num_bytes, orig_bytes,
3861 state == FLUSH_DELALLOC_WAIT);
3862 break;
3863 case ALLOC_CHUNK:
3864 trans = btrfs_join_transaction(root);
3865 if (IS_ERR(trans)) {
3866 ret = PTR_ERR(trans);
3867 break;
3868 }
3869 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870 btrfs_get_alloc_profile(root, 0),
3871 CHUNK_ALLOC_NO_FORCE);
3872 btrfs_end_transaction(trans, root);
3873 if (ret == -ENOSPC)
3874 ret = 0;
3875 break;
3828 case COMMIT_TRANS: 3876 case COMMIT_TRANS:
3829 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 3877 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3830 break; 3878 break;
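
The enum reshuffle is the point of this pair of hunks: flushing delayed items is far cheaper than writing back delalloc, so it now runs first, and two genuinely new rungs slot in, ALLOC_CHUNK ahead of the last-resort COMMIT_TRANS. reserve_metadata_bytes() simply climbs the ladder until the reservation fits. Roughly, with try_reserve() standing in for the retry the real function performs under space_info->lock:

static bool try_reserve(struct btrfs_space_info *s, u64 bytes); /* stand-in */

static int reserve_with_escalation(struct btrfs_root *root,
				   struct btrfs_space_info *space_info,
				   u64 num_bytes, u64 orig_bytes)
{
	int state = FLUSH_DELAYED_ITEMS_NR;	/* cheapest rung first */
	int ret;

	while (state <= COMMIT_TRANS) {
		ret = flush_space(root, space_info, num_bytes, orig_bytes,
				  state);
		if (ret)
			return ret;
		if (try_reserve(space_info, orig_bytes))
			return 0;
		state++;	/* escalate to the next, costlier rung */
	}
	return -ENOSPC;
}
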
@@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3856 struct btrfs_space_info *space_info = block_rsv->space_info; 3904 struct btrfs_space_info *space_info = block_rsv->space_info;
3857 u64 used; 3905 u64 used;
3858 u64 num_bytes = orig_bytes; 3906 u64 num_bytes = orig_bytes;
3859 int flush_state = FLUSH_DELALLOC; 3907 int flush_state = FLUSH_DELAYED_ITEMS_NR;
3860 int ret = 0; 3908 int ret = 0;
3861 bool flushing = false; 3909 bool flushing = false;
3862 bool committed = false;
3863 3910
3864again: 3911again:
3865 ret = 0; 3912 ret = 0;
@@ -3922,57 +3969,12 @@ again:
3922 (orig_bytes * 2); 3969 (orig_bytes * 2);
3923 } 3970 }
3924 3971
3925 if (ret) { 3972 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3926 u64 profile = btrfs_get_alloc_profile(root, 0); 3973 space_info->bytes_may_use += orig_bytes;
3927 u64 avail; 3974 trace_btrfs_space_reservation(root->fs_info, "space_info",
3928 3975 space_info->flags, orig_bytes,
3929 /* 3976 1);
3930 * If we have a lot of space that's pinned, don't bother doing 3977 ret = 0;
3931 * the overcommit dance yet and just commit the transaction.
3932 */
3933 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3934 do_div(avail, 10);
3935 if (space_info->bytes_pinned >= avail && flush && !committed) {
3936 space_info->flush = 1;
3937 flushing = true;
3938 spin_unlock(&space_info->lock);
3939 ret = may_commit_transaction(root, space_info,
3940 orig_bytes, 1);
3941 if (ret)
3942 goto out;
3943 committed = true;
3944 goto again;
3945 }
3946
3947 spin_lock(&root->fs_info->free_chunk_lock);
3948 avail = root->fs_info->free_chunk_space;
3949
3950 /*
3951 * If we have dup, raid1 or raid10 then only half of the free
3952 * space is actually useable.
3953 */
3954 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3955 BTRFS_BLOCK_GROUP_RAID1 |
3956 BTRFS_BLOCK_GROUP_RAID10))
3957 avail >>= 1;
3958
3959 /*
3960 * If we aren't flushing don't let us overcommit too much, say
3961 * 1/8th of the space. If we can flush, let it overcommit up to
3962 * 1/2 of the space.
3963 */
3964 if (flush)
3965 avail >>= 3;
3966 else
3967 avail >>= 1;
3968 spin_unlock(&root->fs_info->free_chunk_lock);
3969
3970 if (used + num_bytes < space_info->total_bytes + avail) {
3971 space_info->bytes_may_use += orig_bytes;
3972 trace_btrfs_space_reservation(root->fs_info,
3973 "space_info", space_info->flags, orig_bytes, 1);
3974 ret = 0;
3975 }
3976 } 3978 }
3977 3979
3978 /* 3980 /*
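
The open-coded overcommit test removed above is folded into a can_overcommit() helper whose body sits outside this hunk; the pinned-bytes "commit early" heuristic is dropped entirely. A sketch of what the helper presumably computes, reconstructed from the deleted logic (the committed version may differ in detail):

	static bool can_overcommit(struct btrfs_root *root,
				   struct btrfs_space_info *space_info,
				   u64 bytes, bool flush)
	{
		u64 profile = btrfs_get_alloc_profile(root, 0);
		u64 used = space_info->bytes_used +
			   space_info->bytes_reserved +
			   space_info->bytes_pinned +
			   space_info->bytes_readonly +
			   space_info->bytes_may_use;
		u64 avail;

		spin_lock(&root->fs_info->free_chunk_lock);
		avail = root->fs_info->free_chunk_space;
		spin_unlock(&root->fs_info->free_chunk_lock);

		/* dup/raid1/raid10 keep two copies; only half is usable */
		if (profile & (BTRFS_BLOCK_GROUP_DUP |
			       BTRFS_BLOCK_GROUP_RAID1 |
			       BTRFS_BLOCK_GROUP_RAID10))
			avail >>= 1;

		/* overcommit 1/8 when flushing can reclaim space, else 1/2 */
		if (flush)
			avail >>= 3;
		else
			avail >>= 1;

		return used + bytes < space_info->total_bytes + avail;
	}
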
@@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4114 return 0; 4116 return 0;
4115} 4117}
4116 4118
4117void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) 4119void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4118{ 4120{
4119 memset(rsv, 0, sizeof(*rsv)); 4121 memset(rsv, 0, sizeof(*rsv));
4120 spin_lock_init(&rsv->lock); 4122 spin_lock_init(&rsv->lock);
4123 rsv->type = type;
4121} 4124}
4122 4125
4123struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 4126struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127 unsigned short type)
4124{ 4128{
4125 struct btrfs_block_rsv *block_rsv; 4129 struct btrfs_block_rsv *block_rsv;
4126 struct btrfs_fs_info *fs_info = root->fs_info; 4130 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4129 if (!block_rsv) 4133 if (!block_rsv)
4130 return NULL; 4134 return NULL;
4131 4135
4132 btrfs_init_block_rsv(block_rsv); 4136 btrfs_init_block_rsv(block_rsv, type);
4133 block_rsv->space_info = __find_space_info(fs_info, 4137 block_rsv->space_info = __find_space_info(fs_info,
4134 BTRFS_BLOCK_GROUP_METADATA); 4138 BTRFS_BLOCK_GROUP_METADATA);
4135 return block_rsv; 4139 return block_rsv;
@@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4138void btrfs_free_block_rsv(struct btrfs_root *root, 4142void btrfs_free_block_rsv(struct btrfs_root *root,
4139 struct btrfs_block_rsv *rsv) 4143 struct btrfs_block_rsv *rsv)
4140{ 4144{
4145 if (!rsv)
4146 return;
4141 btrfs_block_rsv_release(root, rsv, (u64)-1); 4147 btrfs_block_rsv_release(root, rsv, (u64)-1);
4142 kfree(rsv); 4148 kfree(rsv);
4143} 4149}
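
Reservations are now typed at init time, and freeing a NULL rsv is a no-op so error paths can free unconditionally. Caller sketch; BTRFS_BLOCK_RSV_TEMP stands in for whichever type constant the series defines in ctree.h:

	struct btrfs_block_rsv *rsv;

	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	/* ... use the reservation ... */
	btrfs_free_block_rsv(root, rsv);	/* safe even when rsv == NULL */
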
@@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4416 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4422 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4417 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4423 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4418 /* 4424 /*
4419 * two for root back/forward refs, two for directory entries 4425 * two for root back/forward refs, two for directory entries,
4420 * and one for root of the snapshot. 4426 * one for root of the snapshot and one for parent inode.
4421 */ 4427 */
4422 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 4428 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4423 dst_rsv->space_info = src_rsv->space_info; 4429 dst_rsv->space_info = src_rsv->space_info;
4424 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4430 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4425} 4431}
@@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5018 5024
5019 while (1) { 5025 while (1) {
5020 ret = find_first_extent_bit(unpin, 0, &start, &end, 5026 ret = find_first_extent_bit(unpin, 0, &start, &end,
5021 EXTENT_DIRTY); 5027 EXTENT_DIRTY, NULL);
5022 if (ret) 5028 if (ret)
5023 break; 5029 break;
5024 5030
@@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5096 ret = remove_extent_backref(trans, extent_root, path, 5102 ret = remove_extent_backref(trans, extent_root, path,
5097 NULL, refs_to_drop, 5103 NULL, refs_to_drop,
5098 is_data); 5104 is_data);
5099 if (ret) 5105 if (ret) {
5100 goto abort; 5106 btrfs_abort_transaction(trans, extent_root, ret);
5107 goto out;
5108 }
5101 btrfs_release_path(path); 5109 btrfs_release_path(path);
5102 path->leave_spinning = 1; 5110 path->leave_spinning = 1;
5103 5111
@@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5115 btrfs_print_leaf(extent_root, 5123 btrfs_print_leaf(extent_root,
5116 path->nodes[0]); 5124 path->nodes[0]);
5117 } 5125 }
5118 if (ret < 0) 5126 if (ret < 0) {
5119 goto abort; 5127 btrfs_abort_transaction(trans, extent_root, ret);
5128 goto out;
5129 }
5120 extent_slot = path->slots[0]; 5130 extent_slot = path->slots[0];
5121 } 5131 }
5122 } else if (ret == -ENOENT) { 5132 } else if (ret == -ENOENT) {
@@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5130 (unsigned long long)owner_objectid, 5140 (unsigned long long)owner_objectid,
5131 (unsigned long long)owner_offset); 5141 (unsigned long long)owner_offset);
5132 } else { 5142 } else {
5133 goto abort; 5143 btrfs_abort_transaction(trans, extent_root, ret);
5144 goto out;
5134 } 5145 }
5135 5146
5136 leaf = path->nodes[0]; 5147 leaf = path->nodes[0];
@@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5140 BUG_ON(found_extent || extent_slot != path->slots[0]); 5151 BUG_ON(found_extent || extent_slot != path->slots[0]);
5141 ret = convert_extent_item_v0(trans, extent_root, path, 5152 ret = convert_extent_item_v0(trans, extent_root, path,
5142 owner_objectid, 0); 5153 owner_objectid, 0);
5143 if (ret < 0) 5154 if (ret < 0) {
5144 goto abort; 5155 btrfs_abort_transaction(trans, extent_root, ret);
5156 goto out;
5157 }
5145 5158
5146 btrfs_release_path(path); 5159 btrfs_release_path(path);
5147 path->leave_spinning = 1; 5160 path->leave_spinning = 1;
@@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5158 (unsigned long long)bytenr); 5171 (unsigned long long)bytenr);
5159 btrfs_print_leaf(extent_root, path->nodes[0]); 5172 btrfs_print_leaf(extent_root, path->nodes[0]);
5160 } 5173 }
5161 if (ret < 0) 5174 if (ret < 0) {
5162 goto abort; 5175 btrfs_abort_transaction(trans, extent_root, ret);
5176 goto out;
5177 }
5178
5163 extent_slot = path->slots[0]; 5179 extent_slot = path->slots[0];
5164 leaf = path->nodes[0]; 5180 leaf = path->nodes[0];
5165 item_size = btrfs_item_size_nr(leaf, extent_slot); 5181 item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5196 ret = remove_extent_backref(trans, extent_root, path, 5212 ret = remove_extent_backref(trans, extent_root, path,
5197 iref, refs_to_drop, 5213 iref, refs_to_drop,
5198 is_data); 5214 is_data);
5199 if (ret) 5215 if (ret) {
5200 goto abort; 5216 btrfs_abort_transaction(trans, extent_root, ret);
5217 goto out;
5218 }
5201 } 5219 }
5202 } else { 5220 } else {
5203 if (found_extent) { 5221 if (found_extent) {
@@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5214 5232
5215 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5233 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5216 num_to_del); 5234 num_to_del);
5217 if (ret) 5235 if (ret) {
5218 goto abort; 5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238 }
5219 btrfs_release_path(path); 5239 btrfs_release_path(path);
5220 5240
5221 if (is_data) { 5241 if (is_data) {
5222 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5242 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5223 if (ret) 5243 if (ret) {
5224 goto abort; 5244 btrfs_abort_transaction(trans, extent_root, ret);
5245 goto out;
5246 }
5225 } 5247 }
5226 5248
5227 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5249 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5228 if (ret) 5250 if (ret) {
5229 goto abort; 5251 btrfs_abort_transaction(trans, extent_root, ret);
5252 goto out;
5253 }
5230 } 5254 }
5231out: 5255out:
5232 btrfs_free_path(path); 5256 btrfs_free_path(path);
5233 return ret; 5257 return ret;
5234
5235abort:
5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238} 5258}
5239 5259
5240/* 5260/*
@@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5497 struct btrfs_block_group_cache *used_block_group; 5517 struct btrfs_block_group_cache *used_block_group;
5498 u64 search_start = 0; 5518 u64 search_start = 0;
5499 int empty_cluster = 2 * 1024 * 1024; 5519 int empty_cluster = 2 * 1024 * 1024;
5500 int allowed_chunk_alloc = 0;
5501 int done_chunk_alloc = 0;
5502 struct btrfs_space_info *space_info; 5520 struct btrfs_space_info *space_info;
5503 int loop = 0; 5521 int loop = 0;
5504 int index = 0; 5522 int index = 0;
@@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5530 if (btrfs_mixed_space_info(space_info)) 5548 if (btrfs_mixed_space_info(space_info))
5531 use_cluster = false; 5549 use_cluster = false;
5532 5550
5533 if (orig_root->ref_cows || empty_size)
5534 allowed_chunk_alloc = 1;
5535
5536 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5551 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5537 last_ptr = &root->fs_info->meta_alloc_cluster; 5552 last_ptr = &root->fs_info->meta_alloc_cluster;
5538 if (!btrfs_test_opt(root, SSD)) 5553 if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5821,6 @@ checks:
5806 5821
5807 trace_btrfs_reserve_extent(orig_root, block_group, 5822 trace_btrfs_reserve_extent(orig_root, block_group,
5808 search_start, num_bytes); 5823 search_start, num_bytes);
5809 if (offset < search_start)
5810 btrfs_add_free_space(used_block_group, offset,
5811 search_start - offset);
5812 BUG_ON(offset > search_start);
5813 if (used_block_group != block_group) 5824 if (used_block_group != block_group)
5814 btrfs_put_block_group(used_block_group); 5825 btrfs_put_block_group(used_block_group);
5815 btrfs_put_block_group(block_group); 5826 btrfs_put_block_group(block_group);
@@ -5842,34 +5853,17 @@ loop:
5842 index = 0; 5853 index = 0;
5843 loop++; 5854 loop++;
5844 if (loop == LOOP_ALLOC_CHUNK) { 5855 if (loop == LOOP_ALLOC_CHUNK) {
5845 if (allowed_chunk_alloc) { 5856 ret = do_chunk_alloc(trans, root, data,
5846 ret = do_chunk_alloc(trans, root, num_bytes + 5857 CHUNK_ALLOC_FORCE);
5847 2 * 1024 * 1024, data, 5858 /*
5848 CHUNK_ALLOC_LIMITED); 5859 * Do not bail out on ENOSPC since we
5849 /* 5860 * can do more things.
5850 * Do not bail out on ENOSPC since we 5861 */
5851 * can do more things. 5862 if (ret < 0 && ret != -ENOSPC) {
5852 */ 5863 btrfs_abort_transaction(trans,
5853 if (ret < 0 && ret != -ENOSPC) { 5864 root, ret);
5854 btrfs_abort_transaction(trans, 5865 goto out;
5855 root, ret);
5856 goto out;
5857 }
5858 allowed_chunk_alloc = 0;
5859 if (ret == 1)
5860 done_chunk_alloc = 1;
5861 } else if (!done_chunk_alloc &&
5862 space_info->force_alloc ==
5863 CHUNK_ALLOC_NO_FORCE) {
5864 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5865 } 5866 }
5866
5867 /*
5868 * We didn't allocate a chunk, go ahead and drop the
5869 * empty size and loop again.
5870 */
5871 if (!done_chunk_alloc)
5872 loop = LOOP_NO_EMPTY_SIZE;
5873 } 5867 }
5874 5868
5875 if (loop == LOOP_NO_EMPTY_SIZE) { 5869 if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5944 5938
5945 data = btrfs_get_alloc_profile(root, data); 5939 data = btrfs_get_alloc_profile(root, data);
5946again: 5940again:
5947 /*
5948 * the only place that sets empty_size is btrfs_realloc_node, which
5949 * is not called recursively on allocations
5950 */
5951 if (empty_size || root->ref_cows) {
5952 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5953 num_bytes + 2 * 1024 * 1024, data,
5954 CHUNK_ALLOC_NO_FORCE);
5955 if (ret < 0 && ret != -ENOSPC) {
5956 btrfs_abort_transaction(trans, root, ret);
5957 return ret;
5958 }
5959 }
5960
5961 WARN_ON(num_bytes < root->sectorsize); 5941 WARN_ON(num_bytes < root->sectorsize);
5962 ret = find_free_extent(trans, root, num_bytes, empty_size, 5942 ret = find_free_extent(trans, root, num_bytes, empty_size,
5963 hint_byte, ins, data); 5943 hint_byte, ins, data);
@@ -5967,12 +5947,6 @@ again:
5967 num_bytes = num_bytes >> 1; 5947 num_bytes = num_bytes >> 1;
5968 num_bytes = num_bytes & ~(root->sectorsize - 1); 5948 num_bytes = num_bytes & ~(root->sectorsize - 1);
5969 num_bytes = max(num_bytes, min_alloc_size); 5949 num_bytes = max(num_bytes, min_alloc_size);
5970 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5971 num_bytes, data, CHUNK_ALLOC_FORCE);
5972 if (ret < 0 && ret != -ENOSPC) {
5973 btrfs_abort_transaction(trans, root, ret);
5974 return ret;
5975 }
5976 if (num_bytes == min_alloc_size) 5950 if (num_bytes == min_alloc_size)
5977 final_tried = true; 5951 final_tried = true;
5978 goto again; 5952 goto again;
@@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6314 ret = block_rsv_use_bytes(block_rsv, blocksize); 6288 ret = block_rsv_use_bytes(block_rsv, blocksize);
6315 if (!ret) 6289 if (!ret)
6316 return block_rsv; 6290 return block_rsv;
6317 if (ret) { 6291 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6292 static DEFINE_RATELIMIT_STATE(_rs,
6319 DEFAULT_RATELIMIT_INTERVAL, 6293 DEFAULT_RATELIMIT_INTERVAL,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6294 /*DEFAULT_RATELIMIT_BURST*/ 2);
@@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7279 7253
7280 alloc_flags = update_block_group_flags(root, cache->flags); 7254 alloc_flags = update_block_group_flags(root, cache->flags);
7281 if (alloc_flags != cache->flags) { 7255 if (alloc_flags != cache->flags) {
7282 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7256 ret = do_chunk_alloc(trans, root, alloc_flags,
7283 CHUNK_ALLOC_FORCE); 7257 CHUNK_ALLOC_FORCE);
7284 if (ret < 0) 7258 if (ret < 0)
7285 goto out; 7259 goto out;
@@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7289 if (!ret) 7263 if (!ret)
7290 goto out; 7264 goto out;
7291 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7265 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7292 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7266 ret = do_chunk_alloc(trans, root, alloc_flags,
7293 CHUNK_ALLOC_FORCE); 7267 CHUNK_ALLOC_FORCE);
7294 if (ret < 0) 7268 if (ret < 0)
7295 goto out; 7269 goto out;
@@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7303 struct btrfs_root *root, u64 type) 7277 struct btrfs_root *root, u64 type)
7304{ 7278{
7305 u64 alloc_flags = get_alloc_profile(root, type); 7279 u64 alloc_flags = get_alloc_profile(root, type);
7306 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7280 return do_chunk_alloc(trans, root, alloc_flags,
7307 CHUNK_ALLOC_FORCE); 7281 CHUNK_ALLOC_FORCE);
7308} 7282}
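
do_chunk_alloc() loses its byte-count hint everywhere: chunk sizing policy now lives inside the allocator, so callers pass only the target flags and a force mode. The new calling shape, as used by the ALLOC_CHUNK flush step above (extent_root and alloc_flags assumed in scope):

	ret = do_chunk_alloc(trans, extent_root, alloc_flags,
			     CHUNK_ALLOC_NO_FORCE);
	/* ENOSPC is not fatal here; later flush steps can still help */
	if (ret < 0 && ret != -ENOSPC)
		btrfs_abort_transaction(trans, extent_root, ret);
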
7309 7283
@@ -7810,6 +7784,34 @@ error:
7810 return ret; 7784 return ret;
7811} 7785}
7812 7786
7787void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7788 struct btrfs_root *root)
7789{
7790 struct btrfs_block_group_cache *block_group, *tmp;
7791 struct btrfs_root *extent_root = root->fs_info->extent_root;
7792 struct btrfs_block_group_item item;
7793 struct btrfs_key key;
7794 int ret = 0;
7795
7796 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7797 new_bg_list) {
7798 list_del_init(&block_group->new_bg_list);
7799
7800 if (ret)
7801 continue;
7802
7803 spin_lock(&block_group->lock);
7804 memcpy(&item, &block_group->item, sizeof(item));
7805 memcpy(&key, &block_group->key, sizeof(key));
7806 spin_unlock(&block_group->lock);
7807
7808 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7809 sizeof(item));
7810 if (ret)
7811 btrfs_abort_transaction(trans, extent_root, ret);
7812 }
7813}
7814
7813int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7815int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7814 struct btrfs_root *root, u64 bytes_used, 7816 struct btrfs_root *root, u64 bytes_used,
7815 u64 type, u64 chunk_objectid, u64 chunk_offset, 7817 u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7843 spin_lock_init(&cache->lock); 7845 spin_lock_init(&cache->lock);
7844 INIT_LIST_HEAD(&cache->list); 7846 INIT_LIST_HEAD(&cache->list);
7845 INIT_LIST_HEAD(&cache->cluster_list); 7847 INIT_LIST_HEAD(&cache->cluster_list);
7848 INIT_LIST_HEAD(&cache->new_bg_list);
7846 7849
7847 btrfs_init_free_space_ctl(cache); 7850 btrfs_init_free_space_ctl(cache);
7848 7851
@@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7874 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7877 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7875 BUG_ON(ret); /* Logic error */ 7878 BUG_ON(ret); /* Logic error */
7876 7879
7877 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7880 list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7878 sizeof(cache->item));
7879 if (ret) {
7880 btrfs_abort_transaction(trans, extent_root, ret);
7881 return ret;
7882 }
7883 7881
7884 set_avail_alloc_bits(extent_root->fs_info, type); 7882 set_avail_alloc_bits(extent_root->fs_info, type);
7885 7883
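
Block group items queued on trans->new_bgs by btrfs_make_block_group() (below) are now inserted in a single batch. The draining call site is outside this hunk; presumably the transaction commit/end path runs it before the extent tree is written out:

	/* assumed call site in the commit path */
	btrfs_create_pending_block_groups(trans, root);
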
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b08ea4717e9d..8036d3a84853 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
45 struct bio *bio; 45 struct bio *bio;
46 struct extent_io_tree *tree; 46 struct extent_io_tree *tree;
47 get_extent_t *get_extent; 47 get_extent_t *get_extent;
48 unsigned long bio_flags;
48 49
49 /* tells writepage not to lock the state bits for this range 50 /* tells writepage not to lock the state bits for this range
50 * it still does the unlocking 51 * it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
64 65
65int __init extent_io_init(void) 66int __init extent_io_init(void)
66{ 67{
67 extent_state_cache = kmem_cache_create("extent_state", 68 extent_state_cache = kmem_cache_create("btrfs_extent_state",
68 sizeof(struct extent_state), 0, 69 sizeof(struct extent_state), 0,
69 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
70 if (!extent_state_cache) 71 if (!extent_state_cache)
71 return -ENOMEM; 72 return -ENOMEM;
72 73
73 extent_buffer_cache = kmem_cache_create("extent_buffers", 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
74 sizeof(struct extent_buffer), 0, 75 sizeof(struct extent_buffer), 0,
75 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
76 if (!extent_buffer_cache) 77 if (!extent_buffer_cache)
@@ -942,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
942 * @end: the end offset in bytes (inclusive) 943 * @end: the end offset in bytes (inclusive)
943 * @bits: the bits to set in this range 944 * @bits: the bits to set in this range
944 * @clear_bits: the bits to clear in this range 945 * @clear_bits: the bits to clear in this range
946 * @cached_state: state that we're going to cache
945 * @mask: the allocation mask 947 * @mask: the allocation mask
946 * 948 *
947 * This will go through and set bits for the given range. If any states exist 949 * This will go through and set bits for the given range. If any states exist
@@ -951,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
951 * boundary bits like LOCK. 953 * boundary bits like LOCK.
952 */ 954 */
953int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 955int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
954 int bits, int clear_bits, gfp_t mask) 956 int bits, int clear_bits,
957 struct extent_state **cached_state, gfp_t mask)
955{ 958{
956 struct extent_state *state; 959 struct extent_state *state;
957 struct extent_state *prealloc = NULL; 960 struct extent_state *prealloc = NULL;
@@ -968,6 +971,15 @@ again:
968 } 971 }
969 972
970 spin_lock(&tree->lock); 973 spin_lock(&tree->lock);
974 if (cached_state && *cached_state) {
975 state = *cached_state;
976 if (state->start <= start && state->end > start &&
977 state->tree) {
978 node = &state->rb_node;
979 goto hit_next;
980 }
981 }
982
971 /* 983 /*
972 * this search will find all the extents that end after 984 * this search will find all the extents that end after
973 * our range starts. 985 * our range starts.
@@ -998,6 +1010,7 @@ hit_next:
998 */ 1010 */
999 if (state->start == start && state->end <= end) { 1011 if (state->start == start && state->end <= end) {
1000 set_state_bits(tree, state, &bits); 1012 set_state_bits(tree, state, &bits);
1013 cache_state(state, cached_state);
1001 state = clear_state_bit(tree, state, &clear_bits, 0); 1014 state = clear_state_bit(tree, state, &clear_bits, 0);
1002 if (last_end == (u64)-1) 1015 if (last_end == (u64)-1)
1003 goto out; 1016 goto out;
@@ -1038,6 +1051,7 @@ hit_next:
1038 goto out; 1051 goto out;
1039 if (state->end <= end) { 1052 if (state->end <= end) {
1040 set_state_bits(tree, state, &bits); 1053 set_state_bits(tree, state, &bits);
1054 cache_state(state, cached_state);
1041 state = clear_state_bit(tree, state, &clear_bits, 0); 1055 state = clear_state_bit(tree, state, &clear_bits, 0);
1042 if (last_end == (u64)-1) 1056 if (last_end == (u64)-1)
1043 goto out; 1057 goto out;
@@ -1076,6 +1090,7 @@ hit_next:
1076 &bits); 1090 &bits);
1077 if (err) 1091 if (err)
1078 extent_io_tree_panic(tree, err); 1092 extent_io_tree_panic(tree, err);
1093 cache_state(prealloc, cached_state);
1079 prealloc = NULL; 1094 prealloc = NULL;
1080 start = this_end + 1; 1095 start = this_end + 1;
1081 goto search_again; 1096 goto search_again;
@@ -1098,6 +1113,7 @@ hit_next:
1098 extent_io_tree_panic(tree, err); 1113 extent_io_tree_panic(tree, err);
1099 1114
1100 set_state_bits(tree, prealloc, &bits); 1115 set_state_bits(tree, prealloc, &bits);
1116 cache_state(prealloc, cached_state);
1101 clear_state_bit(tree, prealloc, &clear_bits, 0); 1117 clear_state_bit(tree, prealloc, &clear_bits, 0);
1102 prealloc = NULL; 1118 prealloc = NULL;
1103 goto out; 1119 goto out;
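
convert_extent_bit() grows the same cached-state shortcut the set/clear paths already have: a caller converting many adjacent ranges hands back the state it got last time and skips the rb-tree search. Caller sketch; the bit names are purely illustrative:

	struct extent_state *cached = NULL;
	int err;

	err = convert_extent_bit(tree, start, end, EXTENT_NEED_WAIT,
				 EXTENT_DIRTY, &cached, GFP_NOFS);
	/* ... further converts over nearby ranges reuse "cached" ... */
	free_extent_state(cached);	/* drop the cache's reference */
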
@@ -1150,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1150 NULL, cached_state, mask); 1166 NULL, cached_state, mask);
1151} 1167}
1152 1168
1169int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1170 struct extent_state **cached_state, gfp_t mask)
1171{
1172 return set_extent_bit(tree, start, end,
1173 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1174 NULL, cached_state, mask);
1175}
1176
1153int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1177int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1154 gfp_t mask) 1178 gfp_t mask)
1155{ 1179{
@@ -1294,18 +1318,42 @@ out:
1294 * If nothing was found, 1 is returned. If found something, return 0. 1318 * If nothing was found, 1 is returned. If found something, return 0.
1295 */ 1319 */
1296int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1320int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1297 u64 *start_ret, u64 *end_ret, int bits) 1321 u64 *start_ret, u64 *end_ret, int bits,
1322 struct extent_state **cached_state)
1298{ 1323{
1299 struct extent_state *state; 1324 struct extent_state *state;
1325 struct rb_node *n;
1300 int ret = 1; 1326 int ret = 1;
1301 1327
1302 spin_lock(&tree->lock); 1328 spin_lock(&tree->lock);
1329 if (cached_state && *cached_state) {
1330 state = *cached_state;
1331 if (state->end == start - 1 && state->tree) {
1332 n = rb_next(&state->rb_node);
1333 while (n) {
1334 state = rb_entry(n, struct extent_state,
1335 rb_node);
1336 if (state->state & bits)
1337 goto got_it;
1338 n = rb_next(n);
1339 }
1340 free_extent_state(*cached_state);
1341 *cached_state = NULL;
1342 goto out;
1343 }
1344 free_extent_state(*cached_state);
1345 *cached_state = NULL;
1346 }
1347
1303 state = find_first_extent_bit_state(tree, start, bits); 1348 state = find_first_extent_bit_state(tree, start, bits);
1349got_it:
1304 if (state) { 1350 if (state) {
1351 cache_state(state, cached_state);
1305 *start_ret = state->start; 1352 *start_ret = state->start;
1306 *end_ret = state->end; 1353 *end_ret = state->end;
1307 ret = 0; 1354 ret = 0;
1308 } 1355 }
1356out:
1309 spin_unlock(&tree->lock); 1357 spin_unlock(&tree->lock);
1310 return ret; 1358 return ret;
1311} 1359}
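
find_first_extent_bit() likewise takes an optional cached state, so a forward scan can resume from rb_next() of the last hit instead of searching from the tree root each time; callers that don't care, like the one in btrfs_finish_extent_commit() above, just pass NULL. Scan-loop sketch:

	struct extent_state *cached = NULL;
	u64 start = 0, end;

	while (!find_first_extent_bit(tree, start, &start, &end,
				      EXTENT_DIRTY, &cached)) {
		/* process [start, end] */
		start = end + 1;  /* lines up with the state->end == start - 1
				   * fast-path test above */
	}
	free_extent_state(cached);
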
@@ -2068,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2068 } 2116 }
2069 read_unlock(&em_tree->lock); 2117 read_unlock(&em_tree->lock);
2070 2118
2071 if (!em || IS_ERR(em)) { 2119 if (!em) {
2072 kfree(failrec); 2120 kfree(failrec);
2073 return -EIO; 2121 return -EIO;
2074 } 2122 }
@@ -2304,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2304 struct extent_state *cached = NULL; 2352 struct extent_state *cached = NULL;
2305 struct extent_state *state; 2353 struct extent_state *state;
2306 2354
2307 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " 2355 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2308 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, 2356 "mirror=%ld\n", (u64)bio->bi_sector, err,
2309 (long int)bio->bi_bdev); 2357 (long int)bio->bi_bdev);
2310 tree = &BTRFS_I(page->mapping->host)->io_tree; 2358 tree = &BTRFS_I(page->mapping->host)->io_tree;
2311 2359
@@ -2709,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2709 end_bio_extent_readpage, mirror_num, 2757 end_bio_extent_readpage, mirror_num,
2710 *bio_flags, 2758 *bio_flags,
2711 this_bio_flag); 2759 this_bio_flag);
2712 BUG_ON(ret == -ENOMEM); 2760 if (!ret) {
2713 nr++; 2761 nr++;
2714 *bio_flags = this_bio_flag; 2762 *bio_flags = this_bio_flag;
2763 }
2715 } 2764 }
2716 if (ret) 2765 if (ret) {
2717 SetPageError(page); 2766 SetPageError(page);
2767 unlock_extent(tree, cur, cur + iosize - 1);
2768 }
2718 cur = cur + iosize; 2769 cur = cur + iosize;
2719 pg_offset += iosize; 2770 pg_offset += iosize;
2720 } 2771 }
@@ -3161,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
3161 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3212 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3162 u64 offset = eb->start; 3213 u64 offset = eb->start;
3163 unsigned long i, num_pages; 3214 unsigned long i, num_pages;
3215 unsigned long bio_flags = 0;
3164 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3216 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3165 int ret = 0; 3217 int ret = 0;
3166 3218
3167 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3219 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3168 num_pages = num_extent_pages(eb->start, eb->len); 3220 num_pages = num_extent_pages(eb->start, eb->len);
3169 atomic_set(&eb->io_pages, num_pages); 3221 atomic_set(&eb->io_pages, num_pages);
3222 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3223 bio_flags = EXTENT_BIO_TREE_LOG;
3224
3170 for (i = 0; i < num_pages; i++) { 3225 for (i = 0; i < num_pages; i++) {
3171 struct page *p = extent_buffer_page(eb, i); 3226 struct page *p = extent_buffer_page(eb, i);
3172 3227
@@ -3175,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb,
3175 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3230 ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3176 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3231 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3177 -1, end_bio_extent_buffer_writepage, 3232 -1, end_bio_extent_buffer_writepage,
3178 0, 0, 0); 3233 0, epd->bio_flags, bio_flags);
3234 epd->bio_flags = bio_flags;
3179 if (ret) { 3235 if (ret) {
3180 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3236 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3181 SetPageError(p); 3237 SetPageError(p);
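
The bio_flags plumbing added here tags writes of tree-log buffers with EXTENT_BIO_TREE_LOG and remembers the tag in the extent_page_data, so the final flush (flush_epd_write_bio() below) resubmits with the same flag; since submit_extent_page() starts a new bio when the flags differ, log-tree pages are presumably kept from sharing a bio with ordinary metadata writeback.
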
@@ -3210,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3210 .tree = tree, 3266 .tree = tree,
3211 .extent_locked = 0, 3267 .extent_locked = 0,
3212 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3268 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3269 .bio_flags = 0,
3213 }; 3270 };
3214 int ret = 0; 3271 int ret = 0;
3215 int done = 0; 3272 int done = 0;
@@ -3254,19 +3311,34 @@ retry:
3254 break; 3311 break;
3255 } 3312 }
3256 3313
3314 spin_lock(&mapping->private_lock);
3315 if (!PagePrivate(page)) {
3316 spin_unlock(&mapping->private_lock);
3317 continue;
3318 }
3319
3257 eb = (struct extent_buffer *)page->private; 3320 eb = (struct extent_buffer *)page->private;
3321
3322 /*
3323 * Shouldn't happen and normally this would be a BUG_ON
3324 * but no sense in crashing the user's box for something
3325 * we can survive anyway.
3326 */
3258 if (!eb) { 3327 if (!eb) {
3328 spin_unlock(&mapping->private_lock);
3259 WARN_ON(1); 3329 WARN_ON(1);
3260 continue; 3330 continue;
3261 } 3331 }
3262 3332
3263 if (eb == prev_eb) 3333 if (eb == prev_eb) {
3334 spin_unlock(&mapping->private_lock);
3264 continue; 3335 continue;
3336 }
3265 3337
3266 if (!atomic_inc_not_zero(&eb->refs)) { 3338 ret = atomic_inc_not_zero(&eb->refs);
3267 WARN_ON(1); 3339 spin_unlock(&mapping->private_lock);
3340 if (!ret)
3268 continue; 3341 continue;
3269 }
3270 3342
3271 prev_eb = eb; 3343 prev_eb = eb;
3272 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3344 ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
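
Consolidated, the fix reads: page->private can be torn down concurrently, so it is only dereferenced under mapping->private_lock, and the extent buffer is pinned with a refcount before the lock is dropped (the NULL and prev_eb checks are elided here):

	spin_lock(&mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&mapping->private_lock);
		continue;
	}
	eb = (struct extent_buffer *)page->private;
	ret = atomic_inc_not_zero(&eb->refs);
	spin_unlock(&mapping->private_lock);
	if (!ret)
		continue;	/* eb was already on its way out */
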
@@ -3457,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
3457 if (epd->sync_io) 3529 if (epd->sync_io)
3458 rw = WRITE_SYNC; 3530 rw = WRITE_SYNC;
3459 3531
3460 ret = submit_one_bio(rw, epd->bio, 0, 0); 3532 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3461 BUG_ON(ret < 0); /* -ENOMEM */ 3533 BUG_ON(ret < 0); /* -ENOMEM */
3462 epd->bio = NULL; 3534 epd->bio = NULL;
3463 } 3535 }
@@ -3480,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3480 .get_extent = get_extent, 3552 .get_extent = get_extent,
3481 .extent_locked = 0, 3553 .extent_locked = 0,
3482 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3554 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3555 .bio_flags = 0,
3483 }; 3556 };
3484 3557
3485 ret = __extent_writepage(page, wbc, &epd); 3558 ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3504 .get_extent = get_extent, 3577 .get_extent = get_extent,
3505 .extent_locked = 1, 3578 .extent_locked = 1,
3506 .sync_io = mode == WB_SYNC_ALL, 3579 .sync_io = mode == WB_SYNC_ALL,
3580 .bio_flags = 0,
3507 }; 3581 };
3508 struct writeback_control wbc_writepages = { 3582 struct writeback_control wbc_writepages = {
3509 .sync_mode = mode, 3583 .sync_mode = mode,
@@ -3543,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
3543 .get_extent = get_extent, 3617 .get_extent = get_extent,
3544 .extent_locked = 0, 3618 .extent_locked = 0,
3545 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3619 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3620 .bio_flags = 0,
3546 }; 3621 };
3547 3622
3548 ret = extent_write_cache_pages(tree, mapping, wbc, 3623 ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3995,6 @@ out:
3920 return ret; 3995 return ret;
3921} 3996}
3922 3997
3923inline struct page *extent_buffer_page(struct extent_buffer *eb,
3924 unsigned long i)
3925{
3926 return eb->pages[i];
3927}
3928
3929inline unsigned long num_extent_pages(u64 start, u64 len)
3930{
3931 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3932 (start >> PAGE_CACHE_SHIFT);
3933}
3934
3935static void __free_extent_buffer(struct extent_buffer *eb) 3998static void __free_extent_buffer(struct extent_buffer *eb)
3936{ 3999{
3937#if LEAK_DEBUG 4000#if LEAK_DEBUG
@@ -4047,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4047 4110
4048 return eb; 4111 return eb;
4049err: 4112err:
4050 for (i--; i > 0; i--) 4113 for (i--; i >= 0; i--)
4051 __free_page(eb->pages[i]); 4114 __free_page(eb->pages[i]);
4052 __free_extent_buffer(eb); 4115 __free_extent_buffer(eb);
4053 return NULL; 4116 return NULL;
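
The unwind fix above is the classic cleanup off-by-one: when allocating page i fails, indices 0 through i - 1 hold live pages, so after the initial i-- the loop must run down to and including index 0. The old "i > 0" bound always leaked eb->pages[0].
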
@@ -4192,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4192 4255
4193 for (i = 0; i < num_pages; i++, index++) { 4256 for (i = 0; i < num_pages; i++, index++) {
4194 p = find_or_create_page(mapping, index, GFP_NOFS); 4257 p = find_or_create_page(mapping, index, GFP_NOFS);
4195 if (!p) { 4258 if (!p)
4196 WARN_ON(1);
4197 goto free_eb; 4259 goto free_eb;
4198 }
4199 4260
4200 spin_lock(&mapping->private_lock); 4261 spin_lock(&mapping->private_lock);
4201 if (PagePrivate(p)) { 4262 if (PagePrivate(p)) {
@@ -4338,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4338 4399
4339 /* Should be safe to release our pages at this point */ 4400 /* Should be safe to release our pages at this point */
4340 btrfs_release_extent_buffer_page(eb, 0); 4401 btrfs_release_extent_buffer_page(eb, 0);
4341
4342 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4402 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4343 return 1; 4403 return 1;
4344 } 4404 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15d..711d12b80028 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
27 * type for this bio 27 * type for this bio
28 */ 28 */
29#define EXTENT_BIO_COMPRESSED 1 29#define EXTENT_BIO_COMPRESSED 1
30#define EXTENT_BIO_TREE_LOG 2
30#define EXTENT_BIO_FLAG_SHIFT 16 31#define EXTENT_BIO_FLAG_SHIFT 16
31 32
32/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
232int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 233int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
233 gfp_t mask); 234 gfp_t mask);
234int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 235int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
235 int bits, int clear_bits, gfp_t mask); 236 int bits, int clear_bits,
237 struct extent_state **cached_state, gfp_t mask);
236int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 238int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 239 struct extent_state **cached_state, gfp_t mask);
240int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
241 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 242int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, int bits); 243 u64 *start_ret, u64 *end_ret, int bits,
244 struct extent_state **cached_state);
240struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 245struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
241 u64 start, int bits); 246 u64 start, int bits);
242int extent_invalidatepage(struct extent_io_tree *tree, 247int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
277int read_extent_buffer_pages(struct extent_io_tree *tree, 282int read_extent_buffer_pages(struct extent_io_tree *tree,
278 struct extent_buffer *eb, u64 start, int wait, 283 struct extent_buffer *eb, u64 start, int wait,
279 get_extent_t *get_extent, int mirror_num); 284 get_extent_t *get_extent, int mirror_num);
280unsigned long num_extent_pages(u64 start, u64 len); 285
281struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); 286static inline unsigned long num_extent_pages(u64 start, u64 len)
287{
288 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
289 (start >> PAGE_CACHE_SHIFT);
290}
291
292static inline struct page *extent_buffer_page(struct extent_buffer *eb,
293 unsigned long i)
294{
295 return eb->pages[i];
296}
282 297
283static inline void extent_buffer_get(struct extent_buffer *eb) 298static inline void extent_buffer_get(struct extent_buffer *eb)
284{ 299{
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..b8cbc8d5c7f7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
11 11
12int __init extent_map_init(void) 12int __init extent_map_init(void)
13{ 13{
14 extent_map_cache = kmem_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("btrfs_extent_map",
15 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
17 if (!extent_map_cache) 17 if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
35void extent_map_tree_init(struct extent_map_tree *tree) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock); 39 rwlock_init(&tree->lock);
39} 40}
40 41
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
54 em->in_tree = 0; 55 em->in_tree = 0;
55 em->flags = 0; 56 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 57 em->compress_type = BTRFS_COMPRESS_NONE;
58 em->generation = 0;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
60 INIT_LIST_HEAD(&em->list);
58 return em; 61 return em;
59} 62}
60 63
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
72 WARN_ON(atomic_read(&em->refs) == 0); 75 WARN_ON(atomic_read(&em->refs) == 0);
73 if (atomic_dec_and_test(&em->refs)) { 76 if (atomic_dec_and_test(&em->refs)) {
74 WARN_ON(em->in_tree); 77 WARN_ON(em->in_tree);
78 WARN_ON(!list_empty(&em->list));
75 kmem_cache_free(extent_map_cache, em); 79 kmem_cache_free(extent_map_cache, em);
76 } 80 }
77} 81}
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 em->block_len += merge->block_len; 202 em->block_len += merge->block_len;
199 em->block_start = merge->block_start; 203 em->block_start = merge->block_start;
200 merge->in_tree = 0; 204 merge->in_tree = 0;
205 if (merge->generation > em->generation) {
206 em->mod_start = em->start;
207 em->mod_len = em->len;
208 em->generation = merge->generation;
209 list_move(&em->list, &tree->modified_extents);
210 }
211
212 list_del_init(&merge->list);
201 rb_erase(&merge->rb_node, &tree->map); 213 rb_erase(&merge->rb_node, &tree->map);
202 free_extent_map(merge); 214 free_extent_map(merge);
203 } 215 }
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
211 em->block_len += merge->len; 223 em->block_len += merge->len;
212 rb_erase(&merge->rb_node, &tree->map); 224 rb_erase(&merge->rb_node, &tree->map);
213 merge->in_tree = 0; 225 merge->in_tree = 0;
226 if (merge->generation > em->generation) {
227 em->mod_len = em->len;
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list);
214 free_extent_map(merge); 232 free_extent_map(merge);
215 } 233 }
216} 234}
217 235
218int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) 236/**
237 * unpin_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file
240 * @len: length of the extent
241 * @gen: generation that this extent has been modified in
242 * Note: the extent's EXTENT_FLAG_PREALLOC bit, if set, is cleared here as well
243 *
244 * Called after an extent has been properly written to disk. Sets the
245 * generation to the generation that actually added the file item to the
246 * inode, so fsync() knows it must sync this extent.
247 */
248int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
249 u64 gen)
219{ 250{
220 int ret = 0; 251 int ret = 0;
221 struct extent_map *em; 252 struct extent_map *em;
253 bool prealloc = false;
222 254
223 write_lock(&tree->lock); 255 write_lock(&tree->lock);
224 em = lookup_extent_mapping(tree, start, len); 256 em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
228 if (!em) 260 if (!em)
229 goto out; 261 goto out;
230 262
263 list_move(&em->list, &tree->modified_extents);
264 em->generation = gen;
231 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 265 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
266 em->mod_start = em->start;
267 em->mod_len = em->len;
268
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
270 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
272 }
232 273
233 try_merge_map(tree, em); 274 try_merge_map(tree, em);
234 275
276 if (prealloc) {
277 em->mod_start = em->start;
278 em->mod_len = em->len;
279 }
280
235 free_extent_map(em); 281 free_extent_map(em);
236out: 282out:
237 write_unlock(&tree->lock); 283 write_unlock(&tree->lock);
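
Every modified extent_map now records the transaction generation that produced it and rides tree->modified_extents until fsync consumes it; unpin_extent_cache() is where an ordered extent's final generation gets stamped once the data is on disk. An illustrative consumer (the real walker lives in the fsync/tree-log code outside this diff; last_logged_gen is an assumed cutoff):

	struct extent_map *em, *n;

	write_lock(&tree->lock);
	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
		if (em->generation <= last_logged_gen)
			continue;
		list_del_init(&em->list);
		/* log em->mod_start / em->mod_len / em->generation */
	}
	write_unlock(&tree->lock);
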
@@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
269 } 315 }
270 atomic_inc(&em->refs); 316 atomic_inc(&em->refs);
271 317
318 em->mod_start = em->start;
319 em->mod_len = em->len;
320
272 try_merge_map(tree, em); 321 try_merge_map(tree, em);
273out: 322out:
274 return ret; 323 return ret;
@@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
358 407
359 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 408 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
360 rb_erase(&em->rb_node, &tree->map); 409 rb_erase(&em->rb_node, &tree->map);
410 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
411 list_del_init(&em->list);
361 em->in_tree = 0; 412 em->in_tree = 0;
362 return ret; 413 return ret;
363} 414}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761fe..679225555f7b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
13#define EXTENT_FLAG_COMPRESSED 1 13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
16 17
17struct extent_map { 18struct extent_map {
18 struct rb_node rb_node; 19 struct rb_node rb_node;
@@ -20,18 +21,23 @@ struct extent_map {
20 /* all of these are in bytes */ 21 /* all of these are in bytes */
21 u64 start; 22 u64 start;
22 u64 len; 23 u64 len;
24 u64 mod_start;
25 u64 mod_len;
23 u64 orig_start; 26 u64 orig_start;
24 u64 block_start; 27 u64 block_start;
25 u64 block_len; 28 u64 block_len;
29 u64 generation;
26 unsigned long flags; 30 unsigned long flags;
27 struct block_device *bdev; 31 struct block_device *bdev;
28 atomic_t refs; 32 atomic_t refs;
29 unsigned int in_tree; 33 unsigned int in_tree;
30 unsigned int compress_type; 34 unsigned int compress_type;
35 struct list_head list;
31}; 36};
32 37
33struct extent_map_tree { 38struct extent_map_tree {
34 struct rb_root map; 39 struct rb_root map;
40 struct list_head modified_extents;
35 rwlock_t lock; 41 rwlock_t lock;
36}; 42};
37 43
@@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);
60void free_extent_map(struct extent_map *em); 66void free_extent_map(struct extent_map *em);
61int __init extent_map_init(void); 67int __init extent_map_init(void);
62void extent_map_exit(void); 68void extent_map_exit(void);
63int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); 69int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
64struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 70struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
65 u64 start, u64 len); 71 u64 start, u64 len);
66#endif 72#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01dc..1ad08e4e4a15 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "print-tree.h" 26#include "print-tree.h"
27 27
28#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ 28#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
29 sizeof(struct btrfs_item) * 2) / \ 29 sizeof(struct btrfs_item) * 2) / \
30 size) - 1)) 30 size) - 1))
31 31
32#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) 32#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
33 PAGE_CACHE_SIZE))
33 34
34#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
35 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
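
The kernel's min() type-checks its arguments and warns on a mismatch, and __MAX_CSUM_ITEMS() and PAGE_CACHE_SIZE do not agree on every config; the (unsigned long) cast plus min_t() pin the comparison to one named type. Illustrative use (csum_size assumed in scope):

	/* plain min() here would warn: comparison of distinct types */
	u32 nr = min_t(u32, __MAX_CSUM_ITEMS(root, csum_size),
		       PAGE_CACHE_SIZE);
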
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6b40e86121b..9ab1bed88116 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h"
42 43
43/* 44/*
44 * when auto defrag is enabled we 45 * when auto defrag is enabled we
@@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
458 * this drops all the extents in the cache that intersect the range 459 * this drops all the extents in the cache that intersect the range
459 * [start, end]. Existing extents are split as required. 460 * [start, end]. Existing extents are split as required.
460 */ 461 */
461int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 462void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
462 int skip_pinned) 463 int skip_pinned)
463{ 464{
464 struct extent_map *em; 465 struct extent_map *em;
465 struct extent_map *split = NULL; 466 struct extent_map *split = NULL;
466 struct extent_map *split2 = NULL; 467 struct extent_map *split2 = NULL;
467 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
468 u64 len = end - start + 1; 469 u64 len = end - start + 1;
470 u64 gen;
469 int ret; 471 int ret;
470 int testend = 1; 472 int testend = 1;
471 unsigned long flags; 473 unsigned long flags;
@@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
477 testend = 0; 479 testend = 0;
478 } 480 }
479 while (1) { 481 while (1) {
482 int no_splits = 0;
483
480 if (!split) 484 if (!split)
481 split = alloc_extent_map(); 485 split = alloc_extent_map();
482 if (!split2) 486 if (!split2)
483 split2 = alloc_extent_map(); 487 split2 = alloc_extent_map();
484 BUG_ON(!split || !split2); /* -ENOMEM */ 488 if (!split || !split2)
489 no_splits = 1;
485 490
486 write_lock(&em_tree->lock); 491 write_lock(&em_tree->lock);
487 em = lookup_extent_mapping(em_tree, start, len); 492 em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
490 break; 495 break;
491 } 496 }
492 flags = em->flags; 497 flags = em->flags;
498 gen = em->generation;
493 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
494 if (testend && em->start + em->len >= start + len) { 500 if (testend && em->start + em->len >= start + len) {
495 free_extent_map(em); 501 free_extent_map(em);
@@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
506 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
507 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 513 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
508 remove_extent_mapping(em_tree, em); 514 remove_extent_mapping(em_tree, em);
515 if (no_splits)
516 goto next;
509 517
510 if (em->block_start < EXTENT_MAP_LAST_BYTE && 518 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
511 em->start < start) { 519 em->start < start) {
@@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
518 split->block_len = em->block_len; 526 split->block_len = em->block_len;
519 else 527 else
520 split->block_len = split->len; 528 split->block_len = split->len;
521 529 split->generation = gen;
522 split->bdev = em->bdev; 530 split->bdev = em->bdev;
523 split->flags = flags; 531 split->flags = flags;
524 split->compress_type = em->compress_type; 532 split->compress_type = em->compress_type;
525 ret = add_extent_mapping(em_tree, split); 533 ret = add_extent_mapping(em_tree, split);
526 BUG_ON(ret); /* Logic error */ 534 BUG_ON(ret); /* Logic error */
535 list_move(&split->list, &em_tree->modified_extents);
527 free_extent_map(split); 536 free_extent_map(split);
528 split = split2; 537 split = split2;
529 split2 = NULL; 538 split2 = NULL;
@@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
537 split->bdev = em->bdev; 546 split->bdev = em->bdev;
538 split->flags = flags; 547 split->flags = flags;
539 split->compress_type = em->compress_type; 548 split->compress_type = em->compress_type;
549 split->generation = gen;
540 550
541 if (compressed) { 551 if (compressed) {
542 split->block_len = em->block_len; 552 split->block_len = em->block_len;
@@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
550 560
551 ret = add_extent_mapping(em_tree, split); 561 ret = add_extent_mapping(em_tree, split);
552 BUG_ON(ret); /* Logic error */ 562 BUG_ON(ret); /* Logic error */
563 list_move(&split->list, &em_tree->modified_extents);
553 free_extent_map(split); 564 free_extent_map(split);
554 split = NULL; 565 split = NULL;
555 } 566 }
567next:
556 write_unlock(&em_tree->lock); 568 write_unlock(&em_tree->lock);
557 569
558 /* once for us */ 570 /* once for us */
@@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
564 free_extent_map(split); 576 free_extent_map(split);
565 if (split2) 577 if (split2)
566 free_extent_map(split2); 578 free_extent_map(split2);
567 return 0;
568} 579}
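
btrfs_drop_extent_cache() always returned 0, so the int return was noise; it is now void, and failure to allocate the split maps degrades gracefully (the no_splits path drops the whole overlapping extents from the cache) instead of hitting a BUG_ON. Call sites shrink to a bare call:

	btrfs_drop_extent_cache(inode, start, end - 1, 0);
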
569 580
570/* 581/*
@@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
576 * it is either truncated or split. Anything entirely inside the range 587 * it is either truncated or split. Anything entirely inside the range
577 * is deleted from the tree. 588 * is deleted from the tree.
578 */ 589 */
579int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 590int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
580 u64 start, u64 end, u64 *hint_byte, int drop_cache) 591 struct btrfs_root *root, struct inode *inode,
592 struct btrfs_path *path, u64 start, u64 end,
593 u64 *drop_end, int drop_cache)
581{ 594{
582 struct btrfs_root *root = BTRFS_I(inode)->root;
583 struct extent_buffer *leaf; 595 struct extent_buffer *leaf;
584 struct btrfs_file_extent_item *fi; 596 struct btrfs_file_extent_item *fi;
585 struct btrfs_path *path;
586 struct btrfs_key key; 597 struct btrfs_key key;
587 struct btrfs_key new_key; 598 struct btrfs_key new_key;
588 u64 ino = btrfs_ino(inode); 599 u64 ino = btrfs_ino(inode);
@@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
597 int recow; 608 int recow;
598 int ret; 609 int ret;
599 int modify_tree = -1; 610 int modify_tree = -1;
611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
612 int found = 0;
600 613
601 if (drop_cache) 614 if (drop_cache)
602 btrfs_drop_extent_cache(inode, start, end - 1, 0); 615 btrfs_drop_extent_cache(inode, start, end - 1, 0);
603 616
604 path = btrfs_alloc_path();
605 if (!path)
606 return -ENOMEM;
607
608 if (start >= BTRFS_I(inode)->disk_i_size) 617 if (start >= BTRFS_I(inode)->disk_i_size)
609 modify_tree = 0; 618 modify_tree = 0;
610 619
@@ -666,6 +675,7 @@ next_slot:
666 goto next_slot; 675 goto next_slot;
667 } 676 }
668 677
678 found = 1;
669 search_start = max(key.offset, start); 679 search_start = max(key.offset, start);
670 if (recow || !modify_tree) { 680 if (recow || !modify_tree) {
671 modify_tree = -1; 681 modify_tree = -1;
@@ -707,14 +717,13 @@ next_slot:
707 extent_end - start); 717 extent_end - start);
708 btrfs_mark_buffer_dirty(leaf); 718 btrfs_mark_buffer_dirty(leaf);
709 719
710 if (disk_bytenr > 0) { 720 if (update_refs && disk_bytenr > 0) {
711 ret = btrfs_inc_extent_ref(trans, root, 721 ret = btrfs_inc_extent_ref(trans, root,
712 disk_bytenr, num_bytes, 0, 722 disk_bytenr, num_bytes, 0,
713 root->root_key.objectid, 723 root->root_key.objectid,
714 new_key.objectid, 724 new_key.objectid,
715 start - extent_offset, 0); 725 start - extent_offset, 0);
716 BUG_ON(ret); /* -ENOMEM */ 726 BUG_ON(ret); /* -ENOMEM */
717 *hint_byte = disk_bytenr;
718 } 727 }
719 key.offset = start; 728 key.offset = start;
720 } 729 }
@@ -734,10 +743,8 @@ next_slot:
734 btrfs_set_file_extent_num_bytes(leaf, fi, 743 btrfs_set_file_extent_num_bytes(leaf, fi,
735 extent_end - end); 744 extent_end - end);
736 btrfs_mark_buffer_dirty(leaf); 745 btrfs_mark_buffer_dirty(leaf);
737 if (disk_bytenr > 0) { 746 if (update_refs && disk_bytenr > 0)
738 inode_sub_bytes(inode, end - key.offset); 747 inode_sub_bytes(inode, end - key.offset);
739 *hint_byte = disk_bytenr;
740 }
741 break; 748 break;
742 } 749 }
743 750
@@ -753,10 +760,8 @@ next_slot:
753 btrfs_set_file_extent_num_bytes(leaf, fi, 760 btrfs_set_file_extent_num_bytes(leaf, fi,
754 start - key.offset); 761 start - key.offset);
755 btrfs_mark_buffer_dirty(leaf); 762 btrfs_mark_buffer_dirty(leaf);
756 if (disk_bytenr > 0) { 763 if (update_refs && disk_bytenr > 0)
757 inode_sub_bytes(inode, extent_end - start); 764 inode_sub_bytes(inode, extent_end - start);
758 *hint_byte = disk_bytenr;
759 }
760 if (end == extent_end) 765 if (end == extent_end)
761 break; 766 break;
762 767
@@ -777,12 +782,13 @@ next_slot:
777 del_nr++; 782 del_nr++;
778 } 783 }
779 784
780 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 785 if (update_refs &&
786 extent_type == BTRFS_FILE_EXTENT_INLINE) {
781 inode_sub_bytes(inode, 787 inode_sub_bytes(inode,
782 extent_end - key.offset); 788 extent_end - key.offset);
783 extent_end = ALIGN(extent_end, 789 extent_end = ALIGN(extent_end,
784 root->sectorsize); 790 root->sectorsize);
785 } else if (disk_bytenr > 0) { 791 } else if (update_refs && disk_bytenr > 0) {
786 ret = btrfs_free_extent(trans, root, 792 ret = btrfs_free_extent(trans, root,
787 disk_bytenr, num_bytes, 0, 793 disk_bytenr, num_bytes, 0,
788 root->root_key.objectid, 794 root->root_key.objectid,
@@ -791,7 +797,6 @@ next_slot:
791 BUG_ON(ret); /* -ENOMEM */ 797 BUG_ON(ret); /* -ENOMEM */
792 inode_sub_bytes(inode, 798 inode_sub_bytes(inode,
793 extent_end - key.offset); 799 extent_end - key.offset);
794 *hint_byte = disk_bytenr;
795 } 800 }
796 801
797 if (end == extent_end) 802 if (end == extent_end)
@@ -806,7 +811,7 @@ next_slot:
806 del_nr); 811 del_nr);
807 if (ret) { 812 if (ret) {
808 btrfs_abort_transaction(trans, root, ret); 813 btrfs_abort_transaction(trans, root, ret);
809 goto out; 814 break;
810 } 815 }
811 816
812 del_nr = 0; 817 del_nr = 0;
@@ -825,7 +830,24 @@ next_slot:
825 btrfs_abort_transaction(trans, root, ret); 830 btrfs_abort_transaction(trans, root, ret);
826 } 831 }
827 832
828out: 833 if (drop_end)
834 *drop_end = found ? min(end, extent_end) : end;
835 btrfs_release_path(path);
836 return ret;
837}
838
839int btrfs_drop_extents(struct btrfs_trans_handle *trans,
840 struct btrfs_root *root, struct inode *inode, u64 start,
841 u64 end, int drop_cache)
842{
843 struct btrfs_path *path;
844 int ret;
845
846 path = btrfs_alloc_path();
847 if (!path)
848 return -ENOMEM;
849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
850 drop_cache);
829 btrfs_free_path(path); 851 btrfs_free_path(path);
830 return ret; 852 return ret;
831} 853}
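
The hunk above splits btrfs_drop_extents() into a worker that takes a caller-supplied path plus an optional drop_end out-parameter, and a thin public wrapper that allocates the path itself. A minimal userspace sketch of that wrapper pattern (all names here are illustrative, not btrfs API):

#include <errno.h>
#include <stdlib.h>

struct scratch { int unused; };

/* worker: reuses the caller's scratch buffer, optionally reports progress */
static int drop_range_worker(struct scratch *s, long start, long end,
                             long *drop_end)
{
        (void)s; (void)start;
        if (drop_end)
                *drop_end = end;   /* pretend the whole range was processed */
        return 0;
}

/* public wrapper: allocates the scratch itself and passes NULL for drop_end */
int drop_range(long start, long end)
{
        struct scratch *s = malloc(sizeof(*s));
        int ret;

        if (!s)
                return -ENOMEM;
        ret = drop_range_worker(s, start, end, NULL);
        free(s);
        return ret;
}

Callers such as the hole-punch path below use the worker directly so they can resume from *drop_end after an -ENOSPC.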
@@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
892 int ret; 914 int ret;
893 u64 ino = btrfs_ino(inode); 915 u64 ino = btrfs_ino(inode);
894 916
895 btrfs_drop_extent_cache(inode, start, end - 1, 0);
896
897 path = btrfs_alloc_path(); 917 path = btrfs_alloc_path();
898 if (!path) 918 if (!path)
899 return -ENOMEM; 919 return -ENOMEM;
@@ -935,12 +955,16 @@ again:
935 btrfs_set_item_key_safe(trans, root, path, &new_key); 955 btrfs_set_item_key_safe(trans, root, path, &new_key);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 956 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 957 struct btrfs_file_extent_item);
958 btrfs_set_file_extent_generation(leaf, fi,
959 trans->transid);
938 btrfs_set_file_extent_num_bytes(leaf, fi, 960 btrfs_set_file_extent_num_bytes(leaf, fi,
939 extent_end - end); 961 extent_end - end);
940 btrfs_set_file_extent_offset(leaf, fi, 962 btrfs_set_file_extent_offset(leaf, fi,
941 end - orig_offset); 963 end - orig_offset);
942 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
943 struct btrfs_file_extent_item); 965 struct btrfs_file_extent_item);
966 btrfs_set_file_extent_generation(leaf, fi,
967 trans->transid);
944 btrfs_set_file_extent_num_bytes(leaf, fi, 968 btrfs_set_file_extent_num_bytes(leaf, fi,
945 end - other_start); 969 end - other_start);
946 btrfs_mark_buffer_dirty(leaf); 970 btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +982,16 @@ again:
958 struct btrfs_file_extent_item); 982 struct btrfs_file_extent_item);
959 btrfs_set_file_extent_num_bytes(leaf, fi, 983 btrfs_set_file_extent_num_bytes(leaf, fi,
960 start - key.offset); 984 start - key.offset);
985 btrfs_set_file_extent_generation(leaf, fi,
986 trans->transid);
961 path->slots[0]++; 987 path->slots[0]++;
962 new_key.offset = start; 988 new_key.offset = start;
963 btrfs_set_item_key_safe(trans, root, path, &new_key); 989 btrfs_set_item_key_safe(trans, root, path, &new_key);
964 990
965 fi = btrfs_item_ptr(leaf, path->slots[0], 991 fi = btrfs_item_ptr(leaf, path->slots[0],
966 struct btrfs_file_extent_item); 992 struct btrfs_file_extent_item);
993 btrfs_set_file_extent_generation(leaf, fi,
994 trans->transid);
967 btrfs_set_file_extent_num_bytes(leaf, fi, 995 btrfs_set_file_extent_num_bytes(leaf, fi,
968 other_end - start); 996 other_end - start);
969 btrfs_set_file_extent_offset(leaf, fi, 997 btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1019,14 @@ again:
991 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
992 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
993 struct btrfs_file_extent_item); 1021 struct btrfs_file_extent_item);
1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
994 btrfs_set_file_extent_num_bytes(leaf, fi, 1023 btrfs_set_file_extent_num_bytes(leaf, fi,
995 split - key.offset); 1024 split - key.offset);
996 1025
997 fi = btrfs_item_ptr(leaf, path->slots[0], 1026 fi = btrfs_item_ptr(leaf, path->slots[0],
998 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
999 1028
1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1000 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1001 btrfs_set_file_extent_num_bytes(leaf, fi, 1031 btrfs_set_file_extent_num_bytes(leaf, fi,
1002 extent_end - split); 1032 extent_end - split);
@@ -1056,12 +1086,14 @@ again:
1056 struct btrfs_file_extent_item); 1086 struct btrfs_file_extent_item);
1057 btrfs_set_file_extent_type(leaf, fi, 1087 btrfs_set_file_extent_type(leaf, fi,
1058 BTRFS_FILE_EXTENT_REG); 1088 BTRFS_FILE_EXTENT_REG);
1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1059 btrfs_mark_buffer_dirty(leaf); 1090 btrfs_mark_buffer_dirty(leaf);
1060 } else { 1091 } else {
1061 fi = btrfs_item_ptr(leaf, del_slot - 1, 1092 fi = btrfs_item_ptr(leaf, del_slot - 1,
1062 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
1063 btrfs_set_file_extent_type(leaf, fi, 1094 btrfs_set_file_extent_type(leaf, fi,
1064 BTRFS_FILE_EXTENT_REG); 1095 BTRFS_FILE_EXTENT_REG);
1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1065 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1066 extent_end - key.offset); 1098 extent_end - key.offset);
1067 btrfs_mark_buffer_dirty(leaf); 1099 btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1205,8 @@ again:
1173 1205
1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1175 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1176 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1177 GFP_NOFS); 1209 0, 0, &cached_state, GFP_NOFS);
1178 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1179 start_pos, last_pos - 1, &cached_state, 1211 start_pos, last_pos - 1, &cached_state,
1180 GFP_NOFS); 1212 GFP_NOFS);
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1514 1546
1515 trace_btrfs_sync_file(file, datasync); 1547 trace_btrfs_sync_file(file, datasync);
1516 1548
1549 /*
1550 * We write the dirty pages in the range and wait until they complete
1551 * outside of the ->i_mutex, so that multiple tasks can flush dirty
1552 * pages concurrently and improve performance.
1553 */
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1555 if (ret)
1556 return ret;
1557
1517 mutex_lock(&inode->i_mutex); 1558 mutex_lock(&inode->i_mutex);
1518 1559
1519 /* 1560 /*
1520 * we wait first, since the writeback may change the inode, also wait 1561 * We flush the dirty pages again to avoid some dirty pages in the
1521 * ordered range does a filemape_write_and_wait_range which is why we 1562 * range being left.
1522 * don't do it above like other file systems.
1523 */ 1563 */
1524 root->log_batch++; 1564 atomic_inc(&root->log_batch);
1525 btrfs_wait_ordered_range(inode, start, end); 1565 btrfs_wait_ordered_range(inode, start, end);
1526 root->log_batch++; 1566 atomic_inc(&root->log_batch);
1527 1567
1528 /* 1568 /*
1529 * check the transaction that last modified this inode 1569 * check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1544 BTRFS_I(inode)->last_trans <= 1584 BTRFS_I(inode)->last_trans <=
1545 root->fs_info->last_trans_committed) { 1585 root->fs_info->last_trans_committed) {
1546 BTRFS_I(inode)->last_trans = 0; 1586 BTRFS_I(inode)->last_trans = 0;
1587
1588 /*
1589 * We've had everything committed since the last time we were
1590 * modified so clear this flag in case it was set for whatever
1591 * reason, it's no longer relevant.
1592 */
1593 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1594 &BTRFS_I(inode)->runtime_flags);
1547 mutex_unlock(&inode->i_mutex); 1595 mutex_unlock(&inode->i_mutex);
1548 goto out; 1596 goto out;
1549 } 1597 }
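
The fsync rework above starts writeback before taking ->i_mutex so that multiple callers can flush in parallel, then re-waits under the lock for anything dirtied in between. A rough userspace model of that ordering, with stand-in helpers:

#include <pthread.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

/* stand-ins for filemap_write_and_wait_range()/btrfs_wait_ordered_range() */
static int write_and_wait(long start, long end) { (void)start; (void)end; return 0; }
static void wait_ordered(long start, long end) { (void)start; (void)end; }

int sync_file(long start, long end)
{
        int ret = write_and_wait(start, end);   /* unlocked: flushes can overlap */

        if (ret)
                return ret;
        pthread_mutex_lock(&i_mutex);
        wait_ordered(start, end);               /* catch pages dirtied meanwhile */
        /* ...decide whether to log or commit, still under the lock... */
        pthread_mutex_unlock(&i_mutex);
        return 0;
}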
@@ -1615,6 +1663,324 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1615 return 0; 1663 return 0;
1616} 1664}
1617 1665
1666static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1667 int slot, u64 start, u64 end)
1668{
1669 struct btrfs_file_extent_item *fi;
1670 struct btrfs_key key;
1671
1672 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1673 return 0;
1674
1675 btrfs_item_key_to_cpu(leaf, &key, slot);
1676 if (key.objectid != btrfs_ino(inode) ||
1677 key.type != BTRFS_EXTENT_DATA_KEY)
1678 return 0;
1679
1680 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1681
1682 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1683 return 0;
1684
1685 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1686 return 0;
1687
1688 if (key.offset == end)
1689 return 1;
1690 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1691 return 1;
1692 return 0;
1693}
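
hole_mergeable() only accepts a neighbouring item that is a regular, unallocated (disk_bytenr == 0) extent touching the new hole at exactly one edge. The adjacency test reduces to the check sketched below (plain C, illustrative types):

#include <assert.h>

struct ext { unsigned long offset, num_bytes; };

static int touches(const struct ext *e, unsigned long start, unsigned long end)
{
        return e->offset == end ||                 /* neighbour starts at hole end */
               e->offset + e->num_bytes == start;  /* neighbour ends at hole start */
}

int main(void)
{
        struct ext left = { 0, 4096 }, right = { 8192, 4096 };

        assert(touches(&left, 4096, 8192));   /* mergeable on the left    */
        assert(touches(&right, 4096, 8192));  /* mergeable on the right   */
        assert(!touches(&right, 0, 4096));    /* gap in between: no merge */
        return 0;
}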
1694
1695static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1696 struct btrfs_path *path, u64 offset, u64 end)
1697{
1698 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct extent_buffer *leaf;
1700 struct btrfs_file_extent_item *fi;
1701 struct extent_map *hole_em;
1702 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1703 struct btrfs_key key;
1704 int ret;
1705
1706 key.objectid = btrfs_ino(inode);
1707 key.type = BTRFS_EXTENT_DATA_KEY;
1708 key.offset = offset;
1709
1710
1711 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1712 if (ret < 0)
1713 return ret;
1714 BUG_ON(!ret);
1715
1716 leaf = path->nodes[0];
1717 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1718 u64 num_bytes;
1719
1720 path->slots[0]--;
1721 fi = btrfs_item_ptr(leaf, path->slots[0],
1722 struct btrfs_file_extent_item);
1723 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1724 end - offset;
1725 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1726 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1727 btrfs_set_file_extent_offset(leaf, fi, 0);
1728 btrfs_mark_buffer_dirty(leaf);
1729 goto out;
1730 }
1731
1732 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1733 u64 num_bytes;
1734
1735 path->slots[0]++;
1736 key.offset = offset;
1737 btrfs_set_item_key_safe(trans, root, path, &key);
1738 fi = btrfs_item_ptr(leaf, path->slots[0],
1739 struct btrfs_file_extent_item);
1740 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1741 offset;
1742 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1743 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1744 btrfs_set_file_extent_offset(leaf, fi, 0);
1745 btrfs_mark_buffer_dirty(leaf);
1746 goto out;
1747 }
1748 btrfs_release_path(path);
1749
1750 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1751 0, 0, end - offset, 0, end - offset,
1752 0, 0, 0);
1753 if (ret)
1754 return ret;
1755
1756out:
1757 btrfs_release_path(path);
1758
1759 hole_em = alloc_extent_map();
1760 if (!hole_em) {
1761 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1762 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1763 &BTRFS_I(inode)->runtime_flags);
1764 } else {
1765 hole_em->start = offset;
1766 hole_em->len = end - offset;
1767 hole_em->orig_start = offset;
1768
1769 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid;
1774
1775 do {
1776 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1777 write_lock(&em_tree->lock);
1778 ret = add_extent_mapping(em_tree, hole_em);
1779 if (!ret)
1780 list_move(&hole_em->list,
1781 &em_tree->modified_extents);
1782 write_unlock(&em_tree->lock);
1783 } while (ret == -EEXIST);
1784 free_extent_map(hole_em);
1785 if (ret)
1786 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1787 &BTRFS_I(inode)->runtime_flags);
1788 }
1789
1790 return 0;
1791}
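
fill_holes() installs its replacement mapping with a drop-and-retry loop: clear any cached overlap, attempt the insert, and repeat while a racing overlap reappears as -EEXIST (create_pinned_em() later in this diff uses the same shape). A userspace model with assumed helper names:

#include <errno.h>

/* stand-ins for btrfs_drop_extent_cache()/add_extent_mapping() */
static void drop_cached_range(long start, long end) { (void)start; (void)end; }
static int try_insert_mapping(void *em) { (void)em; return 0; }

int install_mapping(void *em, long start, long end)
{
        int ret;

        do {
                drop_cached_range(start, end);  /* clear whatever overlaps  */
                ret = try_insert_mapping(em);   /* racers may re-cache...   */
        } while (ret == -EEXIST);               /* ...so drop and try again */
        return ret;
}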
1792
1793static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1794{
1795 struct btrfs_root *root = BTRFS_I(inode)->root;
1796 struct extent_state *cached_state = NULL;
1797 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1801 u64 lockstart = (offset + mask) & ~mask;
1802 u64 lockend = ((offset + len) & ~mask) - 1;
1803 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0;
1808 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT);
1811
1812 btrfs_wait_ordered_range(inode, offset, len);
1813
1814 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) {
1816 mutex_unlock(&inode->i_mutex);
1817 return 0;
1818 }
1819
1820 /*
1821 * Only do this if we are in the same page and we aren't doing the
1822 * entire page.
1823 */
1824 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex);
1827 return ret;
1828 }
1829
1830 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0);
1832 if (ret) {
1833 mutex_unlock(&inode->i_mutex);
1834 return ret;
1835 }
1836
1837 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1839 if (ret) {
1840 mutex_unlock(&inode->i_mutex);
1841 return ret;
1842 }
1843
1844 if (lockend < lockstart) {
1845 mutex_unlock(&inode->i_mutex);
1846 return 0;
1847 }
1848
1849 while (1) {
1850 struct btrfs_ordered_extent *ordered;
1851
1852 truncate_pagecache_range(inode, lockstart, lockend);
1853
1854 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1855 0, &cached_state);
1856 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1857
1858 /*
1859 * We need to make sure we have no ordered extents in this range
1860 * and nobody raced in and read a page in this range, if we did
1861 * we need to try again.
1862 */
1863 if ((!ordered ||
1864 (ordered->file_offset + ordered->len < lockstart ||
1865 ordered->file_offset > lockend)) &&
1866 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1867 lockend, EXTENT_UPTODATE, 0,
1868 cached_state)) {
1869 if (ordered)
1870 btrfs_put_ordered_extent(ordered);
1871 break;
1872 }
1873 if (ordered)
1874 btrfs_put_ordered_extent(ordered);
1875 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1876 lockend, &cached_state, GFP_NOFS);
1877 btrfs_wait_ordered_range(inode, lockstart,
1878 lockend - lockstart + 1);
1879 }
1880
1881 path = btrfs_alloc_path();
1882 if (!path) {
1883 ret = -ENOMEM;
1884 goto out;
1885 }
1886
1887 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1888 if (!rsv) {
1889 ret = -ENOMEM;
1890 goto out_free;
1891 }
1892 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1893 rsv->failfast = 1;
1894
1895 /*
1896 * 1 - update the inode
1897 * 1 - remove the extents in the range
1898 * 1 - add the hole extent
1899 */
1900 trans = btrfs_start_transaction(root, 3);
1901 if (IS_ERR(trans)) {
1902 err = PTR_ERR(trans);
1903 goto out_free;
1904 }
1905
1906 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
1907 min_size);
1908 BUG_ON(ret);
1909 trans->block_rsv = rsv;
1910
1911 while (cur_offset < lockend) {
1912 ret = __btrfs_drop_extents(trans, root, inode, path,
1913 cur_offset, lockend + 1,
1914 &drop_end, 1);
1915 if (ret != -ENOSPC)
1916 break;
1917
1918 trans->block_rsv = &root->fs_info->trans_block_rsv;
1919
1920 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1921 if (ret) {
1922 err = ret;
1923 break;
1924 }
1925
1926 cur_offset = drop_end;
1927
1928 ret = btrfs_update_inode(trans, root, inode);
1929 if (ret) {
1930 err = ret;
1931 break;
1932 }
1933
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr);
1937
1938 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) {
1940 ret = PTR_ERR(trans);
1941 trans = NULL;
1942 break;
1943 }
1944
1945 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
1946 rsv, min_size);
1947 BUG_ON(ret); /* shouldn't happen */
1948 trans->block_rsv = rsv;
1949 }
1950
1951 if (ret) {
1952 err = ret;
1953 goto out_trans;
1954 }
1955
1956 trans->block_rsv = &root->fs_info->trans_block_rsv;
1957 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1958 if (ret) {
1959 err = ret;
1960 goto out_trans;
1961 }
1962
1963out_trans:
1964 if (!trans)
1965 goto out_free;
1966
1967 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr);
1972out_free:
1973 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv);
1975out:
1976 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1977 &cached_state, GFP_NOFS);
1978 mutex_unlock(&inode->i_mutex);
1979 if (ret && !err)
1980 err = ret;
1981 return err;
1982}
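
btrfs_punch_hole() can only drop whole sectors, so it rounds the start up and the end down to the sector boundary and leaves the unaligned edges to btrfs_truncate_page(). A worked example of the mask arithmetic with 4096-byte sectors:

#include <stdio.h>

int main(void)
{
        unsigned long mask = 4096 - 1;
        unsigned long offset = 1000, len = 10000;
        unsigned long lockstart = (offset + mask) & ~mask;    /* 4096: round up   */
        unsigned long lockend = ((offset + len) & ~mask) - 1; /* 8191: round down */

        printf("lockstart=%lu lockend=%lu\n", lockstart, lockend);
        /* bytes 1000-4095 and 8192-10999 are zeroed in-page instead; if the
         * whole hole fits in one sector, lockend < lockstart and no extents
         * are dropped at all */
        return 0;
}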
1983
1618static long btrfs_fallocate(struct file *file, int mode, 1984static long btrfs_fallocate(struct file *file, int mode,
1619 loff_t offset, loff_t len) 1985 loff_t offset, loff_t len)
1620{ 1986{
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
1633 alloc_start = offset & ~mask; 1999 alloc_start = offset & ~mask;
1634 alloc_end = (offset + len + mask) & ~mask; 2000 alloc_end = (offset + len + mask) & ~mask;
1635 2001
1636 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 2002 /* Make sure we aren't being given some crap mode */
1637 if (mode & ~FALLOC_FL_KEEP_SIZE) 2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1638 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
1639 2005
2006 if (mode & FALLOC_FL_PUNCH_HOLE)
2007 return btrfs_punch_hole(inode, offset, len);
2008
1640 /* 2009 /*
1641 * Make sure we have enough space before we do the 2010 * Make sure we have enough space before we do the
1642 * allocation. 2011 * allocation.
1643 */ 2012 */
1644 ret = btrfs_check_data_free_space(inode, len); 2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
1645 if (ret) 2014 if (ret)
1646 return ret; 2015 return ret;
1647 2016
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
1748out: 2117out:
1749 mutex_unlock(&inode->i_mutex); 2118 mutex_unlock(&inode->i_mutex);
1750 /* Let go of our reservation. */ 2119 /* Let go of our reservation. */
1751 btrfs_free_reserved_data_space(inode, len); 2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
1752 return ret; 2121 return ret;
1753} 2122}
1754 2123
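
With this change, hole punching becomes reachable from userspace through fallocate(2); FALLOC_FL_PUNCH_HOLE is expected to be paired with FALLOC_FL_KEEP_SIZE. A minimal caller (glibc with _GNU_SOURCE assumed):

#define _GNU_SOURCE
#include <fcntl.h>

/* zero and deallocate [offset, offset + len) without changing i_size */
int punch(int fd, off_t offset, off_t len)
{
        return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                         offset, len);
}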
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5c..1027b854b90c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
966 block_group->key.offset)) { 966 block_group->key.offset)) {
967 ret = find_first_extent_bit(unpin, start, 967 ret = find_first_extent_bit(unpin, start,
968 &extent_start, &extent_end, 968 &extent_start, &extent_end,
969 EXTENT_DIRTY); 969 EXTENT_DIRTY, NULL);
970 if (ret) { 970 if (ret) {
971 ret = 0; 971 ret = 0;
972 break; 972 break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1454 max_t(u64, *offset, bitmap_info->offset)); 1454 max_t(u64, *offset, bitmap_info->offset));
1455 bits = bytes_to_bits(*bytes, ctl->unit); 1455 bits = bytes_to_bits(*bytes, ctl->unit);
1456 1456
1457 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1457 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1458 i < BITS_PER_BITMAP;
1459 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
1460 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1458 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1461 BITS_PER_BITMAP, i); 1459 BITS_PER_BITMAP, i);
1462 if ((next_zero - i) >= bits) { 1460 if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2307 2305
2308again: 2306again:
2309 found_bits = 0; 2307 found_bits = 0;
2310 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); 2308 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2311 i < BITS_PER_BITMAP;
2312 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2313 next_zero = find_next_zero_bit(entry->bitmap, 2309 next_zero = find_next_zero_bit(entry->bitmap,
2314 BITS_PER_BITMAP, i); 2310 BITS_PER_BITMAP, i);
2315 if (next_zero - i >= min_bits) { 2311 if (next_zero - i >= min_bits) {
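
for_each_set_bit_from(i, map, size) is shorthand for exactly the open-coded find_next_bit() loops deleted above. A userspace equivalent over a single word, with a naive stand-in for find_next_bit():

#include <stdio.h>

/* naive stand-in for find_next_bit() over one word */
static unsigned next_set(unsigned long map, unsigned from, unsigned size)
{
        while (from < size && !(map & (1UL << from)))
                from++;
        return from;
}

int main(void)
{
        unsigned long map = 0x2c;   /* bits 2, 3 and 5 set */
        unsigned size = 6, i;

        for (i = next_set(map, 0, size); i < size; i = next_set(map, i + 1, size))
                printf("bit %u is set\n", i);
        return 0;
}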
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b99..1d982812ab67 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27
28/*
29 * Figure the key offset of an extended inode ref
30 */
31static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
32 int len)
33{
34 return (u64) crc32c(parent_objectid, name, len);
35}
36
27#endif 37#endif
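
btrfs_extref_hash() keys an extended ref by a hash of the name seeded with the parent objectid, so a (parent, name) lookup computes one key offset up front. A sketch of that keying with a stand-in hash (the real code uses crc32c; the FNV-style mix below is purely illustrative):

#include <stdint.h>
#include <string.h>

/* illustrative stand-in; the kernel uses crc32c(parent_objectid, name, len) */
static uint32_t stand_in_hash(uint32_t seed, const char *data, int len)
{
        uint32_t h = seed ^ 0x811c9dc5u;

        while (len--)
                h = (h ^ (uint8_t)*data++) * 0x01000193u;
        return h;
}

/* key offset used to look up an extref by (parent directory, name) */
uint64_t extref_key_offset(uint64_t parent, const char *name)
{
        return (uint64_t)stand_in_hash((uint32_t)parent, name,
                                       (int)strlen(name));
}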
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c73..48b8fda93132 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
18 18
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "hash.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
50 return 0; 51 return 0;
51} 52}
52 53
53struct btrfs_inode_ref * 54int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
55 const char *name, int name_len,
56 struct btrfs_inode_extref **extref_ret)
57{
58 struct extent_buffer *leaf;
59 struct btrfs_inode_extref *extref;
60 unsigned long ptr;
61 unsigned long name_ptr;
62 u32 item_size;
63 u32 cur_offset = 0;
64 int ref_name_len;
65
66 leaf = path->nodes[0];
67 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
68 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
69
70 /*
71 * Search all extended backrefs in this item. We only have to scan
72 * past hash collisions, so most of the time this compares against
73 * a single entry. If all is well, we'll return success and the
74 * inode ref object.
75 */
76 while (cur_offset < item_size) {
77 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
78 name_ptr = (unsigned long)(&extref->name);
79 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
80
81 if (ref_name_len == name_len &&
82 btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
83 (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
84 if (extref_ret)
85 *extref_ret = extref;
86 return 1;
87 }
88
89 cur_offset += ref_name_len + sizeof(*extref);
90 }
91 return 0;
92}
93
94static struct btrfs_inode_ref *
54btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 95btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, 96 struct btrfs_root *root,
56 struct btrfs_path *path, 97 struct btrfs_path *path,
57 const char *name, int name_len, 98 const char *name, int name_len,
58 u64 inode_objectid, u64 ref_objectid, int mod) 99 u64 inode_objectid, u64 ref_objectid, int ins_len,
100 int cow)
59{ 101{
102 int ret;
60 struct btrfs_key key; 103 struct btrfs_key key;
61 struct btrfs_inode_ref *ref; 104 struct btrfs_inode_ref *ref;
62 int ins_len = mod < 0 ? -1 : 0;
63 int cow = mod != 0;
64 int ret;
65 105
66 key.objectid = inode_objectid; 106 key.objectid = inode_objectid;
67 key.type = BTRFS_INODE_REF_KEY; 107 key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
77 return ref; 117 return ref;
78} 118}
79 119
80int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 120/* Returns NULL if no extref found */
121struct btrfs_inode_extref *
122btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
123 struct btrfs_root *root,
124 struct btrfs_path *path,
125 const char *name, int name_len,
126 u64 inode_objectid, u64 ref_objectid, int ins_len,
127 int cow)
128{
129 int ret;
130 struct btrfs_key key;
131 struct btrfs_inode_extref *extref;
132
133 key.objectid = inode_objectid;
134 key.type = BTRFS_INODE_EXTREF_KEY;
135 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
136
137 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
138 if (ret < 0)
139 return ERR_PTR(ret);
140 if (ret > 0)
141 return NULL;
142 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
143 return NULL;
144 return extref;
145}
146
147int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
148 struct btrfs_root *root,
149 struct btrfs_path *path,
150 const char *name, int name_len,
151 u64 inode_objectid, u64 ref_objectid, int mod,
152 u64 *ret_index)
153{
154 struct btrfs_inode_ref *ref;
155 struct btrfs_inode_extref *extref;
156 int ins_len = mod < 0 ? -1 : 0;
157 int cow = mod != 0;
158
159 ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
160 inode_objectid, ref_objectid, ins_len,
161 cow);
162 if (IS_ERR(ref))
163 return PTR_ERR(ref);
164
165 if (ref != NULL) {
166 *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
167 return 0;
168 }
169
170 btrfs_release_path(path);
171
172 extref = btrfs_lookup_inode_extref(trans, root, path, name,
173 name_len, inode_objectid,
174 ref_objectid, ins_len, cow);
175 if (IS_ERR(extref))
176 return PTR_ERR(extref);
177
178 if (extref) {
179 *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
180 return 0;
181 }
182
183 return -ENOENT;
184}
185
186int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root, 187 struct btrfs_root *root,
82 const char *name, int name_len, 188 const char *name, int name_len,
83 u64 inode_objectid, u64 ref_objectid, u64 *index) 189 u64 inode_objectid, u64 ref_objectid, u64 *index)
84{ 190{
85 struct btrfs_path *path; 191 struct btrfs_path *path;
86 struct btrfs_key key; 192 struct btrfs_key key;
193 struct btrfs_inode_extref *extref;
194 struct extent_buffer *leaf;
195 int ret;
196 int del_len = name_len + sizeof(*extref);
197 unsigned long ptr;
198 unsigned long item_start;
199 u32 item_size;
200
201 key.objectid = inode_objectid;
202 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
203 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
204
205 path = btrfs_alloc_path();
206 if (!path)
207 return -ENOMEM;
208
209 path->leave_spinning = 1;
210
211 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
212 if (ret > 0)
213 ret = -ENOENT;
214 if (ret < 0)
215 goto out;
216
217 /*
218 * Sanity check - did we find the right item for this name?
219 * This should always succeed, so an error here will make the FS
220 * readonly.
221 */
222 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
223 name, name_len, &extref)) {
224 btrfs_std_error(root->fs_info, -ENOENT);
225 ret = -EROFS;
226 goto out;
227 }
228
229 leaf = path->nodes[0];
230 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
231 if (index)
232 *index = btrfs_inode_extref_index(leaf, extref);
233
234 if (del_len == item_size) {
235 /*
236 * Common case only one ref in the item, remove the
237 * whole item.
238 */
239 ret = btrfs_del_item(trans, root, path);
240 goto out;
241 }
242
243 ptr = (unsigned long)extref;
244 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
245
246 memmove_extent_buffer(leaf, ptr, ptr + del_len,
247 item_size - (ptr + del_len - item_start));
248
249 btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
250
251out:
252 btrfs_free_path(path);
253
254 return ret;
255}
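
When an item holds several refs, deleting one is a memmove of the tail over the victim followed by truncating the item, exactly the shape used in btrfs_del_inode_extref() above. The move in plain C:

#include <string.h>

/* slide the tail of the item down over the victim, return the new size */
unsigned remove_span(char *item, unsigned item_size,
                     unsigned victim_off, unsigned victim_len)
{
        memmove(item + victim_off,
                item + victim_off + victim_len,
                item_size - (victim_off + victim_len));
        return item_size - victim_len;   /* item is then truncated to this */
}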
256
257int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
258 struct btrfs_root *root,
259 const char *name, int name_len,
260 u64 inode_objectid, u64 ref_objectid, u64 *index)
261{
262 struct btrfs_path *path;
263 struct btrfs_key key;
87 struct btrfs_inode_ref *ref; 264 struct btrfs_inode_ref *ref;
88 struct extent_buffer *leaf; 265 struct extent_buffer *leaf;
89 unsigned long ptr; 266 unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
91 u32 item_size; 268 u32 item_size;
92 u32 sub_item_len; 269 u32 sub_item_len;
93 int ret; 270 int ret;
271 int search_ext_refs = 0;
94 int del_len = name_len + sizeof(*ref); 272 int del_len = name_len + sizeof(*ref);
95 273
96 key.objectid = inode_objectid; 274 key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
106 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 284 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
107 if (ret > 0) { 285 if (ret > 0) {
108 ret = -ENOENT; 286 ret = -ENOENT;
287 search_ext_refs = 1;
109 goto out; 288 goto out;
110 } else if (ret < 0) { 289 } else if (ret < 0) {
111 goto out; 290 goto out;
112 } 291 }
113 if (!find_name_in_backref(path, name, name_len, &ref)) { 292 if (!find_name_in_backref(path, name, name_len, &ref)) {
114 ret = -ENOENT; 293 ret = -ENOENT;
294 search_ext_refs = 1;
115 goto out; 295 goto out;
116 } 296 }
117 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
129 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); 309 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
130 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, 310 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
131 item_size - (ptr + sub_item_len - item_start)); 311 item_size - (ptr + sub_item_len - item_start));
132 btrfs_truncate_item(trans, root, path, 312 btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
133 item_size - sub_item_len, 1); 313out:
314 btrfs_free_path(path);
315
316 if (search_ext_refs) {
317 /*
318 * No refs were found, or we could not find the
319 * name in our ref array. Find and remove the extended
320 * inode ref then.
321 */
322 return btrfs_del_inode_extref(trans, root, name, name_len,
323 inode_objectid, ref_objectid, index);
324 }
325
326 return ret;
327}
328
329/*
330 * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
331 *
332 * The caller must have checked against BTRFS_LINK_MAX already.
333 */
334static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
335 struct btrfs_root *root,
336 const char *name, int name_len,
337 u64 inode_objectid, u64 ref_objectid, u64 index)
338{
339 struct btrfs_inode_extref *extref;
340 int ret;
341 int ins_len = name_len + sizeof(*extref);
342 unsigned long ptr;
343 struct btrfs_path *path;
344 struct btrfs_key key;
345 struct extent_buffer *leaf;
346 struct btrfs_item *item;
347
348 key.objectid = inode_objectid;
349 key.type = BTRFS_INODE_EXTREF_KEY;
350 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
351
352 path = btrfs_alloc_path();
353 if (!path)
354 return -ENOMEM;
355
356 path->leave_spinning = 1;
357 ret = btrfs_insert_empty_item(trans, root, path, &key,
358 ins_len);
359 if (ret == -EEXIST) {
360 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
361 name, name_len, NULL))
362 goto out;
363
364 btrfs_extend_item(trans, root, path, ins_len);
365 ret = 0;
366 }
367 if (ret < 0)
368 goto out;
369
370 leaf = path->nodes[0];
371 item = btrfs_item_nr(leaf, path->slots[0]);
372 ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
373 ptr += btrfs_item_size(leaf, item) - ins_len;
374 extref = (struct btrfs_inode_extref *)ptr;
375
376 btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
377 btrfs_set_inode_extref_index(path->nodes[0], extref, index);
378 btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
379
380 ptr = (unsigned long)&extref->name;
381 write_extent_buffer(path->nodes[0], name, ptr, name_len);
382 btrfs_mark_buffer_dirty(path->nodes[0]);
383
134out: 384out:
135 btrfs_free_path(path); 385 btrfs_free_path(path);
136 return ret; 386 return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
191 441
192out: 442out:
193 btrfs_free_path(path); 443 btrfs_free_path(path);
444
445 if (ret == -EMLINK) {
446 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
447 /* We ran out of space in the ref array. Need to
448 * add an extended ref. */
449 if (btrfs_super_incompat_flags(disk_super)
450 & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
451 ret = btrfs_insert_inode_extref(trans, root, name,
452 name_len,
453 inode_objectid,
454 ref_objectid, index);
455 }
456
194 return ret; 457 return ret;
195} 458}
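
The new tail of btrfs_insert_inode_ref() only falls back to an extended ref when the packed insert fails with -EMLINK and the superblock advertises the EXTENDED_IREF incompat bit, since older kernels cannot read extrefs. The fallback shape, with illustrative names:

#include <errno.h>

#define FEAT_EXTENDED_IREF 0x1   /* stands in for the incompat bit */

static int insert_packed_ref(void) { return -EMLINK; /* ref array is full */ }
static int insert_extended_ref(void) { return 0; }

int add_link(unsigned long incompat_flags)
{
        int ret = insert_packed_ref();

        if (ret == -EMLINK && (incompat_flags & FEAT_EXTENDED_IREF))
                ret = insert_extended_ref();  /* old kernels can't read these */
        return ret;
}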
196 459
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6ed6944e50c..85a1e5053fe6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
230 u64 inline_len = actual_end - start; 230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) & 231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1); 232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len; 233 u64 data_len = inline_len;
235 int ret; 234 int ret;
236 235
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
247 return 1; 246 return 1;
248 } 247 }
249 248
250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 249 ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
251 &hint_byte, 1);
252 if (ret) 250 if (ret)
253 return ret; 251 return ret;
254 252
@@ -664,7 +662,7 @@ retry:
664 async_extent->compressed_size, 662 async_extent->compressed_size,
665 async_extent->compressed_size, 663 async_extent->compressed_size,
666 0, alloc_hint, &ins, 1); 664 0, alloc_hint, &ins, 1);
667 if (ret) 665 if (ret && ret != -ENOSPC)
668 btrfs_abort_transaction(trans, root, ret); 666 btrfs_abort_transaction(trans, root, ret);
669 btrfs_end_transaction(trans, root); 667 btrfs_end_transaction(trans, root);
670 } 668 }
@@ -1308,6 +1306,7 @@ out_check:
1308 em->block_start = disk_bytenr; 1306 em->block_start = disk_bytenr;
1309 em->bdev = root->fs_info->fs_devices->latest_bdev; 1307 em->bdev = root->fs_info->fs_devices->latest_bdev;
1310 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1308 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1309 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
1311 while (1) { 1310 while (1) {
1312 write_lock(&em_tree->lock); 1311 write_lock(&em_tree->lock);
1313 ret = add_extent_mapping(em_tree, em); 1312 ret = add_extent_mapping(em_tree, em);
@@ -1364,11 +1363,7 @@ out_check:
1364 } 1363 }
1365 1364
1366error: 1365error:
1367 if (nolock) { 1366 err = btrfs_end_transaction(trans, root);
1368 err = btrfs_end_transaction_nolock(trans, root);
1369 } else {
1370 err = btrfs_end_transaction(trans, root);
1371 }
1372 if (!ret) 1367 if (!ret)
1373 ret = err; 1368 ret = err;
1374 1369
@@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1785 struct btrfs_path *path; 1780 struct btrfs_path *path;
1786 struct extent_buffer *leaf; 1781 struct extent_buffer *leaf;
1787 struct btrfs_key ins; 1782 struct btrfs_key ins;
1788 u64 hint;
1789 int ret; 1783 int ret;
1790 1784
1791 path = btrfs_alloc_path(); 1785 path = btrfs_alloc_path();
@@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1803 * the caller is expected to unpin it and allow it to be merged 1797 * the caller is expected to unpin it and allow it to be merged
1804 * with the others. 1798 * with the others.
1805 */ 1799 */
1806 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, 1800 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1807 &hint, 0); 1801 file_pos + num_bytes, 0);
1808 if (ret) 1802 if (ret)
1809 goto out; 1803 goto out;
1810 1804
@@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1828 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1822 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1829 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1823 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1830 1824
1831 btrfs_unlock_up_safe(path, 1);
1832 btrfs_set_lock_blocking(leaf);
1833
1834 btrfs_mark_buffer_dirty(leaf); 1825 btrfs_mark_buffer_dirty(leaf);
1826 btrfs_release_path(path);
1835 1827
1836 inode_add_bytes(inode, num_bytes); 1828 inode_add_bytes(inode, num_bytes);
1837 1829
@@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1929 ordered_extent->len, 1921 ordered_extent->len,
1930 compress_type, 0, 0, 1922 compress_type, 0, 0,
1931 BTRFS_FILE_EXTENT_REG); 1923 BTRFS_FILE_EXTENT_REG);
1932 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1933 ordered_extent->file_offset,
1934 ordered_extent->len);
1935 } 1924 }
1936 1925 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1926 ordered_extent->file_offset, ordered_extent->len,
1927 trans->transid);
1937 if (ret < 0) { 1928 if (ret < 0) {
1938 btrfs_abort_transaction(trans, root, ret); 1929 btrfs_abort_transaction(trans, root, ret);
1939 goto out_unlock; 1930 goto out_unlock;
@@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1949 btrfs_abort_transaction(trans, root, ret); 1940 btrfs_abort_transaction(trans, root, ret);
1950 goto out_unlock; 1941 goto out_unlock;
1951 } 1942 }
1943 } else {
1944 btrfs_set_inode_last_trans(trans, inode);
1952 } 1945 }
1953 ret = 0; 1946 ret = 0;
1954out_unlock: 1947out_unlock:
@@ -1958,12 +1951,8 @@ out_unlock:
1958out: 1951out:
1959 if (root != root->fs_info->tree_root) 1952 if (root != root->fs_info->tree_root)
1960 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1953 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1961 if (trans) { 1954 if (trans)
1962 if (nolock) 1955 btrfs_end_transaction(trans, root);
1963 btrfs_end_transaction_nolock(trans, root);
1964 else
1965 btrfs_end_transaction(trans, root);
1966 }
1967 1956
1968 if (ret) 1957 if (ret)
1969 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1958 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2119 if (empty) 2108 if (empty)
2120 return; 2109 return;
2121 2110
2122 down_read(&root->fs_info->cleanup_work_sem);
2123 spin_lock(&fs_info->delayed_iput_lock); 2111 spin_lock(&fs_info->delayed_iput_lock);
2124 list_splice_init(&fs_info->delayed_iputs, &list); 2112 list_splice_init(&fs_info->delayed_iputs, &list);
2125 spin_unlock(&fs_info->delayed_iput_lock); 2113 spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2130 iput(delayed->inode); 2118 iput(delayed->inode);
2131 kfree(delayed); 2119 kfree(delayed);
2132 } 2120 }
2133 up_read(&root->fs_info->cleanup_work_sem);
2134} 2121}
2135 2122
2136enum btrfs_orphan_cleanup_state { 2123enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2198 int ret; 2185 int ret;
2199 2186
2200 if (!root->orphan_block_rsv) { 2187 if (!root->orphan_block_rsv) {
2201 block_rsv = btrfs_alloc_block_rsv(root); 2188 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2202 if (!block_rsv) 2189 if (!block_rsv)
2203 return -ENOMEM; 2190 return -ENOMEM;
2204 } 2191 }
@@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2225 insert = 1; 2212 insert = 1;
2226#endif 2213#endif
2227 insert = 1; 2214 insert = 1;
2228 atomic_dec(&root->orphan_inodes); 2215 atomic_inc(&root->orphan_inodes);
2229 } 2216 }
2230 2217
2231 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2218 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
2590 2577
2591 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2578 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2592 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2579 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2580 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
2581
2582 /*
2583 * If we were modified in the current generation and evicted from memory
2584 * and then re-read we need to do a full sync since we don't have any
2585 * idea about which extents were modified before we were evicted from
2586 * cache.
2587 */
2588 if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
2589 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2590 &BTRFS_I(inode)->runtime_flags);
2591
2593 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2592 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2594 inode->i_generation = BTRFS_I(inode)->generation; 2593 inode->i_generation = BTRFS_I(inode)->generation;
2595 inode->i_rdev = 0; 2594 inode->i_rdev = 0;
@@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2894 struct btrfs_trans_handle *trans; 2893 struct btrfs_trans_handle *trans;
2895 struct btrfs_root *root = BTRFS_I(dir)->root; 2894 struct btrfs_root *root = BTRFS_I(dir)->root;
2896 struct btrfs_path *path; 2895 struct btrfs_path *path;
2897 struct btrfs_inode_ref *ref;
2898 struct btrfs_dir_item *di; 2896 struct btrfs_dir_item *di;
2899 struct inode *inode = dentry->d_inode; 2897 struct inode *inode = dentry->d_inode;
2900 u64 index; 2898 u64 index;
@@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3008 } 3006 }
3009 btrfs_release_path(path); 3007 btrfs_release_path(path);
3010 3008
3011 ref = btrfs_lookup_inode_ref(trans, root, path, 3009 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3012 dentry->d_name.name, dentry->d_name.len, 3010 dentry->d_name.len, ino, dir_ino, 0,
3013 ino, dir_ino, 0); 3011 &index);
3014 if (IS_ERR(ref)) { 3012 if (ret) {
3015 err = PTR_ERR(ref); 3013 err = ret;
3016 goto out; 3014 goto out;
3017 } 3015 }
3018 BUG_ON(!ref); /* Logic error */ 3016
3019 if (check_path_shared(root, path)) 3017 if (check_path_shared(root, path))
3020 goto out; 3018 goto out;
3021 index = btrfs_inode_ref_index(path->nodes[0], ref); 3019
3022 btrfs_release_path(path); 3020 btrfs_release_path(path);
3023 3021
3024 /* 3022 /*
@@ -3061,7 +3059,7 @@ out:
3061static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3059static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3062 struct btrfs_root *root) 3060 struct btrfs_root *root)
3063{ 3061{
3064 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3062 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3065 btrfs_block_rsv_release(root, trans->block_rsv, 3063 btrfs_block_rsv_release(root, trans->block_rsv,
3066 trans->bytes_reserved); 3064 trans->bytes_reserved);
3067 trans->block_rsv = &root->fs_info->trans_block_rsv; 3065 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3191 struct btrfs_trans_handle *trans; 3189 struct btrfs_trans_handle *trans;
3192 unsigned long nr = 0; 3190 unsigned long nr = 0;
3193 3191
3194 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3192 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3195 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3196 return -ENOTEMPTY; 3193 return -ENOTEMPTY;
3194 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3195 return -EPERM;
3197 3196
3198 trans = __unlink_start_trans(dir, dentry); 3197 trans = __unlink_start_trans(dir, dentry);
3199 if (IS_ERR(trans)) 3198 if (IS_ERR(trans))
@@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3267 return -ENOMEM; 3266 return -ENOMEM;
3268 path->reada = -1; 3267 path->reada = -1;
3269 3268
3269 /*
3270 * We want to drop from the next block forward in case this new size is
3271 * not block aligned since we will be keeping the last block of the
3272 * extent just the way it is.
3273 */
3270 if (root->ref_cows || root == root->fs_info->tree_root) 3274 if (root->ref_cows || root == root->fs_info->tree_root)
3271 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3275 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
3272 3276
3273 /* 3277 /*
3274 * This function is also used to drop the items in the log tree before 3278 * This function is also used to drop the items in the log tree before
@@ -3429,12 +3433,6 @@ delete:
3429 3433
3430 if (path->slots[0] == 0 || 3434 if (path->slots[0] == 0 ||
3431 path->slots[0] != pending_del_slot) { 3435 path->slots[0] != pending_del_slot) {
3432 if (root->ref_cows &&
3433 BTRFS_I(inode)->location.objectid !=
3434 BTRFS_FREE_INO_OBJECTID) {
3435 err = -EAGAIN;
3436 goto out;
3437 }
3438 if (pending_del_nr) { 3436 if (pending_del_nr) {
3439 ret = btrfs_del_items(trans, root, path, 3437 ret = btrfs_del_items(trans, root, path,
3440 pending_del_slot, 3438 pending_del_slot,
@@ -3465,12 +3463,20 @@ error:
3465} 3463}
3466 3464
3467/* 3465/*
3468 * taken from block_truncate_page, but does cow as it zeros out 3466 * btrfs_truncate_page - read, zero a chunk and write a page
3469 * any bytes left in the last page in the file. 3467 * @inode - inode that we're zeroing
3468 * @from - the offset to start zeroing
3469 * @len - the length to zero; 0 zeroes from the offset to the end of
3470 * the page
3471 * @front - zero up to the offset instead of from the offset on
3472 *
3473 * This will find the page for the "from" offset, COW it, and zero the
3474 * part we want zeroed. This is used with truncate and hole punching.
3470 */ 3475 */
3471static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3476int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3477 int front)
3472{ 3478{
3473 struct inode *inode = mapping->host; 3479 struct address_space *mapping = inode->i_mapping;
3474 struct btrfs_root *root = BTRFS_I(inode)->root; 3480 struct btrfs_root *root = BTRFS_I(inode)->root;
3475 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3476 struct btrfs_ordered_extent *ordered; 3482 struct btrfs_ordered_extent *ordered;
@@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3485 u64 page_start; 3491 u64 page_start;
3486 u64 page_end; 3492 u64 page_end;
3487 3493
3488 if ((offset & (blocksize - 1)) == 0) 3494 if ((offset & (blocksize - 1)) == 0 &&
3495 (!len || ((len & (blocksize - 1)) == 0)))
3489 goto out; 3496 goto out;
3490 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3497 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3491 if (ret) 3498 if (ret)
@@ -3532,7 +3539,8 @@ again:
3532 } 3539 }
3533 3540
3534 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3541 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3535 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3542 EXTENT_DIRTY | EXTENT_DELALLOC |
3543 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
3536 0, 0, &cached_state, GFP_NOFS); 3544 0, 0, &cached_state, GFP_NOFS);
3537 3545
3538 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3546 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3545,8 +3553,13 @@ again:
3545 3553
3546 ret = 0; 3554 ret = 0;
3547 if (offset != PAGE_CACHE_SIZE) { 3555 if (offset != PAGE_CACHE_SIZE) {
3556 if (!len)
3557 len = PAGE_CACHE_SIZE - offset;
3548 kaddr = kmap(page); 3558 kaddr = kmap(page);
3549 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3559 if (front)
3560 memset(kaddr, 0, offset);
3561 else
3562 memset(kaddr + offset, 0, len);
3550 flush_dcache_page(page); 3563 flush_dcache_page(page);
3551 kunmap(page); 3564 kunmap(page);
3552 } 3565 }
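
The front parameter added to btrfs_truncate_page() picks which side of the in-page offset is zeroed: the bytes before it (the last page of a punched hole) or len bytes from it onward (truncate and the first page of a hole). The zeroing logic in isolation:

#include <string.h>

void zero_partial(char *page, unsigned page_size,
                  unsigned offset, unsigned len, int front)
{
        if (!len)
                len = page_size - offset;       /* default: to end of page   */
        if (front)
                memset(page, 0, offset);        /* zero [0, offset)          */
        else
                memset(page + offset, 0, len);  /* zero [offset, offset+len) */
}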
@@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3577 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3590 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3578 struct extent_map *em = NULL; 3591 struct extent_map *em = NULL;
3579 struct extent_state *cached_state = NULL; 3592 struct extent_state *cached_state = NULL;
3593 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3580 u64 mask = root->sectorsize - 1; 3594 u64 mask = root->sectorsize - 1;
3581 u64 hole_start = (oldsize + mask) & ~mask; 3595 u64 hole_start = (oldsize + mask) & ~mask;
3582 u64 block_end = (size + mask) & ~mask; 3596 u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3613 last_byte = min(extent_map_end(em), block_end); 3627 last_byte = min(extent_map_end(em), block_end);
3614 last_byte = (last_byte + mask) & ~mask; 3628 last_byte = (last_byte + mask) & ~mask;
3615 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3629 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3616 u64 hint_byte = 0; 3630 struct extent_map *hole_em;
3617 hole_size = last_byte - cur_offset; 3631 hole_size = last_byte - cur_offset;
3618 3632
3619 trans = btrfs_start_transaction(root, 3); 3633 trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3622 break; 3636 break;
3623 } 3637 }
3624 3638
3625 err = btrfs_drop_extents(trans, inode, cur_offset, 3639 err = btrfs_drop_extents(trans, root, inode,
3626 cur_offset + hole_size, 3640 cur_offset,
3627 &hint_byte, 1); 3641 cur_offset + hole_size, 1);
3628 if (err) { 3642 if (err) {
3629 btrfs_abort_transaction(trans, root, err); 3643 btrfs_abort_transaction(trans, root, err);
3630 btrfs_end_transaction(trans, root); 3644 btrfs_end_transaction(trans, root);
@@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3641 break; 3655 break;
3642 } 3656 }
3643 3657
3644 btrfs_drop_extent_cache(inode, hole_start, 3658 btrfs_drop_extent_cache(inode, cur_offset,
3645 last_byte - 1, 0); 3659 cur_offset + hole_size - 1, 0);
3660 hole_em = alloc_extent_map();
3661 if (!hole_em) {
3662 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3663 &BTRFS_I(inode)->runtime_flags);
3664 goto next;
3665 }
3666 hole_em->start = cur_offset;
3667 hole_em->len = hole_size;
3668 hole_em->orig_start = cur_offset;
3646 3669
3670 hole_em->block_start = EXTENT_MAP_HOLE;
3671 hole_em->block_len = 0;
3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3673 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3674 hole_em->generation = trans->transid;
3675
3676 while (1) {
3677 write_lock(&em_tree->lock);
3678 err = add_extent_mapping(em_tree, hole_em);
3679 if (!err)
3680 list_move(&hole_em->list,
3681 &em_tree->modified_extents);
3682 write_unlock(&em_tree->lock);
3683 if (err != -EEXIST)
3684 break;
3685 btrfs_drop_extent_cache(inode, cur_offset,
3686 cur_offset +
3687 hole_size - 1, 0);
3688 }
3689 free_extent_map(hole_em);
3690next:
3647 btrfs_update_inode(trans, root, inode); 3691 btrfs_update_inode(trans, root, inode);
3648 btrfs_end_transaction(trans, root); 3692 btrfs_end_transaction(trans, root);
3649 } 3693 }
@@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)
3768 goto no_delete; 3812 goto no_delete;
3769 } 3813 }
3770 3814
3771 rsv = btrfs_alloc_block_rsv(root); 3815 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3772 if (!rsv) { 3816 if (!rsv) {
3773 btrfs_orphan_del(NULL, inode); 3817 btrfs_orphan_del(NULL, inode);
3774 goto no_delete; 3818 goto no_delete;
3775 } 3819 }
3776 rsv->size = min_size; 3820 rsv->size = min_size;
3821 rsv->failfast = 1;
3777 global_rsv = &root->fs_info->global_block_rsv; 3822 global_rsv = &root->fs_info->global_block_rsv;
3778 3823
3779 btrfs_i_size_write(inode, 0); 3824 btrfs_i_size_write(inode, 0);
3780 3825
3781 /* 3826 /*
3782 * This is a bit simpler than btrfs_truncate since 3827 * This is a bit simpler than btrfs_truncate since we've already
3783 * 3828 * reserved our space for our orphan item in the unlink, so we just
3784 * 1) We've already reserved our space for our orphan item in the 3829 * need to reserve some slack space in case we add bytes and update
3785 * unlink. 3830 * inode item when doing the truncate.
3786 * 2) We're going to delete the inode item, so we don't need to update
3787 * it at all.
3788 *
3789 * So we just need to reserve some slack space in case we add bytes when
3790 * doing the truncate.
3791 */ 3831 */
3792 while (1) { 3832 while (1) {
3793 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3833 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)
3808 goto no_delete; 3848 goto no_delete;
3809 } 3849 }
3810 3850
3811 trans = btrfs_start_transaction(root, 0); 3851 trans = btrfs_start_transaction_noflush(root, 1);
3812 if (IS_ERR(trans)) { 3852 if (IS_ERR(trans)) {
3813 btrfs_orphan_del(NULL, inode); 3853 btrfs_orphan_del(NULL, inode);
3814 btrfs_free_block_rsv(root, rsv); 3854 btrfs_free_block_rsv(root, rsv);
@@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)
3818 trans->block_rsv = rsv; 3858 trans->block_rsv = rsv;
3819 3859
3820 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3860 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3821 if (ret != -EAGAIN) 3861 if (ret != -ENOSPC)
3822 break; 3862 break;
3823 3863
3864 trans->block_rsv = &root->fs_info->trans_block_rsv;
3865 ret = btrfs_update_inode(trans, root, inode);
3866 BUG_ON(ret);
3867
3824 nr = trans->blocks_used; 3868 nr = trans->blocks_used;
3825 btrfs_end_transaction(trans, root); 3869 btrfs_end_transaction(trans, root);
3826 trans = NULL; 3870 trans = NULL;
@@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4470 trans = btrfs_join_transaction(root); 4514 trans = btrfs_join_transaction(root);
4471 if (IS_ERR(trans)) 4515 if (IS_ERR(trans))
4472 return PTR_ERR(trans); 4516 return PTR_ERR(trans);
4473 if (nolock) 4517 ret = btrfs_commit_transaction(trans, root);
4474 ret = btrfs_end_transaction_nolock(trans, root);
4475 else
4476 ret = btrfs_commit_transaction(trans, root);
4477 } 4518 }
4478 return ret; 4519 return ret;
4479} 4520}
@@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4671 BTRFS_I(inode)->generation = trans->transid; 4712 BTRFS_I(inode)->generation = trans->transid;
4672 inode->i_generation = BTRFS_I(inode)->generation; 4713 inode->i_generation = BTRFS_I(inode)->generation;
4673 4714
4715 /*
4716 * We could have gotten an inode number from somebody who was fsynced
4717 * and then removed in this same transaction, so let's just set full
4718 * sync since it will be a full sync anyway and this will blow away the
4719 * old info in the log.
4720 */
4721 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
4722
4674 if (S_ISDIR(mode)) 4723 if (S_ISDIR(mode))
4675 owner = 0; 4724 owner = 0;
4676 else 4725 else
@@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4680 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4729 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4681 key[0].offset = 0; 4730 key[0].offset = 0;
4682 4731
4732 /*
4733 * Start new inodes with an inode_ref. This is slightly more
4734 * efficient for small numbers of hard links since they will
4735 * be packed into one item. Extended refs will kick in if we
4736 * add more hard links than can fit in the ref item.
4737 */
4683 key[1].objectid = objectid; 4738 key[1].objectid = objectid;
4684 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4739 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4685 key[1].offset = ref_objectid; 4740 key[1].offset = ref_objectid;
@@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4986 if (root->objectid != BTRFS_I(inode)->root->objectid) 5041 if (root->objectid != BTRFS_I(inode)->root->objectid)
4987 return -EXDEV; 5042 return -EXDEV;
4988 5043
4989 if (inode->i_nlink == ~0U) 5044 if (inode->i_nlink >= BTRFS_LINK_MAX)
4990 return -EMLINK; 5045 return -EMLINK;
4991 5046
4992 err = btrfs_set_inode_index(dir, &index); 5047 err = btrfs_set_inode_index(dir, &index);
@@ -5450,7 +5505,8 @@ insert:
5450 write_unlock(&em_tree->lock); 5505 write_unlock(&em_tree->lock);
5451out: 5506out:
5452 5507
5453 trace_btrfs_get_extent(root, em); 5508 if (em)
5509 trace_btrfs_get_extent(root, em);
5454 5510
5455 if (path) 5511 if (path)
5456 btrfs_free_path(path); 5512 btrfs_free_path(path);
@@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5836 return ret; 5892 return ret;
5837} 5893}
5838 5894
5895static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5896 u64 len, u64 orig_start,
5897 u64 block_start, u64 block_len,
5898 int type)
5899{
5900 struct extent_map_tree *em_tree;
5901 struct extent_map *em;
5902 struct btrfs_root *root = BTRFS_I(inode)->root;
5903 int ret;
5904
5905 em_tree = &BTRFS_I(inode)->extent_tree;
5906 em = alloc_extent_map();
5907 if (!em)
5908 return ERR_PTR(-ENOMEM);
5909
5910 em->start = start;
5911 em->orig_start = orig_start;
5912 em->len = len;
5913 em->block_len = block_len;
5914 em->block_start = block_start;
5915 em->bdev = root->fs_info->fs_devices->latest_bdev;
5916 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5917 if (type == BTRFS_ORDERED_PREALLOC)
5918 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5919
5920 do {
5921 btrfs_drop_extent_cache(inode, em->start,
5922 em->start + em->len - 1, 0);
5923 write_lock(&em_tree->lock);
5924 ret = add_extent_mapping(em_tree, em);
5925 write_unlock(&em_tree->lock);
5926 } while (ret == -EEXIST);
5927
5928 if (ret) {
5929 free_extent_map(em);
5930 return ERR_PTR(ret);
5931 }
5932
5933 return em;
5934}
5935
5936
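
The do/while in create_pinned_em is a small reusable pattern: evict whatever cached mapping overlaps the range until the insert stops failing with -EEXIST. A standalone sketch under assumed helper names:

#include <errno.h>

struct map { unsigned long long start, len; };
struct map_tree;

void drop_cached_range(struct map_tree *t, unsigned long long start,
                       unsigned long long end);
int add_mapping(struct map_tree *t, struct map *m); /* -EEXIST on overlap */

static int pin_mapping(struct map_tree *t, struct map *m)
{
        int ret;

        do {
                /* end is inclusive, matching the extent-map helpers */
                drop_cached_range(t, m->start, m->start + m->len - 1);
                ret = add_mapping(t, m);
        } while (ret == -EEXIST);

        return ret; /* 0 on success, or an error other than -EEXIST */
}
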
5839static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5937static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5840 struct buffer_head *bh_result, int create) 5938 struct buffer_head *bh_result, int create)
5841{ 5939{
@@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5950 goto must_cow; 6048 goto must_cow;
5951 6049
5952 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6050 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6051 u64 orig_start = em->start;
6052
6053 if (type == BTRFS_ORDERED_PREALLOC) {
6054 free_extent_map(em);
6055 em = create_pinned_em(inode, start, len,
6056 orig_start,
6057 block_start, len, type);
6058 if (IS_ERR(em)) {
6059 btrfs_end_transaction(trans, root);
6060 goto unlock_err;
6061 }
6062 }
6063
5953 ret = btrfs_add_ordered_extent_dio(inode, start, 6064 ret = btrfs_add_ordered_extent_dio(inode, start,
5954 block_start, len, len, type); 6065 block_start, len, len, type);
5955 btrfs_end_transaction(trans, root); 6066 btrfs_end_transaction(trans, root);
@@ -5999,7 +6110,8 @@ unlock:
5999 if (lockstart < lockend) { 6110 if (lockstart < lockend) {
6000 if (create && len < lockend - lockstart) { 6111 if (create && len < lockend - lockstart) {
6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6112 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6002 lockstart + len - 1, unlock_bits, 1, 0, 6113 lockstart + len - 1,
6114 unlock_bits | EXTENT_DEFRAG, 1, 0,
6003 &cached_state, GFP_NOFS); 6115 &cached_state, GFP_NOFS);
6004 /* 6116 /*
6005 * Beside unlock, we also need to cleanup reserved space 6117 * Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6119,8 @@ unlock:
6007 */ 6119 */
6008 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6120 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6009 lockstart + len, lockend, 6121 lockstart + len, lockend,
6010 unlock_bits | EXTENT_DO_ACCOUNTING, 6122 unlock_bits | EXTENT_DO_ACCOUNTING |
6011 1, 0, NULL, GFP_NOFS); 6123 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6012 } else { 6124 } else {
6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6125 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6014 lockend, unlock_bits, 1, 0, 6126 lockend, unlock_bits, 1, 0,
@@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6573 */ 6685 */
6574 clear_extent_bit(tree, page_start, page_end, 6686 clear_extent_bit(tree, page_start, page_end,
6575 EXTENT_DIRTY | EXTENT_DELALLOC | 6687 EXTENT_DIRTY | EXTENT_DELALLOC |
6576 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6688 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
6577 &cached_state, GFP_NOFS); 6689 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
6578 /* 6690 /*
6579 * whoever cleared the private bit is responsible 6691 * whoever cleared the private bit is responsible
6580 * for the finish_ordered_io 6692 * for the finish_ordered_io
@@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6590 } 6702 }
6591 clear_extent_bit(tree, page_start, page_end, 6703 clear_extent_bit(tree, page_start, page_end,
6592 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6704 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6593 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6705 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
6706 &cached_state, GFP_NOFS);
6594 __btrfs_releasepage(page, GFP_NOFS); 6707 __btrfs_releasepage(page, GFP_NOFS);
6595 6708
6596 ClearPageChecked(page); 6709 ClearPageChecked(page);
@@ -6687,7 +6800,8 @@ again:
6687 * prepare_pages in the normal write path. 6800 * prepare_pages in the normal write path.
6688 */ 6801 */
6689 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6802 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6690 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6803 EXTENT_DIRTY | EXTENT_DELALLOC |
6804 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
6691 0, 0, &cached_state, GFP_NOFS); 6805 0, 0, &cached_state, GFP_NOFS);
6692 6806
6693 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6807 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6832,7 @@ again:
6718 6832
6719 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6833 BTRFS_I(inode)->last_trans = root->fs_info->generation;
6720 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6834 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6835 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
6721 6836
6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6837 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6723 6838
@@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)
6745 u64 mask = root->sectorsize - 1; 6860 u64 mask = root->sectorsize - 1;
6746 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6861 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6747 6862
6748 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6863 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
6749 if (ret) 6864 if (ret)
6750 return ret; 6865 return ret;
6751 6866
@@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)
6788 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6903 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
6789 * updating the inode. 6904 * updating the inode.
6790 */ 6905 */
6791 rsv = btrfs_alloc_block_rsv(root); 6906 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
6792 if (!rsv) 6907 if (!rsv)
6793 return -ENOMEM; 6908 return -ENOMEM;
6794 rsv->size = min_size; 6909 rsv->size = min_size;
6910 rsv->failfast = 1;
6795 6911
6796 /* 6912 /*
6797 * 1 for the truncate slack space 6913 * 1 for the truncate slack space
@@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)
6837 &BTRFS_I(inode)->runtime_flags)) 6953 &BTRFS_I(inode)->runtime_flags))
6838 btrfs_add_ordered_operation(trans, root, inode); 6954 btrfs_add_ordered_operation(trans, root, inode);
6839 6955
6840 while (1) { 6956 /*
6841 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6957 * So if we truncate and then write and fsync we normally would just
6842 if (ret) { 6958 * write the extents that changed, which is a problem if we need to
6843 /* 6959 * first truncate that entire inode. So set this flag so we write out
6844 * This can only happen with the original transaction we 6960 * all of the extents in the inode to the sync log so we're completely
6845 * started above, every other time we shouldn't have a 6961 * safe.
6846 * transaction started yet. 6962 */
6847 */ 6963 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6848 if (ret == -EAGAIN) 6964 trans->block_rsv = rsv;
6849 goto end_trans;
6850 err = ret;
6851 break;
6852 }
6853
6854 if (!trans) {
6855 /* Just need the 1 for updating the inode */
6856 trans = btrfs_start_transaction(root, 1);
6857 if (IS_ERR(trans)) {
6858 ret = err = PTR_ERR(trans);
6859 trans = NULL;
6860 break;
6861 }
6862 }
6863
6864 trans->block_rsv = rsv;
6865 6965
6966 while (1) {
6866 ret = btrfs_truncate_inode_items(trans, root, inode, 6967 ret = btrfs_truncate_inode_items(trans, root, inode,
6867 inode->i_size, 6968 inode->i_size,
6868 BTRFS_EXTENT_DATA_KEY); 6969 BTRFS_EXTENT_DATA_KEY);
6869 if (ret != -EAGAIN) { 6970 if (ret != -ENOSPC) {
6870 err = ret; 6971 err = ret;
6871 break; 6972 break;
6872 } 6973 }
@@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)
6877 err = ret; 6978 err = ret;
6878 break; 6979 break;
6879 } 6980 }
6880end_trans: 6981
6881 nr = trans->blocks_used; 6982 nr = trans->blocks_used;
6882 btrfs_end_transaction(trans, root); 6983 btrfs_end_transaction(trans, root);
6883 trans = NULL;
6884 btrfs_btree_balance_dirty(root, nr); 6984 btrfs_btree_balance_dirty(root, nr);
6985
6986 trans = btrfs_start_transaction(root, 2);
6987 if (IS_ERR(trans)) {
6988 ret = err = PTR_ERR(trans);
6989 trans = NULL;
6990 break;
6991 }
6992
6993 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
6994 rsv, min_size);
6995 BUG_ON(ret); /* shouldn't happen */
6996 trans->block_rsv = rsv;
6885 } 6997 }
6886 6998
6887 if (ret == 0 && inode->i_nlink > 0) { 6999 if (ret == 0 && inode->i_nlink > 0) {
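
Stripped of the btrfs specifics, the reworked loop drains work in bounded passes and restarts its "transaction" whenever the per-pass reservation runs out. A runnable toy model of that control flow:

#include <errno.h>
#include <stdio.h>

/* Consume items until the per-pass budget is exhausted; -ENOSPC tells
 * the caller to end the pass and start a fresh one. */
static int truncate_pass(int *items, int budget)
{
        while (*items > 0) {
                if (budget-- == 0)
                        return -ENOSPC;
                (*items)--;
        }
        return 0;
}

int main(void)
{
        int items = 10, ret;

        do {
                /* each pass gets a fresh reservation, like the migrated
                 * block_rsv in the loop above */
                ret = truncate_pass(&items, 3);
        } while (ret == -ENOSPC);

        printf("done: ret=%d items=%d\n", ret, items);
        return 0;
}
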
@@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6965 ei->csum_bytes = 0; 7077 ei->csum_bytes = 0;
6966 ei->index_cnt = (u64)-1; 7078 ei->index_cnt = (u64)-1;
6967 ei->last_unlink_trans = 0; 7079 ei->last_unlink_trans = 0;
7080 ei->last_log_commit = 0;
6968 7081
6969 spin_lock_init(&ei->lock); 7082 spin_lock_init(&ei->lock);
6970 ei->outstanding_extents = 0; 7083 ei->outstanding_extents = 0;
@@ -7095,31 +7208,31 @@ void btrfs_destroy_cachep(void)
7095 7208
7096int btrfs_init_cachep(void) 7209int btrfs_init_cachep(void)
7097{ 7210{
7098 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7211 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7099 sizeof(struct btrfs_inode), 0, 7212 sizeof(struct btrfs_inode), 0,
7100 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7213 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7101 if (!btrfs_inode_cachep) 7214 if (!btrfs_inode_cachep)
7102 goto fail; 7215 goto fail;
7103 7216
7104 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7217 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7105 sizeof(struct btrfs_trans_handle), 0, 7218 sizeof(struct btrfs_trans_handle), 0,
7106 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7219 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7107 if (!btrfs_trans_handle_cachep) 7220 if (!btrfs_trans_handle_cachep)
7108 goto fail; 7221 goto fail;
7109 7222
7110 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7223 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7111 sizeof(struct btrfs_transaction), 0, 7224 sizeof(struct btrfs_transaction), 0,
7112 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7225 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7113 if (!btrfs_transaction_cachep) 7226 if (!btrfs_transaction_cachep)
7114 goto fail; 7227 goto fail;
7115 7228
7116 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7229 btrfs_path_cachep = kmem_cache_create("btrfs_path",
7117 sizeof(struct btrfs_path), 0, 7230 sizeof(struct btrfs_path), 0,
7118 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7231 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7119 if (!btrfs_path_cachep) 7232 if (!btrfs_path_cachep)
7120 goto fail; 7233 goto fail;
7121 7234
7122 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7235 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7123 sizeof(struct btrfs_free_space), 0, 7236 sizeof(struct btrfs_free_space), 0,
7124 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7237 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7125 if (!btrfs_free_space_cachep) 7238 if (!btrfs_free_space_cachep)
@@ -7513,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7513 loff_t actual_len, u64 *alloc_hint, 7626 loff_t actual_len, u64 *alloc_hint,
7514 struct btrfs_trans_handle *trans) 7627 struct btrfs_trans_handle *trans)
7515{ 7628{
7629 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
7630 struct extent_map *em;
7516 struct btrfs_root *root = BTRFS_I(inode)->root; 7631 struct btrfs_root *root = BTRFS_I(inode)->root;
7517 struct btrfs_key ins; 7632 struct btrfs_key ins;
7518 u64 cur_offset = start; 7633 u64 cur_offset = start;
@@ -7553,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7553 btrfs_drop_extent_cache(inode, cur_offset, 7668 btrfs_drop_extent_cache(inode, cur_offset,
7554 cur_offset + ins.offset -1, 0); 7669 cur_offset + ins.offset -1, 0);
7555 7670
7671 em = alloc_extent_map();
7672 if (!em) {
7673 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
7674 &BTRFS_I(inode)->runtime_flags);
7675 goto next;
7676 }
7677
7678 em->start = cur_offset;
7679 em->orig_start = cur_offset;
7680 em->len = ins.offset;
7681 em->block_start = ins.objectid;
7682 em->block_len = ins.offset;
7683 em->bdev = root->fs_info->fs_devices->latest_bdev;
7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7685 em->generation = trans->transid;
7686
7687 while (1) {
7688 write_lock(&em_tree->lock);
7689 ret = add_extent_mapping(em_tree, em);
7690 if (!ret)
7691 list_move(&em->list,
7692 &em_tree->modified_extents);
7693 write_unlock(&em_tree->lock);
7694 if (ret != -EEXIST)
7695 break;
7696 btrfs_drop_extent_cache(inode, cur_offset,
7697 cur_offset + ins.offset - 1,
7698 0);
7699 }
7700 free_extent_map(em);
7701next:
7556 num_bytes -= ins.offset; 7702 num_bytes -= ins.offset;
7557 cur_offset += ins.offset; 7703 cur_offset += ins.offset;
7558 *alloc_hint = ins.objectid + ins.offset; 7704 *alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47127c1bd290..61168805f175 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
181 int ret; 181 int ret;
182 u64 ip_oldflags; 182 u64 ip_oldflags;
183 unsigned int i_oldflags; 183 unsigned int i_oldflags;
184 umode_t mode;
184 185
185 if (btrfs_root_readonly(root)) 186 if (btrfs_root_readonly(root))
186 return -EROFS; 187 return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 204
204 ip_oldflags = ip->flags; 205 ip_oldflags = ip->flags;
205 i_oldflags = inode->i_flags; 206 i_oldflags = inode->i_flags;
207 mode = inode->i_mode;
206 208
207 flags = btrfs_mask_flags(inode->i_mode, flags); 209 flags = btrfs_mask_flags(inode->i_mode, flags);
208 oldflags = btrfs_flags_to_ioctl(ip->flags); 210 oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
237 ip->flags |= BTRFS_INODE_DIRSYNC; 239 ip->flags |= BTRFS_INODE_DIRSYNC;
238 else 240 else
239 ip->flags &= ~BTRFS_INODE_DIRSYNC; 241 ip->flags &= ~BTRFS_INODE_DIRSYNC;
240 if (flags & FS_NOCOW_FL) 242 if (flags & FS_NOCOW_FL) {
241 ip->flags |= BTRFS_INODE_NODATACOW; 243 if (S_ISREG(mode)) {
242 else 244 /*
243 ip->flags &= ~BTRFS_INODE_NODATACOW; 245 * It's safe to turn csums off here: no extents exist yet.
246 * Otherwise we want the flag to reflect the real COW
247 * status of the file, so we do not set it.
248 */
249 if (inode->i_size == 0)
250 ip->flags |= BTRFS_INODE_NODATACOW
251 | BTRFS_INODE_NODATASUM;
252 } else {
253 ip->flags |= BTRFS_INODE_NODATACOW;
254 }
255 } else {
256 /*
257 * Revert back under the same assumptions as above
258 */
259 if (S_ISREG(mode)) {
260 if (inode->i_size == 0)
261 ip->flags &= ~(BTRFS_INODE_NODATACOW
262 | BTRFS_INODE_NODATASUM);
263 } else {
264 ip->flags &= ~BTRFS_INODE_NODATACOW;
265 }
266 }
244 267
245 /* 268 /*
246 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 269 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
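
The rule the two branches implement can be stated compactly: NODATASUM may only be toggled together with NODATACOW on empty regular files, since checksums for existing extents cannot be added or dropped after the fact. A standalone sketch (flag values are illustrative):

#include <stdbool.h>
#include <stdint.h>

#define FL_NODATACOW (1u << 0)
#define FL_NODATASUM (1u << 1)

static uint32_t apply_nocow(uint32_t flags, bool want_nocow,
                            bool is_reg, uint64_t i_size)
{
        uint32_t bits = FL_NODATACOW;

        /* csums may only change while no extents exist */
        if (is_reg && i_size == 0)
                bits |= FL_NODATASUM;
        else if (is_reg)
                return flags; /* leave the real COW status visible */

        return want_nocow ? (flags | bits) : (flags & ~bits);
}
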
@@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
516 if (!pending_snapshot) 539 if (!pending_snapshot)
517 return -ENOMEM; 540 return -ENOMEM;
518 541
519 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 542 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
543 BTRFS_BLOCK_RSV_TEMP);
520 pending_snapshot->dentry = dentry; 544 pending_snapshot->dentry = dentry;
521 pending_snapshot->root = root; 545 pending_snapshot->root = root;
522 pending_snapshot->readonly = readonly; 546 pending_snapshot->readonly = readonly;
@@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
525 *inherit = NULL; /* take responsibility to free it */ 549 *inherit = NULL; /* take responsibility to free it */
526 } 550 }
527 551
528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 552 trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
529 if (IS_ERR(trans)) { 553 if (IS_ERR(trans)) {
530 ret = PTR_ERR(trans); 554 ret = PTR_ERR(trans);
531 goto fail; 555 goto fail;
@@ -614,7 +638,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
614 return -ENOENT; 638 return -ENOENT;
615 639
616 BUG_ON(victim->d_parent->d_inode != dir); 640 BUG_ON(victim->d_parent->d_inode != dir);
617 audit_inode_child(victim, dir); 641 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
618 642
619 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 643 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
620 if (error) 644 if (error)
@@ -1022,8 +1046,8 @@ again:
1022 page_start, page_end - 1, 0, &cached_state); 1046 page_start, page_end - 1, 0, &cached_state);
1023 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1047 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1024 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1048 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1025 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1049 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1026 GFP_NOFS); 1050 &cached_state, GFP_NOFS);
1027 1051
1028 if (i_done != page_cnt) { 1052 if (i_done != page_cnt) {
1029 spin_lock(&BTRFS_I(inode)->lock); 1053 spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1058,8 @@ again:
1034 } 1058 }
1035 1059
1036 1060
1037 btrfs_set_extent_delalloc(inode, page_start, page_end - 1, 1061 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
1038 &cached_state); 1062 &cached_state, GFP_NOFS);
1039 1063
1040 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1064 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1041 page_start, page_end - 1, &cached_state, 1065 page_start, page_end - 1, &cached_state,
@@ -2351,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2351 int ret; 2375 int ret;
2352 u64 len = olen; 2376 u64 len = olen;
2353 u64 bs = root->fs_info->sb->s_blocksize; 2377 u64 bs = root->fs_info->sb->s_blocksize;
2354 u64 hint_byte;
2355 2378
2356 /* 2379 /*
2357 * TODO: 2380 * TODO:
@@ -2456,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2456 another, and lock file content */ 2479 another, and lock file content */
2457 while (1) { 2480 while (1) {
2458 struct btrfs_ordered_extent *ordered; 2481 struct btrfs_ordered_extent *ordered;
2459 lock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2482 lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2460 ordered = btrfs_lookup_first_ordered_extent(src, off+len); 2483 ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
2461 if (!ordered && 2484 if (!ordered &&
2462 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, 2485 !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
2463 EXTENT_DELALLOC, 0, NULL)) 2486 EXTENT_DELALLOC, 0, NULL))
2464 break; 2487 break;
2465 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2488 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2466 if (ordered) 2489 if (ordered)
2467 btrfs_put_ordered_extent(ordered); 2490 btrfs_put_ordered_extent(ordered);
2468 btrfs_wait_ordered_range(src, off, len); 2491 btrfs_wait_ordered_range(src, off, len);
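
The repeated "+ len - 1" fixes all stem from one convention: the io_tree range API treats the end offset as inclusive, so locking len bytes from off must use off + len - 1 or it spills one byte into the neighbouring range. In one line:

#include <stdint.h>

/* inclusive end of a len-byte range starting at off */
static inline uint64_t range_end(uint64_t off, uint64_t len)
{
        return off + len - 1;
}
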
@@ -2536,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2536 btrfs_release_path(path); 2559 btrfs_release_path(path);
2537 2560
2538 if (key.offset + datal <= off || 2561 if (key.offset + datal <= off ||
2539 key.offset >= off+len) 2562 key.offset >= off + len - 1)
2540 goto next; 2563 goto next;
2541 2564
2542 memcpy(&new_key, &key, sizeof(new_key)); 2565 memcpy(&new_key, &key, sizeof(new_key));
@@ -2574,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2574 datal -= off - key.offset; 2597 datal -= off - key.offset;
2575 } 2598 }
2576 2599
2577 ret = btrfs_drop_extents(trans, inode, 2600 ret = btrfs_drop_extents(trans, root, inode,
2578 new_key.offset, 2601 new_key.offset,
2579 new_key.offset + datal, 2602 new_key.offset + datal,
2580 &hint_byte, 1); 2603 1);
2581 if (ret) { 2604 if (ret) {
2582 btrfs_abort_transaction(trans, root, 2605 btrfs_abort_transaction(trans, root,
2583 ret); 2606 ret);
@@ -2637,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2637 new_key.offset += skip; 2660 new_key.offset += skip;
2638 } 2661 }
2639 2662
2640 if (key.offset + datal > off+len) 2663 if (key.offset + datal > off + len)
2641 trim = key.offset + datal - (off+len); 2664 trim = key.offset + datal - (off + len);
2642 2665
2643 if (comp && (skip || trim)) { 2666 if (comp && (skip || trim)) {
2644 ret = -EINVAL; 2667 ret = -EINVAL;
@@ -2648,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2648 size -= skip + trim; 2671 size -= skip + trim;
2649 datal -= skip + trim; 2672 datal -= skip + trim;
2650 2673
2651 ret = btrfs_drop_extents(trans, inode, 2674 ret = btrfs_drop_extents(trans, root, inode,
2652 new_key.offset, 2675 new_key.offset,
2653 new_key.offset + datal, 2676 new_key.offset + datal,
2654 &hint_byte, 1); 2677 1);
2655 if (ret) { 2678 if (ret) {
2656 btrfs_abort_transaction(trans, root, 2679 btrfs_abort_transaction(trans, root,
2657 ret); 2680 ret);
@@ -2715,7 +2738,7 @@ next:
2715 ret = 0; 2738 ret = 0;
2716out: 2739out:
2717 btrfs_release_path(path); 2740 btrfs_release_path(path);
2718 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2741 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2719out_unlock: 2742out_unlock:
2720 mutex_unlock(&src->i_mutex); 2743 mutex_unlock(&src->i_mutex);
2721 mutex_unlock(&inode->i_mutex); 2744 mutex_unlock(&inode->i_mutex);
@@ -2850,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2850 return 0; 2873 return 0;
2851} 2874}
2852 2875
2853static void get_block_group_info(struct list_head *groups_list, 2876void btrfs_get_block_group_info(struct list_head *groups_list,
2854 struct btrfs_ioctl_space_info *space) 2877 struct btrfs_ioctl_space_info *space)
2855{ 2878{
2856 struct btrfs_block_group_cache *block_group; 2879 struct btrfs_block_group_cache *block_group;
2857 2880
@@ -2959,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2959 down_read(&info->groups_sem); 2982 down_read(&info->groups_sem);
2960 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 2983 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2961 if (!list_empty(&info->block_groups[c])) { 2984 if (!list_empty(&info->block_groups[c])) {
2962 get_block_group_info(&info->block_groups[c], 2985 btrfs_get_block_group_info(
2963 &space); 2986 &info->block_groups[c], &space);
2964 memcpy(dest, &space, sizeof(space)); 2987 memcpy(dest, &space, sizeof(space));
2965 dest++; 2988 dest++;
2966 space_args.total_spaces++; 2989 space_args.total_spaces++;
@@ -3208,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3208{ 3231{
3209 int ret = 0; 3232 int ret = 0;
3210 int size; 3233 int size;
3211 u64 extent_item_pos;
3212 struct btrfs_ioctl_logical_ino_args *loi; 3234 struct btrfs_ioctl_logical_ino_args *loi;
3213 struct btrfs_data_container *inodes = NULL; 3235 struct btrfs_data_container *inodes = NULL;
3214 struct btrfs_path *path = NULL; 3236 struct btrfs_path *path = NULL;
3215 struct btrfs_key key;
3216 3237
3217 if (!capable(CAP_SYS_ADMIN)) 3238 if (!capable(CAP_SYS_ADMIN))
3218 return -EPERM; 3239 return -EPERM;
@@ -3230,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3230 goto out; 3251 goto out;
3231 } 3252 }
3232 3253
3233 size = min_t(u32, loi->size, 4096); 3254 size = min_t(u32, loi->size, 64 * 1024);
3234 inodes = init_data_container(size); 3255 inodes = init_data_container(size);
3235 if (IS_ERR(inodes)) { 3256 if (IS_ERR(inodes)) {
3236 ret = PTR_ERR(inodes); 3257 ret = PTR_ERR(inodes);
@@ -3238,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3238 goto out; 3259 goto out;
3239 } 3260 }
3240 3261
3241 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3262 ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
3242 btrfs_release_path(path); 3263 build_ino_list, inodes);
3243 3264 if (ret == -EINVAL)
3244 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3245 ret = -ENOENT; 3265 ret = -ENOENT;
3246 if (ret < 0) 3266 if (ret < 0)
3247 goto out; 3267 goto out;
3248 3268
3249 extent_item_pos = loi->logical - key.objectid;
3250 ret = iterate_extent_inodes(root->fs_info, key.objectid,
3251 extent_item_pos, 0, build_ino_list,
3252 inodes);
3253
3254 if (ret < 0)
3255 goto out;
3256
3257 ret = copy_to_user((void *)(unsigned long)loi->inodes, 3269 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3258 (void *)(unsigned long)inodes, size); 3270 (void *)(unsigned long)inodes, size);
3259 if (ret) 3271 if (ret)
@@ -3261,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3261 3273
3262out: 3274out:
3263 btrfs_free_path(path); 3275 btrfs_free_path(path);
3264 kfree(inodes); 3276 vfree(inodes);
3265 kfree(loi); 3277 kfree(loi);
3266 3278
3267 return ret; 3279 return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551dd..7772f02ba28e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27 27
28static struct kmem_cache *btrfs_ordered_extent_cache;
29
28static u64 entry_end(struct btrfs_ordered_extent *entry) 30static u64 entry_end(struct btrfs_ordered_extent *entry)
29{ 31{
30 if (entry->file_offset + entry->len < entry->file_offset) 32 if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
187 struct btrfs_ordered_extent *entry; 189 struct btrfs_ordered_extent *entry;
188 190
189 tree = &BTRFS_I(inode)->ordered_tree; 191 tree = &BTRFS_I(inode)->ordered_tree;
190 entry = kzalloc(sizeof(*entry), GFP_NOFS); 192 entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
191 if (!entry) 193 if (!entry)
192 return -ENOMEM; 194 return -ENOMEM;
193 195
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
421 list_del(&sum->list); 423 list_del(&sum->list);
422 kfree(sum); 424 kfree(sum);
423 } 425 }
424 kfree(entry); 426 kmem_cache_free(btrfs_ordered_extent_cache, entry);
425 } 427 }
426} 428}
427 429
@@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
466 * wait for all the ordered extents in a root. This is done when balancing 468 * wait for all the ordered extents in a root. This is done when balancing
467 * space between drives. 469 * space between drives.
468 */ 470 */
469void btrfs_wait_ordered_extents(struct btrfs_root *root, 471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
470 int nocow_only, int delay_iput)
471{ 472{
472 struct list_head splice; 473 struct list_head splice;
473 struct list_head *cur; 474 struct list_head *cur;
@@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
482 cur = splice.next; 483 cur = splice.next;
483 ordered = list_entry(cur, struct btrfs_ordered_extent, 484 ordered = list_entry(cur, struct btrfs_ordered_extent,
484 root_extent_list); 485 root_extent_list);
485 if (nocow_only &&
486 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
487 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
488 list_move(&ordered->root_extent_list,
489 &root->fs_info->ordered_extents);
490 cond_resched_lock(&root->fs_info->ordered_extent_lock);
491 continue;
492 }
493
494 list_del_init(&ordered->root_extent_list); 486 list_del_init(&ordered->root_extent_list);
495 atomic_inc(&ordered->refs); 487 atomic_inc(&ordered->refs);
496 488
@@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
775 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 767 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
776 u64 disk_i_size; 768 u64 disk_i_size;
777 u64 new_i_size; 769 u64 new_i_size;
778 u64 i_size_test;
779 u64 i_size = i_size_read(inode); 770 u64 i_size = i_size_read(inode);
780 struct rb_node *node; 771 struct rb_node *node;
781 struct rb_node *prev = NULL; 772 struct rb_node *prev = NULL;
@@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
835 break; 826 break;
836 if (test->file_offset >= i_size) 827 if (test->file_offset >= i_size)
837 break; 828 break;
838 if (test->file_offset >= disk_i_size) 829 if (test->file_offset >= disk_i_size) {
830 /*
831 * we don't update disk_i_size now, so record this
832 * pending i_size; otherwise we will not know the real
833 * i_size.
834 */
835 if (test->outstanding_isize < offset)
836 test->outstanding_isize = offset;
837 if (ordered &&
838 ordered->outstanding_isize >
839 test->outstanding_isize)
840 test->outstanding_isize =
841 ordered->outstanding_isize;
839 goto out; 842 goto out;
840 }
841 new_i_size = min_t(u64, offset, i_size);
842
843 /*
844 * at this point, we know we can safely update i_size to at least
845 * the offset from this ordered extent. But, we need to
846 * walk forward and see if ios from higher up in the file have
847 * finished.
848 */
849 if (ordered) {
850 node = rb_next(&ordered->rb_node);
851 } else {
852 if (prev)
853 node = rb_next(prev);
854 else
855 node = rb_first(&tree->tree);
856 }
857
858 /*
859 * We are looking for an area between our current extent and the next
860 * ordered extent to update the i_size to. There are 3 cases here
861 *
862 * 1) We don't actually have anything and we can update to i_size.
863 * 2) We have stuff but they already did their i_size update so again we
864 * can just update to i_size.
865 * 3) We have an outstanding ordered extent so the most we can update
866 * our disk_i_size to is the start of the next offset.
867 */
868 i_size_test = i_size;
869 for (; node; node = rb_next(node)) {
870 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
871
872 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
873 continue;
874 if (test->file_offset > offset) {
875 i_size_test = test->file_offset;
876 break;
877 } 843 }
878 } 844 }
845 new_i_size = min_t(u64, offset, i_size);
879 846
880 /* 847 /*
881 * i_size_test is the end of a region after this ordered 848 * Some ordered extents may have completed before the current one, and
882 * extent where there are no ordered extents, we can safely set 849 * we hold the real i_size in ->outstanding_isize.
883 * disk_i_size to this.
884 */ 850 */
885 if (i_size_test > offset) 851 if (ordered && ordered->outstanding_isize > new_i_size)
886 new_i_size = min_t(u64, i_size_test, i_size); 852 new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
887 BTRFS_I(inode)->disk_i_size = new_i_size; 853 BTRFS_I(inode)->disk_i_size = new_i_size;
888 ret = 0; 854 ret = 0;
889out: 855out:
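
The forward walk that was removed is replaced by a running maximum: each completing extent that cannot move disk_i_size folds its end offset into the next pending extent's outstanding_isize, and whichever extent finally closes the gap applies the accumulated value. A standalone model of that bookkeeping:

#include <stdint.h>

/* fold one completed extent's end offset into the pending maximum */
static void record_outstanding(uint64_t *outstanding, uint64_t end)
{
        if (*outstanding < end)
                *outstanding = end;
}

/* when the gap closes, disk_i_size may advance to the larger of the
 * current end and everything recorded so far (capped by i_size) */
static uint64_t new_disk_i_size(uint64_t end, uint64_t outstanding,
                                uint64_t i_size)
{
        uint64_t v = end > outstanding ? end : outstanding;
        return v < i_size ? v : i_size;
}
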
@@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
984 } 950 }
985 spin_unlock(&root->fs_info->ordered_extent_lock); 951 spin_unlock(&root->fs_info->ordered_extent_lock);
986} 952}
953
954int __init ordered_data_init(void)
955{
956 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
957 sizeof(struct btrfs_ordered_extent), 0,
958 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
959 NULL);
960 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM;
962 return 0;
963}
964
965void ordered_data_exit(void)
966{
967 if (btrfs_ordered_extent_cache)
968 kmem_cache_destroy(btrfs_ordered_extent_cache);
969}
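
The new cache needs a matching call site at module load and unload; this hunk only adds the helpers, so the wiring below is an assumed sketch of how super.c presumably invokes them, not code from this patch:

/* assumed wiring: create the cache before any ordered extent can be
 * allocated, destroy it after the filesystem is unregistered */
static int init_btrfs_fs_sketch(void)
{
        int err = ordered_data_init();
        if (err)
                return err;
        /* ... register filesystem, remaining caches ... */
        return 0;
}

static void exit_btrfs_fs_sketch(void)
{
        ordered_data_exit();
}
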
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d2997..dd27a0b46a37 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 96 /* number of bytes that still need writing */
97 u64 bytes_left; 97 u64 bytes_left;
98 98
99 /*
100 * the end of any ordered extent that is behind this one but
101 * didn't update disk_i_size. See the comment of
102 * btrfs_ordered_update_i_size();
103 */
104 u64 outstanding_isize;
105
99 /* flags (described above) */ 106 /* flags (described above) */
100 unsigned long flags; 107 unsigned long flags;
101 108
@@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
183void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root, 191 struct btrfs_root *root,
185 struct inode *inode); 192 struct inode *inode);
186void btrfs_wait_ordered_extents(struct btrfs_root *root, 193void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
187 int nocow_only, int delay_iput); 194int __init ordered_data_init(void);
195void ordered_data_exit(void);
188#endif 196#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b65015581744..5039686df6ae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1145 1145
1146 ulist_reinit(tmp); 1146 ulist_reinit(tmp);
1147 /* XXX id not needed */ 1147 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1148 ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter); 1149 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist; 1151 struct btrfs_qgroup_list *glist;
1152 1152
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1153 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1154 if (qg->refcnt < seq) 1154 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1; 1155 qg->refcnt = seq + 1;
1156 else 1156 else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1158 1158
1159 list_for_each_entry(glist, &qg->groups, next_group) { 1159 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid, 1160 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group, 1161 (u64)(uintptr_t)glist->group,
1162 GFP_ATOMIC); 1162 GFP_ATOMIC);
1163 } 1163 }
1164 } 1164 }
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1168 * step 2: walk from the new root 1168 * step 2: walk from the new root
1169 */ 1169 */
1170 ulist_reinit(tmp); 1170 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1171 ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter); 1172 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) { 1173 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg; 1174 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist; 1175 struct btrfs_qgroup_list *glist;
1176 1176
1177 qg = (struct btrfs_qgroup *)unode->aux; 1177 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1178 if (qg->refcnt < seq) { 1178 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */ 1179 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes; 1180 qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1190 1190
1191 list_for_each_entry(glist, &qg->groups, next_group) { 1191 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid, 1192 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC); 1193 (uintptr_t)glist->group, GFP_ATOMIC);
1194 } 1194 }
1195 } 1195 }
1196 1196
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1208 continue; 1208 continue;
1209 1209
1210 ulist_reinit(tmp); 1210 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1211 ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter); 1212 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist; 1214 struct btrfs_qgroup_list *glist;
1215 1215
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1216 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1217 if (qg->tag == seq) 1217 if (qg->tag == seq)
1218 continue; 1218 continue;
1219 1219
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1225 1225
1226 list_for_each_entry(glist, &qg->groups, next_group) { 1226 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid, 1227 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group, 1228 (uintptr_t)glist->group,
1229 GFP_ATOMIC); 1229 GFP_ATOMIC);
1230 } 1230 }
1231 } 1231 }
@@ -1469,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1469 * be exceeded 1469 * be exceeded
1470 */ 1470 */
1471 ulist = ulist_alloc(GFP_ATOMIC); 1471 ulist = ulist_alloc(GFP_ATOMIC);
1472 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1472 if (!ulist) {
1473 ret = -ENOMEM;
1474 goto out;
1475 }
1476 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1473 ULIST_ITER_INIT(&uiter); 1477 ULIST_ITER_INIT(&uiter);
1474 while ((unode = ulist_next(ulist, &uiter))) { 1478 while ((unode = ulist_next(ulist, &uiter))) {
1475 struct btrfs_qgroup *qg; 1479 struct btrfs_qgroup *qg;
1476 struct btrfs_qgroup_list *glist; 1480 struct btrfs_qgroup_list *glist;
1477 1481
1478 qg = (struct btrfs_qgroup *)unode->aux; 1482 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1479 1483
1480 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 1484 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1481 qg->reserved + qg->rfer + num_bytes > 1485 qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1489 1493
1490 list_for_each_entry(glist, &qg->groups, next_group) { 1494 list_for_each_entry(glist, &qg->groups, next_group) {
1491 ulist_add(ulist, glist->group->qgroupid, 1495 ulist_add(ulist, glist->group->qgroupid,
1492 (unsigned long)glist->group, GFP_ATOMIC); 1496 (uintptr_t)glist->group, GFP_ATOMIC);
1493 } 1497 }
1494 } 1498 }
1495 if (ret) 1499 if (ret)
@@ -1502,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1502 while ((unode = ulist_next(ulist, &uiter))) { 1506 while ((unode = ulist_next(ulist, &uiter))) {
1503 struct btrfs_qgroup *qg; 1507 struct btrfs_qgroup *qg;
1504 1508
1505 qg = (struct btrfs_qgroup *)unode->aux; 1509 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1506 1510
1507 qg->reserved += num_bytes; 1511 qg->reserved += num_bytes;
1508 } 1512 }
@@ -1541,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1541 goto out; 1545 goto out;
1542 1546
1543 ulist = ulist_alloc(GFP_ATOMIC); 1547 ulist = ulist_alloc(GFP_ATOMIC);
1544 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1548 if (!ulist) {
1549 btrfs_std_error(fs_info, -ENOMEM);
1550 goto out;
1551 }
1552 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1545 ULIST_ITER_INIT(&uiter); 1553 ULIST_ITER_INIT(&uiter);
1546 while ((unode = ulist_next(ulist, &uiter))) { 1554 while ((unode = ulist_next(ulist, &uiter))) {
1547 struct btrfs_qgroup *qg; 1555 struct btrfs_qgroup *qg;
1548 struct btrfs_qgroup_list *glist; 1556 struct btrfs_qgroup_list *glist;
1549 1557
1550 qg = (struct btrfs_qgroup *)unode->aux; 1558 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1551 1559
1552 qg->reserved -= num_bytes; 1560 qg->reserved -= num_bytes;
1553 1561
1554 list_for_each_entry(glist, &qg->groups, next_group) { 1562 list_for_each_entry(glist, &qg->groups, next_group) {
1555 ulist_add(ulist, glist->group->qgroupid, 1563 ulist_add(ulist, glist->group->qgroupid,
1556 (unsigned long)glist->group, GFP_ATOMIC); 1564 (uintptr_t)glist->group, GFP_ATOMIC);
1557 } 1565 }
1558 } 1566 }
1559 1567
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004d..776f0aa128fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3270 key.offset = 0; 3270 key.offset = 0;
3271 3271
3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3273 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { 3273 if (IS_ERR(inode) || is_bad_inode(inode)) {
3274 if (inode && !IS_ERR(inode)) 3274 if (!IS_ERR(inode))
3275 iput(inode); 3275 iput(inode);
3276 return -ENOENT; 3276 return -ENOENT;
3277 } 3277 }
@@ -3621,7 +3621,7 @@ next:
3621 3621
3622 ret = find_first_extent_bit(&rc->processed_blocks, 3622 ret = find_first_extent_bit(&rc->processed_blocks,
3623 key.objectid, &start, &end, 3623 key.objectid, &start, &end,
3624 EXTENT_DIRTY); 3624 EXTENT_DIRTY, NULL);
3625 3625
3626 if (ret == 0 && start <= key.objectid) { 3626 if (ret == 0 && start <= key.objectid) {
3627 btrfs_release_path(path); 3627 btrfs_release_path(path);
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3674 struct btrfs_trans_handle *trans; 3674 struct btrfs_trans_handle *trans;
3675 int ret; 3675 int ret;
3676 3676
3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); 3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3678 BTRFS_BLOCK_RSV_TEMP);
3678 if (!rc->block_rsv) 3679 if (!rc->block_rsv)
3679 return -ENOMEM; 3680 return -ENOMEM;
3680 3681
@@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->flags); 4058 (unsigned long long)rc->block_group->flags);
4058 4059
4059 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4060 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4061 4062
4062 while (1) { 4063 while (1) {
4063 mutex_lock(&fs_info->cleaner_mutex); 4064 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d88071..eb923d087da7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
141 return -ENOMEM; 141 return -ENOMEM;
142 142
143 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
144 if (ret < 0) 144 if (ret < 0) {
145 goto out_abort; 145 btrfs_abort_transaction(trans, root, ret);
146 goto out;
147 }
146 148
147 if (ret != 0) { 149 if (ret != 0) {
148 btrfs_print_leaf(root, path->nodes[0]); 150 btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
166 btrfs_release_path(path); 168 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path, 169 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1); 170 -1, 1);
169 if (ret < 0) 171 if (ret < 0) {
170 goto out_abort; 172 btrfs_abort_transaction(trans, root, ret);
173 goto out;
174 }
175
171 ret = btrfs_del_item(trans, root, path); 176 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0) 177 if (ret < 0) {
173 goto out_abort; 178 btrfs_abort_transaction(trans, root, ret);
179 goto out;
180 }
174 btrfs_release_path(path); 181 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path, 182 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item)); 183 key, sizeof(*item));
177 if (ret < 0) 184 if (ret < 0) {
178 goto out_abort; 185 btrfs_abort_transaction(trans, root, ret);
186 goto out;
187 }
179 l = path->nodes[0]; 188 l = path->nodes[0];
180 slot = path->slots[0]; 189 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot); 190 ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
192out: 201out:
193 btrfs_free_path(path); 202 btrfs_free_path(path);
194 return ret; 203 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
199} 204}
200 205
201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 206int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a6..27892f67e69b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
352 struct extent_buffer *eb; 352 struct extent_buffer *eb;
353 struct btrfs_extent_item *ei; 353 struct btrfs_extent_item *ei;
354 struct scrub_warning swarn; 354 struct scrub_warning swarn;
355 u32 item_size; 355 unsigned long ptr = 0;
356 int ret; 356 u64 extent_item_pos;
357 u64 flags = 0;
357 u64 ref_root; 358 u64 ref_root;
359 u32 item_size;
358 u8 ref_level; 360 u8 ref_level;
359 unsigned long ptr = 0;
360 const int bufsize = 4096; 361 const int bufsize = 4096;
361 u64 extent_item_pos; 362 int ret;
362 363
363 path = btrfs_alloc_path(); 364 path = btrfs_alloc_path();
364 365
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
375 if (!path || !swarn.scratch_buf || !swarn.msg_buf) 376 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
376 goto out; 377 goto out;
377 378
378 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); 379 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
380 &flags);
379 if (ret < 0) 381 if (ret < 0)
380 goto out; 382 goto out;
381 383
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
387 item_size = btrfs_item_size_nr(eb, path->slots[0]); 389 item_size = btrfs_item_size_nr(eb, path->slots[0]);
388 btrfs_release_path(path); 390 btrfs_release_path(path);
389 391
390 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 392 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
391 do { 393 do {
392 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 394 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
393 &ref_root, &ref_level); 395 &ref_root, &ref_level);
@@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1029 spin_lock(&sdev->stat_lock); 1031 spin_lock(&sdev->stat_lock);
1030 sdev->stat.malloc_errors++; 1032 sdev->stat.malloc_errors++;
1031 spin_unlock(&sdev->stat_lock); 1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1032 return -ENOMEM; 1035 return -ENOMEM;
1033 } 1036 }
1034 sblock->page_count++; 1037 sblock->page_count++;
@@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1666 scrub_block_put(sblock); 1669 scrub_block_put(sblock);
1667 } 1670 }
1668 1671
1669 if (sbio->err) {
1670 /* what is this good for??? */
1671 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1672 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
1673 sbio->bio->bi_phys_segments = 0;
1674 sbio->bio->bi_idx = 0;
1675
1676 for (i = 0; i < sbio->page_count; i++) {
1677 struct bio_vec *bi;
1678 bi = &sbio->bio->bi_io_vec[i];
1679 bi->bv_offset = 0;
1680 bi->bv_len = PAGE_SIZE;
1681 }
1682 }
1683
1684 bio_put(sbio->bio); 1672 bio_put(sbio->bio);
1685 sbio->bio = NULL; 1673 sbio->bio = NULL;
1686 spin_lock(&sdev->list_lock); 1674 spin_lock(&sdev->list_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f869..c7beb543a4a8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
107 int cur_inode_new; 107 int cur_inode_new;
108 int cur_inode_new_gen; 108 int cur_inode_new_gen;
109 int cur_inode_deleted; 109 int cur_inode_deleted;
110 int cur_inode_first_ref_orphan;
111 u64 cur_inode_size; 110 u64 cur_inode_size;
112 u64 cur_inode_mode; 111 u64 cur_inode_mode;
113 112
@@ -126,7 +125,15 @@ struct send_ctx {
126 125
127struct name_cache_entry { 126struct name_cache_entry {
128 struct list_head list; 127 struct list_head list;
129 struct list_head use_list; 128 /*
129 * radix_tree has only 32bit entries but we need to handle 64bit inums.
130 * We use the lower 32bit of the 64bit inum to store it in the tree. If
131 * more than one inum would fall into the same entry, we use radix_list
132 * to store the additional entries. radix_list is also used to store
133 * entries that have the same inum but different
134 * generations.
135 */
136 struct list_head radix_list;
130 u64 ino; 137 u64 ino;
131 u64 gen; 138 u64 gen;
132 u64 parent_ino; 139 u64 parent_ino;
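
The comment above describes a classic trick for keying a 32-bit-indexed structure with 64-bit values; a standalone sketch of the lookup side, with the collision chain standing in for radix_list:

#include <stddef.h>
#include <stdint.h>

struct nce {                /* stand-in for name_cache_entry */
        struct nce *next;   /* collision chain, like radix_list */
        uint64_t ino;
        uint64_t gen;
};

/* the tree slot is found via the truncated key first ... */
static uint32_t slot_key(uint64_t ino)
{
        return (uint32_t)ino;
}

/* ... then the chain disambiguates full inum and generation */
static struct nce *chain_lookup(struct nce *head, uint64_t ino, uint64_t gen)
{
        for (struct nce *e = head; e; e = e->next)
                if (e->ino == ino && e->gen == gen)
                        return e;
        return NULL;
}
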
@@ -328,6 +335,7 @@ out:
328 return ret; 335 return ret;
329} 336}
330 337
338#if 0
331static void fs_path_remove(struct fs_path *p) 339static void fs_path_remove(struct fs_path *p)
332{ 340{
333 BUG_ON(p->reversed); 341 BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
335 p->end--; 343 p->end--;
336 *p->end = 0; 344 *p->end = 0;
337} 345}
346#endif
338 347
339static int fs_path_copy(struct fs_path *p, struct fs_path *from) 348static int fs_path_copy(struct fs_path *p, struct fs_path *from)
340{ 349{
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
377 return path; 386 return path;
378} 387}
379 388
380static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) 389int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
381{ 390{
382 int ret; 391 int ret;
383 mm_segment_t old_fs; 392 mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
387 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
388 397
389 while (pos < len) { 398 while (pos < len) {
390 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, 399 ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
391 &sctx->send_off);
392 /* TODO handle that correctly */ 400 /* TODO handle that correctly */
393 /*if (ret == -ERESTARTSYS) { 401 /*if (ret == -ERESTARTSYS) {
394 continue; 402 continue;
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
544 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 552 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
545 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); 553 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
546 554
547 return write_buf(sctx, &hdr, sizeof(hdr)); 555 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
556 &sctx->send_off);
548} 557}
549 558
550/* 559/*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
581 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 590 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
582 hdr->crc = cpu_to_le32(crc); 591 hdr->crc = cpu_to_le32(crc);
583 592
584 ret = write_buf(sctx, sctx->send_buf, sctx->send_size); 593 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
594 &sctx->send_off);
585 595
586 sctx->total_send_size += sctx->send_size; 596 sctx->total_send_size += sctx->send_size;
587 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; 597 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
@@ -687,7 +697,8 @@ out:
687 */ 697 */
688static int get_inode_info(struct btrfs_root *root, 698static int get_inode_info(struct btrfs_root *root,
689 u64 ino, u64 *size, u64 *gen, 699 u64 ino, u64 *size, u64 *gen,
690 u64 *mode, u64 *uid, u64 *gid) 700 u64 *mode, u64 *uid, u64 *gid,
701 u64 *rdev)
691{ 702{
692 int ret; 703 int ret;
693 struct btrfs_inode_item *ii; 704 struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
721 *uid = btrfs_inode_uid(path->nodes[0], ii); 732 *uid = btrfs_inode_uid(path->nodes[0], ii);
722 if (gid) 733 if (gid)
723 *gid = btrfs_inode_gid(path->nodes[0], ii); 734 *gid = btrfs_inode_gid(path->nodes[0], ii);
735 if (rdev)
736 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
724 737
725out: 738out:
726 btrfs_free_path(path); 739 btrfs_free_path(path);
@@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
852 struct extent_buffer *eb; 865 struct extent_buffer *eb;
853 struct btrfs_item *item; 866 struct btrfs_item *item;
854 struct btrfs_dir_item *di; 867 struct btrfs_dir_item *di;
855 struct btrfs_path *tmp_path = NULL;
856 struct btrfs_key di_key; 868 struct btrfs_key di_key;
857 char *buf = NULL; 869 char *buf = NULL;
858 char *buf2 = NULL; 870 char *buf2 = NULL;
@@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
874 goto out; 886 goto out;
875 } 887 }
876 888
877 tmp_path = alloc_path_for_send();
878 if (!tmp_path) {
879 ret = -ENOMEM;
880 goto out;
881 }
882
883 eb = path->nodes[0]; 889 eb = path->nodes[0];
884 slot = path->slots[0]; 890 slot = path->slots[0];
885 item = btrfs_item_nr(eb, slot); 891 item = btrfs_item_nr(eb, slot);
@@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
941 } 947 }
942 948
943out: 949out:
944 btrfs_free_path(tmp_path);
945 if (buf_virtual) 950 if (buf_virtual)
946 vfree(buf); 951 vfree(buf);
947 else 952 else
@@ -1026,12 +1031,12 @@ struct backref_ctx {
1026 u64 extent_len; 1031 u64 extent_len;
1027 1032
1028 /* Just to check for bugs in backref resolving */ 1033 /* Just to check for bugs in backref resolving */
1029 int found_in_send_root; 1034 int found_itself;
1030}; 1035};
1031 1036
1032static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1037static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1033{ 1038{
1034 u64 root = (u64)key; 1039 u64 root = (u64)(uintptr_t)key;
1035 struct clone_root *cr = (struct clone_root *)elt; 1040 struct clone_root *cr = (struct clone_root *)elt;
1036 1041
1037 if (root < cr->root->objectid) 1042 if (root < cr->root->objectid)
@@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
1055 1060
1056/* 1061/*
1057 * Called for every backref that is found for the current extent. 1062 * Called for every backref that is found for the current extent.
1063 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1058 */ 1064 */
1059static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) 1065static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1060{ 1066{
@@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1064 u64 i_size; 1070 u64 i_size;
1065 1071
1066 /* First check if the root is in the list of accepted clone sources */ 1072 /* First check if the root is in the list of accepted clone sources */
1067 found = bsearch((void *)root, bctx->sctx->clone_roots, 1073 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1068 bctx->sctx->clone_roots_cnt, 1074 bctx->sctx->clone_roots_cnt,
1069 sizeof(struct clone_root), 1075 sizeof(struct clone_root),
1070 __clone_root_cmp_bsearch); 1076 __clone_root_cmp_bsearch);
@@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	if (found->root == bctx->sctx->send_root &&
 	    ino == bctx->cur_objectid &&
 	    offset == bctx->cur_offset) {
-		bctx->found_in_send_root = 1;
+		bctx->found_itself = 1;
 	}
 
 	/*
-	 * There are inodes that have extents that lie behind it's i_size. Don't
+	 * There are inodes that have extents that lie behind its i_size. Don't
 	 * accept clones from these extents.
 	 */
-	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
+			NULL);
 	if (ret < 0)
 		return ret;
 
@@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	 */
 	if (ino >= bctx->cur_objectid)
 		return 0;
-	/*if (ino > ctx->cur_objectid)
+#if 0
+	if (ino > bctx->cur_objectid)
 		return 0;
-	if (offset + ctx->extent_len > ctx->cur_offset)
-		return 0;*/
-
-	bctx->found++;
-	found->found_refs++;
-	found->ino = ino;
-	found->offset = offset;
-	return 0;
+	if (offset + bctx->extent_len > bctx->cur_offset)
+		return 0;
+#endif
 	}
 
 	bctx->found++;
@@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 }
 
 /*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending currently
+ * is. This means that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
  * path must point to the extent item when called.
  */
 static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,
 	int ret;
 	int extent_type;
 	u64 logical;
+	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
-	struct backref_ctx backref_ctx;
+	struct backref_ctx *backref_ctx = NULL;
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
+	int compressed;
 	u32 i;
 
 	tmp_path = alloc_path_for_send();
 	if (!tmp_path)
 		return -ENOMEM;
 
+	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+	if (!backref_ctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	if (data_offset >= ino_size) {
 		/*
 		 * There may be extents that lie behind the file's size.
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,
 		ret = -ENOENT;
 		goto out;
 	}
+	compressed = btrfs_file_extent_compression(eb, fi);
 
 	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
-	logical = btrfs_file_extent_disk_bytenr(eb, fi);
-	if (logical == 0) {
+	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+	if (disk_byte == 0) {
 		ret = -ENOENT;
 		goto out;
 	}
-	logical += btrfs_file_extent_offset(eb, fi);
+	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
-	ret = extent_from_logical(sctx->send_root->fs_info,
-			logical, tmp_path, &found_key);
+	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+			&found_key, &flags);
 	btrfs_release_path(tmp_path);
 
 	if (ret < 0)
 		goto out;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		ret = -EIO;
 		goto out;
 	}
@@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 		cur_clone_root->found_refs = 0;
 	}
 
-	backref_ctx.sctx = sctx;
-	backref_ctx.found = 0;
-	backref_ctx.cur_objectid = ino;
-	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_in_send_root = 0;
-	backref_ctx.extent_len = num_bytes;
+	backref_ctx->sctx = sctx;
+	backref_ctx->found = 0;
+	backref_ctx->cur_objectid = ino;
+	backref_ctx->cur_offset = data_offset;
+	backref_ctx->found_itself = 0;
+	backref_ctx->extent_len = num_bytes;
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx,
 	 * __iterate_backrefs work.
 	 */
 	if (data_offset + num_bytes >= ino_size)
-		backref_ctx.extent_len = ino_size - data_offset;
+		backref_ctx->extent_len = ino_size - data_offset;
 
 	/*
 	 * Now collect all backrefs.
 	 */
+	if (compressed == BTRFS_COMPRESS_NONE)
+		extent_item_pos = logical - found_key.objectid;
+	else
+		extent_item_pos = 0;
+
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
-					__iterate_backrefs, &backref_ctx);
+					__iterate_backrefs, backref_ctx);
+
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx.found_in_send_root) {
+	if (!backref_ctx->found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
 				"send_root. inode=%llu, offset=%llu, "
-				"logical=%llu\n",
-				ino, data_offset, logical);
+				"disk_byte=%llu found extent=%llu\n",
+				ino, data_offset, disk_byte, found_key.objectid);
 		goto out;
 	}
 
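The extent_item_pos choice is the subtle part of this hunk: for uncompressed extents the backref walker can be told the byte position of the data inside the extent (logical minus the extent's start) so unrelated refs are filtered out, while for compressed extents disk bytes and file bytes do not map 1:1, so the walk starts at position 0. (As the surrounding context shows, an unconditional assignment still follows the new if/else in this tree and overrides the compressed case.) A small sketch of the arithmetic with made-up byte numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t extent_start = 1048576;  /* found_key.objectid */
            uint64_t disk_byte = 1048576;     /* file extent's disk_bytenr */
            uint64_t file_extent_offset = 8192;
            uint64_t logical = disk_byte + file_extent_offset;
            int compressed = 0;               /* BTRFS_COMPRESS_NONE */
            uint64_t extent_item_pos;

            extent_item_pos = compressed ? 0 : logical - extent_start;
            printf("extent_item_pos = %llu\n",
                   (unsigned long long)extent_item_pos);  /* prints 8192 */
            return 0;
    }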
@@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 		"num_bytes=%llu, logical=%llu\n",
 		data_offset, ino, num_bytes, logical);
 
-	if (!backref_ctx.found)
+	if (!backref_ctx->found)
 		verbose_printk("btrfs: no clones found\n");
 
 	cur_clone_root = NULL;
@@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 			else if (sctx->clone_roots[i].root == sctx->send_root)
 				/* prefer clones from send_root over others */
 				cur_clone_root = sctx->clone_roots + i;
-			break;
 		}
 
 	}
@@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 
 out:
 	btrfs_free_path(tmp_path);
+	kfree(backref_ctx);
 	return ret;
 }
 
@@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,
 	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
 
 	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-	if (ret < 0)
-		goto out;
 
 out:
 	btrfs_free_path(path);
@@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 	u64 right_gen;
 
 	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 	left_ret = ret;
@@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 		right_ret = -ENOENT;
 	} else {
 		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
-				NULL, NULL, NULL);
+				NULL, NULL, NULL, NULL);
 		if (ret < 0 && ret != -ENOENT)
 			goto out;
 		right_ret = ret;
 	}
 
 	if (!left_ret && !right_ret) {
-		if (left_gen == gen && right_gen == gen)
+		if (left_gen == gen && right_gen == gen) {
 			ret = inode_state_no_change;
-		else if (left_gen == gen) {
+		} else if (left_gen == gen) {
 			if (ino < sctx->send_progress)
 				ret = inode_state_did_create;
 			else
@@ -1516,6 +1539,10 @@ out:
 	return ret;
 }
 
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent
+ * dir, the generation of the parent dir and the name of the dir entry.
+ */
 static int get_first_ref(struct send_ctx *sctx,
 			 struct btrfs_root *root, u64 ino,
 			 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,
 	btrfs_release_path(path);
 
 	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
-	if (name_len != fs_path_len(tmp_name)) {
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
 		ret = 0;
 		goto out;
 	}
 
-	ret = memcmp(tmp_name->start, name, name_len);
-	if (ret)
-		ret = 0;
-	else
-		ret = 1;
+	ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
 	fs_path_free(sctx, tmp_name);
 	return ret;
 }
 
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 			      const char *name, int name_len,
 			      u64 *who_ino, u64 *who_gen)
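Orphanizing, as used throughout these helpers, means renaming an inode to a reserved unique name until the stream reaches the point where its real path becomes valid. The names produced by gen_unique_name() follow an o<ino>-<gen>-<idx> pattern, with idx bumped until the name is free in both trees; a sketch of the format with hypothetical values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t ino = 257, gen = 5, idx = 0;
            char name[64];

            /* same shape as the names gen_unique_name() emits */
            snprintf(name, sizeof(name), "o%llu-%llu-%llu",
                     (unsigned long long)ino, (unsigned long long)gen,
                     (unsigned long long)idx);
            printf("orphan name: %s\n", name);  /* o257-5-0 */
            return 0;
    }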
@@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 		goto out;
 	}
 
+	/*
+	 * Check if the overwritten ref was already processed. If yes, the ref
+	 * was already unlinked/moved, so we can safely assume that we will not
+	 * overwrite anything at this point in time.
+	 */
 	if (other_inode > sctx->send_progress) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
-				who_gen, NULL, NULL, NULL);
+				who_gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 
@@ -1642,6 +1680,13 @@ out:
 	return ret;
 }
 
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs to be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
 static int did_overwrite_ref(struct send_ctx *sctx,
 			     u64 dir, u64 dir_gen,
 			     u64 ino, u64 ino_gen,
@@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 	}
 
 	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -1690,6 +1735,11 @@ out:
 	return ret;
 }
 
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 {
 	int ret = 0;
@@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 
 	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
 			name->start, fs_path_len(name));
-	if (ret < 0)
-		goto out;
 
 out:
 	fs_path_free(sctx, name);
 	return ret;
 }
 
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
 static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
 {
 	int ret = 0;
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	if (ncea) {
-		if (!ncea[0])
-			ncea[0] = nce;
-		else if (!ncea[1])
-			ncea[1] = nce;
-		else
-			BUG();
-	} else {
-		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
-		if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	if (!nce_head) {
+		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+		if (!nce_head)
 			return -ENOMEM;
+		INIT_LIST_HEAD(nce_head);
 
-		ncea[0] = nce;
-		ncea[1] = NULL;
-		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
-		if (ret < 0)
+		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+		if (ret < 0) {
+			kfree(nce_head);
+			kfree(nce);
 			return ret;
+		}
 	}
+	list_add_tail(&nce->radix_list, nce_head);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 	sctx->name_cache_size++;
 
@@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,
 static void name_cache_delete(struct send_ctx *sctx,
 			      struct name_cache_entry *nce)
 {
-	struct name_cache_entry **ncea;
-
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	BUG_ON(!ncea);
-
-	if (ncea[0] == nce)
-		ncea[0] = NULL;
-	else if (ncea[1] == nce)
-		ncea[1] = NULL;
-	else
-		BUG();
+	struct list_head *nce_head;
 
-	if (!ncea[0] && !ncea[1]) {
-		radix_tree_delete(&sctx->name_cache, nce->ino);
-		kfree(ncea);
-	}
-
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	BUG_ON(!nce_head);
+
+	list_del(&nce->radix_list);
 	list_del(&nce->list);
-
 	sctx->name_cache_size--;
+
+	if (list_empty(nce_head)) {
+		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+		kfree(nce_head);
+	}
 }
 
 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
 						  u64 ino, u64 gen)
 {
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
+	struct name_cache_entry *cur;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, ino);
-	if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+	if (!nce_head)
 		return NULL;
 
-	if (ncea[0] && ncea[0]->gen == gen)
-		return ncea[0];
-	else if (ncea[1] && ncea[1]->gen == gen)
-		return ncea[1];
+	list_for_each_entry(cur, nce_head, radix_list) {
+		if (cur->ino == ino && cur->gen == gen)
+			return cur;
+	}
 	return NULL;
 }
 
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
 static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
 {
 	list_del(&nce->list);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 }
 
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
 static void name_cache_clean_unused(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
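The list-per-slot scheme introduced here exists because the radix tree is indexed by unsigned long, which is 32 bits on 32-bit kernels while inums are u64. Two inums that differ only above bit 31 truncate to the same index and must share a slot. A user-space sketch of the clash (the inums are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t ino_a = 0x100000101ULL;
            uint64_t ino_b = 0x200000101ULL;

            /* what (unsigned long) does to the inum on a 32-bit kernel */
            uint32_t slot_a = (uint32_t)ino_a;
            uint32_t slot_b = (uint32_t)ino_b;

            printf("slot_a=%u slot_b=%u clash=%s\n", slot_a, slot_b,
                   slot_a == slot_b ? "yes" : "no");
            return 0;
    }

This is also why name_cache_search still compares cur->ino against the full 64-bit inum after the radix tree lookup has succeeded.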
@@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
 static void name_cache_free(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
-	struct name_cache_entry *tmp;
 
-	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+	while (!list_empty(&sctx->name_cache_list)) {
+		nce = list_entry(sctx->name_cache_list.next,
+				struct name_cache_entry, list);
 		name_cache_delete(sctx, nce);
+		kfree(nce);
 	}
 }
 
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode does not exist or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
 static int __get_cur_name_and_parent(struct send_ctx *sctx,
 				     u64 ino, u64 gen,
 				     u64 *parent_ino,
@@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	struct btrfs_path *path = NULL;
 	struct name_cache_entry *nce = NULL;
 
+	/*
+	 * First check if we already did a call to this function with the same
+	 * ino/gen. If yes, check if the cache entry is still up-to-date. If
+	 * yes, return the cached result.
+	 */
 	nce = name_cache_search(sctx, ino, gen);
 	if (nce) {
 		if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
+	/*
+	 * If the inode does not exist yet, add the orphan name and return 1.
+	 * This should only happen for the parent dir that we determine in
+	 * __record_new_ref.
+	 */
 	ret = is_inode_existent(sctx, ino, gen);
 	if (ret < 0)
 		goto out;
@@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 		goto out_cache;
 	}
 
+	/*
+	 * Depending on whether the inode was already processed or not, use
+	 * send_root or parent_root for ref lookup.
+	 */
 	if (ino < sctx->send_progress)
 		ret = get_first_ref(sctx, sctx->send_root, ino,
 				parent_ino, parent_gen, dest);
@@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Check if the ref was overwritten by an inode's ref that was processed
+	 * earlier. If yes, treat as orphan and return 1.
+	 */
 	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
 			dest->start, dest->end - dest->start);
 	if (ret < 0)
@@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	}
 
 out_cache:
+	/*
+	 * Store the result of the lookup in the name cache.
+	 */
 	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
 	if (!nce) {
 		ret = -ENOMEM;
@@ -1901,7 +1985,6 @@ out_cache:
 	nce->name_len = fs_path_len(dest);
 	nce->ret = ret;
 	strcpy(nce->name, dest->start);
-	memset(&nce->use_list, 0, sizeof(nce->use_list));
 
 	if (ino < sctx->send_progress)
 		nce->need_later_update = 0;
@@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
 	btrfs_release_path(path);
 
-	if (ret < 0)
-		goto out;
-
 	if (parent_root) {
 		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
 		if (ret < 0)
@@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 			btrfs_inode_mtime(ii));
 	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
 			btrfs_inode_ctime(ii));
-	/* TODO otime? */
+	/* TODO Add otime support when the otime patches get into upstream */
 
 	ret = send_cmd(sctx);
 
@@ -2292,39 +2372,39 @@ out:
  * a valid path yet because we did not process the refs yet. So, the inode
  * is created as orphan.
  */
-static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
-			     struct btrfs_key *key)
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
 {
 	int ret = 0;
-	struct extent_buffer *eb = path->nodes[0];
-	struct btrfs_inode_item *ii;
 	struct fs_path *p;
-	int slot = path->slots[0];
 	int cmd;
+	u64 gen;
 	u64 mode;
+	u64 rdev;
 
-verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 	p = fs_path_alloc(sctx);
 	if (!p)
 		return -ENOMEM;
 
-	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
-	mode = btrfs_inode_mode(eb, ii);
+	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
+			NULL, &rdev);
+	if (ret < 0)
+		goto out;
 
-	if (S_ISREG(mode))
+	if (S_ISREG(mode)) {
 		cmd = BTRFS_SEND_C_MKFILE;
-	else if (S_ISDIR(mode))
+	} else if (S_ISDIR(mode)) {
 		cmd = BTRFS_SEND_C_MKDIR;
-	else if (S_ISLNK(mode))
+	} else if (S_ISLNK(mode)) {
 		cmd = BTRFS_SEND_C_SYMLINK;
-	else if (S_ISCHR(mode) || S_ISBLK(mode))
+	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
 		cmd = BTRFS_SEND_C_MKNOD;
-	else if (S_ISFIFO(mode))
+	} else if (S_ISFIFO(mode)) {
 		cmd = BTRFS_SEND_C_MKFIFO;
-	else if (S_ISSOCK(mode))
+	} else if (S_ISSOCK(mode)) {
 		cmd = BTRFS_SEND_C_MKSOCK;
-	else {
+	} else {
 		printk(KERN_WARNING "btrfs: unexpected inode type %o",
 				(int)(mode & S_IFMT));
 		ret = -ENOTSUPP;
@@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 	if (ret < 0)
 		goto out;
 
-	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	ret = gen_unique_name(sctx, ino, gen, p);
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
 
 	if (S_ISLNK(mode)) {
 		fs_path_reset(p);
-		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+		ret = read_symlink(sctx, sctx->send_root, ino, p);
 		if (ret < 0)
 			goto out;
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
-		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
 	}
 
 	ret = send_cmd(sctx);
@@ -2364,6 +2444,92 @@ out:
 	return ret;
 }
 
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function checks if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key di_key;
+	struct extent_buffer *eb;
+	struct btrfs_dir_item *di;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
+				1, 0);
+		if (ret < 0)
+			goto out;
+		if (!ret) {
+			eb = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		}
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		if (di_key.objectid < sctx->send_progress) {
+			ret = 1;
+			goto out;
+		}
+
+		key.offset = found_key.offset + 1;
+		btrfs_release_path(path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ *    directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+	int ret;
+
+	if (S_ISDIR(sctx->cur_inode_mode)) {
+		ret = did_create_dir(sctx, sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	ret = send_create_inode(sctx, sctx->cur_ino);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
 struct recorded_ref {
 	struct list_head list;
 	char *dir_path;
@@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,
 static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 {
 	struct recorded_ref *cur;
-	struct recorded_ref *tmp;
 
-	list_for_each_entry_safe(cur, tmp, head, list) {
+	while (!list_empty(head)) {
+		cur = list_entry(head->next, struct recorded_ref, list);
 		fs_path_free(sctx, cur->full_path);
+		list_del(&cur->list);
 		kfree(cur);
 	}
-	INIT_LIST_HEAD(head);
 }
 
 static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
 }
 
 /*
- * Renames/moves a file/dir to it's orphan name. Used when the first
+ * Renames/moves a file/dir to its orphan name. Used when the first
  * ref of an unprocessed inode gets overwritten and for all non empty
  * directories.
  */
@@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
 	struct btrfs_key loc;
 	struct btrfs_dir_item *di;
 
+	/*
+	 * Don't try to rmdir the top/root subvolume dir.
+	 */
+	if (dir == BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+
 	path = alloc_path_for_send();
 	if (!path)
 		return -ENOMEM;
@@ -2513,160 +2685,6 @@ out:
 	return ret;
 }
 
-struct finish_unordered_dir_ctx {
-	struct send_ctx *sctx;
-	struct fs_path *cur_path;
-	struct fs_path *dir_path;
-	u64 dir_ino;
-	int need_delete;
-	int delete_pass;
-};
-
-int __finish_unordered_dir(int num, struct btrfs_key *di_key,
-			   const char *name, int name_len,
-			   const char *data, int data_len,
-			   u8 type, void *ctx)
-{
-	int ret = 0;
-	struct finish_unordered_dir_ctx *fctx = ctx;
-	struct send_ctx *sctx = fctx->sctx;
-	u64 di_gen;
-	u64 di_mode;
-	int is_orphan = 0;
-
-	if (di_key->objectid >= fctx->dir_ino)
-		goto out;
-
-	fs_path_reset(fctx->cur_path);
-
-	ret = get_inode_info(sctx->send_root, di_key->objectid,
-			NULL, &di_gen, &di_mode, NULL, NULL);
-	if (ret < 0)
-		goto out;
-
-	ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
-			fctx->dir_ino, name, name_len);
-	if (ret < 0)
-		goto out;
-	if (ret) {
-		is_orphan = 1;
-		ret = gen_unique_name(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	} else {
-		ret = get_cur_path(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	}
-	if (ret < 0)
-		goto out;
-
-	ret = fs_path_add(fctx->dir_path, name, name_len);
-	if (ret < 0)
-		goto out;
-
-	if (!fctx->delete_pass) {
-		if (S_ISDIR(di_mode)) {
-			ret = send_rename(sctx, fctx->cur_path,
-					fctx->dir_path);
-		} else {
-			ret = send_link(sctx, fctx->dir_path,
-					fctx->cur_path);
-			if (is_orphan)
-				fctx->need_delete = 1;
-		}
-	} else if (!S_ISDIR(di_mode)) {
-		ret = send_unlink(sctx, fctx->cur_path);
-	} else {
-		ret = 0;
-	}
-
-	fs_path_remove(fctx->dir_path);
-
-out:
-	return ret;
-}
-
-/*
- * Go through all dir items and see if we find refs which could not be created
- * in the past because the dir did not exist at that time.
- */
-static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
-{
-	int ret = 0;
-	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *eb;
-	struct finish_unordered_dir_ctx fctx;
-	int slot;
-
-	path = alloc_path_for_send();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	memset(&fctx, 0, sizeof(fctx));
-	fctx.sctx = sctx;
-	fctx.cur_path = fs_path_alloc(sctx);
-	fctx.dir_path = fs_path_alloc(sctx);
-	if (!fctx.cur_path || !fctx.dir_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	fctx.dir_ino = dir;
-
-	ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * We do two passes. The first links in the new refs and the second
-	 * deletes orphans if required. Deletion of orphans is not required for
-	 * directory inodes, as we always have only one ref and use rename
-	 * instead of link for those.
-	 */
-
-again:
-	key.objectid = dir;
-	key.type = BTRFS_DIR_ITEM_KEY;
-	key.offset = 0;
-	while (1) {
-		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-				1, 0);
-		if (ret < 0)
-			goto out;
-		eb = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(eb, &found_key, slot);
-
-		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
-			break;
-		}
-
-		ret = iterate_dir_item(sctx, sctx->send_root, path,
-				&found_key, __finish_unordered_dir,
-				&fctx);
-		if (ret < 0)
-			goto out;
-
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(path);
-	}
-
-	if (!fctx.delete_pass && fctx.need_delete) {
-		fctx.delete_pass = 1;
-		goto again;
-	}
-
-out:
-	btrfs_free_path(path);
-	fs_path_free(sctx, fctx.cur_path);
-	fs_path_free(sctx, fctx.dir_path);
-	return ret;
-}
-
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
@@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
 {
 	int ret = 0;
 	struct recorded_ref *cur;
+	struct recorded_ref *cur2;
 	struct ulist *check_dirs = NULL;
 	struct ulist_iterator uit;
 	struct ulist_node *un;
@@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
+	/*
+	 * This should never happen as the root dir always has the same ref
+	 * which is always '..'
+	 */
+	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+
 	valid_path = fs_path_alloc(sctx);
 	if (!valid_path) {
 		ret = -ENOMEM;
@@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
 	list_for_each_entry(cur, &sctx->new_refs, list) {
 		/*
+		 * We may have refs where the parent directory does not exist
+		 * yet. This happens if the parent directory's inum is higher
+		 * than the current inum. To handle this case, we create the
+		 * parent directory out of order. But we need to check if this
+		 * did already happen before due to other refs in the same dir.
+		 */
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+		if (ret == inode_state_will_create) {
+			ret = 0;
+			/*
+			 * First check if any of the current inode's refs did
+			 * already create the dir.
+			 */
+			list_for_each_entry(cur2, &sctx->new_refs, list) {
+				if (cur == cur2)
+					break;
+				if (cur2->dir == cur->dir) {
+					ret = 1;
+					break;
+				}
+			}
+
+			/*
+			 * If that did not happen, check if a previous inode
+			 * did already create the dir.
+			 */
+			if (!ret)
+				ret = did_create_dir(sctx, cur->dir);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_create_inode(sctx, cur->dir);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		/*
 		 * Check if this new ref would overwrite the first ref of
 		 * another unprocessed inode. If yes, orphanize the
 		 * overwritten inode. If we find an overwritten ref that is
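A concrete example of the situation this block handles, with hypothetical inums: inodes are streamed in ascending inum order, so a ref to a parent directory with a higher inum points at a directory that does not exist yet on the receiving side:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t cur_ino = 259;          /* file being processed */
            uint64_t parent_dir_ino = 260;   /* dir created later in the stream */

            if (parent_dir_ino > cur_ino)
                    printf("dir %llu must be created out of order before "
                           "inode %llu can be linked into it\n",
                           (unsigned long long)parent_dir_ino,
                           (unsigned long long)cur_ino);
            return 0;
    }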
@@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * inode, move it and update valid_path. If not, link or move
 		 * it depending on the inode mode.
 		 */
-		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+		if (is_orphan) {
 			ret = send_rename(sctx, valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
@@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 		}
+	} else if (S_ISDIR(sctx->cur_inode_mode) &&
+		   !list_empty(&sctx->deleted_refs)) {
+		/*
+		 * We have a moved dir. Add the old parent to check_dirs.
+		 */
+		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+				list);
+		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+				GFP_NOFS);
+		if (ret < 0)
+			goto out;
 	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
 		/*
 		 * We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 			if (!ret) {
-				/*
-				 * In case the inode was moved to a directory
-				 * that was not created yet (see
-				 * __record_new_ref), we can not unlink the ref
-				 * as it will be needed later when the parent
-				 * directory is created, so that we can move in
-				 * the inode to the new dir.
-				 */
-				if (!is_orphan &&
-				    sctx->cur_inode_first_ref_orphan) {
-					ret = orphanize_inode(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							cur->full_path);
-					if (ret < 0)
-						goto out;
-					ret = gen_unique_name(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							valid_path);
-					if (ret < 0)
-						goto out;
-					is_orphan = 1;
-
-				} else {
-					ret = send_unlink(sctx, cur->full_path);
-					if (ret < 0)
-						goto out;
-				}
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
 			}
 			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
 					GFP_NOFS);
@@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 * If the inode is still orphan, unlink the orphan. This may
 	 * happen when a previous inode did overwrite the first ref
 	 * of this inode and no new refs were added for the current
-	 * inode.
-	 * We can however not delete the orphan in case the inode relies
-	 * in a directory that was not created yet (see
-	 * __record_new_ref)
+	 * inode. Unlinking does not mean that the inode is deleted in
+	 * all cases. There may still be links to this inode in other
+	 * places.
 	 */
-	if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+	if (is_orphan) {
 		ret = send_unlink(sctx, valid_path);
 		if (ret < 0)
 			goto out;
@@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	ULIST_ITER_INIT(&uit);
 	while ((un = ulist_next(check_dirs, &uit))) {
+		/*
+		 * In case we had refs into dirs that were not processed yet,
+		 * we don't need to do the utime and rmdir logic for these dirs.
+		 * The dir will be processed later.
+		 */
 		if (un->val > sctx->cur_ino)
 			continue;
 
@@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		}
 	}
 
-	/*
-	 * Current inode is now at it's new position, so we must increase
-	 * send_progress
-	 */
-	sctx->send_progress = sctx->cur_ino + 1;
-
-	/*
-	 * We may have a directory here that has pending refs which could not
-	 * be created before (because the dir did not exist before, see
-	 * __record_new_ref). finish_outoforder_dir will link/move the pending
-	 * refs.
-	 */
-	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
-		ret = finish_outoforder_dir(sctx, sctx->cur_ino,
-				sctx->cur_inode_gen);
-		if (ret < 0)
-			goto out;
-	}
-
 	ret = 0;
 
 out:
@@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * The parent may be non-existent at this point in time. This happens
-	 * if the ino of the parent dir is higher then the current ino. In this
-	 * case, we can not process this ref until the parent dir is finally
-	 * created. If we reach the parent dir later, process_recorded_refs
-	 * will go through all dir items and process the refs that could not be
-	 * processed before. In case this is the first ref, we set
-	 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
-	 * keep an orphan of the inode so that it later can be used for
-	 * link/move
-	 */
-	ret = is_inode_existent(sctx, dir, gen);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
-	if (!ret) {
-		ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
-				name->start, fs_path_len(name));
-		if (ret < 0)
-			goto out;
-		if (ret)
-			sctx->cur_inode_first_ref_orphan = 1;
-		ret = 0;
-		goto out;
-	}
 
 	ret = get_cur_path(sctx, dir, gen, p);
 	if (ret < 0)
@@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,
 	key.offset = 0;
 	while (1) {
 		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-		if (ret < 0) {
-			btrfs_release_path(path);
+		if (ret < 0)
 			goto out;
-		}
-		if (ret) {
-			btrfs_release_path(path);
+		if (ret)
 			break;
-		}
 
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(eb, &found_key, slot);
 
 		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
+		    found_key.type != key.type)
 			break;
-		}
 
-		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
-				&found_key, 0, cb, sctx);
+		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
+				sctx);
 		btrfs_release_path(path);
 		if (ret < 0)
 			goto out;
 
 		key.offset = found_key.offset + 1;
 	}
+	btrfs_release_path(path);
 
 	ret = process_recorded_refs(sctx);
 
@@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 	int ret = 0;
 	struct fs_path *p;
 	loff_t pos = offset;
-	int readed = 0;
+	int num_read = 0;
 	mm_segment_t old_fs;
 
 	p = fs_path_alloc(sctx);
@@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
 	if (ret < 0)
 		goto out;
-	readed = ret;
-	if (!readed)
+	num_read = ret;
+	if (!num_read)
 		goto out;
 
 	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
-	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
 
 	ret = send_cmd(sctx);
 
@@ -3604,7 +3609,7 @@ out:
 	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
-	return readed;
+	return num_read;
 }
 
 /*
@@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx,
 		      struct clone_root *clone_root)
 {
 	int ret = 0;
-	struct btrfs_root *clone_root2 = clone_root->root;
 	struct fs_path *p;
 	u64 gen;
 
@@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 
-	if (clone_root2 == sctx->send_root) {
+	if (clone_root->root == sctx->send_root) {
 		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
-				&gen, NULL, NULL, NULL);
+				&gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
 	} else {
-		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+		ret = get_inode_path(sctx, clone_root->root,
+				clone_root->ino, p);
 	}
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-			clone_root2->root_item.uuid);
+			clone_root->root->root_item.uuid);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-			clone_root2->root_item.ctransid);
+			clone_root->root->root_item.ctransid);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
 			clone_root->offset);
@@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			struct btrfs_file_extent_item);
 	type = btrfs_file_extent_type(path->nodes[0], ei);
-	if (type == BTRFS_FILE_EXTENT_INLINE)
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
 		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
-	else
+		/*
+		 * It is possible the inline item won't cover the whole page,
+		 * but there may be items after this page. Make sure to send
+		 * the whole thing.
+		 */
+		len = PAGE_CACHE_ALIGN(len);
+	} else {
 		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+	}
 
 	if (offset + len > sctx->cur_inode_size)
 		len = sctx->cur_inode_size - offset;
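PAGE_CACHE_ALIGN rounds the inline length up to the next page boundary. A stand-in using the usual mask idiom, assuming 4 KiB pages (the kernel macro uses the real page size):

    #include <stdio.h>

    #define DEMO_PAGE_SIZE 4096UL
    #define DEMO_PAGE_ALIGN(x) \
            (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long inline_len = 300;  /* inline extents are sub-page */

            printf("%lu -> %lu\n", inline_len, DEMO_PAGE_ALIGN(inline_len));
            return 0;  /* prints 300 -> 4096 */
    }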
@@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	u64 left_offset_fixed;
 	u64 left_len;
 	u64 right_len;
+	u64 left_gen;
+	u64 right_gen;
 	u8 left_type;
 	u8 right_type;
 
@@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 
 	eb = left_path->nodes[0];
 	slot = left_path->slots[0];
-
 	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 	left_type = btrfs_file_extent_type(eb, ei);
-	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-	left_len = btrfs_file_extent_num_bytes(eb, ei);
-	left_offset = btrfs_file_extent_offset(eb, ei);
 
 	if (left_type != BTRFS_FILE_EXTENT_REG) {
 		ret = 0;
 		goto out;
 	}
+	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+	left_len = btrfs_file_extent_num_bytes(eb, ei);
+	left_offset = btrfs_file_extent_offset(eb, ei);
+	left_gen = btrfs_file_extent_generation(eb, ei);
 
 	/*
 	 * Following comments will refer to these graphics. L is the left
@@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
 	right_len = btrfs_file_extent_num_bytes(eb, ei);
 	right_offset = btrfs_file_extent_offset(eb, ei);
+	right_gen = btrfs_file_extent_generation(eb, ei);
 
 	if (right_type != BTRFS_FILE_EXTENT_REG) {
 		ret = 0;
@@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	 * Are we at extent 8? If yes, we know the extent is changed.
 	 * This may only happen on the first iteration.
 	 */
-	if (found_key.offset + right_len < ekey->offset) {
+	if (found_key.offset + right_len <= ekey->offset) {
 		ret = 0;
 		goto out;
 	}
@@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	/*
 	 * Check if we have the same extent.
 	 */
-	if (left_disknr + left_offset_fixed !=
-	    right_disknr + right_offset) {
+	if (left_disknr != right_disknr ||
+	    left_offset_fixed != right_offset ||
+	    left_gen != right_gen) {
 		ret = 0;
 		goto out;
 	}
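The rewritten check is stricter on purpose: the old sum comparison could call two different extents equal whenever disknr and offset shifted in opposite directions, and it never looked at the generation. A tiny demonstration with made-up byte numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t left_disknr = 4096, left_offset_fixed = 8192;
            uint64_t right_disknr = 8192, right_offset = 4096;

            int old_same = (left_disknr + left_offset_fixed ==
                            right_disknr + right_offset);       /* 1: false match */
            int new_same = (left_disknr == right_disknr &&
                            left_offset_fixed == right_offset); /* 0: correct */

            printf("old=%d new=%d\n", old_same, new_same);
            return 0;
    }

The added generation comparison additionally catches the case where the same disk byte range was freed and reallocated to unrelated data between the two snapshots.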
@@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = process_recorded_refs(sctx);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * We have processed the refs and thus need to advance send_progress.
+	 * Now, calls to get_cur_xxx will take the updated refs of the current
+	 * inode into account.
+	 */
+	sctx->send_progress = sctx->cur_ino + 1;
 
 out:
 	return ret;
@@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
-			&left_mode, &left_uid, &left_gid);
+			&left_mode, &left_uid, &left_gid, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 	} else {
 		ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
 				NULL, NULL, &right_mode, &right_uid,
-				&right_gid);
+				&right_gid, NULL);
 		if (ret < 0)
 			goto out;
 
@@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
-	sctx->cur_inode_first_ref_orphan = 0;
+
+	/*
+	 * Set send_progress to current inode. This will tell all get_cur_xxx
+	 * functions that the current inode's refs are not updated yet. Later,
+	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
+	 */
 	sctx->send_progress = sctx->cur_ino;
 
 	if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,
4098 4128
4099 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 4129 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4100 right_ii); 4130 right_ii);
4101 if (left_gen != right_gen) 4131
4132 /*
4133 * The cur_ino = root dir case is special here. We can't treat
4134 * the inode as deleted+reused because it would generate a
4135 * stream that tries to delete/mkdir the root dir.
4136 */
4137 if (left_gen != right_gen &&
4138 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4102 sctx->cur_inode_new_gen = 1; 4139 sctx->cur_inode_new_gen = 1;
4103 } 4140 }
4104 4141
@@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,
4111 sctx->cur_inode_mode = btrfs_inode_mode( 4148 sctx->cur_inode_mode = btrfs_inode_mode(
4112 sctx->left_path->nodes[0], left_ii); 4149 sctx->left_path->nodes[0], left_ii);
4113 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 4150 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4114 ret = send_create_inode(sctx, sctx->left_path, 4151 ret = send_create_inode_if_needed(sctx);
4115 sctx->cmp_key);
4116 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 4152 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4117 sctx->cur_inode_gen = right_gen; 4153 sctx->cur_inode_gen = right_gen;
4118 sctx->cur_inode_new = 0; 4154 sctx->cur_inode_new = 0;
@@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,
4122 sctx->cur_inode_mode = btrfs_inode_mode( 4158 sctx->cur_inode_mode = btrfs_inode_mode(
4123 sctx->right_path->nodes[0], right_ii); 4159 sctx->right_path->nodes[0], right_ii);
4124 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 4160 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4161 /*
4162 * We need to do some special handling in case the inode was
4163 * reported as changed with a changed generation number. This
4164 * means that the original inode was deleted and a new inode
4165 * reused the same inum. So we have to treat the old inode as
4166 * deleted and the new one as new.
4167 */
4125 if (sctx->cur_inode_new_gen) { 4168 if (sctx->cur_inode_new_gen) {
4169 /*
4170 * First, process the inode as if it was deleted.
4171 */
4126 sctx->cur_inode_gen = right_gen; 4172 sctx->cur_inode_gen = right_gen;
4127 sctx->cur_inode_new = 0; 4173 sctx->cur_inode_new = 0;
4128 sctx->cur_inode_deleted = 1; 4174 sctx->cur_inode_deleted = 1;
@@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,
4135 if (ret < 0) 4181 if (ret < 0)
4136 goto out; 4182 goto out;
4137 4183
4184 /*
4185 * Now process the inode as if it was new.
4186 */
4138 sctx->cur_inode_gen = left_gen; 4187 sctx->cur_inode_gen = left_gen;
4139 sctx->cur_inode_new = 1; 4188 sctx->cur_inode_new = 1;
4140 sctx->cur_inode_deleted = 0; 4189 sctx->cur_inode_deleted = 0;
@@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,
4142 sctx->left_path->nodes[0], left_ii); 4191 sctx->left_path->nodes[0], left_ii);
4143 sctx->cur_inode_mode = btrfs_inode_mode( 4192 sctx->cur_inode_mode = btrfs_inode_mode(
4144 sctx->left_path->nodes[0], left_ii); 4193 sctx->left_path->nodes[0], left_ii);
4145 ret = send_create_inode(sctx, sctx->left_path, 4194 ret = send_create_inode_if_needed(sctx);
4146 sctx->cmp_key);
4147 if (ret < 0) 4195 if (ret < 0)
4148 goto out; 4196 goto out;
4149 4197
4150 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 4198 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4151 if (ret < 0) 4199 if (ret < 0)
4152 goto out; 4200 goto out;
4201 /*
4202 * Advance send_progress now as we did not get into
4203 * process_recorded_refs_if_needed in the new_gen case.
4204 */
4205 sctx->send_progress = sctx->cur_ino + 1;
4206
4207 /*
4208 * Now process all extents and xattrs of the inode as if
4209 * they were all new.
4210 */
4153 ret = process_all_extents(sctx); 4211 ret = process_all_extents(sctx);
4154 if (ret < 0) 4212 if (ret < 0)
4155 goto out; 4213 goto out;
@@ -4172,6 +4230,16 @@ out:
4172 return ret; 4230 return ret;
4173} 4231}
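
The new_gen branch above is easiest to read as a fixed pipeline: treat the old inode's refs as deletions, create the new inode, treat its refs as additions, advance send_progress past the inode (process_recorded_refs_if_needed is skipped on this path), then replay every extent and xattr as new. A compilable sketch of that ordering; the helper names are stand-ins, not the kernel functions:

    #include <stdint.h>

    struct sctx_sketch { uint64_t cur_ino, send_progress; };

    static int refs_as_deleted(struct sctx_sketch *s)    { (void)s; return 0; }
    static int create_if_needed(struct sctx_sketch *s)   { (void)s; return 0; }
    static int refs_as_new(struct sctx_sketch *s)        { (void)s; return 0; }
    static int extents_and_xattrs(struct sctx_sketch *s) { (void)s; return 0; }

    static int handle_reused_inum(struct sctx_sketch *s)
    {
        int ret;

        if ((ret = refs_as_deleted(s)) < 0)   /* old names disappear */
            return ret;
        if ((ret = create_if_needed(s)) < 0)  /* new inode appears */
            return ret;
        if ((ret = refs_as_new(s)) < 0)       /* new names appear */
            return ret;
        s->send_progress = s->cur_ino + 1;    /* refs now up to date */
        return extents_and_xattrs(s);         /* replay data as all-new */
    }
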
4174 4232
4233/*
4234 * We have to process new refs before deleted refs, but compare_trees gives us
4235 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
4236 * first and later process them in process_recorded_refs.
4237 * For the cur_inode_new_gen case, we skip recording completely because
4238 * changed_inode already initiated processing of refs. The reason is
4239 * that in this case, compare_tree actually compares the refs of 2 different
4240 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
4241 * refs of the right tree as deleted and all refs of the left tree as new.
4242 */
4175static int changed_ref(struct send_ctx *sctx, 4243static int changed_ref(struct send_ctx *sctx,
4176 enum btrfs_compare_tree_result result) 4244 enum btrfs_compare_tree_result result)
4177{ 4245{
@@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,
4192 return ret; 4260 return ret;
4193} 4261}
4194 4262
4263/*
4264 * Process new/deleted/changed xattrs. We skip processing in the
4265 * cur_inode_new_gen case because changed_inode already initiated processing
4266 * of xattrs. The reason is the same as in changed_ref.
4267 */
4195static int changed_xattr(struct send_ctx *sctx, 4268static int changed_xattr(struct send_ctx *sctx,
4196 enum btrfs_compare_tree_result result) 4269 enum btrfs_compare_tree_result result)
4197{ 4270{
@@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,
4211 return ret; 4284 return ret;
4212} 4285}
4213 4286
4287/*
4288 * Process new/deleted/changed extents. We skip processing in the
4289 * cur_inode_new_gen case because changed_inode already initiated processing
4290 * of extents. The reason is the same as in changed_ref.
4291 */
4214static int changed_extent(struct send_ctx *sctx, 4292static int changed_extent(struct send_ctx *sctx,
4215 enum btrfs_compare_tree_result result) 4293 enum btrfs_compare_tree_result result)
4216{ 4294{
@@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,
4227 return ret; 4305 return ret;
4228} 4306}
4229 4307
4230 4308/*
4309 * Updates compare-related fields in sctx and simply forwards to the actual
4310 * changed_xxx functions.
4311 */
4231static int changed_cb(struct btrfs_root *left_root, 4312static int changed_cb(struct btrfs_root *left_root,
4232 struct btrfs_root *right_root, 4313 struct btrfs_root *right_root,
4233 struct btrfs_path *left_path, 4314 struct btrfs_path *left_path,
@@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,
4247 if (ret < 0) 4328 if (ret < 0)
4248 goto out; 4329 goto out;
4249 4330
4331 /* Ignore non-FS objects */
4332 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
4333 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
4334 goto out;
4335
4250 if (key->type == BTRFS_INODE_ITEM_KEY) 4336 if (key->type == BTRFS_INODE_ITEM_KEY)
4251 ret = changed_inode(sctx, result); 4337 ret = changed_inode(sctx, result);
4252 else if (key->type == BTRFS_INODE_REF_KEY) 4338 else if (key->type == BTRFS_INODE_REF_KEY)
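
changed_cb() is now a thin router: drop the free-inode and free-space cache objects first (they live in the FS tree but are rebuildable caches, so the send stream must not carry them), then dispatch on the key type. A stand-alone sketch with placeholder key values and stub handlers:

    #include <stdint.h>

    #define KEY_INODE_ITEM 1
    #define KEY_INODE_REF  2   /* placeholders, not the on-disk values */

    static int on_inode(void) { return 0; }
    static int on_ref(void)   { return 0; }
    static int on_other(void) { return 0; }

    static int changed_cb_sketch(uint64_t objectid, int key_type,
                                 uint64_t free_ino_id, uint64_t free_space_id)
    {
        /* Cache objects are skipped entirely, whatever their key type. */
        if (objectid == free_ino_id || objectid == free_space_id)
            return 0;

        if (key_type == KEY_INODE_ITEM)
            return on_inode();
        if (key_type == KEY_INODE_REF)
            return on_ref();
        return on_other();
    }
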
@@ -4299,7 +4385,8 @@ join_trans:
4299 } 4385 }
4300 4386
4301 /* 4387 /*
4302 * Make sure the tree has not changed 4388 * Make sure the tree has not changed after re-joining. We detect this
4389 * by comparing start_ctransid and ctransid. They should always match.
4303 */ 4390 */
4304 spin_lock(&send_root->root_times_lock); 4391 spin_lock(&send_root->root_times_lock);
4305 ctransid = btrfs_root_ctransid(&send_root->root_item); 4392 ctransid = btrfs_root_ctransid(&send_root->root_item);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57f..1bf4f32fd4ef 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
130 130
131#ifdef __KERNEL__ 131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
133#endif 134#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c220..915ac14c2064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
243 struct btrfs_root *root, const char *function, 243 struct btrfs_root *root, const char *function,
244 unsigned int line, int errno) 244 unsigned int line, int errno)
245{ 245{
246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); 246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
247 trans->aborted = errno; 247 trans->aborted = errno;
248 /* Nothing used. The other threads that have joined this 248 /* Nothing used. The other threads that have joined this
249 * transaction may be able to continue. */ 249 * transaction may be able to continue. */
250 if (!trans->blocks_used) { 250 if (!trans->blocks_used) {
251 btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); 251 char nbuf[16];
252 const char *errstr;
253
254 errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
255 btrfs_printk(root->fs_info,
256 "%s:%d: Aborting unused transaction(%s).\n",
257 function, line, errstr);
252 return; 258 return;
253 } 259 }
254 trans->transaction->aborted = errno; 260 trans->transaction->aborted = errno;
@@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
407 btrfs_set_opt(info->mount_opt, NODATASUM); 413 btrfs_set_opt(info->mount_opt, NODATASUM);
408 break; 414 break;
409 case Opt_nodatacow: 415 case Opt_nodatacow:
410 printk(KERN_INFO "btrfs: setting nodatacow\n"); 416 if (!btrfs_test_opt(root, COMPRESS) ||
417 !btrfs_test_opt(root, FORCE_COMPRESS)) {
418 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
419 } else {
420 printk(KERN_INFO "btrfs: setting nodatacow\n");
421 }
422 info->compress_type = BTRFS_COMPRESS_NONE;
423 btrfs_clear_opt(info->mount_opt, COMPRESS);
424 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
411 btrfs_set_opt(info->mount_opt, NODATACOW); 425 btrfs_set_opt(info->mount_opt, NODATACOW);
412 btrfs_set_opt(info->mount_opt, NODATASUM); 426 btrfs_set_opt(info->mount_opt, NODATASUM);
413 break; 427 break;
@@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 compress_type = "zlib"; 436 compress_type = "zlib";
423 info->compress_type = BTRFS_COMPRESS_ZLIB; 437 info->compress_type = BTRFS_COMPRESS_ZLIB;
424 btrfs_set_opt(info->mount_opt, COMPRESS); 438 btrfs_set_opt(info->mount_opt, COMPRESS);
439 btrfs_clear_opt(info->mount_opt, NODATACOW);
440 btrfs_clear_opt(info->mount_opt, NODATASUM);
425 } else if (strcmp(args[0].from, "lzo") == 0) { 441 } else if (strcmp(args[0].from, "lzo") == 0) {
426 compress_type = "lzo"; 442 compress_type = "lzo";
427 info->compress_type = BTRFS_COMPRESS_LZO; 443 info->compress_type = BTRFS_COMPRESS_LZO;
428 btrfs_set_opt(info->mount_opt, COMPRESS); 444 btrfs_set_opt(info->mount_opt, COMPRESS);
445 btrfs_clear_opt(info->mount_opt, NODATACOW);
446 btrfs_clear_opt(info->mount_opt, NODATASUM);
429 btrfs_set_fs_incompat(info, COMPRESS_LZO); 447 btrfs_set_fs_incompat(info, COMPRESS_LZO);
430 } else if (strncmp(args[0].from, "no", 2) == 0) { 448 } else if (strncmp(args[0].from, "no", 2) == 0) {
431 compress_type = "no"; 449 compress_type = "no";
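
These two hunks make nodatacow and compression mutually exclusive in both directions: choosing nodatacow clears COMPRESS/FORCE_COMPRESS and resets the compress type, while choosing zlib or lzo clears NODATACOW and NODATASUM. The invariant in miniature, with made-up flag bits:

    #define OPT_COMPRESS        (1u << 0)
    #define OPT_FORCE_COMPRESS  (1u << 1)
    #define OPT_NODATACOW       (1u << 2)
    #define OPT_NODATASUM       (1u << 3)

    /* Each setter clears the other side's bits, as the hunks above do. */
    static unsigned set_nodatacow(unsigned opts)
    {
        opts &= ~(OPT_COMPRESS | OPT_FORCE_COMPRESS);
        return opts | OPT_NODATACOW | OPT_NODATASUM;
    }

    static unsigned set_compress(unsigned opts)
    {
        opts &= ~(OPT_NODATACOW | OPT_NODATASUM);
        return opts | OPT_COMPRESS;
    }
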
@@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
543 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 561 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
544 break; 562 break;
545 case Opt_defrag: 563 case Opt_defrag:
546 printk(KERN_INFO "btrfs: enabling auto defrag"); 564 printk(KERN_INFO "btrfs: enabling auto defrag\n");
547 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 565 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
548 break; 566 break;
549 case Opt_recovery: 567 case Opt_recovery:
550 printk(KERN_INFO "btrfs: enabling auto recovery"); 568 printk(KERN_INFO "btrfs: enabling auto recovery\n");
551 btrfs_set_opt(info->mount_opt, RECOVERY); 569 btrfs_set_opt(info->mount_opt, RECOVERY);
552 break; 570 break;
553 case Opt_skip_balance: 571 case Opt_skip_balance:
@@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
846 return 0; 864 return 0;
847 } 865 }
848 866
849 btrfs_wait_ordered_extents(root, 0, 0); 867 btrfs_wait_ordered_extents(root, 0);
850
851 spin_lock(&fs_info->trans_lock);
852 if (!fs_info->running_transaction) {
853 spin_unlock(&fs_info->trans_lock);
854 return 0;
855 }
856 spin_unlock(&fs_info->trans_lock);
857 868
858 trans = btrfs_join_transaction(root); 869 trans = btrfs_attach_transaction(root);
859 if (IS_ERR(trans)) 870 if (IS_ERR(trans)) {
871 /* no transaction, don't bother */
872 if (PTR_ERR(trans) == -ENOENT)
873 return 0;
860 return PTR_ERR(trans); 874 return PTR_ERR(trans);
875 }
861 return btrfs_commit_transaction(trans, root); 876 return btrfs_commit_transaction(trans, root);
862} 877}
863 878
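
btrfs_sync_fs() used to peek at fs_info->running_transaction under trans_lock and then join; with btrfs_attach_transaction() the "is there a transaction at all?" check and the join collapse into one call, with -ENOENT meaning there is nothing to commit. The btrfs_freeze() hunk below adopts the same pattern. A self-contained sketch of attach-then-commit, with stand-in types and a stub that always reports no transaction:

    #include <errno.h>
    #include <stddef.h>

    struct trans { int unused; };

    /* Stub: pretend no transaction is currently running. */
    static struct trans *attach_transaction(int *err)
    {
        *err = -ENOENT;
        return NULL;
    }

    static int commit_transaction(struct trans *t) { (void)t; return 0; }

    /* sync/freeze must not *create* a transaction just to commit it;
     * they only attach to one that already exists. */
    static int sync_fs_sketch(void)
    {
        int err = 0;
        struct trans *t = attach_transaction(&err);

        if (!t)
            return err == -ENOENT ? 0 : err;  /* nothing to commit */
        return commit_transaction(t);
    }
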
@@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1508 1523
1509static int btrfs_freeze(struct super_block *sb) 1524static int btrfs_freeze(struct super_block *sb)
1510{ 1525{
1511 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1526 struct btrfs_trans_handle *trans;
1512 mutex_lock(&fs_info->transaction_kthread_mutex); 1527 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1513 mutex_lock(&fs_info->cleaner_mutex); 1528
1514 return 0; 1529 trans = btrfs_attach_transaction(root);
1530 if (IS_ERR(trans)) {
1531 /* no transaction, don't bother */
1532 if (PTR_ERR(trans) == -ENOENT)
1533 return 0;
1534 return PTR_ERR(trans);
1535 }
1536 return btrfs_commit_transaction(trans, root);
1515} 1537}
1516 1538
1517static int btrfs_unfreeze(struct super_block *sb) 1539static int btrfs_unfreeze(struct super_block *sb)
1518{ 1540{
1519 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1520 mutex_unlock(&fs_info->cleaner_mutex);
1521 mutex_unlock(&fs_info->transaction_kthread_mutex);
1522 return 0; 1541 return 0;
1523} 1542}
1524 1543
@@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)
1595static void btrfs_interface_exit(void) 1614static void btrfs_interface_exit(void)
1596{ 1615{
1597 if (misc_deregister(&btrfs_misc) < 0) 1616 if (misc_deregister(&btrfs_misc) < 0)
1598 printk(KERN_INFO "misc_deregister failed for control device"); 1617 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1599} 1618}
1600 1619
1601static int __init init_btrfs_fs(void) 1620static int __init init_btrfs_fs(void)
@@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)
1620 if (err) 1639 if (err)
1621 goto free_extent_io; 1640 goto free_extent_io;
1622 1641
1623 err = btrfs_delayed_inode_init(); 1642 err = ordered_data_init();
1624 if (err) 1643 if (err)
1625 goto free_extent_map; 1644 goto free_extent_map;
1626 1645
1646 err = btrfs_delayed_inode_init();
1647 if (err)
1648 goto free_ordered_data;
1649
1627 err = btrfs_interface_init(); 1650 err = btrfs_interface_init();
1628 if (err) 1651 if (err)
1629 goto free_delayed_inode; 1652 goto free_delayed_inode;
@@ -1641,6 +1664,8 @@ unregister_ioctl:
1641 btrfs_interface_exit(); 1664 btrfs_interface_exit();
1642free_delayed_inode: 1665free_delayed_inode:
1643 btrfs_delayed_inode_exit(); 1666 btrfs_delayed_inode_exit();
1667free_ordered_data:
1668 ordered_data_exit();
1644free_extent_map: 1669free_extent_map:
1645 extent_map_exit(); 1670 extent_map_exit();
1646free_extent_io: 1671free_extent_io:
@@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)
1657{ 1682{
1658 btrfs_destroy_cachep(); 1683 btrfs_destroy_cachep();
1659 btrfs_delayed_inode_exit(); 1684 btrfs_delayed_inode_exit();
1685 ordered_data_exit();
1660 extent_map_exit(); 1686 extent_map_exit();
1661 extent_io_exit(); 1687 extent_io_exit();
1662 btrfs_interface_exit(); 1688 btrfs_interface_exit();
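
Slotting ordered_data_init() into init_btrfs_fs() follows the usual goto ladder: every fallible init step gets a label that unwinds, in reverse order, exactly the steps that already succeeded, and module exit tears down in the same reverse order. The pattern in userspace form, with the middle step playing the role of the newly inserted one:

    static int  init_a(void) { return 0; }
    static int  init_b(void) { return 0; }   /* the inserted step */
    static int  init_c(void) { return 0; }
    static void exit_a(void) { }
    static void exit_b(void) { }

    static int init_all(void)
    {
        int err;

        if ((err = init_a()))
            return err;
        if ((err = init_b()))
            goto undo_a;
        if ((err = init_c()))
            goto undo_b;
        return 0;

    undo_b:
        exit_b();
    undo_a:
        exit_a();
        return err;
    }
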
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e050..77db875b5116 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
53/* 53/*
54 * either allocate a new transaction or hop into the existing one 54 * either allocate a new transaction or hop into the existing one
55 */ 55 */
56static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int type)
57{ 57{
58 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info; 59 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
67 } 67 }
68 68
69 if (fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
70 if (!nofail) { 70 /*
71 * If we are JOIN_NOLOCK we're already committing a current
72 * transaction; we just need a handle to deal with something
73 * when committing the transaction, such as inode cache and
74 * space cache. It is a special case.
75 */
76 if (type != TRANS_JOIN_NOLOCK) {
71 spin_unlock(&fs_info->trans_lock); 77 spin_unlock(&fs_info->trans_lock);
72 return -EBUSY; 78 return -EBUSY;
73 } 79 }
@@ -87,6 +93,13 @@ loop:
87 } 93 }
88 spin_unlock(&fs_info->trans_lock); 94 spin_unlock(&fs_info->trans_lock);
89 95
96 /*
97 * If we are ATTACH, we just want to catch the current transaction
98 * and commit it. If there is no transaction, just return ENOENT.
99 */
100 if (type == TRANS_ATTACH)
101 return -ENOENT;
102
90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 103 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
91 if (!cur_trans) 104 if (!cur_trans)
92 return -ENOMEM; 105 return -ENOMEM;
@@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)
267 } 280 }
268} 281}
269 282
270enum btrfs_trans_type {
271 TRANS_START,
272 TRANS_JOIN,
273 TRANS_USERSPACE,
274 TRANS_JOIN_NOLOCK,
275};
276
277static int may_wait_transaction(struct btrfs_root *root, int type) 283static int may_wait_transaction(struct btrfs_root *root, int type)
278{ 284{
279 if (root->fs_info->log_root_recovering) 285 if (root->fs_info->log_root_recovering)
@@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
290} 296}
291 297
292static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
293 u64 num_items, int type) 299 u64 num_items, int type,
300 int noflush)
294{ 301{
295 struct btrfs_trans_handle *h; 302 struct btrfs_trans_handle *h;
296 struct btrfs_transaction *cur_trans; 303 struct btrfs_transaction *cur_trans;
@@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
324 } 331 }
325 332
326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327 ret = btrfs_block_rsv_add(root, 334 if (noflush)
328 &root->fs_info->trans_block_rsv, 335 ret = btrfs_block_rsv_add_noflush(root,
329 num_bytes); 336 &root->fs_info->trans_block_rsv,
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
330 if (ret) 342 if (ret)
331 return ERR_PTR(ret); 343 return ERR_PTR(ret);
332 } 344 }
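
start_transaction() now threads a noflush flag down to the metadata reservation: some callers must not let the reservation recurse into delalloc flushing (for instance because they already hold locks the flusher could need) and accept a higher chance of -ENOSPC instead. A tiny sketch of that caller-selected flavor; the reserve helpers are hypothetical:

    static int reserve_bytes(unsigned long n)         { (void)n; return 0; }
    static int reserve_bytes_noflush(unsigned long n) { (void)n; return 0; }

    static int reserve_trans_metadata(unsigned long num_bytes, int noflush)
    {
        /* noflush: never trigger writeback while reserving */
        return noflush ? reserve_bytes_noflush(num_bytes)
                       : reserve_bytes(num_bytes);
    }
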
@@ -335,19 +347,34 @@ again:
335 if (!h) 347 if (!h)
336 return ERR_PTR(-ENOMEM); 348 return ERR_PTR(-ENOMEM);
337 349
338 sb_start_intwrite(root->fs_info->sb); 350 /*
351 * If we are JOIN_NOLOCK we're already committing a transaction and
352 * waiting on this guy, so we don't need to do the sb_start_intwrite
353 * because we're already holding a ref. We need this because we could
354 * have raced in and done an fsync() on a file, which can kick off a commit
355 * and then we deadlock with somebody doing a freeze.
356 *
357 * If we are ATTACH, it means we just want to catch the current
358 * transaction and commit it, so we needn't do sb_start_intwrite().
359 */
360 if (type < TRANS_JOIN_NOLOCK)
361 sb_start_intwrite(root->fs_info->sb);
339 362
340 if (may_wait_transaction(root, type)) 363 if (may_wait_transaction(root, type))
341 wait_current_trans(root); 364 wait_current_trans(root);
342 365
343 do { 366 do {
344 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); 367 ret = join_transaction(root, type);
345 if (ret == -EBUSY) 368 if (ret == -EBUSY)
346 wait_current_trans(root); 369 wait_current_trans(root);
347 } while (ret == -EBUSY); 370 } while (ret == -EBUSY);
348 371
349 if (ret < 0) { 372 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb); 373 /* We must get the transaction if we are JOIN_NOLOCK. */
374 BUG_ON(type == TRANS_JOIN_NOLOCK);
375
376 if (type < TRANS_JOIN_NOLOCK)
377 sb_end_intwrite(root->fs_info->sb);
351 kmem_cache_free(btrfs_trans_handle_cachep, h); 378 kmem_cache_free(btrfs_trans_handle_cachep, h);
352 return ERR_PTR(ret); 379 return ERR_PTR(ret);
353 } 380 }
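
The join loop above treats -EBUSY as "a commit is blocking new joiners right now": wait for the running transaction and retry, while ATTACH never loops because join_transaction() returns -ENOENT for it instead. The retry discipline reduced to a stub:

    #include <errno.h>

    static int tries;

    /* Stub: pretend the first two attempts race with a commit. */
    static int try_join(void)            { return tries++ < 2 ? -EBUSY : 0; }
    static void wait_current_trans(void) { /* would block in the kernel */ }

    static int join_with_retry(void)
    {
        int ret;

        do {
            ret = try_join();
            if (ret == -EBUSY)
                wait_current_trans();
        } while (ret == -EBUSY);
        return ret;    /* 0, or a hard error such as -ENOENT */
    }
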
@@ -367,7 +394,9 @@ again:
367 h->aborted = 0; 394 h->aborted = 0;
368 h->qgroup_reserved = qgroup_reserved; 395 h->qgroup_reserved = qgroup_reserved;
369 h->delayed_ref_elem.seq = 0; 396 h->delayed_ref_elem.seq = 0;
397 h->type = type;
370 INIT_LIST_HEAD(&h->qgroup_ref_list); 398 INIT_LIST_HEAD(&h->qgroup_ref_list);
399 INIT_LIST_HEAD(&h->new_bgs);
371 400
372 smp_mb(); 401 smp_mb();
373 if (cur_trans->blocked && may_wait_transaction(root, type)) { 402 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +422,33 @@ got_it:
393struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
394 int num_items) 423 int num_items)
395{ 424{
396 return start_transaction(root, num_items, TRANS_START); 425 return start_transaction(root, num_items, TRANS_START, 0);
426}
427
428struct btrfs_trans_handle *btrfs_start_transaction_noflush(
429 struct btrfs_root *root, int num_items)
430{
431 return start_transaction(root, num_items, TRANS_START, 1);
397} 432}
433
398struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
399{ 435{
400 return start_transaction(root, 0, TRANS_JOIN); 436 return start_transaction(root, 0, TRANS_JOIN, 0);
401} 437}
402 438
403struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) 439struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
404{ 440{
405 return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 441 return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
406} 442}
407 443
408struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) 444struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
409{ 445{
410 return start_transaction(root, 0, TRANS_USERSPACE); 446 return start_transaction(root, 0, TRANS_USERSPACE, 0);
447}
448
449struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
450{
451 return start_transaction(root, 0, TRANS_ATTACH, 0);
411} 452}
412 453
413/* wait for a transaction commit to be fully complete */ 454/* wait for a transaction commit to be fully complete */
@@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
506} 547}
507 548
508static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 549static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
509 struct btrfs_root *root, int throttle, int lock) 550 struct btrfs_root *root, int throttle)
510{ 551{
511 struct btrfs_transaction *cur_trans = trans->transaction; 552 struct btrfs_transaction *cur_trans = trans->transaction;
512 struct btrfs_fs_info *info = root->fs_info; 553 struct btrfs_fs_info *info = root->fs_info;
513 int count = 0; 554 int count = 0;
555 int lock = (trans->type != TRANS_JOIN_NOLOCK);
514 int err = 0; 556 int err = 0;
515 557
516 if (--trans->use_count) { 558 if (--trans->use_count) {
@@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
536 trans->qgroup_reserved = 0; 578 trans->qgroup_reserved = 0;
537 } 579 }
538 580
581 if (!list_empty(&trans->new_bgs))
582 btrfs_create_pending_block_groups(trans, root);
583
539 while (count < 2) { 584 while (count < 2) {
540 unsigned long cur = trans->delayed_ref_updates; 585 unsigned long cur = trans->delayed_ref_updates;
541 trans->delayed_ref_updates = 0; 586 trans->delayed_ref_updates = 0;
@@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
551 btrfs_trans_release_metadata(trans, root); 596 btrfs_trans_release_metadata(trans, root);
552 trans->block_rsv = NULL; 597 trans->block_rsv = NULL;
553 598
554 sb_end_intwrite(root->fs_info->sb); 599 if (!list_empty(&trans->new_bgs))
600 btrfs_create_pending_block_groups(trans, root);
555 601
556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 602 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
557 should_end_transaction(trans, root)) { 603 should_end_transaction(trans, root)) {
@@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
573 } 619 }
574 } 620 }
575 621
622 if (trans->type < TRANS_JOIN_NOLOCK)
623 sb_end_intwrite(root->fs_info->sb);
624
576 WARN_ON(cur_trans != info->running_transaction); 625 WARN_ON(cur_trans != info->running_transaction);
577 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 626 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
578 atomic_dec(&cur_trans->num_writers); 627 atomic_dec(&cur_trans->num_writers);
@@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
604{ 653{
605 int ret; 654 int ret;
606 655
607 ret = __btrfs_end_transaction(trans, root, 0, 1); 656 ret = __btrfs_end_transaction(trans, root, 0);
608 if (ret) 657 if (ret)
609 return ret; 658 return ret;
610 return 0; 659 return 0;
@@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
615{ 664{
616 int ret; 665 int ret;
617 666
618 ret = __btrfs_end_transaction(trans, root, 1, 1); 667 ret = __btrfs_end_transaction(trans, root, 1);
619 if (ret)
620 return ret;
621 return 0;
622}
623
624int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root)
626{
627 int ret;
628
629 ret = __btrfs_end_transaction(trans, root, 0, 0);
630 if (ret) 668 if (ret)
631 return ret; 669 return ret;
632 return 0; 670 return 0;
@@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
635int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, 673int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
636 struct btrfs_root *root) 674 struct btrfs_root *root)
637{ 675{
638 return __btrfs_end_transaction(trans, root, 1, 1); 676 return __btrfs_end_transaction(trans, root, 1);
639} 677}
640 678
641/* 679/*
@@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
649 int err = 0; 687 int err = 0;
650 int werr = 0; 688 int werr = 0;
651 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 689 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
690 struct extent_state *cached_state = NULL;
652 u64 start = 0; 691 u64 start = 0;
653 u64 end; 692 u64 end;
654 693
655 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 694 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
656 mark)) { 695 mark, &cached_state)) {
657 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, 696 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
658 GFP_NOFS); 697 mark, &cached_state, GFP_NOFS);
698 cached_state = NULL;
659 err = filemap_fdatawrite_range(mapping, start, end); 699 err = filemap_fdatawrite_range(mapping, start, end);
660 if (err) 700 if (err)
661 werr = err; 701 werr = err;
@@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
679 int err = 0; 719 int err = 0;
680 int werr = 0; 720 int werr = 0;
681 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 721 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
722 struct extent_state *cached_state = NULL;
682 u64 start = 0; 723 u64 start = 0;
683 u64 end; 724 u64 end;
684 725
685 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 726 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
686 EXTENT_NEED_WAIT)) { 727 EXTENT_NEED_WAIT, &cached_state)) {
687 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); 728 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
729 0, 0, &cached_state, GFP_NOFS);
688 err = filemap_fdatawait_range(mapping, start, end); 730 err = filemap_fdatawait_range(mapping, start, end);
689 if (err) 731 if (err)
690 werr = err; 732 werr = err;
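
Both marked-extent walkers now pass a cached_state through find_first_extent_bit(), letting each iteration resume from the extent_state where the previous lookup ended rather than re-searching the tree from the top (the cache is dropped once the range's bits are converted or cleared). The shape of that cursor-caching pattern, reduced to plain C:

    struct cursor {
        unsigned long next;   /* where the previous lookup stopped */
        int valid;
    };

    /* Stand-in for a range lookup: with a valid cursor it resumes there,
     * otherwise it pays the full search cost from 'start'. */
    static unsigned long find_from(unsigned long start, struct cursor *c)
    {
        unsigned long pos = (c && c->valid && c->next > start) ? c->next
                                                               : start;
        /* ... locate the next marked range at or after pos ... */
        if (c) {
            c->next = pos + 1;
            c->valid = 1;
        }
        return pos;
    }
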
@@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
955 struct btrfs_root *parent_root; 997 struct btrfs_root *parent_root;
956 struct btrfs_block_rsv *rsv; 998 struct btrfs_block_rsv *rsv;
957 struct inode *parent_inode; 999 struct inode *parent_inode;
1000 struct btrfs_path *path;
1001 struct btrfs_dir_item *dir_item;
958 struct dentry *parent; 1002 struct dentry *parent;
959 struct dentry *dentry; 1003 struct dentry *dentry;
960 struct extent_buffer *tmp; 1004 struct extent_buffer *tmp;
@@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 u64 root_flags; 1011 u64 root_flags;
968 uuid_le new_uuid; 1012 uuid_le new_uuid;
969 1013
970 rsv = trans->block_rsv; 1014 path = btrfs_alloc_path();
1015 if (!path) {
1016 ret = pending->error = -ENOMEM;
1017 goto path_alloc_fail;
1018 }
971 1019
972 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1020 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
973 if (!new_root_item) { 1021 if (!new_root_item) {
974 ret = pending->error = -ENOMEM; 1022 ret = pending->error = -ENOMEM;
975 goto fail; 1023 goto root_item_alloc_fail;
976 } 1024 }
977 1025
978 ret = btrfs_find_free_objectid(tree_root, &objectid); 1026 ret = btrfs_find_free_objectid(tree_root, &objectid);
979 if (ret) { 1027 if (ret) {
980 pending->error = ret; 1028 pending->error = ret;
981 goto fail; 1029 goto no_free_objectid;
982 } 1030 }
983 1031
984 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
988 to_reserve); 1036 to_reserve);
989 if (ret) { 1037 if (ret) {
990 pending->error = ret; 1038 pending->error = ret;
991 goto fail; 1039 goto no_free_objectid;
992 } 1040 }
993 } 1041 }
994 1042
995 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, 1043 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
996 objectid, pending->inherit); 1044 objectid, pending->inherit);
997 kfree(pending->inherit);
998 if (ret) { 1045 if (ret) {
999 pending->error = ret; 1046 pending->error = ret;
1000 goto fail; 1047 goto no_free_objectid;
1001 } 1048 }
1002 1049
1003 key.objectid = objectid; 1050 key.objectid = objectid;
1004 key.offset = (u64)-1; 1051 key.offset = (u64)-1;
1005 key.type = BTRFS_ROOT_ITEM_KEY; 1052 key.type = BTRFS_ROOT_ITEM_KEY;
1006 1053
1054 rsv = trans->block_rsv;
1007 trans->block_rsv = &pending->block_rsv; 1055 trans->block_rsv = &pending->block_rsv;
1008 1056
1009 dentry = pending->dentry; 1057 dentry = pending->dentry;
@@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1017 */ 1065 */
1018 ret = btrfs_set_inode_index(parent_inode, &index); 1066 ret = btrfs_set_inode_index(parent_inode, &index);
1019 BUG_ON(ret); /* -ENOMEM */ 1067 BUG_ON(ret); /* -ENOMEM */
1020 ret = btrfs_insert_dir_item(trans, parent_root, 1068
1021 dentry->d_name.name, dentry->d_name.len, 1069 /* check if there is a file/dir which has the same name. */
1022 parent_inode, &key, 1070 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
1023 BTRFS_FT_DIR, index); 1071 btrfs_ino(parent_inode),
1024 if (ret == -EEXIST) { 1072 dentry->d_name.name,
1073 dentry->d_name.len, 0);
1074 if (dir_item != NULL && !IS_ERR(dir_item)) {
1025 pending->error = -EEXIST; 1075 pending->error = -EEXIST;
1026 dput(parent);
1027 goto fail; 1076 goto fail;
1028 } else if (ret) { 1077 } else if (IS_ERR(dir_item)) {
1029 goto abort_trans_dput; 1078 ret = PTR_ERR(dir_item);
1079 btrfs_abort_transaction(trans, root, ret);
1080 goto fail;
1030 } 1081 }
1031 1082 btrfs_release_path(path);
1032 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1033 dentry->d_name.len * 2);
1034 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1035 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1036 if (ret)
1037 goto abort_trans_dput;
1038 1083
1039 /* 1084 /*
1040 * pull in the delayed directory update 1085 * pull in the delayed directory update
@@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1043 * snapshot 1088 * snapshot
1044 */ 1089 */
1045 ret = btrfs_run_delayed_items(trans, root); 1090 ret = btrfs_run_delayed_items(trans, root);
1046 if (ret) { /* Transaction aborted */ 1091 if (ret) { /* Transaction aborted */
1047 dput(parent); 1092 btrfs_abort_transaction(trans, root, ret);
1048 goto fail; 1093 goto fail;
1049 } 1094 }
1050 1095
@@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1079 if (ret) { 1124 if (ret) {
1080 btrfs_tree_unlock(old); 1125 btrfs_tree_unlock(old);
1081 free_extent_buffer(old); 1126 free_extent_buffer(old);
1082 goto abort_trans_dput; 1127 btrfs_abort_transaction(trans, root, ret);
1128 goto fail;
1083 } 1129 }
1084 1130
1085 btrfs_set_lock_blocking(old); 1131 btrfs_set_lock_blocking(old);
@@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1088 /* clean up in any case */ 1134 /* clean up in any case */
1089 btrfs_tree_unlock(old); 1135 btrfs_tree_unlock(old);
1090 free_extent_buffer(old); 1136 free_extent_buffer(old);
1091 if (ret) 1137 if (ret) {
1092 goto abort_trans_dput; 1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140 }
1093 1141
1094 /* see comments in should_cow_block() */ 1142 /* see comments in should_cow_block() */
1095 root->force_cow = 1; 1143 root->force_cow = 1;
@@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1101 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1149 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1102 btrfs_tree_unlock(tmp); 1150 btrfs_tree_unlock(tmp);
1103 free_extent_buffer(tmp); 1151 free_extent_buffer(tmp);
1104 if (ret) 1152 if (ret) {
1105 goto abort_trans_dput; 1153 btrfs_abort_transaction(trans, root, ret);
1154 goto fail;
1155 }
1106 1156
1107 /* 1157 /*
1108 * insert root back/forward references 1158 * insert root back/forward references
@@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1111 parent_root->root_key.objectid, 1161 parent_root->root_key.objectid,
1112 btrfs_ino(parent_inode), index, 1162 btrfs_ino(parent_inode), index,
1113 dentry->d_name.name, dentry->d_name.len); 1163 dentry->d_name.name, dentry->d_name.len);
1114 dput(parent); 1164 if (ret) {
1115 if (ret) 1165 btrfs_abort_transaction(trans, root, ret);
1116 goto fail; 1166 goto fail;
1167 }
1117 1168
1118 key.offset = (u64)-1; 1169 key.offset = (u64)-1;
1119 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1170 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1120 if (IS_ERR(pending->snap)) { 1171 if (IS_ERR(pending->snap)) {
1121 ret = PTR_ERR(pending->snap); 1172 ret = PTR_ERR(pending->snap);
1122 goto abort_trans; 1173 btrfs_abort_transaction(trans, root, ret);
1174 goto fail;
1123 } 1175 }
1124 1176
1125 ret = btrfs_reloc_post_snapshot(trans, pending); 1177 ret = btrfs_reloc_post_snapshot(trans, pending);
1178 if (ret) {
1179 btrfs_abort_transaction(trans, root, ret);
1180 goto fail;
1181 }
1182
1183 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1184 if (ret) {
1185 btrfs_abort_transaction(trans, root, ret);
1186 goto fail;
1187 }
1188
1189 ret = btrfs_insert_dir_item(trans, parent_root,
1190 dentry->d_name.name, dentry->d_name.len,
1191 parent_inode, &key,
1192 BTRFS_FT_DIR, index);
1193 /* We checked the name at the beginning, so -EEXIST is impossible here. */
1194 BUG_ON(ret == -EEXIST);
1195 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret);
1197 goto fail;
1198 }
1199
1200 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1201 dentry->d_name.len * 2);
1202 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1203 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1126 if (ret) 1204 if (ret)
1127 goto abort_trans; 1205 btrfs_abort_transaction(trans, root, ret);
1128 ret = 0;
1129fail: 1206fail:
1130 kfree(new_root_item); 1207 dput(parent);
1131 trans->block_rsv = rsv; 1208 trans->block_rsv = rsv;
1209no_free_objectid:
1210 kfree(new_root_item);
1211root_item_alloc_fail:
1212 btrfs_free_path(path);
1213path_alloc_fail:
1132 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1214 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1133 return ret; 1215 return ret;
1134
1135abort_trans_dput:
1136 dput(parent);
1137abort_trans:
1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140} 1216}
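
The restructure above retires the abort_trans/abort_trans_dput tail in favor of calling btrfs_abort_transaction() at each failure site plus a ladder of cleanup labels (fail, no_free_objectid, root_item_alloc_fail, path_alloc_fail), so each label releases exactly what had been acquired by the time of the jump and dput(parent) runs exactly once. The label discipline in miniature:

    #include <stdlib.h>

    static int do_work(void) { return 0; }

    static int snapshot_sketch(void)
    {
        int ret = -1;
        void *path, *item;

        path = malloc(16);
        if (!path)
            goto path_alloc_fail;

        item = malloc(16);
        if (!item)
            goto item_alloc_fail;

        ret = do_work();

        free(item);        /* cleanup runs in reverse acquisition order; */
    item_alloc_fail:       /* the success path shares the same teardown  */
        free(path);
    path_alloc_fail:
        return ret;
    }
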
1141 1217
1142/* 1218/*
@@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)
1229 struct btrfs_async_commit *ac = 1305 struct btrfs_async_commit *ac =
1230 container_of(work, struct btrfs_async_commit, work.work); 1306 container_of(work, struct btrfs_async_commit, work.work);
1231 1307
1308 /*
1309 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it.
1311 */
1312 rwsem_acquire_read(
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315
1316 current->journal_info = ac->newtrans;
1317
1232 btrfs_commit_transaction(ac->newtrans, ac->root); 1318 btrfs_commit_transaction(ac->newtrans, ac->root);
1233 kfree(ac); 1319 kfree(ac);
1234} 1320}
@@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1258 atomic_inc(&cur_trans->use_count); 1344 atomic_inc(&cur_trans->use_count);
1259 1345
1260 btrfs_end_transaction(trans, root); 1346 btrfs_end_transaction(trans, root);
1347
1348 /*
1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it.
1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1353 1, _THIS_IP_);
1354
1261 schedule_delayed_work(&ac->work, 0); 1355 schedule_delayed_work(&ac->work, 0);
1262 1356
1263 /* wait for transaction to start and unblock */ 1357 /* wait for transaction to start and unblock */
@@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1348 */ 1442 */
1349 cur_trans->delayed_refs.flushing = 1; 1443 cur_trans->delayed_refs.flushing = 1;
1350 1444
1445 if (!list_empty(&trans->new_bgs))
1446 btrfs_create_pending_block_groups(trans, root);
1447
1351 ret = btrfs_run_delayed_refs(trans, root, 0); 1448 ret = btrfs_run_delayed_refs(trans, root, 0);
1352 if (ret) 1449 if (ret)
1353 goto cleanup_transaction; 1450 goto cleanup_transaction;
@@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1403 1500
1404 if (flush_on_commit || snap_pending) { 1501 if (flush_on_commit || snap_pending) {
1405 btrfs_start_delalloc_inodes(root, 1); 1502 btrfs_start_delalloc_inodes(root, 1);
1406 btrfs_wait_ordered_extents(root, 0, 1); 1503 btrfs_wait_ordered_extents(root, 1);
1407 } 1504 }
1408 1505
1409 ret = btrfs_run_delayed_items(trans, root); 1506 ret = btrfs_run_delayed_items(trans, root);
@@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1456 */ 1553 */
1457 mutex_lock(&root->fs_info->reloc_mutex); 1554 mutex_lock(&root->fs_info->reloc_mutex);
1458 1555
1459 ret = btrfs_run_delayed_items(trans, root); 1556 /*
1557 * We needn't worry about the delayed items because we will
1558 * deal with them in create_pending_snapshot(), which is the
1559 * core function of the snapshot creation.
1560 */
1561 ret = create_pending_snapshots(trans, root->fs_info);
1460 if (ret) { 1562 if (ret) {
1461 mutex_unlock(&root->fs_info->reloc_mutex); 1563 mutex_unlock(&root->fs_info->reloc_mutex);
1462 goto cleanup_transaction; 1564 goto cleanup_transaction;
1463 } 1565 }
1464 1566
1465 ret = create_pending_snapshots(trans, root->fs_info); 1567 /*
1568 * We insert the dir indexes of the snapshots and update the inode
1569 * of the snapshots' parents after the snapshot creation, so there
1570 * are some delayed items which are not dealt with. Now deal with
1571 * them.
1572 *
1573 * We needn't worry that this operation will corrupt the snapshots,
1574 * because all the trees which are snapshotted will be forced to COW
1575 * the nodes and leaves.
1576 */
1577 ret = btrfs_run_delayed_items(trans, root);
1466 if (ret) { 1578 if (ret) {
1467 mutex_unlock(&root->fs_info->reloc_mutex); 1579 mutex_unlock(&root->fs_info->reloc_mutex);
1468 goto cleanup_transaction; 1580 goto cleanup_transaction;
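
Swapping the two calls is the point of this hunk: create_pending_snapshots() queues the snapshot's dir-index insert and parent-inode update as delayed items, and the btrfs_run_delayed_items() pass that now follows flushes them before the commit proceeds; in the old order those items were generated after the only flush. The dependency as a two-step sketch with stub helpers:

    static int create_pending_snapshots_stub(void) { return 0; }
    static int run_delayed_items_stub(void)        { return 0; }

    static int commit_order_sketch(void)
    {
        int ret;

        /* Snapshots first: they queue the dir-index insert and the
         * parent-inode update as delayed items ... */
        ret = create_pending_snapshots_stub();
        if (ret)
            return ret;
        /* ... which this pass then flushes before the commit moves on.
         * COW of the just-snapshotted trees keeps this safe. */
        return run_delayed_items_stub();
    }
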
@@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1584 put_transaction(cur_trans); 1696 put_transaction(cur_trans);
1585 put_transaction(cur_trans); 1697 put_transaction(cur_trans);
1586 1698
1587 sb_end_intwrite(root->fs_info->sb); 1699 if (trans->type < TRANS_JOIN_NOLOCK)
1700 sb_end_intwrite(root->fs_info->sb);
1588 1701
1589 trace_btrfs_transaction_commit(root); 1702 trace_btrfs_transaction_commit(root);
1590 1703
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688b..80961947a6b2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
47 int aborted; 47 int aborted;
48}; 48};
49 49
50enum btrfs_trans_type {
51 TRANS_START,
52 TRANS_JOIN,
53 TRANS_USERSPACE,
54 TRANS_JOIN_NOLOCK,
55 TRANS_ATTACH,
56};
57
50struct btrfs_trans_handle { 58struct btrfs_trans_handle {
51 u64 transid; 59 u64 transid;
52 u64 bytes_reserved; 60 u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
58 struct btrfs_transaction *transaction; 66 struct btrfs_transaction *transaction;
59 struct btrfs_block_rsv *block_rsv; 67 struct btrfs_block_rsv *block_rsv;
60 struct btrfs_block_rsv *orig_rsv; 68 struct btrfs_block_rsv *orig_rsv;
61 int aborted; 69 short aborted;
62 int adding_csums; 70 short adding_csums;
71 enum btrfs_trans_type type;
63 /* 72 /*
64 * this root is only needed to validate that the root passed to 73 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction. 74 * start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
68 struct btrfs_root *root; 77 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem; 78 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list; 79 struct list_head qgroup_ref_list;
80 struct list_head new_bgs;
71}; 81};
72 82
73struct btrfs_pending_snapshot { 83struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
88{ 98{
89 BTRFS_I(inode)->last_trans = trans->transaction->transid; 99 BTRFS_I(inode)->last_trans = trans->transaction->transid;
90 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 100 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
101 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
91} 102}
92 103
93int btrfs_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_end_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 105 struct btrfs_root *root);
95int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root);
97struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
98 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush(
109 struct btrfs_root *root, int num_items);
99struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
100struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
101struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
102int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
103int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..81e407d9677a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,16 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
24#include "locking.h" 25#include "locking.h"
25#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h"
26#include "compat.h" 28#include "compat.h"
27#include "tree-log.h" 29#include "tree-log.h"
30#include "hash.h"
28 31
29/* magic values for the inode_only field in btrfs_log_inode: 32/* magic values for the inode_only field in btrfs_log_inode:
30 * 33 *
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
146 root->log_multiple_pids = true; 149 root->log_multiple_pids = true;
147 } 150 }
148 151
149 root->log_batch++; 152 atomic_inc(&root->log_batch);
150 atomic_inc(&root->log_writers); 153 atomic_inc(&root->log_writers);
151 mutex_unlock(&root->log_mutex); 154 mutex_unlock(&root->log_mutex);
152 return 0; 155 return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
165 err = ret; 168 err = ret;
166 } 169 }
167 mutex_unlock(&root->fs_info->tree_log_mutex); 170 mutex_unlock(&root->fs_info->tree_log_mutex);
168 root->log_batch++; 171 atomic_inc(&root->log_batch);
169 atomic_inc(&root->log_writers); 172 atomic_inc(&root->log_writers);
170 mutex_unlock(&root->log_mutex); 173 mutex_unlock(&root->log_mutex);
171 return err; 174 return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
484 int found_type; 487 int found_type;
485 u64 mask = root->sectorsize - 1; 488 u64 mask = root->sectorsize - 1;
486 u64 extent_end; 489 u64 extent_end;
487 u64 alloc_hint;
488 u64 start = key->offset; 490 u64 start = key->offset;
489 u64 saved_nbytes; 491 u64 saved_nbytes;
490 struct btrfs_file_extent_item *item; 492 struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 552
551 saved_nbytes = inode_get_bytes(inode); 553 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 554 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 555 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
554 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
557 if (found_type == BTRFS_FILE_EXTENT_REG || 558 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
744 */ 745 */
745static noinline int backref_in_log(struct btrfs_root *log, 746static noinline int backref_in_log(struct btrfs_root *log,
746 struct btrfs_key *key, 747 struct btrfs_key *key,
748 u64 ref_objectid,
747 char *name, int namelen) 749 char *name, int namelen)
748{ 750{
749 struct btrfs_path *path; 751 struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
764 if (ret != 0) 766 if (ret != 0)
765 goto out; 767 goto out;
766 768
767 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
768 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 769 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
770
771 if (key->type == BTRFS_INODE_EXTREF_KEY) {
772 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
773 name, namelen, NULL))
774 match = 1;
775
776 goto out;
777 }
778
779 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
769 ptr_end = ptr + item_size; 780 ptr_end = ptr + item_size;
770 while (ptr < ptr_end) { 781 while (ptr < ptr_end) {
771 ref = (struct btrfs_inode_ref *)ptr; 782 ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
786 return match; 797 return match;
787} 798}
788 799
789 800static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
790/*
791 * replay one inode back reference item found in the log tree.
792 * eb, slot and key refer to the buffer and key found in the log tree.
793 * root is the destination we are replaying into, and path is for temp
794 * use by this function. (it should be released on return).
795 */
796static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 struct btrfs_root *root, 801 struct btrfs_root *root,
798 struct btrfs_root *log,
799 struct btrfs_path *path, 802 struct btrfs_path *path,
800 struct extent_buffer *eb, int slot, 803 struct btrfs_root *log_root,
801 struct btrfs_key *key) 804 struct inode *dir, struct inode *inode,
805 struct extent_buffer *eb,
806 u64 inode_objectid, u64 parent_objectid,
807 u64 ref_index, char *name, int namelen,
808 int *search_done)
802{ 809{
803 struct btrfs_inode_ref *ref;
804 struct btrfs_dir_item *di;
805 struct inode *dir;
806 struct inode *inode;
807 unsigned long ref_ptr;
808 unsigned long ref_end;
809 char *name;
810 int namelen;
811 int ret; 810 int ret;
812 int search_done = 0; 811 char *victim_name;
813 812 int victim_name_len;
814 /* 813 struct extent_buffer *leaf;
815 * it is possible that we didn't log all the parent directories 814 struct btrfs_dir_item *di;
816 * for a given inode. If we don't find the dir, just don't 815 struct btrfs_key search_key;
817 * copy the back ref in. The link count fixup code will take 816 struct btrfs_inode_extref *extref;
818 * care of the rest
819 */
820 dir = read_one_inode(root, key->offset);
821 if (!dir)
822 return -ENOENT;
823
824 inode = read_one_inode(root, key->objectid);
825 if (!inode) {
826 iput(dir);
827 return -EIO;
828 }
829
830 ref_ptr = btrfs_item_ptr_offset(eb, slot);
831 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
832 817
833again: 818again:
834 ref = (struct btrfs_inode_ref *)ref_ptr; 819 /* Search old style refs */
835 820 search_key.objectid = inode_objectid;
836 namelen = btrfs_inode_ref_name_len(eb, ref); 821 search_key.type = BTRFS_INODE_REF_KEY;
837 name = kmalloc(namelen, GFP_NOFS); 822 search_key.offset = parent_objectid;
838 BUG_ON(!name); 823 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
839
840 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
841
842 /* if we already have a perfect match, we're done */
843 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
844 btrfs_inode_ref_index(eb, ref),
845 name, namelen)) {
846 goto out;
847 }
848
849 /*
850 * look for a conflicting back reference in the metadata.
851 * if we find one we have to unlink that name of the file
852 * before we add our new link. Later on, we overwrite any
853 * existing back reference, and we don't want to create
854 * dangling pointers in the directory.
855 */
856
857 if (search_done)
858 goto insert;
859
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) { 824 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref; 825 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr; 826 unsigned long ptr;
866 unsigned long ptr_end; 827 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0]; 828
829 leaf = path->nodes[0];
868 830
869 /* are we trying to overwrite a back ref for the root directory 831 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done 832 * if so, just jump out, we're done
871 */ 833 */
872 if (key->objectid == key->offset) 834 if (search_key.objectid == search_key.offset)
873 goto out_nowrite; 835 return 1;
874 836
875 /* check all the names in this back reference to see 837 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay 838 * if they are in the log. if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
889 (unsigned long)(victim_ref + 1), 851 (unsigned long)(victim_ref + 1),
890 victim_name_len); 852 victim_name_len);
891 853
892 if (!backref_in_log(log, key, victim_name, 854 if (!backref_in_log(log_root, &search_key,
855 parent_objectid,
856 victim_name,
893 victim_name_len)) { 857 victim_name_len)) {
894 btrfs_inc_nlink(inode); 858 btrfs_inc_nlink(inode);
895 btrfs_release_path(path); 859 btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
897 ret = btrfs_unlink_inode(trans, root, dir, 861 ret = btrfs_unlink_inode(trans, root, dir,
898 inode, victim_name, 862 inode, victim_name,
899 victim_name_len); 863 victim_name_len);
864 BUG_ON(ret);
900 btrfs_run_delayed_items(trans, root); 865 btrfs_run_delayed_items(trans, root);
866 kfree(victim_name);
867 *search_done = 1;
868 goto again;
901 } 869 }
902 kfree(victim_name); 870 kfree(victim_name);
871
903 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 872 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
904 } 873 }
905 BUG_ON(ret); 874 BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
908 * NOTE: we have searched the root tree and checked the 877
909 * corresponding ref; it does not need to be checked again. 878
910 */ 879 */
911 search_done = 1; 880 *search_done = 1;
881 }
882 btrfs_release_path(path);
883
884 /* Same search but for extended refs */
885 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
886 inode_objectid, parent_objectid, 0,
887 0);
888 if (!IS_ERR_OR_NULL(extref)) {
889 u32 item_size;
890 u32 cur_offset = 0;
891 unsigned long base;
892 struct inode *victim_parent;
893
894 leaf = path->nodes[0];
895
896 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
897 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
898
899 while (cur_offset < item_size) {
900 extref = (struct btrfs_inode_extref *)base + cur_offset;
901
902 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
903
904 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
905 goto next;
906
907 victim_name = kmalloc(victim_name_len, GFP_NOFS);
908 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
909 victim_name_len);
910
911 search_key.objectid = inode_objectid;
912 search_key.type = BTRFS_INODE_EXTREF_KEY;
913 search_key.offset = btrfs_extref_hash(parent_objectid,
914 victim_name,
915 victim_name_len);
916 ret = 0;
917 if (!backref_in_log(log_root, &search_key,
918 parent_objectid, victim_name,
919 victim_name_len)) {
920 ret = -ENOENT;
921 victim_parent = read_one_inode(root,
922 parent_objectid);
923 if (victim_parent) {
924 btrfs_inc_nlink(inode);
925 btrfs_release_path(path);
926
927 ret = btrfs_unlink_inode(trans, root,
928 victim_parent,
929 inode,
930 victim_name,
931 victim_name_len);
932 btrfs_run_delayed_items(trans, root);
933 }
934 BUG_ON(ret);
935 iput(victim_parent);
936 kfree(victim_name);
937 *search_done = 1;
938 goto again;
939 }
940 kfree(victim_name);
941 BUG_ON(ret);
942next:
943 cur_offset += victim_name_len + sizeof(*extref);
944 }
945 *search_done = 1;
912 } 946 }
913 btrfs_release_path(path); 947 btrfs_release_path(path);
914 948
915 /* look for a conflicting sequence number */ 949 /* look for a conflicting sequence number */
916 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 950 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
917 btrfs_inode_ref_index(eb, ref), 951 ref_index, name, namelen, 0);
918 name, namelen, 0);
919 if (di && !IS_ERR(di)) { 952 if (di && !IS_ERR(di)) {
920 ret = drop_one_dir_item(trans, root, path, dir, di); 953 ret = drop_one_dir_item(trans, root, path, dir, di);
921 BUG_ON(ret); 954 BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
931 } 964 }
932 btrfs_release_path(path); 965 btrfs_release_path(path);
933 966
934insert: 967 return 0;
935 /* insert our name */ 968}
936 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
937 btrfs_inode_ref_index(eb, ref));
938 BUG_ON(ret);
939 969
940 btrfs_update_inode(trans, root, inode); 970static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
971 u32 *namelen, char **name, u64 *index,
972 u64 *parent_objectid)
973{
974 struct btrfs_inode_extref *extref;
941 975
942out: 976 extref = (struct btrfs_inode_extref *)ref_ptr;
943 ref_ptr = (unsigned long)(ref + 1) + namelen; 977
944 kfree(name); 978 *namelen = btrfs_inode_extref_name_len(eb, extref);
945 if (ref_ptr < ref_end) 979 *name = kmalloc(*namelen, GFP_NOFS);
946 goto again; 980 if (*name == NULL)
981 return -ENOMEM;
982
983 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
984 *namelen);
985
986 *index = btrfs_inode_extref_index(eb, extref);
987 if (parent_objectid)
988 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
989
990 return 0;
991}
992
993static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
994 u32 *namelen, char **name, u64 *index)
995{
996 struct btrfs_inode_ref *ref;
997
998 ref = (struct btrfs_inode_ref *)ref_ptr;
999
1000 *namelen = btrfs_inode_ref_name_len(eb, ref);
1001 *name = kmalloc(*namelen, GFP_NOFS);
1002 if (*name == NULL)
1003 return -ENOMEM;
1004
1005 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1006
1007 *index = btrfs_inode_ref_index(eb, ref);
1008
1009 return 0;
1010}
1011
1012/*
1013 * replay one inode back reference item found in the log tree.
1014 * eb, slot and key refer to the buffer and key found in the log tree.
1015 * root is the destination we are replaying into, and path is for temp
1016 * use by this function. (it should be released on return).
1017 */
1018static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root,
1020 struct btrfs_root *log,
1021 struct btrfs_path *path,
1022 struct extent_buffer *eb, int slot,
1023 struct btrfs_key *key)
1024{
1025 struct inode *dir;
1026 struct inode *inode;
1027 unsigned long ref_ptr;
1028 unsigned long ref_end;
1029 char *name;
1030 int namelen;
1031 int ret;
1032 int search_done = 0;
1033 int log_ref_ver = 0;
1034 u64 parent_objectid;
1035 u64 inode_objectid;
1036 u64 ref_index = 0;
1037 int ref_struct_size;
1038
1039 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1040 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1041
1042 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1043 struct btrfs_inode_extref *r;
1044
1045 ref_struct_size = sizeof(struct btrfs_inode_extref);
1046 log_ref_ver = 1;
1047 r = (struct btrfs_inode_extref *)ref_ptr;
1048 parent_objectid = btrfs_inode_extref_parent(eb, r);
1049 } else {
1050 ref_struct_size = sizeof(struct btrfs_inode_ref);
1051 parent_objectid = key->offset;
1052 }
1053 inode_objectid = key->objectid;
1054
1055 /*
1056 * it is possible that we didn't log all the parent directories
1057 * for a given inode. If we don't find the dir, just don't
1058 * copy the back ref in. The link count fixup code will take
1059 * care of the rest
1060 */
1061 dir = read_one_inode(root, parent_objectid);
1062 if (!dir)
1063 return -ENOENT;
1064
1065 inode = read_one_inode(root, inode_objectid);
1066 if (!inode) {
1067 iput(dir);
1068 return -EIO;
1069 }
1070
1071 while (ref_ptr < ref_end) {
1072 if (log_ref_ver) {
1073 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1074 &ref_index, &parent_objectid);
1075 /*
1076 * parent object can change from one array
1077 * item to another.
1078 */
1079 if (!dir)
1080 dir = read_one_inode(root, parent_objectid);
1081 if (!dir)
1082 return -ENOENT;
1083 } else {
1084 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1085 &ref_index);
1086 }
1087 if (ret)
1088 return ret;
1089
1090 /* if we already have a perfect match, we're done */
1091 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1092 ref_index, name, namelen)) {
1093 /*
1094 * look for a conflicting back reference in the
1095 * metadata. if we find one we have to unlink that name
1096 * of the file before we add our new link. Later on, we
1097 * overwrite any existing back reference, and we don't
1098 * want to create dangling pointers in the directory.
1099 */
1100
1101 if (!search_done) {
1102 ret = __add_inode_ref(trans, root, path, log,
1103 dir, inode, eb,
1104 inode_objectid,
1105 parent_objectid,
1106 ref_index, name, namelen,
1107 &search_done);
1108 if (ret == 1)
1109 goto out;
1110 BUG_ON(ret);
1111 }
1112
1113 /* insert our name */
1114 ret = btrfs_add_link(trans, dir, inode, name, namelen,
1115 0, ref_index);
1116 BUG_ON(ret);
1117
1118 btrfs_update_inode(trans, root, inode);
1119 }
1120
1121 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1122 kfree(name);
1123 if (log_ref_ver) {
1124 iput(dir);
1125 dir = NULL;
1126 }
1127 }
947 1128
948 /* finally write the back reference in the inode */ 1129 /* finally write the back reference in the inode */
949 ret = overwrite_item(trans, root, path, eb, slot, key); 1130 ret = overwrite_item(trans, root, path, eb, slot, key);
950 BUG_ON(ret); 1131 BUG_ON(ret);
951 1132
952out_nowrite: 1133out:
953 btrfs_release_path(path); 1134 btrfs_release_path(path);
954 iput(dir); 1135 iput(dir);
955 iput(inode); 1136 iput(inode);
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
966 return ret; 1147 return ret;
967} 1148}
968 1149
1150static int count_inode_extrefs(struct btrfs_root *root,
1151 struct inode *inode, struct btrfs_path *path)
1152{
1153 int ret = 0;
1154 int name_len;
1155 unsigned int nlink = 0;
1156 u32 item_size;
1157 u32 cur_offset = 0;
1158 u64 inode_objectid = btrfs_ino(inode);
1159 u64 offset = 0;
1160 unsigned long ptr;
1161 struct btrfs_inode_extref *extref;
1162 struct extent_buffer *leaf;
969 1163
970/* 1164 while (1) {
971 * There are a few corners where the link count of the file can't 1165 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
972 * be properly maintained during replay. So, instead of adding 1166 &extref, &offset);
973 * lots of complexity to the log code, we just scan the backrefs 1167 if (ret)
974 * for any file that has been through replay. 1168 break;
975 * 1169
976 * The scan will update the link count on the inode to reflect the 1170 leaf = path->nodes[0];
977 * number of back refs found. If it goes down to zero, the iput 1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
978 * will free the inode. 1172 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
979 */ 1173
980static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1174 while (cur_offset < item_size) {
981 struct btrfs_root *root, 1175 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
982 struct inode *inode) 1176 name_len = btrfs_inode_extref_name_len(leaf, extref);
1177
1178 nlink++;
1179
1180 cur_offset += name_len + sizeof(*extref);
1181 }
1182
1183 offset++;
1184 btrfs_release_path(path);
1185 }
1186 btrfs_release_path(path);
1187
1188 if (ret < 0)
1189 return ret;
1190 return nlink;
1191}
1192
1193static int count_inode_refs(struct btrfs_root *root,
1194 struct inode *inode, struct btrfs_path *path)
983{ 1195{
984 struct btrfs_path *path;
985 int ret; 1196 int ret;
986 struct btrfs_key key; 1197 struct btrfs_key key;
987 u64 nlink = 0; 1198 unsigned int nlink = 0;
988 unsigned long ptr; 1199 unsigned long ptr;
989 unsigned long ptr_end; 1200 unsigned long ptr_end;
990 int name_len; 1201 int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
994 key.type = BTRFS_INODE_REF_KEY; 1205 key.type = BTRFS_INODE_REF_KEY;
995 key.offset = (u64)-1; 1206 key.offset = (u64)-1;
996 1207
997 path = btrfs_alloc_path();
998 if (!path)
999 return -ENOMEM;
1000
1001 while (1) { 1208 while (1) {
1002 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1209 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003 if (ret < 0) 1210 if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1031 btrfs_release_path(path); 1238 btrfs_release_path(path);
1032 } 1239 }
1033 btrfs_release_path(path); 1240 btrfs_release_path(path);
1241
1242 return nlink;
1243}
1244
1245/*
1246 * There are a few corners where the link count of the file can't
1247 * be properly maintained during replay. So, instead of adding
1248 * lots of complexity to the log code, we just scan the backrefs
1249 * for any file that has been through replay.
1250 *
1251 * The scan will update the link count on the inode to reflect the
1252 * number of back refs found. If it goes down to zero, the iput
1253 * will free the inode.
1254 */
1255static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1256 struct btrfs_root *root,
1257 struct inode *inode)
1258{
1259 struct btrfs_path *path;
1260 int ret;
1261 u64 nlink = 0;
1262 u64 ino = btrfs_ino(inode);
1263
1264 path = btrfs_alloc_path();
1265 if (!path)
1266 return -ENOMEM;
1267
1268 ret = count_inode_refs(root, inode, path);
1269 if (ret < 0)
1270 goto out;
1271
1272 nlink = ret;
1273
1274 ret = count_inode_extrefs(root, inode, path);
1275 if (ret == -ENOENT)
1276 ret = 0;
1277
1278 if (ret < 0)
1279 goto out;
1280
1281 nlink += ret;
1282
1283 ret = 0;
1284
1034 if (nlink != inode->i_nlink) { 1285 if (nlink != inode->i_nlink) {
1035 set_nlink(inode, nlink); 1286 set_nlink(inode, nlink);
1036 btrfs_update_inode(trans, root, inode); 1287 btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1046 ret = insert_orphan_item(trans, root, ino); 1297 ret = insert_orphan_item(trans, root, ino);
1047 BUG_ON(ret); 1298 BUG_ON(ret);
1048 } 1299 }
1049 btrfs_free_path(path);
1050 1300
1051 return 0; 1301out:
1302 btrfs_free_path(path);
1303 return ret;
1052} 1304}
1053 1305
1054static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1306static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
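
The link-count fixup path above is now split in two: count_inode_refs() scans the classic INODE_REF items, while count_inode_extrefs() walks the extended refs, where several names are packed back to back inside a single item and the cursor advances by the fixed header size plus the variable name length. A minimal userspace sketch of that packed walk, assuming a simplified stand-in layout rather than the real on-disk struct btrfs_inode_extref:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for btrfs_inode_extref, not the on-disk layout */
struct fake_extref {
        uint64_t parent;
        uint64_t index;
        uint16_t name_len;
        char     name[];
} __attribute__((packed));

static unsigned count_packed_refs(const unsigned char *item, uint32_t item_size)
{
        uint32_t cur = 0;
        unsigned nlink = 0;

        while (cur < item_size) {
                const struct fake_extref *ref =
                        (const struct fake_extref *)(item + cur);

                nlink++;                        /* one link per stored name */
                cur += sizeof(*ref) + ref->name_len;
        }
        return nlink;
}

int main(void)
{
        unsigned char item[64];
        struct fake_extref a = { 256, 2, 3 }, b = { 256, 3, 4 };
        size_t off = 0;

        memcpy(item + off, &a, sizeof(a)); off += sizeof(a);
        memcpy(item + off, "foo", 3);      off += 3;
        memcpy(item + off, &b, sizeof(b)); off += sizeof(b);
        memcpy(item + off, "foo2", 4);     off += 4;

        printf("links: %u\n", count_packed_refs(item, off)); /* links: 2 */
        return 0;
}
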
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1695 ret = add_inode_ref(wc->trans, root, log, path, 1947 ret = add_inode_ref(wc->trans, root, log, path,
1696 eb, i, &key); 1948 eb, i, &key);
1697 BUG_ON(ret && ret != -ENOENT); 1949 BUG_ON(ret && ret != -ENOENT);
1950 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
1951 ret = add_inode_ref(wc->trans, root, log, path,
1952 eb, i, &key);
1953 BUG_ON(ret && ret != -ENOENT);
1698 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1954 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1699 ret = replay_one_extent(wc->trans, root, path, 1955 ret = replay_one_extent(wc->trans, root, path,
1700 eb, i, &key); 1956 eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2037 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2293 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2038 wait_log_commit(trans, root, root->log_transid - 1); 2294 wait_log_commit(trans, root, root->log_transid - 1);
2039 while (1) { 2295 while (1) {
2040 unsigned long batch = root->log_batch; 2296 int batch = atomic_read(&root->log_batch);
2041 /* when we're on an ssd, just kick the log commit out */ 2297 /* when we're on an ssd, just kick the log commit out */
2042 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2298 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2043 mutex_unlock(&root->log_mutex); 2299 mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 mutex_lock(&root->log_mutex); 2301 mutex_lock(&root->log_mutex);
2046 } 2302 }
2047 wait_for_writer(trans, root); 2303 wait_for_writer(trans, root);
2048 if (batch == root->log_batch) 2304 if (batch == atomic_read(&root->log_batch))
2049 break; 2305 break;
2050 } 2306 }
2051 2307
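
The commit converts root->log_batch from a plain unsigned long to an atomic_t, so the writer-wait loop above becomes: snapshot the counter, wait for writers, and only stop once the counter is unchanged across the wait. A compressed userspace sketch of that retry shape using C11 atomics (single-threaded here, so the loop settles on the first pass; a concurrent writer would bump the counter and force another round):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int log_batch;

static void wait_for_writer(void)
{
        /* stand-in for the real wait; a concurrent writer would
         * increment log_batch while we slept */
}

static void wait_until_quiescent(void)
{
        for (;;) {
                int batch = atomic_load(&log_batch);

                wait_for_writer();
                if (batch == atomic_load(&log_batch))
                        break;          /* nobody joined while we waited */
        }
}

int main(void)
{
        atomic_store(&log_batch, 3);
        wait_until_quiescent();
        puts("log quiescent");
        return 0;
}
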
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2074 2330
2075 btrfs_set_root_node(&log->root_item, log->node); 2331 btrfs_set_root_node(&log->root_item, log->node);
2076 2332
2077 root->log_batch = 0;
2078 root->log_transid++; 2333 root->log_transid++;
2079 log->log_transid = root->log_transid; 2334 log->log_transid = root->log_transid;
2080 root->log_start_pid = 0; 2335 root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2087 mutex_unlock(&root->log_mutex); 2342 mutex_unlock(&root->log_mutex);
2088 2343
2089 mutex_lock(&log_root_tree->log_mutex); 2344 mutex_lock(&log_root_tree->log_mutex);
2090 log_root_tree->log_batch++; 2345 atomic_inc(&log_root_tree->log_batch);
2091 atomic_inc(&log_root_tree->log_writers); 2346 atomic_inc(&log_root_tree->log_writers);
2092 mutex_unlock(&log_root_tree->log_mutex); 2347 mutex_unlock(&log_root_tree->log_mutex);
2093 2348
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2157 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2412 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2158 btrfs_header_level(log_root_tree->node)); 2413 btrfs_header_level(log_root_tree->node));
2159 2414
2160 log_root_tree->log_batch = 0;
2161 log_root_tree->log_transid++; 2415 log_root_tree->log_transid++;
2162 smp_mb(); 2416 smp_mb();
2163 2417
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2171 * in and cause problems either. 2425 * in and cause problems either.
2172 */ 2426 */
2173 btrfs_scrub_pause_super(root); 2427 btrfs_scrub_pause_super(root);
2174 write_ctree_super(trans, root->fs_info->tree_root, 1); 2428 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2175 btrfs_scrub_continue_super(root); 2429 btrfs_scrub_continue_super(root);
2176 ret = 0; 2430 if (ret) {
2431 btrfs_abort_transaction(trans, root, ret);
2432 goto out_wake_log_root;
2433 }
2177 2434
2178 mutex_lock(&root->log_mutex); 2435 mutex_lock(&root->log_mutex);
2179 if (root->last_log_commit < log_transid) 2436 if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2209 2466
2210 while (1) { 2467 while (1) {
2211 ret = find_first_extent_bit(&log->dirty_log_pages, 2468 ret = find_first_extent_bit(&log->dirty_log_pages,
2212 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2469 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2470 NULL);
2213 if (ret) 2471 if (ret)
2214 break; 2472 break;
2215 2473
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2646 int ret; 2904 int ret;
2647 struct btrfs_key key; 2905 struct btrfs_key key;
2648 struct btrfs_key found_key; 2906 struct btrfs_key found_key;
2907 int start_slot;
2649 2908
2650 key.objectid = objectid; 2909 key.objectid = objectid;
2651 key.type = max_key_type; 2910 key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2667 if (found_key.objectid != objectid) 2926 if (found_key.objectid != objectid)
2668 break; 2927 break;
2669 2928
2670 ret = btrfs_del_item(trans, log, path); 2929 found_key.offset = 0;
2671 if (ret) 2930 found_key.type = 0;
2931 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
2932 &start_slot);
2933
2934 ret = btrfs_del_items(trans, log, path, start_slot,
2935 path->slots[0] - start_slot + 1);
2936 /*
2937 * If start slot isn't 0 then we don't need to re-search, we've
2938 * found the last guy with the objectid in this tree.
2939 */
2940 if (ret || start_slot != 0)
2672 break; 2941 break;
2673 btrfs_release_path(path); 2942 btrfs_release_path(path);
2674 } 2943 }
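
drop_objectid_items() now deletes a whole run of keys per leaf instead of one item per iteration: it re-seeds the key to (objectid, 0, 0), binary-searches for the first slot holding that objectid, and removes everything from start_slot through the current slot in one btrfs_del_items() call. The same idea on a plain sorted array, sketched in userspace C (lower_bound and the single memmove stand in for btrfs_bin_search and btrfs_del_items):

#include <stdio.h>
#include <string.h>

/* first index whose key is >= objectid (classic lower bound) */
static int lower_bound(const int *keys, int n, int objectid)
{
        int lo = 0, hi = n;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (keys[mid] < objectid)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return lo;
}

/* delete every entry with the given objectid in one splice */
static int drop_objectid_run(int *keys, int n, int objectid, int last_slot)
{
        int start_slot = lower_bound(keys, n, objectid);
        int nr = last_slot - start_slot + 1;

        memmove(&keys[start_slot], &keys[last_slot + 1],
                (n - last_slot - 1) * sizeof(*keys));
        return n - nr;
}

int main(void)
{
        int keys[] = { 1, 5, 5, 5, 9 };
        int n = 5;

        /* caller found the last slot for objectid 5 at index 3 */
        n = drop_objectid_run(keys, n, 5, 3);
        for (int i = 0; i < n; i++)
                printf("%d ", keys[i]);         /* prints: 1 9 */
        putchar('\n');
        return 0;
}
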
@@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2678 return ret; 2947 return ret;
2679} 2948}
2680 2949
2950static void fill_inode_item(struct btrfs_trans_handle *trans,
2951 struct extent_buffer *leaf,
2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only)
2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982
2983 if (log_inode_only) {
2984 /* set the generation to zero so the recover code
2985 * can tell the difference between a logging
2986 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values'
2988 */
2989 btrfs_set_inode_generation(leaf, item, 0);
2990 btrfs_set_inode_size(leaf, item, 0);
2991 } else {
2992 btrfs_set_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation);
2994 btrfs_set_inode_size(leaf, item, inode->i_size);
2995 }
2996
2997}
2998
2681static noinline int copy_items(struct btrfs_trans_handle *trans, 2999static noinline int copy_items(struct btrfs_trans_handle *trans,
2682 struct btrfs_root *log, 3000 struct inode *inode,
2683 struct btrfs_path *dst_path, 3001 struct btrfs_path *dst_path,
2684 struct extent_buffer *src, 3002 struct extent_buffer *src,
2685 int start_slot, int nr, int inode_only) 3003 int start_slot, int nr, int inode_only)
2686{ 3004{
2687 unsigned long src_offset; 3005 unsigned long src_offset;
2688 unsigned long dst_offset; 3006 unsigned long dst_offset;
3007 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
2689 struct btrfs_file_extent_item *extent; 3008 struct btrfs_file_extent_item *extent;
2690 struct btrfs_inode_item *inode_item; 3009 struct btrfs_inode_item *inode_item;
2691 int ret; 3010 int ret;
@@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2694 char *ins_data; 3013 char *ins_data;
2695 int i; 3014 int i;
2696 struct list_head ordered_sums; 3015 struct list_head ordered_sums;
3016 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2697 3017
2698 INIT_LIST_HEAD(&ordered_sums); 3018 INIT_LIST_HEAD(&ordered_sums);
2699 3019
@@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2722 3042
2723 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3043 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2724 3044
2725 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3045 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2726 src_offset, ins_sizes[i]);
2727
2728 if (inode_only == LOG_INODE_EXISTS &&
2729 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2730 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3046 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2731 dst_path->slots[0], 3047 dst_path->slots[0],
2732 struct btrfs_inode_item); 3048 struct btrfs_inode_item);
2733 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 3049 fill_inode_item(trans, dst_path->nodes[0], inode_item,
2734 3050 inode, inode_only == LOG_INODE_EXISTS);
2735 /* set the generation to zero so the recover code 3051 } else {
2736 * can tell the difference between a logging 3052 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2737 * just to say 'this inode exists' and a logging 3053 src_offset, ins_sizes[i]);
2738 * to say 'update this inode with these values'
2739 */
2740 btrfs_set_inode_generation(dst_path->nodes[0],
2741 inode_item, 0);
2742 } 3054 }
3055
2743 /* take a reference on file data extents so that truncates 3056 /* take a reference on file data extents so that truncates
2744 * or deletes of this inode don't have to relog the inode 3057 * or deletes of this inode don't have to relog the inode
2745 * again 3058 * again
2746 */ 3059 */
2747 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 3060 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3061 !skip_csum) {
2748 int found_type; 3062 int found_type;
2749 extent = btrfs_item_ptr(src, start_slot + i, 3063 extent = btrfs_item_ptr(src, start_slot + i,
2750 struct btrfs_file_extent_item); 3064 struct btrfs_file_extent_item);
@@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2753 continue; 3067 continue;
2754 3068
2755 found_type = btrfs_file_extent_type(src, extent); 3069 found_type = btrfs_file_extent_type(src, extent);
2756 if (found_type == BTRFS_FILE_EXTENT_REG || 3070 if (found_type == BTRFS_FILE_EXTENT_REG) {
2757 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2758 u64 ds, dl, cs, cl; 3071 u64 ds, dl, cs, cl;
2759 ds = btrfs_file_extent_disk_bytenr(src, 3072 ds = btrfs_file_extent_disk_bytenr(src,
2760 extent); 3073 extent);
@@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 3116 return ret;
2804} 3117}
2805 3118
3119static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3120{
3121 struct extent_map *em1, *em2;
3122
3123 em1 = list_entry(a, struct extent_map, list);
3124 em2 = list_entry(b, struct extent_map, list);
3125
3126 if (em1->start < em2->start)
3127 return -1;
3128 else if (em1->start > em2->start)
3129 return 1;
3130 return 0;
3131}
3132
3133struct log_args {
3134 struct extent_buffer *src;
3135 u64 next_offset;
3136 int start_slot;
3137 int nr;
3138};
3139
3140static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path,
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{
3145 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi;
3147 struct btrfs_key key;
3148 u64 start = em->mod_start;
3149 u64 search_start = start;
3150 u64 len = em->mod_len;
3151 u64 num_bytes;
3152 int nritems;
3153 int ret;
3154
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) {
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
3157 start + len, NULL, 0);
3158 if (ret)
3159 return ret;
3160 }
3161
3162 while (len) {
3163 if (args->nr)
3164 goto next_slot;
3165again:
3166 key.objectid = btrfs_ino(inode);
3167 key.type = BTRFS_EXTENT_DATA_KEY;
3168 key.offset = search_start;
3169
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3171 if (ret < 0)
3172 return ret;
3173
3174 if (ret) {
3175 /*
3176 * A rare case where we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3203 struct btrfs_file_extent_item);
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250
3251 if (path->slots[0] < nritems) {
3252 if (len)
3253 goto next_slot;
3254 break;
3255 }
3256
3257 if (args->nr) {
3258 ret = copy_items(trans, inode, dst_path, args->src,
3259 args->start_slot, args->nr,
3260 LOG_INODE_ALL);
3261 if (ret)
3262 return ret;
3263 args->nr = 0;
3264 btrfs_release_path(path);
3265 }
3266 }
3267
3268 return 0;
3269}
3270
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root,
3273 struct inode *inode,
3274 struct btrfs_path *path,
3275 struct btrfs_path *dst_path)
3276{
3277 struct log_args args;
3278 struct extent_map *em, *n;
3279 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3281 u64 test_gen;
3282 int ret = 0;
3283
3284 INIT_LIST_HEAD(&extents);
3285
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed;
3290
3291 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3292 list_del_init(&em->list);
3293 if (em->generation <= test_gen)
3294 continue;
3295 /* Need a ref to keep it from getting evicted from cache */
3296 atomic_inc(&em->refs);
3297 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3298 list_add_tail(&em->list, &extents);
3299 }
3300
3301 list_sort(NULL, &extents, extent_cmp);
3302
3303 while (!list_empty(&extents)) {
3304 em = list_entry(extents.next, struct extent_map, list);
3305
3306 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308
3309 /*
3310 * If we had an error we just need to delete everybody from our
3311 * private list.
3312 */
3313 if (ret) {
3314 free_extent_map(em);
3315 continue;
3316 }
3317
3318 write_unlock(&tree->lock);
3319
3320 /*
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock);
3341 }
3342 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock);
3344
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path);
3349 return ret;
3350}
3351
2806/* log a single inode in the tree log. 3352/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 3353 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 3354 * or be logged already.
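
btrfs_log_changed_extents() above collects the modified extent maps onto a private list and orders them by file offset with list_sort() and the three-way extent_cmp(), so log_one_extent() can walk the file forward. The comparator translated to a qsort() callback over plain structs, just to show the ordering contract (return negative, zero, or positive on em1->start versus em2->start); note qsort() is not stable where list_sort() is, which is moot here since distinct extents have distinct starts:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct em { uint64_t start; uint64_t len; };

static int extent_cmp(const void *a, const void *b)
{
        const struct em *em1 = a, *em2 = b;

        if (em1->start < em2->start)
                return -1;
        if (em1->start > em2->start)
                return 1;
        return 0;
}

int main(void)
{
        struct em ems[] = { { 8192, 4096 }, { 0, 4096 }, { 4096, 4096 } };
        size_t n = sizeof(ems) / sizeof(ems[0]);

        qsort(ems, n, sizeof(ems[0]), extent_cmp);
        for (size_t i = 0; i < n; i++)
                printf("extent @%llu len %llu\n",
                       (unsigned long long)ems[i].start,
                       (unsigned long long)ems[i].len);
        return 0;
}
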
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3378 int nritems;
2833 int ins_start_slot = 0; 3379 int ins_start_slot = 0;
2834 int ins_nr; 3380 int ins_nr;
3381 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3382 u64 ino = btrfs_ino(inode);
2836 3383
2837 log = root->log_root; 3384 log = root->log_root;
@@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3398
2852 max_key.objectid = ino; 3399 max_key.objectid = ino;
2853 3400
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3401
3402 /* today the code can only do partial logging of directories */
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3404 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3405 else
2861 max_key.type = (u8)-1; 3406 max_key.type = (u8)-1;
2862 max_key.offset = (u64)-1; 3407 max_key.offset = (u64)-1;
2863 3408
2864 ret = btrfs_commit_inode_delayed_items(trans, inode); 3409 /* Only run delayed items if we are a dir or a new file */
2865 if (ret) { 3410 if (S_ISDIR(inode->i_mode) ||
2866 btrfs_free_path(path); 3411 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
2867 btrfs_free_path(dst_path); 3412 ret = btrfs_commit_inode_delayed_items(trans, inode);
2868 return ret; 3413 if (ret) {
3414 btrfs_free_path(path);
3415 btrfs_free_path(dst_path);
3416 return ret;
3417 }
2869 } 3418 }
2870 3419
2871 mutex_lock(&BTRFS_I(inode)->log_mutex); 3420 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3430 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3431 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3432 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) {
3435 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0);
3437 } else {
3438 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY);
3442 }
2885 } 3443 }
2886 if (ret) { 3444 if (ret) {
2887 err = ret; 3445 err = ret;
@@ -2912,7 +3470,7 @@ again:
2912 goto next_slot; 3470 goto next_slot;
2913 } 3471 }
2914 3472
2915 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 3473 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2916 ins_nr, inode_only); 3474 ins_nr, inode_only);
2917 if (ret) { 3475 if (ret) {
2918 err = ret; 3476 err = ret;
@@ -2930,7 +3488,7 @@ next_slot:
2930 goto again; 3488 goto again;
2931 } 3489 }
2932 if (ins_nr) { 3490 if (ins_nr) {
2933 ret = copy_items(trans, log, dst_path, src, 3491 ret = copy_items(trans, inode, dst_path, src,
2934 ins_start_slot, 3492 ins_start_slot,
2935 ins_nr, inode_only); 3493 ins_nr, inode_only);
2936 if (ret) { 3494 if (ret) {
@@ -2951,8 +3509,7 @@ next_slot:
2951 break; 3509 break;
2952 } 3510 }
2953 if (ins_nr) { 3511 if (ins_nr) {
2954 ret = copy_items(trans, log, dst_path, src, 3512 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2955 ins_start_slot,
2956 ins_nr, inode_only); 3513 ins_nr, inode_only);
2957 if (ret) { 3514 if (ret) {
2958 err = ret; 3515 err = ret;
@@ -2960,7 +3517,24 @@ next_slot:
2960 } 3517 }
2961 ins_nr = 0; 3518 ins_nr = 0;
2962 } 3519 }
2963 WARN_ON(ins_nr); 3520
3521 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path,
3525 dst_path);
3526 if (ret) {
3527 err = ret;
3528 goto out_unlock;
3529 }
3530 } else {
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n;
3533
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list);
3536 }
3537
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3539 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3540 btrfs_release_path(dst_path);
@@ -2971,6 +3545,7 @@ next_slot:
2971 } 3545 }
2972 } 3546 }
2973 BTRFS_I(inode)->logged_trans = trans->transid; 3547 BTRFS_I(inode)->logged_trans = trans->transid;
3548 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
2974out_unlock: 3549out_unlock:
2975 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3550 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2976 3551
@@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3138end_trans: 3713end_trans:
3139 dput(old_parent); 3714 dput(old_parent);
3140 if (ret < 0) { 3715 if (ret < 0) {
3141 BUG_ON(ret != -ENOSPC); 3716 WARN_ON(ret != -ENOSPC);
3142 root->fs_info->last_trans_log_full_commit = trans->transid; 3717 root->fs_info->last_trans_log_full_commit = trans->transid;
3143 ret = 1; 3718 ret = 1;
3144 } 3719 }
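
The tail of the tree-log.c changes wires up the fast-path choice in btrfs_log_inode(): when BTRFS_INODE_NEEDS_FULL_SYNC is set the log is truncated and everything is recopied, otherwise only the xattr key range is dropped and just the changed extents are logged. The gate is a test-and-clear on a runtime flag bit; roughly this shape, with the GCC/Clang __atomic builtins standing in for the kernel's test_and_clear_bit():

#include <stdio.h>

#define NEEDS_FULL_SYNC_BIT 0

static unsigned long runtime_flags = 1UL << NEEDS_FULL_SYNC_BIT;

/* userspace stand-in for test_and_clear_bit(): returns the old bit
 * value and clears it atomically */
static int test_and_clear_bit_ul(int nr, unsigned long *addr)
{
        unsigned long mask = 1UL << nr;

        return (__atomic_fetch_and(addr, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

int main(void)
{
        if (test_and_clear_bit_ul(NEEDS_FULL_SYNC_BIT, &runtime_flags))
                puts("full sync: truncate the log and copy everything");

        /* a second fsync of the same inode takes the fast path */
        if (!test_and_clear_bit_ul(NEEDS_FULL_SYNC_BIT, &runtime_flags))
                puts("fast path: drop xattr range, log changed extents");
        return 0;
}
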
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3dd..99be4c138db6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
143 * In case of allocation failure -ENOMEM is returned and the ulist stays 143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
147 gfp_t gfp_mask)
148{ 147{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); 148 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150} 149}
151 150
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 151int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
153 unsigned long *old_aux, gfp_t gfp_mask) 152 u64 *old_aux, gfp_t gfp_mask)
154{ 153{
155 int i; 154 int i;
156 155
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec8130..21a1963439c3 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
33 */ 33 */
34struct ulist_node { 34struct ulist_node {
35 u64 val; /* value to store */ 35 u64 val; /* value to store */
36 unsigned long aux; /* auxiliary value saved along with the val */ 36 u64 aux; /* auxiliary value saved along with the val */
37}; 37};
38 38
39struct ulist { 39struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
65void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
66struct ulist *ulist_alloc(gfp_t gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
67void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
69 gfp_t gfp_mask); 69int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 70 u64 *old_aux, gfp_t gfp_mask);
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist, 71struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter); 72 struct ulist_iterator *uiter);
74 73
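
The ulist change widens aux from unsigned long to u64 so callers can stash a full 64-bit value regardless of word size; on 32-bit kernels unsigned long is only 32 bits, so a u64 payload would have been silently truncated. A two-line illustration of the difference, with uint64_t and unsigned long standing in for u64 and the old type:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t aux = 0x123456789abcdef0ULL;       /* always fits in u64 */
        unsigned long narrow = (unsigned long)aux;  /* truncates on ILP32 */

        printf("sizeof(unsigned long)=%zu sizeof(uint64_t)=%zu\n",
               sizeof(unsigned long), sizeof(uint64_t));
        printf("aux=%llx narrow=%lx\n", (unsigned long long)aux, narrow);
        return 0;
}
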
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb71..029b903a4ae3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
639 639
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 640 bdev = blkdev_get_by_path(device->name->str, flags, holder);
641 if (IS_ERR(bdev)) { 641 if (IS_ERR(bdev)) {
642 printk(KERN_INFO "open %s failed\n", device->name->str); 642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
643 goto error; 643 goto error;
644 } 644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping); 645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1475 free_fs_devices(cur_devices); 1475 free_fs_devices(cur_devices);
1476 } 1476 }
1477 1477
1478 root->fs_info->num_tolerated_disk_barrier_failures =
1479 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1480
1478 /* 1481 /*
1479 * at this point, the device is zero sized. We want to 1482 * at this point, the device is zero sized. We want to
1480 * remove it from the devices list and zero out the old super 1483 * remove it from the devices list and zero out the old super
@@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1775 1778
1776 if (seeding_dev) { 1779 if (seeding_dev) {
1777 ret = init_first_rw_device(trans, root, device); 1780 ret = init_first_rw_device(trans, root, device);
1778 if (ret) 1781 if (ret) {
1782 btrfs_abort_transaction(trans, root, ret);
1779 goto error_trans; 1783 goto error_trans;
1784 }
1780 ret = btrfs_finish_sprout(trans, root); 1785 ret = btrfs_finish_sprout(trans, root);
1781 if (ret) 1786 if (ret) {
1787 btrfs_abort_transaction(trans, root, ret);
1782 goto error_trans; 1788 goto error_trans;
1789 }
1783 } else { 1790 } else {
1784 ret = btrfs_add_device(trans, root, device); 1791 ret = btrfs_add_device(trans, root, device);
1785 if (ret) 1792 if (ret) {
1793 btrfs_abort_transaction(trans, root, ret);
1786 goto error_trans; 1794 goto error_trans;
1795 }
1787 } 1796 }
1788 1797
1789 /* 1798 /*
@@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1793 btrfs_clear_space_info_full(root->fs_info); 1802 btrfs_clear_space_info_full(root->fs_info);
1794 1803
1795 unlock_chunks(root); 1804 unlock_chunks(root);
1805 root->fs_info->num_tolerated_disk_barrier_failures =
1806 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1796 ret = btrfs_commit_transaction(trans, root); 1807 ret = btrfs_commit_transaction(trans, root);
1797 1808
1798 if (seeding_dev) { 1809 if (seeding_dev) {
@@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1814 1825
1815error_trans: 1826error_trans:
1816 unlock_chunks(root); 1827 unlock_chunks(root);
1817 btrfs_abort_transaction(trans, root, ret);
1818 btrfs_end_transaction(trans, root); 1828 btrfs_end_transaction(trans, root);
1819 rcu_string_free(device->name); 1829 rcu_string_free(device->name);
1820 kfree(device); 1830 kfree(device);
@@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2804 } 2814 }
2805 } 2815 }
2806 2816
2817 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2818 int num_tolerated_disk_barrier_failures;
2819 u64 target = bctl->sys.target;
2820
2821 num_tolerated_disk_barrier_failures =
2822 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2823 if (num_tolerated_disk_barrier_failures > 0 &&
2824 (target &
2825 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
2826 BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
2827 num_tolerated_disk_barrier_failures = 0;
2828 else if (num_tolerated_disk_barrier_failures > 1 &&
2829 (target &
2830 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
2831 num_tolerated_disk_barrier_failures = 1;
2832
2833 fs_info->num_tolerated_disk_barrier_failures =
2834 num_tolerated_disk_barrier_failures;
2835 }
2836
2807 ret = insert_balance_item(fs_info->tree_root, bctl); 2837 ret = insert_balance_item(fs_info->tree_root, bctl);
2808 if (ret && ret != -EEXIST) 2838 if (ret && ret != -EEXIST)
2809 goto out; 2839 goto out;
@@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2836 __cancel_balance(fs_info); 2866 __cancel_balance(fs_info);
2837 } 2867 }
2838 2868
2869 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2870 fs_info->num_tolerated_disk_barrier_failures =
2871 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2872 }
2873
2839 wake_up(&fs_info->balance_wait_q); 2874 wake_up(&fs_info->balance_wait_q);
2840 2875
2841 return ret; 2876 return ret;
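
The balance hunk above recomputes how many device write failures the filesystem may tolerate when a convert target changes the system-chunk profile: DUP, RAID0, or single can survive none, RAID1/RAID10 at most one. A sketch of that clamp with stand-in profile bits (the BG_* values here are illustrative, not the real BTRFS_BLOCK_GROUP_* flags):

#include <stdio.h>
#include <stdint.h>

/* illustrative bit values, not the real BTRFS_BLOCK_GROUP_* constants */
#define BG_RAID0   (1ULL << 0)
#define BG_RAID1   (1ULL << 1)
#define BG_DUP     (1ULL << 2)
#define BG_RAID10  (1ULL << 3)
#define BG_SINGLE  (1ULL << 4)

static int clamp_tolerated_failures(int current, uint64_t target)
{
        if (current > 0 && (target & (BG_DUP | BG_RAID0 | BG_SINGLE)))
                return 0;       /* these profiles survive no lost device */
        if (current > 1 && (target & (BG_RAID1 | BG_RAID10)))
                return 1;       /* mirrored profiles survive one */
        return current;
}

int main(void)
{
        printf("convert to RAID0: tolerate %d\n",
               clamp_tolerated_failures(2, BG_RAID0));   /* 0 */
        printf("convert to RAID10: tolerate %d\n",
               clamp_tolerated_failures(2, BG_RAID10));  /* 1 */
        return 0;
}
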
@@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3608 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3643 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3609 &sys_chunk_size, &sys_stripe_size, 3644 &sys_chunk_size, &sys_stripe_size,
3610 sys_chunk_offset, alloc_profile); 3645 sys_chunk_offset, alloc_profile);
3611 if (ret) 3646 if (ret) {
3612 goto abort; 3647 btrfs_abort_transaction(trans, root, ret);
3648 goto out;
3649 }
3613 3650
3614 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3651 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3615 if (ret) 3652 if (ret) {
3616 goto abort; 3653 btrfs_abort_transaction(trans, root, ret);
3654 goto out;
3655 }
3617 3656
3618 /* 3657 /*
3619 * Modifying chunk tree needs allocating new blocks from both 3658 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3623 */ 3662 */
3624 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3663 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3625 chunk_size, stripe_size); 3664 chunk_size, stripe_size);
3626 if (ret) 3665 if (ret) {
3627 goto abort; 3666 btrfs_abort_transaction(trans, root, ret);
3667 goto out;
3668 }
3628 3669
3629 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3670 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3630 sys_chunk_offset, sys_chunk_size, 3671 sys_chunk_offset, sys_chunk_size,
3631 sys_stripe_size); 3672 sys_stripe_size);
3632 if (ret) 3673 if (ret)
3633 goto abort; 3674 btrfs_abort_transaction(trans, root, ret);
3634 3675
3635 return 0; 3676out:
3636 3677
3637abort:
3638 btrfs_abort_transaction(trans, root, ret);
3639 return ret; 3678 return ret;
3640} 3679}
3641 3680
@@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3760 read_unlock(&em_tree->lock); 3799 read_unlock(&em_tree->lock);
3761 3800
3762 if (!em) { 3801 if (!em) {
3763 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3802 printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
3764 (unsigned long long)logical, 3803 (unsigned long long)logical,
3765 (unsigned long long)*length); 3804 (unsigned long long)*length);
3766 BUG(); 3805 BUG();
@@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4217 4256
4218 total_devs = bbio->num_stripes; 4257 total_devs = bbio->num_stripes;
4219 if (map_length < length) { 4258 if (map_length < length) {
4220 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 4259 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4221 "len %llu\n", (unsigned long long)logical, 4260 "len %llu\n", (unsigned long long)logical,
4222 (unsigned long long)length, 4261 (unsigned long long)length,
4223 (unsigned long long)map_length); 4262 (unsigned long long)map_length);
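
Several volumes.c hunks replace a shared "abort" label with btrfs_abort_transaction() at each failing call site, so the resulting log line points at the actual culprit. A sketch of that error-handling shape in plain C, with a hypothetical abort_ctx() macro standing in for the kernel helper:

#include <stdio.h>

/* hypothetical stand-in for btrfs_abort_transaction(): record where
 * the failure happened, not just that it happened */
#define abort_ctx(ret) \
        fprintf(stderr, "abort at %s:%d (ret=%d)\n", __func__, __LINE__, ret)

static int step(int fail) { return fail ? -5 : 0; }

static int do_init(void)
{
        int ret;

        ret = step(0);
        if (ret) { abort_ctx(ret); goto out; }
        ret = step(1);                          /* this one fails */
        if (ret) { abort_ctx(ret); goto out; }
out:
        return ret;
}

int main(void) { return do_init() ? 1 : 0; }
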
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc55..9acb846c3e7f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "btrfs: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "btrfs: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "btrfs: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8e1b60e557b6..02ce90972d81 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -99,7 +99,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
99 * FIXME: we should try harder by querying the mds for the ino. 99 * FIXME: we should try harder by querying the mds for the ino.
100 */ 100 */
101static struct dentry *__fh_to_dentry(struct super_block *sb, 101static struct dentry *__fh_to_dentry(struct super_block *sb,
102 struct ceph_nfs_fh *fh) 102 struct ceph_nfs_fh *fh, int fh_len)
103{ 103{
104 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 104 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
105 struct inode *inode; 105 struct inode *inode;
@@ -107,6 +107,9 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
107 struct ceph_vino vino; 107 struct ceph_vino vino;
108 int err; 108 int err;
109 109
110 if (fh_len < sizeof(*fh) / 4)
111 return ERR_PTR(-ESTALE);
112
110 dout("__fh_to_dentry %llx\n", fh->ino); 113 dout("__fh_to_dentry %llx\n", fh->ino);
111 vino.ino = fh->ino; 114 vino.ino = fh->ino;
112 vino.snap = CEPH_NOSNAP; 115 vino.snap = CEPH_NOSNAP;
@@ -150,7 +153,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
150 * convert connectable fh to dentry 153 * convert connectable fh to dentry
151 */ 154 */
152static struct dentry *__cfh_to_dentry(struct super_block *sb, 155static struct dentry *__cfh_to_dentry(struct super_block *sb,
153 struct ceph_nfs_confh *cfh) 156 struct ceph_nfs_confh *cfh, int fh_len)
154{ 157{
155 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 158 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
156 struct inode *inode; 159 struct inode *inode;
@@ -158,6 +161,9 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
158 struct ceph_vino vino; 161 struct ceph_vino vino;
159 int err; 162 int err;
160 163
164 if (fh_len < sizeof(*cfh) / 4)
165 return ERR_PTR(-ESTALE);
166
161 dout("__cfh_to_dentry %llx (%llx/%x)\n", 167 dout("__cfh_to_dentry %llx (%llx/%x)\n",
162 cfh->ino, cfh->parent_ino, cfh->parent_name_hash); 168 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
163 169
@@ -207,9 +213,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
207 int fh_len, int fh_type) 213 int fh_len, int fh_type)
208{ 214{
209 if (fh_type == 1) 215 if (fh_type == 1)
210 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); 216 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
217 fh_len);
211 else 218 else
212 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); 219 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
220 fh_len);
213} 221}
214 222
215/* 223/*
@@ -230,6 +238,8 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
230 238
231 if (fh_type == 1) 239 if (fh_type == 1)
232 return ERR_PTR(-ESTALE); 240 return ERR_PTR(-ESTALE);
241 if (fh_len < sizeof(*cfh) / 4)
242 return ERR_PTR(-ESTALE);
233 243
234 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 244 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
235 cfh->parent_name_hash); 245 cfh->parent_name_hash);
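
The ceph export fix adds a length guard before each handle decode: the NFS layer hands fh_len in 4-byte words, so a handle is only usable if fh_len >= sizeof(expected struct) / 4, and short handles now get -ESTALE instead of being read past their end. The unit check in isolation, with a stand-in ceph_nfs_fh of the same shape:

#include <stdio.h>
#include <stdint.h>

struct ceph_nfs_fh { uint64_t ino; };           /* stand-in layout */

/* fh_len arrives in 4-byte words, not bytes */
static int fh_big_enough(int fh_len, size_t need_bytes)
{
        return fh_len >= (int)(need_bytes / 4);
}

int main(void)
{
        /* an 8-byte handle needs fh_len >= 2 words */
        printf("fh_len=1 ok? %d\n",
               fh_big_enough(1, sizeof(struct ceph_nfs_fh)));  /* 0 */
        printf("fh_len=2 ok? %d\n",
               fh_big_enough(2, sizeof(struct ceph_nfs_fh)));  /* 1 */
        return 0;
}
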
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index e622863b292f..086f381d6489 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -31,18 +31,18 @@
31 31
32/* create a new cifs key */ 32/* create a new cifs key */
33static int 33static int
34cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) 34cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
35{ 35{
36 char *payload; 36 char *payload;
37 int ret; 37 int ret;
38 38
39 ret = -ENOMEM; 39 ret = -ENOMEM;
40 payload = kmalloc(datalen, GFP_KERNEL); 40 payload = kmalloc(prep->datalen, GFP_KERNEL);
41 if (!payload) 41 if (!payload)
42 goto error; 42 goto error;
43 43
44 /* attach the data */ 44 /* attach the data */
45 memcpy(payload, data, datalen); 45 memcpy(payload, prep->data, prep->datalen);
46 key->payload.data = payload; 46 key->payload.data = payload;
47 ret = 0; 47 ret = 0;
48 48
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 53cf2aabce87..71d5d0a5f6b2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -203,6 +203,27 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
203 int i; 203 int i;
204 wchar_t wchar_to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
205 205
206 /* special case for utf8 to handle no plane0 chars */
207 if (!strcmp(codepage->charset, "utf8")) {
208 /*
 209 * convert utf8 -> utf16; we assume we have enough space,
 210 * as the caller should have ensured the conversion cannot
 211 * overflow the destination. len is in wchar_t units (16 bits).
212 */
213 i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
214 (wchar_t *) to, len);
215
 216 /* on success, terminate and exit */
217 if (i >= 0)
218 goto success;
219 /*
 220 * if it fails, fall back to UCS encoding, as this
 221 * function should not return negative values;
 222 * currently it can fail only if the source contains
 223 * invalidly encoded characters
224 */
225 }
226
206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 227 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
207 charlen = codepage->char2uni(from, len, &wchar_to); 228 charlen = codepage->char2uni(from, len, &wchar_to);
208 if (charlen < 1) { 229 if (charlen < 1) {
@@ -215,6 +236,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
215 put_unaligned_le16(wchar_to, &to[i]); 236 put_unaligned_le16(wchar_to, &to[i]);
216 } 237 }
217 238
239success:
218 put_unaligned_le16(0, &to[i]); 240 put_unaligned_le16(0, &to[i]);
219 return i; 241 return i;
220} 242}
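
So cifs_strtoUTF16() gains a fast path: when the local codepage is utf8 it converts the whole buffer in one utf8s_to_utf16s() call and drops back to the per-character codepage loop only if that returns an error. The control flow reduced to a toy, with stub converters standing in for utf8s_to_utf16s() and char2uni():

#include <stdio.h>

/* stub bulk converter: fails (like utf8s_to_utf16s() can) on a sentinel */
static int bulk_convert(const char *src, unsigned short *dst, int len)
{
        for (int i = 0; i < len; i++) {
                if ((unsigned char)src[i] == 0xff)
                        return -1;      /* invalid sequence */
                dst[i] = (unsigned char)src[i];
        }
        return len;
}

/* stub per-character fallback that never fails */
static int slow_convert(const char *src, unsigned short *dst, int len)
{
        for (int i = 0; i < len; i++)
                dst[i] = (src[i] == (char)0xff) ? '?' : (unsigned char)src[i];
        return len;
}

static int to_utf16(const char *src, unsigned short *dst, int len,
                    int codepage_is_utf8)
{
        int n;

        if (codepage_is_utf8) {
                n = bulk_convert(src, dst, len);
                if (n >= 0)
                        goto success;   /* fast path worked */
                /* else fall through to the per-character loop */
        }
        n = slow_convert(src, dst, len);
success:
        dst[n] = 0;                     /* always NUL-terminate */
        return n;
}

int main(void)
{
        unsigned short out[16];
        char bad[] = { 'h', 'i', (char)0xff, 0 };

        printf("converted %d units\n", to_utf16("hi", out, 2, 1));
        printf("converted %d units (fell back)\n", to_utf16(bad, out, 3, 1));
        return 0;
}
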
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 2ee5c54797fa..fc783e264420 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -167,17 +167,17 @@ static struct shrinker cifs_shrinker = {
167}; 167};
168 168
169static int 169static int
170cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) 170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 171{
172 char *payload; 172 char *payload;
173 173
174 payload = kmalloc(datalen, GFP_KERNEL); 174 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 175 if (!payload)
176 return -ENOMEM; 176 return -ENOMEM;
177 177
178 memcpy(payload, data, datalen); 178 memcpy(payload, prep->data, prep->datalen);
179 key->payload.data = payload; 179 key->payload.data = payload;
180 key->datalen = datalen; 180 key->datalen = prep->datalen;
181 return 0; 181 return 0;
182} 182}
183 183
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2fdbe08a7a23..5c670b998ffb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -67,6 +67,7 @@ enum {
67 /* Mount options that take no arguments */ 67 /* Mount options that take no arguments */
68 Opt_user_xattr, Opt_nouser_xattr, 68 Opt_user_xattr, Opt_nouser_xattr,
69 Opt_forceuid, Opt_noforceuid, 69 Opt_forceuid, Opt_noforceuid,
70 Opt_forcegid, Opt_noforcegid,
70 Opt_noblocksend, Opt_noautotune, 71 Opt_noblocksend, Opt_noautotune,
71 Opt_hard, Opt_soft, Opt_perm, Opt_noperm, 72 Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
72 Opt_mapchars, Opt_nomapchars, Opt_sfu, 73 Opt_mapchars, Opt_nomapchars, Opt_sfu,
@@ -117,6 +118,8 @@ static const match_table_t cifs_mount_option_tokens = {
117 { Opt_nouser_xattr, "nouser_xattr" }, 118 { Opt_nouser_xattr, "nouser_xattr" },
118 { Opt_forceuid, "forceuid" }, 119 { Opt_forceuid, "forceuid" },
119 { Opt_noforceuid, "noforceuid" }, 120 { Opt_noforceuid, "noforceuid" },
121 { Opt_forcegid, "forcegid" },
122 { Opt_noforcegid, "noforcegid" },
120 { Opt_noblocksend, "noblocksend" }, 123 { Opt_noblocksend, "noblocksend" },
121 { Opt_noautotune, "noautotune" }, 124 { Opt_noautotune, "noautotune" },
122 { Opt_hard, "hard" }, 125 { Opt_hard, "hard" },
@@ -1195,6 +1198,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1195 case Opt_noforceuid: 1198 case Opt_noforceuid:
1196 override_uid = 0; 1199 override_uid = 0;
1197 break; 1200 break;
1201 case Opt_forcegid:
1202 override_gid = 1;
1203 break;
1204 case Opt_noforcegid:
1205 override_gid = 0;
1206 break;
1198 case Opt_noblocksend: 1207 case Opt_noblocksend:
1199 vol->noblocksnd = 1; 1208 vol->noblocksnd = 1;
1200 break; 1209 break;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 2126ab185045..76d974c952fe 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -183,6 +183,12 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
183 rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], 183 rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
184 n_vec - first_vec, remaining); 184 n_vec - first_vec, remaining);
185 if (rc == -ENOSPC || rc == -EAGAIN) { 185 if (rc == -ENOSPC || rc == -EAGAIN) {
186 /*
187 * Catch if a low level driver returns -ENOSPC. This
188 * WARN_ON will be removed by 3.10 if no one reports
189 * seeing this.
190 */
191 WARN_ON_ONCE(rc == -ENOSPC);
186 i++; 192 i++;
187 if (i >= 14 || (!server->noblocksnd && (i > 2))) { 193 if (i >= 14 || (!server->noblocksnd && (i > 2))) {
188 cERROR(1, "sends on sock %p stuck for 15 " 194 cERROR(1, "sends on sock %p stuck for 15 "
diff --git a/fs/compat.c b/fs/compat.c
index b7a24d0ca30d..015e1e1f87c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -776,16 +776,16 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
776 char *kernel_type; 776 char *kernel_type;
777 unsigned long data_page; 777 unsigned long data_page;
778 char *kernel_dev; 778 char *kernel_dev;
779 char *dir_page; 779 struct filename *dir;
780 int retval; 780 int retval;
781 781
782 retval = copy_mount_string(type, &kernel_type); 782 retval = copy_mount_string(type, &kernel_type);
783 if (retval < 0) 783 if (retval < 0)
784 goto out; 784 goto out;
785 785
786 dir_page = getname(dir_name); 786 dir = getname(dir_name);
787 retval = PTR_ERR(dir_page); 787 retval = PTR_ERR(dir);
788 if (IS_ERR(dir_page)) 788 if (IS_ERR(dir))
789 goto out1; 789 goto out1;
790 790
791 retval = copy_mount_string(dev_name, &kernel_dev); 791 retval = copy_mount_string(dev_name, &kernel_dev);
@@ -807,7 +807,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
807 } 807 }
808 } 808 }
809 809
810 retval = do_mount(kernel_dev, dir_page, kernel_type, 810 retval = do_mount(kernel_dev, dir->name, kernel_type,
811 flags, (void*)data_page); 811 flags, (void*)data_page);
812 812
813 out4: 813 out4:
@@ -815,7 +815,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
815 out3: 815 out3:
816 kfree(kernel_dev); 816 kfree(kernel_dev);
817 out2: 817 out2:
818 putname(dir_page); 818 putname(dir);
819 out1: 819 out1:
820 kfree(kernel_type); 820 kfree(kernel_type);
821 out: 821 out:
diff --git a/fs/coredump.c b/fs/coredump.c
index fd37facac8dc..ce47379bfa61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -450,11 +450,12 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
450 450
451 cp->file = files[1]; 451 cp->file = files[1];
452 452
453 replace_fd(0, files[0], 0); 453 err = replace_fd(0, files[0], 0);
454 fput(files[0]);
454 /* and disallow core files too */ 455 /* and disallow core files too */
455 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; 456 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
456 457
457 return 0; 458 return err;
458} 459}
459 460
460void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) 461void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
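
Two fixes in one hunk: the return value of replace_fd() is no longer ignored, and the local reference on files[0] is dropped with fput(), because replace_fd() takes its own reference to the file it installs in the descriptor table. A sketch of the calling convention, assuming the 3.7-era replace_fd():

err = replace_fd(0, files[0], 0);	/* stdin now points at the pipe */
fput(files[0]);				/* drop our reference regardless:
					 * replace_fd() holds its own */
if (err < 0)
	return err;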
diff --git a/fs/exec.c b/fs/exec.c
index 4f2bebc276c5..8b9011b67041 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,7 +59,6 @@
59#include <asm/uaccess.h> 59#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61#include <asm/tlb.h> 61#include <asm/tlb.h>
62#include <asm/exec.h>
63 62
64#include <trace/events/task.h> 63#include <trace/events/task.h>
65#include "internal.h" 64#include "internal.h"
@@ -106,7 +105,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
106SYSCALL_DEFINE1(uselib, const char __user *, library) 105SYSCALL_DEFINE1(uselib, const char __user *, library)
107{ 106{
108 struct file *file; 107 struct file *file;
109 char *tmp = getname(library); 108 struct filename *tmp = getname(library);
110 int error = PTR_ERR(tmp); 109 int error = PTR_ERR(tmp);
111 static const struct open_flags uselib_flags = { 110 static const struct open_flags uselib_flags = {
112 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
@@ -392,7 +391,7 @@ struct user_arg_ptr {
392 union { 391 union {
393 const char __user *const __user *native; 392 const char __user *const __user *native;
394#ifdef CONFIG_COMPAT 393#ifdef CONFIG_COMPAT
395 compat_uptr_t __user *compat; 394 const compat_uptr_t __user *compat;
396#endif 395#endif
397 } ptr; 396 } ptr;
398}; 397};
@@ -752,13 +751,14 @@ struct file *open_exec(const char *name)
752{ 751{
753 struct file *file; 752 struct file *file;
754 int err; 753 int err;
754 struct filename tmp = { .name = name };
755 static const struct open_flags open_exec_flags = { 755 static const struct open_flags open_exec_flags = {
756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
757 .acc_mode = MAY_EXEC | MAY_OPEN, 757 .acc_mode = MAY_EXEC | MAY_OPEN,
758 .intent = LOOKUP_OPEN 758 .intent = LOOKUP_OPEN
759 }; 759 };
760 760
761 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); 761 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
762 if (IS_ERR(file)) 762 if (IS_ERR(file))
763 goto out; 763 goto out;
764 764
@@ -1574,9 +1574,9 @@ int do_execve(const char *filename,
1574} 1574}
1575 1575
1576#ifdef CONFIG_COMPAT 1576#ifdef CONFIG_COMPAT
1577int compat_do_execve(char *filename, 1577int compat_do_execve(const char *filename,
1578 compat_uptr_t __user *__argv, 1578 const compat_uptr_t __user *__argv,
1579 compat_uptr_t __user *__envp, 1579 const compat_uptr_t __user *__envp,
1580 struct pt_regs *regs) 1580 struct pt_regs *regs)
1581{ 1581{
1582 struct user_arg_ptr argv = { 1582 struct user_arg_ptr argv = {
@@ -1658,3 +1658,56 @@ int get_dumpable(struct mm_struct *mm)
1658{ 1658{
1659 return __get_dumpable(mm->flags); 1659 return __get_dumpable(mm->flags);
1660} 1660}
1661
1662#ifdef __ARCH_WANT_SYS_EXECVE
1663SYSCALL_DEFINE3(execve,
1664 const char __user *, filename,
1665 const char __user *const __user *, argv,
1666 const char __user *const __user *, envp)
1667{
1668 struct filename *path = getname(filename);
1669 int error = PTR_ERR(path);
1670 if (!IS_ERR(path)) {
1671 error = do_execve(path->name, argv, envp, current_pt_regs());
1672 putname(path);
1673 }
1674 return error;
1675}
1676#ifdef CONFIG_COMPAT
1677asmlinkage long compat_sys_execve(const char __user * filename,
1678 const compat_uptr_t __user * argv,
1679 const compat_uptr_t __user * envp)
1680{
1681 struct filename *path = getname(filename);
1682 int error = PTR_ERR(path);
1683 if (!IS_ERR(path)) {
1684 error = compat_do_execve(path->name, argv, envp,
1685 current_pt_regs());
1686 putname(path);
1687 }
1688 return error;
1689}
1690#endif
1691#endif
1692
1693#ifdef __ARCH_WANT_KERNEL_EXECVE
1694int kernel_execve(const char *filename,
1695 const char *const argv[],
1696 const char *const envp[])
1697{
1698 struct pt_regs *p = current_pt_regs();
1699 int ret;
1700
1701 ret = do_execve(filename,
1702 (const char __user *const __user *)argv,
1703 (const char __user *const __user *)envp, p);
1704 if (ret < 0)
1705 return ret;
1706
1707 /*
1708 * We were successful. We won't be returning to our caller, but
1709 * instead to user space by manipulating the kernel stack.
1710 */
1711 ret_from_kernel_execve(p);
1712}
1713#endif
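
The two #ifdef blocks give architectures a generic sys_execve()/kernel_execve() to opt into instead of hand-rolled assembly wrappers. An arch that converts defines the want-macros and supplies two hooks; a hedged sketch for a hypothetical architecture (current_pt_regs and ret_from_kernel_execve are the hook names used in the code above, the header path is illustrative):

/* arch/foo/include/asm/unistd.h (hypothetical arch) */
#define __ARCH_WANT_SYS_EXECVE		/* take generic sys_execve()    */
#define __ARCH_WANT_KERNEL_EXECVE	/* take generic kernel_execve() */

/* Hooks the arch must provide: */
struct pt_regs *current_pt_regs(void);	/* current task's user regs     */
void ret_from_kernel_execve(struct pt_regs *regs) __noreturn;
					/* switch stacks, enter userland */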
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 1585db1aa365..f936cb50dc0d 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
814 struct bio *bio; 814 struct bio *bio;
815 815
816 if (per_dev != master_dev) { 816 if (per_dev != master_dev) {
817 bio = bio_kmalloc(GFP_KERNEL, 817 bio = bio_clone_kmalloc(master_dev->bio,
818 master_dev->bio->bi_max_vecs); 818 GFP_KERNEL);
819 if (unlikely(!bio)) { 819 if (unlikely(!bio)) {
820 ORE_DBGMSG( 820 ORE_DBGMSG(
821 "Failed to allocate BIO size=%u\n", 821 "Failed to allocate BIO size=%u\n",
@@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
824 goto out; 824 goto out;
825 } 825 }
826 826
827 __bio_clone(bio, master_dev->bio);
828 bio->bi_bdev = NULL; 827 bio->bi_bdev = NULL;
829 bio->bi_next = NULL; 828 bio->bi_next = NULL;
830 per_dev->offset = master_dev->offset; 829 per_dev->offset = master_dev->offset;
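
bio_clone_kmalloc() (added by this series' fs/bio.c changes) folds the old two-step allocate-then-clone into one call, which is why the explicit __bio_clone() disappears here. Before/after, in sketch form:

/* Before: size the new bio by hand, then copy the source's fields. */
bio = bio_kmalloc(GFP_KERNEL, src->bi_max_vecs);
if (bio)
	__bio_clone(bio, src);

/* After: allocate and clone in one call; returns NULL on failure. */
bio = bio_clone_kmalloc(src, GFP_KERNEL);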
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 59e3bbfac0b1..5e59280d42d7 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -389,8 +389,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
389 if (unlikely(ret)) 389 if (unlikely(ret))
390 goto out; 390 goto out;
391 391
392 lock_super(sb);
393
394 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 392 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
395 memset(fscb, 0, ios->length); 393 memset(fscb, 0, ios->length);
396 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 394 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -406,8 +404,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
406 if (unlikely(ret)) 404 if (unlikely(ret))
407 EXOFS_ERR("%s: ore_write failed.\n", __func__); 405 EXOFS_ERR("%s: ore_write failed.\n", __func__);
408 406
409
410 unlock_super(sb);
411out: 407out:
412 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 408 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
413 ore_put_io_state(ios); 409 ore_put_io_state(ios);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index ebf8312c3a4e..5366393528df 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2578,11 +2578,9 @@ out:
2578static int ext3_unfreeze(struct super_block *sb) 2578static int ext3_unfreeze(struct super_block *sb)
2579{ 2579{
2580 if (!(sb->s_flags & MS_RDONLY)) { 2580 if (!(sb->s_flags & MS_RDONLY)) {
2581 lock_super(sb);
2582	/* Reset the needs_recovery flag before the fs is unlocked. */ 2581	/* Reset the needs_recovery flag before the fs is unlocked. */
2583 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2582 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2584 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2583 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2585 unlock_super(sb);
2586 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2584 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2587 } 2585 }
2588 return 0; 2586 return 0;
@@ -2602,7 +2600,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2602#endif 2600#endif
2603 2601
2604 /* Store the original options */ 2602 /* Store the original options */
2605 lock_super(sb);
2606 old_sb_flags = sb->s_flags; 2603 old_sb_flags = sb->s_flags;
2607 old_opts.s_mount_opt = sbi->s_mount_opt; 2604 old_opts.s_mount_opt = sbi->s_mount_opt;
2608 old_opts.s_resuid = sbi->s_resuid; 2605 old_opts.s_resuid = sbi->s_resuid;
@@ -2708,8 +2705,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2708 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 2705 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2709 kfree(old_opts.s_qf_names[i]); 2706 kfree(old_opts.s_qf_names[i]);
2710#endif 2707#endif
2711 unlock_super(sb);
2712
2713 if (enable_quota) 2708 if (enable_quota)
2714 dquot_resume(sb, -1); 2709 dquot_resume(sb, -1);
2715 return 0; 2710 return 0;
@@ -2728,7 +2723,6 @@ restore_opts:
2728 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2723 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2729 } 2724 }
2730#endif 2725#endif
2731 unlock_super(sb);
2732 return err; 2726 return err;
2733} 2727}
2734 2728
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bca6d0a1255e..2a182342442e 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -571,7 +571,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
571 int short_len = 0, fill_len = 0; 571 int short_len = 0, fill_len = 0;
572 int ret = 0; 572 int ret = 0;
573 573
574 lock_super(sb); 574 mutex_lock(&sbi->s_lock);
575 575
576 cpos = filp->f_pos; 576 cpos = filp->f_pos;
577 /* Fake . and .. for the root directory. */ 577 /* Fake . and .. for the root directory. */
@@ -693,7 +693,7 @@ fill_failed:
693 if (unicode) 693 if (unicode)
694 __putname(unicode); 694 __putname(unicode);
695out: 695out:
696 unlock_super(sb); 696 mutex_unlock(&sbi->s_lock);
697 return ret; 697 return ret;
698} 698}
699 699
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ca7e8f8bad7c..623f36f0423b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -71,8 +71,9 @@ struct msdos_sb_info {
71 unsigned long root_cluster; /* first cluster of the root directory */ 71 unsigned long root_cluster; /* first cluster of the root directory */
72 unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ 72 unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
73 struct mutex fat_lock; 73 struct mutex fat_lock;
74 unsigned int prev_free; /* previously allocated cluster number */ 74 struct mutex s_lock;
75 unsigned int free_clusters; /* -1 if undefined */ 75 unsigned int prev_free; /* previously allocated cluster number */
76 unsigned int free_clusters; /* -1 if undefined */
76 unsigned int free_clus_valid; /* is free_clusters valid? */ 77 unsigned int free_clus_valid; /* is free_clusters valid? */
77 struct fat_mount_options options; 78 struct fat_mount_options options;
78 struct nls_table *nls_disk; /* Codepage used on disk */ 79 struct nls_table *nls_disk; /* Codepage used on disk */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 76f60c642c06..5bafaad00530 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -673,9 +673,9 @@ static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
673 if (inode->i_ino == MSDOS_FSINFO_INO) { 673 if (inode->i_ino == MSDOS_FSINFO_INO) {
674 struct super_block *sb = inode->i_sb; 674 struct super_block *sb = inode->i_sb;
675 675
676 lock_super(sb); 676 mutex_lock(&MSDOS_SB(sb)->s_lock);
677 err = fat_clusters_flush(sb); 677 err = fat_clusters_flush(sb);
678 unlock_super(sb); 678 mutex_unlock(&MSDOS_SB(sb)->s_lock);
679 } else 679 } else
680 err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 680 err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
681 681
@@ -1268,6 +1268,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1268 b = (struct fat_boot_sector *) bh->b_data; 1268 b = (struct fat_boot_sector *) bh->b_data;
1269 } 1269 }
1270 1270
1271 mutex_init(&sbi->s_lock);
1271 sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; 1272 sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
1272 sbi->cluster_bits = ffs(sbi->cluster_size) - 1; 1273 sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
1273 sbi->fats = b->fats; 1274 sbi->fats = b->fats;
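
These fat changes belong to the removal of the global lock_super()/unlock_super() helpers: serialization moves into a mutex owned by the filesystem itself. The recipe, as applied above and in the namei files below:

/* 1. Give the fs its own lock in its sb-private info. */
struct msdos_sb_info {
	/* ... */
	struct mutex s_lock;	/* replaces lock_super(sb) for this fs */
};

/* 2. Initialize it once at mount time (fat_fill_super()). */
mutex_init(&sbi->s_lock);

/* 3. Take it wherever lock_super(sb) used to be taken. */
mutex_lock(&MSDOS_SB(sb)->s_lock);
/* ... directory modification ... */
mutex_unlock(&MSDOS_SB(sb)->s_lock);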
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index c1055e778fff..e2cfda94a28d 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -208,7 +208,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
208 struct inode *inode; 208 struct inode *inode;
209 int err; 209 int err;
210 210
211 lock_super(sb); 211 mutex_lock(&MSDOS_SB(sb)->s_lock);
212 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 212 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
213 switch (err) { 213 switch (err) {
214 case -ENOENT: 214 case -ENOENT:
@@ -221,7 +221,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
221 default: 221 default:
222 inode = ERR_PTR(err); 222 inode = ERR_PTR(err);
223 } 223 }
224 unlock_super(sb); 224 mutex_unlock(&MSDOS_SB(sb)->s_lock);
225 return d_splice_alias(inode, dentry); 225 return d_splice_alias(inode, dentry);
226} 226}
227 227
@@ -273,7 +273,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
273 unsigned char msdos_name[MSDOS_NAME]; 273 unsigned char msdos_name[MSDOS_NAME];
274 int err, is_hid; 274 int err, is_hid;
275 275
276 lock_super(sb); 276 mutex_lock(&MSDOS_SB(sb)->s_lock);
277 277
278 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 278 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
279 msdos_name, &MSDOS_SB(sb)->options); 279 msdos_name, &MSDOS_SB(sb)->options);
@@ -302,7 +302,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
302 302
303 d_instantiate(dentry, inode); 303 d_instantiate(dentry, inode);
304out: 304out:
305 unlock_super(sb); 305 mutex_unlock(&MSDOS_SB(sb)->s_lock);
306 if (!err) 306 if (!err)
307 err = fat_flush_inodes(sb, dir, inode); 307 err = fat_flush_inodes(sb, dir, inode);
308 return err; 308 return err;
@@ -316,7 +316,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
316 struct fat_slot_info sinfo; 316 struct fat_slot_info sinfo;
317 int err; 317 int err;
318 318
319 lock_super(sb); 319 mutex_lock(&MSDOS_SB(sb)->s_lock);
320 /* 320 /*
321 * Check whether the directory is not in use, then check 321 * Check whether the directory is not in use, then check
322 * whether it is empty. 322 * whether it is empty.
@@ -337,7 +337,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
337 inode->i_ctime = CURRENT_TIME_SEC; 337 inode->i_ctime = CURRENT_TIME_SEC;
338 fat_detach(inode); 338 fat_detach(inode);
339out: 339out:
340 unlock_super(sb); 340 mutex_unlock(&MSDOS_SB(sb)->s_lock);
341 if (!err) 341 if (!err)
342 err = fat_flush_inodes(sb, dir, inode); 342 err = fat_flush_inodes(sb, dir, inode);
343 343
@@ -354,7 +354,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
354 struct timespec ts; 354 struct timespec ts;
355 int err, is_hid, cluster; 355 int err, is_hid, cluster;
356 356
357 lock_super(sb); 357 mutex_lock(&MSDOS_SB(sb)->s_lock);
358 358
359 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 359 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
360 msdos_name, &MSDOS_SB(sb)->options); 360 msdos_name, &MSDOS_SB(sb)->options);
@@ -392,14 +392,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
392 392
393 d_instantiate(dentry, inode); 393 d_instantiate(dentry, inode);
394 394
395 unlock_super(sb); 395 mutex_unlock(&MSDOS_SB(sb)->s_lock);
396 fat_flush_inodes(sb, dir, inode); 396 fat_flush_inodes(sb, dir, inode);
397 return 0; 397 return 0;
398 398
399out_free: 399out_free:
400 fat_free_clusters(dir, cluster); 400 fat_free_clusters(dir, cluster);
401out: 401out:
402 unlock_super(sb); 402 mutex_unlock(&MSDOS_SB(sb)->s_lock);
403 return err; 403 return err;
404} 404}
405 405
@@ -411,7 +411,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
411 struct fat_slot_info sinfo; 411 struct fat_slot_info sinfo;
412 int err; 412 int err;
413 413
414 lock_super(sb); 414 mutex_lock(&MSDOS_SB(sb)->s_lock);
415 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 415 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
416 if (err) 416 if (err)
417 goto out; 417 goto out;
@@ -423,7 +423,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
423 inode->i_ctime = CURRENT_TIME_SEC; 423 inode->i_ctime = CURRENT_TIME_SEC;
424 fat_detach(inode); 424 fat_detach(inode);
425out: 425out:
426 unlock_super(sb); 426 mutex_unlock(&MSDOS_SB(sb)->s_lock);
427 if (!err) 427 if (!err)
428 err = fat_flush_inodes(sb, dir, inode); 428 err = fat_flush_inodes(sb, dir, inode);
429 429
@@ -606,7 +606,7 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
606 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; 606 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
607 int err, is_hid; 607 int err, is_hid;
608 608
609 lock_super(sb); 609 mutex_lock(&MSDOS_SB(sb)->s_lock);
610 610
611 err = msdos_format_name(old_dentry->d_name.name, 611 err = msdos_format_name(old_dentry->d_name.name,
612 old_dentry->d_name.len, old_msdos_name, 612 old_dentry->d_name.len, old_msdos_name,
@@ -625,7 +625,7 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
625 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, 625 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
626 new_dir, new_msdos_name, new_dentry, is_hid); 626 new_dir, new_msdos_name, new_dentry, is_hid);
627out: 627out:
628 unlock_super(sb); 628 mutex_unlock(&MSDOS_SB(sb)->s_lock);
629 if (!err) 629 if (!err)
630 err = fat_flush_inodes(sb, old_dir, new_dir); 630 err = fat_flush_inodes(sb, old_dir, new_dir);
631 return err; 631 return err;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index e535dd75b986..ac959d655e7d 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -721,7 +721,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
721 struct dentry *alias; 721 struct dentry *alias;
722 int err; 722 int err;
723 723
724 lock_super(sb); 724 mutex_lock(&MSDOS_SB(sb)->s_lock);
725 725
726 err = vfat_find(dir, &dentry->d_name, &sinfo); 726 err = vfat_find(dir, &dentry->d_name, &sinfo);
727 if (err) { 727 if (err) {
@@ -752,13 +752,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
752 if (!S_ISDIR(inode->i_mode)) 752 if (!S_ISDIR(inode->i_mode))
753 d_move(alias, dentry); 753 d_move(alias, dentry);
754 iput(inode); 754 iput(inode);
755 unlock_super(sb); 755 mutex_unlock(&MSDOS_SB(sb)->s_lock);
756 return alias; 756 return alias;
757 } else 757 } else
758 dput(alias); 758 dput(alias);
759 759
760out: 760out:
761 unlock_super(sb); 761 mutex_unlock(&MSDOS_SB(sb)->s_lock);
762 dentry->d_time = dentry->d_parent->d_inode->i_version; 762 dentry->d_time = dentry->d_parent->d_inode->i_version;
763 dentry = d_splice_alias(inode, dentry); 763 dentry = d_splice_alias(inode, dentry);
764 if (dentry) 764 if (dentry)
@@ -766,7 +766,7 @@ out:
766 return dentry; 766 return dentry;
767 767
768error: 768error:
769 unlock_super(sb); 769 mutex_unlock(&MSDOS_SB(sb)->s_lock);
770 return ERR_PTR(err); 770 return ERR_PTR(err);
771} 771}
772 772
@@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
779 struct timespec ts; 779 struct timespec ts;
780 int err; 780 int err;
781 781
782 lock_super(sb); 782 mutex_lock(&MSDOS_SB(sb)->s_lock);
783 783
784 ts = CURRENT_TIME_SEC; 784 ts = CURRENT_TIME_SEC;
785 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); 785 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -800,7 +800,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
800 dentry->d_time = dentry->d_parent->d_inode->i_version; 800 dentry->d_time = dentry->d_parent->d_inode->i_version;
801 d_instantiate(dentry, inode); 801 d_instantiate(dentry, inode);
802out: 802out:
803 unlock_super(sb); 803 mutex_unlock(&MSDOS_SB(sb)->s_lock);
804 return err; 804 return err;
805} 805}
806 806
@@ -811,7 +811,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
811 struct fat_slot_info sinfo; 811 struct fat_slot_info sinfo;
812 int err; 812 int err;
813 813
814 lock_super(sb); 814 mutex_lock(&MSDOS_SB(sb)->s_lock);
815 815
816 err = fat_dir_empty(inode); 816 err = fat_dir_empty(inode);
817 if (err) 817 if (err)
@@ -829,7 +829,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
829 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 829 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
830 fat_detach(inode); 830 fat_detach(inode);
831out: 831out:
832 unlock_super(sb); 832 mutex_unlock(&MSDOS_SB(sb)->s_lock);
833 833
834 return err; 834 return err;
835} 835}
@@ -841,7 +841,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
841 struct fat_slot_info sinfo; 841 struct fat_slot_info sinfo;
842 int err; 842 int err;
843 843
844 lock_super(sb); 844 mutex_lock(&MSDOS_SB(sb)->s_lock);
845 845
846 err = vfat_find(dir, &dentry->d_name, &sinfo); 846 err = vfat_find(dir, &dentry->d_name, &sinfo);
847 if (err) 847 if (err)
@@ -854,7 +854,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
854 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 854 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
855 fat_detach(inode); 855 fat_detach(inode);
856out: 856out:
857 unlock_super(sb); 857 mutex_unlock(&MSDOS_SB(sb)->s_lock);
858 858
859 return err; 859 return err;
860} 860}
@@ -867,7 +867,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
867 struct timespec ts; 867 struct timespec ts;
868 int err, cluster; 868 int err, cluster;
869 869
870 lock_super(sb); 870 mutex_lock(&MSDOS_SB(sb)->s_lock);
871 871
872 ts = CURRENT_TIME_SEC; 872 ts = CURRENT_TIME_SEC;
873 cluster = fat_alloc_new_dir(dir, &ts); 873 cluster = fat_alloc_new_dir(dir, &ts);
@@ -896,13 +896,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
896 dentry->d_time = dentry->d_parent->d_inode->i_version; 896 dentry->d_time = dentry->d_parent->d_inode->i_version;
897 d_instantiate(dentry, inode); 897 d_instantiate(dentry, inode);
898 898
899 unlock_super(sb); 899 mutex_unlock(&MSDOS_SB(sb)->s_lock);
900 return 0; 900 return 0;
901 901
902out_free: 902out_free:
903 fat_free_clusters(dir, cluster); 903 fat_free_clusters(dir, cluster);
904out: 904out:
905 unlock_super(sb); 905 mutex_unlock(&MSDOS_SB(sb)->s_lock);
906 return err; 906 return err;
907} 907}
908 908
@@ -921,7 +921,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
921 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 921 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
922 old_inode = old_dentry->d_inode; 922 old_inode = old_dentry->d_inode;
923 new_inode = new_dentry->d_inode; 923 new_inode = new_dentry->d_inode;
924 lock_super(sb); 924 mutex_lock(&MSDOS_SB(sb)->s_lock);
925 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); 925 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
926 if (err) 926 if (err)
927 goto out; 927 goto out;
@@ -996,7 +996,7 @@ out:
996 brelse(sinfo.bh); 996 brelse(sinfo.bh);
997 brelse(dotdot_bh); 997 brelse(dotdot_bh);
998 brelse(old_sinfo.bh); 998 brelse(old_sinfo.bh);
999 unlock_super(sb); 999 mutex_unlock(&MSDOS_SB(sb)->s_lock);
1000 1000
1001 return err; 1001 return err;
1002 1002
diff --git a/fs/file.c b/fs/file.c
index 0f1bda4bebfa..d3b5fa80b71b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -922,6 +922,9 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
922 if ((flags & ~O_CLOEXEC) != 0) 922 if ((flags & ~O_CLOEXEC) != 0)
923 return -EINVAL; 923 return -EINVAL;
924 924
925 if (unlikely(oldfd == newfd))
926 return -EINVAL;
927
925 if (newfd >= rlimit(RLIMIT_NOFILE)) 928 if (newfd >= rlimit(RLIMIT_NOFILE))
926 return -EMFILE; 929 return -EMFILE;
927 930
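
dup2(oldfd, newfd) with oldfd == newfd is a successful no-op, but dup3() is documented to fail with EINVAL in that case; the added check restores the documented behavior. A small userspace demonstration (assumes glibc's dup3() wrapper and _GNU_SOURCE):

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int r2 = dup2(1, 1);		/* succeeds: returns 1          */
	errno = 0;
	int r3 = dup3(1, 1, 0);		/* fails: returns -1, EINVAL    */
	printf("dup2=%d dup3=%d errno=%d\n", r2, r3, errno);
	return 0;
}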
diff --git a/fs/file_table.c b/fs/file_table.c
index dac67923330f..a72bf9ddd0d2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,7 +36,7 @@ struct files_stat_struct files_stat = {
36 .max_files = NR_FILE 36 .max_files = NR_FILE
37}; 37};
38 38
39DEFINE_LGLOCK(files_lglock); 39DEFINE_STATIC_LGLOCK(files_lglock);
40 40
41/* SLAB cache for file structures */ 41/* SLAB cache for file structures */
42static struct kmem_cache *filp_cachep __read_mostly; 42static struct kmem_cache *filp_cachep __read_mostly;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 96f24286667a..da165f6adcbf 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(unregister_filesystem);
124static int fs_index(const char __user * __name) 124static int fs_index(const char __user * __name)
125{ 125{
126 struct file_system_type * tmp; 126 struct file_system_type * tmp;
127 char * name; 127 struct filename *name;
128 int err, index; 128 int err, index;
129 129
130 name = getname(__name); 130 name = getname(__name);
@@ -135,7 +135,7 @@ static int fs_index(const char __user * __name)
135 err = -EINVAL; 135 err = -EINVAL;
136 read_lock(&file_systems_lock); 136 read_lock(&file_systems_lock);
137 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { 137 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
138 if (strcmp(tmp->name,name) == 0) { 138 if (strcmp(tmp->name, name->name) == 0) {
139 err = index; 139 err = index;
140 break; 140 break;
141 } 141 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 401b6c6248ae..51ea267d444c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -249,7 +249,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
249} 249}
250 250
251/* 251/*
252 * Move expired (dirtied after work->older_than_this) dirty inodes from 252 * Move expired (dirtied before work->older_than_this) dirty inodes from
253 * @delaying_queue to @dispatch_queue. 253 * @delaying_queue to @dispatch_queue.
254 */ 254 */
255static int move_expired_inodes(struct list_head *delaying_queue, 255static int move_expired_inodes(struct list_head *delaying_queue,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index e8ed6d4a6181..4767774a5f3e 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -161,6 +161,8 @@ static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
161 case GFS2_SMALL_FH_SIZE: 161 case GFS2_SMALL_FH_SIZE:
162 case GFS2_LARGE_FH_SIZE: 162 case GFS2_LARGE_FH_SIZE:
163 case GFS2_OLD_FH_SIZE: 163 case GFS2_OLD_FH_SIZE:
164 if (fh_len < GFS2_SMALL_FH_SIZE)
165 return NULL;
164 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; 166 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
165 this.no_formal_ino |= be32_to_cpu(fh[1]); 167 this.no_formal_ino |= be32_to_cpu(fh[1]);
166 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; 168 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
@@ -180,6 +182,8 @@ static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
180 switch (fh_type) { 182 switch (fh_type) {
181 case GFS2_LARGE_FH_SIZE: 183 case GFS2_LARGE_FH_SIZE:
182 case GFS2_OLD_FH_SIZE: 184 case GFS2_OLD_FH_SIZE:
185 if (fh_len < GFS2_LARGE_FH_SIZE)
186 return NULL;
183 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; 187 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
184 parent.no_formal_ino |= be32_to_cpu(fh[5]); 188 parent.no_formal_ino |= be32_to_cpu(fh[5]);
185 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; 189 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
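
A file handle arrives from outside the kernel (e.g. from an NFS client), so fh_type alone cannot be trusted to imply a length; both decoders now check fh_len before dereferencing fh[] words. The isofs hunk below applies the same hardening. The shape of the check:

/* fh[] is an array of __be32; decoding fh[0..3] needs at least
 * GFS2_SMALL_FH_SIZE bytes, fh[4..7] at least GFS2_LARGE_FH_SIZE. */
if (fh_len < GFS2_SMALL_FH_SIZE)
	return NULL;	/* short/garbage handle: refuse to decode */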
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 1fe731337f07..9c88da0e855a 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -1,7 +1,7 @@
1#ifndef __UM_FS_HOSTFS 1#ifndef __UM_FS_HOSTFS
2#define __UM_FS_HOSTFS 2#define __UM_FS_HOSTFS
3 3
4#include "os.h" 4#include <os.h>
5 5
6/* 6/*
7 * These are exactly the same definitions as in fs.h, but the names are 7 * These are exactly the same definitions as in fs.h, but the names are
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 6c9f3a9d5e21..457addc5c91f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -16,8 +16,8 @@
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include "hostfs.h" 18#include "hostfs.h"
19#include "init.h" 19#include <init.h>
20#include "kern.h" 20#include <kern.h>
21 21
22struct hostfs_inode_info { 22struct hostfs_inode_info {
23 int fd; 23 int fd;
@@ -848,9 +848,11 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
848 attr->ia_size != i_size_read(inode)) { 848 attr->ia_size != i_size_read(inode)) {
849 int error; 849 int error;
850 850
851 error = vmtruncate(inode, attr->ia_size); 851 error = inode_newsize_ok(inode, attr->ia_size);
852 if (err) 852 if (error)
853 return err; 853 return error;
854
855 truncate_setsize(inode, attr->ia_size);
854 } 856 }
855 857
856 setattr_copy(inode, attr); 858 setattr_copy(inode, attr);
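
Besides dropping the deprecated vmtruncate() (and fixing the old hunk's err/error variable mix-up), the modern idiom is to validate the new size first and only then commit it. The two-step replacement, as used above:

if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) {
	int error = inode_newsize_ok(inode, attr->ia_size);
	if (error)			/* e.g. -EFBIG against rlimits */
		return error;
	truncate_setsize(inode, attr->ia_size);	/* update i_size, drop pages */
}
setattr_copy(inode, attr);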
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index a74ad0d371c2..67838f3aa20a 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -15,7 +15,6 @@
15#include <sys/types.h> 15#include <sys/types.h>
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include "hostfs.h" 17#include "hostfs.h"
18#include "os.h"
19#include <utime.h> 18#include <utime.h>
20 19
21static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) 20static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index bc28bf077a6a..a3076228523d 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -398,7 +398,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
398 *flags |= MS_NOATIME; 398 *flags |= MS_NOATIME;
399 399
400 hpfs_lock(s); 400 hpfs_lock(s);
401 lock_super(s);
402 uid = sbi->sb_uid; gid = sbi->sb_gid; 401 uid = sbi->sb_uid; gid = sbi->sb_gid;
403 umask = 0777 & ~sbi->sb_mode; 402 umask = 0777 & ~sbi->sb_mode;
404 lowercase = sbi->sb_lowercase; 403 lowercase = sbi->sb_lowercase;
@@ -431,12 +430,10 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
431 430
432 replace_mount_options(s, new_opts); 431 replace_mount_options(s, new_opts);
433 432
434 unlock_super(s);
435 hpfs_unlock(s); 433 hpfs_unlock(s);
436 return 0; 434 return 0;
437 435
438out_err: 436out_err:
439 unlock_super(s);
440 hpfs_unlock(s); 437 hpfs_unlock(s);
441 kfree(new_opts); 438 kfree(new_opts);
442 return -EINVAL; 439 return -EINVAL;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index c1dffe47fde2..78f21f8dc2ec 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -18,7 +18,7 @@
18#include <linux/pid_namespace.h> 18#include <linux/pid_namespace.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21#include "os.h" 21#include <os.h>
22 22
23static struct inode *get_inode(struct super_block *, struct dentry *); 23static struct inode *get_inode(struct super_block *, struct dentry *);
24 24
@@ -674,7 +674,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
674 674
675 if (!inode) { 675 if (!inode) {
676 dput(dentry); 676 dput(dentry);
677 return ERR_PTR(-ENOMEM); 677 return NULL;
678 } 678 }
679 679
680 if (S_ISDIR(dentry->d_inode->i_mode)) { 680 if (S_ISDIR(dentry->d_inode->i_mode)) {
diff --git a/fs/internal.h b/fs/internal.h
index 371bcc4b1697..916b7cbf3e3e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -97,8 +97,8 @@ struct open_flags {
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99}; 99};
100extern struct file *do_filp_open(int dfd, const char *pathname, 100extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int lookup_flags); 101 const struct open_flags *op, int flags);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 103 const char *, const struct open_flags *, int lookup_flags);
104 104
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 1d3804492aa7..2b4f2358eadb 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -175,7 +175,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
175{ 175{
176 struct isofs_fid *ifid = (struct isofs_fid *)fid; 176 struct isofs_fid *ifid = (struct isofs_fid *)fid;
177 177
178 if (fh_type != 2) 178 if (fh_len < 2 || fh_type != 2)
179 return NULL; 179 return NULL;
180 180
181 return isofs_export_iget(sb, 181 return isofs_export_iget(sb,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ff487954cd96..d3d8799e2187 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -100,6 +100,10 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
100{ 100{
101 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 101 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
102 102
103#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
104 cancel_delayed_work_sync(&c->wbuf_dwork);
105#endif
106
103 mutex_lock(&c->alloc_sem); 107 mutex_lock(&c->alloc_sem);
104 jffs2_flush_wbuf_pad(c); 108 jffs2_flush_wbuf_pad(c);
105 mutex_unlock(&c->alloc_sem); 109 mutex_unlock(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 6f4529d3697f..a6597d60d76d 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1044,10 +1044,10 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1044 ops.datbuf = NULL; 1044 ops.datbuf = NULL;
1045 1045
1046 ret = mtd_read_oob(c->mtd, jeb->offset, &ops); 1046 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1047 if (ret || ops.oobretlen != ops.ooblen) { 1047 if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
1048 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", 1048 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
1049 jeb->offset, ops.ooblen, ops.oobretlen, ret); 1049 jeb->offset, ops.ooblen, ops.oobretlen, ret);
1050 if (!ret) 1050 if (!ret || mtd_is_bitflip(ret))
1051 ret = -EIO; 1051 ret = -EIO;
1052 return ret; 1052 return ret;
1053 } 1053 }
@@ -1086,10 +1086,10 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1086 ops.datbuf = NULL; 1086 ops.datbuf = NULL;
1087 1087
1088 ret = mtd_read_oob(c->mtd, jeb->offset, &ops); 1088 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1089 if (ret || ops.oobretlen != ops.ooblen) { 1089 if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
1090 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", 1090 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
1091 jeb->offset, ops.ooblen, ops.oobretlen, ret); 1091 jeb->offset, ops.ooblen, ops.oobretlen, ret);
1092 if (!ret) 1092 if (!ret || mtd_is_bitflip(ret))
1093 ret = -EIO; 1093 ret = -EIO;
1094 return ret; 1094 return ret;
1095 } 1095 }
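
mtd_read_oob() reports corrected bitflips as -EUCLEAN: the data is good but the eraseblock is degrading. mtd_is_bitflip() is the helper for exactly that test, so both checks now let corrected reads pass while still turning a short read into -EIO. The decision table in sketch form:

ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
/* ret == 0           : clean read             -> proceed
 * mtd_is_bitflip(ret): ECC-corrected read     -> proceed
 * other ret < 0      : hard failure           -> return ret
 * short oobretlen    : inconsistent "success" -> return -EIO */
if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
	if (!ret || mtd_is_bitflip(ret))
		ret = -EIO;
	return ret;
}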
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7ef14b3c5bee..e4fb3ba5a58a 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h>
11#include <linux/kernel.h> 10#include <linux/kernel.h>
12#include <linux/ktime.h> 11#include <linux/ktime.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
@@ -19,6 +18,8 @@
19 18
20#include <asm/unaligned.h> 19#include <asm/unaligned.h>
21 20
21#include "netns.h"
22
22#define NLMDBG_FACILITY NLMDBG_MONITOR 23#define NLMDBG_FACILITY NLMDBG_MONITOR
23#define NSM_PROGRAM 100024 24#define NSM_PROGRAM 100024
24#define NSM_VERSION 1 25#define NSM_VERSION 1
@@ -40,6 +41,7 @@ struct nsm_args {
40 u32 proc; 41 u32 proc;
41 42
42 char *mon_name; 43 char *mon_name;
44 char *nodename;
43}; 45};
44 46
45struct nsm_res { 47struct nsm_res {
@@ -70,7 +72,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
70 }; 72 };
71 struct rpc_create_args args = { 73 struct rpc_create_args args = {
72 .net = net, 74 .net = net,
73 .protocol = XPRT_TRANSPORT_UDP, 75 .protocol = XPRT_TRANSPORT_TCP,
74 .address = (struct sockaddr *)&sin, 76 .address = (struct sockaddr *)&sin,
75 .addrsize = sizeof(sin), 77 .addrsize = sizeof(sin),
76 .servername = "rpc.statd", 78 .servername = "rpc.statd",
@@ -83,10 +85,54 @@ static struct rpc_clnt *nsm_create(struct net *net)
83 return rpc_create(&args); 85 return rpc_create(&args);
84} 86}
85 87
86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, 88static struct rpc_clnt *nsm_client_get(struct net *net)
87 struct net *net)
88{ 89{
90 static DEFINE_MUTEX(nsm_create_mutex);
89 struct rpc_clnt *clnt; 91 struct rpc_clnt *clnt;
92 struct lockd_net *ln = net_generic(net, lockd_net_id);
93
94 spin_lock(&ln->nsm_clnt_lock);
95 if (ln->nsm_users) {
96 ln->nsm_users++;
97 clnt = ln->nsm_clnt;
98 spin_unlock(&ln->nsm_clnt_lock);
99 goto out;
100 }
101 spin_unlock(&ln->nsm_clnt_lock);
102
103 mutex_lock(&nsm_create_mutex);
104 clnt = nsm_create(net);
105 if (!IS_ERR(clnt)) {
106 ln->nsm_clnt = clnt;
107 smp_wmb();
108 ln->nsm_users = 1;
109 }
110 mutex_unlock(&nsm_create_mutex);
111out:
112 return clnt;
113}
114
115static void nsm_client_put(struct net *net)
116{
117 struct lockd_net *ln = net_generic(net, lockd_net_id);
118 struct rpc_clnt *clnt = ln->nsm_clnt;
119 int shutdown = 0;
120
121 spin_lock(&ln->nsm_clnt_lock);
122 if (ln->nsm_users) {
123 if (--ln->nsm_users)
124 ln->nsm_clnt = NULL;
125 shutdown = !ln->nsm_users;
126 }
127 spin_unlock(&ln->nsm_clnt_lock);
128
129 if (shutdown)
130 rpc_shutdown_client(clnt);
131}
132
133static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
134 struct rpc_clnt *clnt)
135{
90 int status; 136 int status;
91 struct nsm_args args = { 137 struct nsm_args args = {
92 .priv = &nsm->sm_priv, 138 .priv = &nsm->sm_priv,
@@ -94,31 +140,24 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
94 .vers = 3, 140 .vers = 3,
95 .proc = NLMPROC_NSM_NOTIFY, 141 .proc = NLMPROC_NSM_NOTIFY,
96 .mon_name = nsm->sm_mon_name, 142 .mon_name = nsm->sm_mon_name,
143 .nodename = clnt->cl_nodename,
97 }; 144 };
98 struct rpc_message msg = { 145 struct rpc_message msg = {
99 .rpc_argp = &args, 146 .rpc_argp = &args,
100 .rpc_resp = res, 147 .rpc_resp = res,
101 }; 148 };
102 149
103 clnt = nsm_create(net); 150 BUG_ON(clnt == NULL);
104 if (IS_ERR(clnt)) {
105 status = PTR_ERR(clnt);
106 dprintk("lockd: failed to create NSM upcall transport, "
107 "status=%d\n", status);
108 goto out;
109 }
110 151
111 memset(res, 0, sizeof(*res)); 152 memset(res, 0, sizeof(*res));
112 153
113 msg.rpc_proc = &clnt->cl_procinfo[proc]; 154 msg.rpc_proc = &clnt->cl_procinfo[proc];
114 status = rpc_call_sync(clnt, &msg, 0); 155 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
115 if (status < 0) 156 if (status < 0)
116 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 157 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
117 status); 158 status);
118 else 159 else
119 status = 0; 160 status = 0;
120 rpc_shutdown_client(clnt);
121 out:
122 return status; 161 return status;
123} 162}
124 163
@@ -138,6 +177,7 @@ int nsm_monitor(const struct nlm_host *host)
138 struct nsm_handle *nsm = host->h_nsmhandle; 177 struct nsm_handle *nsm = host->h_nsmhandle;
139 struct nsm_res res; 178 struct nsm_res res;
140 int status; 179 int status;
180 struct rpc_clnt *clnt;
141 181
142 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 182 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
143 183
@@ -150,7 +190,15 @@ int nsm_monitor(const struct nlm_host *host)
150 */ 190 */
151 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 191 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
152 192
153 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net); 193 clnt = nsm_client_get(host->net);
194 if (IS_ERR(clnt)) {
195 status = PTR_ERR(clnt);
196 dprintk("lockd: failed to create NSM upcall transport, "
197 "status=%d, net=%p\n", status, host->net);
198 return status;
199 }
200
201 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
154 if (unlikely(res.status != 0)) 202 if (unlikely(res.status != 0))
155 status = -EIO; 203 status = -EIO;
156 if (unlikely(status < 0)) { 204 if (unlikely(status < 0)) {
@@ -182,9 +230,11 @@ void nsm_unmonitor(const struct nlm_host *host)
182 230
183 if (atomic_read(&nsm->sm_count) == 1 231 if (atomic_read(&nsm->sm_count) == 1
184 && nsm->sm_monitored && !nsm->sm_sticky) { 232 && nsm->sm_monitored && !nsm->sm_sticky) {
233 struct lockd_net *ln = net_generic(host->net, lockd_net_id);
234
185 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); 235 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
186 236
187 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net); 237 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
188 if (res.status != 0) 238 if (res.status != 0)
189 status = -EIO; 239 status = -EIO;
190 if (status < 0) 240 if (status < 0)
@@ -192,6 +242,8 @@ void nsm_unmonitor(const struct nlm_host *host)
192 nsm->sm_name); 242 nsm->sm_name);
193 else 243 else
194 nsm->sm_monitored = 0; 244 nsm->sm_monitored = 0;
245
246 nsm_client_put(host->net);
195 } 247 }
196} 248}
197 249
@@ -430,7 +482,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
430{ 482{
431 __be32 *p; 483 __be32 *p;
432 484
433 encode_nsm_string(xdr, utsname()->nodename); 485 encode_nsm_string(xdr, argp->nodename);
434 p = xdr_reserve_space(xdr, 4 + 4 + 4); 486 p = xdr_reserve_space(xdr, 4 + 4 + 4);
435 *p++ = cpu_to_be32(argp->prog); 487 *p++ = cpu_to_be32(argp->prog);
436 *p++ = cpu_to_be32(argp->vers); 488 *p++ = cpu_to_be32(argp->vers);
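
The mon.c rework turns a create-and-destroy-per-RPC pattern into one shared, counted client per network namespace: nsm_client_get() bumps nsm_users under nsm_clnt_lock, with a mutex-serialized slow path that creates and publishes the client (the smp_wmb() orders the pointer store before the count becomes nonzero). One caveat worth noting: as written, nsm_client_put() clears ln->nsm_clnt while users remain and keeps it on the final put, which looks inverted. A more conventional put for this pattern would look like the following sketch (not the diff's code):

/* Hedged sketch: retire the shared client only when the last user
 * drops its reference. */
static void nsm_client_put(struct net *net)
{
	struct lockd_net *ln = net_generic(net, lockd_net_id);
	struct rpc_clnt *clnt = NULL;

	spin_lock(&ln->nsm_clnt_lock);
	if (ln->nsm_users && --ln->nsm_users == 0) {
		clnt = ln->nsm_clnt;
		ln->nsm_clnt = NULL;	/* last user: unpublish */
	}
	spin_unlock(&ln->nsm_clnt_lock);

	if (clnt)
		rpc_shutdown_client(clnt);
}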
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 4eee248ba96e..5010b55628b4 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,6 +12,10 @@ struct lockd_net {
12 struct delayed_work grace_period_end; 12 struct delayed_work grace_period_end;
13 struct lock_manager lockd_manager; 13 struct lock_manager lockd_manager;
14 struct list_head grace_list; 14 struct list_head grace_list;
15
16 spinlock_t nsm_clnt_lock;
17 unsigned int nsm_users;
18 struct rpc_clnt *nsm_clnt;
15}; 19};
16 20
17extern int lockd_net_id; 21extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 31a63f87b806..a2aa97d45670 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -126,7 +126,7 @@ static void restart_grace(void)
126static int 126static int
127lockd(void *vrqstp) 127lockd(void *vrqstp)
128{ 128{
129 int err = 0, preverr = 0; 129 int err = 0;
130 struct svc_rqst *rqstp = vrqstp; 130 struct svc_rqst *rqstp = vrqstp;
131 131
132 /* try_to_freeze() is called from svc_recv() */ 132 /* try_to_freeze() is called from svc_recv() */
@@ -165,21 +165,8 @@ lockd(void *vrqstp)
165 * recvfrom routine. 165 * recvfrom routine.
166 */ 166 */
167 err = svc_recv(rqstp, timeout); 167 err = svc_recv(rqstp, timeout);
168 if (err == -EAGAIN || err == -EINTR) { 168 if (err == -EAGAIN || err == -EINTR)
169 preverr = err;
170 continue; 169 continue;
171 }
172 if (err < 0) {
173 if (err != preverr) {
174 printk(KERN_WARNING "%s: unexpected error "
175 "from svc_recv (%d)\n", __func__, err);
176 preverr = err;
177 }
178 schedule_timeout_interruptible(HZ);
179 continue;
180 }
181 preverr = err;
182
183 dprintk("lockd: request from %s\n", 170 dprintk("lockd: request from %s\n",
184 svc_print_addr(rqstp, buf, sizeof(buf))); 171 svc_print_addr(rqstp, buf, sizeof(buf)));
185 172
@@ -596,6 +583,7 @@ static int lockd_init_net(struct net *net)
596 583
597 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 584 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
598 INIT_LIST_HEAD(&ln->grace_list); 585 INIT_LIST_HEAD(&ln->grace_list);
586 spin_lock_init(&ln->nsm_clnt_lock);
599 return 0; 587 return 0;
600} 588}
601 589
diff --git a/fs/locks.c b/fs/locks.c
index abc7dc6c490b..a94e331a52a2 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1289,7 +1289,7 @@ EXPORT_SYMBOL(__break_lease);
1289void lease_get_mtime(struct inode *inode, struct timespec *time) 1289void lease_get_mtime(struct inode *inode, struct timespec *time)
1290{ 1290{
1291 struct file_lock *flock = inode->i_flock; 1291 struct file_lock *flock = inode->i_flock;
1292 if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK)) 1292 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
1293 *time = current_fs_time(inode->i_sb); 1293 *time = current_fs_time(inode->i_sb);
1294 else 1294 else
1295 *time = inode->i_mtime; 1295 *time = inode->i_mtime;
@@ -2185,8 +2185,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2185 } else { 2185 } else {
2186 seq_printf(f, "%s ", 2186 seq_printf(f, "%s ",
2187 (lease_breaking(fl)) 2187 (lease_breaking(fl))
2188 ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " 2188 ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ "
2189 : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); 2189 : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
2190 } 2190 }
2191 if (inode) { 2191 if (inode) {
2192#ifdef WE_CAN_BREAK_LSLK_NOW 2192#ifdef WE_CAN_BREAK_LSLK_NOW
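
fl_type is a plain enumerated value, not a bit mask, so & only works by coincidence of the constants' bit patterns; == states the actual intent. With the generic definitions (include/uapi/asm-generic/fcntl.h) the values are sequential:

/* #define F_RDLCK 0
 * #define F_WRLCK 1
 * #define F_UNLCK 2
 *
 * (fl_type & F_WRLCK) happens to be 0 for F_RDLCK and F_UNLCK today,
 * but it would also match any odd value, and (fl_type & F_RDLCK) is
 * always 0 -- the bitwise form cannot express "is exactly this lock
 * type". Hence: */
if (fl->fl_type == F_WRLCK)
	/* a writer holds the lease */;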
diff --git a/fs/namei.c b/fs/namei.c
index aa30d19e9edd..d1895f308156 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -117,18 +117,70 @@
117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
118 * PATH_MAX includes the nul terminator --RR. 118 * PATH_MAX includes the nul terminator --RR.
119 */ 119 */
120static char *getname_flags(const char __user *filename, int flags, int *empty) 120void final_putname(struct filename *name)
121{ 121{
122 char *result = __getname(), *err; 122 if (name->separate) {
123 __putname(name->name);
124 kfree(name);
125 } else {
126 __putname(name);
127 }
128}
129
130#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
131
132static struct filename *
133getname_flags(const char __user *filename, int flags, int *empty)
134{
135 struct filename *result, *err;
123 int len; 136 int len;
137 long max;
138 char *kname;
124 139
140 result = audit_reusename(filename);
141 if (result)
142 return result;
143
144 result = __getname();
125 if (unlikely(!result)) 145 if (unlikely(!result))
126 return ERR_PTR(-ENOMEM); 146 return ERR_PTR(-ENOMEM);
127 147
128 len = strncpy_from_user(result, filename, PATH_MAX); 148 /*
129 err = ERR_PTR(len); 149 * First, try to embed the struct filename inside the names_cache
130 if (unlikely(len < 0)) 150 * allocation
151 */
152 kname = (char *)result + sizeof(*result);
153 result->name = kname;
154 result->separate = false;
155 max = EMBEDDED_NAME_MAX;
156
157recopy:
158 len = strncpy_from_user(kname, filename, max);
159 if (unlikely(len < 0)) {
160 err = ERR_PTR(len);
131 goto error; 161 goto error;
162 }
163
164 /*
165 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
166 * separate struct filename so we can dedicate the entire
167 * names_cache allocation for the pathname, and re-do the copy from
168 * userland.
169 */
170 if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
171 kname = (char *)result;
172
173 result = kzalloc(sizeof(*result), GFP_KERNEL);
174 if (!result) {
175 err = ERR_PTR(-ENOMEM);
176 result = (struct filename *)kname;
177 goto error;
178 }
179 result->name = kname;
180 result->separate = true;
181 max = PATH_MAX;
182 goto recopy;
183 }
132 184
133 /* The empty path is special. */ 185 /* The empty path is special. */
134 if (unlikely(!len)) { 186 if (unlikely(!len)) {
@@ -140,30 +192,32 @@ static char *getname_flags(const char __user *filename, int flags, int *empty)
140 } 192 }
141 193
142 err = ERR_PTR(-ENAMETOOLONG); 194 err = ERR_PTR(-ENAMETOOLONG);
143 if (likely(len < PATH_MAX)) { 195 if (unlikely(len >= PATH_MAX))
144 audit_getname(result); 196 goto error;
145 return result; 197
146 } 198 result->uptr = filename;
199 audit_getname(result);
200 return result;
147 201
148error: 202error:
149 __putname(result); 203 final_putname(result);
150 return err; 204 return err;
151} 205}
152 206
153char *getname(const char __user * filename) 207struct filename *
208getname(const char __user * filename)
154{ 209{
155 return getname_flags(filename, 0, NULL); 210 return getname_flags(filename, 0, NULL);
156} 211}
212EXPORT_SYMBOL(getname);
157 213
158#ifdef CONFIG_AUDITSYSCALL 214#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 215void putname(struct filename *name)
160{ 216{
161 if (unlikely(!audit_dummy_context())) 217 if (unlikely(!audit_dummy_context()))
162 audit_putname(name); 218 return audit_putname(name);
163 else 219 final_putname(name);
164 __putname(name);
165} 220}
166EXPORT_SYMBOL(putname);
167#endif 221#endif
168 222
169static int check_acl(struct inode *inode, int mask) 223static int check_acl(struct inode *inode, int mask)
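
The rewritten getname_flags() usually makes a single allocation: the struct filename header and the name share one names_cache object (PATH_MAX bytes), leaving the fast path EMBEDDED_NAME_MAX = PATH_MAX - sizeof(struct filename) bytes for the string. Only a copy that exactly fills that space falls back to the slow path, which re-uses the whole allocation for the string and moves the header to its own kzalloc() with ->separate set. Layout, assuming PATH_MAX = 4096:

/* Fast path: one names_cache allocation, header + name together.
 *
 *   +-----------------+--------------------------------------------+
 *   | struct filename | name bytes (EMBEDDED_NAME_MAX)             |
 *   +-----------------+--------------------------------------------+
 *   ^ result            ^ result->name       result->separate = false
 *
 * Slow path (len hit EMBEDDED_NAME_MAX): the whole allocation becomes
 * the string; the header lives in a separate kzalloc().
 *
 *   +--------------------------------------------------------------+
 *   | name bytes (up to PATH_MAX, including the NUL)               |
 *   +--------------------------------------------------------------+
 *   ^ result->name                           result->separate = true
 */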
@@ -692,9 +746,9 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
692 if (uid_eq(parent->i_uid, inode->i_uid)) 746 if (uid_eq(parent->i_uid, inode->i_uid))
693 return 0; 747 return 0;
694 748
749 audit_log_link_denied("follow_link", link);
695 path_put_conditional(link, nd); 750 path_put_conditional(link, nd);
696 path_put(&nd->path); 751 path_put(&nd->path);
697 audit_log_link_denied("follow_link", link);
698 return -EACCES; 752 return -EACCES;
699} 753}
700 754
@@ -810,6 +864,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
810 return error; 864 return error;
811 865
812out_put_nd_path: 866out_put_nd_path:
867 *p = NULL;
813 path_put(&nd->path); 868 path_put(&nd->path);
814 path_put(link); 869 path_put(link);
815 return error; 870 return error;
@@ -1962,24 +2017,29 @@ static int path_lookupat(int dfd, const char *name,
1962 return err; 2017 return err;
1963} 2018}
1964 2019
1965static int do_path_lookup(int dfd, const char *name, 2020static int filename_lookup(int dfd, struct filename *name,
1966 unsigned int flags, struct nameidata *nd) 2021 unsigned int flags, struct nameidata *nd)
1967{ 2022{
1968 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); 2023 int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
1969 if (unlikely(retval == -ECHILD)) 2024 if (unlikely(retval == -ECHILD))
1970 retval = path_lookupat(dfd, name, flags, nd); 2025 retval = path_lookupat(dfd, name->name, flags, nd);
1971 if (unlikely(retval == -ESTALE)) 2026 if (unlikely(retval == -ESTALE))
1972 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); 2027 retval = path_lookupat(dfd, name->name,
2028 flags | LOOKUP_REVAL, nd);
1973 2029
1974 if (likely(!retval)) { 2030 if (likely(!retval))
1975 if (unlikely(!audit_dummy_context())) { 2031 audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
1976 if (nd->path.dentry && nd->inode)
1977 audit_inode(name, nd->path.dentry);
1978 }
1979 }
1980 return retval; 2032 return retval;
1981} 2033}
1982 2034
2035static int do_path_lookup(int dfd, const char *name,
2036 unsigned int flags, struct nameidata *nd)
2037{
2038 struct filename filename = { .name = name };
2039
2040 return filename_lookup(dfd, &filename, flags, nd);
2041}
2042
1983/* does lookup, returns the object with parent locked */ 2043/* does lookup, returns the object with parent locked */
1984struct dentry *kern_path_locked(const char *name, struct path *path) 2044struct dentry *kern_path_locked(const char *name, struct path *path)
1985{ 2045{
@@ -2097,13 +2157,13 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2097 struct path *path, int *empty) 2157 struct path *path, int *empty)
2098{ 2158{
2099 struct nameidata nd; 2159 struct nameidata nd;
2100 char *tmp = getname_flags(name, flags, empty); 2160 struct filename *tmp = getname_flags(name, flags, empty);
2101 int err = PTR_ERR(tmp); 2161 int err = PTR_ERR(tmp);
2102 if (!IS_ERR(tmp)) { 2162 if (!IS_ERR(tmp)) {
2103 2163
2104 BUG_ON(flags & LOOKUP_PARENT); 2164 BUG_ON(flags & LOOKUP_PARENT);
2105 2165
2106 err = do_path_lookup(dfd, tmp, flags, &nd); 2166 err = filename_lookup(dfd, tmp, flags, &nd);
2107 putname(tmp); 2167 putname(tmp);
2108 if (!err) 2168 if (!err)
2109 *path = nd.path; 2169 *path = nd.path;
@@ -2117,22 +2177,28 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
2117 return user_path_at_empty(dfd, name, flags, path, NULL); 2177 return user_path_at_empty(dfd, name, flags, path, NULL);
2118} 2178}
2119 2179
2120static int user_path_parent(int dfd, const char __user *path, 2180/*
2121 struct nameidata *nd, char **name) 2181 * NB: most callers don't do anything directly with the reference to the
2182 * to struct filename, but the nd->last pointer points into the name string
2183 * allocated by getname. So we must hold the reference to it until all
2184 * path-walking is complete.
2185 */
2186static struct filename *
2187user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
2122{ 2188{
2123 char *s = getname(path); 2189 struct filename *s = getname(path);
2124 int error; 2190 int error;
2125 2191
2126 if (IS_ERR(s)) 2192 if (IS_ERR(s))
2127 return PTR_ERR(s); 2193 return s;
2128 2194
2129 error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); 2195 error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
2130 if (error) 2196 if (error) {
2131 putname(s); 2197 putname(s);
-	else
-		*name = s;
+		return ERR_PTR(error);
+	}
 
-	return error;
+	return s;
 }
 
 /*
@@ -2179,7 +2245,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
-	audit_inode_child(victim, dir);
+	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
@@ -2624,7 +2690,7 @@ out_dput:
  */
 static int do_last(struct nameidata *nd, struct path *path,
 		   struct file *file, const struct open_flags *op,
-		   int *opened, const char *pathname)
+		   int *opened, struct filename *name)
 {
 	struct dentry *dir = nd->path.dentry;
 	int open_flag = op->open_flag;
@@ -2651,7 +2717,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 		error = complete_walk(nd);
 		if (error)
 			return error;
-		audit_inode(pathname, nd->path.dentry);
+		audit_inode(name, nd->path.dentry, 0);
 		if (open_flag & O_CREAT) {
 			error = -EISDIR;
 			goto out;
@@ -2661,7 +2727,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 		error = complete_walk(nd);
 		if (error)
 			return error;
-		audit_inode(pathname, dir);
+		audit_inode(name, dir, 0);
 		goto finish_open;
 	}
 
@@ -2690,7 +2756,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 	if (error)
 		return error;
 
-	audit_inode(pathname, dir);
+	audit_inode(name, dir, 0);
 	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
@@ -2720,7 +2786,7 @@ retry_lookup:
 		    !S_ISREG(file->f_path.dentry->d_inode->i_mode))
 			will_truncate = false;
 
-		audit_inode(pathname, file->f_path.dentry);
+		audit_inode(name, file->f_path.dentry, 0);
 		goto opened;
 	}
 
@@ -2737,7 +2803,7 @@ retry_lookup:
 	 * create/update audit record if it already exists.
 	 */
 	if (path->dentry->d_inode)
-		audit_inode(pathname, path->dentry);
+		audit_inode(name, path->dentry, 0);
 
 	/*
 	 * If atomic_open() acquired write access it is dropped now due to
@@ -2802,7 +2868,7 @@ finish_lookup:
 	error = -ENOTDIR;
 	if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
 		goto out;
-	audit_inode(pathname, nd->path.dentry);
+	audit_inode(name, nd->path.dentry, 0);
finish_open:
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
@@ -2870,7 +2936,7 @@ stale_open:
 	goto retry_lookup;
 }
 
-static struct file *path_openat(int dfd, const char *pathname,
+static struct file *path_openat(int dfd, struct filename *pathname,
 		struct nameidata *nd, const struct open_flags *op, int flags)
 {
 	struct file *base = NULL;
@@ -2885,12 +2951,12 @@ static struct file *path_openat(int dfd, const char *pathname,
 
 	file->f_flags = op->open_flag;
 
-	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
+	error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out;
 
 	current->total_link_count = 0;
-	error = link_path_walk(pathname, nd);
+	error = link_path_walk(pathname->name, nd);
 	if (unlikely(error))
 		goto out;
 
@@ -2936,7 +3002,7 @@ out:
 	return file;
 }
 
-struct file *do_filp_open(int dfd, const char *pathname,
+struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op, int flags)
 {
 	struct nameidata nd;
@@ -2955,6 +3021,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 {
 	struct nameidata nd;
 	struct file *file;
+	struct filename filename = { .name = name };
 
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
@@ -2964,11 +3031,11 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
-	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+	file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(file == ERR_PTR(-ECHILD)))
-		file = path_openat(-1, name, &nd, op, flags);
+		file = path_openat(-1, &filename, &nd, op, flags);
 	if (unlikely(file == ERR_PTR(-ESTALE)))
-		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+		file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
 	return file;
 }
 
@@ -3043,11 +3110,11 @@ EXPORT_SYMBOL(done_path_create);
 
 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
 {
-	char *tmp = getname(pathname);
+	struct filename *tmp = getname(pathname);
 	struct dentry *res;
 	if (IS_ERR(tmp))
 		return ERR_CAST(tmp);
-	res = kern_path_create(dfd, tmp, path, is_dir);
+	res = kern_path_create(dfd, tmp->name, path, is_dir);
 	putname(tmp);
 	return res;
 }
@@ -3252,13 +3319,13 @@ out:
 static long do_rmdir(int dfd, const char __user *pathname)
 {
 	int error = 0;
-	char * name;
+	struct filename *name;
 	struct dentry *dentry;
 	struct nameidata nd;
 
-	error = user_path_parent(dfd, pathname, &nd, &name);
-	if (error)
-		return error;
+	name = user_path_parent(dfd, pathname, &nd);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
 
 	switch(nd.last_type) {
 	case LAST_DOTDOT:
@@ -3347,14 +3414,14 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 static long do_unlinkat(int dfd, const char __user *pathname)
 {
 	int error;
-	char *name;
+	struct filename *name;
 	struct dentry *dentry;
 	struct nameidata nd;
 	struct inode *inode = NULL;
 
-	error = user_path_parent(dfd, pathname, &nd, &name);
-	if (error)
-		return error;
+	name = user_path_parent(dfd, pathname, &nd);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
 
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM)
@@ -3438,7 +3505,7 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 		int, newdfd, const char __user *, newname)
 {
 	int error;
-	char *from;
+	struct filename *from;
 	struct dentry *dentry;
 	struct path path;
 
@@ -3451,9 +3518,9 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 	if (IS_ERR(dentry))
 		goto out_putname;
 
-	error = security_path_symlink(&path, dentry, from);
+	error = security_path_symlink(&path, dentry, from->name);
 	if (!error)
-		error = vfs_symlink(path.dentry->d_inode, dentry, from);
+		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
 	done_path_create(&path, dentry);
out_putname:
 	putname(from);
@@ -3733,17 +3800,21 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	struct dentry *old_dentry, *new_dentry;
 	struct dentry *trap;
 	struct nameidata oldnd, newnd;
-	char *from;
-	char *to;
+	struct filename *from;
+	struct filename *to;
 	int error;
 
-	error = user_path_parent(olddfd, oldname, &oldnd, &from);
-	if (error)
+	from = user_path_parent(olddfd, oldname, &oldnd);
+	if (IS_ERR(from)) {
+		error = PTR_ERR(from);
 		goto exit;
+	}
 
-	error = user_path_parent(newdfd, newname, &newnd, &to);
-	if (error)
+	to = user_path_parent(newdfd, newname, &newnd);
+	if (IS_ERR(to)) {
+		error = PTR_ERR(to);
 		goto exit1;
+	}
 
 	error = -EXDEV;
 	if (oldnd.path.mnt != newnd.path.mnt)
@@ -3967,7 +4038,6 @@ EXPORT_SYMBOL(follow_down_one);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* nfsd */
-EXPORT_SYMBOL(getname);
 EXPORT_SYMBOL(lock_rename);
 EXPORT_SYMBOL(lookup_one_len);
 EXPORT_SYMBOL(page_follow_link_light);
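The fs/namei.c hunks above convert the lookup entry points from bare C strings to struct filename, so the audit subsystem can record the name the user actually passed. A minimal sketch of the resulting caller pattern follows; example_unlink_like() is a hypothetical function written for illustration, while user_path_parent(), audit_inode(), putname() and path_put() are the real interfaces as changed by this diff:

	/* Illustrative sketch only, not part of the patch: the post-patch
	 * calling convention.  user_path_parent() now hands back a
	 * struct filename * (or an ERR_PTR) instead of filling a char **. */
	static long example_unlink_like(int dfd, const char __user *pathname)
	{
		struct nameidata nd;
		struct filename *name;

		name = user_path_parent(dfd, pathname, &nd);
		if (IS_ERR(name))
			return PTR_ERR(name);

		/* audit helpers take the struct itself; ->name is the
		 * kernel copy of the string for legacy interfaces */
		audit_inode(name, nd.path.dentry, 0);

		path_put(&nd.path);
		putname(name);
		return 0;
	}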
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bdf7907413f..24960626bb6b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_change_type(struct path *path, int flag)
1640/* 1640/*
1641 * do loopback mount. 1641 * do loopback mount.
1642 */ 1642 */
1643static int do_loopback(struct path *path, char *old_name, 1643static int do_loopback(struct path *path, const char *old_name,
1644 int recurse) 1644 int recurse)
1645{ 1645{
1646 LIST_HEAD(umount_list); 1646 LIST_HEAD(umount_list);
@@ -1764,7 +1764,7 @@ static inline int tree_contains_unbindable(struct mount *mnt)
1764 return 0; 1764 return 0;
1765} 1765}
1766 1766
1767static int do_move_mount(struct path *path, char *old_name) 1767static int do_move_mount(struct path *path, const char *old_name)
1768{ 1768{
1769 struct path old_path, parent_path; 1769 struct path old_path, parent_path;
1770 struct mount *p; 1770 struct mount *p;
@@ -1917,8 +1917,8 @@ unlock:
1917 * create a new mount for userspace and request it to be added into the 1917 * create a new mount for userspace and request it to be added into the
1918 * namespace's tree 1918 * namespace's tree
1919 */ 1919 */
1920static int do_new_mount(struct path *path, char *type, int flags, 1920static int do_new_mount(struct path *path, const char *type, int flags,
1921 int mnt_flags, char *name, void *data) 1921 int mnt_flags, const char *name, void *data)
1922{ 1922{
1923 struct vfsmount *mnt; 1923 struct vfsmount *mnt;
1924 int err; 1924 int err;
@@ -2191,8 +2191,8 @@ int copy_mount_string(const void __user *data, char **where)
2191 * Therefore, if this magic number is present, it carries no information 2191 * Therefore, if this magic number is present, it carries no information
2192 * and must be discarded. 2192 * and must be discarded.
2193 */ 2193 */
2194long do_mount(char *dev_name, char *dir_name, char *type_page, 2194long do_mount(const char *dev_name, const char *dir_name,
2195 unsigned long flags, void *data_page) 2195 const char *type_page, unsigned long flags, void *data_page)
2196{ 2196{
2197 struct path path; 2197 struct path path;
2198 int retval = 0; 2198 int retval = 0;
@@ -2408,7 +2408,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2408{ 2408{
2409 int ret; 2409 int ret;
2410 char *kernel_type; 2410 char *kernel_type;
2411 char *kernel_dir; 2411 struct filename *kernel_dir;
2412 char *kernel_dev; 2412 char *kernel_dev;
2413 unsigned long data_page; 2413 unsigned long data_page;
2414 2414
@@ -2430,7 +2430,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2430 if (ret < 0) 2430 if (ret < 0)
2431 goto out_data; 2431 goto out_data;
2432 2432
2433 ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, 2433 ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
2434 (void *) data_page); 2434 (void *) data_page);
2435 2435
2436 free_page(data_page); 2436 free_page(data_page);
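do_mount() and its helpers are const-qualified above, which matters for in-kernel callers that pass string literals. A hypothetical caller, for illustration only (the MS_* flags are the ordinary mount constants):

	/* Hypothetical example: with the old char * signature this literal
	 * would have required a cast or a writable copy. */
	static int example_remount_ro(void)
	{
		return do_mount(NULL, "/mnt", NULL,
				MS_REMOUNT | MS_RDONLY, NULL);
	}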
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index db7ad719628a..13ca196385f5 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,8 @@ config NFS_SWAP
 	  This option enables swapon to work on files located on NFS mounts.
 
 config NFS_V4_1
-	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-	depends on NFS_V4 && EXPERIMENTAL
+	bool "NFS client support for NFSv4.1"
+	depends on NFS_V4
 	select SUNRPC_BACKCHANNEL
 	help
 	  This option enables support for minor version 1 of the NFSv4 protocol
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index dd392ed5f2e2..f1027b06a1a9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -37,6 +37,7 @@
 #include <linux/bio.h>		/* struct bio */
 #include <linux/buffer_head.h>	/* various write calls */
 #include <linux/prefetch.h>
+#include <linux/pagevec.h>
 
 #include "../pnfs.h"
 #include "../internal.h"
@@ -162,25 +163,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
 				     sector_t isect, struct page *page,
 				     struct pnfs_block_extent *be,
 				     void (*end_io)(struct bio *, int err),
-				     struct parallel_io *par)
+				     struct parallel_io *par,
+				     unsigned int offset, int len)
 {
+	isect = isect + (offset >> SECTOR_SHIFT);
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, len);
retry:
 	if (!bio) {
 		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+	if (bio_add_page(bio, page, len, offset) < len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
 	return bio;
 }
 
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+				      sector_t isect, struct page *page,
+				      struct pnfs_block_extent *be,
+				      void (*end_io)(struct bio *, int err),
+				      struct parallel_io *par)
+{
+	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+				  end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
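do_add_page_to_bio() folds the intra-page byte offset into the starting sector before queuing a page fragment. A standalone check of that arithmetic, as plain userspace C, assuming 512-byte sectors (SECTOR_SHIFT == 9) and a 4 KiB page:

	#include <assert.h>

	int main(void)
	{
		unsigned long long isect = 8;	/* first sector of this page */
		unsigned int offset = 1536;	/* byte offset within the page */
		unsigned int len = 1024;	/* fragment length */

		/* do_add_page_to_bio(): isect = isect + (offset >> SECTOR_SHIFT) */
		isect += offset >> 9;
		assert(isect == 11);	/* 1536 bytes == 3 sectors past the page start */

		/* bio_add_page() is then asked for exactly (len, offset),
		 * where the old bl_add_page_to_bio() always sent whole pages */
		assert(len <= 4096 - offset);
		return 0;
	}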
@@ -228,14 +243,6 @@ bl_end_par_io_read(void *data, int unused)
 	schedule_work(&rdata->task.u.tk_work);
 }
 
-static bool
-bl_check_alignment(u64 offset, u32 len, unsigned long blkmask)
-{
-	if ((offset & blkmask) || (len & blkmask))
-		return false;
-	return true;
-}
-
 static enum pnfs_try_status
 bl_read_pagelist(struct nfs_read_data *rdata)
 {
@@ -246,15 +253,15 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
 	loff_t f_offset = rdata->args.offset;
+	size_t bytes_left = rdata->args.count;
+	unsigned int pg_offset, pg_len;
 	struct page **pages = rdata->args.pages;
 	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+	const bool is_dio = (header->dreq != NULL);
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
 	       rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
 
-	if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK))
-		goto use_mds;
-
 	par = alloc_parallel(rdata);
 	if (!par)
 		goto use_mds;
@@ -284,36 +291,53 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 				extent_length = min(extent_length, cow_length);
 			}
 		}
+
+		if (is_dio) {
+			pg_offset = f_offset & ~PAGE_CACHE_MASK;
+			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
+				pg_len = PAGE_CACHE_SIZE - pg_offset;
+			else
+				pg_len = bytes_left;
+
+			f_offset += pg_len;
+			bytes_left -= pg_len;
+			isect += (pg_offset >> SECTOR_SHIFT);
+		} else {
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		}
+
 		hole = is_hole(be, isect);
 		if (hole && !cow_read) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
-			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+			zero_user_segment(pages[i], pg_offset, pg_len);
 			print_page(pages[i]);
 			SetPageUptodate(pages[i]);
 		} else {
 			struct pnfs_block_extent *be_read;
 
 			be_read = (hole && cow_read) ? cow_read : be;
-			bio = bl_add_page_to_bio(bio, rdata->pages.npages - i,
+			bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
 						 READ,
 						 isect, pages[i], be_read,
-						 bl_end_io_read, par);
+						 bl_end_io_read, par,
+						 pg_offset, pg_len);
 			if (IS_ERR(bio)) {
 				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
 				goto out;
 			}
 		}
-		isect += PAGE_CACHE_SECTORS;
+		isect += (pg_len >> SECTOR_SHIFT);
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		rdata->res.eof = 1;
-		rdata->res.count = header->inode->i_size - f_offset;
+		rdata->res.count = header->inode->i_size - rdata->args.offset;
 	} else {
-		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+		rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
 	}
out:
 	bl_put_extent(be);
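For direct I/O, bl_read_pagelist() above carves the request out of (f_offset, bytes_left) one page fragment per loop iteration. The same segmenting logic in isolation, as userspace C with PAGE_SIZE standing in for PAGE_CACHE_SIZE:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		unsigned long long f_offset = 1536;	/* unaligned DIO read */
		unsigned long bytes_left = 6000;

		while (bytes_left) {
			unsigned int pg_offset = f_offset & (PAGE_SIZE - 1);
			unsigned int pg_len;

			if (pg_offset + bytes_left > PAGE_SIZE)
				pg_len = (unsigned int)(PAGE_SIZE - pg_offset);
			else
				pg_len = (unsigned int)bytes_left;

			/* prints (1536, 2560) then (0, 3440): a partial first
			 * page followed by the unaligned tail */
			printf("fragment: pg_offset=%u pg_len=%u\n",
			       pg_offset, pg_len);
			f_offset += pg_len;
			bytes_left -= pg_len;
		}
		return 0;
	}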
@@ -461,6 +485,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
 	return;
 }
 
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+
+	/* Only one page in bvec */
+	unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+		    unsigned int offset, unsigned int len)
+{
+	struct bio *bio;
+	struct page *shadow_page;
+	sector_t isect;
+	char *kaddr, *kshadow_addr;
+	int ret = 0;
+
+	dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (shadow_page == NULL)
+		return -ENOMEM;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+		(offset / SECTOR_SIZE);
+
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = bl_read_single_end_io;
+
+	lock_page(shadow_page);
+	if (bio_add_page(bio, shadow_page,
+			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+		unlock_page(shadow_page);
+		bio_put(bio);
+		return -EIO;
+	}
+
+	submit_bio(READ, bio);
+	wait_on_page_locked(shadow_page);
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+		ret = -EIO;
+	} else {
+		kaddr = kmap_atomic(page);
+		kshadow_addr = kmap_atomic(shadow_page);
+		memcpy(kaddr + offset, kshadow_addr + offset, len);
+		kunmap_atomic(kshadow_addr);
+		kunmap_atomic(kaddr);
+	}
+	__free_page(shadow_page);
+	bio_put(bio);
+
+	return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+			  unsigned int dirty_offset, unsigned int dirty_len,
+			  bool full_page)
+{
+	int ret = 0;
+	unsigned int start, end;
+
+	if (full_page) {
+		start = 0;
+		end = PAGE_CACHE_SIZE;
+	} else {
+		start = round_down(dirty_offset, SECTOR_SIZE);
+		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+	}
+
+	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+	if (!be) {
+		zero_user_segments(page, start, dirty_offset,
+				   dirty_offset + dirty_len, end);
+		if (start == 0 && end == PAGE_CACHE_SIZE &&
+		    trylock_page(page)) {
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		return ret;
+	}
+
+	if (start != dirty_offset)
+		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+	if (!ret && (dirty_offset + dirty_len < end))
+		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+					  end - dirty_offset - dirty_len);
+
+	return ret;
+}
+
 /* Given an unmapped page, zero it or read in page for COW, page is locked
  * by caller.
  */
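bl_read_partial_page_sync() widens the dirty byte range to sector boundaries and reads back only the fragments the write will not cover. The rounding, worked through in userspace C under the same SECTOR_SIZE == 512 assumption:

	#include <stdio.h>

	#define SECTOR_SIZE 512u
	#define ROUND_DOWN(x, a) ((x) / (a) * (a))
	#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))

	int main(void)
	{
		unsigned int dirty_offset = 100, dirty_len = 600;
		unsigned int start = ROUND_DOWN(dirty_offset, SECTOR_SIZE);
		unsigned int end = ROUND_UP(dirty_offset + dirty_len, SECTOR_SIZE);

		/* only the uncovered head and tail hit the disk */
		printf("head read: [%u, %u)\n", start, dirty_offset);		/* [0, 100) */
		printf("tail read: [%u, %u)\n", dirty_offset + dirty_len, end);	/* [700, 1024) */
		return 0;
	}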
@@ -494,7 +618,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
 	SetPageUptodate(page);
 
cleanup:
-	bl_put_extent(cow_read);
 	if (bh)
 		free_buffer_head(bh);
 	if (ret) {
@@ -566,6 +689,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	struct parallel_io *par = NULL;
 	loff_t offset = wdata->args.offset;
 	size_t count = wdata->args.count;
+	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = wdata->args.pages;
 	struct page *page;
 	pgoff_t index;
@@ -574,10 +698,13 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 		NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
-	/* Check for alignment first */
-	if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK))
-		goto out_mds;
 
+	if (header->dreq != NULL &&
+	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
+	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
+		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
+		goto out_mds;
+	}
 	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
@@ -674,10 +801,11 @@ next_page:
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
+			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					isect, NULL);
+					isect, &cow_read);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -694,7 +822,26 @@ next_page:
 			extent_length = be->be_length -
 				(isect - be->be_f_offset);
 		}
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+		pg_offset = offset & ~PAGE_CACHE_MASK;
+		if (pg_offset + count > PAGE_CACHE_SIZE)
+			pg_len = PAGE_CACHE_SIZE - pg_offset;
+		else
+			pg_len = count;
+
+		saved_len = pg_len;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+		    !bl_is_sector_init(be->be_inval, isect)) {
+			ret = bl_read_partial_page_sync(pages[i], cow_read,
+							pg_offset, pg_len, true);
+			if (ret) {
+				dprintk("%s bl_read_partial_page_sync fail %d\n",
+					__func__, ret);
+				header->pnfs_error = ret;
+				goto out;
+			}
+
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -703,15 +850,35 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
+
+			/* Expand to full page write */
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		} else if ((pg_offset & (SECTOR_SIZE - 1)) ||
+			    (pg_len & (SECTOR_SIZE - 1))) {
+			/* ahh, nasty case. We have to do sync full sector
+			 * read-modify-write cycles.
+			 */
+			unsigned int saved_offset = pg_offset;
+			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+							pg_len, false);
+			pg_offset = round_down(pg_offset, SECTOR_SIZE);
+			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+				 - pg_offset;
 		}
-		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
-					 bl_end_io_write, par);
+					 bl_end_io_write, par,
+					 pg_offset, pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
+		offset += saved_len;
+		count -= saved_len;
 		isect += PAGE_CACHE_SECTORS;
 		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
@@ -729,17 +896,16 @@ next_page:
 	}
 
write_done:
-	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
-	if (count < wdata->res.count) {
-		wdata->res.count = count;
-	}
+	wdata->res.count = wdata->args.count;
out:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
out_mds:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
@@ -874,7 +1040,7 @@ static void free_blk_mountid(struct block_mount_id *mid)
 	}
 }
 
-/* This is mostly copied from the filelayout's get_device_info function.
+/* This is mostly copied from the filelayout_get_device_info function.
  * It seems much of this should be at the generic pnfs level.
  */
 static struct pnfs_block_dev *
@@ -1011,33 +1177,95 @@ bl_clear_layoutdriver(struct nfs_server *server)
 	return 0;
 }
 
+static bool
+is_aligned_req(struct nfs_page *req, unsigned int alignment)
+{
+	return IS_ALIGNED(req->wb_offset, alignment) &&
+	       IS_ALIGNED(req->wb_bytes, alignment);
+}
+
 static void
 bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, SECTOR_SIZE))
 		nfs_pageio_reset_read_mds(pgio);
 	else
 		pnfs_generic_pg_init_read(pgio, req);
 }
 
+static bool
+bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		struct nfs_page *req)
+{
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, SECTOR_SIZE))
+		return false;
+
+	return pnfs_generic_pg_test(pgio, prev, req);
+}
+
+/*
+ * Return the number of contiguous bytes for a given inode
+ * starting at page frame idx.
+ */
+static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t end;
+
+	/* Optimize common case that writes from 0 to end of file */
+	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+	if (end != NFS_I(inode)->npages) {
+		rcu_read_lock();
+		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+		rcu_read_unlock();
+	}
+
+	if (!end)
+		return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
+	else
+		return (end - idx) << PAGE_CACHE_SHIFT;
+}
+
 static void
 bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, PAGE_CACHE_SIZE)) {
 		nfs_pageio_reset_write_mds(pgio);
-	else
-		pnfs_generic_pg_init_write(pgio, req);
+	} else {
+		u64 wb_size;
+		if (pgio->pg_dreq == NULL)
+			wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+						      req->wb_index);
+		else
+			wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+		pnfs_generic_pg_init_write(pgio, req, wb_size);
+	}
+}
+
+static bool
+bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		 struct nfs_page *req)
+{
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, PAGE_CACHE_SIZE))
+		return false;
+
+	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
 static const struct nfs_pageio_ops bl_pg_read_ops = {
 	.pg_init = bl_pg_init_read,
-	.pg_test = pnfs_generic_pg_test,
+	.pg_test = bl_pg_test_read,
 	.pg_doio = pnfs_generic_pg_readpages,
 };
 
 static const struct nfs_pageio_ops bl_pg_write_ops = {
 	.pg_init = bl_pg_init_write,
-	.pg_test = pnfs_generic_pg_test,
+	.pg_test = bl_pg_test_write,
 	.pg_doio = pnfs_generic_pg_writepages,
 };
 
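The new pg_init/pg_test hooks only veto pNFS for direct I/O (pg_dreq != NULL): DIO reads need sector alignment, DIO writes whole pages, and an unaligned request falls back to the MDS. The predicate in isolation, with the kernel's IS_ALIGNED semantics reproduced for illustration:

	#include <stdio.h>

	#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)
	#define SECTOR_SIZE 512u
	#define PAGE_SIZE 4096u

	static int is_aligned_req(unsigned int offset, unsigned int bytes,
				  unsigned int alignment)
	{
		return IS_ALIGNED(offset, alignment) && IS_ALIGNED(bytes, alignment);
	}

	int main(void)
	{
		/* sector-aligned DIO request: fine for the read path ... */
		printf("read ok:  %d\n", is_aligned_req(512, 1024, SECTOR_SIZE));	/* 1 */
		/* ... but rejected by the write path, which wants full pages */
		printf("write ok: %d\n", is_aligned_req(512, 1024, PAGE_SIZE));		/* 0 */
		return 0;
	}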
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 03350690118e..f4891bde8851 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -41,6 +41,7 @@
 
 #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
 struct block_mount_id {
 	spinlock_t			bm_lock;  /* protects list */
@@ -172,7 +173,6 @@ struct bl_msg_hdr {
 /* blocklayoutdev.c */
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-struct block_device *nfs4_blkdev_get(dev_t dev);
 int nfs4_blkdev_put(struct block_device *bdev);
 struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
 						struct pnfs_device *dev);
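With the usual SECTOR_SHIFT == 9 and PAGE_CACHE_SHIFT == 12 (values assumed here for illustration), the three macros are mutually consistent; a compile-time check in C11:

	#define SECTOR_SHIFT 9
	#define PAGE_CACHE_SHIFT 12
	#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)

	#define SECTOR_SIZE (1 << SECTOR_SHIFT)				/* 512 */
	#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)	/* 8 */
	#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) /* 3 */

	_Static_assert(SECTOR_SIZE * PAGE_CACHE_SECTORS == PAGE_CACHE_SIZE,
		       "sectors must tile a page exactly");
	_Static_assert((1 << PAGE_CACHE_SECTOR_SHIFT) == PAGE_CACHE_SECTORS,
		       "shift and count must agree");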
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index c96554245ccf..a86c5bdad9e3 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -53,22 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
 	return 0;
 }
 
-/* Open a block_device by device number. */
-struct block_device *nfs4_blkdev_get(dev_t dev)
-{
-	struct block_device *bd;
-
-	dprintk("%s enter\n", __func__);
-	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
-	if (IS_ERR(bd))
-		goto fail;
-	return bd;
-fail:
-	dprintk("%s failed to open device : %ld\n",
-			__func__, PTR_ERR(bd));
-	return NULL;
-}
-
 /*
  * Release the block device
  */
@@ -172,11 +156,12 @@ nfs4_blk_decode_device(struct nfs_server *server,
 		goto out;
 	}
 
-	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
+	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
+			       FMODE_READ, NULL);
 	if (IS_ERR(bd)) {
-		rc = PTR_ERR(bd);
-		dprintk("%s failed to open device : %d\n", __func__, rc);
-		rv = ERR_PTR(rc);
+		dprintk("%s failed to open device : %ld\n", __func__,
+			PTR_ERR(bd));
+		rv = ERR_CAST(bd);
 		goto out;
 	}
 
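The deleted wrapper could only return NULL, throwing away the errno that blkdev_get_by_dev() reports; calling the block-layer API directly lets the error pointer flow through. The post-patch idiom, sketched (the surrounding context is illustrative; blkdev_get_by_dev() and ERR_CAST() are the real interfaces):

	struct block_device *bd;

	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
	if (IS_ERR(bd))
		return ERR_CAST(bd);	/* forward the errno-carrying pointer */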
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 1f9a6032796b..9c3e117c3ed1 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -683,8 +683,7 @@ encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
 		p = xdr_encode_hyper(p, 0LL);
 		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-		list_del(&lce->bse_node);
-		list_add_tail(&lce->bse_node, &bl->bl_committing);
+		list_move_tail(&lce->bse_node, &bl->bl_committing);
 		bl->bl_count--;
 		count++;
 	}
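list_move_tail() from include/linux/list.h is the single-primitive form of the deleted pair; the end state is identical:

	/* before: two list operations */
	list_del(&lce->bse_node);
	list_add_tail(&lce->bse_node, &bl->bl_committing);

	/* after: one, with no intermediate off-both-lists state */
	list_move_tail(&lce->bse_node, &bl->bl_committing);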
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 4c8459e5bdee..9a521fb39869 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -12,6 +12,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
+#include <linux/errno.h>
 #include <linux/mutex.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
@@ -23,6 +24,7 @@
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "internal.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
@@ -37,7 +39,32 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
 static DEFINE_MUTEX(nfs_callback_mutex);
 static struct svc_program nfs4_callback_program;
 
-unsigned short nfs_callback_tcpport6;
+static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	int ret;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	ret = svc_create_xprt(serv, "tcp", net, PF_INET,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret <= 0)
+		goto out_err;
+	nn->nfs_callback_tcpport = ret;
+	dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
+			nn->nfs_callback_tcpport, PF_INET, net);
+
+	ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret > 0) {
+		nn->nfs_callback_tcpport6 = ret;
+		dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
+				nn->nfs_callback_tcpport6, PF_INET6, net);
+	} else if (ret != -EAFNOSUPPORT)
+		goto out_err;
+	return 0;
+
+out_err:
+	return (ret) ? ret : -ENOMEM;
+}
 
 /*
  * This is the NFSv4 callback kernel thread.
@@ -45,7 +72,7 @@ unsigned short nfs_callback_tcpport6;
 static int
 nfs4_callback_svc(void *vrqstp)
 {
-	int err, preverr = 0;
+	int err;
 	struct svc_rqst *rqstp = vrqstp;
 
 	set_freezable();
@@ -55,20 +82,8 @@ nfs4_callback_svc(void *vrqstp)
 		 * Listen for a request on the socket
 		 */
 		err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
-		if (err == -EAGAIN || err == -EINTR) {
-			preverr = err;
-			continue;
-		}
-		if (err < 0) {
-			if (err != preverr) {
-				printk(KERN_WARNING "NFS: %s: unexpected error "
-					"from svc_recv (%d)\n", __func__, err);
-				preverr = err;
-			}
-			schedule_timeout_uninterruptible(HZ);
+		if (err == -EAGAIN || err == -EINTR)
 			continue;
-		}
-		preverr = err;
 		svc_process(rqstp);
 	}
 	return 0;
@@ -78,38 +93,23 @@ nfs4_callback_svc(void *vrqstp)
  * Prepare to bring up the NFSv4 callback service
  */
 static struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
+nfs4_callback_up(struct svc_serv *serv)
 {
-	int ret;
-
-	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
-				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
-	if (ret <= 0)
-		goto out_err;
-	nfs_callback_tcpport = ret;
-	dprintk("NFS: Callback listener port = %u (af %u)\n",
-			nfs_callback_tcpport, PF_INET);
-
-	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
-				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
-	if (ret > 0) {
-		nfs_callback_tcpport6 = ret;
-		dprintk("NFS: Callback listener port = %u (af %u)\n",
-				nfs_callback_tcpport6, PF_INET6);
-	} else if (ret == -EAFNOSUPPORT)
-		ret = 0;
-	else
-		goto out_err;
-
 	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-
-out_err:
-	if (ret == 0)
-		ret = -ENOMEM;
-	return ERR_PTR(ret);
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	/*
+	 * Create an svc_sock for the back channel service that shares the
+	 * fore channel connection.
+	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
+	 */
+	return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
+			      SVC_SOCK_ANONYMOUS);
+}
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -149,28 +149,9 @@ nfs41_callback_svc(void *vrqstp)
  * Bring up the NFSv4.1 callback service
  */
 static struct svc_rqst *
-nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
+nfs41_callback_up(struct svc_serv *serv)
 {
 	struct svc_rqst *rqstp;
-	int ret;
-
-	/*
-	 * Create an svc_sock for the back channel service that shares the
-	 * fore channel connection.
-	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
-	 */
-	ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
-			      SVC_SOCK_ANONYMOUS);
-	if (ret < 0) {
-		rqstp = ERR_PTR(ret);
-		goto out;
-	}
-
-	/*
-	 * Save the svc_serv in the transport so that it can
-	 * be referenced when the session backchannel is initialized
-	 */
-	xprt->bc_serv = serv;
 
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
@@ -180,90 +161,74 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
 	}
-out:
 	dprintk("--> %s return %ld\n", __func__,
 		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
 	return rqstp;
 }
 
-static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
-		struct svc_serv *serv, struct rpc_xprt *xprt,
+static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
 		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
 {
-	if (minorversion) {
-		*rqstpp = nfs41_callback_up(serv, xprt);
-		*callback_svc = nfs41_callback_svc;
-	}
-	return minorversion;
+	*rqstpp = nfs41_callback_up(serv);
+	*callback_svc = nfs41_callback_svc;
 }
 
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
-		struct nfs_callback_data *cb_info)
+		struct svc_serv *serv)
 {
 	if (minorversion)
-		xprt->bc_serv = cb_info->serv;
+		/*
+		 * Save the svc_serv in the transport so that it can
+		 * be referenced when the session backchannel is initialized
+		 */
+		xprt->bc_serv = serv;
 }
 #else
-static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
-		struct svc_serv *serv, struct rpc_xprt *xprt,
-		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
 {
 	return 0;
 }
 
+static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
+		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+	*rqstpp = ERR_PTR(-ENOTSUPP);
+	*callback_svc = ERR_PTR(-ENOTSUPP);
+}
+
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
-		struct nfs_callback_data *cb_info)
+		struct svc_serv *serv)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-/*
- * Bring up the callback thread if it is not already up.
- */
-int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
+			  struct svc_serv *serv)
 {
-	struct svc_serv *serv = NULL;
 	struct svc_rqst *rqstp;
 	int (*callback_svc)(void *vrqstp);
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 	char svc_name[12];
-	int ret = 0;
-	int minorversion_setup;
-	struct net *net = &init_net;
+	int ret;
 
-	mutex_lock(&nfs_callback_mutex);
-	if (cb_info->users++ || cb_info->task != NULL) {
-		nfs_callback_bc_serv(minorversion, xprt, cb_info);
-		goto out;
-	}
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
-	if (!serv) {
-		ret = -ENOMEM;
-		goto out_err;
-	}
-	/* As there is only one thread we need to over-ride the
-	 * default maximum of 80 connections
-	 */
-	serv->sv_maxconn = 1024;
+	nfs_callback_bc_serv(minorversion, xprt, serv);
 
-	ret = svc_bind(serv, net);
-	if (ret < 0) {
-		printk(KERN_WARNING "NFS: bind callback service failed\n");
-		goto out_err;
-	}
+	if (cb_info->task)
+		return 0;
 
-	minorversion_setup =  nfs_minorversion_callback_svc_setup(minorversion,
-					serv, xprt, &rqstp, &callback_svc);
-	if (!minorversion_setup) {
+	switch (minorversion) {
+	case 0:
 		/* v4.0 callback setup */
-		rqstp = nfs4_callback_up(serv, xprt);
+		rqstp = nfs4_callback_up(serv);
 		callback_svc = nfs4_callback_svc;
+		break;
+	default:
+		nfs_minorversion_callback_svc_setup(serv,
+				&rqstp, &callback_svc);
 	}
 
-	if (IS_ERR(rqstp)) {
-		ret = PTR_ERR(rqstp);
-		goto out_err;
-	}
+	if (IS_ERR(rqstp))
+		return PTR_ERR(rqstp);
 
 	svc_sock_update_bufs(serv);
 
@@ -276,41 +241,165 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 		svc_exit_thread(cb_info->rqst);
 		cb_info->rqst = NULL;
 		cb_info->task = NULL;
-		goto out_err;
+		return PTR_ERR(cb_info->task);
+	}
+	dprintk("nfs_callback_up: service started\n");
+	return 0;
+}
+
+static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	if (--nn->cb_users[minorversion])
+		return;
+
+	dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+	svc_shutdown_net(serv, net);
+}
+
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	int ret;
+
+	if (nn->cb_users[minorversion]++)
+		return 0;
+
+	dprintk("NFS: create per-net callback data; net=%p\n", net);
+
+	ret = svc_bind(serv, net);
+	if (ret < 0) {
+		printk(KERN_WARNING "NFS: bind callback service failed\n");
+		goto err_bind;
+	}
+
+	switch (minorversion) {
+	case 0:
+		ret = nfs4_callback_up_net(serv, net);
+		break;
+	case 1:
+		ret = nfs41_callback_up_net(serv, net);
+		break;
+	default:
+		printk(KERN_ERR "NFS: unknown callback version: %d\n",
+				minorversion);
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret < 0) {
+		printk(KERN_ERR "NFS: callback service start failed\n");
+		goto err_socks;
+	}
+	return 0;
+
+err_socks:
+	svc_rpcb_cleanup(serv, net);
+err_bind:
+	dprintk("NFS: Couldn't create callback socket: err = %d; "
+			"net = %p\n", ret, net);
+	return ret;
+}
+
+static struct svc_serv *nfs_callback_create_svc(int minorversion)
+{
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct svc_serv *serv;
+
+	/*
+	 * Check whether we're already up and running.
+	 */
+	if (cb_info->task) {
+		/*
+		 * Note: increase service usage, because later in case of error
+		 * svc_destroy() will be called.
+		 */
+		svc_get(cb_info->serv);
+		return cb_info->serv;
+	}
+
+	/*
+	 * Sanity check: if there's no task,
+	 * we should be the first user ...
+	 */
+	if (cb_info->users)
+		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
+			cb_info->users);
+
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+	if (!serv) {
+		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	/* As there is only one thread we need to over-ride the
+	 * default maximum of 80 connections
+	 */
+	serv->sv_maxconn = 1024;
+	dprintk("nfs_callback_create_svc: service created\n");
+	return serv;
+}
+
+/*
+ * Bring up the callback thread if it is not already up.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+	struct svc_serv *serv;
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	int ret;
+	struct net *net = xprt->xprt_net;
+
+	mutex_lock(&nfs_callback_mutex);
+
+	serv = nfs_callback_create_svc(minorversion);
+	if (IS_ERR(serv)) {
+		ret = PTR_ERR(serv);
+		goto err_create;
 	}
-out:
+
+	ret = nfs_callback_up_net(minorversion, serv, net);
+	if (ret < 0)
+		goto err_net;
+
+	ret = nfs_callback_start_svc(minorversion, xprt, serv);
+	if (ret < 0)
+		goto err_start;
+
+	cb_info->users++;
 	/*
 	 * svc_create creates the svc_serv with sv_nrthreads == 1, and then
 	 * svc_prepare_thread increments that. So we need to call svc_destroy
 	 * on both success and failure so that the refcount is 1 when the
 	 * thread exits.
 	 */
-	if (serv)
-		svc_destroy(serv);
+err_net:
+	svc_destroy(serv);
+err_create:
 	mutex_unlock(&nfs_callback_mutex);
 	return ret;
-out_err:
-	dprintk("NFS: Couldn't create callback socket or server thread; "
-		"err = %d\n", ret);
-	cb_info->users--;
-	if (serv)
-		svc_shutdown_net(serv, net);
-	goto out;
+
+err_start:
+	nfs_callback_down_net(minorversion, serv, net);
+	dprintk("NFS: Couldn't create server thread; err = %d\n", ret);
+	goto err_net;
 }
 
 /*
  * Kill the callback thread if it's no longer being used.
  */
-void nfs_callback_down(int minorversion)
+void nfs_callback_down(int minorversion, struct net *net)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 
 	mutex_lock(&nfs_callback_mutex);
+	nfs_callback_down_net(minorversion, cb_info->serv, net);
 	cb_info->users--;
 	if (cb_info->users == 0 && cb_info->task != NULL) {
 		kthread_stop(cb_info->task);
-		svc_shutdown_net(cb_info->serv, &init_net);
+		dprintk("nfs_callback_down: service stopped\n");
 		svc_exit_thread(cb_info->rqst);
+		dprintk("nfs_callback_down: service destroyed\n");
 		cb_info->serv = NULL;
 		cb_info->rqst = NULL;
 		cb_info->task = NULL;
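After this rework the callback service is reference-counted twice: cb_info->users guards the kthread, and the per-net nn->cb_users[] guards the sockets, so bring-up and tear-down must name the same net namespace. A sketch of the pairing from a caller's perspective (illustrative use of the real entry points):

	/* first caller on a given net binds the service and creates the
	 * listeners; later callers just bump nn->cb_users[minorversion] */
	error = nfs_callback_up(minorversion, xprt);
	if (error < 0)
		return error;

	/* ... */

	/* tear-down names the transport's own net, not init_net */
	nfs_callback_down(minorversion, xprt->xprt_net);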
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b44d7b128b71..4251c2ae06ad 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -194,7 +194,7 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 					struct cb_process_state *cps);
 #if IS_ENABLED(CONFIG_NFS_V4)
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
-extern void nfs_callback_down(int minorversion);
+extern void nfs_callback_down(int minorversion, struct net *net);
 extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
 					    const nfs4_stateid *stateid);
 extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
@@ -209,6 +209,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
-extern unsigned short nfs_callback_tcpport6;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 1b5d809a105e..76b4a7a3e559 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -122,7 +122,15 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 			ino = igrab(lo->plh_inode);
 			if (!ino)
 				continue;
-			get_layout_hdr(lo);
+			spin_lock(&ino->i_lock);
+			/* Is this layout in the process of being freed? */
+			if (NFS_I(ino)->layout != lo) {
+				spin_unlock(&ino->i_lock);
+				iput(ino);
+				continue;
+			}
+			pnfs_get_layout_hdr(lo);
+			spin_unlock(&ino->i_lock);
 			return lo;
 		}
 	}
@@ -158,7 +166,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	ino = lo->plh_inode;
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-	    mark_matching_lsegs_invalid(lo, &free_me_list,
+	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
 					&args->cbl_range))
 		rv = NFS4ERR_DELAY;
 	else
@@ -166,7 +174,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
-	put_layout_hdr(lo);
+	pnfs_put_layout_hdr(lo);
 	iput(ino);
 	return rv;
 }
@@ -196,9 +204,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 			continue;
 
 		list_for_each_entry(lo, &server->layouts, plh_layouts) {
-			if (!igrab(lo->plh_inode))
+			ino = igrab(lo->plh_inode);
+			if (!ino)
 				continue;
-			get_layout_hdr(lo);
+			spin_lock(&ino->i_lock);
+			/* Is this layout in the process of being freed? */
+			if (NFS_I(ino)->layout != lo) {
+				spin_unlock(&ino->i_lock);
+				iput(ino);
+				continue;
+			}
+			pnfs_get_layout_hdr(lo);
+			spin_unlock(&ino->i_lock);
 			BUG_ON(!list_empty(&lo->plh_bulk_recall));
 			list_add(&lo->plh_bulk_recall, &recall_list);
 		}
@@ -211,12 +228,12 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 		ino = lo->plh_inode;
 		spin_lock(&ino->i_lock);
 		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-		if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
+		if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range))
 			rv = NFS4ERR_DELAY;
 		list_del_init(&lo->plh_bulk_recall);
 		spin_unlock(&ino->i_lock);
 		pnfs_free_lseg_list(&free_me_list);
-		put_layout_hdr(lo);
+		pnfs_put_layout_hdr(lo);
 		iput(ino);
 	}
 	return rv;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99694442b93f..8b39a42ac35e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -93,10 +93,10 @@ static struct nfs_subversion *find_nfs_version(unsigned int version)
 			spin_unlock(&nfs_version_lock);
 			return nfs;
 		}
-	};
+	}
 
 	spin_unlock(&nfs_version_lock);
-	return ERR_PTR(-EPROTONOSUPPORT);;
+	return ERR_PTR(-EPROTONOSUPPORT);
 }
 
 struct nfs_subversion *get_nfs_version(unsigned int version)
@@ -498,7 +498,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 		return nfs_found_client(cl_init, clp);
 	}
 	if (new) {
-		list_add(&new->cl_share_link, &nn->nfs_client_list);
+		list_add_tail(&new->cl_share_link,
+				&nn->nfs_client_list);
 		spin_unlock(&nn->nfs_client_lock);
 		new->cl_flags = cl_init->init_flags;
 		return rpc_ops->init_client(new, timeparms, ip_addr,
@@ -668,7 +669,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
 {
 	struct nfs_client *clp = server->nfs_client;
 
-	server->client = rpc_clone_client(clp->cl_rpcclient);
+	server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
+							pseudoflavour);
 	if (IS_ERR(server->client)) {
 		dprintk("%s: couldn't create rpc_client!\n", __func__);
 		return PTR_ERR(server->client);
@@ -678,16 +680,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
 			timeo,
 			sizeof(server->client->cl_timeout_default));
 	server->client->cl_timeout = &server->client->cl_timeout_default;
-
-	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
-		struct rpc_auth *auth;
-
-		auth = rpcauth_create(pseudoflavour, server->client);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __func__);
-			return PTR_ERR(auth);
-		}
-	}
 	server->client->cl_softrtry = 0;
 	if (server->flags & NFS_MOUNT_SOFT)
 		server->client->cl_softrtry = 1;
@@ -761,6 +753,8 @@ static int nfs_init_server(struct nfs_server *server,
 			data->timeo, data->retrans);
 	if (data->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (server->options & NFS_OPTION_MIGRATION)
+		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
 	/* Allocate or find a client reference we can use */
 	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -855,7 +849,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	server->pnfs_blksize = fsinfo->blksize;
 
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 627f108ede23..ce8cb926526b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2072,7 +2072,7 @@ found:
2072 nfs_access_free_entry(entry); 2072 nfs_access_free_entry(entry);
2073} 2073}
2074 2074
2075static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 2075void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2076{ 2076{
2077 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 2077 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
2078 if (cache == NULL) 2078 if (cache == NULL)
@@ -2098,6 +2098,20 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
2098 spin_unlock(&nfs_access_lru_lock); 2098 spin_unlock(&nfs_access_lru_lock);
2099 } 2099 }
2100} 2100}
2101EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2102
2103void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
2104{
2105 entry->mask = 0;
2106 if (access_result & NFS4_ACCESS_READ)
2107 entry->mask |= MAY_READ;
2108 if (access_result &
2109 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
2110 entry->mask |= MAY_WRITE;
2111 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2112 entry->mask |= MAY_EXEC;
2113}
2114EXPORT_SYMBOL_GPL(nfs_access_set_mask);
2101 2115
2102static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2116static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2103{ 2117{
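
With nfs_access_add_cache() and the new nfs_access_set_mask() exported, an NFSv4 caller can turn a raw ACCESS reply bitmask into a cached entry. A minimal sketch of such a caller, assuming the nfs_access_entry cred/jiffies/mask fields as in this tree; the helper name and variables are illustrative, not part of the patch:

	static void cache_access_result(struct inode *inode,
					struct rpc_cred *cred,
					u32 access_result)
	{
		struct nfs_access_entry cache;

		cache.cred = cred;
		cache.jiffies = jiffies;
		/* translate NFS4_ACCESS_* reply bits into generic MAY_* bits */
		nfs_access_set_mask(&cache, access_result);
		nfs_access_add_cache(inode, &cache);
	}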
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1ba385b7c90d..cae26cbd59ee 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -46,6 +46,7 @@
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/module.h>
49 50
50#include <linux/nfs_fs.h> 51#include <linux/nfs_fs.h>
51#include <linux/nfs_page.h> 52#include <linux/nfs_page.h>
@@ -78,6 +79,7 @@ struct nfs_direct_req {
78 atomic_t io_count; /* i/os we're waiting for */ 79 atomic_t io_count; /* i/os we're waiting for */
79 spinlock_t lock; /* protect completion state */ 80 spinlock_t lock; /* protect completion state */
80 ssize_t count, /* bytes actually processed */ 81 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */
81 error; /* any reported error */ 83 error; /* any reported error */
82 struct completion completion; /* wait for i/o completion */ 84 struct completion completion; /* wait for i/o completion */
83 85
@@ -190,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
190 kref_put(&dreq->kref, nfs_direct_req_free); 192 kref_put(&dreq->kref, nfs_direct_req_free);
191} 193}
192 194
195ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
196{
197 return dreq->bytes_left;
198}
199EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
200
193/* 201/*
194 * Collects and returns the final error value/byte-count. 202 * Collects and returns the final error value/byte-count.
195 */ 203 */
@@ -390,6 +398,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
390 user_addr += req_len; 398 user_addr += req_len;
391 pos += req_len; 399 pos += req_len;
392 count -= req_len; 400 count -= req_len;
401 dreq->bytes_left -= req_len;
393 } 402 }
 394 /* The nfs_page now holds references to these pages */ 403 /* The nfs_page now holds references to these pages */
395 nfs_direct_release_pages(pagevec, npages); 404 nfs_direct_release_pages(pagevec, npages);
@@ -450,23 +459,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
450 ssize_t result = -ENOMEM; 459 ssize_t result = -ENOMEM;
451 struct inode *inode = iocb->ki_filp->f_mapping->host; 460 struct inode *inode = iocb->ki_filp->f_mapping->host;
452 struct nfs_direct_req *dreq; 461 struct nfs_direct_req *dreq;
462 struct nfs_lock_context *l_ctx;
453 463
454 dreq = nfs_direct_req_alloc(); 464 dreq = nfs_direct_req_alloc();
455 if (dreq == NULL) 465 if (dreq == NULL)
456 goto out; 466 goto out;
457 467
458 dreq->inode = inode; 468 dreq->inode = inode;
469 dreq->bytes_left = iov_length(iov, nr_segs);
459 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 470 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
460 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 471 l_ctx = nfs_get_lock_context(dreq->ctx);
461 if (dreq->l_ctx == NULL) 472 if (IS_ERR(l_ctx)) {
473 result = PTR_ERR(l_ctx);
462 goto out_release; 474 goto out_release;
475 }
476 dreq->l_ctx = l_ctx;
463 if (!is_sync_kiocb(iocb)) 477 if (!is_sync_kiocb(iocb))
464 dreq->iocb = iocb; 478 dreq->iocb = iocb;
465 479
480 NFS_I(inode)->read_io += iov_length(iov, nr_segs);
466 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); 481 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
467 if (!result) 482 if (!result)
468 result = nfs_direct_wait(dreq); 483 result = nfs_direct_wait(dreq);
469 NFS_I(inode)->read_io += result;
470out_release: 484out_release:
471 nfs_direct_req_release(dreq); 485 nfs_direct_req_release(dreq);
472out: 486out:
@@ -706,6 +720,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
706 user_addr += req_len; 720 user_addr += req_len;
707 pos += req_len; 721 pos += req_len;
708 count -= req_len; 722 count -= req_len;
723 dreq->bytes_left -= req_len;
709 } 724 }
 710 /* The nfs_page now holds references to these pages */ 725 /* The nfs_page now holds references to these pages */
711 nfs_direct_release_pages(pagevec, npages); 726 nfs_direct_release_pages(pagevec, npages);
@@ -814,6 +829,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
814 get_dreq(dreq); 829 get_dreq(dreq);
815 atomic_inc(&inode->i_dio_count); 830 atomic_inc(&inode->i_dio_count);
816 831
832 NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
817 for (seg = 0; seg < nr_segs; seg++) { 833 for (seg = 0; seg < nr_segs; seg++) {
818 const struct iovec *vec = &iov[seg]; 834 const struct iovec *vec = &iov[seg];
819 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); 835 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
@@ -825,7 +841,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
825 pos += vec->iov_len; 841 pos += vec->iov_len;
826 } 842 }
827 nfs_pageio_complete(&desc); 843 nfs_pageio_complete(&desc);
828 NFS_I(dreq->inode)->write_io += desc.pg_bytes_written;
829 844
830 /* 845 /*
831 * If no bytes were started, return the error, and let the 846 * If no bytes were started, return the error, and let the
@@ -849,16 +864,21 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
849 ssize_t result = -ENOMEM; 864 ssize_t result = -ENOMEM;
850 struct inode *inode = iocb->ki_filp->f_mapping->host; 865 struct inode *inode = iocb->ki_filp->f_mapping->host;
851 struct nfs_direct_req *dreq; 866 struct nfs_direct_req *dreq;
867 struct nfs_lock_context *l_ctx;
852 868
853 dreq = nfs_direct_req_alloc(); 869 dreq = nfs_direct_req_alloc();
854 if (!dreq) 870 if (!dreq)
855 goto out; 871 goto out;
856 872
857 dreq->inode = inode; 873 dreq->inode = inode;
874 dreq->bytes_left = count;
858 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 875 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
859 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 876 l_ctx = nfs_get_lock_context(dreq->ctx);
860 if (dreq->l_ctx == NULL) 877 if (IS_ERR(l_ctx)) {
878 result = PTR_ERR(l_ctx);
861 goto out_release; 879 goto out_release;
880 }
881 dreq->l_ctx = l_ctx;
862 if (!is_sync_kiocb(iocb)) 882 if (!is_sync_kiocb(iocb))
863 dreq->iocb = iocb; 883 dreq->iocb = iocb;
864 884
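
Two conventions change in direct.c at once: nfs_get_lock_context() now returns an ERR_PTR (see the inode.c hunk further down), so read and write setup share the same IS_ERR pattern, and the new bytes_left counter is seeded from iov_length() at setup and drained as requests are scheduled. A sketch of how a consumer might use the exported accessor, assuming only what the hunks above show; the helper name is illustrative:

	/* bytes_left shrinks by req_len as each request goes out, so
	 * pos + bytes_left is the end of the I/O still to be issued */
	static loff_t nfs_direct_remaining_end(struct nfs_direct_req *dreq,
					       loff_t pos)
	{
		return pos + nfs_dreq_bytes_left(dreq);
	}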
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f692be97676d..582bb8866131 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -259,7 +259,7 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
259 struct dentry *dentry = file->f_path.dentry; 259 struct dentry *dentry = file->f_path.dentry;
260 struct nfs_open_context *ctx = nfs_file_open_context(file); 260 struct nfs_open_context *ctx = nfs_file_open_context(file);
261 struct inode *inode = dentry->d_inode; 261 struct inode *inode = dentry->d_inode;
262 int have_error, status; 262 int have_error, do_resend, status;
263 int ret = 0; 263 int ret = 0;
264 264
265 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 265 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
@@ -267,15 +267,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
267 datasync); 267 datasync);
268 268
269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
270 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
271 status = nfs_commit_inode(inode, FLUSH_SYNC); 272 status = nfs_commit_inode(inode, FLUSH_SYNC);
272 if (status >= 0 && ret < 0)
273 status = ret;
274 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
275 if (have_error) 274 if (have_error) {
276 ret = xchg(&ctx->error, 0); 275 ret = xchg(&ctx->error, 0);
277 if (!ret && status < 0) 276 if (ret)
277 goto out;
278 }
279 if (status < 0) {
278 ret = status; 280 ret = status;
281 goto out;
282 }
283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
284 if (do_resend)
285 ret = -EAGAIN;
286out:
279 return ret; 287 return ret;
280} 288}
281EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); 289EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
@@ -286,13 +294,22 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
286 int ret; 294 int ret;
287 struct inode *inode = file->f_path.dentry->d_inode; 295 struct inode *inode = file->f_path.dentry->d_inode;
288 296
289 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 297 do {
290 if (ret != 0) 298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
291 goto out; 299 if (ret != 0)
292 mutex_lock(&inode->i_mutex); 300 break;
293 ret = nfs_file_fsync_commit(file, start, end, datasync); 301 mutex_lock(&inode->i_mutex);
294 mutex_unlock(&inode->i_mutex); 302 ret = nfs_file_fsync_commit(file, start, end, datasync);
295out: 303 mutex_unlock(&inode->i_mutex);
304 /*
305 * If nfs_file_fsync_commit detected a server reboot, then
306 * resend all dirty pages that might have been covered by
307 * the NFS_CONTEXT_RESEND_WRITES flag
308 */
309 start = 0;
310 end = LLONG_MAX;
311 } while (ret == -EAGAIN);
312
296 return ret; 313 return ret;
297} 314}
298 315
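
The loop contract: nfs_file_fsync_commit() returns -EAGAIN when it finds NFS_CONTEXT_RESEND_WRITES set, and the caller retries over the whole file. A condensed sketch with the i_mutex handling elided; the hunk itself does not show who sets the flag, the assumption here being that the write recovery path sets it when it re-dirties pages after a server reboot:

	do {
		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
		if (ret != 0)
			break;
		ret = nfs_file_fsync_commit(file, start, end, datasync);
		/* resent pages may lie outside the original range */
		start = 0;
		end = LLONG_MAX;
	} while (ret == -EAGAIN);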
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 4654ced096a6..033803c36644 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -32,6 +32,8 @@
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34 34
35#include "internal.h"
36
35#define NFSDBG_FACILITY NFSDBG_CLIENT 37#define NFSDBG_FACILITY NFSDBG_CLIENT
36 38
37/* 39/*
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index a850079467d8..9cc4a3fbf4b0 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -55,18 +55,19 @@
55static const struct cred *id_resolver_cache; 55static const struct cred *id_resolver_cache;
56static struct key_type key_type_id_resolver_legacy; 56static struct key_type key_type_id_resolver_legacy;
57 57
58struct idmap {
59 struct rpc_pipe *idmap_pipe;
60 struct key_construction *idmap_key_cons;
61 struct mutex idmap_mutex;
62};
63
64struct idmap_legacy_upcalldata { 58struct idmap_legacy_upcalldata {
65 struct rpc_pipe_msg pipe_msg; 59 struct rpc_pipe_msg pipe_msg;
66 struct idmap_msg idmap_msg; 60 struct idmap_msg idmap_msg;
61 struct key_construction *key_cons;
67 struct idmap *idmap; 62 struct idmap *idmap;
68}; 63};
69 64
65struct idmap {
66 struct rpc_pipe *idmap_pipe;
67 struct idmap_legacy_upcalldata *idmap_upcall_data;
68 struct mutex idmap_mutex;
69};
70
70/** 71/**
71 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields 72 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
72 * @fattr: fully initialised struct nfs_fattr 73 * @fattr: fully initialised struct nfs_fattr
@@ -158,7 +159,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
158 return 0; 159 return 0;
159 memcpy(buf, name, namelen); 160 memcpy(buf, name, namelen);
160 buf[namelen] = '\0'; 161 buf[namelen] = '\0';
161 if (strict_strtoul(buf, 0, &val) != 0) 162 if (kstrtoul(buf, 0, &val) != 0)
162 return 0; 163 return 0;
163 *res = val; 164 *res = val;
164 return 1; 165 return 1;
@@ -330,7 +331,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
330 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, 331 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
331 name, namelen, type, data, 332 name, namelen, type, data,
332 data_size, idmap); 333 data_size, idmap);
333 idmap->idmap_key_cons = NULL;
334 mutex_unlock(&idmap->idmap_mutex); 334 mutex_unlock(&idmap->idmap_mutex);
335 } 335 }
336 return ret; 336 return ret;
@@ -364,7 +364,7 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ
364 if (data_size <= 0) { 364 if (data_size <= 0) {
365 ret = -EINVAL; 365 ret = -EINVAL;
366 } else { 366 } else {
367 ret = strict_strtol(id_str, 10, &id_long); 367 ret = kstrtol(id_str, 10, &id_long);
368 *id = (__u32)id_long; 368 *id = (__u32)id_long;
369 } 369 }
370 return ret; 370 return ret;
@@ -465,8 +465,6 @@ nfs_idmap_new(struct nfs_client *clp)
465 struct rpc_pipe *pipe; 465 struct rpc_pipe *pipe;
466 int error; 466 int error;
467 467
468 BUG_ON(clp->cl_idmap != NULL);
469
470 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); 468 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
471 if (idmap == NULL) 469 if (idmap == NULL)
472 return -ENOMEM; 470 return -ENOMEM;
@@ -510,7 +508,6 @@ static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
510 508
511 switch (event) { 509 switch (event) {
512 case RPC_PIPEFS_MOUNT: 510 case RPC_PIPEFS_MOUNT:
513 BUG_ON(clp->cl_rpcclient->cl_dentry == NULL);
514 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, 511 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
515 clp->cl_idmap, 512 clp->cl_idmap,
516 clp->cl_idmap->idmap_pipe); 513 clp->cl_idmap->idmap_pipe);
@@ -632,9 +629,6 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
632 substring_t substr; 629 substring_t substr;
633 int token, ret; 630 int token, ret;
634 631
635 memset(im, 0, sizeof(*im));
636 memset(msg, 0, sizeof(*msg));
637
638 im->im_type = IDMAP_TYPE_GROUP; 632 im->im_type = IDMAP_TYPE_GROUP;
639 token = match_token(desc, nfs_idmap_tokens, &substr); 633 token = match_token(desc, nfs_idmap_tokens, &substr);
640 634
@@ -665,6 +659,35 @@ out:
665 return ret; 659 return ret;
666} 660}
667 661
662static bool
663nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
664 struct idmap_legacy_upcalldata *data)
665{
666 if (idmap->idmap_upcall_data != NULL) {
667 WARN_ON_ONCE(1);
668 return false;
669 }
670 idmap->idmap_upcall_data = data;
671 return true;
672}
673
674static void
675nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
676{
677 struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
678
679 kfree(idmap->idmap_upcall_data);
680 idmap->idmap_upcall_data = NULL;
681 complete_request_key(cons, ret);
682}
683
684static void
685nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
686{
687 if (idmap->idmap_upcall_data != NULL)
688 nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
689}
690
668static int nfs_idmap_legacy_upcall(struct key_construction *cons, 691static int nfs_idmap_legacy_upcall(struct key_construction *cons,
669 const char *op, 692 const char *op,
670 void *aux) 693 void *aux)
@@ -677,29 +700,28 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
677 int ret = -ENOMEM; 700 int ret = -ENOMEM;
678 701
679 /* msg and im are freed in idmap_pipe_destroy_msg */ 702 /* msg and im are freed in idmap_pipe_destroy_msg */
680 data = kmalloc(sizeof(*data), GFP_KERNEL); 703 data = kzalloc(sizeof(*data), GFP_KERNEL);
681 if (!data) 704 if (!data)
682 goto out1; 705 goto out1;
683 706
684 msg = &data->pipe_msg; 707 msg = &data->pipe_msg;
685 im = &data->idmap_msg; 708 im = &data->idmap_msg;
686 data->idmap = idmap; 709 data->idmap = idmap;
710 data->key_cons = cons;
687 711
688 ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); 712 ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
689 if (ret < 0) 713 if (ret < 0)
690 goto out2; 714 goto out2;
691 715
692 BUG_ON(idmap->idmap_key_cons != NULL); 716 ret = -EAGAIN;
693 idmap->idmap_key_cons = cons; 717 if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
718 goto out2;
694 719
695 ret = rpc_queue_upcall(idmap->idmap_pipe, msg); 720 ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
696 if (ret < 0) 721 if (ret < 0)
697 goto out3; 722 nfs_idmap_abort_pipe_upcall(idmap, ret);
698 723
699 return ret; 724 return ret;
700
701out3:
702 idmap->idmap_key_cons = NULL;
703out2: 725out2:
704 kfree(data); 726 kfree(data);
705out1: 727out1:
@@ -714,21 +736,32 @@ static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *dat
714 authkey); 736 authkey);
715} 737}
716 738
717static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) 739static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
740 struct idmap_msg *upcall,
741 struct key *key, struct key *authkey)
718{ 742{
719 char id_str[NFS_UINT_MAXLEN]; 743 char id_str[NFS_UINT_MAXLEN];
720 int ret = -EINVAL; 744 int ret = -ENOKEY;
721 745
746 /* ret = -ENOKEY */
747 if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv)
748 goto out;
722 switch (im->im_conv) { 749 switch (im->im_conv) {
723 case IDMAP_CONV_NAMETOID: 750 case IDMAP_CONV_NAMETOID:
751 if (strcmp(upcall->im_name, im->im_name) != 0)
752 break;
724 sprintf(id_str, "%d", im->im_id); 753 sprintf(id_str, "%d", im->im_id);
725 ret = nfs_idmap_instantiate(key, authkey, id_str); 754 ret = nfs_idmap_instantiate(key, authkey, id_str);
726 break; 755 break;
727 case IDMAP_CONV_IDTONAME: 756 case IDMAP_CONV_IDTONAME:
757 if (upcall->im_id != im->im_id)
758 break;
728 ret = nfs_idmap_instantiate(key, authkey, im->im_name); 759 ret = nfs_idmap_instantiate(key, authkey, im->im_name);
729 break; 760 break;
761 default:
762 ret = -EINVAL;
730 } 763 }
731 764out:
732 return ret; 765 return ret;
733} 766}
734 767
@@ -740,14 +773,16 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
740 struct key_construction *cons; 773 struct key_construction *cons;
741 struct idmap_msg im; 774 struct idmap_msg im;
742 size_t namelen_in; 775 size_t namelen_in;
743 int ret; 776 int ret = -ENOKEY;
744 777
745 /* If instantiation is successful, anyone waiting for key construction 778 /* If instantiation is successful, anyone waiting for key construction
746 * will have been woken up and someone else may now have used 779 * will have been woken up and someone else may now have used
747 * idmap_key_cons - so after this point we may no longer touch it. 780 * idmap_key_cons - so after this point we may no longer touch it.
748 */ 781 */
749 cons = ACCESS_ONCE(idmap->idmap_key_cons); 782 if (idmap->idmap_upcall_data == NULL)
750 idmap->idmap_key_cons = NULL; 783 goto out_noupcall;
784
785 cons = idmap->idmap_upcall_data->key_cons;
751 786
752 if (mlen != sizeof(im)) { 787 if (mlen != sizeof(im)) {
753 ret = -ENOSPC; 788 ret = -ENOSPC;
@@ -768,16 +803,19 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
768 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { 803 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
769 ret = -EINVAL; 804 ret = -EINVAL;
770 goto out; 805 goto out;
 771 } 806 }
772 807
773 ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); 808 ret = nfs_idmap_read_and_verify_message(&im,
809 &idmap->idmap_upcall_data->idmap_msg,
810 cons->key, cons->authkey);
774 if (ret >= 0) { 811 if (ret >= 0) {
775 key_set_timeout(cons->key, nfs_idmap_cache_timeout); 812 key_set_timeout(cons->key, nfs_idmap_cache_timeout);
776 ret = mlen; 813 ret = mlen;
777 } 814 }
778 815
779out: 816out:
780 complete_request_key(cons, ret); 817 nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
818out_noupcall:
781 return ret; 819 return ret;
782} 820}
783 821
@@ -788,14 +826,9 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
788 struct idmap_legacy_upcalldata, 826 struct idmap_legacy_upcalldata,
789 pipe_msg); 827 pipe_msg);
790 struct idmap *idmap = data->idmap; 828 struct idmap *idmap = data->idmap;
791 struct key_construction *cons; 829
792 if (msg->errno) { 830 if (msg->errno)
793 cons = ACCESS_ONCE(idmap->idmap_key_cons); 831 nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
794 idmap->idmap_key_cons = NULL;
795 complete_request_key(cons, msg->errno);
796 }
797 /* Free memory allocated in nfs_idmap_legacy_upcall() */
798 kfree(data);
799} 832}
800 833
801static void 834static void
@@ -803,7 +836,8 @@ idmap_release_pipe(struct inode *inode)
803{ 836{
804 struct rpc_inode *rpci = RPC_I(inode); 837 struct rpc_inode *rpci = RPC_I(inode);
805 struct idmap *idmap = (struct idmap *)rpci->private; 838 struct idmap *idmap = (struct idmap *)rpci->private;
806 idmap->idmap_key_cons = NULL; 839
840 nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
807} 841}
808 842
809int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 843int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
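
The idmap rework replaces the bare idmap_key_cons pointer with idmap_upcall_data, giving every upcall a single owner. A condensed sketch of the resulting flow in nfs_idmap_legacy_upcall(), assembled from the hunks above:

	ret = -EAGAIN;
	if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
		goto out2;		/* another upcall is in flight */

	ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
	if (ret < 0)
		nfs_idmap_abort_pipe_upcall(idmap, ret);	/* frees data */

Completion then happens exactly once: via idmap_pipe_downcall() on a reply, via idmap_pipe_destroy_msg() when msg->errno is set, or via idmap_release_pipe() with -EPIPE when rpc.idmapd closes the pipe.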
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e4c716d374a8..5c7325c5c5e6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -547,8 +547,8 @@ EXPORT_SYMBOL_GPL(nfs_getattr);
547static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) 547static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
548{ 548{
549 atomic_set(&l_ctx->count, 1); 549 atomic_set(&l_ctx->count, 1);
550 l_ctx->lockowner = current->files; 550 l_ctx->lockowner.l_owner = current->files;
551 l_ctx->pid = current->tgid; 551 l_ctx->lockowner.l_pid = current->tgid;
552 INIT_LIST_HEAD(&l_ctx->list); 552 INIT_LIST_HEAD(&l_ctx->list);
553} 553}
554 554
@@ -557,9 +557,9 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
557 struct nfs_lock_context *pos; 557 struct nfs_lock_context *pos;
558 558
559 list_for_each_entry(pos, &ctx->lock_context.list, list) { 559 list_for_each_entry(pos, &ctx->lock_context.list, list) {
560 if (pos->lockowner != current->files) 560 if (pos->lockowner.l_owner != current->files)
561 continue; 561 continue;
562 if (pos->pid != current->tgid) 562 if (pos->lockowner.l_pid != current->tgid)
563 continue; 563 continue;
564 atomic_inc(&pos->count); 564 atomic_inc(&pos->count);
565 return pos; 565 return pos;
@@ -578,7 +578,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
578 spin_unlock(&inode->i_lock); 578 spin_unlock(&inode->i_lock);
579 new = kmalloc(sizeof(*new), GFP_KERNEL); 579 new = kmalloc(sizeof(*new), GFP_KERNEL);
580 if (new == NULL) 580 if (new == NULL)
581 return NULL; 581 return ERR_PTR(-ENOMEM);
582 nfs_init_lock_context(new); 582 nfs_init_lock_context(new);
583 spin_lock(&inode->i_lock); 583 spin_lock(&inode->i_lock);
584 res = __nfs_find_lock_context(ctx); 584 res = __nfs_find_lock_context(ctx);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31fdb03225cd..59b133c5d652 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -101,11 +101,11 @@ struct nfs_client_initdata {
101 */ 101 */
102struct nfs_parsed_mount_data { 102struct nfs_parsed_mount_data {
103 int flags; 103 int flags;
104 int rsize, wsize; 104 unsigned int rsize, wsize;
105 int timeo, retrans; 105 unsigned int timeo, retrans;
106 int acregmin, acregmax, 106 unsigned int acregmin, acregmax,
107 acdirmin, acdirmax; 107 acdirmin, acdirmax;
108 int namlen; 108 unsigned int namlen;
109 unsigned int options; 109 unsigned int options;
110 unsigned int bsize; 110 unsigned int bsize;
111 unsigned int auth_flavor_len; 111 unsigned int auth_flavor_len;
@@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
464{ 464{
465 inode_dio_wait(inode); 465 inode_dio_wait(inode);
466} 466}
467extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
467 468
468/* nfs4proc.c */ 469/* nfs4proc.c */
469extern void __nfs4_read_done_cb(struct nfs_read_data *); 470extern void __nfs4_read_done_cb(struct nfs_read_data *);
@@ -483,6 +484,12 @@ extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
483 struct nfs4_sequence_args *args, 484 struct nfs4_sequence_args *args,
484 struct nfs4_sequence_res *res, 485 struct nfs4_sequence_res *res,
485 int cache_reply); 486 int cache_reply);
487extern int nfs40_walk_client_list(struct nfs_client *clp,
488 struct nfs_client **result,
489 struct rpc_cred *cred);
490extern int nfs41_walk_client_list(struct nfs_client *clp,
491 struct nfs_client **result,
492 struct rpc_cred *cred);
486 493
487/* 494/*
488 * Determine the device name as a string 495 * Determine the device name as a string
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 0539de1b8d1f..8ee1fab83268 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -5,6 +5,7 @@
5#ifndef __NFS_NETNS_H__ 5#ifndef __NFS_NETNS_H__
6#define __NFS_NETNS_H__ 6#define __NFS_NETNS_H__
7 7
8#include <linux/nfs4.h>
8#include <net/net_namespace.h> 9#include <net/net_namespace.h>
9#include <net/netns/generic.h> 10#include <net/netns/generic.h>
10 11
@@ -22,6 +23,9 @@ struct nfs_net {
22 struct list_head nfs_volume_list; 23 struct list_head nfs_volume_list;
23#if IS_ENABLED(CONFIG_NFS_V4) 24#if IS_ENABLED(CONFIG_NFS_V4)
24 struct idr cb_ident_idr; /* Protected by nfs_client_lock */ 25 struct idr cb_ident_idr; /* Protected by nfs_client_lock */
26 unsigned short nfs_callback_tcpport;
27 unsigned short nfs_callback_tcpport6;
28 int cb_users[NFS4_MAX_MINOR_VERSION + 1];
25#endif 29#endif
26 spinlock_t nfs_client_lock; 30 spinlock_t nfs_client_lock;
27 struct timespec boot_time; 31 struct timespec boot_time;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index da0618aeeadb..a525fdefccde 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -132,8 +132,8 @@ struct nfs4_lock_owner {
132struct nfs4_lock_state { 132struct nfs4_lock_state {
133 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
134 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
135#define NFS_LOCK_INITIALIZED 1 135#define NFS_LOCK_INITIALIZED 0
136 int ls_flags; 136 unsigned long ls_flags;
137 struct nfs_seqid_counter ls_seqid; 137 struct nfs_seqid_counter ls_seqid;
138 nfs4_stateid ls_stateid; 138 nfs4_stateid ls_stateid;
139 atomic_t ls_count; 139 atomic_t ls_count;
@@ -191,6 +191,8 @@ struct nfs4_state_recovery_ops {
191 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 191 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
192 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 192 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
193 int (*reclaim_complete)(struct nfs_client *); 193 int (*reclaim_complete)(struct nfs_client *);
194 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
195 struct rpc_cred *);
194}; 196};
195 197
196struct nfs4_state_maintenance_ops { 198struct nfs4_state_maintenance_ops {
@@ -223,7 +225,7 @@ extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
223extern int nfs4_destroy_clientid(struct nfs_client *clp); 225extern int nfs4_destroy_clientid(struct nfs_client *clp);
224extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 226extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
225extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 227extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
226extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); 228extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
227extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 229extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
228extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, 230extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
229 struct nfs4_fs_locations *, struct page *); 231 struct nfs4_fs_locations *, struct page *);
@@ -320,9 +322,15 @@ extern void nfs4_renew_state(struct work_struct *);
320/* nfs4state.c */ 322/* nfs4state.c */
321struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 323struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
322struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp,
326 struct nfs_client **);
327int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *);
323#if defined(CONFIG_NFS_V4_1) 329#if defined(CONFIG_NFS_V4_1)
324struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
325struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332int nfs41_discover_server_trunking(struct nfs_client *clp,
333 struct nfs_client **, struct rpc_cred *);
326extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 334extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
327#else 335#else
328static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 336static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
@@ -351,7 +359,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
351extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
352extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 360extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
353extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, 361extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
354 fmode_t, fl_owner_t, pid_t); 362 fmode_t, const struct nfs_lockowner *);
355 363
356extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 364extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
357extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 365extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -372,6 +380,9 @@ extern bool nfs4_disable_idmapping;
372extern unsigned short max_session_slots; 380extern unsigned short max_session_slots;
373extern unsigned short send_implementation_id; 381extern unsigned short send_implementation_id;
374 382
383#define NFS4_CLIENT_ID_UNIQ_LEN (64)
384extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
385
375/* nfs4sysctl.c */ 386/* nfs4sysctl.c */
376#ifdef CONFIG_SYSCTL 387#ifdef CONFIG_SYSCTL
377int nfs4_register_sysctl(void); 388int nfs4_register_sysctl(void);
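
Note the subtle constant change above: NFS_LOCK_INITIALIZED goes from the value 1 to the bit number 0 because ls_flags becomes an unsigned long manipulated with the atomic bitops, which take a bit index rather than a mask. A sketch of the assumed usage (the matching nfs4state.c hunks are outside this excerpt):

	/* when the server confirms the lock stateid: */
	set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);

	/* and later, before trusting ls_stateid: */
	if (!test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
		return;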
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 24eb663f8ed5..6bacfde1319a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -84,7 +84,7 @@ error:
84static void nfs4_destroy_callback(struct nfs_client *clp) 84static void nfs4_destroy_callback(struct nfs_client *clp)
85{ 85{
86 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 86 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
87 nfs_callback_down(clp->cl_mvops->minor_version); 87 nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net);
88} 88}
89 89
90static void nfs4_shutdown_client(struct nfs_client *clp) 90static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -185,6 +185,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
185 rpc_authflavor_t authflavour) 185 rpc_authflavor_t authflavour)
186{ 186{
187 char buf[INET6_ADDRSTRLEN + 1]; 187 char buf[INET6_ADDRSTRLEN + 1];
188 struct nfs_client *old;
188 int error; 189 int error;
189 190
190 if (clp->cl_cons_state == NFS_CS_READY) { 191 if (clp->cl_cons_state == NFS_CS_READY) {
@@ -230,6 +231,17 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
230 231
231 if (!nfs4_has_session(clp)) 232 if (!nfs4_has_session(clp))
232 nfs_mark_client_ready(clp, NFS_CS_READY); 233 nfs_mark_client_ready(clp, NFS_CS_READY);
234
235 error = nfs4_discover_server_trunking(clp, &old);
236 if (error < 0)
237 goto error;
238 if (clp != old) {
239 clp->cl_preserve_clid = true;
240 nfs_put_client(clp);
241 clp = old;
242 atomic_inc(&clp->cl_count);
243 }
244
233 return clp; 245 return clp;
234 246
235error: 247error:
@@ -239,6 +251,248 @@ error:
239 return ERR_PTR(error); 251 return ERR_PTR(error);
240} 252}
241 253
254/*
255 * SETCLIENTID just did a callback update with the callback ident in
256 * "drop," but server trunking discovery claims "drop" and "keep" are
257 * actually the same server. Swap the callback IDs so that "keep"
258 * will continue to use the callback ident the server now knows about,
259 * and so that "keep"'s original callback ident is destroyed when
260 * "drop" is freed.
261 */
262static void nfs4_swap_callback_idents(struct nfs_client *keep,
263 struct nfs_client *drop)
264{
265 struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id);
266 unsigned int save = keep->cl_cb_ident;
267
268 if (keep->cl_cb_ident == drop->cl_cb_ident)
269 return;
270
271 dprintk("%s: keeping callback ident %u and dropping ident %u\n",
272 __func__, keep->cl_cb_ident, drop->cl_cb_ident);
273
274 spin_lock(&nn->nfs_client_lock);
275
276 idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident);
277 keep->cl_cb_ident = drop->cl_cb_ident;
278
279 idr_replace(&nn->cb_ident_idr, drop, save);
280 drop->cl_cb_ident = save;
281
282 spin_unlock(&nn->nfs_client_lock);
283}
284
285/**
286 * nfs40_walk_client_list - Find server that recognizes a client ID
287 *
288 * @new: nfs_client with client ID to test
289 * @result: OUT: found nfs_client, or new
290 * @cred: credential to use for trunking test
291 *
292 * Returns zero, a negative errno, or a negative NFS4ERR status.
293 * If zero is returned, an nfs_client pointer is planted in "result."
294 *
295 * NB: nfs40_walk_client_list() relies on the new nfs_client being
296 * the last nfs_client on the list.
297 */
298int nfs40_walk_client_list(struct nfs_client *new,
299 struct nfs_client **result,
300 struct rpc_cred *cred)
301{
302 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
303 struct nfs_client *pos, *n, *prev = NULL;
304 struct nfs4_setclientid_res clid = {
305 .clientid = new->cl_clientid,
306 .confirm = new->cl_confirm,
307 };
308 int status;
309
310 spin_lock(&nn->nfs_client_lock);
311 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
312 /* If "pos" isn't marked ready, we can't trust the
313 * remaining fields in "pos" */
314 if (pos->cl_cons_state < NFS_CS_READY)
315 continue;
316
317 if (pos->rpc_ops != new->rpc_ops)
318 continue;
319
320 if (pos->cl_proto != new->cl_proto)
321 continue;
322
323 if (pos->cl_minorversion != new->cl_minorversion)
324 continue;
325
326 if (pos->cl_clientid != new->cl_clientid)
327 continue;
328
329 atomic_inc(&pos->cl_count);
330 spin_unlock(&nn->nfs_client_lock);
331
332 if (prev)
333 nfs_put_client(prev);
334
335 status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
336 if (status == 0) {
337 nfs4_swap_callback_idents(pos, new);
338
339 nfs_put_client(pos);
340 *result = pos;
341 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
342 __func__, pos, atomic_read(&pos->cl_count));
343 return 0;
344 }
345 if (status != -NFS4ERR_STALE_CLIENTID) {
346 nfs_put_client(pos);
347 dprintk("NFS: <-- %s status = %d, no result\n",
348 __func__, status);
349 return status;
350 }
351
352 spin_lock(&nn->nfs_client_lock);
353 prev = pos;
354 }
355
356 /*
357 * No matching nfs_client found. This should be impossible,
358 * because the new nfs_client has already been added to
359 * nfs_client_list by nfs_get_client().
360 *
361 * Don't BUG(), since the caller is holding a mutex.
362 */
363 if (prev)
364 nfs_put_client(prev);
365 spin_unlock(&nn->nfs_client_lock);
366 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
367 return -NFS4ERR_STALE_CLIENTID;
368}
369
370#ifdef CONFIG_NFS_V4_1
371/*
372 * Returns true if the client IDs match
373 */
374static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
375{
376 if (a->cl_clientid != b->cl_clientid) {
377 dprintk("NFS: --> %s client ID %llx does not match %llx\n",
378 __func__, a->cl_clientid, b->cl_clientid);
379 return false;
380 }
381 dprintk("NFS: --> %s client ID %llx matches %llx\n",
382 __func__, a->cl_clientid, b->cl_clientid);
383 return true;
384}
385
386/*
387 * Returns true if the server owners match
388 */
389static bool
390nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
391{
392 struct nfs41_server_owner *o1 = a->cl_serverowner;
393 struct nfs41_server_owner *o2 = b->cl_serverowner;
394
395 if (o1->minor_id != o2->minor_id) {
396 dprintk("NFS: --> %s server owner minor IDs do not match\n",
397 __func__);
398 return false;
399 }
400
401 if (o1->major_id_sz != o2->major_id_sz)
402 goto out_major_mismatch;
403 if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
404 goto out_major_mismatch;
405
406 dprintk("NFS: --> %s server owners match\n", __func__);
407 return true;
408
409out_major_mismatch:
410 dprintk("NFS: --> %s server owner major IDs do not match\n",
411 __func__);
412 return false;
413}
414
415/**
416 * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
417 *
418 * @new: nfs_client with client ID to test
419 * @result: OUT: found nfs_client, or new
420 * @cred: credential to use for trunking test
421 *
422 * Returns zero, a negative errno, or a negative NFS4ERR status.
423 * If zero is returned, an nfs_client pointer is planted in "result."
424 *
425 * NB: nfs41_walk_client_list() relies on the new nfs_client being
426 * the last nfs_client on the list.
427 */
428int nfs41_walk_client_list(struct nfs_client *new,
429 struct nfs_client **result,
430 struct rpc_cred *cred)
431{
432 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
433 struct nfs_client *pos, *n, *prev = NULL;
434 int error;
435
436 spin_lock(&nn->nfs_client_lock);
437 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
438 /* If "pos" isn't marked ready, we can't trust the
439 * remaining fields in "pos", especially the client
440 * ID and serverowner fields. Wait for CREATE_SESSION
441 * to finish. */
442 if (pos->cl_cons_state < NFS_CS_READY) {
443 atomic_inc(&pos->cl_count);
444 spin_unlock(&nn->nfs_client_lock);
445
446 if (prev)
447 nfs_put_client(prev);
448 prev = pos;
449
450 error = nfs_wait_client_init_complete(pos);
451 if (error < 0) {
452 nfs_put_client(pos);
453 spin_lock(&nn->nfs_client_lock);
454 continue;
455 }
456
457 spin_lock(&nn->nfs_client_lock);
458 }
459
460 if (pos->rpc_ops != new->rpc_ops)
461 continue;
462
463 if (pos->cl_proto != new->cl_proto)
464 continue;
465
466 if (pos->cl_minorversion != new->cl_minorversion)
467 continue;
468
469 if (!nfs4_match_clientids(pos, new))
470 continue;
471
472 if (!nfs4_match_serverowners(pos, new))
473 continue;
474
475 spin_unlock(&nn->nfs_client_lock);
476 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
477 __func__, pos, atomic_read(&pos->cl_count));
478
479 *result = pos;
480 return 0;
481 }
482
483 /*
484 * No matching nfs_client found. This should be impossible,
485 * because the new nfs_client has already been added to
486 * nfs_client_list by nfs_get_client().
487 *
488 * Don't BUG(), since the caller is holding a mutex.
489 */
490 spin_unlock(&nn->nfs_client_lock);
491 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
492 return -NFS4ERR_STALE_CLIENTID;
493}
494#endif /* CONFIG_NFS_V4_1 */
495
242static void nfs4_destroy_server(struct nfs_server *server) 496static void nfs4_destroy_server(struct nfs_server *server)
243{ 497{
244 nfs_server_return_all_delegations(server); 498 nfs_server_return_all_delegations(server);
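
For context, the nfs4_init_client() hunk near the top of this file's diff shows how the walkers get used: trunking discovery either confirms the new nfs_client or hands back an established one to merge with. Condensed sketch of that call site:

	error = nfs4_discover_server_trunking(clp, &old);
	if (error < 0)
		goto error;
	if (clp != old) {
		/* same server: keep the established client, and make sure
		 * tearing down the duplicate doesn't kill its clientid */
		clp->cl_preserve_clid = true;
		nfs_put_client(clp);
		clp = old;
		atomic_inc(&clp->cl_count);
	}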
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index eb5eb8eef4d3..afddd6639afb 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -95,16 +95,25 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
95 int ret; 95 int ret;
96 struct inode *inode = file->f_path.dentry->d_inode; 96 struct inode *inode = file->f_path.dentry->d_inode;
97 97
98 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 98 do {
99 if (ret != 0) 99 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
100 goto out; 100 if (ret != 0)
101 mutex_lock(&inode->i_mutex); 101 break;
102 ret = nfs_file_fsync_commit(file, start, end, datasync); 102 mutex_lock(&inode->i_mutex);
103 if (!ret && !datasync) 103 ret = nfs_file_fsync_commit(file, start, end, datasync);
104 /* application has asked for meta-data sync */ 104 if (!ret && !datasync)
105 ret = pnfs_layoutcommit_inode(inode, true); 105 /* application has asked for meta-data sync */
106 mutex_unlock(&inode->i_mutex); 106 ret = pnfs_layoutcommit_inode(inode, true);
107out: 107 mutex_unlock(&inode->i_mutex);
108 /*
109 * If nfs_file_fsync_commit detected a server reboot, then
110 * resend all dirty pages that might have been covered by
111 * the NFS_CONTEXT_RESEND_WRITES flag
112 */
113 start = 0;
114 end = LLONG_MAX;
115 } while (ret == -EAGAIN);
116
108 return ret; 117 return ret;
109} 118}
110 119
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 53f94d915bd1..52d847212066 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -190,8 +190,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
190 * i/o and all i/o waiting on the slot table to the MDS until 190 * i/o and all i/o waiting on the slot table to the MDS until
191 * layout is destroyed and a new valid layout is obtained. 191 * layout is destroyed and a new valid layout is obtained.
192 */ 192 */
193 set_bit(NFS_LAYOUT_INVALID,
194 &NFS_I(inode)->layout->plh_flags);
195 pnfs_destroy_layout(NFS_I(inode)); 193 pnfs_destroy_layout(NFS_I(inode));
196 rpc_wake_up(&tbl->slot_tbl_waitq); 194 rpc_wake_up(&tbl->slot_tbl_waitq);
197 goto reset; 195 goto reset;
@@ -205,7 +203,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
205 case -EPIPE: 203 case -EPIPE:
206 dprintk("%s DS connection error %d\n", __func__, 204 dprintk("%s DS connection error %d\n", __func__,
207 task->tk_status); 205 task->tk_status);
208 filelayout_mark_devid_invalid(devid); 206 nfs4_mark_deviceid_unavailable(devid);
209 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); 207 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags);
210 _pnfs_return_layout(inode); 208 _pnfs_return_layout(inode);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 209 rpc_wake_up(&tbl->slot_tbl_waitq);
@@ -269,6 +267,21 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
269 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 267 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
270} 268}
271 269
270bool
271filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
272{
273 return filelayout_test_devid_invalid(node) ||
274 nfs4_test_deviceid_unavailable(node);
275}
276
277static bool
278filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
279{
280 struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg);
281
282 return filelayout_test_devid_unavailable(node);
283}
284
272/* 285/*
273 * Call ops for the async read/write cases 286 * Call ops for the async read/write cases
274 * In the case of dense layouts, the offset needs to be reset to its 287 * In the case of dense layouts, the offset needs to be reset to its
@@ -453,7 +466,7 @@ static void filelayout_commit_release(void *calldata)
453 struct nfs_commit_data *data = calldata; 466 struct nfs_commit_data *data = calldata;
454 467
455 data->completion_ops->completion(data); 468 data->completion_ops->completion(data);
456 put_lseg(data->lseg); 469 pnfs_put_lseg(data->lseg);
457 nfs_put_client(data->ds_clp); 470 nfs_put_client(data->ds_clp);
458 nfs_commitdata_release(data); 471 nfs_commitdata_release(data);
459} 472}
@@ -608,13 +621,13 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
608 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 621 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
609 NFS_SERVER(lo->plh_inode)->nfs_client, id); 622 NFS_SERVER(lo->plh_inode)->nfs_client, id);
610 if (d == NULL) { 623 if (d == NULL) {
611 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); 624 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
612 if (dsaddr == NULL) 625 if (dsaddr == NULL)
613 goto out; 626 goto out;
614 } else 627 } else
615 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); 628 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
616 /* Found deviceid is being reaped */ 629 /* Found deviceid is unavailable */
617 if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) 630 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
618 goto out_put; 631 goto out_put;
619 632
620 fl->dsaddr = dsaddr; 633 fl->dsaddr = dsaddr;
@@ -931,7 +944,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
931 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); 944 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
932 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); 945 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
933 if (status < 0) { 946 if (status < 0) {
934 put_lseg(pgio->pg_lseg); 947 pnfs_put_lseg(pgio->pg_lseg);
935 pgio->pg_lseg = NULL; 948 pgio->pg_lseg = NULL;
936 goto out_mds; 949 goto out_mds;
937 } 950 }
@@ -985,7 +998,7 @@ filelayout_clear_request_commit(struct nfs_page *req,
985out: 998out:
986 nfs_request_remove_commit_list(req, cinfo); 999 nfs_request_remove_commit_list(req, cinfo);
987 spin_unlock(cinfo->lock); 1000 spin_unlock(cinfo->lock);
988 put_lseg(freeme); 1001 pnfs_put_lseg(freeme);
989} 1002}
990 1003
991static struct list_head * 1004static struct list_head *
@@ -1018,7 +1031,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
1018 * off due to a rewrite, in which case it will be done in 1031 * off due to a rewrite, in which case it will be done in
1019 * filelayout_clear_request_commit 1032 * filelayout_clear_request_commit
1020 */ 1033 */
1021 buckets[i].wlseg = get_lseg(lseg); 1034 buckets[i].wlseg = pnfs_get_lseg(lseg);
1022 } 1035 }
1023 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1036 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1024 cinfo->ds->nwritten++; 1037 cinfo->ds->nwritten++;
@@ -1128,7 +1141,7 @@ filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1128 if (list_empty(src)) 1141 if (list_empty(src))
1129 bucket->wlseg = NULL; 1142 bucket->wlseg = NULL;
1130 else 1143 else
1131 get_lseg(bucket->clseg); 1144 pnfs_get_lseg(bucket->clseg);
1132 } 1145 }
1133 return ret; 1146 return ret;
1134} 1147}
@@ -1159,12 +1172,12 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
1159 1172
1160 /* NOTE cinfo->lock is NOT held, relying on fact that this is 1173 /* NOTE cinfo->lock is NOT held, relying on fact that this is
1161 * only called on single thread per dreq. 1174 * only called on single thread per dreq.
1162 * Can't take the lock because need to do put_lseg 1175 * Can't take the lock because need to do pnfs_put_lseg
1163 */ 1176 */
1164 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 1177 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1165 if (transfer_commit_list(&b->written, dst, cinfo, 0)) { 1178 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1166 BUG_ON(!list_empty(&b->written)); 1179 BUG_ON(!list_empty(&b->written));
1167 put_lseg(b->wlseg); 1180 pnfs_put_lseg(b->wlseg);
1168 b->wlseg = NULL; 1181 b->wlseg = NULL;
1169 } 1182 }
1170 } 1183 }
@@ -1200,7 +1213,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1200 if (list_empty(&bucket->committing)) 1213 if (list_empty(&bucket->committing))
1201 continue; 1214 continue;
1202 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); 1215 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1203 put_lseg(bucket->clseg); 1216 pnfs_put_lseg(bucket->clseg);
1204 bucket->clseg = NULL; 1217 bucket->clseg = NULL;
1205 } 1218 }
1206 /* Caller will clean up entries put on list */ 1219 /* Caller will clean up entries put on list */
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 43fe802dd678..dca47d786710 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -129,23 +129,13 @@ filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
129} 129}
130 130
131static inline bool 131static inline bool
132filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo)
133{
134 return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags);
135}
136
137static inline bool
138filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 132filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
139{ 133{
140 return test_bit(NFS_DEVICEID_INVALID, &node->flags); 134 return test_bit(NFS_DEVICEID_INVALID, &node->flags);
141} 135}
142 136
143static inline bool 137extern bool
144filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) 138filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
145{
146 return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) ||
147 filelayout_test_layout_invalid(lseg->pls_layout);
148}
149 139
150extern struct nfs_fh * 140extern struct nfs_fh *
151nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 141nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
@@ -158,7 +148,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
158extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 148extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
159extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 149extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
160struct nfs4_file_layout_dsaddr * 150struct nfs4_file_layout_dsaddr *
161get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 151filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
162void nfs4_ds_disconnect(struct nfs_client *clp); 152void nfs4_ds_disconnect(struct nfs_client *clp);
163 153
164#endif /* FS_NFS_NFS4FILELAYOUT_H */ 154#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f81231f30d94..3336d5eaf879 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -690,7 +690,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
690 * of available devices, and return it. 690 * of available devices, and return it.
691 */ 691 */
692struct nfs4_file_layout_dsaddr * 692struct nfs4_file_layout_dsaddr *
693get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 693filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
694{ 694{
695 struct pnfs_device *pdev = NULL; 695 struct pnfs_device *pdev = NULL;
696 u32 max_resp_sz; 696 u32 max_resp_sz;
@@ -804,13 +804,14 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
804 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 804 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
805 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 805 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
806 806
807 if (filelayout_test_devid_invalid(devid)) 807 if (filelayout_test_devid_unavailable(devid))
808 return NULL; 808 return NULL;
809 809
810 if (ds == NULL) { 810 if (ds == NULL) {
811 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 811 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
812 __func__, ds_idx); 812 __func__, ds_idx);
813 goto mark_dev_invalid; 813 filelayout_mark_devid_invalid(devid);
814 return NULL;
814 } 815 }
815 816
816 if (!ds->ds_clp) { 817 if (!ds->ds_clp) {
@@ -818,14 +819,12 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
818 int err; 819 int err;
819 820
820 err = nfs4_ds_connect(s, ds); 821 err = nfs4_ds_connect(s, ds);
821 if (err) 822 if (err) {
822 goto mark_dev_invalid; 823 nfs4_mark_deviceid_unavailable(devid);
824 return NULL;
825 }
823 } 826 }
824 return ds; 827 return ds;
825
826mark_dev_invalid:
827 filelayout_mark_devid_invalid(devid);
828 return NULL;
829} 828}
830 829
831module_param(dataserver_retrans, uint, 0644); 830module_param(dataserver_retrans, uint, 0644);
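
nfs4_fl_prepare_ds() now distinguishes two failure classes instead of funneling both through the old mark_dev_invalid label: a missing data-server entry still marks the deviceid invalid (sticky), while a connect failure only marks it unavailable, a separate state tracked by the generic deviceid cache via nfs4_test_deviceid_unavailable(), leaving room to retry the DS later. Condensed sketch of the patched tail of the function:

	if (ds == NULL) {
		filelayout_mark_devid_invalid(devid);	/* malformed device */
		return NULL;
	}
	if (!ds->ds_clp && nfs4_ds_connect(s, ds)) {
		nfs4_mark_deviceid_unavailable(devid);	/* transient failure */
		return NULL;
	}
	return ds;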
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 017b4b01a69c..79fbb61ce202 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -192,25 +192,13 @@ out:
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
 					struct qstr *name)
 {
-	struct rpc_clnt *clone;
-	struct rpc_auth *auth;
 	rpc_authflavor_t flavor;
 
 	flavor = nfs4_negotiate_security(inode, name);
 	if ((int)flavor < 0)
-		return ERR_PTR(flavor);
+		return ERR_PTR((int)flavor);
 
-	clone = rpc_clone_client(clnt);
-	if (IS_ERR(clone))
-		return clone;
-
-	auth = rpcauth_create(flavor, clone);
-	if (!auth) {
-		rpc_shutdown_client(clone);
-		clone = ERR_PTR(-EIO);
-	}
-
-	return clone;
+	return rpc_clone_client_set_auth(clnt, flavor);
 }
 
 static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
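nfs4_create_sec_client() now leans entirely on the kernel's error-pointer idiom: a failed security negotiation is returned as ERR_PTR((int)flavor), and the old clone-then-rpcauth_create sequence collapses into a single rpc_clone_client_set_auth() call whose own ERR_PTR result propagates to the caller unchanged. A freestanding sketch of that idiom, with userspace re-implementations of the err.h helpers (the struct client and clone function are invented for illustration):

    #include <stdio.h>

    /* Minimal userspace re-implementation of the kernel's err.h helpers. */
    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct client { int flavor; };

    static struct client *clone_client_set_auth(struct client *base, int flavor)
    {
            static struct client clone;

            if (flavor < 0)                 /* negotiation failed */
                    return ERR_PTR(flavor); /* error travels in the pointer */
            clone = *base;
            clone.flavor = flavor;
            return &clone;
    }

    int main(void)
    {
            struct client base = { 0 };
            struct client *c = clone_client_set_auth(&base, -13 /* -EACCES */);

            if (IS_ERR(c))
                    printf("clone failed: %ld\n", PTR_ERR(c));
            return 0;
    }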
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1e50326d00dd..68b21d81b7ac 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -104,6 +104,8 @@ static int nfs4_map_errors(int err)
 		return -EACCES;
 	case -NFS4ERR_MINOR_VERS_MISMATCH:
 		return -EPROTONOSUPPORT;
+	case -NFS4ERR_ACCESS:
+		return -EACCES;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
@@ -150,6 +152,12 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
 	FATTR4_WORD2_MDSTHRESHOLD
 };
 
+static const u32 nfs4_open_noattr_bitmap[3] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_FILEID,
+};
+
 const u32 nfs4_statfs_bitmap[2] = {
 	FATTR4_WORD0_FILES_AVAIL
 	| FATTR4_WORD0_FILES_FREE
@@ -832,6 +840,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
+	p->o_res.access_request = p->o_arg.access;
 	nfs_fattr_init(&p->f_attr);
 	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
 }
@@ -860,6 +869,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.fh = NFS_FH(dir);
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
+	 * will return permission denied for all bits until close */
+	if (!(flags & O_EXCL)) {
+		/* ask server to check for all possible rights as results
+		 * are cached */
+		p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
+				  NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE;
+	}
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
 	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
@@ -1115,11 +1132,80 @@ out_return_state:
 	return state;
 }
 
-static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+static void
+nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
+{
+	struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
+	struct nfs_delegation *delegation;
+	int delegation_flags = 0;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation)
+		delegation_flags = delegation->flags;
+	rcu_read_unlock();
+	if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+		pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
+				   "returning a delegation for "
+				   "OPEN(CLAIM_DELEGATE_CUR)\n",
+				   clp->cl_hostname);
+	} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+		nfs_inode_set_delegation(state->inode,
+					 data->owner->so_cred,
+					 &data->o_res);
+	else
+		nfs_inode_reclaim_delegation(state->inode,
+					     data->owner->so_cred,
+					     &data->o_res);
+}
+
+/*
+ * Check the inode attributes against the CLAIM_PREVIOUS returned attributes
+ * and update the nfs4_state.
+ */
+static struct nfs4_state *
+_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
+{
+	struct inode *inode = data->state->inode;
+	struct nfs4_state *state = data->state;
+	int ret;
+
+	if (!data->rpc_done) {
+		ret = data->rpc_status;
+		goto err;
+	}
+
+	ret = -ESTALE;
+	if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
+	    !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
+	    !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
+		goto err;
+
+	ret = -ENOMEM;
+	state = nfs4_get_open_state(inode, data->owner);
+	if (state == NULL)
+		goto err;
+
+	ret = nfs_refresh_inode(inode, &data->f_attr);
+	if (ret)
+		goto err;
+
+	if (data->o_res.delegation_type != 0)
+		nfs4_opendata_check_deleg(data, state);
+	update_open_stateid(state, &data->o_res.stateid, NULL,
+			    data->o_arg.fmode);
+
+	return state;
+err:
+	return ERR_PTR(ret);
+
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 {
 	struct inode *inode;
 	struct nfs4_state *state = NULL;
-	struct nfs_delegation *delegation;
 	int ret;
 
 	if (!data->rpc_done) {
@@ -1138,30 +1224,8 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	state = nfs4_get_open_state(inode, data->owner);
 	if (state == NULL)
 		goto err_put_inode;
-	if (data->o_res.delegation_type != 0) {
-		struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
-		int delegation_flags = 0;
-
-		rcu_read_lock();
-		delegation = rcu_dereference(NFS_I(inode)->delegation);
-		if (delegation)
-			delegation_flags = delegation->flags;
-		rcu_read_unlock();
-		if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
-			pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
-					"returning a delegation for "
-					"OPEN(CLAIM_DELEGATE_CUR)\n",
-					clp->cl_hostname);
-		} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
-			nfs_inode_set_delegation(state->inode,
-					data->owner->so_cred,
-					&data->o_res);
-		else
-			nfs_inode_reclaim_delegation(state->inode,
-					data->owner->so_cred,
-					&data->o_res);
-	}
-
+	if (data->o_res.delegation_type != 0)
+		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			data->o_arg.fmode);
 	iput(inode);
@@ -1173,6 +1237,14 @@ err:
 	return ERR_PTR(ret);
 }
 
+static struct nfs4_state *
+nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
+		return _nfs4_opendata_reclaim_to_nfs4_state(data);
+	return _nfs4_opendata_to_nfs4_state(data);
+}
+
 static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
@@ -1494,6 +1566,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
@@ -1526,7 +1599,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 		return;
 
 	if (task->tk_status == 0) {
-		switch (data->o_res.f_attr->mode & S_IFMT) {
+		if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) {
+			switch (data->o_res.f_attr->mode & S_IFMT) {
 			case S_IFREG:
 				break;
 			case S_IFLNK:
@@ -1537,6 +1611,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 				break;
 			default:
 				data->rpc_status = -ENOTDIR;
+			}
 		}
 		renew_lease(data->o_res.server, data->timestamp);
 		if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
@@ -1643,6 +1718,39 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
 	return status;
 }
 
+static int nfs4_opendata_access(struct rpc_cred *cred,
+				struct nfs4_opendata *opendata,
+				struct nfs4_state *state, fmode_t fmode)
+{
+	struct nfs_access_entry cache;
+	u32 mask;
+
+	/* access call failed or for some reason the server doesn't
+	 * support any access modes -- defer access call until later */
+	if (opendata->o_res.access_supported == 0)
+		return 0;
+
+	mask = 0;
+	/* don't check MAY_WRITE - a newly created file may not have
+	 * write mode bits, but POSIX allows the creating process to write */
+	if (fmode & FMODE_READ)
+		mask |= MAY_READ;
+	if (fmode & FMODE_EXEC)
+		mask |= MAY_EXEC;
+
+	cache.cred = cred;
+	cache.jiffies = jiffies;
+	nfs_access_set_mask(&cache, opendata->o_res.access_result);
+	nfs_access_add_cache(state->inode, &cache);
+
+	if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0)
+		return 0;
+
+	/* even though OPEN succeeded, access is denied. Close the file */
+	nfs4_close_state(state, fmode);
+	return -NFS4ERR_ACCESS;
+}
+
 /*
  * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
  */
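nfs4_opendata_access() above consumes the ACCESS results piggybacked on the OPEN reply: the full server-granted mask is cached for later permission checks, and only the bits this opener actually needs are verified now — read and exec, never write, since a freshly created file may legitimately lack write mode bits. A self-contained sketch of just that mask test (the constant values are copied here only for illustration; the authoritative ones live in the kernel headers):

    #include <stdio.h>

    /* Illustrative constants; the kernel's real values are in <linux/fs.h>. */
    #define MAY_EXEC  0x01
    #define MAY_WRITE 0x02
    #define MAY_READ  0x04

    #define FMODE_READ  0x1
    #define FMODE_WRITE 0x2
    #define FMODE_EXEC  0x20

    /* Returns 0 if the granted access bits satisfy the open mode, -1 otherwise. */
    static int opendata_access_ok(unsigned granted_mask, unsigned fmode)
    {
            unsigned mask = 0;

            if (fmode & FMODE_READ)
                    mask |= MAY_READ;
            if (fmode & FMODE_EXEC)
                    mask |= MAY_EXEC;
            /* deliberately no MAY_WRITE check -- see the comment in the hunk */

            return (mask & ~granted_mask & (MAY_READ | MAY_EXEC)) == 0 ? 0 : -1;
    }

    int main(void)
    {
            /* server granted read but not exec; opener wanted read+exec */
            printf("%d\n", opendata_access_ok(MAY_READ, FMODE_READ | FMODE_EXEC));
            return 0;
    }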
@@ -1774,7 +1882,11 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
 			nfs41_free_stateid(server, stateid);
+		nfs_remove_bad_delegation(state->inode);
 
+		write_seqlock(&state->seqlock);
+		nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+		write_sequnlock(&state->seqlock);
 		clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 }
@@ -1790,7 +1902,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 static int nfs41_check_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	nfs4_stateid *stateid = &state->stateid;
+	nfs4_stateid *stateid = &state->open_stateid;
 	int status;
 
 	/* If a state reset has been done, test_stateid is unneeded */
@@ -1896,6 +2008,10 @@ static int _nfs4_do_open(struct inode *dir,
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 
+	status = nfs4_opendata_access(cred, opendata, state, fmode);
+	if (status != 0)
+		goto err_opendata_put;
+
 	if (opendata->o_arg.open_flags & O_EXCL) {
 		nfs4_exclusive_attrset(opendata, sattr);
 
@@ -1941,7 +2057,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 	struct nfs4_state *res;
 	int status;
 
-	fmode &= FMODE_READ|FMODE_WRITE;
+	fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
 	do {
 		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
 				       &res, ctx_th);
@@ -2013,8 +2129,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	nfs_fattr_init(fattr);
 
 	if (state != NULL) {
+		struct nfs_lockowner lockowner = {
+			.l_owner = current->files,
+			.l_pid = current->tgid,
+		};
 		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
-				current->files, current->tgid);
+				&lockowner);
 	} else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
 				FMODE_WRITE)) {
 		/* Use that stateid */
@@ -2133,6 +2253,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
+	struct inode *inode = calldata->inode;
 	int call_close = 0;
 
 	dprintk("%s: begin!\n", __func__);
@@ -2166,16 +2287,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (calldata->arg.fmode == 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
 		if (calldata->roc &&
-		    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
-			rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
-					task, NULL);
+		    pnfs_roc_drain(inode, &calldata->roc_barrier, task))
 			goto out;
-		}
 	}
 
 	nfs_fattr_init(calldata->res.fattr);
 	calldata->timestamp = jiffies;
-	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(inode),
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res,
 				task))
@@ -2202,7 +2320,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
  *
  * NOTE: Caller must be holding the sp->so_owner semaphore!
  */
-int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
+int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_closedata *calldata;
@@ -2238,7 +2356,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
-	calldata->roc = roc;
+	calldata->roc = pnfs_roc(state->inode);
 	nfs_sb_active(calldata->inode->i_sb);
 
 	msg.rpc_argp = &calldata->arg;
@@ -2255,8 +2373,6 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 out_free_calldata:
 	kfree(calldata);
 out:
-	if (roc)
-		pnfs_roc_release(state->inode);
 	nfs4_put_open_state(state);
 	nfs4_put_state_owner(sp);
 	return status;
@@ -2399,7 +2515,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
 	int ret;
 
 	auth = rpcauth_create(flavor, server->client);
-	if (!auth) {
+	if (IS_ERR(auth)) {
 		ret = -EIO;
 		goto out;
 	}
@@ -2767,13 +2883,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (!status) {
-		entry->mask = 0;
-		if (res.access & NFS4_ACCESS_READ)
-			entry->mask |= MAY_READ;
-		if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
-			entry->mask |= MAY_WRITE;
-		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
-			entry->mask |= MAY_EXEC;
+		nfs_access_set_mask(entry, res.access);
 		nfs_refresh_inode(inode, res.fattr);
 	}
 	nfs_free_fattr(res.fattr);
@@ -3362,8 +3472,11 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
 
 	nfs_fattr_init(fsinfo->fattr);
 	error = nfs4_do_fsinfo(server, fhandle, fsinfo);
-	if (error == 0)
+	if (error == 0) {
+		/* block layout checks this! */
+		server->pnfs_blksize = fsinfo->blksize;
 		set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+	}
 
 	return error;
 }
@@ -4007,6 +4120,36 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
+static unsigned int
+nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
+				   char *buf, size_t len)
+{
+	unsigned int result;
+
+	rcu_read_lock();
+	result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
+				clp->cl_ipaddr,
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_ADDR),
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_PROTO));
+	rcu_read_unlock();
+	return result;
+}
+
+static unsigned int
+nfs4_init_uniform_client_string(const struct nfs_client *clp,
+				char *buf, size_t len)
+{
+	char *nodename = clp->cl_rpcclient->cl_nodename;
+
+	if (nfs4_client_id_uniquifier[0] != '\0')
+		nodename = nfs4_client_id_uniquifier;
+	return scnprintf(buf, len, "Linux NFSv%u.%u %s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			nodename);
+}
+
 /**
  * nfs4_proc_setclientid - Negotiate client ID
  * @clp: state data structure
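Both helpers above build the nfs_client_id4 string with scnprintf(), which (unlike a raw snprintf) returns the number of characters actually written to the buffer, never more than it holds — exactly the value sc_name_len needs. A rough userspace approximation of the uniform variant, clamping snprintf's return the way scnprintf does (the function name and parameters here are invented for the demo):

    #include <stdio.h>

    /* scnprintf-like clamp: never report more than what fits in buf. */
    static unsigned int init_uniform_client_string(char *buf, size_t len,
                                                   unsigned v, unsigned minor,
                                                   const char *nodename)
    {
            int n = snprintf(buf, len, "Linux NFSv%u.%u %s", v, minor, nodename);

            if (n < 0)
                    return 0;
            return (size_t)n < len ? (unsigned int)n : (unsigned int)(len - 1);
    }

    int main(void)
    {
            char name[64];
            unsigned int n = init_uniform_client_string(name, sizeof(name),
                                                        4, 1, "client.example");

            printf("%u: %s\n", n, name);
            return 0;
    }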
@@ -4037,15 +4180,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
 	/* nfs_client_id4 */
 	nfs4_init_boot_verifier(clp, &sc_verifier);
-	rcu_read_lock();
-	setclientid.sc_name_len = scnprintf(setclientid.sc_name,
-			sizeof(setclientid.sc_name), "%s/%s %s",
-			clp->cl_ipaddr,
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR),
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_PROTO));
+	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+		setclientid.sc_name_len =
+			nfs4_init_uniform_client_string(clp,
+					setclientid.sc_name,
+					sizeof(setclientid.sc_name));
+	else
+		setclientid.sc_name_len =
+			nfs4_init_nonuniform_client_string(clp,
+					setclientid.sc_name,
+					sizeof(setclientid.sc_name));
 	/* cb_client4 */
+	rcu_read_lock();
 	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
 				sizeof(setclientid.sc_netid),
 				rpc_peeraddr2str(clp->cl_rpcclient,
@@ -4391,7 +4537,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
-	if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
+	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
 		task->tk_action = NULL;
 		return;
@@ -4585,7 +4731,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 	}
 	if (data->rpc_status == 0) {
 		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
-		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
+		set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags);
 		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
 	}
 out:
@@ -4632,7 +4778,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
 	case -NFS4ERR_BAD_STATEID:
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
 		if (new_lock_owner != 0 ||
-		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+		    test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0)
 			nfs4_schedule_stateid_recovery(server, lsp->ls_state);
 		break;
 	case -NFS4ERR_STALE_STATEID:
@@ -4756,7 +4902,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 	struct nfs_server *server = NFS_SERVER(state->inode);
 
 	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
-		if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
 			status = nfs41_test_stateid(server, &lsp->ls_stateid);
 			if (status != NFS_OK) {
 				/* Free the stateid unless the server
@@ -4764,7 +4910,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 				if (status != -NFS4ERR_BAD_STATEID)
 					nfs41_free_stateid(server,
 							&lsp->ls_stateid);
-				lsp->ls_flags &= ~NFS_LOCK_INITIALIZED;
+				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
 				ret = status;
 			}
 		}
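These hunks, and several more in nfs4state.c below, convert ls_flags from open-coded bitmask arithmetic (|=, &= ~, &) to the kernel's set_bit/clear_bit/test_bit API; those helpers update the flag word atomically, so concurrent writers no longer need a lock just to flip a flag. A compile-alone sketch of the same conversion, substituting compiler atomic builtins for the kernel's arch-specific implementations:

    #include <stdio.h>

    #define NFS_LOCK_INITIALIZED 0  /* bit number, as in the kernel enum */

    /* Stand-ins for the kernel's set_bit/clear_bit/test_bit, built on
     * GCC/Clang atomic builtins rather than arch-specific instructions. */
    static void set_bit(int nr, unsigned long *addr)
    {
            __atomic_fetch_or(addr, 1UL << nr, __ATOMIC_RELAXED);
    }
    static void clear_bit(int nr, unsigned long *addr)
    {
            __atomic_fetch_and(addr, ~(1UL << nr), __ATOMIC_RELAXED);
    }
    static int test_bit(int nr, const unsigned long *addr)
    {
            return (__atomic_load_n(addr, __ATOMIC_RELAXED) >> nr) & 1;
    }

    int main(void)
    {
            unsigned long ls_flags = 0;

            set_bit(NFS_LOCK_INITIALIZED, &ls_flags);   /* was: flags |= ...  */
            printf("%d\n", test_bit(NFS_LOCK_INITIALIZED, &ls_flags));
            clear_bit(NFS_LOCK_INITIALIZED, &ls_flags); /* was: flags &= ~... */
            printf("%d\n", test_bit(NFS_LOCK_INITIALIZED, &ls_flags));
            return 0;
    }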
@@ -5267,10 +5413,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	};
 
 	nfs4_init_boot_verifier(clp, &verifier);
-	args.id_len = scnprintf(args.id, sizeof(args.id),
-				"%s/%s",
-				clp->cl_ipaddr,
-				clp->cl_rpcclient->cl_nodename);
+	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
+						      sizeof(args.id));
 	dprintk("NFS call exchange_id auth=%s, '%.*s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		args.id_len, args.id);
@@ -5391,6 +5535,8 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
 		goto out;
 	if (clp->cl_exchange_flags == 0)
 		goto out;
+	if (clp->cl_preserve_clid)
+		goto out;
 	cred = nfs4_get_exchange_id_cred(clp);
 	ret = nfs4_proc_destroy_clientid(clp, cred);
 	if (cred)
@@ -6196,26 +6342,44 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
-	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *lo;
+	struct nfs4_state *state = NULL;
 
 	dprintk("--> %s\n", __func__);
 
 	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
-		return;
+		goto out;
 
 	switch (task->tk_status) {
 	case 0:
-		break;
+		goto out;
 	case -NFS4ERR_LAYOUTTRYLATER:
 	case -NFS4ERR_RECALLCONFLICT:
 		task->tk_status = -NFS4ERR_DELAY;
-		/* Fall through */
-	default:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
-			rpc_restart_call_prepare(task);
-			return;
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_BAD_STATEID:
+		spin_lock(&inode->i_lock);
+		lo = NFS_I(inode)->layout;
+		if (!lo || list_empty(&lo->plh_segs)) {
+			spin_unlock(&inode->i_lock);
+			/* If the open stateid was bad, then recover it. */
+			state = lgp->args.ctx->state;
+		} else {
+			LIST_HEAD(head);
+
+			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+			spin_unlock(&inode->i_lock);
+			/* Mark the bad layout state as invalid, then
+			 * retry using the open stateid. */
+			pnfs_free_lseg_list(&head);
 		}
 	}
+	if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+		rpc_restart_call_prepare(task);
+out:
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -6282,7 +6446,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 	.rpc_release = nfs4_layoutget_release,
 };
 
-void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 {
 	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
 	size_t max_pages = max_response_pages(server);
@@ -6299,6 +6464,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 		.callback_data = lgp,
 		.flags = RPC_TASK_ASYNC,
 	};
+	struct pnfs_layout_segment *lseg = NULL;
 	int status = 0;
 
 	dprintk("--> %s\n", __func__);
@@ -6306,7 +6472,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
 	if (!lgp->args.layout.pages) {
 		nfs4_layoutget_release(lgp);
-		return;
+		return ERR_PTR(-ENOMEM);
 	}
 	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
 
@@ -6315,15 +6481,17 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
-		return;
+		return ERR_CAST(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
 	if (status == 0)
 		status = task->tk_status;
 	if (status == 0)
-		status = pnfs_layout_process(lgp);
+		lseg = pnfs_layout_process(lgp);
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
-	return;
+	if (status)
+		return ERR_PTR(status);
+	return lseg;
 }
 
 static void
@@ -6342,7 +6510,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutreturn *lrp = calldata;
 	struct nfs_server *server;
-	struct pnfs_layout_hdr *lo = lrp->args.layout;
 
 	dprintk("--> %s\n", __func__);
 
@@ -6354,20 +6521,21 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call_prepare(task);
 		return;
 	}
-	spin_lock(&lo->plh_inode->i_lock);
-	if (task->tk_status == 0 && lrp->res.lrs_present)
-		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-	lo->plh_block_lgets--;
-	spin_unlock(&lo->plh_inode->i_lock);
 	dprintk("<-- %s\n", __func__);
 }
 
 static void nfs4_layoutreturn_release(void *calldata)
 {
 	struct nfs4_layoutreturn *lrp = calldata;
+	struct pnfs_layout_hdr *lo = lrp->args.layout;
 
 	dprintk("--> %s\n", __func__);
-	put_layout_hdr(lrp->args.layout);
+	spin_lock(&lo->plh_inode->i_lock);
+	if (lrp->res.lrs_present)
+		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	lo->plh_block_lgets--;
+	spin_unlock(&lo->plh_inode->i_lock);
+	pnfs_put_layout_hdr(lrp->args.layout);
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
 }
@@ -6541,7 +6709,7 @@ static void nfs4_layoutcommit_release(void *calldata)
 		list_del_init(&lseg->pls_lc_list);
 		if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
 				       &lseg->pls_flags))
-			put_lseg(lseg);
+			pnfs_put_lseg(lseg);
 	}
 
 	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
@@ -6800,6 +6968,7 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
 	.recover_lock	= nfs4_lock_reclaim,
 	.establish_clid = nfs4_init_clientid,
 	.get_clid_cred	= nfs4_get_setclientid_cred,
+	.detect_trunking = nfs40_discover_server_trunking,
 };
 
 #if defined(CONFIG_NFS_V4_1)
@@ -6811,6 +6980,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 	.establish_clid = nfs41_init_clientid,
 	.get_clid_cred	= nfs4_get_exchange_id_cred,
 	.reclaim_complete = nfs41_proc_reclaim_complete,
+	.detect_trunking = nfs41_discover_server_trunking,
 };
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 55148def5540..c351e6b39838 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -51,18 +51,21 @@
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
 
+#include <linux/sunrpc/clnt.h>
+
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
 #include "pnfs.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY		NFSDBG_STATE
 
 #define OPENOWNER_POOL_SIZE 8
 
 const nfs4_stateid zero_stateid;
-
+static DEFINE_MUTEX(nfs_clid_init_mutex);
 static LIST_HEAD(nfs4_clientid_list);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -73,12 +76,13 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	};
 	unsigned short port;
 	int status;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
 		goto do_confirm;
-	port = nfs_callback_tcpport;
+	port = nn->nfs_callback_tcpport;
 	if (clp->cl_addr.ss_family == AF_INET6)
-		port = nfs_callback_tcpport6;
+		port = nn->nfs_callback_tcpport6;
 
 	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
 	if (status != 0)
@@ -96,6 +100,56 @@ out:
 	return status;
 }
 
+/**
+ * nfs40_discover_server_trunking - Detect server IP address trunking (mv0)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   struct rpc_cred *cred)
+{
+	struct nfs4_setclientid_res clid = {
+		.clientid = clp->cl_clientid,
+		.confirm = clp->cl_confirm,
+	};
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	unsigned short port;
+	int status;
+
+	port = nn->nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nn->nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	clp->cl_confirm = clid.confirm;
+
+	status = nfs40_walk_client_list(clp, result, cred);
+	switch (status) {
+	case -NFS4ERR_STALE_CLIENTID:
+		set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	case 0:
+		/* Sustain the lease, even if it's empty.  If the clientid4
+		 * goes stale it's of no use for trunking discovery. */
+		nfs4_schedule_state_renewal(*result);
+		break;
+	}
+
+out:
+	return status;
+}
+
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
 {
 	struct rpc_cred *cred = NULL;
@@ -275,6 +329,33 @@ out:
 	return status;
 }
 
+/**
+ * nfs41_discover_server_trunking - Detect server IP address trunking (mv1)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status.
+ * If NFS4_OK is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   struct rpc_cred *cred)
+{
+	int status;
+
+	status = nfs4_proc_exchange_id(clp, cred);
+	if (status != NFS4_OK)
+		return status;
+	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+
+	return nfs41_walk_client_list(clp, result, cred);
+}
+
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
@@ -729,11 +810,8 @@ static void __nfs4_close(struct nfs4_state *state,
 	if (!call_close) {
 		nfs4_put_open_state(state);
 		nfs4_put_state_owner(owner);
-	} else {
-		bool roc = pnfs_roc(state->inode);
-
-		nfs4_do_close(state, gfp_mask, wait, roc);
-	}
+	} else
+		nfs4_do_close(state, gfp_mask, wait);
 }
 
 void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
@@ -865,7 +943,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
-	if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
 		if (nfs4_release_lockowner(lsp) == 0)
 			return;
 	}
@@ -911,17 +989,25 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 }
 
 static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
-		fl_owner_t fl_owner, pid_t fl_pid)
+		const struct nfs_lockowner *lockowner)
 {
 	struct nfs4_lock_state *lsp;
+	fl_owner_t fl_owner;
+	pid_t fl_pid;
 	bool ret = false;
 
+
+	if (lockowner == NULL)
+		goto out;
+
 	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
 		goto out;
 
+	fl_owner = lockowner->l_owner;
+	fl_pid = lockowner->l_pid;
 	spin_lock(&state->state_lock);
 	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
-	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) {
+	if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
 		nfs4_stateid_copy(dst, &lsp->ls_stateid);
 		ret = true;
 	}
@@ -946,11 +1032,11 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
  * requests.
  */
 void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
-	fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid)
+	fmode_t fmode, const struct nfs_lockowner *lockowner)
 {
 	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
 		return;
-	if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid))
+	if (nfs4_copy_lock_stateid(dst, state, lockowner))
 		return;
 	nfs4_copy_open_stateid(dst, state);
 }
@@ -1289,7 +1375,7 @@ restart:
 		if (status >= 0) {
 			spin_lock(&state->state_lock);
 			list_for_each_entry(lock, &state->lock_states, ls_locks) {
-				if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
+				if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
 					pr_warn_ratelimited("NFS: "
 						"%s: Lock reclaim "
 						"failed!\n", __func__);
@@ -1361,7 +1447,7 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
 	spin_lock(&state->state_lock);
 	list_for_each_entry(lock, &state->lock_states, ls_locks) {
 		lock->ls_seqid.flags = 0;
-		lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
+		clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);
 	}
 	spin_unlock(&state->state_lock);
 }
@@ -1595,8 +1681,8 @@ out:
 	return nfs4_recovery_handle_error(clp, status);
 }
 
-/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
- * on EXCHANGE_ID for v4.1
+/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors
+ * and for recoverable errors on EXCHANGE_ID for v4.1
  */
 static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 {
@@ -1606,8 +1692,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 		return -ESERVERFAULT;
 		/* Lease confirmation error: retry after purging the lease */
 		ssleep(1);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		break;
 	case -NFS4ERR_STALE_CLIENTID:
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		nfs4_state_clear_reclaim_reboot(clp);
+		nfs4_state_start_reclaim_reboot(clp);
 		break;
 	case -NFS4ERR_CLID_INUSE:
 		pr_err("NFS: Server %s reports our clientid is in use\n",
@@ -1698,6 +1788,109 @@ static int nfs4_purge_lease(struct nfs_client *clp)
 	return 0;
 }
 
+/**
+ * nfs4_discover_server_trunking - Detect server IP address trunking
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ *
+ * Returns zero or a negative errno. If zero is returned,
+ * an nfs_client pointer is planted in "result".
+ *
+ * Note: since we are invoked in process context, and
+ * not from inside the state manager, we cannot use
+ * nfs4_handle_reclaim_lease_error().
+ */
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+				  struct nfs_client **result)
+{
+	const struct nfs4_state_recovery_ops *ops =
+				clp->cl_mvops->reboot_recovery_ops;
+	rpc_authflavor_t *flavors, flav, save;
+	struct rpc_clnt *clnt;
+	struct rpc_cred *cred;
+	int i, len, status;
+
+	dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
+
+	len = NFS_MAX_SECFLAVORS;
+	flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL);
+	if (flavors == NULL) {
+		status = -ENOMEM;
+		goto out;
+	}
+	len = rpcauth_list_flavors(flavors, len);
+	if (len < 0) {
+		status = len;
+		goto out_free;
+	}
+	clnt = clp->cl_rpcclient;
+	save = clnt->cl_auth->au_flavor;
+	i = 0;
+
+	mutex_lock(&nfs_clid_init_mutex);
+	status = -ENOENT;
+again:
+	cred = ops->get_clid_cred(clp);
+	if (cred == NULL)
+		goto out_unlock;
+
+	status = ops->detect_trunking(clp, result, cred);
+	put_rpccred(cred);
+	switch (status) {
+	case 0:
+		break;
+
+	case -EACCES:
+		if (clp->cl_machine_cred == NULL)
+			break;
+		/* Handle case where the user hasn't set up machine creds */
+		nfs4_clear_machine_cred(clp);
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EAGAIN:
+		ssleep(1);
+		dprintk("NFS: %s after status %d, retrying\n",
+			__func__, status);
+		goto again;
+
+	case -NFS4ERR_CLID_INUSE:
+	case -NFS4ERR_WRONGSEC:
+		status = -EPERM;
+		if (i >= len)
+			break;
+
+		flav = flavors[i++];
+		if (flav == save)
+			flav = flavors[i++];
+		clnt = rpc_clone_client_set_auth(clnt, flav);
+		if (IS_ERR(clnt)) {
+			status = PTR_ERR(clnt);
+			break;
+		}
+		clp->cl_rpcclient = clnt;
+		goto again;
+
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		status = -EPROTONOSUPPORT;
+		break;
+
+	case -EKEYEXPIRED:
+		nfs4_warn_keyexpired(clp->cl_hostname);
+	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+				 * in nfs4_exchange_id */
+		status = -EKEYEXPIRED;
+	}
+
+out_unlock:
+	mutex_unlock(&nfs_clid_init_mutex);
+out_free:
+	kfree(flavors);
+out:
+	dprintk("NFS: %s: status = %d\n", __func__, status);
+	return status;
+}
+
 #ifdef CONFIG_NFS_V4_1
 void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
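nfs4_discover_server_trunking() above is meant to be called from ordinary process context (not the state manager), serializes concurrent callers with nfs_clid_init_mutex, and walks the available auth flavors when the server rejects the current one. The mount path that calls it is outside this diff, so the following caller is only a hedged sketch of the intended usage — example_init_client is an invented name, while ERR_PTR and nfs_put_client are real kernel helpers:

    /* Hypothetical caller: detect trunking, then adopt the nfs_client that
     * already owns this server's clientid if one was found. */
    static struct nfs_client *example_init_client(struct nfs_client *clp)
    {
            struct nfs_client *old;
            int error;

            error = nfs4_discover_server_trunking(clp, &old);
            if (error < 0)
                    return ERR_PTR(error);
            if (old != clp) {
                    /* "old" is already established; drop the new client */
                    nfs_put_client(clp);
                    return old;
            }
            return clp;
    }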
@@ -2008,6 +2201,7 @@ out_error:
 	pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
 			" with error %d\n", section_sep, section,
 			clp->cl_hostname, -status);
+	ssleep(1);
 	nfs4_end_drain_session(clp);
 	nfs4_clear_state_manager_bit(clp);
 }
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 5729bc8aa75d..2628d921b7e3 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -9,6 +9,7 @@
 #include <linux/nfs_idmap.h>
 #include <linux/nfs_fs.h>
 
+#include "nfs4_fs.h"
 #include "callback.h"
 
 static const int nfs_set_port_min = 0;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8dba6bd48557..40836ee5dc3a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -447,12 +447,14 @@ static int nfs4_stat_to_errno(int);
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_open_maxsz + \
+				encode_access_maxsz + \
 				encode_getfh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_open_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_open_maxsz + \
+				decode_access_maxsz + \
 				decode_getfh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_open_confirm_sz \
@@ -467,11 +469,13 @@ static int nfs4_stat_to_errno(int);
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_open_maxsz + \
+				encode_access_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_open_maxsz + \
+				decode_access_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_open_downgrade_sz \
 				(compound_encode_hdr_maxsz + \
@@ -1509,8 +1513,12 @@ static void encode_open_stateid(struct xdr_stream *xdr,
 	nfs4_stateid stateid;
 
 	if (ctx->state != NULL) {
+		const struct nfs_lockowner *lockowner = NULL;
+
+		if (l_ctx != NULL)
+			lockowner = &l_ctx->lockowner;
 		nfs4_select_rw_stateid(&stateid, ctx->state,
-				fmode, l_ctx->lockowner, l_ctx->pid);
+				fmode, lockowner);
 		if (zero_seqid)
 			stateid.seqid = 0;
 		encode_nfs4_stateid(xdr, &stateid);
@@ -2216,6 +2224,8 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_open(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
+	if (args->access)
+		encode_access(xdr, args->access, &hdr);
 	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	encode_nops(&hdr);
 }
@@ -2252,7 +2262,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_open(xdr, args, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
+	if (args->access)
+		encode_access(xdr, args->access, &hdr);
+	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -4095,7 +4107,7 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
+static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
 {
 	__be32 *p;
 	uint32_t supp, acc;
@@ -4109,8 +4121,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 		goto out_overflow;
 	supp = be32_to_cpup(p++);
 	acc = be32_to_cpup(p);
-	access->supported = supp;
-	access->access = acc;
+	*supported = supp;
+	*access = acc;
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -5642,7 +5654,8 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 	 * and places the remaining xdr data in xdr_buf->tail
 	 */
 	pdev->mincount = be32_to_cpup(p);
-	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
+		goto out_overflow;
 
 	/* Parse notification bitmap, verifying that it is zero. */
 	p = xdr_inline_decode(xdr, 4);
@@ -5887,7 +5900,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(xdr, res);
+	status = decode_access(xdr, &res->supported, &res->access);
 	if (status != 0)
 		goto out;
 	decode_getfattr(xdr, res->fattr, res->server);
@@ -6228,6 +6241,8 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_getfh(xdr, &res->fh);
 	if (status)
 		goto out;
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported, &res->access_result);
 	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
@@ -6276,6 +6291,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
 	status = decode_open(xdr, res);
 	if (status)
 		goto out;
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported, &res->access_result);
 	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
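decode_access() now writes through two plain u32 out-parameters instead of filling a struct nfs4_accessres, which is what lets the OPEN and OPEN_NOATTR decoders above drop the optional ACCESS results straight into res->access_supported and res->access_result. The shape of that refactor in a freestanding form (the wire reply is faked as a two-word array purely for the demo):

    #include <stdio.h>

    typedef unsigned int u32;

    /* Before: the decoder was tied to one result struct. */
    struct accessres { u32 supported; u32 access; };

    /* After: any two u32 slots will do, so OPEN replies can reuse it. */
    static int decode_access(const u32 *wire, u32 *supported, u32 *access)
    {
            *supported = wire[0];
            *access = wire[1];
            return 0;
    }

    int main(void)
    {
            u32 wire[2] = { 0x3f, 0x0d };  /* made-up reply words */
            struct accessres res;
            u32 open_supported, open_result;

            decode_access(wire, &res.supported, &res.access);    /* ACCESS  */
            decode_access(wire, &open_supported, &open_result);  /* OPEN    */
            printf("%x %x\n", open_supported, open_result);
            return 0;
    }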
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ea6d111b03e9..be731e6b7b9c 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -41,6 +41,7 @@
 #include <scsi/osd_ore.h>
 
 #include "objlayout.h"
+#include "../internal.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -606,8 +607,14 @@ static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
 void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
 	unsigned long stripe_end = 0;
+	u64 wb_size;
 
-	pnfs_generic_pg_init_write(pgio, req);
+	if (pgio->pg_dreq == NULL)
+		wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
 	if (unlikely(pgio->pg_lseg == NULL))
 		return; /* Not pNFS */
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 311a79681e2b..e56e846e9d2d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -102,6 +102,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
102 unsigned int offset, unsigned int count) 102 unsigned int offset, unsigned int count)
103{ 103{
104 struct nfs_page *req; 104 struct nfs_page *req;
105 struct nfs_lock_context *l_ctx;
105 106
106 /* try to allocate the request struct */ 107 /* try to allocate the request struct */
107 req = nfs_page_alloc(); 108 req = nfs_page_alloc();
@@ -109,11 +110,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 		return ERR_PTR(-ENOMEM);
 
 	/* get lock context early so we can deal with alloc failures */
-	req->wb_lock_context = nfs_get_lock_context(ctx);
-	if (req->wb_lock_context == NULL) {
+	l_ctx = nfs_get_lock_context(ctx);
+	if (IS_ERR(l_ctx)) {
 		nfs_page_free(req);
-		return ERR_PTR(-ENOMEM);
+		return ERR_CAST(l_ctx);
 	}
+	req->wb_lock_context = l_ctx;
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -290,7 +292,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
 		return false;
-	if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
+	if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner)
+		return false;
+	if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid)
 		return false;
 	if (req->wb_context->state != prev->wb_context->state)
 		return false;
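nfs_create_request() now preserves the real errno: nfs_get_lock_context() reports failure as an ERR_PTR-encoded pointer, and ERR_CAST() forwards it under a different pointer type instead of flattening every failure to -ENOMEM. A self-contained sketch of the idiom, with all names invented for illustration:

#include <linux/err.h>
#include <linux/slab.h>

struct bar { int x; };
struct foo { struct bar *b; };

static struct bar *bar_get(void)
{
	return ERR_PTR(-EAGAIN);	/* pretend the resource is busy */
}

static struct foo *foo_create(void)
{
	struct bar *b = bar_get();
	struct foo *f;

	if (IS_ERR(b))
		return ERR_CAST(b);	/* forward -EAGAIN, retype the pointer */
	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (f == NULL)
		return ERR_PTR(-ENOMEM);
	f->b = b;
	return f;
}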
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2e00feacd4be..fe624c91bd00 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -35,6 +35,7 @@
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
+#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
 
 /* Locking:
  *
@@ -190,7 +191,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 
 /* Need to hold i_lock if caller does not already hold reference */
 void
-get_layout_hdr(struct pnfs_layout_hdr *lo)
+pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	atomic_inc(&lo->plh_refcount);
 }
@@ -199,43 +200,107 @@ static struct pnfs_layout_hdr *
 pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
 {
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
-	return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
-		kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+	return ld->alloc_layout_hdr(ino, gfp_flags);
 }
 
 static void
 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
+
+	if (!list_empty(&lo->plh_layouts)) {
+		struct nfs_client *clp = server->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
 	put_rpccred(lo->plh_lc_cred);
-	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
+	return ld->free_layout_hdr(lo);
 }
 
 static void
-destroy_layout_hdr(struct pnfs_layout_hdr *lo)
+pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
 {
+	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
 	dprintk("%s: freeing layout cache %p\n", __func__, lo);
-	BUG_ON(!list_empty(&lo->plh_layouts));
-	NFS_I(lo->plh_inode)->layout = NULL;
-	pnfs_free_layout_hdr(lo);
+	nfsi->layout = NULL;
+	/* Reset MDS Threshold I/O counters */
+	nfsi->write_io = 0;
+	nfsi->read_io = 0;
+}
+
+void
+pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct inode *inode = lo->plh_inode;
+
+	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+		pnfs_detach_layout_hdr(lo);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_layout_hdr(lo);
+	}
+}
+
+static int
+pnfs_iomode_to_fail_bit(u32 iomode)
+{
+	return iomode == IOMODE_RW ?
+		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 }
 
 static void
-put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
 {
-	if (atomic_dec_and_test(&lo->plh_refcount))
-		destroy_layout_hdr(lo);
+	lo->plh_retry_timestamp = jiffies;
+	if (test_and_set_bit(fail_bit, &lo->plh_flags))
+		atomic_inc(&lo->plh_refcount);
 }
 
-void
-put_layout_hdr(struct pnfs_layout_hdr *lo)
+static void
+pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
+		atomic_dec(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 {
 	struct inode *inode = lo->plh_inode;
+	struct pnfs_layout_range range = {
+		.iomode = iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(head);
 
-	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
-		destroy_layout_hdr(lo);
-		spin_unlock(&inode->i_lock);
+	spin_lock(&inode->i_lock);
+	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+	pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+	spin_unlock(&inode->i_lock);
+	pnfs_free_lseg_list(&head);
+	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
+			iomode == IOMODE_RW ? "RW" : "READ");
+}
+
+static bool
+pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	unsigned long start, end;
+	int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+
+	if (test_bit(fail_bit, &lo->plh_flags) == 0)
+		return false;
+	end = jiffies;
+	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
+	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
+		/* It is time to retry the failed layoutgets */
+		pnfs_layout_clear_fail_bit(lo, fail_bit);
+		return false;
 	}
+	return true;
 }
 
 static void
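The helpers above replace a sticky fail bit with a timed one: pnfs_layout_set_fail_bit() stamps plh_retry_timestamp, and pnfs_layout_io_test_failed() clears the bit again once PNFS_LAYOUTGET_RETRY_TIMEOUT has elapsed, so a failed LAYOUTGET is retried after two minutes rather than never. The core of that test, condensed into a sketch with invented names:

#include <linux/jiffies.h>

#define RETRY_TIMEOUT (120 * HZ)

/* Sketch: true while a failure stamped at 'timestamp' is still fresh.
 * time_in_range() is wraparound-safe on jiffies values. */
static bool failure_is_fresh(unsigned long timestamp)
{
	unsigned long end = jiffies;
	unsigned long start = end - RETRY_TIMEOUT;

	return time_in_range(timestamp, start, end);
}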
@@ -249,33 +314,32 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 	lseg->pls_layout = lo;
 }
 
-static void free_lseg(struct pnfs_layout_segment *lseg)
+static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
 {
 	struct inode *ino = lseg->pls_layout->plh_inode;
 
 	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-	/* Matched by get_layout_hdr in pnfs_insert_layout */
-	put_layout_hdr(NFS_I(ino)->layout);
 }
 
 static void
-put_lseg_common(struct pnfs_layout_segment *lseg)
+pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_segment *lseg)
 {
-	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct inode *inode = lo->plh_inode;
 
 	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 	list_del_init(&lseg->pls_list);
-	if (list_empty(&lseg->pls_layout->plh_segs)) {
-		set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
-		/* Matched by initial refcount set in alloc_init_layout_hdr */
-		put_layout_hdr_locked(lseg->pls_layout);
-	}
+	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
+	atomic_dec(&lo->plh_refcount);
+	if (list_empty(&lo->plh_segs))
+		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
 
 void
-put_lseg(struct pnfs_layout_segment *lseg)
+pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
+	struct pnfs_layout_hdr *lo;
 	struct inode *inode;
 
 	if (!lseg)
@@ -284,17 +348,17 @@ put_lseg(struct pnfs_layout_segment *lseg)
 	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
 		atomic_read(&lseg->pls_refcount),
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-	inode = lseg->pls_layout->plh_inode;
+	lo = lseg->pls_layout;
+	inode = lo->plh_inode;
 	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
-		LIST_HEAD(free_me);
-
-		put_lseg_common(lseg);
-		list_add(&lseg->pls_list, &free_me);
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_remove_lseg(lo, lseg);
 		spin_unlock(&inode->i_lock);
-		pnfs_free_lseg_list(&free_me);
+		pnfs_free_lseg(lseg);
+		pnfs_put_layout_hdr(lo);
 	}
 }
-EXPORT_SYMBOL_GPL(put_lseg);
+EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
 static inline u64
 end_offset(u64 start, u64 len)
@@ -378,7 +442,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 	dprintk("%s: lseg %p ref %d\n", __func__, lseg,
 		atomic_read(&lseg->pls_refcount));
 	if (atomic_dec_and_test(&lseg->pls_refcount)) {
-		put_lseg_common(lseg);
+		pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
 		list_add(&lseg->pls_list, tmp_list);
 		rv = 1;
 	}
@@ -390,7 +454,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
  * after call.
  */
 int
-mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 			    struct list_head *tmp_list,
 			    struct pnfs_layout_range *recall_range)
 {
@@ -399,14 +463,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
-	if (list_empty(&lo->plh_segs)) {
-		/* Reset MDS Threshold I/O counters */
-		NFS_I(lo->plh_inode)->write_io = 0;
-		NFS_I(lo->plh_inode)->read_io = 0;
-		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
-			put_layout_hdr_locked(lo);
+	if (list_empty(&lo->plh_segs))
 		return 0;
-	}
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
 		if (!recall_range ||
 		    should_free_lseg(&lseg->pls_range, recall_range)) {
@@ -426,25 +484,13 @@ void
 pnfs_free_lseg_list(struct list_head *free_me)
 {
 	struct pnfs_layout_segment *lseg, *tmp;
-	struct pnfs_layout_hdr *lo;
 
 	if (list_empty(free_me))
 		return;
 
-	lo = list_first_entry(free_me, struct pnfs_layout_segment,
-			      pls_list)->pls_layout;
-
-	if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
-		struct nfs_client *clp;
-
-		clp = NFS_SERVER(lo->plh_inode)->nfs_client;
-		spin_lock(&clp->cl_lock);
-		list_del_init(&lo->plh_layouts);
-		spin_unlock(&clp->cl_lock);
-	}
 	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
 		list_del(&lseg->pls_list);
-		free_lseg(lseg);
+		pnfs_free_lseg(lseg);
 	}
 }
 
@@ -458,10 +504,15 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	lo = nfsi->layout;
 	if (lo) {
 		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-		mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
-	}
-	spin_unlock(&nfsi->vfs_inode.i_lock);
-	pnfs_free_lseg_list(&tmp_list);
+		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
+		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+		spin_unlock(&nfsi->vfs_inode.i_lock);
+		pnfs_free_lseg_list(&tmp_list);
+		pnfs_put_layout_hdr(lo);
+	} else
+		spin_unlock(&nfsi->vfs_inode.i_lock);
 }
 EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
@@ -498,46 +549,54 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 	}
 }
 
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+	return (s32)s1 - (s32)s2 > 0;
+}
+
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
 			bool update_barrier)
 {
-	u32 oldseq, newseq;
+	u32 oldseq, newseq, new_barrier;
+	int empty = list_empty(&lo->plh_segs);
 
 	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
 	newseq = be32_to_cpu(new->seqid);
-	if ((int)(newseq - oldseq) > 0) {
+	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
 		nfs4_stateid_copy(&lo->plh_stateid, new);
 		if (update_barrier) {
-			u32 new_barrier = be32_to_cpu(new->seqid);
-
-			if ((int)(new_barrier - lo->plh_barrier))
-				lo->plh_barrier = new_barrier;
+			new_barrier = be32_to_cpu(new->seqid);
 		} else {
 			/* Because of wraparound, we want to keep the barrier
-			 * "close" to the current seqids. It needs to be
-			 * within 2**31 to count as "behind", so if it
-			 * gets too near that limit, give us a litle leeway
-			 * and bring it to within 2**30.
-			 * NOTE - and yes, this is all unsigned arithmetic.
+			 * "close" to the current seqids.
 			 */
-			if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
-				lo->plh_barrier = newseq - (1 << 30);
+			new_barrier = newseq - atomic_read(&lo->plh_outstanding);
 		}
+		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+			lo->plh_barrier = new_barrier;
 	}
 }
 
+static bool
+pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
+		const nfs4_stateid *stateid)
+{
+	u32 seqid = be32_to_cpu(stateid->seqid);
+
+	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
+}
+
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
-			int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
 {
-	if ((stateid) &&
-	    (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
-		return true;
 	return lo->plh_block_lgets ||
-		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 		(list_empty(&lo->plh_segs) &&
 		 (atomic_read(&lo->plh_outstanding) > lget));
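pnfs_seqid_is_newer() centralizes the serial-number arithmetic that used to be open-coded as (int) casts in several places: a difference of less than 2^31 counts as newer, so comparisons stay correct across seqid wraparound. An equivalent formulation with a worked case:

/* Sketch: (s32)(s1 - s2) > 0 is equivalent to the helper above.
 * Worked case: s1 = 1, s2 = 0xffffffff gives s1 - s2 == 2 (mod 2^32),
 * so a seqid that just wrapped still compares as newer. */
static bool seqid_is_newer(u32 s1, u32 s2)
{
	return (s32)(s1 - s2) > 0;
}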
@@ -551,7 +610,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
-	if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
+	if (pnfs_layoutgets_blocked(lo, 1)) {
 		status = -EAGAIN;
 	} else if (list_empty(&lo->plh_segs)) {
 		int seq;
@@ -582,7 +641,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	struct inode *ino = lo->plh_inode;
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
-	struct pnfs_layout_segment *lseg = NULL;
+	struct pnfs_layout_segment *lseg;
 
 	dprintk("--> %s\n", __func__);
 
@@ -599,16 +658,22 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
-	lgp->lsegpp = &lseg;
 	lgp->gfp_flags = gfp_flags;
 
 	/* Synchronously retrieve layout information from server and
 	 * store in lseg.
 	 */
-	nfs4_proc_layoutget(lgp, gfp_flags);
-	if (!lseg) {
-		/* remember that LAYOUTGET failed and suspend trying */
-		set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
+	lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+	if (IS_ERR(lseg)) {
+		switch (PTR_ERR(lseg)) {
+		case -ENOMEM:
+		case -ERESTARTSYS:
+			break;
+		default:
+			/* remember that LAYOUTGET failed and suspend trying */
+			pnfs_layout_io_set_failed(lo, range->iomode);
+		}
+		return NULL;
 	}
 
 	return lseg;
@@ -636,25 +701,24 @@ _pnfs_return_layout(struct inode *ino)
 
 	spin_lock(&ino->i_lock);
 	lo = nfsi->layout;
-	if (!lo || pnfs_test_layout_returned(lo)) {
+	if (!lo) {
 		spin_unlock(&ino->i_lock);
 		dprintk("NFS: %s no layout to return\n", __func__);
 		goto out;
 	}
 	stateid = nfsi->layout->plh_stateid;
 	/* Reference matched in nfs4_layoutreturn_release */
-	get_layout_hdr(lo);
+	pnfs_get_layout_hdr(lo);
 	empty = list_empty(&lo->plh_segs);
-	mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
 		spin_unlock(&ino->i_lock);
-		put_layout_hdr(lo);
+		pnfs_put_layout_hdr(lo);
 		dprintk("NFS: %s no layout segments to return\n", __func__);
 		goto out;
 	}
 	lo->plh_block_lgets++;
-	pnfs_mark_layout_returned(lo);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 
@@ -663,10 +727,10 @@ _pnfs_return_layout(struct inode *ino)
 	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
-		set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags);
-		set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags);
-		pnfs_clear_layout_returned(lo);
-		put_layout_hdr(lo);
+		spin_lock(&ino->i_lock);
+		lo->plh_block_lgets--;
+		spin_unlock(&ino->i_lock);
+		pnfs_put_layout_hdr(lo);
 		goto out;
 	}
 
@@ -703,7 +767,7 @@ bool pnfs_roc(struct inode *ino)
 	if (!found)
 		goto out_nolayout;
 	lo->plh_block_lgets++;
-	get_layout_hdr(lo); /* matched in pnfs_roc_release */
+	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 	return true;
@@ -720,8 +784,12 @@ void pnfs_roc_release(struct inode *ino)
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
 	lo->plh_block_lgets--;
-	put_layout_hdr_locked(lo);
-	spin_unlock(&ino->i_lock);
+	if (atomic_dec_and_test(&lo->plh_refcount)) {
+		pnfs_detach_layout_hdr(lo);
+		spin_unlock(&ino->i_lock);
+		pnfs_free_layout_hdr(lo);
+	} else
+		spin_unlock(&ino->i_lock);
 }
 
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
@@ -730,32 +798,34 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
-	if ((int)(barrier - lo->plh_barrier) > 0)
+	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 		lo->plh_barrier = barrier;
 	spin_unlock(&ino->i_lock);
 }
 
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg;
+	u32 current_seqid;
 	bool found = false;
 
 	spin_lock(&ino->i_lock);
 	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
 		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+			rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
 			found = true;
-			break;
+			goto out;
 		}
-	if (!found) {
-		struct pnfs_layout_hdr *lo = nfsi->layout;
-		u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
+	lo = nfsi->layout;
+	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
-		/* Since close does not return a layout stateid for use as
-		 * a barrier, we choose the worst-case barrier.
-		 */
-		*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-	}
+	/* Since close does not return a layout stateid for use as
+	 * a barrier, we choose the worst-case barrier.
+	 */
+	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+out:
 	spin_unlock(&ino->i_lock);
 	return found;
 }
@@ -786,14 +856,13 @@ cmp_layout(struct pnfs_layout_range *l1,
 }
 
 static void
-pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
 		   struct pnfs_layout_segment *lseg)
 {
 	struct pnfs_layout_segment *lp;
 
 	dprintk("%s:Begin\n", __func__);
 
-	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
 		if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
 			continue;
@@ -813,7 +882,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		__func__, lseg, lseg->pls_range.iomode,
 		lseg->pls_range.offset, lseg->pls_range.length);
 out:
-	get_layout_hdr(lo);
+	pnfs_get_layout_hdr(lo);
 
 	dprintk("%s:Return\n", __func__);
 }
@@ -847,21 +916,19 @@ pnfs_find_alloc_layout(struct inode *ino,
 
 	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
 
-	assert_spin_locked(&ino->i_lock);
-	if (nfsi->layout) {
-		if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
-			return NULL;
-		else
-			return nfsi->layout;
-	}
+	if (nfsi->layout != NULL)
+		goto out_existing;
 	spin_unlock(&ino->i_lock);
 	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
 	spin_lock(&ino->i_lock);
 
-	if (likely(nfsi->layout == NULL))	/* Won the race? */
+	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
 		nfsi->layout = new;
-	else
-		pnfs_free_layout_hdr(new);
+		return new;
+	}
+	pnfs_free_layout_hdr(new);
+out_existing:
+	pnfs_get_layout_hdr(nfsi->layout);
 	return nfsi->layout;
 }
@@ -904,11 +971,10 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
 	dprintk("%s:Begin\n", __func__);
 
-	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
 		    is_matching_lseg(&lseg->pls_range, range)) {
-			ret = get_lseg(lseg);
+			ret = pnfs_get_lseg(lseg);
 			break;
 		}
 		if (lseg->pls_range.offset > range->offset)
@@ -1013,7 +1079,6 @@ pnfs_update_layout(struct inode *ino,
 		.length = count,
 	};
 	unsigned pg_offset;
-	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs_client *clp = server->nfs_client;
 	struct pnfs_layout_hdr *lo;
@@ -1021,16 +1086,16 @@ pnfs_update_layout(struct inode *ino,
 	bool first = false;
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
-		return NULL;
+		goto out;
 
 	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
-		return NULL;
+		goto out;
 
 	spin_lock(&ino->i_lock);
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
 	if (lo == NULL) {
-		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
-		goto out_unlock;
+		spin_unlock(&ino->i_lock);
+		goto out;
 	}
 
 	/* Do we even need to bother with this? */
@@ -1040,7 +1105,7 @@ pnfs_update_layout(struct inode *ino,
 	}
 
 	/* if LAYOUTGET already failed once we don't try again */
-	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+	if (pnfs_layout_io_test_failed(lo, iomode))
 		goto out_unlock;
 
 	/* Check to see if the layout for the given range already exists */
@@ -1048,17 +1113,13 @@ pnfs_update_layout(struct inode *ino,
 	if (lseg)
 		goto out_unlock;
 
-	if (pnfs_layoutgets_blocked(lo, NULL, 0))
+	if (pnfs_layoutgets_blocked(lo, 0))
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 
-	get_layout_hdr(lo);
 	if (list_empty(&lo->plh_segs))
 		first = true;
 
-	/* Enable LAYOUTRETURNs */
-	pnfs_clear_layout_returned(lo);
-
 	spin_unlock(&ino->i_lock);
 	if (first) {
 		/* The lo must be on the clp list if there is any
@@ -1079,24 +1140,26 @@ pnfs_update_layout(struct inode *ino,
 		arg.length = PAGE_CACHE_ALIGN(arg.length);
 
 	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-	if (!lseg && first) {
-		spin_lock(&clp->cl_lock);
-		list_del_init(&lo->plh_layouts);
-		spin_unlock(&clp->cl_lock);
-	}
 	atomic_dec(&lo->plh_outstanding);
-	put_layout_hdr(lo);
+out_put_layout_hdr:
+	pnfs_put_layout_hdr(lo);
 out:
-	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
+	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
+			"(%s, offset: %llu, length: %llu)\n",
+			__func__, ino->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(ino),
+			lseg == NULL ? "not found" : "found",
+			iomode==IOMODE_RW ? "read/write" : "read-only",
+			(unsigned long long)pos,
+			(unsigned long long)count);
 	return lseg;
 out_unlock:
 	spin_unlock(&ino->i_lock);
-	goto out;
+	goto out_put_layout_hdr;
 }
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 
-int
+struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
 	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
@@ -1123,25 +1186,29 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out_forget_reply;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+	if (pnfs_layoutgets_blocked(lo, 1) ||
+	    pnfs_layout_stateid_blocked(lo, &res->stateid)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
+
+	/* Done processing layoutget. Set the layout stateid */
+	pnfs_set_layout_stateid(lo, &res->stateid, false);
+
 	init_lseg(lo, lseg);
 	lseg->pls_range = res->range;
-	*lgp->lsegpp = get_lseg(lseg);
-	pnfs_insert_layout(lo, lseg);
+	pnfs_get_lseg(lseg);
+	pnfs_layout_insert_lseg(lo, lseg);
 
 	if (res->return_on_close) {
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
 		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
 	}
 
-	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid, false);
 	spin_unlock(&ino->i_lock);
+	return lseg;
 out:
-	return status;
+	return ERR_PTR(status);
 
 out_forget_reply:
 	spin_unlock(&ino->i_lock);
@@ -1153,16 +1220,24 @@ out_forget_reply:
 void
 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
+	u64 rd_size = req->wb_bytes;
+
 	BUG_ON(pgio->pg_lseg != NULL);
 
 	if (req->wb_offset != req->wb_pgbase) {
 		nfs_pageio_reset_read_mds(pgio);
 		return;
 	}
+
+	if (pgio->pg_dreq == NULL)
+		rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+	else
+		rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
-					   req->wb_bytes,
+					   rd_size,
 					   IOMODE_READ,
 					   GFP_KERNEL);
 	/* If no lseg, fall back to read through mds */
@@ -1173,7 +1248,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
 
 void
-pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct nfs_page *req, u64 wb_size)
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
@@ -1181,10 +1257,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
 		nfs_pageio_reset_write_mds(pgio);
 		return;
 	}
+
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
-					   req->wb_bytes,
+					   wb_size,
 					   IOMODE_RW,
 					   GFP_NOFS);
 	/* If no lseg, fall back to write through mds */
@@ -1362,12 +1439,12 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
 		if (trypnfs == PNFS_NOT_ATTEMPTED)
 			pnfs_write_through_mds(desc, data);
 	}
-	put_lseg(lseg);
+	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-	put_lseg(hdr->lseg);
+	pnfs_put_lseg(hdr->lseg);
 	nfs_writehdr_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
@@ -1382,17 +1459,17 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 	whdr = nfs_writehdr_alloc();
 	if (!whdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return -ENOMEM;
 	}
 	hdr = &whdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
-	hdr->lseg = get_lseg(desc->pg_lseg);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
 	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_flush(desc, hdr);
 	if (ret != 0) {
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
@@ -1517,12 +1594,12 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
 		if (trypnfs == PNFS_NOT_ATTEMPTED)
 			pnfs_read_through_mds(desc, data);
 	}
-	put_lseg(lseg);
+	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
-	put_lseg(hdr->lseg);
+	pnfs_put_lseg(hdr->lseg);
 	nfs_readhdr_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
@@ -1538,17 +1615,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 	if (!rhdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		ret = -ENOMEM;
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return ret;
 	}
 	hdr = &rhdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
-	hdr->lseg = get_lseg(desc->pg_lseg);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
 	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_pagein(desc, hdr);
 	if (ret != 0) {
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
@@ -1574,13 +1651,7 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 {
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
+	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
 }
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
@@ -1601,7 +1672,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 	}
 	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
 		/* references matched in nfs4_layoutcommit_release */
-		get_lseg(hdr->lseg);
+		pnfs_get_lseg(hdr->lseg);
 	}
 	if (end_pos > nfsi->layout->plh_lwb)
 		nfsi->layout->plh_lwb = end_pos;
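The recurring shape in the rewritten reference counting is visible in pnfs_put_lseg() and pnfs_put_layout_hdr() above: atomic_dec_and_lock() takes the spinlock only when the count hits zero, the object is detached from shared structures under the lock, and the actual free runs after the unlock because a layout driver's free routine may sleep. A generic sketch of that pattern, with every name invented for illustration:

struct obj {
	atomic_t	refcount;
	spinlock_t	*owner_lock;	/* e.g. the owning inode's i_lock */
};

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_lock(&o->refcount, o->owner_lock)) {
		/* unlink from shared structures while locked */
		spin_unlock(o->owner_lock);
		/* free after unlocking; the free path may sleep */
	}
}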
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 745aa1b39e7c..2d722dba1111 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -62,9 +62,6 @@ enum {
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
-	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
-	NFS_LAYOUT_INVALID,		/* layout is being destroyed */
-	NFS_LAYOUT_RETURNED,		/* layout has already been returned */
 };
 
 enum layoutdriver_policy_flags {
@@ -140,6 +137,7 @@ struct pnfs_layout_hdr {
 	atomic_t		plh_outstanding; /* number of RPCs out */
 	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
 	u32			plh_barrier; /* ignore lower seqids */
+	unsigned long		plh_retry_timestamp;
 	unsigned long		plh_flags;
 	loff_t			plh_lwb; /* last write byte for layoutcommit */
 	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
@@ -172,12 +170,12 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
 				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev);
-extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
 /* pnfs.c */
-void get_layout_hdr(struct pnfs_layout_hdr *lo);
-void put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
 
 void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
 			   const struct nfs_pgio_completion_ops *);
@@ -188,28 +186,29 @@ void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
-void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
+void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			       struct nfs_page *req, u64 wb_size);
 int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
 bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
-int pnfs_layout_process(struct nfs4_layoutget *lgp);
+struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
-void put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
 			     const nfs4_stateid *new,
 			     bool update_barrier);
 int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
 				  struct pnfs_layout_hdr *lo,
 				  struct nfs4_state *open_state);
-int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
 				struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -233,6 +232,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 /* nfs4_deviceid_flags */
 enum {
 	NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
+	NFS_DEVICEID_UNAVAILABLE,	/* device temporarily unavailable */
 };
 
 /* pnfs_dev.c */
@@ -242,6 +242,7 @@ struct nfs4_deviceid_node {
 	const struct pnfs_layoutdriver_type *ld;
 	const struct nfs_client		*nfs_client;
 	unsigned long 			flags;
+	unsigned long			timestamp_unavailable;
 	struct nfs4_deviceid		deviceid;
 	atomic_t			ref;
 };
@@ -254,34 +255,12 @@ void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
 			     const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
+bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
 
-static inline void
-pnfs_mark_layout_returned(struct pnfs_layout_hdr *lo)
-{
-	set_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
-}
-
-static inline void
-pnfs_clear_layout_returned(struct pnfs_layout_hdr *lo)
-{
-	clear_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
-}
-
-static inline bool
-pnfs_test_layout_returned(struct pnfs_layout_hdr *lo)
-{
-	return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
-}
-
-static inline int lo_fail_bit(u32 iomode)
-{
-	return iomode == IOMODE_RW ?
-		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
-}
-
 static inline struct pnfs_layout_segment *
-get_lseg(struct pnfs_layout_segment *lseg)
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
 	if (lseg) {
 		atomic_inc(&lseg->pls_refcount);
@@ -406,12 +385,12 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
 }
 
 static inline struct pnfs_layout_segment *
-get_lseg(struct pnfs_layout_segment *lseg)
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
 	return NULL;
 }
 
-static inline void put_lseg(struct pnfs_layout_segment *lseg)
+static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
 }
 
@@ -443,7 +422,7 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 }
 
 static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier)
+pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 {
 	return false;
 }
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 73f701f1f4d3..d35b62e83ea6 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -40,6 +40,8 @@
 #define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
 #define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
 
+#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
+
 static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
 static DEFINE_SPINLOCK(nfs4_deviceid_lock);
 
@@ -218,6 +220,30 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
 }
 EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
 
+void
+nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	node->timestamp_unavailable = jiffies;
+	set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
+
+bool
+nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+		unsigned long start, end;
+
+		end = jiffies;
+		start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+		if (time_in_range(node->timestamp_unavailable, start, end))
+			return true;
+		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable);
+
 static void
 _deviceid_purge_client(const struct nfs_client *clp, long hash)
 {
@@ -276,3 +302,4 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 	}
 	rcu_read_unlock();
 }
+
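These two helpers give layout drivers a uniform backoff: mark the deviceid on an I/O error, and every later call to nfs4_test_deviceid_unavailable() then reports the device as unusable until PNFS_DEVICE_RETRY_TIMEOUT (120 seconds) expires, at which point the flag self-clears. A sketch of how a driver might consult them; the surrounding driver hooks are invented for illustration:

static bool my_ld_device_usable(struct nfs4_deviceid_node *node)
{
	/* still inside the 120 second backoff window? */
	return !nfs4_test_deviceid_unavailable(node);
}

static void my_ld_io_error(struct nfs4_deviceid_node *node)
{
	/* start (or refresh) the backoff window */
	nfs4_mark_deviceid_unavailable(node);
}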
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2c7f5db0847..e831bce49766 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -88,6 +88,7 @@ enum {
 	Opt_sharecache, Opt_nosharecache,
 	Opt_resvport, Opt_noresvport,
 	Opt_fscache, Opt_nofscache,
+	Opt_migration, Opt_nomigration,
 
 	/* Mount options that take integer arguments */
 	Opt_port,
@@ -147,6 +148,8 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
 	{ Opt_nofscache, "nofsc" },
+	{ Opt_migration, "migration" },
+	{ Opt_nomigration, "nomigration" },
 
 	{ Opt_port, "port=%s" },
 	{ Opt_rsize, "rsize=%s" },
@@ -676,6 +679,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	if (nfss->options & NFS_OPTION_FSCACHE)
 		seq_printf(m, ",fsc");
 
+	if (nfss->options & NFS_OPTION_MIGRATION)
+		seq_printf(m, ",migration");
+
 	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
 		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
 			seq_printf(m, ",lookupcache=none");
@@ -1106,7 +1112,7 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
 	string = match_strdup(args);
 	if (string == NULL)
 		return -ENOMEM;
-	rc = strict_strtoul(string, 10, option);
+	rc = kstrtoul(string, 10, option);
 	kfree(string);
 
 	return rc;
@@ -1243,6 +1249,12 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
+		case Opt_migration:
+			mnt->options |= NFS_OPTION_MIGRATION;
+			break;
+		case Opt_nomigration:
+			mnt->options &= NFS_OPTION_MIGRATION;
+			break;
 
 		/*
 		 * options that take numeric values
@@ -1535,6 +1547,10 @@ static int nfs_parse_mount_options(char *raw,
 	if (mnt->minorversion && mnt->version != 4)
 		goto out_minorversion_mismatch;
 
+	if (mnt->options & NFS_OPTION_MIGRATION &&
+	    mnt->version != 4 && mnt->minorversion != 0)
+		goto out_migration_misuse;
+
 	/*
 	 * verify that any proto=/mountproto= options match the address
 	 * families in the addr=/mountaddr= options.
@@ -1572,6 +1588,10 @@ out_minorversion_mismatch:
 	printk(KERN_INFO "NFS: mount option vers=%u does not support "
 		"minorversion=%u\n", mnt->version, mnt->minorversion);
 	return 0;
+out_migration_misuse:
+	printk(KERN_INFO
+		"NFS: 'migration' not supported for this NFS version\n");
+	return 0;
 out_nomem:
 	printk(KERN_INFO "NFS: not enough memory to parse option\n");
 	return 0;
@@ -2494,7 +2514,7 @@ EXPORT_SYMBOL_GPL(nfs_kill_super);
 /*
  * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)
  */
-struct dentry *
+static struct dentry *
 nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 	       const char *dev_name, void *raw_data)
 {
@@ -2642,6 +2662,7 @@ unsigned int nfs_idmap_cache_timeout = 600;
 bool nfs4_disable_idmapping = true;
 unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
 unsigned short send_implementation_id = 1;
+char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
 
 EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
 EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
@@ -2649,6 +2670,7 @@ EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
 EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
 EXPORT_SYMBOL_GPL(max_session_slots);
 EXPORT_SYMBOL_GPL(send_implementation_id);
+EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
 
 #define NFS_CALLBACK_MAXPORTNR (65535U)
 
@@ -2659,7 +2681,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
 
 	if (!val)
 		return -EINVAL;
-	ret = strict_strtoul(val, 0, &num);
+	ret = kstrtoul(val, 0, &num);
 	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
 		return -EINVAL;
 	*((unsigned int *)kp->arg) = num;
@@ -2674,6 +2696,8 @@ static struct kernel_param_ops param_ops_portnr = {
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
 module_param(nfs_idmap_cache_timeout, int, 0644);
 module_param(nfs4_disable_idmapping, bool, 0644);
+module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
+			NFS4_CLIENT_ID_UNIQ_LEN, 0600);
 MODULE_PARM_DESC(nfs4_disable_idmapping,
 		"Turn off NFSv4 idmapping when using 'sec=sys'");
 module_param(max_session_slots, ushort, 0644);
@@ -2682,6 +2706,7 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
2682module_param(send_implementation_id, ushort, 0644); 2706module_param(send_implementation_id, ushort, 0644);
2683MODULE_PARM_DESC(send_implementation_id, 2707MODULE_PARM_DESC(send_implementation_id,
2684 "Send implementation ID with NFSv4.1 exchange_id"); 2708 "Send implementation ID with NFSv4.1 exchange_id");
2709MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
2685MODULE_ALIAS("nfs4"); 2710MODULE_ALIAS("nfs4");
2686 2711
2687#endif /* CONFIG_NFS_V4 */ 2712#endif /* CONFIG_NFS_V4 */
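The new nfs4_unique_id parameter lets an administrator pin a stable nfs_client_id4 uniquifier, useful where the usual identity sources are unreliable (diskless or cloned clients, for example). module_param_string() copies the value into a fixed-size buffer at load time; the general shape, with illustrative names rather than the in-tree ones:

    static char example_uniquifier[64] = "";
    module_param_string(example_uniquifier, example_uniquifier,
                        sizeof(example_uniquifier), 0600);
    MODULE_PARM_DESC(example_uniquifier, "illustrative uniquifier string");

The real parameter can then be set via modprobe or, for a built-in, on the kernel command line (nfs.nfs4_unique_id=<string>); the 0600 mode keeps it readable by root only.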
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e3b55372726c..9347ab7c9574 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -846,6 +846,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
846int nfs_flush_incompatible(struct file *file, struct page *page) 846int nfs_flush_incompatible(struct file *file, struct page *page)
847{ 847{
848 struct nfs_open_context *ctx = nfs_file_open_context(file); 848 struct nfs_open_context *ctx = nfs_file_open_context(file);
849 struct nfs_lock_context *l_ctx;
849 struct nfs_page *req; 850 struct nfs_page *req;
850 int do_flush, status; 851 int do_flush, status;
851 /* 852 /*
@@ -860,9 +861,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
860 req = nfs_page_find_request(page); 861 req = nfs_page_find_request(page);
861 if (req == NULL) 862 if (req == NULL)
862 return 0; 863 return 0;
863 do_flush = req->wb_page != page || req->wb_context != ctx || 864 l_ctx = req->wb_lock_context;
864 req->wb_lock_context->lockowner != current->files || 865 do_flush = req->wb_page != page || req->wb_context != ctx;
865 req->wb_lock_context->pid != current->tgid; 866 if (l_ctx) {
867 do_flush |= l_ctx->lockowner.l_owner != current->files
868 || l_ctx->lockowner.l_pid != current->tgid;
869 }
866 nfs_release_request(req); 870 nfs_release_request(req);
867 if (!do_flush) 871 if (!do_flush)
868 return 0; 872 return 0;
@@ -1576,6 +1580,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1576 /* We have a mismatch. Write the page again */ 1580 /* We have a mismatch. Write the page again */
1577 dprintk(" mismatch\n"); 1581 dprintk(" mismatch\n");
1578 nfs_mark_request_dirty(req); 1582 nfs_mark_request_dirty(req);
1583 set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
1579 next: 1584 next:
1580 nfs_unlock_and_release_request(req); 1585 nfs_unlock_and_release_request(req);
1581 } 1586 }
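Two fixes in this file. nfs_flush_incompatible() previously dereferenced req->wb_lock_context unconditionally; it now compares lock ownership only when a lock context is actually present, through the new lockowner.l_owner/l_pid pair. And nfs_commit_release_pages() now flags the open context when a commit verifier mismatch forces pages back onto the dirty list, so later flush paths know the writes must be resent. A sketch of a hypothetical consumer of that flag:

    /* Assumption: illustrative reader side, not part of this patch.
     * The writer sets the bit on verifier mismatch; a flush path can
     * consume it atomically and reschedule the writes. */
    if (test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
            resend_pending_writes(ctx);     /* hypothetical helper */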
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 6aa5590c3679..b314888825d5 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -218,8 +218,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
218 * There must be an encoding function for void results so svc_process 218 * There must be an encoding function for void results so svc_process
219 * will work properly. 219 * will work properly.
220 */ 220 */
221int 221static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
222nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
223{ 222{
224 return xdr_ressize_check(rqstp, p); 223 return xdr_ressize_check(rqstp, p);
225} 224}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9095f3c21df9..97d90d1c8608 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -247,7 +247,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
247 /* Now create the file and set attributes */ 247 /* Now create the file and set attributes */
248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, 248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
249 attr, newfhp, 249 attr, newfhp,
250 argp->createmode, argp->verf, NULL, NULL); 250 argp->createmode, (u32 *)argp->verf, NULL, NULL);
251 251
252 RETURN_STATUS(nfserr); 252 RETURN_STATUS(nfserr);
253} 253}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4c7bd35b1876..bdf29c96e4cd 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1028,7 +1028,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
1028 cb->cb_msg.rpc_cred = callback_cred; 1028 cb->cb_msg.rpc_cred = callback_cred;
1029 1029
1030 cb->cb_ops = &nfsd4_cb_recall_ops; 1030 cb->cb_ops = &nfsd4_cb_recall_ops;
1031 dp->dl_retries = 1;
1032 1031
1033 INIT_LIST_HEAD(&cb->cb_per_client); 1032 INIT_LIST_HEAD(&cb->cb_per_client);
1034 cb->cb_done = true; 1033 cb->cb_done = true;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index fdc91a6fc9c4..a1f10c0a6255 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -478,7 +478,7 @@ nfsd_idmap_init(struct net *net)
478 goto destroy_idtoname_cache; 478 goto destroy_idtoname_cache;
479 nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); 479 nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net);
480 if (IS_ERR(nn->nametoid_cache)) { 480 if (IS_ERR(nn->nametoid_cache)) {
481 rv = PTR_ERR(nn->idtoname_cache); 481 rv = PTR_ERR(nn->nametoid_cache);
482 goto unregister_idtoname_cache; 482 goto unregister_idtoname_cache;
483 } 483 }
484 rv = cache_register_net(nn->nametoid_cache, net); 484 rv = cache_register_net(nn->nametoid_cache, net);
@@ -598,7 +598,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
598 /* Just to make sure it's null-terminated: */ 598 /* Just to make sure it's null-terminated: */
599 memcpy(buf, name, namelen); 599 memcpy(buf, name, namelen);
600 buf[namelen] = '\0'; 600 buf[namelen] = '\0';
601 ret = kstrtouint(name, 10, id); 601 ret = kstrtouint(buf, 10, id);
602 return ret == 0; 602 return ret == 0;
603} 603}
604 604
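Both idmap fixes are copy-paste bugs: the error path took PTR_ERR() of the idtoname cache when the nametoid cache was the one that failed, and numeric_name_to_id() carefully NUL-terminated a copy of the name into buf, then parsed the original unterminated buffer anyway. The corrected parse pattern as a standalone sketch:

    /* Sketch: safely parse a counted (not NUL-terminated) string. */
    static bool counted_str_to_uint(const char *name, size_t namelen, u32 *id)
    {
            char buf[16];

            if (namelen >= sizeof(buf))
                    return false;
            memcpy(buf, name, namelen);
            buf[namelen] = '\0';            /* kstrtouint needs a C string */
            return kstrtouint(buf, 10, id) == 0;
    }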
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c9c1c0a25417..6c9a4b291dba 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -370,7 +370,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
370 break; 370 break;
371 case NFS4_OPEN_CLAIM_PREVIOUS: 371 case NFS4_OPEN_CLAIM_PREVIOUS:
372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
373 status = nfs4_check_open_reclaim(&open->op_clientid); 373 status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion);
374 if (status) 374 if (status)
375 goto out; 375 goto out;
376 case NFS4_OPEN_CLAIM_FH: 376 case NFS4_OPEN_CLAIM_FH:
@@ -1054,8 +1054,8 @@ struct nfsd4_operation {
1054 char *op_name; 1054 char *op_name;
1055 /* Try to get response size before operation */ 1055 /* Try to get response size before operation */
1056 nfsd4op_rsize op_rsize_bop; 1056 nfsd4op_rsize op_rsize_bop;
1057 stateid_setter op_get_currentstateid; 1057 stateid_getter op_get_currentstateid;
1058 stateid_getter op_set_currentstateid; 1058 stateid_setter op_set_currentstateid;
1059}; 1059};
1060 1060
1061static struct nfsd4_operation nfsd4_ops[]; 1061static struct nfsd4_operation nfsd4_ops[];
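The nfsd4_operation fix swaps two fields that had been declared with each other's types: op_get_currentstateid is the getter and op_set_currentstateid the setter. Since the two typedefs share a signature this is documentation rather than added type safety, but the declaration now says what each slot does; roughly, with hypothetical signatures:

    typedef void (*stateid_getter)(struct nfsd4_compound_state *, void *op);
    typedef void (*stateid_setter)(struct nfsd4_compound_state *, void *op);

    struct nfsd4_operation {
            /* ... */
            stateid_getter op_get_currentstateid;   /* read current stateid */
            stateid_setter op_set_currentstateid;   /* save current stateid */
    };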
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 48a1bad37334..d0237f872cc4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -758,7 +758,7 @@ static void nfsd4_put_drc_mem(int slotsize, int num)
758 spin_unlock(&nfsd_drc_lock); 758 spin_unlock(&nfsd_drc_lock);
759} 759}
760 760
761static struct nfsd4_session *alloc_session(int slotsize, int numslots) 761static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
762{ 762{
763 struct nfsd4_session *new; 763 struct nfsd4_session *new;
764 int mem, i; 764 int mem, i;
@@ -852,35 +852,28 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
852 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 852 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
853} 853}
854 854
855static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) 855static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses)
856{ 856{
857 struct nfsd4_conn *conn;
858 int ret; 857 int ret;
859 858
860 conn = alloc_conn(rqstp, dir);
861 if (!conn)
862 return nfserr_jukebox;
863 nfsd4_hash_conn(conn, ses); 859 nfsd4_hash_conn(conn, ses);
864 ret = nfsd4_register_conn(conn); 860 ret = nfsd4_register_conn(conn);
865 if (ret) 861 if (ret)
866 /* oops; xprt is already down: */ 862 /* oops; xprt is already down: */
867 nfsd4_conn_lost(&conn->cn_xpt_user); 863 nfsd4_conn_lost(&conn->cn_xpt_user);
868 if (ses->se_client->cl_cb_state == NFSD4_CB_DOWN && 864 if (conn->cn_flags & NFS4_CDFC4_BACK) {
869 dir & NFS4_CDFC4_BACK) {
870 /* callback channel may be back up */ 865 /* callback channel may be back up */
871 nfsd4_probe_callback(ses->se_client); 866 nfsd4_probe_callback(ses->se_client);
872 } 867 }
873 return nfs_ok;
874} 868}
875 869
876static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) 870static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
877{ 871{
878 u32 dir = NFS4_CDFC4_FORE; 872 u32 dir = NFS4_CDFC4_FORE;
879 873
880 if (ses->se_flags & SESSION4_BACK_CHAN) 874 if (cses->flags & SESSION4_BACK_CHAN)
881 dir |= NFS4_CDFC4_BACK; 875 dir |= NFS4_CDFC4_BACK;
882 876 return alloc_conn(rqstp, dir);
883 return nfsd4_new_conn(rqstp, ses, dir);
884} 877}
885 878
886/* must be called under client_lock */ 879/* must be called under client_lock */
@@ -903,20 +896,21 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
903 spin_unlock(&clp->cl_lock); 896 spin_unlock(&clp->cl_lock);
904} 897}
905 898
899static void __free_session(struct nfsd4_session *ses)
900{
901 nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs);
902 free_session_slots(ses);
903 kfree(ses);
904}
905
906static void free_session(struct kref *kref) 906static void free_session(struct kref *kref)
907{ 907{
908 struct nfsd4_session *ses; 908 struct nfsd4_session *ses;
909 int mem;
910 909
911 lockdep_assert_held(&client_lock); 910 lockdep_assert_held(&client_lock);
912 ses = container_of(kref, struct nfsd4_session, se_ref); 911 ses = container_of(kref, struct nfsd4_session, se_ref);
913 nfsd4_del_conns(ses); 912 nfsd4_del_conns(ses);
914 spin_lock(&nfsd_drc_lock); 913 __free_session(ses);
915 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
916 nfsd_drc_mem_used -= mem;
917 spin_unlock(&nfsd_drc_lock);
918 free_session_slots(ses);
919 kfree(ses);
920} 914}
921 915
922void nfsd4_put_session(struct nfsd4_session *ses) 916void nfsd4_put_session(struct nfsd4_session *ses)
@@ -926,14 +920,10 @@ void nfsd4_put_session(struct nfsd4_session *ses)
926 spin_unlock(&client_lock); 920 spin_unlock(&client_lock);
927} 921}
928 922
929static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) 923static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
930{ 924{
931 struct nfsd4_session *new; 925 struct nfsd4_session *new;
932 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
933 int numslots, slotsize; 926 int numslots, slotsize;
934 __be32 status;
935 int idx;
936
937 /* 927 /*
938 * Note decreasing slot size below client's request may 928 * Note decreasing slot size below client's request may
939 * make it difficult for client to function correctly, whereas 929 * make it difficult for client to function correctly, whereas
@@ -946,12 +936,18 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
946 if (numslots < 1) 936 if (numslots < 1)
947 return NULL; 937 return NULL;
948 938
949 new = alloc_session(slotsize, numslots); 939 new = __alloc_session(slotsize, numslots);
950 if (!new) { 940 if (!new) {
951 nfsd4_put_drc_mem(slotsize, fchan->maxreqs); 941 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
952 return NULL; 942 return NULL;
953 } 943 }
954 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); 944 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
945 return new;
946}
947
948static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
949{
950 int idx;
955 951
956 new->se_client = clp; 952 new->se_client = clp;
957 gen_sessionid(new); 953 gen_sessionid(new);
@@ -970,14 +966,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
970 spin_unlock(&clp->cl_lock); 966 spin_unlock(&clp->cl_lock);
971 spin_unlock(&client_lock); 967 spin_unlock(&client_lock);
972 968
973 status = nfsd4_new_conn_from_crses(rqstp, new);
974 /* whoops: benny points out, status is ignored! (err, or bogus) */
975 if (status) {
976 spin_lock(&client_lock);
977 free_session(&new->se_ref);
978 spin_unlock(&client_lock);
979 return NULL;
980 }
981 if (cses->flags & SESSION4_BACK_CHAN) { 969 if (cses->flags & SESSION4_BACK_CHAN) {
982 struct sockaddr *sa = svc_addr(rqstp); 970 struct sockaddr *sa = svc_addr(rqstp);
983 /* 971 /*
@@ -990,7 +978,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
990 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 978 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
991 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 979 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
992 } 980 }
993 nfsd4_probe_callback(clp);
994 return new; 981 return new;
995} 982}
996 983
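This refactor splits the old all-in-one alloc_init_session(): __alloc_session()/alloc_session() do only the fallible memory and DRC accounting work, init_session() then publishes the session under the locks and cannot fail, and connection setup moves out entirely (alloc_conn_from_crses() allocates, nfsd4_init_conn() hashes and registers). The payoff is that callers can perform every allocation before touching any globally visible state; in sketch form, with hypothetical names:

    static __be32 create(struct svc_rqst *rqstp)
    {
            struct session *s;
            struct conn *c;

            s = session_alloc();            /* may fail; no visible side effects */
            if (!s)
                    return nfserr_jukebox;
            c = conn_alloc(rqstp);          /* may fail; no visible side effects */
            if (!c) {
                    session_free(s);
                    return nfserr_jukebox;
            }
            session_init(s);                /* publishes state; cannot fail */
            conn_init(c, s);                /* registers with the xprt; cannot fail */
            return nfs_ok;
    }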
@@ -1131,7 +1118,7 @@ unhash_client_locked(struct nfs4_client *clp)
1131} 1118}
1132 1119
1133static void 1120static void
1134expire_client(struct nfs4_client *clp) 1121destroy_client(struct nfs4_client *clp)
1135{ 1122{
1136 struct nfs4_openowner *oo; 1123 struct nfs4_openowner *oo;
1137 struct nfs4_delegation *dp; 1124 struct nfs4_delegation *dp;
@@ -1165,6 +1152,12 @@ expire_client(struct nfs4_client *clp)
1165 spin_unlock(&client_lock); 1152 spin_unlock(&client_lock);
1166} 1153}
1167 1154
1155static void expire_client(struct nfs4_client *clp)
1156{
1157 nfsd4_client_record_remove(clp);
1158 destroy_client(clp);
1159}
1160
1168static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 1161static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
1169{ 1162{
1170 memcpy(target->cl_verifier.data, source->data, 1163 memcpy(target->cl_verifier.data, source->data,
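expire_client() is likewise split so that lease expiry and plain teardown stay distinct: only expiry scrubs the client's stable-storage record, while shutdown paths (see __nfs4_state_shutdown() further down) call destroy_client() directly and leave the record intact so clients can reclaim after a server restart. The division, as added by the patch:

    static void expire_client(struct nfs4_client *clp)
    {
            nfsd4_client_record_remove(clp);   /* forget reboot-recovery record */
            destroy_client(clp);               /* tear down in-memory state */
    }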
@@ -1223,10 +1216,26 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
1223 return true; 1216 return true;
1224} 1217}
1225 1218
1219/*
1220 * RFC 3530 language requires clid_inuse be returned when the
1221 * "principal" associated with a requests differs from that previously
1222 * used. We use uid, gid's, and gss principal string as our best
1223 * approximation. We also don't want to allow non-gss use of a client
1224 * established using gss: in theory cr_principal should catch that
1225 * change, but in practice cr_principal can be null even in the gss case
1226 * since gssd doesn't always pass down a principal string.
1227 */
1228static bool is_gss_cred(struct svc_cred *cr)
1229{
1230 /* Is cr_flavor one of the gss "pseudoflavors"?: */
1231 return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR);
1232}
1233
1234
1226static bool 1235static bool
1227same_creds(struct svc_cred *cr1, struct svc_cred *cr2) 1236same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1228{ 1237{
1229 if ((cr1->cr_flavor != cr2->cr_flavor) 1238 if ((is_gss_cred(cr1) != is_gss_cred(cr2))
1230 || (cr1->cr_uid != cr2->cr_uid) 1239 || (cr1->cr_uid != cr2->cr_uid)
1231 || (cr1->cr_gid != cr2->cr_gid) 1240 || (cr1->cr_gid != cr2->cr_gid)
1232 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) 1241 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
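same_creds() previously demanded an exact flavor match; comparing is_gss_cred() instead only insists that a client stay on the same side of the GSS/non-GSS line, which is the property the comment above actually cares about (cr_principal cannot be relied on to catch a gss-to-non-gss switch). Illustrative outcomes, assuming matching uid/gids:

    /* AUTH_UNIX vs AUTH_NULL -> same class, creds may compare equal
     * krb5      vs krb5i     -> same class, creds may compare equal
     * AUTH_UNIX vs krb5      -> different classes, never equal
     */
    if (is_gss_cred(cr1) != is_gss_cred(cr2))
            return false;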
@@ -1340,13 +1349,15 @@ move_to_confirmed(struct nfs4_client *clp)
1340} 1349}
1341 1350
1342static struct nfs4_client * 1351static struct nfs4_client *
1343find_confirmed_client(clientid_t *clid) 1352find_confirmed_client(clientid_t *clid, bool sessions)
1344{ 1353{
1345 struct nfs4_client *clp; 1354 struct nfs4_client *clp;
1346 unsigned int idhashval = clientid_hashval(clid->cl_id); 1355 unsigned int idhashval = clientid_hashval(clid->cl_id);
1347 1356
1348 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { 1357 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
1349 if (same_clid(&clp->cl_clientid, clid)) { 1358 if (same_clid(&clp->cl_clientid, clid)) {
1359 if ((bool)clp->cl_minorversion != sessions)
1360 return NULL;
1350 renew_client(clp); 1361 renew_client(clp);
1351 return clp; 1362 return clp;
1352 } 1363 }
@@ -1355,14 +1366,17 @@ find_confirmed_client(clientid_t *clid)
1355} 1366}
1356 1367
1357static struct nfs4_client * 1368static struct nfs4_client *
1358find_unconfirmed_client(clientid_t *clid) 1369find_unconfirmed_client(clientid_t *clid, bool sessions)
1359{ 1370{
1360 struct nfs4_client *clp; 1371 struct nfs4_client *clp;
1361 unsigned int idhashval = clientid_hashval(clid->cl_id); 1372 unsigned int idhashval = clientid_hashval(clid->cl_id);
1362 1373
1363 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { 1374 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
1364 if (same_clid(&clp->cl_clientid, clid)) 1375 if (same_clid(&clp->cl_clientid, clid)) {
1376 if ((bool)clp->cl_minorversion != sessions)
1377 return NULL;
1365 return clp; 1378 return clp;
1379 }
1366 } 1380 }
1367 return NULL; 1381 return NULL;
1368} 1382}
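Client lookup grows a "sessions" flag because NFSv4.0 and NFSv4.1 clients share the same hash tables: the (bool)clp->cl_minorversion != sessions test makes a SETCLIENTID-established (v4.0) client invisible to session-based operations and vice versa. Callers simply pass the compound's minorversion, which C converts to bool:

    /* a v4.1+ caller (minorversion != 0) only ever sees v4.1 clients;
     * a v4.0 caller only ever sees v4.0 clients */
    clp = find_confirmed_client(clid, cstate->minorversion);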
@@ -1651,6 +1665,7 @@ out_new:
1651 status = nfserr_jukebox; 1665 status = nfserr_jukebox;
1652 goto out; 1666 goto out;
1653 } 1667 }
1668 new->cl_minorversion = 1;
1654 1669
1655 gen_clid(new); 1670 gen_clid(new);
1656 add_to_unconfirmed(new, strhashval); 1671 add_to_unconfirmed(new, strhashval);
@@ -1743,67 +1758,71 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1743 struct sockaddr *sa = svc_addr(rqstp); 1758 struct sockaddr *sa = svc_addr(rqstp);
1744 struct nfs4_client *conf, *unconf; 1759 struct nfs4_client *conf, *unconf;
1745 struct nfsd4_session *new; 1760 struct nfsd4_session *new;
1761 struct nfsd4_conn *conn;
1746 struct nfsd4_clid_slot *cs_slot = NULL; 1762 struct nfsd4_clid_slot *cs_slot = NULL;
1747 bool confirm_me = false;
1748 __be32 status = 0; 1763 __be32 status = 0;
1749 1764
1750 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1765 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1751 return nfserr_inval; 1766 return nfserr_inval;
1767 if (check_forechannel_attrs(cr_ses->fore_channel))
1768 return nfserr_toosmall;
1769 new = alloc_session(&cr_ses->fore_channel);
1770 if (!new)
1771 return nfserr_jukebox;
1772 status = nfserr_jukebox;
1773 conn = alloc_conn_from_crses(rqstp, cr_ses);
1774 if (!conn)
1775 goto out_free_session;
1752 1776
1753 nfs4_lock_state(); 1777 nfs4_lock_state();
1754 unconf = find_unconfirmed_client(&cr_ses->clientid); 1778 unconf = find_unconfirmed_client(&cr_ses->clientid, true);
1755 conf = find_confirmed_client(&cr_ses->clientid); 1779 conf = find_confirmed_client(&cr_ses->clientid, true);
1756 1780
1757 if (conf) { 1781 if (conf) {
1758 cs_slot = &conf->cl_cs_slot; 1782 cs_slot = &conf->cl_cs_slot;
1759 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1783 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1760 if (status == nfserr_replay_cache) { 1784 if (status == nfserr_replay_cache) {
1761 status = nfsd4_replay_create_session(cr_ses, cs_slot); 1785 status = nfsd4_replay_create_session(cr_ses, cs_slot);
1762 goto out; 1786 goto out_free_conn;
1763 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { 1787 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
1764 status = nfserr_seq_misordered; 1788 status = nfserr_seq_misordered;
1765 goto out; 1789 goto out_free_conn;
1766 } 1790 }
1767 } else if (unconf) { 1791 } else if (unconf) {
1792 unsigned int hash;
1793 struct nfs4_client *old;
1768 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1794 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1769 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1795 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
1770 status = nfserr_clid_inuse; 1796 status = nfserr_clid_inuse;
1771 goto out; 1797 goto out_free_conn;
1772 } 1798 }
1773 cs_slot = &unconf->cl_cs_slot; 1799 cs_slot = &unconf->cl_cs_slot;
1774 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1800 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1775 if (status) { 1801 if (status) {
1776 /* an unconfirmed replay returns misordered */ 1802 /* an unconfirmed replay returns misordered */
1777 status = nfserr_seq_misordered; 1803 status = nfserr_seq_misordered;
1778 goto out; 1804 goto out_free_conn;
1779 } 1805 }
1780 confirm_me = true; 1806 hash = clientstr_hashval(unconf->cl_recdir);
1807 old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
1808 if (old)
1809 expire_client(old);
1810 move_to_confirmed(unconf);
1781 conf = unconf; 1811 conf = unconf;
1782 } else { 1812 } else {
1783 status = nfserr_stale_clientid; 1813 status = nfserr_stale_clientid;
1784 goto out; 1814 goto out_free_conn;
1785 } 1815 }
1786 1816 status = nfs_ok;
1787 /*
1788 * XXX: we should probably set this at creation time, and check
1789 * for consistent minorversion use throughout:
1790 */
1791 conf->cl_minorversion = 1;
1792 /* 1817 /*
1793 * We do not support RDMA or persistent sessions 1818 * We do not support RDMA or persistent sessions
1794 */ 1819 */
1795 cr_ses->flags &= ~SESSION4_PERSIST; 1820 cr_ses->flags &= ~SESSION4_PERSIST;
1796 cr_ses->flags &= ~SESSION4_RDMA; 1821 cr_ses->flags &= ~SESSION4_RDMA;
1797 1822
1798 status = nfserr_toosmall; 1823 init_session(rqstp, new, conf, cr_ses);
1799 if (check_forechannel_attrs(cr_ses->fore_channel)) 1824 nfsd4_init_conn(rqstp, conn, new);
1800 goto out;
1801 1825
1802 status = nfserr_jukebox;
1803 new = alloc_init_session(rqstp, conf, cr_ses);
1804 if (!new)
1805 goto out;
1806 status = nfs_ok;
1807 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1826 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1808 NFS4_MAX_SESSIONID_LEN); 1827 NFS4_MAX_SESSIONID_LEN);
1809 memcpy(&cr_ses->fore_channel, &new->se_fchannel, 1828 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
@@ -1813,18 +1832,15 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1813 1832
1814 /* cache solo and embedded create sessions under the state lock */ 1833 /* cache solo and embedded create sessions under the state lock */
1815 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1834 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1816 if (confirm_me) {
1817 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
1818 struct nfs4_client *old =
1819 find_confirmed_client_by_str(conf->cl_recdir, hash);
1820 if (old)
1821 expire_client(old);
1822 move_to_confirmed(conf);
1823 }
1824out: 1835out:
1825 nfs4_unlock_state(); 1836 nfs4_unlock_state();
1826 dprintk("%s returns %d\n", __func__, ntohl(status)); 1837 dprintk("%s returns %d\n", __func__, ntohl(status));
1827 return status; 1838 return status;
1839out_free_conn:
1840 free_conn(conn);
1841out_free_session:
1842 __free_session(new);
1843 goto out;
1828} 1844}
1829 1845
1830static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) 1846static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
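The create_session rewrite front-loads every fallible step (check_forechannel_attrs(), alloc_session(), alloc_conn_from_crses()) before nfs4_lock_state(), and unwinds through out_free_conn/out_free_session on any later failure. The old flow allocated midway through, and alloc_init_session() would free a just-created session when connection setup failed inside it, a path the removed "whoops" comment already distrusted. Condensed, the new flow is the usual goto ladder:

    new = alloc_session(&cr_ses->fore_channel);
    if (!new)
            return nfserr_jukebox;
    conn = alloc_conn_from_crses(rqstp, cr_ses);
    if (!conn)
            goto out_free_session;
    /* ... locked section; failures jump to out_free_conn ... */
    out_free_conn:
            free_conn(conn);
    out_free_session:
            __free_session(new);
            goto out;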
@@ -1854,6 +1870,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1854 struct nfsd4_bind_conn_to_session *bcts) 1870 struct nfsd4_bind_conn_to_session *bcts)
1855{ 1871{
1856 __be32 status; 1872 __be32 status;
1873 struct nfsd4_conn *conn;
1857 1874
1858 if (!nfsd4_last_compound_op(rqstp)) 1875 if (!nfsd4_last_compound_op(rqstp))
1859 return nfserr_not_only_op; 1876 return nfserr_not_only_op;
@@ -1870,9 +1887,13 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1870 return nfserr_badsession; 1887 return nfserr_badsession;
1871 1888
1872 status = nfsd4_map_bcts_dir(&bcts->dir); 1889 status = nfsd4_map_bcts_dir(&bcts->dir);
1873 if (!status) 1890 if (status)
1874 nfsd4_new_conn(rqstp, cstate->session, bcts->dir); 1891 return status;
1875 return status; 1892 conn = alloc_conn(rqstp, bcts->dir);
1893 if (!conn)
1894 return nfserr_jukebox;
1895 nfsd4_init_conn(rqstp, conn, cstate->session);
1896 return nfs_ok;
1876} 1897}
1877 1898
1878static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1899static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
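nfsd4_bind_conn_to_session gets the same correction: the old code called nfsd4_new_conn() only when the dir mapping succeeded, then returned the mapping status regardless, so an allocation failure inside nfsd4_new_conn() was silently reported as success. Now each step's failure is surfaced:

    status = nfsd4_map_bcts_dir(&bcts->dir);
    if (status)
            return status;
    conn = alloc_conn(rqstp, bcts->dir);
    if (!conn)
            return nfserr_jukebox;          /* previously lost */
    nfsd4_init_conn(rqstp, conn, cstate->session);
    return nfs_ok;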
@@ -2085,8 +2106,8 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2085 __be32 status = 0; 2106 __be32 status = 0;
2086 2107
2087 nfs4_lock_state(); 2108 nfs4_lock_state();
2088 unconf = find_unconfirmed_client(&dc->clientid); 2109 unconf = find_unconfirmed_client(&dc->clientid, true);
2089 conf = find_confirmed_client(&dc->clientid); 2110 conf = find_confirmed_client(&dc->clientid, true);
2090 2111
2091 if (conf) { 2112 if (conf) {
2092 clp = conf; 2113 clp = conf;
@@ -2200,10 +2221,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2200 copy_clid(new, conf); 2221 copy_clid(new, conf);
2201 else /* case 4 (new client) or cases 2, 3 (client reboot): */ 2222 else /* case 4 (new client) or cases 2, 3 (client reboot): */
2202 gen_clid(new); 2223 gen_clid(new);
2203 /*
2204 * XXX: we should probably set this at creation time, and check
2205 * for consistent minorversion use throughout:
2206 */
2207 new->cl_minorversion = 0; 2224 new->cl_minorversion = 0;
2208 gen_callback(new, setclid, rqstp); 2225 gen_callback(new, setclid, rqstp);
2209 add_to_unconfirmed(new, strhashval); 2226 add_to_unconfirmed(new, strhashval);
@@ -2232,8 +2249,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2232 return nfserr_stale_clientid; 2249 return nfserr_stale_clientid;
2233 nfs4_lock_state(); 2250 nfs4_lock_state();
2234 2251
2235 conf = find_confirmed_client(clid); 2252 conf = find_confirmed_client(clid, false);
2236 unconf = find_unconfirmed_client(clid); 2253 unconf = find_unconfirmed_client(clid, false);
2237 /* 2254 /*
2238 * We try hard to give out unique clientid's, so if we get an 2255 * We try hard to give out unique clientid's, so if we get an
2239 * attempt to confirm the same clientid with a different cred, 2256 * attempt to confirm the same clientid with a different cred,
@@ -2262,10 +2279,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2262 unsigned int hash = clientstr_hashval(unconf->cl_recdir); 2279 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
2263 2280
2264 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); 2281 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
2265 if (conf) { 2282 if (conf)
2266 nfsd4_client_record_remove(conf);
2267 expire_client(conf); 2283 expire_client(conf);
2268 }
2269 move_to_confirmed(unconf); 2284 move_to_confirmed(unconf);
2270 nfsd4_probe_callback(unconf); 2285 nfsd4_probe_callback(unconf);
2271 } 2286 }
@@ -2447,16 +2462,20 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
2447} 2462}
2448 2463
2449static struct nfs4_openowner * 2464static struct nfs4_openowner *
2450find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) 2465find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions)
2451{ 2466{
2452 struct nfs4_stateowner *so; 2467 struct nfs4_stateowner *so;
2453 struct nfs4_openowner *oo; 2468 struct nfs4_openowner *oo;
2469 struct nfs4_client *clp;
2454 2470
2455 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { 2471 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
2456 if (!so->so_is_open_owner) 2472 if (!so->so_is_open_owner)
2457 continue; 2473 continue;
2458 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2474 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2459 oo = openowner(so); 2475 oo = openowner(so);
2476 clp = oo->oo_owner.so_client;
2477 if ((bool)clp->cl_minorversion != sessions)
2478 return NULL;
2460 renew_client(oo->oo_owner.so_client); 2479 renew_client(oo->oo_owner.so_client);
2461 return oo; 2480 return oo;
2462 } 2481 }
@@ -2600,10 +2619,10 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2600 return nfserr_jukebox; 2619 return nfserr_jukebox;
2601 2620
2602 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 2621 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2603 oo = find_openstateowner_str(strhashval, open); 2622 oo = find_openstateowner_str(strhashval, open, cstate->minorversion);
2604 open->op_openowner = oo; 2623 open->op_openowner = oo;
2605 if (!oo) { 2624 if (!oo) {
2606 clp = find_confirmed_client(clientid); 2625 clp = find_confirmed_client(clientid, cstate->minorversion);
2607 if (clp == NULL) 2626 if (clp == NULL)
2608 return nfserr_expired; 2627 return nfserr_expired;
2609 goto new_owner; 2628 goto new_owner;
@@ -2705,11 +2724,6 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st
2705 return nfs_ok; 2724 return nfs_ok;
2706} 2725}
2707 2726
2708static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
2709{
2710 kmem_cache_free(stateid_slab, s);
2711}
2712
2713static inline int nfs4_access_to_access(u32 nfs4_access) 2727static inline int nfs4_access_to_access(u32 nfs4_access)
2714{ 2728{
2715 int flags = 0; 2729 int flags = 0;
@@ -3087,7 +3101,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
3087 if (open->op_file) 3101 if (open->op_file)
3088 nfsd4_free_file(open->op_file); 3102 nfsd4_free_file(open->op_file);
3089 if (open->op_stp) 3103 if (open->op_stp)
3090 nfs4_free_stateid(open->op_stp); 3104 free_generic_stateid(open->op_stp);
3091} 3105}
3092 3106
3093__be32 3107__be32
@@ -3104,7 +3118,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3104 status = nfserr_stale_clientid; 3118 status = nfserr_stale_clientid;
3105 if (STALE_CLIENTID(clid, nn)) 3119 if (STALE_CLIENTID(clid, nn))
3106 goto out; 3120 goto out;
3107 clp = find_confirmed_client(clid); 3121 clp = find_confirmed_client(clid, cstate->minorversion);
3108 status = nfserr_expired; 3122 status = nfserr_expired;
3109 if (clp == NULL) { 3123 if (clp == NULL) {
3110 /* We assume the client took too long to RENEW. */ 3124 /* We assume the client took too long to RENEW. */
@@ -3180,7 +3194,6 @@ nfs4_laundromat(void)
3180 clp = list_entry(pos, struct nfs4_client, cl_lru); 3194 clp = list_entry(pos, struct nfs4_client, cl_lru);
3181 dprintk("NFSD: purging unused client (clientid %08x)\n", 3195 dprintk("NFSD: purging unused client (clientid %08x)\n",
3182 clp->cl_clientid.cl_id); 3196 clp->cl_clientid.cl_id);
3183 nfsd4_client_record_remove(clp);
3184 expire_client(clp); 3197 expire_client(clp);
3185 } 3198 }
3186 spin_lock(&recall_lock); 3199 spin_lock(&recall_lock);
@@ -3372,7 +3385,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3372 return nfs_ok; 3385 return nfs_ok;
3373} 3386}
3374 3387
3375static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) 3388static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions)
3376{ 3389{
3377 struct nfs4_client *cl; 3390 struct nfs4_client *cl;
3378 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 3391 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
@@ -3381,7 +3394,7 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, s
3381 return nfserr_bad_stateid; 3394 return nfserr_bad_stateid;
3382 if (STALE_STATEID(stateid, nn)) 3395 if (STALE_STATEID(stateid, nn))
3383 return nfserr_stale_stateid; 3396 return nfserr_stale_stateid;
3384 cl = find_confirmed_client(&stateid->si_opaque.so_clid); 3397 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions);
3385 if (!cl) 3398 if (!cl)
3386 return nfserr_expired; 3399 return nfserr_expired;
3387 *s = find_stateid_by_type(cl, stateid, typemask); 3400 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3414,7 +3427,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3414 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3415 return check_special_stateids(net, current_fh, stateid, flags); 3428 return check_special_stateids(net, current_fh, stateid, flags);
3416 3429
3417 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); 3430 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion);
3418 if (status) 3431 if (status)
3419 return status; 3432 return status;
3420 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 3433 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3564,7 +3577,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3564 seqid, STATEID_VAL(stateid)); 3577 seqid, STATEID_VAL(stateid));
3565 3578
3566 *stpp = NULL; 3579 *stpp = NULL;
3567 status = nfsd4_lookup_stateid(stateid, typemask, &s); 3580 status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion);
3568 if (status) 3581 if (status)
3569 return status; 3582 return status;
3570 *stpp = openlockstateid(s); 3583 *stpp = openlockstateid(s);
@@ -3765,6 +3778,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3765 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 3778 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3766 3779
3767 nfsd4_close_open_stateid(stp); 3780 nfsd4_close_open_stateid(stp);
3781 release_last_closed_stateid(oo);
3768 oo->oo_last_closed_stid = stp; 3782 oo->oo_last_closed_stid = stp;
3769 3783
3770 if (list_empty(&oo->oo_owner.so_stateids)) { 3784 if (list_empty(&oo->oo_owner.so_stateids)) {
@@ -3801,7 +3815,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3801 inode = cstate->current_fh.fh_dentry->d_inode; 3815 inode = cstate->current_fh.fh_dentry->d_inode;
3802 3816
3803 nfs4_lock_state(); 3817 nfs4_lock_state();
3804 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); 3818 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion);
3805 if (status) 3819 if (status)
3806 goto out; 3820 goto out;
3807 dp = delegstateid(s); 3821 dp = delegstateid(s);
@@ -4045,8 +4059,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4045 struct nfs4_lockowner *lock_sop = NULL; 4059 struct nfs4_lockowner *lock_sop = NULL;
4046 struct nfs4_ol_stateid *lock_stp; 4060 struct nfs4_ol_stateid *lock_stp;
4047 struct file *filp = NULL; 4061 struct file *filp = NULL;
4048 struct file_lock file_lock; 4062 struct file_lock *file_lock = NULL;
4049 struct file_lock conflock; 4063 struct file_lock *conflock = NULL;
4050 __be32 status = 0; 4064 __be32 status = 0;
4051 bool new_state = false; 4065 bool new_state = false;
4052 int lkflg; 4066 int lkflg;
@@ -4116,21 +4130,28 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4116 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) 4130 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim)
4117 goto out; 4131 goto out;
4118 4132
4119 locks_init_lock(&file_lock); 4133 file_lock = locks_alloc_lock();
4134 if (!file_lock) {
4135 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4136 status = nfserr_jukebox;
4137 goto out;
4138 }
4139
4140 locks_init_lock(file_lock);
4120 switch (lock->lk_type) { 4141 switch (lock->lk_type) {
4121 case NFS4_READ_LT: 4142 case NFS4_READ_LT:
4122 case NFS4_READW_LT: 4143 case NFS4_READW_LT:
4123 filp = find_readable_file(lock_stp->st_file); 4144 filp = find_readable_file(lock_stp->st_file);
4124 if (filp) 4145 if (filp)
4125 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); 4146 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
4126 file_lock.fl_type = F_RDLCK; 4147 file_lock->fl_type = F_RDLCK;
4127 break; 4148 break;
4128 case NFS4_WRITE_LT: 4149 case NFS4_WRITE_LT:
4129 case NFS4_WRITEW_LT: 4150 case NFS4_WRITEW_LT:
4130 filp = find_writeable_file(lock_stp->st_file); 4151 filp = find_writeable_file(lock_stp->st_file);
4131 if (filp) 4152 if (filp)
4132 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); 4153 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
4133 file_lock.fl_type = F_WRLCK; 4154 file_lock->fl_type = F_WRLCK;
4134 break; 4155 break;
4135 default: 4156 default:
4136 status = nfserr_inval; 4157 status = nfserr_inval;
@@ -4140,22 +4161,23 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4140 status = nfserr_openmode; 4161 status = nfserr_openmode;
4141 goto out; 4162 goto out;
4142 } 4163 }
4143 file_lock.fl_owner = (fl_owner_t)lock_sop; 4164 file_lock->fl_owner = (fl_owner_t)lock_sop;
4144 file_lock.fl_pid = current->tgid; 4165 file_lock->fl_pid = current->tgid;
4145 file_lock.fl_file = filp; 4166 file_lock->fl_file = filp;
4146 file_lock.fl_flags = FL_POSIX; 4167 file_lock->fl_flags = FL_POSIX;
4147 file_lock.fl_lmops = &nfsd_posix_mng_ops; 4168 file_lock->fl_lmops = &nfsd_posix_mng_ops;
4148 4169 file_lock->fl_start = lock->lk_offset;
4149 file_lock.fl_start = lock->lk_offset; 4170 file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
4150 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); 4171 nfs4_transform_lock_offset(file_lock);
4151 nfs4_transform_lock_offset(&file_lock); 4172
4152 4173 conflock = locks_alloc_lock();
4153 /* 4174 if (!conflock) {
4154 * Try to lock the file in the VFS. 4175 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4155 * Note: locks.c uses the BKL to protect the inode's lock list. 4176 status = nfserr_jukebox;
4156 */ 4177 goto out;
4178 }
4157 4179
4158 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); 4180 err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
4159 switch (-err) { 4181 switch (-err) {
4160 case 0: /* success! */ 4182 case 0: /* success! */
4161 update_stateid(&lock_stp->st_stid.sc_stateid); 4183 update_stateid(&lock_stp->st_stid.sc_stateid);
@@ -4166,7 +4188,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4166 case (EAGAIN): /* conflock holds conflicting lock */ 4188 case (EAGAIN): /* conflock holds conflicting lock */
4167 status = nfserr_denied; 4189 status = nfserr_denied;
4168 dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); 4190 dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
4169 nfs4_set_lock_denied(&conflock, &lock->lk_denied); 4191 nfs4_set_lock_denied(conflock, &lock->lk_denied);
4170 break; 4192 break;
4171 case (EDEADLK): 4193 case (EDEADLK):
4172 status = nfserr_deadlock; 4194 status = nfserr_deadlock;
@@ -4181,6 +4203,10 @@ out:
4181 release_lockowner(lock_sop); 4203 release_lockowner(lock_sop);
4182 if (!cstate->replay_owner) 4204 if (!cstate->replay_owner)
4183 nfs4_unlock_state(); 4205 nfs4_unlock_state();
4206 if (file_lock)
4207 locks_free_lock(file_lock);
4208 if (conflock)
4209 locks_free_lock(conflock);
4184 return status; 4210 return status;
4185} 4211}
4186 4212
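All three lock paths (LOCK above, LOCKT and LOCKU below) switch from a stack struct file_lock to locks_alloc_lock(). struct file_lock is large, and once handed to the lock manager it can be reached again via the fl_lmops callbacks, so stack storage was fragile; an allocation failure maps to nfserr_jukebox (NFS4ERR_DELAY), telling the client to retry. The conversion's shape:

    struct file_lock *fl = locks_alloc_lock();
    if (!fl)
            return nfserr_jukebox;          /* client retries later */
    locks_init_lock(fl);
    fl->fl_type = F_RDLCK;                  /* plus owner, range, flags... */
    err = vfs_lock_file(filp, F_SETLK, fl, NULL);
    locks_free_lock(fl);                    /* the VFS keeps its own copy */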
@@ -4209,7 +4235,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4209 struct nfsd4_lockt *lockt) 4235 struct nfsd4_lockt *lockt)
4210{ 4236{
4211 struct inode *inode; 4237 struct inode *inode;
4212 struct file_lock file_lock; 4238 struct file_lock *file_lock = NULL;
4213 struct nfs4_lockowner *lo; 4239 struct nfs4_lockowner *lo;
4214 __be32 status; 4240 __be32 status;
4215 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4241 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
@@ -4230,15 +4256,21 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4230 goto out; 4256 goto out;
4231 4257
4232 inode = cstate->current_fh.fh_dentry->d_inode; 4258 inode = cstate->current_fh.fh_dentry->d_inode;
4233 locks_init_lock(&file_lock); 4259 file_lock = locks_alloc_lock();
4260 if (!file_lock) {
4261 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4262 status = nfserr_jukebox;
4263 goto out;
4264 }
4265 locks_init_lock(file_lock);
4234 switch (lockt->lt_type) { 4266 switch (lockt->lt_type) {
4235 case NFS4_READ_LT: 4267 case NFS4_READ_LT:
4236 case NFS4_READW_LT: 4268 case NFS4_READW_LT:
4237 file_lock.fl_type = F_RDLCK; 4269 file_lock->fl_type = F_RDLCK;
4238 break; 4270 break;
4239 case NFS4_WRITE_LT: 4271 case NFS4_WRITE_LT:
4240 case NFS4_WRITEW_LT: 4272 case NFS4_WRITEW_LT:
4241 file_lock.fl_type = F_WRLCK; 4273 file_lock->fl_type = F_WRLCK;
4242 break; 4274 break;
4243 default: 4275 default:
4244 dprintk("NFSD: nfs4_lockt: bad lock type!\n"); 4276 dprintk("NFSD: nfs4_lockt: bad lock type!\n");
@@ -4248,25 +4280,27 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4248 4280
4249 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); 4281 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
4250 if (lo) 4282 if (lo)
4251 file_lock.fl_owner = (fl_owner_t)lo; 4283 file_lock->fl_owner = (fl_owner_t)lo;
4252 file_lock.fl_pid = current->tgid; 4284 file_lock->fl_pid = current->tgid;
4253 file_lock.fl_flags = FL_POSIX; 4285 file_lock->fl_flags = FL_POSIX;
4254 4286
4255 file_lock.fl_start = lockt->lt_offset; 4287 file_lock->fl_start = lockt->lt_offset;
4256 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); 4288 file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
4257 4289
4258 nfs4_transform_lock_offset(&file_lock); 4290 nfs4_transform_lock_offset(file_lock);
4259 4291
4260 status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); 4292 status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock);
4261 if (status) 4293 if (status)
4262 goto out; 4294 goto out;
4263 4295
4264 if (file_lock.fl_type != F_UNLCK) { 4296 if (file_lock->fl_type != F_UNLCK) {
4265 status = nfserr_denied; 4297 status = nfserr_denied;
4266 nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); 4298 nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
4267 } 4299 }
4268out: 4300out:
4269 nfs4_unlock_state(); 4301 nfs4_unlock_state();
4302 if (file_lock)
4303 locks_free_lock(file_lock);
4270 return status; 4304 return status;
4271} 4305}
4272 4306
@@ -4276,7 +4310,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4276{ 4310{
4277 struct nfs4_ol_stateid *stp; 4311 struct nfs4_ol_stateid *stp;
4278 struct file *filp = NULL; 4312 struct file *filp = NULL;
4279 struct file_lock file_lock; 4313 struct file_lock *file_lock = NULL;
4280 __be32 status; 4314 __be32 status;
4281 int err; 4315 int err;
4282 4316
@@ -4298,23 +4332,29 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4298 status = nfserr_lock_range; 4332 status = nfserr_lock_range;
4299 goto out; 4333 goto out;
4300 } 4334 }
4301 BUG_ON(!filp); 4335 file_lock = locks_alloc_lock();
4302 locks_init_lock(&file_lock); 4336 if (!file_lock) {
4303 file_lock.fl_type = F_UNLCK; 4337 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4304 file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); 4338 status = nfserr_jukebox;
4305 file_lock.fl_pid = current->tgid; 4339 goto out;
4306 file_lock.fl_file = filp; 4340 }
4307 file_lock.fl_flags = FL_POSIX; 4341 locks_init_lock(file_lock);
4308 file_lock.fl_lmops = &nfsd_posix_mng_ops; 4342 file_lock->fl_type = F_UNLCK;
4309 file_lock.fl_start = locku->lu_offset; 4343 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4310 4344 file_lock->fl_pid = current->tgid;
4311 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); 4345 file_lock->fl_file = filp;
4312 nfs4_transform_lock_offset(&file_lock); 4346 file_lock->fl_flags = FL_POSIX;
4347 file_lock->fl_lmops = &nfsd_posix_mng_ops;
4348 file_lock->fl_start = locku->lu_offset;
4349
4350 file_lock->fl_end = last_byte_offset(locku->lu_offset,
4351 locku->lu_length);
4352 nfs4_transform_lock_offset(file_lock);
4313 4353
4314 /* 4354 /*
4315 * Try to unlock the file in the VFS. 4355 * Try to unlock the file in the VFS.
4316 */ 4356 */
4317 err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL); 4357 err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
4318 if (err) { 4358 if (err) {
4319 dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); 4359 dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
4320 goto out_nfserr; 4360 goto out_nfserr;
@@ -4328,6 +4368,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4328out: 4368out:
4329 if (!cstate->replay_owner) 4369 if (!cstate->replay_owner)
4330 nfs4_unlock_state(); 4370 nfs4_unlock_state();
4371 if (file_lock)
4372 locks_free_lock(file_lock);
4331 return status; 4373 return status;
4332 4374
4333out_nfserr: 4375out_nfserr:
@@ -4501,12 +4543,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
4501* Called from OPEN. Look for clientid in reclaim list. 4543* Called from OPEN. Look for clientid in reclaim list.
4502*/ 4544*/
4503__be32 4545__be32
4504nfs4_check_open_reclaim(clientid_t *clid) 4546nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
4505{ 4547{
4506 struct nfs4_client *clp; 4548 struct nfs4_client *clp;
4507 4549
4508 /* find clientid in conf_id_hashtbl */ 4550 /* find clientid in conf_id_hashtbl */
4509 clp = find_confirmed_client(clid); 4551 clp = find_confirmed_client(clid, sessions);
4510 if (clp == NULL) 4552 if (clp == NULL)
4511 return nfserr_reclaim_bad; 4553 return nfserr_reclaim_bad;
4512 4554
@@ -4522,7 +4564,6 @@ void nfsd_forget_clients(u64 num)
4522 4564
4523 nfs4_lock_state(); 4565 nfs4_lock_state();
4524 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { 4566 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
4525 nfsd4_client_record_remove(clp);
4526 expire_client(clp); 4567 expire_client(clp);
4527 if (++count == num) 4568 if (++count == num)
4528 break; 4569 break;
@@ -4582,7 +4623,7 @@ void nfsd_forget_openowners(u64 num)
4582 printk(KERN_INFO "NFSD: Forgot %d open owners", count); 4623 printk(KERN_INFO "NFSD: Forgot %d open owners", count);
4583} 4624}
4584 4625
4585int nfsd_process_n_delegations(u64 num, struct list_head *list) 4626static int nfsd_process_n_delegations(u64 num, struct list_head *list)
4586{ 4627{
4587 int i, count = 0; 4628 int i, count = 0;
4588 struct nfs4_file *fp, *fnext; 4629 struct nfs4_file *fp, *fnext;
@@ -4747,11 +4788,11 @@ __nfs4_state_shutdown(void)
4747 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 4788 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4748 while (!list_empty(&conf_id_hashtbl[i])) { 4789 while (!list_empty(&conf_id_hashtbl[i])) {
4749 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); 4790 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4750 expire_client(clp); 4791 destroy_client(clp);
4751 } 4792 }
4752 while (!list_empty(&unconf_str_hashtbl[i])) { 4793 while (!list_empty(&unconf_str_hashtbl[i])) {
4753 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash); 4794 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
4754 expire_client(clp); 4795 destroy_client(clp);
4755 } 4796 }
4756 } 4797 }
4757 INIT_LIST_HEAD(&reaplist); 4798 INIT_LIST_HEAD(&reaplist);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6322df36031f..fd548d155088 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2659,7 +2659,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
2659 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); 2659 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2660 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); 2660 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2661 WRITE32(bcts->dir); 2661 WRITE32(bcts->dir);
2662 /* XXX: ? */ 2662 /* Sorry, we do not yet support RDMA over 4.1: */
2663 WRITE32(0); 2663 WRITE32(0);
2664 ADJUST_ARGS(); 2664 ADJUST_ARGS();
2665 } 2665 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index fa49cff5ee65..dab350dfc376 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -406,7 +406,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
406 return rv; 406 return rv;
407 if (newthreads < 0) 407 if (newthreads < 0)
408 return -EINVAL; 408 return -EINVAL;
409 rv = nfsd_svc(NFS_PORT, newthreads); 409 rv = nfsd_svc(newthreads);
410 if (rv < 0) 410 if (rv < 0)
411 return rv; 411 return rv;
412 } else 412 } else
@@ -683,25 +683,6 @@ static ssize_t __write_ports_addfd(char *buf)
683} 683}
684 684
685/* 685/*
686 * A '-' followed by the 'name' of a socket means we close the socket.
687 */
688static ssize_t __write_ports_delfd(char *buf)
689{
690 char *toclose;
691 int len = 0;
692
693 toclose = kstrdup(buf + 1, GFP_KERNEL);
694 if (toclose == NULL)
695 return -ENOMEM;
696
697 if (nfsd_serv != NULL)
698 len = svc_sock_names(nfsd_serv, buf,
699 SIMPLE_TRANSACTION_LIMIT, toclose);
700 kfree(toclose);
701 return len;
702}
703
704/*
705 * A transport listener is added by writing its transport name and 686 * A transport listener is added by writing its transport name and
706 * a port number. 687 * a port number.
707 */ 688 */
@@ -712,7 +693,7 @@ static ssize_t __write_ports_addxprt(char *buf)
712 int port, err; 693 int port, err;
713 struct net *net = &init_net; 694 struct net *net = &init_net;
714 695
715 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 696 if (sscanf(buf, "%15s %5u", transport, &port) != 2)
716 return -EINVAL; 697 return -EINVAL;
717 698
718 if (port < 1 || port > USHRT_MAX) 699 if (port < 1 || port > USHRT_MAX)
@@ -746,31 +727,6 @@ out_err:
746 return err; 727 return err;
747} 728}
748 729
749/*
750 * A transport listener is removed by writing a "-", its transport
751 * name, and its port number.
752 */
753static ssize_t __write_ports_delxprt(char *buf)
754{
755 struct svc_xprt *xprt;
756 char transport[16];
757 int port;
758
759 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
760 return -EINVAL;
761
762 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
763 return -EINVAL;
764
765 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
766 if (xprt == NULL)
767 return -ENOTCONN;
768
769 svc_close_xprt(xprt);
770 svc_xprt_put(xprt);
771 return 0;
772}
773
774static ssize_t __write_ports(struct file *file, char *buf, size_t size) 730static ssize_t __write_ports(struct file *file, char *buf, size_t size)
775{ 731{
776 if (size == 0) 732 if (size == 0)
@@ -779,15 +735,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
779 if (isdigit(buf[0])) 735 if (isdigit(buf[0]))
780 return __write_ports_addfd(buf); 736 return __write_ports_addfd(buf);
781 737
782 if (buf[0] == '-' && isdigit(buf[1]))
783 return __write_ports_delfd(buf);
784
785 if (isalpha(buf[0])) 738 if (isalpha(buf[0]))
786 return __write_ports_addxprt(buf); 739 return __write_ports_addxprt(buf);
787 740
788 if (buf[0] == '-' && isalpha(buf[1]))
789 return __write_ports_delxprt(buf);
790
791 return -EINVAL; 741 return -EINVAL;
792} 742}
793 743
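With the "-" forms gone, the portlist interface is add-only: a leading digit adds an already-open socket by file descriptor, a leading letter adds a named transport. Note the quiet sscanf fix in __write_ports_addxprt() as well: a %4u field width truncated five-digit port numbers (65535 parsed as 6553, with sscanf still returning 2), so %5u is needed to cover the full range ahead of the explicit port < 1 || port > USHRT_MAX check:

    char transport[16];
    unsigned int port;

    /* "%15s %5u": up to 15 name chars plus NUL, up to 5 port digits */
    if (sscanf(buf, "%15s %5u", transport, &port) != 2)
            return -EINVAL;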
@@ -825,21 +775,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
825 * OR 775 * OR
826 * 776 *
827 * Input: 777 * Input:
828 * buf: C string containing a "-" followed
829 * by an integer value representing a
830 * previously passed in socket file
831 * descriptor
832 * size: non-zero length of C string in @buf
833 * Output:
834 * On success: NFS service no longer listens on that socket;
835 * passed-in buffer filled with a '\n'-terminated C
836 * string containing a unique name of the listener;
837 * return code is the size in bytes of the string
838 * On error: return code is a negative errno value
839 *
840 * OR
841 *
842 * Input:
843 * buf: C string containing a transport 778 * buf: C string containing a transport
844 * name and an unsigned integer value 779 * name and an unsigned integer value
845 * representing the port to listen on, 780 * representing the port to listen on,
@@ -848,19 +783,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
848 * Output: 783 * Output:
849 * On success: returns zero; NFS service is started 784 * On success: returns zero; NFS service is started
850 * On error: return code is a negative errno value 785 * On error: return code is a negative errno value
851 *
852 * OR
853 *
854 * Input:
855 * buf: C string containing a "-" followed
856 * by a transport name and an unsigned
857 * integer value representing the port
858 * to listen on, separated by whitespace
859 * size: non-zero length of C string in @buf
860 * Output:
861 * On success: returns zero; NFS service no longer listens
862 * on that transport
863 * On error: return code is a negative errno value
864 */ 786 */
865static ssize_t write_ports(struct file *file, char *buf, size_t size) 787static ssize_t write_ports(struct file *file, char *buf, size_t size)
866{ 788{
@@ -1008,8 +930,6 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1008 return nfsd4_write_time(file, buf, size, &nfsd4_grace); 930 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1009} 931}
1010 932
1011extern char *nfs4_recoverydir(void);
1012
1013static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) 933static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1014{ 934{
1015 char *mesg = buf; 935 char *mesg = buf;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 2244222368ab..80d5ce40aadb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -65,7 +65,7 @@ extern const struct seq_operations nfs_exports_op;
65/* 65/*
66 * Function prototypes. 66 * Function prototypes.
67 */ 67 */
68int nfsd_svc(unsigned short port, int nrservs); 68int nfsd_svc(int nrservs);
69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); 69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
70 70
71int nfsd_nrthreads(void); 71int nfsd_nrthreads(void);
@@ -124,6 +124,7 @@ int nfs4_state_start(void);
124void nfs4_state_shutdown(void); 124void nfs4_state_shutdown(void);
125void nfs4_reset_lease(time_t leasetime); 125void nfs4_reset_lease(time_t leasetime);
126int nfs4_reset_recoverydir(char *recdir); 126int nfs4_reset_recoverydir(char *recdir);
127char * nfs4_recoverydir(void);
127#else 128#else
128static inline void nfs4_state_init(void) { } 129static inline void nfs4_state_init(void) { }
129static inline int nfsd4_init_slabs(void) { return 0; } 130static inline int nfsd4_init_slabs(void) { return 0; }
@@ -132,6 +133,7 @@ static inline int nfs4_state_start(void) { return 0; }
132static inline void nfs4_state_shutdown(void) { } 133static inline void nfs4_state_shutdown(void) { }
133static inline void nfs4_reset_lease(time_t leasetime) { } 134static inline void nfs4_reset_lease(time_t leasetime) { }
134static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 135static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
136static inline char * nfs4_recoverydir(void) {return NULL; }
135#endif 137#endif
136 138
137/* 139/*
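Rather than an ad-hoc extern buried in nfsctl.c, nfs4_recoverydir() is now declared once here beside the rest of the NFSv4 state interface, with a static inline stub for !CONFIG_NFSD_V4 builds so callers need no #ifdefs of their own. The standard pattern:

    #ifdef CONFIG_NFSD_V4
    char *nfs4_recoverydir(void);
    #else
    static inline char *nfs4_recoverydir(void) { return NULL; }
    #endif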
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 240473cb708f..2013aa001dab 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -183,18 +183,18 @@ int nfsd_nrthreads(void)
183 return rv; 183 return rv;
184} 184}
185 185
186static int nfsd_init_socks(int port) 186static int nfsd_init_socks(void)
187{ 187{
188 int error; 188 int error;
189 if (!list_empty(&nfsd_serv->sv_permsocks)) 189 if (!list_empty(&nfsd_serv->sv_permsocks))
190 return 0; 190 return 0;
191 191
192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port, 192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT,
193 SVC_SOCK_DEFAULTS); 193 SVC_SOCK_DEFAULTS);
194 if (error < 0) 194 if (error < 0)
195 return error; 195 return error;
196 196
197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port, 197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT,
198 SVC_SOCK_DEFAULTS); 198 SVC_SOCK_DEFAULTS);
199 if (error < 0) 199 if (error < 0)
200 return error; 200 return error;
@@ -204,7 +204,7 @@ static int nfsd_init_socks(int port)
204 204
205static bool nfsd_up = false; 205static bool nfsd_up = false;
206 206
207static int nfsd_startup(unsigned short port, int nrservs) 207static int nfsd_startup(int nrservs)
208{ 208{
209 int ret; 209 int ret;
210 210
@@ -218,7 +218,7 @@ static int nfsd_startup(unsigned short port, int nrservs)
218 ret = nfsd_racache_init(2*nrservs); 218 ret = nfsd_racache_init(2*nrservs);
219 if (ret) 219 if (ret)
220 return ret; 220 return ret;
221 ret = nfsd_init_socks(port); 221 ret = nfsd_init_socks();
222 if (ret) 222 if (ret)
223 goto out_racache; 223 goto out_racache;
224 ret = lockd_up(&init_net); 224 ret = lockd_up(&init_net);
@@ -436,7 +436,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
436 * this is the first time nrservs is nonzero. 436 * this is the first time nrservs is nonzero.
437 */ 437 */
438int 438int
439nfsd_svc(unsigned short port, int nrservs) 439nfsd_svc(int nrservs)
440{ 440{
441 int error; 441 int error;
442 bool nfsd_up_before; 442 bool nfsd_up_before;
@@ -458,7 +458,7 @@ nfsd_svc(unsigned short port, int nrservs)
458 458
459 nfsd_up_before = nfsd_up; 459 nfsd_up_before = nfsd_up;
460 460
461 error = nfsd_startup(port, nrservs); 461 error = nfsd_startup(nrservs);
462 if (error) 462 if (error)
463 goto out_destroy; 463 goto out_destroy;
464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
@@ -487,7 +487,7 @@ static int
487nfsd(void *vrqstp) 487nfsd(void *vrqstp)
488{ 488{
489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
490 int err, preverr = 0; 490 int err;
491 491
492 /* Lock module and set up kernel thread */ 492 /* Lock module and set up kernel thread */
493 mutex_lock(&nfsd_mutex); 493 mutex_lock(&nfsd_mutex);
@@ -534,16 +534,6 @@ nfsd(void *vrqstp)
534 ; 534 ;
535 if (err == -EINTR) 535 if (err == -EINTR)
536 break; 536 break;
537 else if (err < 0) {
538 if (err != preverr) {
539 printk(KERN_WARNING "%s: unexpected error "
540 "from svc_recv (%d)\n", __func__, -err);
541 preverr = err;
542 }
543 schedule_timeout_uninterruptible(HZ);
544 continue;
545 }
546
547 validate_process_creds(); 537 validate_process_creds();
548 svc_process(rqstp); 538 svc_process(rqstp);
549 validate_process_creds(); 539 validate_process_creds();
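
The nfssvc.c changes drop two things: the port plumbing (nfsd_init_socks() now always creates its UDP and TCP transports on NFS_PORT) and the preverr branch in the nfsd() thread loop, which used to warn and back off for a second on unexpected svc_recv() errors. After the patch the loop exits only on -EINTR, which implies (an inference of this note, not stated in the hunk) that other errors are retried or absorbed inside svc_recv() itself. Paraphrased skeleton of the resulting loop, assuming the -EAGAIN retry just above the visible context is unchanged:

for (;;) {
	while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
		;			/* benign: wait for work again */
	if (err == -EINTR)
		break;			/* shutting down */
	validate_process_creds();
	svc_process(rqstp);		/* dispatch one NFS request */
	validate_process_creds();
}
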
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 22bd0a66c356..e036894bce57 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -373,11 +373,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
373 return container_of(so, struct nfs4_lockowner, lo_owner); 373 return container_of(so, struct nfs4_lockowner, lo_owner);
374} 374}
375 375
376/* 376/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
377* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
378* o fi_perfile list is used to search for conflicting
379* share_acces, share_deny on the file.
380*/
381struct nfs4_file { 377struct nfs4_file {
382 atomic_t fi_ref; 378 atomic_t fi_ref;
383 struct list_head fi_hash; /* hash by "struct inode *" */ 379 struct list_head fi_hash; /* hash by "struct inode *" */
@@ -459,7 +455,7 @@ extern void nfs4_unlock_state(void);
459extern int nfs4_in_grace(void); 455extern int nfs4_in_grace(void);
460extern void nfs4_release_reclaim(void); 456extern void nfs4_release_reclaim(void);
461extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); 457extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
462extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 458extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions);
463extern void nfs4_free_openowner(struct nfs4_openowner *); 459extern void nfs4_free_openowner(struct nfs4_openowner *);
464extern void nfs4_free_lockowner(struct nfs4_lockowner *); 460extern void nfs4_free_lockowner(struct nfs4_lockowner *);
465extern int set_callback_cred(void); 461extern int set_callback_cred(void);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3f67b8e12251..c120b48ec305 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1581,7 +1581,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1581 */ 1581 */
1582 1582
1583 oldfs = get_fs(); set_fs(KERNEL_DS); 1583 oldfs = get_fs(); set_fs(KERNEL_DS);
1584 host_err = inode->i_op->readlink(path.dentry, buf, *lenp); 1584 host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
1585 set_fs(oldfs); 1585 set_fs(oldfs);
1586 1586
1587 if (host_err < 0) 1587 if (host_err < 0)
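
The vfs.c change is an annotation fix rather than a behavioural one: buf is a kernel buffer, but ->readlink() is declared to take a user pointer, and the surrounding set_fs(KERNEL_DS) window is what makes passing a kernel address legitimate. The pattern, with the moving parts commented:

mm_segment_t oldfs = get_fs();	/* remember the caller's limit */
set_fs(KERNEL_DS);		/* uaccess checks now admit kernel addresses */
host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
set_fs(oldfs);			/* always restore before returning */
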
diff --git a/fs/open.c b/fs/open.c
index 44da0feeca2c..59071f55bf7f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -478,7 +478,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
478 478
479 file = fget(fd); 479 file = fget(fd);
480 if (file) { 480 if (file) {
481 audit_inode(NULL, file->f_path.dentry); 481 audit_inode(NULL, file->f_path.dentry, 0);
482 err = chmod_common(&file->f_path, mode); 482 err = chmod_common(&file->f_path, mode);
483 fput(file); 483 fput(file);
484 } 484 }
@@ -588,7 +588,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
588 error = mnt_want_write_file(f.file); 588 error = mnt_want_write_file(f.file);
589 if (error) 589 if (error)
590 goto out_fput; 590 goto out_fput;
591 audit_inode(NULL, f.file->f_path.dentry); 591 audit_inode(NULL, f.file->f_path.dentry, 0);
592 error = chown_common(&f.file->f_path, user, group); 592 error = chown_common(&f.file->f_path, user, group);
593 mnt_drop_write_file(f.file); 593 mnt_drop_write_file(f.file);
594out_fput: 594out_fput:
@@ -859,6 +859,24 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
859} 859}
860 860
861/** 861/**
862 * file_open_name - open file and return file pointer
863 *
864 * @name: struct filename containing path to open
865 * @flags: open flags as per the open(2) second argument
866 * @mode: mode for the new file if O_CREAT is set, else ignored
867 *
868 * This is the helper to open a file from kernelspace if you really
 869 * have to. But in general you should not do this, so please move
870 * along, nothing to see here..
871 */
872struct file *file_open_name(struct filename *name, int flags, umode_t mode)
873{
874 struct open_flags op;
875 int lookup = build_open_flags(flags, mode, &op);
876 return do_filp_open(AT_FDCWD, name, &op, lookup);
877}
878
879/**
862 * filp_open - open file and return file pointer 880 * filp_open - open file and return file pointer
863 * 881 *
864 * @filename: path to open 882 * @filename: path to open
@@ -871,9 +889,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
871 */ 889 */
872struct file *filp_open(const char *filename, int flags, umode_t mode) 890struct file *filp_open(const char *filename, int flags, umode_t mode)
873{ 891{
874 struct open_flags op; 892 struct filename name = {.name = filename};
875 int lookup = build_open_flags(flags, mode, &op); 893 return file_open_name(&name, flags, mode);
876 return do_filp_open(AT_FDCWD, filename, &op, lookup);
877} 894}
878EXPORT_SYMBOL(filp_open); 895EXPORT_SYMBOL(filp_open);
879 896
@@ -895,7 +912,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
895{ 912{
896 struct open_flags op; 913 struct open_flags op;
897 int lookup = build_open_flags(flags, mode, &op); 914 int lookup = build_open_flags(flags, mode, &op);
898 char *tmp = getname(filename); 915 struct filename *tmp = getname(filename);
899 int fd = PTR_ERR(tmp); 916 int fd = PTR_ERR(tmp);
900 917
901 if (!IS_ERR(tmp)) { 918 if (!IS_ERR(tmp)) {
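
file_open_name() is split out of filp_open() so that callers already holding a struct filename (as getname() now returns) need not flatten it back to a string; filp_open() becomes a thin wrapper that builds a stack filename around its const char *. A hedged usage sketch, with a hypothetical path and abbreviated error handling:

struct filename name = { .name = "/tmp/example" };	/* hypothetical path */
struct file *filp = file_open_name(&name, O_RDONLY, 0);
if (IS_ERR(filp))
	return PTR_ERR(filp);
/* ... read or write through filp ... */
fput(filp);	/* drop the reference when done */
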
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ef5c84be66f9..144a96732dd7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2258,7 +2258,8 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2258 pid_t tgid = task_tgid_nr_ns(current, ns); 2258 pid_t tgid = task_tgid_nr_ns(current, ns);
2259 char *name = ERR_PTR(-ENOENT); 2259 char *name = ERR_PTR(-ENOENT);
2260 if (tgid) { 2260 if (tgid) {
2261 name = __getname(); 2261 /* 11 for max length of signed int in decimal + NULL term */
2262 name = kmalloc(12, GFP_KERNEL);
2262 if (!name) 2263 if (!name)
2263 name = ERR_PTR(-ENOMEM); 2264 name = ERR_PTR(-ENOMEM);
2264 else 2265 else
@@ -2273,7 +2274,7 @@ static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2273{ 2274{
2274 char *s = nd_get_link(nd); 2275 char *s = nd_get_link(nd);
2275 if (!IS_ERR(s)) 2276 if (!IS_ERR(s))
2276 __putname(s); 2277 kfree(s);
2277} 2278}
2278 2279
2279static const struct inode_operations proc_self_inode_operations = { 2280static const struct inode_operations proc_self_inode_operations = {
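
/proc/self stops borrowing a PATH_MAX-sized buffer from the names cache (__getname()/__putname()) and allocates only what a tgid string can need. The 12 in kmalloc(12) is the 11 characters of the widest signed 32-bit decimal plus a terminating NUL, as the new comment says. Worked check of the arithmetic:

char buf[12];	/* "-2147483648" is 11 chars, plus the terminating NUL */
int n = snprintf(buf, sizeof(buf), "%d", INT_MIN);
/* n == 11; a tgid is non-negative, so it fits with room to spare */

Note the matching teardown: proc_self_put_link() must switch from __putname() to kfree(), since the buffer no longer comes from the names cache.
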
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 79827ce03e3b..14df8806ff29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1158,6 +1158,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1158 struct vm_area_struct *vma = v; 1158 struct vm_area_struct *vma = v;
1159 struct numa_maps *md = &numa_priv->md; 1159 struct numa_maps *md = &numa_priv->md;
1160 struct file *file = vma->vm_file; 1160 struct file *file = vma->vm_file;
1161 struct task_struct *task = proc_priv->task;
1161 struct mm_struct *mm = vma->vm_mm; 1162 struct mm_struct *mm = vma->vm_mm;
1162 struct mm_walk walk = {}; 1163 struct mm_walk walk = {};
1163 struct mempolicy *pol; 1164 struct mempolicy *pol;
@@ -1177,9 +1178,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1177 walk.private = md; 1178 walk.private = md;
1178 walk.mm = mm; 1179 walk.mm = mm;
1179 1180
1180 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); 1181 task_lock(task);
1182 pol = get_vma_policy(task, vma, vma->vm_start);
1181 mpol_to_str(buffer, sizeof(buffer), pol, 0); 1183 mpol_to_str(buffer, sizeof(buffer), pol, 0);
1182 mpol_cond_put(pol); 1184 mpol_cond_put(pol);
1185 task_unlock(task);
1183 1186
1184 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1187 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1185 1188
@@ -1189,7 +1192,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1189 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1192 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1190 seq_printf(m, " heap"); 1193 seq_printf(m, " heap");
1191 } else { 1194 } else {
1192 pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); 1195 pid_t tid = vm_is_stack(task, vma, is_pid);
1193 if (tid != 0) { 1196 if (tid != 0) {
1194 /* 1197 /*
1195 * Thread stack in /proc/PID/task/TID/maps or 1198 * Thread stack in /proc/PID/task/TID/maps or
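
The task_mmu.c hunk takes task_lock() around the policy lookup. The plausible rationale (an inference, not spelled out in the hunk): get_vma_policy() can fall back to task->mempolicy when the VMA carries none, and that pointer may be replaced concurrently, so the task lock pins it while mpol_to_str() formats it; mpol_cond_put() then drops the reference only if one was actually taken. The protected region, annotated:

task_lock(task);		/* stabilize task->mempolicy */
pol = get_vma_policy(task, vma, vma->vm_start);
mpol_to_str(buffer, sizeof(buffer), pol, 0);	/* format while pinned */
mpol_cond_put(pol);		/* conditional reference drop */
task_unlock(task);
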
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ff0135d6bc51..af1661f7a54f 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -331,11 +331,11 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
331#ifdef CONFIG_BLOCK 331#ifdef CONFIG_BLOCK
332 struct block_device *bdev; 332 struct block_device *bdev;
333 struct super_block *sb; 333 struct super_block *sb;
334 char *tmp = getname(special); 334 struct filename *tmp = getname(special);
335 335
336 if (IS_ERR(tmp)) 336 if (IS_ERR(tmp))
337 return ERR_CAST(tmp); 337 return ERR_CAST(tmp);
338 bdev = lookup_bdev(tmp); 338 bdev = lookup_bdev(tmp->name);
339 putname(tmp); 339 putname(tmp);
340 if (IS_ERR(bdev)) 340 if (IS_ERR(bdev))
341 return ERR_CAST(bdev); 341 return ERR_CAST(bdev);
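
This is the same getname() conversion as in fs/open.c: the result is now a struct filename *, so the raw string is reached through ->name while putname() still releases the object. The updated calling convention, with do_something_with() standing in for any consumer:

struct filename *tmp = getname(special);	/* special: __user string */
if (IS_ERR(tmp))
	return ERR_CAST(tmp);
do_something_with(tmp->name);	/* ->name is the copied-in kernel string */
putname(tmp);			/* releases the struct and the string */
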
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 46485557cdc6..f27f01a98aa2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1573,8 +1573,10 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1573 reiserfs_warning(sb, "reiserfs-13077", 1573 reiserfs_warning(sb, "reiserfs-13077",
1574 "nfsd/reiserfs, fhtype=%d, len=%d - odd", 1574 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1575 fh_type, fh_len); 1575 fh_type, fh_len);
1576 fh_type = 5; 1576 fh_type = fh_len;
1577 } 1577 }
1578 if (fh_len < 2)
1579 return NULL;
1578 1580
1579 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], 1581 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1580 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); 1582 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
@@ -1583,6 +1585,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1583struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, 1585struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1584 int fh_len, int fh_type) 1586 int fh_len, int fh_type)
1585{ 1587{
1588 if (fh_type > fh_len)
1589 fh_type = fh_len;
1586 if (fh_type < 4) 1590 if (fh_type < 4)
1587 return NULL; 1591 return NULL;
1588 1592
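
Both reiserfs hunks harden NFS file-handle decoding against short or corrupt handles: the advertised fh_type (which implies how many 32-bit words the handle should contain) is clamped to the length actually received, and handles too short to hold even the objectid/generation pair are rejected before fid->raw[] is indexed. The guard, generalized with a hypothetical required_words() helper:

if (fh_type > fh_len)
	fh_type = fh_len;	/* never trust the advertised type */
if (fh_len < required_words(fh_type))	/* required_words(): hypothetical */
	return NULL;		/* too short to decode safely */
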
diff --git a/fs/super.c b/fs/super.c
index a3bc935069d9..12f123712161 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -186,15 +186,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
186 spin_lock_init(&s->s_inode_lru_lock); 186 spin_lock_init(&s->s_inode_lru_lock);
187 INIT_LIST_HEAD(&s->s_mounts); 187 INIT_LIST_HEAD(&s->s_mounts);
188 init_rwsem(&s->s_umount); 188 init_rwsem(&s->s_umount);
189 mutex_init(&s->s_lock);
190 lockdep_set_class(&s->s_umount, &type->s_umount_key); 189 lockdep_set_class(&s->s_umount, &type->s_umount_key);
191 /* 190 /*
192 * The locking rules for s_lock are up to the
193 * filesystem. For example ext3fs has different
194 * lock ordering than usbfs:
195 */
196 lockdep_set_class(&s->s_lock, &type->s_lock_key);
197 /*
198 * sget() can have s_umount recursion. 191 * sget() can have s_umount recursion.
199 * 192 *
200 * When it cannot find a suitable sb, it allocates a new 193 * When it cannot find a suitable sb, it allocates a new
@@ -394,22 +387,6 @@ bool grab_super_passive(struct super_block *sb)
394 return false; 387 return false;
395} 388}
396 389
397/*
398 * Superblock locking. We really ought to get rid of these two.
399 */
400void lock_super(struct super_block * sb)
401{
402 mutex_lock(&sb->s_lock);
403}
404
405void unlock_super(struct super_block * sb)
406{
407 mutex_unlock(&sb->s_lock);
408}
409
410EXPORT_SYMBOL(lock_super);
411EXPORT_SYMBOL(unlock_super);
412
413/** 390/**
414 * generic_shutdown_super - common helper for ->kill_sb() 391 * generic_shutdown_super - common helper for ->kill_sb()
415 * @sb: superblock to kill 392 * @sb: superblock to kill
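
lock_super()/unlock_super() and the VFS-owned sb->s_lock are removed outright; their remaining users, sysv and ufs, are converted in the hunks that follow to a mutex embedded in the filesystem's own sb_info and initialized at fill_super time. The whole replacement pattern in one place, sketched for a hypothetical foo filesystem (FOO_SB() being its sb->s_fs_info accessor):

struct foo_sb_info {
	/* ... existing fields ... */
	struct mutex s_lock;	/* private successor to sb->s_lock */
};

static int foo_fill_super(struct super_block *sb, void *data, int silent)
{
	struct foo_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;
	mutex_init(&sbi->s_lock);	/* once, before any lock/unlock */
	sb->s_fs_info = sbi;
	/* ... */
	return 0;
}

/* each former lock_super(sb) ... unlock_super(sb) pair becomes: */
mutex_lock(&FOO_SB(sb)->s_lock);
/* ... critical section over superblock state ... */
mutex_unlock(&FOO_SB(sb)->s_lock);
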
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
index 9a6ad96acf27..921c053fc052 100644
--- a/fs/sysv/balloc.c
+++ b/fs/sysv/balloc.c
@@ -60,12 +60,12 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
60 return; 60 return;
61 } 61 }
62 62
63 lock_super(sb); 63 mutex_lock(&sbi->s_lock);
64 count = fs16_to_cpu(sbi, *sbi->s_bcache_count); 64 count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
65 65
66 if (count > sbi->s_flc_size) { 66 if (count > sbi->s_flc_size) {
67 printk("sysv_free_block: flc_count > flc_size\n"); 67 printk("sysv_free_block: flc_count > flc_size\n");
68 unlock_super(sb); 68 mutex_unlock(&sbi->s_lock);
69 return; 69 return;
70 } 70 }
71 /* If the free list head in super-block is full, it is copied 71 /* If the free list head in super-block is full, it is copied
@@ -77,7 +77,7 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
77 bh = sb_getblk(sb, block); 77 bh = sb_getblk(sb, block);
78 if (!bh) { 78 if (!bh) {
79 printk("sysv_free_block: getblk() failed\n"); 79 printk("sysv_free_block: getblk() failed\n");
80 unlock_super(sb); 80 mutex_unlock(&sbi->s_lock);
81 return; 81 return;
82 } 82 }
83 memset(bh->b_data, 0, sb->s_blocksize); 83 memset(bh->b_data, 0, sb->s_blocksize);
@@ -93,7 +93,7 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
93 *sbi->s_bcache_count = cpu_to_fs16(sbi, count); 93 *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
94 fs32_add(sbi, sbi->s_free_blocks, 1); 94 fs32_add(sbi, sbi->s_free_blocks, 1);
95 dirty_sb(sb); 95 dirty_sb(sb);
96 unlock_super(sb); 96 mutex_unlock(&sbi->s_lock);
97} 97}
98 98
99sysv_zone_t sysv_new_block(struct super_block * sb) 99sysv_zone_t sysv_new_block(struct super_block * sb)
@@ -104,7 +104,7 @@ sysv_zone_t sysv_new_block(struct super_block * sb)
104 struct buffer_head * bh; 104 struct buffer_head * bh;
105 unsigned count; 105 unsigned count;
106 106
107 lock_super(sb); 107 mutex_lock(&sbi->s_lock);
108 count = fs16_to_cpu(sbi, *sbi->s_bcache_count); 108 count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
109 109
110 if (count == 0) /* Applies only to Coherent FS */ 110 if (count == 0) /* Applies only to Coherent FS */
@@ -147,11 +147,11 @@ sysv_zone_t sysv_new_block(struct super_block * sb)
147 /* Now the free list head in the superblock is valid again. */ 147 /* Now the free list head in the superblock is valid again. */
148 fs32_add(sbi, sbi->s_free_blocks, -1); 148 fs32_add(sbi, sbi->s_free_blocks, -1);
149 dirty_sb(sb); 149 dirty_sb(sb);
150 unlock_super(sb); 150 mutex_unlock(&sbi->s_lock);
151 return nr; 151 return nr;
152 152
153Enospc: 153Enospc:
154 unlock_super(sb); 154 mutex_unlock(&sbi->s_lock);
155 return 0; 155 return 0;
156} 156}
157 157
@@ -173,7 +173,7 @@ unsigned long sysv_count_free_blocks(struct super_block * sb)
173 if (sbi->s_type == FSTYPE_AFS) 173 if (sbi->s_type == FSTYPE_AFS)
174 return 0; 174 return 0;
175 175
176 lock_super(sb); 176 mutex_lock(&sbi->s_lock);
177 sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); 177 sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
178 178
179 if (0) 179 if (0)
@@ -211,7 +211,7 @@ unsigned long sysv_count_free_blocks(struct super_block * sb)
211 if (count != sb_count) 211 if (count != sb_count)
212 goto Ecount; 212 goto Ecount;
213done: 213done:
214 unlock_super(sb); 214 mutex_unlock(&sbi->s_lock);
215 return count; 215 return count;
216 216
217Einval: 217Einval:
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 8233b02eccae..f9db4eb31db4 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -118,7 +118,7 @@ void sysv_free_inode(struct inode * inode)
118 "%s\n", inode->i_sb->s_id); 118 "%s\n", inode->i_sb->s_id);
119 return; 119 return;
120 } 120 }
121 lock_super(sb); 121 mutex_lock(&sbi->s_lock);
122 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); 122 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
123 if (count < sbi->s_fic_size) { 123 if (count < sbi->s_fic_size) {
124 *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); 124 *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
@@ -128,7 +128,7 @@ void sysv_free_inode(struct inode * inode)
128 dirty_sb(sb); 128 dirty_sb(sb);
129 memset(raw_inode, 0, sizeof(struct sysv_inode)); 129 memset(raw_inode, 0, sizeof(struct sysv_inode));
130 mark_buffer_dirty(bh); 130 mark_buffer_dirty(bh);
131 unlock_super(sb); 131 mutex_unlock(&sbi->s_lock);
132 brelse(bh); 132 brelse(bh);
133} 133}
134 134
@@ -147,13 +147,13 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
147 if (!inode) 147 if (!inode)
148 return ERR_PTR(-ENOMEM); 148 return ERR_PTR(-ENOMEM);
149 149
150 lock_super(sb); 150 mutex_lock(&sbi->s_lock);
151 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); 151 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
152 if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { 152 if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
153 count = refill_free_cache(sb); 153 count = refill_free_cache(sb);
154 if (count == 0) { 154 if (count == 0) {
155 iput(inode); 155 iput(inode);
156 unlock_super(sb); 156 mutex_unlock(&sbi->s_lock);
157 return ERR_PTR(-ENOSPC); 157 return ERR_PTR(-ENOSPC);
158 } 158 }
159 } 159 }
@@ -174,7 +174,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ 174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
176 /* That's it. */ 176 /* That's it. */
177 unlock_super(sb); 177 mutex_unlock(&sbi->s_lock);
178 return inode; 178 return inode;
179} 179}
180 180
@@ -185,7 +185,7 @@ unsigned long sysv_count_free_inodes(struct super_block * sb)
185 struct sysv_inode * raw_inode; 185 struct sysv_inode * raw_inode;
186 int ino, count, sb_count; 186 int ino, count, sb_count;
187 187
188 lock_super(sb); 188 mutex_lock(&sbi->s_lock);
189 189
190 sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); 190 sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
191 191
@@ -213,7 +213,7 @@ unsigned long sysv_count_free_inodes(struct super_block * sb)
213 if (count != sb_count) 213 if (count != sb_count)
214 goto Einval; 214 goto Einval;
215out: 215out:
216 unlock_super(sb); 216 mutex_unlock(&sbi->s_lock);
217 return count; 217 return count;
218 218
219Einval: 219Einval:
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index d33e506c1eac..c327d4ee1235 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -36,7 +36,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
36 struct sysv_sb_info *sbi = SYSV_SB(sb); 36 struct sysv_sb_info *sbi = SYSV_SB(sb);
37 unsigned long time = get_seconds(), old_time; 37 unsigned long time = get_seconds(), old_time;
38 38
39 lock_super(sb); 39 mutex_lock(&sbi->s_lock);
40 40
41 /* 41 /*
42 * If we are going to write out the super block, 42 * If we are going to write out the super block,
@@ -51,7 +51,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
51 mark_buffer_dirty(sbi->s_bh2); 51 mark_buffer_dirty(sbi->s_bh2);
52 } 52 }
53 53
54 unlock_super(sb); 54 mutex_unlock(&sbi->s_lock);
55 55
56 return 0; 56 return 0;
57} 57}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 7491c33b6468..a38e87bdd78d 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -368,6 +368,7 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
368 368
369 sbi->s_sb = sb; 369 sbi->s_sb = sb;
370 sbi->s_block_base = 0; 370 sbi->s_block_base = 0;
371 mutex_init(&sbi->s_lock);
371 sb->s_fs_info = sbi; 372 sb->s_fs_info = sbi;
372 373
373 sb_set_blocksize(sb, BLOCK_SIZE); 374 sb_set_blocksize(sb, BLOCK_SIZE);
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 0bc35fdc58e2..69d488986cce 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -58,6 +58,7 @@ struct sysv_sb_info {
58 u32 s_nzones; /* same as s_sbd->s_fsize */ 58 u32 s_nzones; /* same as s_sbd->s_fsize */
59 u16 s_namelen; /* max length of dir entry */ 59 u16 s_namelen; /* max length of dir entry */
60 int s_forced_ro; 60 int s_forced_ro;
61 struct mutex s_lock;
61}; 62};
62 63
63/* 64/*
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 1b3e410bf334..a7ea492ae660 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -54,7 +54,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
54 if (ufs_fragnum(fragment) + count > uspi->s_fpg) 54 if (ufs_fragnum(fragment) + count > uspi->s_fpg)
55 ufs_error (sb, "ufs_free_fragments", "internal error"); 55 ufs_error (sb, "ufs_free_fragments", "internal error");
56 56
57 lock_super(sb); 57 mutex_lock(&UFS_SB(sb)->s_lock);
58 58
59 cgno = ufs_dtog(uspi, fragment); 59 cgno = ufs_dtog(uspi, fragment);
60 bit = ufs_dtogd(uspi, fragment); 60 bit = ufs_dtogd(uspi, fragment);
@@ -118,12 +118,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
118 ubh_sync_block(UCPI_UBH(ucpi)); 118 ubh_sync_block(UCPI_UBH(ucpi));
119 ufs_mark_sb_dirty(sb); 119 ufs_mark_sb_dirty(sb);
120 120
121 unlock_super (sb); 121 mutex_unlock(&UFS_SB(sb)->s_lock);
122 UFSD("EXIT\n"); 122 UFSD("EXIT\n");
123 return; 123 return;
124 124
125failed: 125failed:
126 unlock_super (sb); 126 mutex_unlock(&UFS_SB(sb)->s_lock);
127 UFSD("EXIT (FAILED)\n"); 127 UFSD("EXIT (FAILED)\n");
128 return; 128 return;
129} 129}
@@ -155,7 +155,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
155 goto failed; 155 goto failed;
156 } 156 }
157 157
158 lock_super(sb); 158 mutex_lock(&UFS_SB(sb)->s_lock);
159 159
160do_more: 160do_more:
161 overflow = 0; 161 overflow = 0;
@@ -215,12 +215,12 @@ do_more:
215 } 215 }
216 216
217 ufs_mark_sb_dirty(sb); 217 ufs_mark_sb_dirty(sb);
218 unlock_super (sb); 218 mutex_unlock(&UFS_SB(sb)->s_lock);
219 UFSD("EXIT\n"); 219 UFSD("EXIT\n");
220 return; 220 return;
221 221
222failed_unlock: 222failed_unlock:
223 unlock_super (sb); 223 mutex_unlock(&UFS_SB(sb)->s_lock);
224failed: 224failed:
225 UFSD("EXIT (FAILED)\n"); 225 UFSD("EXIT (FAILED)\n");
226 return; 226 return;
@@ -361,7 +361,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
361 usb1 = ubh_get_usb_first(uspi); 361 usb1 = ubh_get_usb_first(uspi);
362 *err = -ENOSPC; 362 *err = -ENOSPC;
363 363
364 lock_super (sb); 364 mutex_lock(&UFS_SB(sb)->s_lock);
365 tmp = ufs_data_ptr_to_cpu(sb, p); 365 tmp = ufs_data_ptr_to_cpu(sb, p);
366 366
367 if (count + ufs_fragnum(fragment) > uspi->s_fpb) { 367 if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -382,19 +382,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
382 "fragment %llu, tmp %llu\n", 382 "fragment %llu, tmp %llu\n",
383 (unsigned long long)fragment, 383 (unsigned long long)fragment,
384 (unsigned long long)tmp); 384 (unsigned long long)tmp);
385 unlock_super(sb); 385 mutex_unlock(&UFS_SB(sb)->s_lock);
386 return INVBLOCK; 386 return INVBLOCK;
387 } 387 }
388 if (fragment < UFS_I(inode)->i_lastfrag) { 388 if (fragment < UFS_I(inode)->i_lastfrag) {
389 UFSD("EXIT (ALREADY ALLOCATED)\n"); 389 UFSD("EXIT (ALREADY ALLOCATED)\n");
390 unlock_super (sb); 390 mutex_unlock(&UFS_SB(sb)->s_lock);
391 return 0; 391 return 0;
392 } 392 }
393 } 393 }
394 else { 394 else {
395 if (tmp) { 395 if (tmp) {
396 UFSD("EXIT (ALREADY ALLOCATED)\n"); 396 UFSD("EXIT (ALREADY ALLOCATED)\n");
397 unlock_super(sb); 397 mutex_unlock(&UFS_SB(sb)->s_lock);
398 return 0; 398 return 0;
399 } 399 }
400 } 400 }
@@ -403,7 +403,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
403 * There is not enough space for user on the device 403 * There is not enough space for user on the device
404 */ 404 */
405 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { 405 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
406 unlock_super (sb); 406 mutex_unlock(&UFS_SB(sb)->s_lock);
407 UFSD("EXIT (FAILED)\n"); 407 UFSD("EXIT (FAILED)\n");
408 return 0; 408 return 0;
409 } 409 }
@@ -428,7 +428,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
428 ufs_clear_frags(inode, result + oldcount, 428 ufs_clear_frags(inode, result + oldcount,
429 newcount - oldcount, locked_page != NULL); 429 newcount - oldcount, locked_page != NULL);
430 } 430 }
431 unlock_super(sb); 431 mutex_unlock(&UFS_SB(sb)->s_lock);
432 UFSD("EXIT, result %llu\n", (unsigned long long)result); 432 UFSD("EXIT, result %llu\n", (unsigned long long)result);
433 return result; 433 return result;
434 } 434 }
@@ -443,7 +443,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
443 fragment + count); 443 fragment + count);
444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount, 444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
445 locked_page != NULL); 445 locked_page != NULL);
446 unlock_super(sb); 446 mutex_unlock(&UFS_SB(sb)->s_lock);
447 UFSD("EXIT, result %llu\n", (unsigned long long)result); 447 UFSD("EXIT, result %llu\n", (unsigned long long)result);
448 return result; 448 return result;
449 } 449 }
@@ -481,7 +481,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
481 *err = 0; 481 *err = 0;
482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, 482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
483 fragment + count); 483 fragment + count);
484 unlock_super(sb); 484 mutex_unlock(&UFS_SB(sb)->s_lock);
485 if (newcount < request) 485 if (newcount < request)
486 ufs_free_fragments (inode, result + newcount, request - newcount); 486 ufs_free_fragments (inode, result + newcount, request - newcount);
487 ufs_free_fragments (inode, tmp, oldcount); 487 ufs_free_fragments (inode, tmp, oldcount);
@@ -489,7 +489,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
489 return result; 489 return result;
490 } 490 }
491 491
492 unlock_super(sb); 492 mutex_unlock(&UFS_SB(sb)->s_lock);
493 UFSD("EXIT (FAILED)\n"); 493 UFSD("EXIT (FAILED)\n");
494 return 0; 494 return 0;
495} 495}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index e84cbe21b986..d0426d74817b 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -71,11 +71,11 @@ void ufs_free_inode (struct inode * inode)
71 71
72 ino = inode->i_ino; 72 ino = inode->i_ino;
73 73
74 lock_super (sb); 74 mutex_lock(&UFS_SB(sb)->s_lock);
75 75
76 if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { 76 if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
77 ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); 77 ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
78 unlock_super (sb); 78 mutex_unlock(&UFS_SB(sb)->s_lock);
79 return; 79 return;
80 } 80 }
81 81
@@ -83,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
83 bit = ufs_inotocgoff (ino); 83 bit = ufs_inotocgoff (ino);
84 ucpi = ufs_load_cylinder (sb, cg); 84 ucpi = ufs_load_cylinder (sb, cg);
85 if (!ucpi) { 85 if (!ucpi) {
86 unlock_super (sb); 86 mutex_unlock(&UFS_SB(sb)->s_lock);
87 return; 87 return;
88 } 88 }
89 ucg = ubh_get_ucg(UCPI_UBH(ucpi)); 89 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -117,7 +117,7 @@ void ufs_free_inode (struct inode * inode)
117 ubh_sync_block(UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
118 118
119 ufs_mark_sb_dirty(sb); 119 ufs_mark_sb_dirty(sb);
120 unlock_super (sb); 120 mutex_unlock(&UFS_SB(sb)->s_lock);
121 UFSD("EXIT\n"); 121 UFSD("EXIT\n");
122} 122}
123 123
@@ -197,7 +197,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
197 uspi = sbi->s_uspi; 197 uspi = sbi->s_uspi;
198 usb1 = ubh_get_usb_first(uspi); 198 usb1 = ubh_get_usb_first(uspi);
199 199
200 lock_super (sb); 200 mutex_lock(&sbi->s_lock);
201 201
202 /* 202 /*
203 * Try to place the inode in its parent directory 203 * Try to place the inode in its parent directory
@@ -333,20 +333,20 @@ cg_found:
333 brelse(bh); 333 brelse(bh);
334 } 334 }
335 335
336 unlock_super (sb); 336 mutex_unlock(&sbi->s_lock);
337 337
338 UFSD("allocating inode %lu\n", inode->i_ino); 338 UFSD("allocating inode %lu\n", inode->i_ino);
339 UFSD("EXIT\n"); 339 UFSD("EXIT\n");
340 return inode; 340 return inode;
341 341
342fail_remove_inode: 342fail_remove_inode:
343 unlock_super(sb); 343 mutex_unlock(&sbi->s_lock);
344 clear_nlink(inode); 344 clear_nlink(inode);
345 iput(inode); 345 iput(inode);
346 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
347 return ERR_PTR(err); 347 return ERR_PTR(err);
348failed: 348failed:
349 unlock_super (sb); 349 mutex_unlock(&sbi->s_lock);
350 make_bad_inode(inode); 350 make_bad_inode(inode);
351 iput (inode); 351 iput (inode);
352 UFSD("EXIT (FAILED): err %d\n", err); 352 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f7cfecfe1cab..dc8e3a861d0f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -699,7 +699,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
699 unsigned flags; 699 unsigned flags;
700 700
701 lock_ufs(sb); 701 lock_ufs(sb);
702 lock_super(sb); 702 mutex_lock(&UFS_SB(sb)->s_lock);
703 703
704 UFSD("ENTER\n"); 704 UFSD("ENTER\n");
705 705
@@ -717,7 +717,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
717 ufs_put_cstotal(sb); 717 ufs_put_cstotal(sb);
718 718
719 UFSD("EXIT\n"); 719 UFSD("EXIT\n");
720 unlock_super(sb); 720 mutex_unlock(&UFS_SB(sb)->s_lock);
721 unlock_ufs(sb); 721 unlock_ufs(sb);
722 722
723 return 0; 723 return 0;
@@ -805,6 +805,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
805 } 805 }
806#endif 806#endif
807 mutex_init(&sbi->mutex); 807 mutex_init(&sbi->mutex);
808 mutex_init(&sbi->s_lock);
808 spin_lock_init(&sbi->work_lock); 809 spin_lock_init(&sbi->work_lock);
809 INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); 810 INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
810 /* 811 /*
@@ -1280,7 +1281,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 unsigned flags; 1281 unsigned flags;
1281 1282
1282 lock_ufs(sb); 1283 lock_ufs(sb);
1283 lock_super(sb); 1284 mutex_lock(&UFS_SB(sb)->s_lock);
1284 uspi = UFS_SB(sb)->s_uspi; 1285 uspi = UFS_SB(sb)->s_uspi;
1285 flags = UFS_SB(sb)->s_flags; 1286 flags = UFS_SB(sb)->s_flags;
1286 usb1 = ubh_get_usb_first(uspi); 1287 usb1 = ubh_get_usb_first(uspi);
@@ -1294,7 +1295,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1294 new_mount_opt = 0; 1295 new_mount_opt = 0;
1295 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1296 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1296 if (!ufs_parse_options (data, &new_mount_opt)) { 1297 if (!ufs_parse_options (data, &new_mount_opt)) {
1297 unlock_super(sb); 1298 mutex_unlock(&UFS_SB(sb)->s_lock);
1298 unlock_ufs(sb); 1299 unlock_ufs(sb);
1299 return -EINVAL; 1300 return -EINVAL;
1300 } 1301 }
@@ -1302,14 +1303,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1302 new_mount_opt |= ufstype; 1303 new_mount_opt |= ufstype;
1303 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1304 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1304 printk("ufstype can't be changed during remount\n"); 1305 printk("ufstype can't be changed during remount\n");
1305 unlock_super(sb); 1306 mutex_unlock(&UFS_SB(sb)->s_lock);
1306 unlock_ufs(sb); 1307 unlock_ufs(sb);
1307 return -EINVAL; 1308 return -EINVAL;
1308 } 1309 }
1309 1310
1310 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1311 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1311 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1312 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1312 unlock_super(sb); 1313 mutex_unlock(&UFS_SB(sb)->s_lock);
1313 unlock_ufs(sb); 1314 unlock_ufs(sb);
1314 return 0; 1315 return 0;
1315 } 1316 }
@@ -1334,7 +1335,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1334#ifndef CONFIG_UFS_FS_WRITE 1335#ifndef CONFIG_UFS_FS_WRITE
1335 printk("ufs was compiled with read-only support, " 1336 printk("ufs was compiled with read-only support, "
1336 "can't be mounted as read-write\n"); 1337 "can't be mounted as read-write\n");
1337 unlock_super(sb); 1338 mutex_unlock(&UFS_SB(sb)->s_lock);
1338 unlock_ufs(sb); 1339 unlock_ufs(sb);
1339 return -EINVAL; 1340 return -EINVAL;
1340#else 1341#else
@@ -1344,13 +1345,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1344 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1345 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1345 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1346 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1346 printk("this ufstype is read-only supported\n"); 1347 printk("this ufstype is read-only supported\n");
1347 unlock_super(sb); 1348 mutex_unlock(&UFS_SB(sb)->s_lock);
1348 unlock_ufs(sb); 1349 unlock_ufs(sb);
1349 return -EINVAL; 1350 return -EINVAL;
1350 } 1351 }
1351 if (!ufs_read_cylinder_structures(sb)) { 1352 if (!ufs_read_cylinder_structures(sb)) {
1352 printk("failed during remounting\n"); 1353 printk("failed during remounting\n");
1353 unlock_super(sb); 1354 mutex_unlock(&UFS_SB(sb)->s_lock);
1354 unlock_ufs(sb); 1355 unlock_ufs(sb);
1355 return -EPERM; 1356 return -EPERM;
1356 } 1357 }
@@ -1358,7 +1359,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1358#endif 1359#endif
1359 } 1360 }
1360 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1361 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1361 unlock_super(sb); 1362 mutex_unlock(&UFS_SB(sb)->s_lock);
1362 unlock_ufs(sb); 1363 unlock_ufs(sb);
1363 return 0; 1364 return 0;
1364} 1365}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 343e6fc571e5..ff2c15ab81aa 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,6 +24,7 @@ struct ufs_sb_info {
24 int work_queued; /* non-zero if the delayed work is queued */ 24 int work_queued; /* non-zero if the delayed work is queued */
25 struct delayed_work sync_work; /* FS sync delayed work */ 25 struct delayed_work sync_work; /* FS sync delayed work */
26 spinlock_t work_lock; /* protects sync_work and work_queued */ 26 spinlock_t work_lock; /* protects sync_work and work_queued */
27 struct mutex s_lock;
27}; 28};
28 29
29struct ufs_inode_info { 30struct ufs_inode_info {
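
The ufs conversion repeats the sysv pattern, with one detail worth noting from the ufs_sync_fs() and ufs_remount() hunks above: the new UFS_SB(sb)->s_lock is consistently acquired inside lock_ufs(sb) and released before unlock_ufs(sb), so the patch fixes a lock ordering rather than leaving it implicit:

lock_ufs(sb);				/* outer: existing ufs serialization */
mutex_lock(&UFS_SB(sb)->s_lock);	/* inner: ex-lock_super() */
/* ... superblock fields updated ... */
mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);				/* reverse order on the way out */
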
diff --git a/fs/xattr.c b/fs/xattr.c
index 1780f062dbaf..e164dddb8e96 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -412,7 +412,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
412 if (!f.file) 412 if (!f.file)
413 return error; 413 return error;
414 dentry = f.file->f_path.dentry; 414 dentry = f.file->f_path.dentry;
415 audit_inode(NULL, dentry); 415 audit_inode(NULL, dentry, 0);
416 error = mnt_want_write_file(f.file); 416 error = mnt_want_write_file(f.file);
417 if (!error) { 417 if (!error) {
418 error = setxattr(dentry, name, value, size, flags); 418 error = setxattr(dentry, name, value, size, flags);
@@ -507,7 +507,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
507 507
508 if (!f.file) 508 if (!f.file)
509 return error; 509 return error;
510 audit_inode(NULL, f.file->f_path.dentry); 510 audit_inode(NULL, f.file->f_path.dentry, 0);
511 error = getxattr(f.file->f_path.dentry, name, value, size); 511 error = getxattr(f.file->f_path.dentry, name, value, size);
512 fdput(f); 512 fdput(f);
513 return error; 513 return error;
@@ -586,7 +586,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
586 586
587 if (!f.file) 587 if (!f.file)
588 return error; 588 return error;
589 audit_inode(NULL, f.file->f_path.dentry); 589 audit_inode(NULL, f.file->f_path.dentry, 0);
590 error = listxattr(f.file->f_path.dentry, list, size); 590 error = listxattr(f.file->f_path.dentry, list, size);
591 fdput(f); 591 fdput(f);
592 return error; 592 return error;
@@ -655,7 +655,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
655 if (!f.file) 655 if (!f.file)
656 return error; 656 return error;
657 dentry = f.file->f_path.dentry; 657 dentry = f.file->f_path.dentry;
658 audit_inode(NULL, dentry); 658 audit_inode(NULL, dentry, 0);
659 error = mnt_want_write_file(f.file); 659 error = mnt_want_write_file(f.file);
660 if (!error) { 660 if (!error) {
661 error = removexattr(dentry, name); 661 error = removexattr(dentry, name);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 11efd830b5f5..9fbea87fdb6e 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -45,7 +45,7 @@ static void posix_acl_fix_xattr_userns(
45 break; 45 break;
46 case ACL_GROUP: 46 case ACL_GROUP:
47 gid = make_kgid(from, le32_to_cpu(entry->e_id)); 47 gid = make_kgid(from, le32_to_cpu(entry->e_id));
48 entry->e_id = cpu_to_le32(from_kuid(to, uid)); 48 entry->e_id = cpu_to_le32(from_kgid(to, gid));
49 break; 49 break;
50 default: 50 default:
51 break; 51 break;
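
The xattr_acl.c change is a one-line copy-paste bug fix: the ACL_GROUP branch computed a kgid but then converted the stale uid left over from the ACL_USER branch, so group IDs were translated through the wrong mapping. For contrast, the corrected switch, abridged from the function above:

switch (le16_to_cpu(entry->e_tag)) {
case ACL_USER:
	uid = make_kuid(from, le32_to_cpu(entry->e_id));
	entry->e_id = cpu_to_le32(from_kuid(to, uid));	/* uid in, uid out */
	break;
case ACL_GROUP:
	gid = make_kgid(from, le32_to_cpu(entry->e_id));
	entry->e_id = cpu_to_le32(from_kgid(to, gid));	/* was from_kuid(to, uid) */
	break;
default:
	break;
}
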
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 42679223a0fd..8c6d1d70278c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -189,6 +189,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
189 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; 189 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
190 struct inode *inode = NULL; 190 struct inode *inode = NULL;
191 191
192 if (fh_len < xfs_fileid_length(fileid_type))
193 return NULL;
194
192 switch (fileid_type) { 195 switch (fileid_type) {
193 case FILEID_INO32_GEN_PARENT: 196 case FILEID_INO32_GEN_PARENT:
194 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, 197 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
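
xfs gets the same file-handle hardening as reiserfs: before the opaque fid is interpreted per its claimed type (including the cast to struct xfs_fid64), the type must be consistent with the number of words the client actually sent; xfs_fileid_length() is evidently the existing type-to-length mapping, judging by this call site. Annotated shape of the guard:

/* reject truncated handles before any fid->... field is read */
if (fh_len < xfs_fileid_length(fileid_type))
	return NULL;
/* only now is it safe to interpret the handle per its type */
fid64 = (struct xfs_fid64 *)fid;
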